In [24]:
import pickle
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive, MapperLayoutStatic
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from functions import *
from chi import *
from regressionP import *
from models import *

In [54]:
# import pickle

# with open('CalculatedData/new1.pkl', 'wb') as f:
#     pickle.dump(mapper_graph1, f)

# import pickle

# with open('CalculatedData/mapper_graph1.pkl', 'rb') as f:
#     mapper_graph1 = pickle.load(f)

In [25]:
def _read_without_last_two_rows(csv_path, **read_kwargs):
    """Read ``csv_path`` with pandas and slice off its final two rows.

    The last two rows are presumably non-data trailer lines in the raw
    exports -- TODO confirm against the files.
    """
    return pd.read_csv(csv_path, **read_kwargs)[:-2]


# The A2 data ships as four part files; each part is loaded on its own and the
# parts are then stacked into one frame with a fresh 0..n-1 index.
data1 = _read_without_last_two_rows("./Data/NPA_TMA2_1.csv", low_memory=False)
data2 = _read_without_last_two_rows("./Data/NPA_TMA2_2.csv", low_memory=False)
data3 = _read_without_last_two_rows("./Data/NPA_TMA2_3.csv", low_memory=False)
data4 = _read_without_last_two_rows("./Data/NPA_TMA2_4.csv", low_memory=False)
a2_parts = [data1, data2, data3, data4]
dataA2 = pd.concat(a2_parts, ignore_index=True)

# The A1 data is a single file, trimmed the same way.
dataA1 = _read_without_last_two_rows("./Data/NPA_TMA1.csv")

rbind_dataprocess_age資料應該先合併，再process_data，再分離，因為個別做可能標籤會不同

In [28]:
def preprocess(input_data, select_lst, sample = 592):
    """Keep first-party rows and build the feature frame used for clustering.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Raw accident table; must contain the '當事者順位' column and every
        column named in ``select_lst``.
    select_lst : list of str
        Columns to keep. Must include '死亡受傷人數', which is replaced by the
        columns returned from ``split_death_injury``.
    sample : int, optional
        Currently unused: random sub-sampling is disabled, so every
        first-party row is kept regardless of this value.

    Returns
    -------
    tuple
        ``(dist_df, sample_data)`` where ``sample_data`` holds all columns of
        the first-party rows (index reset) and ``dist_df`` holds the selected
        columns plus the split casualty columns, without '死亡受傷人數'.
    """
    is_first_party = input_data['當事者順位'] == 1
    sample_data = input_data.loc[is_first_party].reset_index(drop=True)

    selected = sample_data[select_lst]
    # split_death_injury comes from the project helpers; it presumably yields
    # separate death and injury count columns -- verify in functions.py.
    casualties = split_death_injury(selected['死亡受傷人數'])

    dist_df = pd.concat([selected, casualties], axis=1).drop(columns=['死亡受傷人數'])
    return dist_df, sample_data

# Columns kept from the raw accident table. '死亡受傷人數' is split into separate
# columns inside preprocess(); the commented-out names are candidates that were
# deliberately left out, with the reason given next to each.
select_lst = [
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者', #'道路障礙-視距品質名稱', # collinear with the speed limit
    '道路型態大類別名稱', '事故位置大類別名稱', 
    # '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱', '道路障礙-障礙物名稱', # almost every record falls in the "no defect" category
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度', '緯度',
]

# preprocess() returns a (feature frame, first-party rows) pair per dataset.
# The `sample` values are passed for reference only; preprocess() ignores them.
dist_dfA1 = preprocess(dataA1, select_lst, sample = 592)
dist_dfA2 = preprocess(dataA2, select_lst, sample = 20000) # 120420

# Row-bind the A1 and A2 feature frames under one fresh index so that both
# datasets are encoded together by process_data().
feature_frames = [dist_dfA1[0], dist_dfA2[0]]
rbind_data = pd.concat(feature_frames, axis=0, ignore_index=True)

# Recode the injury count: every value above 1 becomes 2.
injured = rbind_data['受傷']
rbind_data['受傷'] = injured.mask(injured > 1, 2)

# Binarize the speed limit: 1 when above 60, otherwise 0.
rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].gt(60).astype('int64')

dist_df = process_data(rbind_data)
scaler = StandardScaler()

# Standardize every column, keeping the column names of dist_df.
scaled_values = scaler.fit_transform(dist_df)
full_dist = pd.DataFrame(scaled_values, columns = dist_df.columns)

# Mapper input: all standardized columns except the casualty counts and the
# coordinates.
excluded_columns = ['受傷', '死亡', '經度', '緯度']
X1 = full_dist.drop(columns=excluded_columns).to_numpy()

full_dist.head()

Unnamed: 0,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,道路型態大類別名稱,事故位置大類別名稱,號誌-號誌種類名稱,車道劃分設施-分向設施大類別名稱,車道劃分設施-分道設施-快車道或一般車道間名稱,車道劃分設施-分道設施-快慢車道間名稱,車道劃分設施-分道設施-路面邊線名稱,事故類型及型態大類別名稱,經度,緯度,死亡,受傷
0,0.146102,2.120706,-0.146069,1.210906,0.485564,-0.713755,-1.627801,1.771301,-1.697696,-0.917071,1.798967,-0.63422,-1.868211,13.660843,-2.803204
1,1.465217,-0.25172,-0.146069,-0.814068,-0.872635,2.771975,-1.627801,-0.822895,-1.697696,-0.917071,-0.012319,1.111877,0.974928,13.660843,-0.620846
2,-1.173013,2.911514,-0.146069,1.210906,1.164664,-0.713755,1.177412,1.122752,-1.697696,-0.917071,1.798967,-2.459704,-0.63318,13.660843,-2.803204
3,-1.173013,-0.25172,-0.146069,-0.814068,-0.872635,2.771975,-0.225194,-0.822895,0.422707,1.090428,-0.012319,0.936797,1.016777,13.660843,-0.620846
4,1.465217,2.120706,-0.146069,1.210906,1.164664,-0.713755,0.476109,-0.822895,0.422707,-0.917071,1.798967,1.832866,0.186897,13.660843,-2.803204


In [27]:
full_dist.shape

(121012, 15)

In [4]:
full_dist.shape

(121012, 15)

In [14]:
# Best model found so far. The whole lens + Mapper fit is wall-clock timed and
# the elapsed seconds are printed at the end.
start_time = time.time()

# Lens: X1 projected onto 10 principal components.
pca_lens = PCA(n_components=10)
lens1 = pca_lens.fit_transform(X1)

# Cover: 4 intervals with an overlap fraction of 0.4.
cubical_cover = CubicalCover(n_intervals = 4, overlap_frac = 0.4)

# Clustering applied by Mapper: Ward agglomerative clustering into 3 clusters,
# wrapped in FailSafeClustering with verbose output disabled.
ward_clusterer = AgglomerativeClustering(n_clusters=3, linkage='ward')
failsafe_clusterer = FailSafeClustering(clustering = ward_clusterer, verbose = False)

mapper_algo1 = MapperAlgorithm(cover = cubical_cover, clustering = failsafe_clusterer)
mapper_graph1 = mapper_algo1.fit_transform(X1, lens1)

end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)

435.4554030895233


In [21]:
# Interactive 3-D layout of the Mapper graph.
#
# This construction must be live code: the "Get plot information" cell reads
# mapper_plot1's private figure and graph attributes, so leaving it commented
# out makes that cell fail with NameError on a fresh kernel (Restart & Run All).
#
# Nodes are colored by the speed-limit column of dist_df, aggregated per node
# with most_frequent_nonan (expected to come from the project's star imports --
# verify); np.nanmean is kept below as the previously tried alternative.
mapper_plot1 = MapperLayoutInteractive(
    mapper_graph1,
    colors = dist_df[['速限-第1當事者']].to_numpy(),
    cmap = 'jet',
    # agg = np.nanmean,
    agg = most_frequent_nonan,
    dim = 3,
    iterations = 30,
    seed = 5,
    width = 800,
    height = 500)

# Build the figure object; rendering stays opt-in because the graph is large.
fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

## Get plot information

In [17]:
import re

# Reach into the private plotly figure held by the Mapper layout. The data
# object at position 1 is the one whose x/y/z and hover text are read here
# (presumably the node trace -- verify against the tdamapper version in use).
node_trace = vars(mapper_plot1._MapperLayoutInteractive__fig)['_data_objs'][1]
x = node_trace['x']
y = node_trace['y']
z = node_trace['z']

threeDimData = pd.DataFrame({'x': x, 'y': y, 'z': z})

# Each hover text carries "color: <int>", "node: <int>" and "size: <int>";
# pull the three integers out of every entry.
data_tuple = node_trace['text']
field_patterns = {
    'color': re.compile(r'color: (\d+)'),
    'node': re.compile(r'node: (\d+)'),
    'size': re.compile(r'size: (\d+)'),
}
data = [
    {field: int(pattern.search(item).group(1)) for field, pattern in field_patterns.items()}
    for item in data_tuple
]
component_info = pd.DataFrame(data)

# Node attributes (color/node/size) side by side with the layout coordinates.
full_info = pd.concat([component_info, threeDimData], axis=1)

# Per-node attributes stored on the underlying graph, keyed by node id.
mp_content_origin = vars(mapper_plot1._MapperLayoutInteractive__graph)['_node']
mp_content = (
    pd.DataFrame.from_dict(mp_content_origin, orient='index')
    .reset_index()
    .rename(columns={'index': 'node'})
)

# Keep only nodes that match on both id and size in the two sources.
full_info = full_info.merge(mp_content, on=['node', 'size'], how='inner')
# full_info.iloc[:, 3:6].head()

## Cluster info for DBSCAN

In [19]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# get_calinski_from_db is a project helper; 0.043 is its second argument
# (presumably the DBSCAN eps -- TODO confirm in functions.py). Positions 2, 3
# and 4 of its result are the fitted estimator, the labels and the cluster count.
calinski_data = get_calinski_from_db(full_info, 0.043)
db, labels, n_clusters_ = (calinski_data[position] for position in (2, 3, 4))

unique_labels = set(labels)

# Boolean mask over the labelled points: True where the point is a core sample.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

def matplotlib_to_plotly(cmap, alpha=1, scale=200):
    """Format a matplotlib RGB(A) color as a plotly ``rgba(...)`` string.

    Parameters
    ----------
    cmap : sequence of float
        Color whose first three entries are the red, green and blue channels
        (e.g. the tuple returned by calling a matplotlib colormap). Any further
        entries, such as the colormap's own alpha, are ignored.
    alpha : float, optional
        Opacity written verbatim as the fourth component of the result.
    scale : int or float, optional
        Multiplier applied to each channel before truncating to ``int``. The
        default of 200 keeps the historical output of this helper, which is
        darker than a plain 0-255 conversion; pass 255 for the full range.

    Returns
    -------
    str
        A string of the form ``'rgba(R, G, B, alpha)'``.
    """
    red, green, blue = (int(channel * scale) for channel in cmap[:3])
    return f'rgba({red}, {green}, {blue}, {alpha})'

def _add_cluster_trace(figure, points, color, opacity, name):
    """Append one 3-D marker trace to ``figure``.

    ``points`` is a frame whose first three columns are used as x, y and z.
    """
    marker_style = dict(
        size=6,
        color=color,
        opacity=opacity
    )
    figure.add_trace(go.Scatter3d(
        x=points.iloc[:, 0],
        y=points.iloc[:, 1],
        z=points.iloc[:, 2],
        mode='markers',
        marker=marker_style,
        name=name
    ))


# One Spectral color per distinct label, evenly spaced over the colormap.
palette_positions = np.linspace(0, 1, len(unique_labels))
colors = [matplotlib_to_plotly(plt.cm.Spectral(each), alpha=0.8) for each in palette_positions]
fig = go.Figure()

# Columns 3..5 of full_info hold the layout coordinates.
node_coords = full_info.iloc[:, 3:6]

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Label -1 is drawn fully transparent.
        col = 'rgba(0,0,0,0)'

    class_member_mask = labels == k

    # Core samples first (more opaque), then the non-core members of the label.
    core_samples = node_coords[class_member_mask & core_samples_mask]
    _add_cluster_trace(fig, core_samples, col, 0.8, f'Cluster {k} Core')

    non_core_samples = node_coords[class_member_mask & ~core_samples_mask]
    _add_cluster_trace(fig, non_core_samples, col, 0.5, f'Cluster {k} Non-Core')

fig.update_layout(
    title=f"Estimated number of clusters: {n_clusters_}",
    margin=dict(l=0, r=0, b=0, t=0)
)

fig.show()

## Split label & Count the same point

In [20]:
# Split the node table by cluster label; every node whose label is not 0, 1 or
# 2 goes to the outlier group.
node_label = full_info['label']
label_0 = full_info[node_label == 0]
label_1 = full_info[node_label == 1]
label_2 = full_info[node_label == 2]
label_out = full_info[~node_label.isin([0, 1, 2])]

# get_count_dict is a project helper; its keys are used below as row labels of
# rbind_data (presumably the data points contained in each group's nodes).
count_0 = get_count_dict(label_0)
count_1 = get_count_dict(label_1)
count_2 = get_count_dict(label_2)
count_out = get_count_dict(label_out)

keys_0 = count_0.keys()
keys_1 = count_1.keys()
keys_2 = count_2.keys()
keys_out = count_out.keys()

full_0 = rbind_data.loc[keys_0]
full_1 = rbind_data.loc[keys_1]
full_2 = rbind_data.loc[keys_2]

# Outliers are collected for reporting only; they are not analysed further.
full_out = rbind_data.loc[keys_out]

# Keys shared between two groups ("connecting points").
lst01 = list(keys_0 & keys_1)
lst02 = list(keys_0 & keys_2)
lst12 = list(keys_1 & keys_2)
lsto0 = list(keys_out & keys_0)
lsto1 = list(keys_out & keys_1)
lsto2 = list(keys_out & keys_2)

# Rows shared by two clusters are pulled out for separate analysis; removing
# them from the clusters keeps the groups independent, as the chi-square test
# requires.
full_01 = full_0.loc[lst01]
full_02 = full_0.loc[lst02]
full_12 = full_1.loc[lst12]

# All shared rows in one frame, each original row kept once (outliers excluded).
full_combine = (
    pd.concat([full_01, full_02, full_12], axis=0)
    .reset_index()
    .drop_duplicates(subset='index', keep='first')
    .drop(columns='index')
)

# Drop the connecting points from each cluster to make the analysis stricter.
full_0 = full_0.drop(index=lst01 + lst02 + lsto0, errors='ignore')
full_1 = full_1.drop(index=lst01 + lst12 + lsto1, errors='ignore')
full_2 = full_2.drop(index=lst02 + lst12 + lsto2, errors='ignore')

print('01連接點數量', len(lst01))
print('02連接點數量', len(lst02))
print('12連接點數量', len(lst12))
print('o0連接點數量', len(lsto0))
print('o1連接點數量', len(lsto1))
print('o2連接點數量', len(lsto2))
print('離群值數量', full_out.shape[0])

# Row-count check of the partition against the full data. Outlier rows are not
# part of the left-hand side, and the recorded run of this cell evaluated False.
partition_rows = sum(frame.shape[0] for frame in (full_combine, full_0, full_1, full_2))
partition_rows == rbind_data.shape[0]

01連接點數量 92
02連接點數量 0
12連接點數量 0
o0連接點數量 16
o1連接點數量 11
o2連接點數量 9
離群值數量 104


False

## VIF

In [13]:
# Predictor columns passed to pval() in the VIF section; the speed-limit column
# is deliberately left out (commented) here.
lst_regression = [
    '光線名稱',
    '道路類別-第1當事者-名稱', 
    # '速限-第1當事者', 
    '道路型態大類別名稱', '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
]

In [35]:
from regressionP import *

# pval is a project helper (regressionP); only the first returned value, X, is
# used below, as the predictor frame for the VIF computation.
X, y, p = pval(full_2, full_1, lst_regression)

from statsmodels.stats.outliers_influence import variance_inflation_factor
# NOTE(review): this rebinds the notebook-level `select_lst` that the
# preprocessing cell also defines; the active entries here match the
# `lst_regression` defined for this section (speed limit excluded).
select_lst = [
    '光線名稱',
    '道路類別-第1當事者-名稱',
    #'速限-第1當事者', #'道路障礙-視距品質名稱', # collinear with the speed limit
    '道路型態大類別名稱', '事故位置大類別名稱', 
    # '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱', '道路障礙-障礙物名稱', # almost every record falls in the "no defect" category
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
]
def calculate_vif(X):
    """Compute the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns. The matrix is passed to statsmodels exactly as
        given; no column is added or removed here.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with the columns ``"features"`` (the
        column name) and ``"VIF"`` (its variance inflation factor).
    """
    design_matrix = X.values
    vif_scores = [
        variance_inflation_factor(design_matrix, column_position)
        for column_position in range(design_matrix.shape[1])
    ]
    return pd.DataFrame({"features": X.columns, "VIF": vif_scores})

# Compute and print the VIF table for the columns of X listed in select_lst.
# NOTE(review): the recorded run shows NaN for the last feature together with an
# "invalid value encountered in scalar divide" warning -- check whether that
# column is constant in X.
vif_df = calculate_vif(X[select_lst])
print(vif_df)

                  features       VIF
0                     光線名稱  1.009844
1            道路類別-第1當事者-名稱  1.016431
2                道路型態大類別名稱  5.838461
3                事故位置大類別名稱  5.479225
4                號誌-號誌種類名稱  1.397519
5         車道劃分設施-分向設施大類別名稱  1.068934
6  車道劃分設施-分道設施-快車道或一般車道間名稱  1.122531
7      車道劃分設施-分道設施-快慢車道間名稱  1.056576
8       車道劃分設施-分道設施-路面邊線名稱  1.125107
9             事故類型及型態大類別名稱       NaN



invalid value encountered in scalar divide



In [None]:
# cluster0_X, cluster0_y = get_clusterN_logit(full_0)
# cluster1_X, cluster1_y = get_clusterN_logit(full_1)
# cluster2_X, cluster2_y = get_clusterN_logit(full_2)

# cluster0_data = get_logit_data(cluster0_X, cluster0_y, lst_regression)
# cluster1_data = get_logit_data(cluster1_X, cluster1_y, lst_regression)
# cluster2_data = get_logit_data(cluster2_X, cluster2_y, lst_regression)

## 慢速限分析

In [29]:
from regressionP import *

In [32]:
# Predictor list without the speed-limit column.
lst_regression = [
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '道路型態大類別名稱', '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
]
# NOTE(review): this call compares full_1 with full_2 but stores the result in
# the "01" names; a later cell writes `p01` to a CSV -- confirm this is the
# comparison that export is meant to contain.
X01, y01, p01 = pval(full_1, full_2, lst_regression)

In [22]:
# Predictors for the full_0 vs full_1 comparison, speed limit included.
lst_regression = [
    '光線名稱',
    '道路類別-第1當事者-名稱', 
    '速限-第1當事者', 
    '道路型態大類別名稱', '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
]

# Recompute p01 here: the next cell writes it to CSV, and with this call
# commented out the export contained whatever p01 was left in the kernel
# (computed from a different cluster pair and predictor list).
X01, y01, p01 = pval(full_0, full_1, lst_regression)

# Optional inspection of the significant predictors:
# p01[p01['p_value'] < 0.05]

In [21]:
p01.to_csv('CalculatedData/道路分析/慢速限分析_p.csv', index=False)

In [29]:
result = table('事故類型及型態大類別名稱', full_0, full_1, full_01)
result.to_csv('CalculatedData/道路分析/慢速限分析_事故類型比例表01.csv', index=False)

In [30]:
result = table('受傷', full_0, full_1, full_01)
result.to_csv('CalculatedData/道路分析/慢速限分析_受傷比例表01.csv', index=False)

In [31]:
result = table('死亡', full_0, full_1, full_01)
result.to_csv('CalculatedData/道路分析/慢速限分析_死亡比例表01.csv', index=False)

## 去掉關鍵

In [33]:
# Predictors for the full_0 vs full_1 comparison with the key variable
# (accident type) removed.
lst_regression = [
    '光線名稱',
    '道路類別-第1當事者-名稱', 
    '速限-第1當事者', 
    '道路型態大類別名稱', '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    # '事故類型及型態大類別名稱',
]

# Recompute p01 for the reduced predictor list: the next cell writes it to a
# separate CSV, and with this call commented out that file was a copy of the
# previous p01 rather than the result for this list.
X01, y01, p01 = pval(full_0, full_1, lst_regression)

# Optional inspection of the significant predictors:
# p01[p01['p_value'] < 0.05]

In [34]:
p01.to_csv('CalculatedData/道路分析/慢速限分析_去掉關鍵p.csv', index=False)

In [35]:
proportions = calculate_proportions(full_0, '事故位置大類別名稱')
proportions.to_csv('CalculatedData/道路分析/慢速限分析_事故位置大類別名稱0.csv', index=False)

In [36]:
proportions = calculate_proportions(full_1, '事故位置大類別名稱')
proportions.to_csv('CalculatedData/道路分析/慢速限分析_事故位置大類別名稱1.csv', index=False)

In [37]:
proportions = calculate_proportions(full_0, '光線名稱')
proportions.to_csv('CalculatedData/道路分析/慢速限分析_光線名稱0.csv', index=False)

In [38]:
proportions = calculate_proportions(full_1, '光線名稱')
proportions.to_csv('CalculatedData/道路分析/慢速限分析_光線名稱1.csv', index=False)

In [39]:
proportions = calculate_proportions(full_0, '車道劃分設施-分向設施大類別名稱')
proportions.to_csv('CalculatedData/道路分析/慢速限分析_分向設施0.csv', index=False)

In [40]:
proportions = calculate_proportions(full_1, '車道劃分設施-分向設施大類別名稱')
proportions.to_csv('CalculatedData/道路分析/慢速限分析_分向設施1.csv', index=False)

In [41]:
proportions = calculate_proportions(full_0, '道路型態大類別名稱')
proportions.to_csv('CalculatedData/道路分析/慢速限分析_道路型態大類別名稱0.csv', index=False)

In [42]:
proportions = calculate_proportions(full_1, '道路型態大類別名稱')
proportions.to_csv('CalculatedData/道路分析/慢速限分析_道路型態大類別名稱1.csv', index=False)

In [153]:
pvalue_lst = chi_compare(full_0, full_1)

光線名稱 p值: 1.2656663335407066e-06 可分群
道路類別-第1當事者-名稱 p值: 0.0 可分群
速限-第1當事者 p值: 0.0 可分群
道路型態大類別名稱 p值: 8.274089870410125e-189 可分群
事故位置大類別名稱 p值: 1.2087506450269801e-211 可分群
號誌-號誌種類名稱 p值: 1.1218418944278677e-42 可分群
車道劃分設施-分向設施大類別名稱 p值: 0.0 可分群
車道劃分設施-分道設施-快車道或一般車道間名稱 p值: 0.0 可分群
車道劃分設施-分道設施-快慢車道間名稱 p值: 1.0363314444658376e-66 可分群
車道劃分設施-分道設施-路面邊線名稱 p值: 7.444777959171966e-260 可分群
事故類型及型態大類別名稱 p值: 2.0005517390094274e-39 可分群
經度 p值: 1.2286895557537435e-240 可分群
緯度 p值: 4.4113423841556225e-210 可分群
死亡 p值: 2.528585370123506e-68 可分群
受傷 p值: 5.807063013139795e-25 可分群


## 快慢速限分析

In [45]:
# Predictors for the full_0 vs full_2 comparison, speed limit included.
lst_regression = [
    '光線名稱',
    '道路類別-第1當事者-名稱', 
    '速限-第1當事者', 
    '道路型態大類別名稱', '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
]

# p02 is written to CSV by the next cell but was never assigned while this call
# was commented out, so a fresh-kernel run failed with NameError.
X02, y02, p02 = pval(full_0, full_2, lst_regression)

# Optional inspection of the significant predictors:
# p02[p02['p_value'] < 0.05]

In [46]:
p02.to_csv('CalculatedData/道路分析/快慢分析_p.csv', index=False)

In [47]:
result = table('速限-第1當事者', full_0, full_2, full_02)
result.to_csv('CalculatedData/道路分析/快慢分析_速限比例表.csv', index=False)

## 去掉關鍵

In [49]:
# Predictors for the full_0 vs full_2 comparison with the key variable
# (speed limit) removed.
lst_regression = [
    '光線名稱',
    '道路類別-第1當事者-名稱', 
    # '速限-第1當事者', 
    '道路型態大類別名稱', '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
]

# Recompute p02 for the reduced predictor list: the next cell writes it to CSV,
# and with this call commented out p02 was either undefined or a stale result
# from a different predictor list.
X02, y02, p02 = pval(full_0, full_2, lst_regression)

# Optional inspection of the significant predictors:
# p02[p02['p_value'] < 0.05]

In [50]:
p02.to_csv('CalculatedData/道路分析/快慢分析_去掉關鍵p.csv', index=False)

In [51]:
proportions = calculate_proportions(full_0, '車道劃分設施-分向設施大類別名稱')
proportions.to_csv('CalculatedData/道路分析/快慢分析_分向設施0.csv', index=False)

In [52]:
proportions = calculate_proportions(full_2, '車道劃分設施-分向設施大類別名稱')
proportions.to_csv('CalculatedData/道路分析/快慢分析_分向設施2.csv', index=False)

In [55]:
proportions = calculate_proportions(full_0, '車道劃分設施-分道設施-快車道或一般車道間名稱')
proportions.to_csv('CalculatedData/道路分析/快慢分析_快車道0.csv', index=False)

In [56]:
proportions = calculate_proportions(full_2, '車道劃分設施-分道設施-快車道或一般車道間名稱')
proportions.to_csv('CalculatedData/道路分析/快慢分析_快車道2.csv', index=False)

In [57]:
proportions = calculate_proportions(full_0, '車道劃分設施-分道設施-路面邊線名稱')
proportions.to_csv('CalculatedData/道路分析/快慢分析_路面邊線0.csv', index=False)

In [58]:
proportions = calculate_proportions(full_2, '車道劃分設施-分道設施-路面邊線名稱')
proportions.to_csv('CalculatedData/道路分析/快慢分析_路面邊線2.csv', index=False)