In [13]:
import os

# Remember the directory the kernel started in the first time this cell runs.
# Deriving the paths from the live working directory would make the cell
# non-idempotent: each re-run would chdir one level further up and break the
# relative data paths used later in the notebook.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# NOTE: despite its name, current_file_path holds a directory (the start
# directory), not the path of a file.
current_file_path = os.path.abspath(NOTEBOOK_START_DIR)
# Parent of the start directory; this becomes the working directory.
current_dir_path = os.path.dirname(current_file_path)
# Grandparent of the start directory.
parent_dir_path = os.path.dirname(current_dir_path)

os.chdir(current_dir_path)

import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from chi import *
from functions import *
from regressionP import *

In [2]:
# Read the source CSV and keep every row except the last two
# (presumably trailing non-data lines in the file — TODO confirm).
dataA1 = pd.read_csv("./Data/NPA_TMA1_new.csv").iloc[:-2]

In [87]:
def preprocess(input_data, select_lst, sample = 592):
    """Filter the rows with '當事者順位' == 1 and build the feature frame.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Source table; must contain the '當事者順位' column and every column
        named in ``select_lst``.
    select_lst : list of str
        Columns to keep. It must include '死亡受傷人數', which is passed to
        ``split_death_injury`` and then removed from the result.
    sample : int, optional
        Currently unused: no sampling is performed in this function.

    Returns
    -------
    tuple
        ``(dist_df, sample_data)`` where ``dist_df`` is the selected columns
        (without '死亡受傷人數') joined column-wise with the output of
        ``split_death_injury``, and ``sample_data`` is the filtered source
        rows with a fresh 0..n-1 index.
    """
    first_party_mask = input_data['當事者順位'] == 1
    sample_data = input_data.loc[first_party_mask].reset_index(drop=True)

    selected = sample_data[select_lst]
    casualties = split_death_injury(selected['死亡受傷人數'])

    # Attach the split columns, then discard the combined source column.
    dist_df = pd.concat([selected, casualties], axis=1).drop(columns=['死亡受傷人數'])

    return dist_df, sample_data

# Columns carried into the analysis (one per line for easier diffing).
select_lst = [
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',
    '道路型態大類別名稱',
    '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度',
    '緯度',
]

# preprocess returns (feature frame, filtered source rows); only element 0 is used below.
dist_dfA1 = preprocess(dataA1, select_lst, sample = 592)

# Single-frame concat: yields an independent copy with a fresh 0..n-1 index.
rbind_data = pd.concat([dist_dfA1[0]], axis=0, ignore_index=True)

# Cap the injured count: every value above 1 is recorded as 2.
rbind_data.loc[rbind_data['受傷'] > 1, '受傷'] = 2
# Binarise the speed limit: 1 when above 60, otherwise 0.
rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].gt(60).astype('int64')
# Colour key: the speed-limit flag (as text) followed by the road-type name.
rbind_data['color'] = rbind_data['速限-第1當事者'].astype(str) + rbind_data['道路型態大類別名稱']

# Labels: process_data turns rbind_data into the frame that is standardised below.
dist_df = process_data(rbind_data)

scaler = StandardScaler()
scaled_values = scaler.fit_transform(dist_df)
full_dist = pd.DataFrame(scaled_values, columns = dist_df.columns)

# Mapper input: every standardised column except outcomes, coordinates and the colour key.
X1 = full_dist.drop(['受傷', '死亡', '經度', '緯度', 'color'], axis=1).to_numpy()

full_dist.head()

Unnamed: 0,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,道路型態大類別名稱,事故位置大類別名稱,號誌-號誌種類名稱,車道劃分設施-分向設施大類別名稱,車道劃分設施-分道設施-快車道或一般車道間名稱,車道劃分設施-分道設施-快慢車道間名稱,車道劃分設施-分道設施-路面邊線名稱,事故類型及型態大類別名稱,經度,緯度,死亡,受傷,color
0,0.206134,1.156468,-0.343362,0.871624,0.178216,-0.547987,-1.144816,1.637635,-1.445018,-0.690324,1.343545,-0.460481,-1.554677,-0.153782,-0.614936,0.356794
1,1.535867,-0.565109,-0.343362,-1.131203,-1.203615,3.016194,-1.144816,-0.869795,-1.445018,-0.690324,-0.218301,1.175007,1.148414,-0.153782,1.025588,-0.986731
2,-1.123599,1.730326,-0.343362,0.871624,0.869131,-0.547987,1.461655,1.010778,-1.445018,-0.690324,1.343545,-2.170328,-0.380481,-0.153782,-0.614936,0.356794
3,-1.123599,-0.565109,-0.343362,-1.131203,-1.203615,3.016194,-0.275992,-0.869795,0.456703,1.448594,-0.218301,1.011019,1.188202,-0.153782,1.025588,-0.986731
4,1.535867,1.156468,-0.343362,0.871624,0.869131,-0.547987,0.592832,-0.869795,0.456703,-0.690324,1.343545,1.850325,0.399201,-0.153782,-0.614936,0.356794


In [61]:
# Shape check of the standardised frame, shown as (rows, columns).
full_dist.shape

(787, 16)

In [172]:
# Time the lens computation and the Mapper graph construction together.
start_time = time.time()

# Lens: projection of the feature matrix onto 10 principal components.
lens1 = PCA(10).fit_transform(X1)

# Cover settings in use: n_intervals = 3, overlap_frac = 0.5.
# (A previous alternative used n_intervals = 4, overlap_frac = 0.4 with the
# same clustering.)
cubical_cover = CubicalCover(n_intervals = 3, overlap_frac = 0.5)

# Ward-linkage agglomerative clustering with 3 clusters, wrapped in FailSafeClustering.
ward_clustering = FailSafeClustering(
    clustering = AgglomerativeClustering(3, linkage='ward'),
    verbose = False,
)

mapper_algo1 = MapperAlgorithm(cover = cubical_cover, clustering = ward_clustering)
mapper_graph1 = mapper_algo1.fit_transform(X1, lens1)

end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)

# 3-D interactive layout; each node is coloured by aggregating the 'color'
# column of dist_df with most_frequent_nonan (np.nanmean was the alternative).
mapper_plot1 = MapperLayoutInteractive(
    mapper_graph1,
    colors = dist_df[['color']].to_numpy(),
    cmap = 'jet',
    agg = most_frequent_nonan,
    dim = 3,
    iterations = 30,
    seed = 5,
    width = 800,
    height = 500,
)

# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

1.071075201034546


In [156]:
import re

# NOTE(review): this reaches into name-mangled private attributes of
# MapperLayoutInteractive and may break when tdamapper is upgraded.
# The trace at index 1 of the figure is presumably the node scatter — confirm.
node_trace = vars(mapper_plot1._MapperLayoutInteractive__fig)['_data_objs'][1]

x, y, z = node_trace['x'], node_trace['y'], node_trace['z']
threeDimData = pd.DataFrame({'x': x, 'y': y, 'z': z})

# Each hover text carries "color: <int>", "node: <int>" and "size: <int>".
data_tuple = node_trace['text']
hover_patterns = {
    'color': r'color: (\d+)',
    'node': r'node: (\d+)',
    'size': r'size: (\d+)',
}
data = [
    {key: int(re.search(pattern, item).group(1)) for key, pattern in hover_patterns.items()}
    for item in data_tuple
]
component_info = pd.DataFrame(data)

# Row i of component_info and row i of threeDimData come from the same trace entry.
full_info = pd.concat([component_info, threeDimData], axis=1)

# Per-node attribute dicts of the underlying graph, one row per node id.
mp_content_origin = vars(mapper_plot1._MapperLayoutInteractive__graph)['_node']
mp_content = (
    pd.DataFrame.from_dict(mp_content_origin, orient='index')
    .reset_index()
    .rename(columns={'index': 'node'})
)

# Keep only nodes present on both sides with matching id and size.
full_info = pd.merge(full_info, mp_content, on=['node', 'size'], how='inner')

In [162]:
# Split the Mapper nodes by their aggregated colour code.
# NOTE(review): the codes 2, 0 and 5/4 are hard-coded; they presumably map to the
# encoded 'color' categories — confirm against process_data before reuse.
node_color = full_info['color']
color_0 = full_info[node_color == 2]
color_1 = full_info[node_color == 0]
color_2 = full_info[node_color.isin([5, 4])]

# get_count_dict output is used as a mapping whose keys are row labels of rbind_data.
count_0, count_1, count_2 = (
    get_count_dict(group) for group in (color_0, color_1, color_2)
)

# Row labels that occur in more than one colour group.
lst01 = list(count_0.keys() & count_1.keys())
lst02 = list(count_0.keys() & count_2.keys())
lst12 = list(count_1.keys() & count_2.keys())

# Keep, per group, only the rows that belong to that group alone.
# errors='ignore' is needed because a label may already be gone after the first drop.
full_0 = (
    rbind_data.loc[count_0.keys()]
    .drop(lst01, errors='ignore')
    .drop(lst02, errors='ignore')
)
full_1 = (
    rbind_data.loc[count_1.keys()]
    .drop(lst01, errors='ignore')
    .drop(lst12, errors='ignore')
)
full_2 = (
    rbind_data.loc[count_2.keys()]
    .drop(lst02, errors='ignore')
    .drop(lst12, errors='ignore')
)

In [163]:
# Report how many rows ended up exclusively in each group.
# NOTE(review): '低速線' in the first label looks like a typo for '低速限'
# (the other labels use '速限'); left unchanged here to keep the output identical.
for group_label, group_rows in (
    ('單路部分、低速線', full_0),
    ('交岔路、低速限', full_1),
    ('單路部分、高速限', full_2),
):
    print(group_label, group_rows.shape[0])

單路部分、低速線 369
交岔路、低速限 309
單路部分、高速限 78


In [164]:
# Features passed to the pairwise regression.
# Deliberately left out:
#   '道路障礙-視距品質名稱'  — collinear with the speed limit
#   '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱', '道路障礙-障礙物名稱'
#       — almost every record falls in the "no defect" category
lst_regression = [
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',
    '道路型態大類別名稱',  # collinear with the accident location
    '事故位置大類別名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
]

# Compare group 0 with group 1; `p` is displayed as the cell result.
# Note: `y` assigned here replaces the `y` coordinates taken from the Mapper figure.
X, y, p = pval(full_0, full_1, lst_regression)
p

Unnamed: 0,coefficients,standard_error,wald_statistics,p_value,feature
道路型態大類別名稱,-1.753957,1.332324,-1.316465,0.188018,道路型態大類別名稱
事故位置大類別名稱,-1.172986,1.443901,-0.812373,0.416577,事故位置大類別名稱
號誌-號誌種類名稱,0.283982,0.738294,0.384647,0.700499,號誌-號誌種類名稱
事故類型及型態大類別名稱,-0.203655,0.798894,-0.254921,0.798784,事故類型及型態大類別名稱
車道劃分設施-分道設施-路面邊線名稱,0.225677,0.917439,0.245986,0.805693,車道劃分設施-分道設施-路面邊線名稱
車道劃分設施-分道設施-快車道或一般車道間名稱,-0.187651,0.863638,-0.21728,0.82799,車道劃分設施-分道設施-快車道或一般車道間名稱
光線名稱,0.067681,0.777279,0.087074,0.930613,光線名稱
道路類別-第1當事者-名稱,0.064812,0.754936,0.085851,0.931585,道路類別-第1當事者-名稱
車道劃分設施-分向設施大類別名稱,-0.046536,0.858832,-0.054186,0.956787,車道劃分設施-分向設施大類別名稱
車道劃分設施-分道設施-快慢車道間名稱,0.021111,0.895548,0.023574,0.981193,車道劃分設施-分道設施-快慢車道間名稱


In [165]:
# Compare group 0 with group 2 on the same feature list; `p` is the displayed result.
# NOTE(review): the recorded run of this cell emitted "The max_iter was reached
# which means the coef_ did not converge" — treat these coefficients with caution.
X, y, p = pval(full_0, full_2, lst_regression)
p


The max_iter was reached which means the coef_ did not converge



Unnamed: 0,coefficients,standard_error,wald_statistics,p_value,feature
速限-第1當事者,1.834529,0.712631,2.574306,0.010044,速限-第1當事者
車道劃分設施-分向設施大類別名稱,-0.275593,1.236846,-0.222819,0.823676,車道劃分設施-分向設施大類別名稱
車道劃分設施-分道設施-快車道或一般車道間名稱,0.230744,1.229244,0.187712,0.851102,車道劃分設施-分道設施-快車道或一般車道間名稱
道路型態大類別名稱,-0.190809,1.656939,-0.115157,0.90832,道路型態大類別名稱
車道劃分設施-分道設施-路面邊線名稱,-0.128587,1.215938,-0.105751,0.91578,車道劃分設施-分道設施-路面邊線名稱
事故位置大類別名稱,-0.109833,1.20398,-0.091225,0.927314,事故位置大類別名稱
道路類別-第1當事者-名稱,-0.073237,0.875459,-0.083656,0.93333,道路類別-第1當事者-名稱
號誌-號誌種類名稱,0.054555,0.835823,0.065271,0.947958,號誌-號誌種類名稱
光線名稱,0.048479,0.835738,0.058008,0.953742,光線名稱
車道劃分設施-分道設施-快慢車道間名稱,0.035905,0.835214,0.042988,0.965711,車道劃分設施-分道設施-快慢車道間名稱


In [166]:
# Compare group 1 with group 2 on the same feature list; `p` is the displayed result.
X, y, p = pval(full_1, full_2, lst_regression)
p

Unnamed: 0,coefficients,standard_error,wald_statistics,p_value,feature
速限-第1當事者,1.787718,0.645066,2.77137,0.005582,速限-第1當事者
車道劃分設施-分向設施大類別名稱,-0.225302,1.299253,-0.173409,0.86233,車道劃分設施-分向設施大類別名稱
道路型態大類別名稱,0.26786,1.986923,0.134811,0.892761,道路型態大類別名稱
事故位置大類別名稱,0.197809,1.699185,0.116414,0.907324,事故位置大類別名稱
事故類型及型態大類別名稱,0.107386,1.074203,0.099968,0.92037,事故類型及型態大類別名稱
車道劃分設施-分道設施-快車道或一般車道間名稱,0.12808,1.300362,0.098496,0.921539,車道劃分設施-分道設施-快車道或一般車道間名稱
車道劃分設施-分道設施-路面邊線名稱,-0.061311,1.100297,-0.055722,0.955563,車道劃分設施-分道設施-路面邊線名稱
號誌-號誌種類名稱,-0.04573,1.093107,-0.041834,0.966631,號誌-號誌種類名稱
道路類別-第1當事者-名稱,0.019506,0.910499,0.021423,0.982908,道路類別-第1當事者-名稱
車道劃分設施-分道設施-快慢車道間名稱,-0.011517,0.825539,-0.01395,0.988869,車道劃分設施-分道設施-快慢車道間名稱


In [167]:
def table(colnames, full_0, full_1, full_12):
    """Tabulate the relative frequency of ``colnames`` in three frames side by side.

    Parameters
    ----------
    colnames : str or list of str
        Column selector applied to each frame before ``value_counts``.
    full_0, full_1, full_12 : pandas.DataFrame
        The three groups to compare, in output-column order.

    Returns
    -------
    pandas.DataFrame
        One row per observed value and one column per group, holding the
        normalised share of that value; values missing from a group are 0.
        Column labels are fixed to the three group names used in this notebook.
    """
    group_labels = ['單路部分、低速線', '交岔路、低速限', '單路部分、高速限']

    shares = [
        frame[colnames].value_counts(normalize = True)
        for frame in (full_0, full_1, full_12)
    ]

    # Align the three distributions on the union of observed values.
    combined_df = pd.concat(shares, axis=1).fillna(0)
    combined_df.columns = group_labels

    return combined_df

In [170]:
# For every regression feature, print its name followed by the per-group share table.
for feature in lst_regression:
    print(feature)
    print(table(feature, full_0, full_1, full_2))

光線名稱
           單路部分、低速線   交岔路、低速限  單路部分、高速限
有照明且開啟     0.409214  0.310680  0.384615
有照明未開啟或故障  0.368564  0.508091  0.294872
無照明        0.222222  0.181230  0.320513
道路類別-第1當事者-名稱
        單路部分、低速線   交岔路、低速限  單路部分、高速限
市區道路    0.550136  0.621359  0.128205
村里道路    0.159892  0.223301  0.025641
省道      0.138211  0.058252  0.320513
縣道      0.070461  0.051780  0.012821
鄉道      0.051491  0.029126  0.012821
其他      0.016260  0.012945  0.025641
國道      0.008130  0.000000  0.397436
專用道路    0.002710  0.003236  0.000000
快速(公)道  0.002710  0.000000  0.076923
速限-第1當事者
   單路部分、低速線  交岔路、低速限  單路部分、高速限
0       1.0      1.0       0.0
1       0.0      0.0       1.0
道路型態大類別名稱
      單路部分、低速線  交岔路、低速限  單路部分、高速限
單路部分   0.98916      0.0  0.769231
圓環廣場   0.01084      0.0  0.000000
交岔路    0.00000      1.0  0.230769
事故位置大類別名稱
      單路部分、低速線   交岔路、低速限  單路部分、高速限
路段     0.97019  0.006472  0.743590
其他     0.02168  0.055016  0.012821
交流道    0.00813  0.000000  0.038462
交叉路口   0.00000  0.938511  0.205128
號誌-號誌種類名稱
        