In [1]:
import os

# Resolve the working directory for the notebook.
# The original code always started from os.getcwd(), so every re-run of this cell
# moved the process one more directory level up and broke the relative "./Data/..."
# paths. Remember the directory the kernel started in and derive everything from it,
# which makes the cell idempotent within a kernel session.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.path.abspath(os.getcwd())

# Despite its name this is the start directory itself (no file path is available here).
current_file_path = NOTEBOOK_START_DIR
# Parent of the start directory: this becomes the working directory.
current_dir_path = os.path.dirname(current_file_path)
# One level above the working directory.
parent_dir_path = os.path.dirname(current_dir_path)

os.chdir(current_dir_path)

import pickle
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive, MapperLayoutStatic
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from functions import *
from chi import *
from regressionP import *
from models import *

In [2]:
# Load the six NPA_TMA2 parts. The trailing [:-2] drops the last two rows of each file
# (presumably non-data footer lines — TODO confirm against the raw CSVs).
tma2_csv_paths = [
    "./Data/NPA_TMA2_1.csv",
    "./Data/NPA_TMA2_2.csv",
    "./Data/NPA_TMA2_3.csv",
    "./Data/NPA_TMA2_4_new.csv",
    "./Data/NPA_TMA2_5.csv",
    "./Data/NPA_TMA2_6_new.csv",
]
data1, data2, data3, data4, data5, data6 = [
    pd.read_csv(csv_path, low_memory=False)[:-2] for csv_path in tma2_csv_paths
]

# Stack the parts into one frame with a fresh 0..n-1 index.
dataA2 = pd.concat([data1, data2, data3, data4, data5, data6], ignore_index=True)

# The TMA1 file is trimmed the same way but read with pandas' default low_memory setting.
dataA1 = pd.read_csv("./Data/NPA_TMA1_new.csv")[:-2]

In [3]:
# Columns handed to preprocess(): the road/scene attributes plus the casualty column.
select_lst = [
    '道路類別-第1當事者-名稱', '速限-第1當事者',
    '道路型態大類別名稱', '道路型態子類別名稱',
    '事故位置大類別名稱', '事故位置子類別名稱',
    '事故類型及型態大類別名稱', '事故類型及型態子類別名稱',
    '路面狀況-路面鋪裝名稱', '路面狀況-路面狀態名稱', '路面狀況-路面缺陷名稱',
    '道路障礙-障礙物名稱', '道路障礙-視距品質名稱', '道路障礙-視距名稱',
    '號誌-號誌種類名稱', '號誌-號誌動作名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分向設施子類別名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱', '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '死亡受傷人數'
]

# preprocess() is a project helper; only element [0] of each result is used below.
dist_dfA1, dist_dfA2 = (preprocess(frame, select_lst) for frame in (dataA1, dataA2))

rbind_data = pd.concat([dist_dfA1[0], dist_dfA2[0]], axis=0, ignore_index=True)
# Binarise the speed limit: 1 when it is above 60, otherwise 0.
rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].apply(lambda limit: int(limit > 60))

# The casualty column is not one-hot encoded, so take it out of the list first.
select_lst.remove('死亡受傷人數')

# One-hot encode every remaining selected column; note that rbind_data now names the dummy frame.
rbind_data = pd.get_dummies(rbind_data[select_lst], columns=select_lst)

X1 = rbind_data.to_numpy()

In [4]:
# Fit a 15-component PCA on the feature matrix and keep the projected data.
pca = PCA(n_components=15)
lens1 = pca.fit_transform(X1)

# Inspect the share of variance retained by each principal component:
# first the total over all fitted components, then the per-component ratios.
explained_variance_ratio = pca.explained_variance_ratio_
for variance_summary in (sum(explained_variance_ratio), explained_variance_ratio):
    print(variance_summary)

0.7968339261493916
[0.24139843 0.11647993 0.06990822 0.05897289 0.04888923 0.03367925
 0.03299789 0.03136332 0.02868323 0.02471669 0.02423999 0.02255262
 0.02212497 0.02188056 0.0189467 ]


In [5]:
start_time = time.time()

# Lens: X1 projected onto five principal components (replaces the 15-component lens1 above).
lens1 = PCA(5).fit_transform(X1)

# Build the cover and the clustering step separately, then hand both to the Mapper algorithm.
cubical_cover = CubicalCover(n_intervals=5, overlap_frac=0.45)
failsafe_clustering = FailSafeClustering(
    clustering=AgglomerativeClustering(3),
    verbose=False,
)
mapper_algo1 = MapperAlgorithm(cover=cubical_cover, clustering=failsafe_clustering)

mapper_graph1 = mapper_algo1.fit_transform(X1, lens1)

# Report the wall-clock seconds spent on the lens and the graph construction.
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)

678.5134079456329


In [7]:
# Node colour source: the one-hot column for speed-limit flag 0, as an (n, 1) array.
speed_flag_colors = rbind_data[['速限-第1當事者_0']].to_numpy()

# 3-D interactive layout; each node's colour is the NaN-ignoring mean of its members' values.
# An alternative aggregator (most_frequent_nonan) was tried here and left disabled.
mapper_plot1 = MapperLayoutInteractive(
    mapper_graph1,
    colors=speed_flag_colors,
    cmap='jet',
    agg=np.nanmean,
    dim=3,
    iterations=30,
    seed=6,
    width=800,
    height=500,
)

# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

In [1]:
# import pickle

# with open('CalculatedData/道路V2_dummy.pkl', 'wb') as f:
#     pickle.dump(mapper_graph1, f)

# 模型比較

In [30]:
# Columns used for the model-comparison data set
select_lst = [
    '天候名稱', 
    '路面狀況-路面狀態名稱',
    '肇因研判大類別名稱-主要', '當事者屬-性-別名稱', '當事者事故發生時年齡', 
    '車輛撞擊部位大類別名稱-最初',
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者', 
    '道路型態大類別名稱',
    '事故位置大類別名稱', 
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度', '緯度',
    '道路型態子類別名稱', '事故位置子類別名稱', '車道劃分設施-分向設施子類別名稱', '事故類型及型態子類別名稱', 
    '當事者行動狀態子類別名稱', '車輛撞擊部位子類別名稱-最初', '車輛撞擊部位子類別名稱-其他', '肇因研判子類別名稱-個別',
    
    '當事者區分-類別-大類別名稱-車種', '當事者區分-類別-子類別名稱-車種',
    '保護裝備名稱', '行動電話或電腦或其他相類功能裝置名稱', '當事者行動狀態大類別名稱',
    '車輛撞擊部位大類別名稱-其他', '肇因研判大類別名稱-個別', '肇事逃逸類別名稱-是否肇逃',
    '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱',
    '道路障礙-障礙物名稱', '道路障礙-視距品質名稱', '道路障礙-視距名稱','號誌-號誌動作名稱',
]

# preprocess() is a project helper; only element [0] of each result is used below.
dist_dfA1, dist_dfA2 = (preprocess(frame, select_lst) for frame in (dataA1, dataA2))

rbind_data = pd.concat([dist_dfA1[0], dist_dfA2[0]], axis=0, ignore_index=True)

# Cap the injured count: every value above 1 is stored as 2.
multi_injury_rows = rbind_data['受傷'] > 1
rbind_data.loc[multi_injury_rows, '受傷'] = 2
# Binarise the speed limit: 1 when it is above 60, otherwise 0.
rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].apply(lambda limit: int(limit > 60))
rbind_data = process_age(rbind_data)
# Colour key: the binarised speed limit (as text) followed by the accident-type major category.
rbind_data['color'] = rbind_data['速限-第1當事者'].astype(str) + rbind_data['事故類型及型態大類別名稱']

dist_df = process_data(rbind_data)
scaler = StandardScaler()

# Standardise every column of dist_df, keeping the column labels.
full_dist = pd.DataFrame(scaler.fit_transform(dist_df), columns=dist_df.columns)
# The feature matrix leaves out the two outcome columns.
X1 = full_dist.drop(columns=['受傷', '死亡']).to_numpy()

# full_dist.head()

In [16]:
# # List of columns to select
# select_lst = [
#     '道路類別-第1當事者-名稱', '速限-第1當事者', 
#     '道路型態大類別名稱', '道路型態子類別名稱',
#     '事故位置大類別名稱', '事故位置子類別名稱', 
#     '事故類型及型態大類別名稱', '事故類型及型態子類別名稱',
#     '路面狀況-路面鋪裝名稱', '路面狀況-路面狀態名稱', '路面狀況-路面缺陷名稱',
#     '道路障礙-障礙物名稱', '道路障礙-視距品質名稱', '道路障礙-視距名稱',
#     '號誌-號誌種類名稱', '號誌-號誌動作名稱',
#     '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分向設施子類別名稱',
#     '車道劃分設施-分道設施-快車道或一般車道間名稱', '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
#     '死亡受傷人數'
# ]

# dist_dfA1 = preprocess(dataA1, select_lst)
# dist_dfA2 = preprocess(dataA2, select_lst)

# rbind_data = pd.concat([dist_dfA1[0], dist_dfA2[0]], axis=0, ignore_index=True)
# rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].apply(lambda x: 1 if x > 60 else 0)

# select_lst.remove('死亡受傷人數')

# dist_df = process_data(rbind_data)
# scaler = StandardScaler()
# full_dist = pd.DataFrame(scaler.fit_transform(dist_df), columns = dist_df.columns)
# rbind_data = pd.get_dummies(rbind_data[select_lst], columns=select_lst)

# X1 = rbind_data.to_numpy()
# # rbind_data

# merged_df = pd.concat([dist_dfA1[0][['死亡', '受傷']], dist_dfA2[0][['死亡', '受傷']]], axis=0)

In [29]:
# rbind_data_reset = rbind_data.reset_index(drop=True)
# merged_df_reset = merged_df.reset_index(drop=True)

# rbind_data = pd.concat([rbind_data_reset, merged_df_reset], axis=1)

In [6]:
# Reload the previously computed Mapper graph instead of rebuilding it.
# NOTE: pickle.load can execute arbitrary code — only load graph files produced by this project.
with open('CalculatedData/道路V2_dummy.pkl', 'rb') as f:
    mapper_graph1 = pickle.load(f)

# Node colour source: indicator of "speed-limit flag == 0".
# The one-hot column '速限-第1當事者_0' exists only while rbind_data is the dummy-encoded frame.
# When the notebook is run top to bottom, rbind_data has been rebuilt with the binary
# '速限-第1當事者' column instead, and indexing the dummy column raised a KeyError.
# Derive the same indicator from the binary column in that case.
# NOTE(review): the pickled graph was built from the earlier feature matrix — confirm that
# rbind_data still has the same rows in the same order before trusting the colours.
if '速限-第1當事者_0' in rbind_data.columns:
    speed_flag_colors = rbind_data[['速限-第1當事者_0']].to_numpy()
else:
    speed_flag_colors = rbind_data[['速限-第1當事者']].eq(0).astype(int).to_numpy()

# 3-D interactive layout; each node's colour is the mean of its members' indicator values.
mapper_plot1 = MapperLayoutInteractive(
    mapper_graph1,
    colors = speed_flag_colors,
    cmap = 'jet',
    # agg = most_frequent_nonan,
    agg = np.mean,
    dim = 3,
    iterations = 30,
    seed = 6,
    width = 800,
    height = 500)

# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

In [31]:
import re

# The layout object keeps its figure in a name-mangled private attribute. Entry [1] of the
# figure's '_data_objs' supplies the coordinates and hover text used below
# (presumably the node trace — TODO confirm against the installed tdamapper version).
node_trace = vars(mapper_plot1._MapperLayoutInteractive__fig)['_data_objs'][1]
x, y, z = (node_trace[axis] for axis in ('x', 'y', 'z'))

threeDimData = pd.DataFrame({'x': x, 'y': y, 'z': z})

data_tuple = node_trace['text']

# Pull the three integer fields out of every hover label.
# NOTE(review): 'color: (-?\d+)' captures only the digits before any decimal point, so a
# fractional colour value is reduced to its integer part — confirm this is intended.
data = [
    {
        field: int(re.search(pattern, item).group(1))
        for field, pattern in (
            ('color', r'color: (-?\d+)'),
            ('node', r'node: (\d+)'),
            ('size', r'size: (\d+)'),
        )
    }
    for item in data_tuple
]
component_info = pd.DataFrame(data)

# Side-by-side: parsed label fields and the 3-D coordinates (aligned by position).
full_info = pd.concat([component_info, threeDimData], axis=1)

# Per-node attribute dictionaries stored on the layout's private graph object.
mp_content_origin = vars(mapper_plot1._MapperLayoutInteractive__graph)['_node']

# One row per node, with the node id moved from the index into a 'node' column.
mp_content = (
    pd.DataFrame.from_dict(mp_content_origin, orient='index')
    .reset_index()
    .rename(columns={'index': 'node'})
)

# Keep only nodes that match on both id and size.
full_info = full_info.merge(mp_content, on=['node', 'size'], how='inner')

In [32]:
# get_calinski_from_db() is a project helper; positions 2, 3 and 4 of its result are used here.
# NOTE(review): later cells read full_info['label'], which is presumably added by this call — confirm.
calinski_data = get_calinski_from_db(full_info, 0.15)
db, labels, n_clusters_ = (calinski_data[position] for position in (2, 3, 4))

print(n_clusters_)

# do_plot(full_info, calinski_data, labels, db, n_clusters_)

2


In [33]:
# Split the Mapper nodes by cluster label: 0, 1, and everything else.
cluster_label = full_info['label']
label_0 = full_info[cluster_label == 0]
label_1 = full_info[cluster_label == 1]
label_out = full_info[~((cluster_label == 1) | (cluster_label == 0))]

# get_count_dict() is a project helper; its keys are used below as row labels of rbind_data.
count_0, count_1, count_out = (
    get_count_dict(node_subset) for node_subset in (label_0, label_1, label_out)
)

full_0 = rbind_data.loc[count_0.keys()]
full_1 = rbind_data.loc[count_1.keys()]
# Outliers need no further processing
full_out = rbind_data.loc[count_out.keys()]

# Row labels that appear in more than one group
lst01 = list(count_0.keys() & count_1.keys())
lsto0 = list(count_out.keys() & count_0.keys())
lsto1 = list(count_out.keys() & count_1.keys())

# Node
full_01 = full_0.loc[lst01]

# Rows shared by clusters 0 and 1, de-duplicated on their original row label.
full_combine = (
    pd.concat([full_01], axis=0)
    .reset_index()
    .drop_duplicates(subset='index', keep='first')
    .drop('index', axis=1)
)

# Remove the connecting points so that the analysis is more rigorous
full_0 = full_0.drop(lst01 + lsto0, errors='ignore')
full_1 = full_1.drop(lst01 + lsto1, errors='ignore')

for caption, amount in (
    ('01連接點數量', len(lst01)),
    ('o0連接點數量', len(lsto0)),
    ('o1連接點數量', len(lsto1)),
    ('離群值數量', full_out.shape[0]),
):
    print(caption, amount)

# Partition check: do the four groups add up to the full data set?
# NOTE(review): the recorded output of this expression is False, so the groups do not
# partition rbind_data as written — confirm whether rows are double-counted or missing.
sum(part.shape[0] for part in (full_combine, full_0, full_1, full_out)) == rbind_data.shape[0]

01連接點數量 5576
o0連接點數量 6362
o1連接點數量 1026
離群值數量 7541


False

In [37]:
# Predictor columns passed to get_clusterN_logit() for every group.
lst_logit = [
    '路面狀況-路面狀態名稱', '當事者屬-性-別名稱', '當事者事故發生時年齡', 
    '車輛撞擊部位大類別名稱-最初', 
    '光線名稱', '道路類別-第1當事者-名稱',
    '速限-第1當事者', '道路型態大類別名稱', '事故位置大類別名稱', 
    '號誌-號誌種類名稱', '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱', '事故位置子類別名稱', '事故類型及型態子類別名稱', 
    '當事者行動狀態子類別名稱', '車輛撞擊部位子類別名稱-最初', '肇因研判子類別名稱-個別',
    '當事者區分-類別-子類別名稱-車種', '保護裝備名稱', '行動電話或電腦或其他相類功能裝置名稱', 
    '當事者行動狀態大類別名稱', '車輛撞擊部位大類別名稱-其他', 
    '肇因研判大類別名稱-個別', '肇事逃逸類別名稱-是否肇逃', '路面狀況-路面鋪裝名稱', 
    # Disabled candidates:
    # '車道劃分設施-分向設施子類別名稱', '道路障礙-障礙物名稱', '車輛撞擊部位子類別名稱-其他',

    # '號誌-號誌動作名稱', '當事者區分-類別-大類別名稱-車種', '肇因研判大類別名稱-主要', # lowers the prediction score
    # '道路障礙-視距名稱', '車道劃分設施-分道設施-快慢車道間名稱', '車輛撞擊部位大類別名稱-其他', # lowers the prediction score
    # '道路型態子類別名稱', '路面狀況-路面缺陷名稱', '天候名稱' # lowers it
]

# Build the (X, y) pair of each group with the project helper, in the order
# shared rows, cluster 0, cluster 1, outliers.
(
    (full_combine_X, full_combine_y),
    (full_0_X, full_0_y),
    (full_1_X, full_1_y),
    (full_out_X, full_out_y),
) = (
    get_clusterN_logit(group_rows, lst_logit)
    for group_rows in (full_combine, full_0, full_1, full_out)
)

# Class balance of the target in each group.
for group_target in (full_combine_y, full_out_y, full_0_y, full_1_y):
    print(group_target.value_counts())

2    5531
1      45
dtype: int64
2    7497
1      44
dtype: int64
2    100396
1       478
dtype: int64
2    75819
1      221
dtype: int64


In [39]:
# pca = PCA(n_components=5)
# full_0_X = pca.fit_transform(full_0_X)
# full_1_X = pca.fit_transform(full_1_X)

In [45]:
start_time = time.time()

# Run the project's logistic grid search on the shared rows first, then on the outliers.
(matrix_combine, score_combine, cm_combine), (matrix_out, score_out, cm_out) = (
    logistic_cm_gridsearch(features, target)
    for features, target in (
        (full_combine_X, full_combine_y),
        (full_out_X, full_out_y),
    )
)
print(score_combine, score_out)

# Wall-clock seconds for both searches.
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)

0.6111111111111112 0.6
62.06708002090454


In [40]:
start_time = time.time()

# Run the project's logistic grid search per cluster; the printed marker after each
# search shows which cluster has just finished.
cluster_fits = {}
for marker, features, target in (
    ('0', full_0_X, full_0_y),
    ('1', full_1_X, full_1_y),
):
    cluster_fits[marker] = logistic_cm_gridsearch(features, target)
    print(marker)

matrix_0, score_0, cm_0 = cluster_fits['0']
matrix_1, score_1, cm_1 = cluster_fits['1']

print(score_0, score_1)

# Wall-clock seconds for both searches.
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)

0
1
0.7013422818791947 0.7619047619047619
666.6848707199097


In [44]:
# Pool the confusion matrices of the two cluster models
# (assumes cm_0 and cm_1 are numeric arrays that add element-wise — TODO confirm).
cm_0_1 = cm_0 + cm_1

# Read the four cells row by row: [0][0], [0][1], [1][0], [1][1].
# NOTE(review): which cell is TP/FP/FN/TN depends on how logistic_cm_gridsearch orders the
# classes — verify the naming; the accuracy below only needs diagonal vs. total.
TP, FP, FN, TN = (cm_0_1[row][col] for row in (0, 1) for col in (0, 1))

# Accuracy recomputed from the pooled matrix: diagonal over all four cells.
correct_total = TP + TN
grand_total = TP + TN + FP + FN
accuracy_given_cm = correct_total / grand_total
accuracy_given_cm

0.7193396226415094

# 顯著特徵

In [49]:
def table(colnames, full_0, full_1):
    """Compare the relative frequencies of ``colnames`` between two frames.

    Parameters
    ----------
    colnames : column label (or list of labels) present in both frames.
    full_0, full_1 : pandas DataFrames to compare.

    Returns
    -------
    pandas.DataFrame indexed by the observed values, with column 'A' holding the
    normalized value counts from ``full_0`` and column 'B' those from ``full_1``;
    values seen in only one frame get 0 in the other column.
    """
    # Normalized frequencies for each frame, in the order (full_0, full_1).
    shares = [frame[colnames].value_counts(normalize = True) for frame in (full_0, full_1)]

    # Align on the observed values; a value absent from one frame becomes 0.
    combined_df = pd.concat(shares, axis=1).fillna(0)
    combined_df.columns = ['A', 'B']

    return combined_df

In [53]:
# for i in full_0.columns:
#     print(table(i, full_0, full_1))

In [82]:
# X, y, p = pval(full_0, full_1, [x for x in full_1.columns if x not in ['受傷', '死亡']])
# p