In [1]:
import pickle
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive, MapperLayoutStatic
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from functions import *
from chi import *
from regressionP import *
from models import *

In [2]:
data1 = pd.read_csv("./Data/NPA_TMA2_1.csv", low_memory=False)[:-2]
data2 = pd.read_csv("./Data/NPA_TMA2_2.csv", low_memory=False)[:-2]
data3 = pd.read_csv("./Data/NPA_TMA2_3.csv", low_memory=False)[:-2]
data4 = pd.read_csv("./Data/NPA_TMA2_4.csv", low_memory=False)[:-2]
dataA2 = pd.concat([data1, data2, data3, data4], ignore_index=True)

dataA1 = pd.read_csv("./Data/NPA_TMA1.csv")[:-2]

In [5]:
def preprocess(input_data, select_lst, sample = 592):
    sample_data = input_data[input_data['當事者順位'] == 1].reset_index(drop=True, inplace=False)
    # sample_data = sample_data[sample_data['發生月份'] < 3]
    dataA = sample_data[select_lst]
    
    death_injury_data = split_death_injury(dataA['死亡受傷人數'])
    dist_df = pd.concat([dataA, death_injury_data], axis=1)
    dist_df.drop(columns=['死亡受傷人數'], inplace=True)
    
    return dist_df, sample_data

# List of columns to select
select_lst = [
    '天候名稱', 
    '路面狀況-路面狀態名稱',
    '肇因研判大類別名稱-主要', '當事者屬-性-別名稱', '當事者事故發生時年齡', 
    '車輛撞擊部位大類別名稱-最初',
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者', 
    '道路型態大類別名稱',
    '事故位置大類別名稱', 
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度', '緯度',
    '道路型態子類別名稱', '事故位置子類別名稱', '車道劃分設施-分向設施子類別名稱', '事故類型及型態子類別名稱', 
    '當事者行動狀態子類別名稱', '車輛撞擊部位子類別名稱-最初', '車輛撞擊部位子類別名稱-其他', '肇因研判子類別名稱-個別',
    
    '當事者區分-類別-大類別名稱-車種', '當事者區分-類別-子類別名稱-車種',
    '保護裝備名稱', '行動電話或電腦或其他相類功能裝置名稱', '當事者行動狀態大類別名稱',
    '車輛撞擊部位大類別名稱-其他', '肇因研判大類別名稱-個別', '肇事逃逸類別名稱-是否肇逃',
    '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱',
    '道路障礙-障礙物名稱', '道路障礙-視距品質名稱', '道路障礙-視距名稱','號誌-號誌動作名稱',
]

dist_dfA1 = preprocess(dataA1, select_lst, sample = 592)
dist_dfA2 = preprocess(dataA2, select_lst, sample = 11841) # 120420

    
rbind_data = pd.concat([dist_dfA1[0], dist_dfA2[0]], axis=0, ignore_index=True)

rbind_data.loc[rbind_data['受傷'] > 1, '受傷'] = 2
rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].apply(lambda x: 1 if x > 60 else 0)
rbind_data = process_age(rbind_data)

dist_df = process_data(rbind_data)
scaler = StandardScaler()

full_dist = pd.DataFrame(scaler.fit_transform(dist_df), columns = dist_df.columns)
X1 = full_dist.drop(['受傷', '死亡'], axis=1).to_numpy()

full_dist.head()

Unnamed: 0,天候名稱,路面狀況-路面狀態名稱,肇因研判大類別名稱-主要,當事者屬-性-別名稱,當事者事故發生時年齡,車輛撞擊部位大類別名稱-最初,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,道路型態大類別名稱,...,肇因研判大類別名稱-個別,肇事逃逸類別名稱-是否肇逃,路面狀況-路面鋪裝名稱,路面狀況-路面缺陷名稱,道路障礙-障礙物名稱,道路障礙-視距品質名稱,道路障礙-視距名稱,號誌-號誌動作名稱,死亡,受傷
0,-0.447747,-0.331183,0.509425,0.739901,-1.09744,-0.372169,0.146102,2.120706,-0.146069,1.210906,...,0.508523,-0.135001,-0.045604,-0.029347,0.001515,0.118545,0.060048,0.837867,13.660843,-2.803204
1,-0.447747,-0.331183,0.509425,0.739901,0.67326,1.175119,1.465217,-0.25172,-0.146069,-0.814068,...,0.508523,-0.135001,-0.045604,-0.029347,0.001515,0.118545,0.060048,-1.192531,13.660843,-0.620846
2,1.201,-0.331183,0.509425,0.739901,0.67326,-0.372169,-1.173013,2.911514,-0.146069,1.210906,...,0.508523,-0.135001,-0.045604,-0.029347,0.001515,0.118545,0.060048,0.837867,13.660843,-2.803204
3,-0.447747,-0.331183,0.509425,0.739901,-0.21209,-0.372169,-1.173013,-0.25172,-0.146069,-0.814068,...,0.508523,-0.135001,-0.045604,-0.029347,0.001515,0.118545,0.060048,-1.192531,13.660843,-0.620846
4,1.201,-0.331183,0.509425,0.739901,-1.09744,-0.372169,1.465217,2.120706,-0.146069,1.210906,...,0.508523,-0.135001,-0.045604,-0.029347,0.001515,0.118545,0.060048,0.837867,13.660843,-2.803204


In [6]:
with open('CalculatedData/new1.pkl', 'rb') as f:
    mapper_graph1 = pickle.load(f)

mapper_plot1 = MapperLayoutInteractive(
    mapper_graph1,
    colors = dist_df[['天候名稱']].to_numpy(),
    cmap = 'jet',
    # agg = np.nanmean,
    agg = most_frequent_nonan,
    dim = 3,
    iterations = 30,
    seed = 5,
    width = 800,
    height = 500)

In [32]:
# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

In [10]:
x = vars(mapper_plot1._MapperLayoutInteractive__fig)['_data_objs'][1]['x']
y = vars(mapper_plot1._MapperLayoutInteractive__fig)['_data_objs'][1]['y']
z = vars(mapper_plot1._MapperLayoutInteractive__fig)['_data_objs'][1]['z']

threeDimData = pd.DataFrame({'x': x, 'y': y, 'z': z})

import re
data_tuple = vars(mapper_plot1._MapperLayoutInteractive__fig)['_data_objs'][1]['text']

data = []
for item in data_tuple:
    color = int(re.search(r'color: (-?\d+)', item).group(1))
    node = int(re.search(r'node: (\d+)', item).group(1))
    size = int(re.search(r'size: (\d+)', item).group(1))
    data.append({'color': color, 'node': node, 'size': size})
component_info = pd.DataFrame(data)

full_info = pd.concat([component_info, threeDimData], axis=1)

mp_content_origin = vars(mapper_plot1._MapperLayoutInteractive__graph)['_node']

mp_content = pd.DataFrame.from_dict(mp_content_origin, orient='index')
mp_content.reset_index(inplace=True)
mp_content.rename(columns={'index': 'node'}, inplace=True)

full_info = pd.merge(full_info, mp_content, on=['node', 'size'], how='inner')

# calinski_data = get_calinski_from_db(full_info, 0.021)
calinski_data = get_calinski_from_db(full_info, 0.015)

full_info.head()

Unnamed: 0,color,node,size,x,y,z,ids,label
0,0,0,214,0.010557,0.052523,-0.029602,"[0, 34, 164, 210, 262, 317, 342, 386, 417, 503...",0
1,0,38,192,0.007043,0.048798,-0.02737,"[0, 12, 24, 47, 116, 164, 178, 182, 285, 317, ...",0
2,0,2,24,0.007296,-0.0703,-0.093677,"[1, 11643, 13481, 15825, 16714, 18757, 20676, ...",1
3,0,1723,298,-0.000934,-0.054598,-0.087617,"[1, 78, 518, 1513, 1614, 1930, 1943, 1966, 315...",1
4,0,2306,8,-0.005466,-0.049236,-0.082377,"[1, 15825, 16714, 20676, 63665, 75722, 78455, ...",1


In [45]:
labels = calinski_data[3]
db = calinski_data[2]
n_clusters_ = calinski_data[4]

unique_labels = set(labels)
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

def matplotlib_to_plotly(cmap, alpha=1):
    """rgba"""
    return f'rgba({int(cmap[0]*200)}, {int(cmap[1]*200)}, {int(cmap[2]*200)}, {alpha})'

# colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]  
colors = [matplotlib_to_plotly(plt.cm.Spectral(each), alpha=0.8) for each in np.linspace(0, 1, len(unique_labels))]
# fig = go.Figure()

# for k, col in zip(unique_labels, colors):
#     if k == -1:
#         # col = 'rgba(0,0,0,0)'
#         col = 'rgba(0,0,0,0)'

#     class_member_mask = labels == k

#     core_samples = full_info.iloc[:, 3:6][class_member_mask & core_samples_mask]
#     fig.add_trace(go.Scatter3d(
#         x=core_samples.iloc[:, 0],
#         y=core_samples.iloc[:, 1],
#         z=core_samples.iloc[:, 2],
#         mode='markers',
#         marker=dict(
#             size=6,
#             color=col,
#             opacity=0.8
#         ),
#         name=f'Cluster {k} Core'
#     ))

#     non_core_samples = full_info.iloc[:, 3:6][class_member_mask & ~core_samples_mask]
#     fig.add_trace(go.Scatter3d(
#         x=non_core_samples.iloc[:, 0],
#         y=non_core_samples.iloc[:, 1],
#         z=non_core_samples.iloc[:, 2],
#         mode='markers',
#         marker=dict(
#             size=6,
#             color=col,
#             opacity=0.5
#         ),
#         name=f'Cluster {k} Non-Core'
#     ))

# fig.update_layout(
#     title=f"Estimated number of clusters: {n_clusters_}",
#     margin=dict(l=0, r=0, b=0, t=0)
# )

# fig.show()

In [22]:
label_0 = full_info[full_info['label'] == 0]
label_1 = full_info[full_info['label'] == 1]
label_2 = full_info[full_info['label'] == 2]
label_3 = full_info[full_info['label'] == 3]
label_out = full_info[(full_info['label'] != 0) & (full_info['label'] != 1) & (full_info['label'] != 2) & (full_info['label'] != 3)]

count_0 = get_count_dict(label_0)
count_1 = get_count_dict(label_1)
count_2 = get_count_dict(label_2)
count_3 = get_count_dict(label_3)
count_out = get_count_dict(label_out)

print(full_info['label'].unique())

len(count_out)

[ 0  1 -1  2  3  4]


7137

In [20]:
# 處理函數，確保每次操作前後索引的一致性和唯一性
def drop_keys(dataframe, keys_list):
    for keys in keys_list:
        if keys:
            dataframe.drop(keys, errors='ignore', inplace=True)

# 各個集合的初始加載
full_0 = rbind_data.loc[count_0.keys()].copy()
full_1 = rbind_data.loc[count_1.keys()].copy()
full_2 = rbind_data.loc[count_2.keys()].copy()
full_3 = rbind_data.loc[count_3.keys()].copy()
full_out = rbind_data.loc[count_out.keys()].copy()

lst01 = list(count_0.keys() & count_1.keys()) 
lst02 = list(count_0.keys() & count_2.keys()) 
lst03 = list(count_0.keys() & count_3.keys()) 
lst12 = list(count_1.keys() & count_2.keys()) 
lst13 = list(count_1.keys() & count_3.keys())
lst23 = list(count_2.keys() & count_3.keys()) 
lsto0 = list(count_out.keys() & count_0.keys()) 
lsto1 = list(count_out.keys() & count_1.keys()) 
lsto2 = list(count_out.keys() & count_2.keys()) 
lsto3 = list(count_out.keys() & count_3.keys())

full_01 = full_0.loc[lst01]
full_02 = full_0.loc[lst02]
full_03 = full_0.loc[lst03]
full_12 = full_1.loc[lst12]
full_13 = full_1.loc[lst13]
full_23 = full_2.loc[lst23]

# 合併重複數據集並去重
full_combine = pd.concat([full_01, full_02, full_03, full_12, full_13, full_23, full_out], axis=0)
full_combine.reset_index(inplace=True)
full_combine.drop_duplicates(subset='index', keep='first', inplace=True)
full_combine.drop('index', axis=1, inplace=True)

# 移除每個數據集中的重複鍵
drop_keys(full_0, [lst01, lst02, lst03, lsto0])
drop_keys(full_1, [lst01, lst12, lst13, lsto1])
drop_keys(full_2, [lst02, lst12, lst23, lsto2])
drop_keys(full_3, [lst03, lst13, lst23, lsto3]) 

full_combine.shape[0] + full_0.shape[0] + full_1.shape[0] == rbind_data.shape[0]

False

In [21]:
full_combine.shape[0] + full_0.shape[0] + full_1.shape[0]

116859

In [24]:
lst_logit = [
    '路面狀況-路面狀態名稱',
    # '肇因研判大類別名稱-主要', # 降低預測值
    '當事者屬-性-別名稱', '當事者事故發生時年齡', 
    '車輛撞擊部位大類別名稱-最初',
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',
    '道路型態大類別名稱',
    '事故位置大類別名稱', 
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '事故位置子類別名稱', '事故類型及型態子類別名稱', 
    '當事者行動狀態子類別名稱', '車輛撞擊部位子類別名稱-最初', '肇因研判子類別名稱-個別',
    # '當事者區分-類別-大類別名稱-車種', 
    '當事者區分-類別-子類別名稱-車種',
    '保護裝備名稱', '行動電話或電腦或其他相類功能裝置名稱', '當事者行動狀態大類別名稱',
    '車輛撞擊部位大類別名稱-其他', '肇因研判大類別名稱-個別', '肇事逃逸類別名稱-是否肇逃',
    '號誌-號誌動作名稱',
    # '路面狀況-路面鋪裝名稱', '道路障礙-視距名稱', '車道劃分設施-分向設施子類別名稱', '車道劃分設施-分道設施-快慢車道間名稱', '車輛撞擊部位大類別名稱-其他',
    # '道路障礙-障礙物名稱', '道路型態子類別名稱', '路面狀況-路面缺陷名稱', '天候名稱', '車輛撞擊部位子類別名稱-其他', 
]
def get_clusterN_logit(cluster_data, lst):
    scaler = StandardScaler()
    
    c0_for_lm = process_data(cluster_data)
    c0_for_lm_X = pd.DataFrame(scaler.fit_transform(c0_for_lm), columns = c0_for_lm.columns)
    # c0_for_lm_y = cluster_data['受傷']
    # c0_for_lm_y = cluster_data.apply(lambda row: 1 if row['死亡'] != 0 else 0, axis=1)
    c0_for_lm_y = cluster_data.apply(lambda row: 2 if row['死亡'] != 0 else (2 if row['受傷'] >= 2 else 1), axis=1)
    # c0_for_lm_y = cluster_data.apply(lambda row: '非常嚴重' if row['死亡'] > 0 or row['受傷'] > 2 else ('嚴重' if row['受傷'] > 1 else '一般'), axis=1)
    c0_for_lm_X = c0_for_lm_X[lst]
    
    return c0_for_lm_X, c0_for_lm_y

full_combine_X,  full_combine_y = get_clusterN_logit(full_combine, lst_logit)
full_0_X, full_0_y = get_clusterN_logit(full_0, lst_logit)
full_1_X, full_1_y = get_clusterN_logit(full_1, lst_logit)
full_2_X, full_2_y = get_clusterN_logit(full_2, lst_logit)
full_3_X, full_3_y = get_clusterN_logit(full_3, lst_logit)

print(full_combine_y.value_counts())
print(full_0_y.value_counts())
print(full_1_y.value_counts())
print(full_2_y.value_counts())
print(full_3_y.value_counts())

1    5743
2    2748
dtype: int64
1    34717
2    12022
dtype: int64
1    42612
2    19017
dtype: int64
1    1124
2     452
dtype: int64
1    1458
2    1119
dtype: int64


In [25]:
full_0_X.shape

(46739, 28)

In [27]:
start_time = time.time()
matrix_combine, score_combine, cm_combine = logistic_cm_gridsearch(full_combine_X,  full_combine_y)
matrix_0, score_0, cm_0 = logistic_cm_gridsearch(full_0_X, full_0_y)
matrix_1, score_1, cm_1 = logistic_cm_gridsearch(full_1_X, full_1_y)
matrix_2, score_2, cm_2 = logistic_cm_gridsearch(full_2_X, full_2_y)
matrix_3, score_3, cm_3 = logistic_cm_gridsearch(full_3_X, full_3_y)
print(score_combine, score_0, score_1, score_2, score_3)
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge



0.6327251324308417 0.6220581942661532 0.6510627940937855 0.75 0.6763565891472868
352.39549112319946


In [30]:
de = full_combine_X.shape[0] + full_0_X.shape[0] + full_1_X.shape[0] + full_2_X.shape[0] + full_3_X.shape[0]
logit_avg_score = (full_combine_X.shape[0]/de)*score_combine + (full_0_X.shape[0]/de)*score_0 + (full_1_X.shape[0]/de)*score_1 + (full_2_X.shape[0]/de)*score_2 + (full_3_X.shape[0]/de)*score_3
print(round(logit_avg_score, 3))

0.64


In [40]:
def calculate_metrics(confusion_matrix):
    TP = confusion_matrix[1, 1]
    TN = confusion_matrix[0, 0]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return accuracy, precision, recall, f1_score

In [43]:
logistic_metrics = calculate_metrics(cm_combine + cm_0 + cm_1 + cm_2 + cm_3)
logistic_metrics
data = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
    "LR Mapper": logistic_metrics,
}
pd.DataFrame(data)

Unnamed: 0,Metric,LR Mapper
0,Accuracy,0.640405
1,Precision,0.430715
2,Recall,0.75389
3,F1 Score,0.54822


In [None]:
start_time = time.time()
svc_matrix_com, svc_score_com, svc_cm_combine = svc_cm_with_grid_search(full_combine_X, full_combine_y)
svc_matrix_0, svc_score_0, svc_cm_0 = svc_cm_with_grid_search(full_0_X, full_0_y)
svc_matrix_1, svc_score_1, svc_cm_1 = svc_cm_with_grid_search(full_1_X, full_1_y)
print(svc_score_com, svc_score_0, svc_score_1)
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)


Liblinear failed to converge, increase the number of iterations.


Liblinear failed to converge, increase the number of iterations.

