In [1]:
import os

# Switch the working directory to the PARENT of the directory the kernel was
# started in, so that relative paths used afterwards (e.g. "./Data/...")
# are resolved against that parent directory.
# NOTE(review): this is not idempotent -- every re-run of this cell without a
# kernel restart moves the working directory up one more level.
current_file_path = os.path.abspath(os.getcwd())        # absolute start directory (despite the name, not a file)
current_dir_path = os.path.dirname(current_file_path)   # one level above the start directory
parent_dir_path = os.path.dirname(current_dir_path)     # two levels above the start directory

os.chdir(current_dir_path)

import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin_min, silhouette_score

from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from functions import *

In [2]:
def _read_npa_csv(csv_path):
    """Read one CSV export and drop its last two rows.

    NOTE(review): the last two rows are presumably a non-data footer in the
    source files -- confirm against the raw CSVs.
    """
    return pd.read_csv(csv_path, low_memory=False)[:-2]


# One frame per source file; the individual names are kept for later cells.
data1, data2, data3, data4, data5, data6 = (
    _read_npa_csv("./Data/NPA_TMA2_1.csv"),
    _read_npa_csv("./Data/NPA_TMA2_2.csv"),
    _read_npa_csv("./Data/NPA_TMA2_3.csv"),
    _read_npa_csv("./Data/NPA_TMA2_4_new.csv"),
    _read_npa_csv("./Data/NPA_TMA2_5.csv"),
    _read_npa_csv("./Data/NPA_TMA2_6.csv"),
)

# Stack all six parts into a single frame with a fresh 0..n-1 index.
dataA2 = pd.concat(
    [data1, data2, data3, data4, data5, data6],
    ignore_index=True,
)

In [8]:
def preprocess(input_data, select_lst, sample = 592):
    """Keep rank-1 party rows, select columns and split the casualty column.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Raw accident table; must contain the column '當事者順位' and every
        column named in ``select_lst``.
    select_lst : list of str
        Columns to keep. It must include '死亡受傷人數', which is handed to
        ``split_death_injury`` and then removed from the result.
    sample : int, optional
        Currently unused: the random sub-sampling step that consumed it is
        disabled. Kept so existing callers keep working.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``dist_df``: the selected columns (without '死亡受傷人數') joined
        column-wise with the frame returned by ``split_death_injury``.
        ``sample_data``: all columns of the rows where '當事者順位' == 1,
        re-indexed from 0.
    """
    # Only rows whose party rank equals 1 are analysed.
    is_first_party = input_data['當事者順位'] == 1
    sample_data = input_data[is_first_party].reset_index(drop=True)

    selected = sample_data[select_lst]

    # split_death_injury comes from the project's `functions` module.
    # NOTE(review): presumably it yields the '死亡' / '受傷' count columns used
    # by later cells -- confirm in functions.py.
    casualties = split_death_injury(selected['死亡受傷人數'])

    dist_df = pd.concat([selected, casualties], axis=1).drop(columns=['死亡受傷人數'])

    return dist_df, sample_data

# Columns fed into preprocess(); one per line.
# The English glosses are approximate translations of the column names.
select_lst = [
    '光線名稱',                                    # lighting condition
    '道路類別-第1當事者-名稱',                      # road class (first party)
    '速限-第1當事者',                              # speed limit (first party)
    '道路型態大類別名稱',                          # road form, major category
    '事故位置大類別名稱',                          # accident location, major category
    '號誌-號誌種類名稱',                           # traffic-signal type
    '車道劃分設施-分向設施大類別名稱',              # lane division: directional divider, major category
    '車道劃分設施-分道設施-快車道或一般車道間名稱',  # lane division: between fast/general lanes
    '車道劃分設施-分道設施-快慢車道間名稱',          # lane division: between fast and slow lanes
    '車道劃分設施-分道設施-路面邊線名稱',            # lane division: road edge line
    '事故類型及型態大類別名稱',                     # accident type and form, major category
    '死亡受傷人數',                                # combined death/injury counts (split in preprocess)
    '經度',                                        # longitude
    '緯度',                                        # latitude
]

# preprocess() returns (selected feature frame, full rank-1 rows).
dist_dfA2 = preprocess(dataA2, select_lst, sample = 592)

# Work on a re-indexed copy of the feature frame so the edits below do not
# touch the frame stored inside dist_dfA2.
rbind_data = pd.concat([dist_dfA2[0]], axis=0, ignore_index=True)

# Cap the injury count: every value above 1 is recorded as 2.
multiple_injuries = rbind_data['受傷'] > 1
rbind_data.loc[multiple_injuries, '受傷'] = 2

# Binarise the speed limit: 1 when it is above 60, otherwise 0.
rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].gt(60).astype('int64')

# Colour key: speed-limit flag ('0'/'1') followed by the accident-type name.
speed_flag_text = rbind_data['速限-第1當事者'].astype(str)
rbind_data['color'] = speed_flag_text + rbind_data['事故類型及型態大類別名稱']

# Labels: process_data comes from the project's `functions` module.
# NOTE(review): presumably it encodes the categorical columns as numbers so
# that StandardScaler can consume them -- confirm in functions.py.
dist_df = process_data(rbind_data)

# Standardise every column, keeping the column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dist_df)
full_dist = pd.DataFrame(scaled_values, columns = dist_df.columns)

# Feature matrix for Mapper: outcome, location and colour columns are excluded.
non_feature_columns = ['受傷', '死亡', '經度', '緯度', 'color']
X1 = full_dist.drop(columns=non_feature_columns).to_numpy()

full_dist.head()

Unnamed: 0,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,道路型態大類別名稱,事故位置大類別名稱,號誌-號誌種類名稱,車道劃分設施-分向設施大類別名稱,車道劃分設施-分道設施-快車道或一般車道間名稱,車道劃分設施-分道設施-快慢車道間名稱,車道劃分設施-分道設施-路面邊線名稱,事故類型及型態大類別名稱,經度,緯度,死亡,受傷,color
0,-1.206974,-0.255608,-0.147511,1.215495,1.167157,-0.71305,0.477606,-0.818767,0.423229,-0.917151,-0.025904,-0.429782,0.140773,0.0,-0.636623,-0.110396
1,-1.206974,2.104467,6.779154,-0.811423,-0.870473,0.449901,-1.629982,1.127345,-3.838199,-0.917151,-0.025904,-1.14036,-1.297336,0.0,-0.636623,4.122549
2,-1.206974,-0.255608,-0.147511,1.215495,1.167157,-0.71305,1.180135,-0.818767,0.423229,1.090333,-0.025904,-0.359103,0.114745,0.0,1.570788,-0.110396
3,-1.206974,-0.255608,-0.147511,1.215495,1.167157,-0.71305,1.180135,-0.818767,0.423229,-0.917151,-0.025904,-0.617069,-0.456131,0.0,1.570788,-0.110396
4,-1.206974,-0.255608,-0.147511,1.215495,1.167157,-0.71305,0.477606,-0.818767,0.423229,1.090333,1.807917,-1.309713,-1.281229,0.0,-0.636623,1.300586


In [10]:
# Wall-clock timestamp taken before the Mapper run (not read again in this cell).
start_time = time.time()

# Lens: X1 projected onto its first 10 principal components.
pca_lens1 = PCA(n_components=10)
lens1 = pca_lens1.fit_transform(X1)

# Cover of the lens space: 4 intervals per lens dimension, 40% overlap.
cubical_cover1 = CubicalCover(n_intervals=4, overlap_frac=0.4)

# Ward-linkage agglomerative clustering into 3 clusters, wrapped in
# FailSafeClustering.
# NOTE(review): presumably the wrapper keeps the run alive when clustering
# fails on a cover element (e.g. fewer than 3 points) -- confirm in the
# tdamapper documentation.
ward_clusterer1 = AgglomerativeClustering(n_clusters=3, linkage='ward')
failsafe_clusterer1 = FailSafeClustering(clustering=ward_clusterer1, verbose=False)

mapper_algo1 = MapperAlgorithm(cover=cubical_cover1, clustering=failsafe_clusterer1)

mapper_graph1 = mapper_algo1.fit_transform(X1, lens1)

In [25]:
# mapper_plot1 = MapperLayoutInteractive(
#     mapper_graph1,
#     colors = dist_df[['color']].to_numpy(),
#     cmap = 'jet',
#     # agg = np.nanmean,
#     agg = most_frequent_nonan,
#     dim = 3,
#     iterations = 30,
#     seed = 5,
#     width = 800,
#     height = 500)

# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

In [17]:
# from statsmodels.stats.outliers_influence import variance_inflation_factor
# def calculate_vif(X):
#     vif = pd.DataFrame()
#     vif["features"] = X.columns
#     vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
#     return vif

# # The argument is the DataFrame of predictor variables (below: the feature columns of full_dist)
# vif_df = calculate_vif(full_dist[[
#     '光線名稱', '道路類別-第1當事者-名稱', '速限-第1當事者',
#     '道路型態大類別名稱', '事故位置大類別名稱',  '號誌-號誌種類名稱',
#     '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
#     '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
#     '事故類型及型態大類別名稱']]
#                       )
# print(vif_df)

In [19]:
# import pickle

# with open('CalculatedData/A2道路V2.pkl', 'wb') as f:
#     pickle.dump(mapper_graph1, f)

In [20]:
# Relative frequency (fractions summing to 1) of each road-form category in rbind_data.
rbind_data['道路型態大類別名稱'].value_counts(normalize = True)

交岔路     0.600674
單路部分    0.390065
圓環廣場    0.005574
其他      0.003651
平交道     0.000036
Name: 道路型態大類別名稱, dtype: float64

In [21]:
# Relative frequency of the injury count; values above 1 were set to 2 when rbind_data was built.
rbind_data['受傷'].value_counts(normalize = True)

1    0.711597
2    0.288403
Name: 受傷, dtype: float64

In [22]:
# Accident-type mix within each speed-limit group (rows sum to 1).
# Rows: binarised speed limit (0 = not above 60, 1 = above 60); columns: accident type.
group_sizes = rbind_data.groupby(['速限-第1當事者', '事故類型及型態大類別名稱']).size()
count_data = group_sizes.unstack(fill_value=0)

# Divide every row of counts by that row's total.
total_count = count_data.sum(axis='columns')
proportion_data = count_data.divide(total_count, axis='index')

proportion_data

事故類型及型態大類別名稱,人與車,平交道事故,車與車,車輛本身
速限-第1當事者,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.048034,6e-06,0.844605,0.107355
1,0.010626,0.0,0.823266,0.166107


In [23]:
# Accident-location mix within each speed-limit group (rows sum to 1).
# Rows: binarised speed limit (0 = not above 60, 1 = above 60); columns: accident location.
group_sizes = rbind_data.groupby(['速限-第1當事者', '事故位置大類別名稱']).size()
count_data = group_sizes.unstack(fill_value=0)

# Divide every row of counts by that row's total.
total_count = count_data.sum(axis='columns')
proportion_data = count_data.divide(total_count, axis='index')

proportion_data

事故位置大類別名稱,交叉路口,交流道,其他,路段
速限-第1當事者,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.572489,0.001631,0.015559,0.410321
1,0.286913,0.012584,0.010626,0.689877


In [24]:
# Relative frequency of the binarised speed-limit flag (1 = limit above 60, 0 = otherwise).
rbind_data['速限-第1當事者'].value_counts(normalize = True)

0    0.978704
1    0.021296
Name: 速限-第1當事者, dtype: float64

## 事故類型及型態大類別名稱 & 速限  
1. 紅: 高速限, 車與車
2. 綠: 低速限, 車輛本身
3. 淺藍: 低速限, 車與車
4. 深藍: 低速限, 人與車

In [108]:
# mapper_plot1 = MapperLayoutInteractive(
#     mapper_graph1,
#     colors = dist_df[['color']].to_numpy(),
#     cmap = 'jet',
#     # agg = np.nanmean,
#     agg = most_frequent_nonan,
#     dim = 3,
#     iterations = 30,
#     seed = 5,
#     width = 800,
#     height = 500)

# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

In [35]:
# import pickle

# with open('CalculatedData/A2個別.pkl', 'wb') as f:
#     pickle.dump(mapper_graph1, f)