In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin_min, silhouette_score

from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from functions import *

In [2]:
# Load the four NPA_TMA2 CSV parts. The last two rows of every file are dropped
# (NOTE(review): presumably non-data footer lines -- confirm against the raw files).
csv_parts = [
    "./Data/NPA_TMA2_1.csv",
    "./Data/NPA_TMA2_2.csv",
    "./Data/NPA_TMA2_3.csv",
    "./Data/NPA_TMA2_4.csv",
]
data1, data2, data3, data4 = (
    pd.read_csv(csv_path, low_memory=False)[:-2] for csv_path in csv_parts
)

# Stack the parts into one frame with a fresh 0..n-1 index.
dataA2 = pd.concat([data1, data2, data3, data4], ignore_index=True)

In [3]:
def preprocess(input_data, select_lst, sample = 592):
    """Filter the raw records, keep the chosen columns and expand the casualty column.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Raw records. Must contain '當事者順位' and every column named in
        ``select_lst``.
    select_lst : list of str
        Columns to keep. Must include '死亡受傷人數', which is handed to
        ``split_death_injury`` and then removed from the result.
    sample : int, optional
        Currently unused: the random sub-sampling step is disabled, so every
        row with '當事者順位' == 1 is kept regardless of this value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``dist_df``: the selected columns (without '死亡受傷人數') joined
        column-wise with whatever ``split_death_injury`` returns.
        ``sample_data``: the filtered rows with all original columns and a
        fresh 0..n-1 index.
    """
    # Keep only rows whose '當事者順位' is 1, then renumber the index.
    is_rank_one = input_data['當事者順位'] == 1
    sample_data = input_data[is_rank_one].reset_index(drop=True)

    selected = sample_data[select_lst]

    # split_death_injury comes from the project's `functions` module.
    # NOTE(review): presumably yields '死亡' / '受傷' columns -- confirm there.
    casualties = split_death_injury(selected['死亡受傷人數'])

    dist_df = pd.concat([selected, casualties], axis=1).drop(columns=['死亡受傷人數'])
    return dist_df, sample_data

# Columns fed into preprocess(); order matters because it fixes the column
# order of the resulting frame.
select_lst = [
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',
    # '道路障礙-視距品質名稱',  # excluded: collinear with the speed limit
    # '道路型態大類別名稱',
    '事故位置大類別名稱',
    # Excluded because almost every record falls in the "no defect" category:
    # '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱', '道路障礙-障礙物名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度',
    '緯度',
]

# preprocess returns a tuple: (selected feature frame, filtered raw rows).
dist_dfA2 = preprocess(dataA2, select_lst, sample = 592)

# Rebuild the feature frame as a new object with a fresh 0..n-1 index.
rbind_data = pd.concat([dist_dfA2[0]], axis=0, ignore_index=True)

# Cap the injured count: every value above 1 is recorded as 2.
injured_over_one = rbind_data['受傷'] > 1
rbind_data.loc[injured_over_one, '受傷'] = 2

# Binarise the speed limit: 1 when it exceeds 60, otherwise 0.
rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].gt(60).astype('int64')

# Colour key: speed-limit flag (as text) followed by the accident-type name.
speed_flag = rbind_data['速限-第1當事者'].astype(str)
rbind_data['color'] = speed_flag + rbind_data['事故類型及型態大類別名稱']

# process_data comes from the project's `functions` module.
# NOTE(review): presumably label-encodes the categorical columns -- confirm there.
dist_df = process_data(rbind_data)

# Standardise every column, keeping the column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dist_df)
full_dist = pd.DataFrame(scaled_values, columns=dist_df.columns)

# Outcome, location and colour columns are left out of the Mapper input matrix.
non_feature_cols = ['受傷', '死亡', '經度', '緯度', 'color']
X1 = full_dist.drop(columns=non_feature_cols).to_numpy()

full_dist.head()

Unnamed: 0,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,事故位置大類別名稱,號誌-號誌種類名稱,車道劃分設施-分向設施大類別名稱,車道劃分設施-分道設施-快車道或一般車道間名稱,車道劃分設施-分道設施-快慢車道間名稱,車道劃分設施-分道設施-路面邊線名稱,事故類型及型態大類別名稱,經度,緯度,死亡,受傷,color
0,-1.173378,-0.249747,-0.144291,1.16641,-0.714624,0.475216,-0.822583,0.422543,-0.918429,-0.012539,-0.449723,0.121455,0.0,-0.637094,-0.096852
1,-1.173378,2.129445,6.930436,-0.871198,0.447229,-1.629979,1.123459,-3.821565,-0.918429,-0.012539,-1.168584,-1.323487,0.0,-0.637094,4.155553
2,-1.173378,-0.249747,-0.144291,1.16641,-0.714624,1.176948,-0.822583,0.422543,1.088816,-0.012539,-0.378221,0.095303,0.0,1.569627,-0.096852
3,-1.173378,-0.249747,-0.144291,1.16641,-0.714624,1.176948,-0.822583,0.422543,-0.918429,-0.012539,-0.639194,-0.478286,0.0,1.569627,-0.096852
4,-1.173378,-0.249747,-0.144291,1.16641,-0.714624,0.475216,-0.822583,0.422543,1.088816,1.806612,-1.339912,-1.307304,0.0,-0.637094,1.320616


In [82]:
# Preview the first ten rows of the frame returned by process_data.
dist_df.head(n=10)

Unnamed: 0,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,道路型態大類別名稱,事故位置大類別名稱,號誌-號誌種類名稱,車道劃分設施-分向設施大類別名稱,車道劃分設施-分道設施-快車道或一般車道間名稱,車道劃分設施-分道設施-快慢車道間名稱,車道劃分設施-分道設施-路面邊線名稱,事故類型及型態大類別名稱,經度,緯度,死亡,受傷,color
0,0,3,0,2,3,0,3,0,2,0,2,120.624083,24.209466,0,1,2
1,0,6,1,0,0,1,0,3,0,0,2,120.272716,22.973301,0,1,5
2,0,3,0,2,3,0,4,0,2,1,2,120.659032,24.187093,0,2,2
3,0,3,0,2,3,0,4,0,2,0,2,120.531473,23.696381,0,2,2
4,0,3,0,2,3,0,3,0,2,1,3,120.188974,22.987146,0,1,3
5,0,3,0,2,3,3,0,3,1,0,2,121.220833,24.99372,0,2,2
6,0,3,0,0,0,1,0,0,2,0,2,120.199641,22.996038,0,2,2
7,0,3,0,2,3,0,3,4,2,0,2,121.734898,25.094595,0,2,2
8,0,3,0,2,3,0,3,4,2,1,0,121.566408,25.04309,0,1,0
9,0,8,0,0,0,0,0,3,2,0,2,121.460877,25.134899,0,2,2


In [93]:
# rbind_data#[(rbind_data['事故類型及型態大類別名稱'] == '車輛本身') & (rbind_data['速限-第1當事者'] == 0)]

In [30]:
# Build the Mapper graph for X1 and report how long the whole cell took.
start_time = time.time()

# Lens: projection of X1 onto its first 10 principal components.
lens1 = PCA(10).fit_transform(X1)

mapper_algo1 = MapperAlgorithm(
    # Cover the lens space with 4 intervals per dimension, overlapping by 40%.
    cover = CubicalCover(
        n_intervals = 4,
        overlap_frac = 0.4
    ),
    # Ward-linkage agglomerative clustering with 3 clusters, wrapped in
    # FailSafeClustering with verbose output switched off.
    clustering = FailSafeClustering(
        clustering = AgglomerativeClustering(3, linkage='ward'),
        verbose = False)
)

mapper_graph1 = mapper_algo1.fit_transform(X1, lens1)

# start_time was recorded but never read; compute and show the elapsed time
# so the cost of this cell is visible to the reader.
elapsed_time = time.time() - start_time
print(f"Mapper graph built in {elapsed_time:.1f} s")

In [90]:
# '道路型態大類別名稱' is commented out of select_lst, so rbind_data does not carry
# it and indexing rbind_data here raises KeyError on a fresh kernel. Read the
# column from the filtered raw frame returned by preprocess (dist_dfA2[1]),
# which keeps every original column for the same filtered rows.
dist_dfA2[1]['道路型態大類別名稱'].value_counts(normalize = True)

交岔路     0.599875
單路部分    0.390716
圓環廣場    0.005730
其他      0.003637
平交道     0.000042
Name: 道路型態大類別名稱, dtype: float64

In [109]:
# Relative frequency of each value in the '受傷' column.
rbind_data['受傷'].value_counts(normalize=True)

1    0.711294
2    0.288706
Name: 受傷, dtype: float64

In [103]:
# Count rows per (speed-limit flag, accident type) pair; missing pairs become 0.
count_data = (
    rbind_data
    .groupby(['速限-第1當事者', '事故類型及型態大類別名稱'])
    .size()
    .unstack(fill_value=0)
)

# Divide each row by its own total so every speed-limit group sums to 1.
total_count = count_data.sum(axis='columns')
proportion_data = count_data.div(total_count, axis='index')
proportion_data

事故類型及型態大類別名稱,人與車,平交道事故,車與車,車輛本身
速限-第1當事者,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.04999,8e-06,0.845673,0.104328
1,0.012215,0.0,0.833876,0.153909


In [4]:
# Count rows per (speed-limit flag, accident location) pair; missing pairs become 0.
count_data = (
    rbind_data
    .groupby(['速限-第1當事者', '事故位置大類別名稱'])
    .size()
    .unstack(fill_value=0)
)

# Divide each row by its own total so every speed-limit group sums to 1.
total_count = count_data.sum(axis='columns')
proportion_data = count_data.div(total_count, axis='index')
proportion_data

事故位置大類別名稱,交叉路口,交流道,其他,路段
速限-第1當事者,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.5717,0.001551,0.015818,0.41093
1,0.292345,0.013029,0.012215,0.68241


In [99]:
# Share of rows in each speed-limit group (0 = at most 60, 1 = above 60).
rbind_data['速限-第1當事者'].value_counts(normalize=True)

0    0.979605
1    0.020395
Name: 速限-第1當事者, dtype: float64

## 事故類型及型態大類別名稱 & 速限  
1. 紅: 高速限, 車與車
2. 綠: 低速限, 車輛本身
3. 淺藍: 低速限, 車與車
4. 深藍: 低速限, 人與車

In [108]:
# mapper_plot1 = MapperLayoutInteractive(
#     mapper_graph1,
#     colors = dist_df[['color']].to_numpy(),
#     cmap = 'jet',
#     # agg = np.nanmean,
#     agg = most_frequent_nonan,
#     dim = 3,
#     iterations = 30,
#     seed = 5,
#     width = 800,
#     height = 500)

# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

In [35]:
# import pickle

# with open('CalculatedData/A2個別.pkl', 'wb') as f:
#     pickle.dump(mapper_graph1, f)