In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin_min, silhouette_score

from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from functions import *

In [2]:
# Read the CSV and keep every row except the last two (positional row slice).
# NOTE(review): the two trailing rows are presumably footer/notes lines in the export — confirm against the file.
dataA1 = pd.read_csv("./Data/NPA_TMA1.csv").iloc[:-2]

In [4]:
def preprocess(input_data, select_lst, sample = 592):
    """Filter to rows whose '當事者順位' equals 1, select columns, and expand '死亡受傷人數'.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain a '當事者順位' column and every column named in ``select_lst``.
    select_lst : list
        Column labels to keep; must include '死亡受傷人數'.
    sample : int, optional
        Currently unused: the random sub-sampling step is disabled, so every
        matching row is kept regardless of this value.

    Returns
    -------
    tuple
        ``(dist_df, sample_data)`` where ``dist_df`` holds the selected columns
        with '死亡受傷人數' removed and the columns returned by
        ``split_death_injury`` appended, and ``sample_data`` holds the filtered
        rows (all original columns) re-indexed from 0.
    """
    is_first_party = input_data['當事者順位'] == 1
    sample_data = input_data[is_first_party].reset_index(drop=True)
    # Disabled alternatives kept for reference:
    #   .sample(sample).reset_index(drop=True)                -> random sub-sample
    #   sample_data = sample_data[sample_data['發生月份'] < 3]  -> restrict by '發生月份'

    dataA = sample_data[select_lst]
    death_injury_data = split_death_injury(dataA['死亡受傷人數'])

    # Append the split columns side by side, then remove the combined source column.
    dist_df = pd.concat([dataA, death_injury_data], axis=1).drop(columns=['死亡受傷人數'])
    return dist_df, sample_data

# Columns handed to preprocess(); commented-out entries are deliberately excluded.
select_lst = [
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',  # '道路障礙-視距品質名稱' excluded: collinear with the speed limit
    '道路型態大類別名稱',
    '事故位置大類別名稱',
    # '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱', '道路障礙-障礙物名稱'  -> excluded: almost every row is "no defect"
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度',
    '緯度',
]

# preprocess() returns a tuple: (feature frame, filtered raw rows).
dist_dfA1 = preprocess(dataA1, select_lst, sample=592)

# Stack the feature frame(s) row-wise under a fresh 0..n-1 index (a single frame at present).
rbind_data = pd.concat([dist_dfA1[0]], axis=0, ignore_index=True)

# Any '受傷' value above 1 is recoded as 2.
rbind_data.loc[rbind_data['受傷'].gt(1), '受傷'] = 2

# Binarise the speed limit: 1 when it exceeds 60, otherwise 0 (missing values also become 0).
rbind_data['速限-第1當事者'] = (rbind_data['速限-第1當事者'] > 60).astype('int64')

dist_df = process_data(rbind_data)

# Fit the scaler on every column, then rebuild a labelled frame from the scaled array.
scaler = StandardScaler().fit(dist_df)
full_dist = pd.DataFrame(scaler.transform(dist_df), columns=dist_df.columns)

# X1 excludes the '受傷', '死亡', '經度' and '緯度' columns.
X1 = full_dist.drop(columns=['受傷', '死亡', '經度', '緯度']).to_numpy()

full_dist.head()

Unnamed: 0,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,道路型態大類別名稱,事故位置大類別名稱,號誌-號誌種類名稱,車道劃分設施-分向設施大類別名稱,車道劃分設施-分道設施-快車道或一般車道間名稱,車道劃分設施-分道設施-快慢車道間名稱,車道劃分設施-分道設施-路面邊線名稱,事故類型及型態大類別名稱,經度,緯度,死亡,受傷
0,0.21059,1.156038,-0.372104,0.855389,0.169821,-0.54482,-1.122302,1.616064,-1.397714,-0.66626,1.380786,-0.487862,-1.582274,-0.162392,-0.667536
1,1.5229,-0.585373,-0.372104,-1.122698,-1.216859,3.038885,-1.122302,-0.888412,-1.397714,-0.66626,-0.206448,1.224667,1.159946,-0.162392,0.900644
2,-1.101719,1.736509,-0.372104,0.855389,0.863162,-0.54482,1.459574,0.989945,-1.397714,-0.66626,1.380786,-2.278252,-0.391081,-0.162392,-0.667536
3,-1.101719,-0.585373,-0.372104,-1.122698,-1.216859,3.038885,-0.261677,-0.888412,0.457548,1.500915,-0.206448,1.052954,1.20031,-0.162392,0.900644
4,1.5229,1.156038,-0.372104,0.855389,0.863162,-0.54482,0.598949,-0.888412,0.457548,-0.66626,1.380786,1.931796,0.399887,-0.162392,-0.667536


In [13]:
# NOTE: this entire cell is disabled — every statement below is commented out.
# If re-enabled it would: time a Mapper run on X1 using a 10-component PCA lens,
# print the elapsed seconds, and show an interactive 3-D layout of the graph.
# start_time = time.time()

# lens1 = PCA(10).fit_transform(X1)

# mapper_algo1 = MapperAlgorithm(
#     cover = CubicalCover(
#         n_intervals = 3,
#         overlap_frac = 0.5
#     ),
#     clustering = FailSafeClustering(
#         clustering = AgglomerativeClustering(3, linkage='ward'),
#         verbose = False)
# )

# mapper_graph1 = mapper_algo1.fit_transform(X1, lens1)

# end_time = time.time()
# elapsed_time = end_time - start_time
# print(elapsed_time)

# # Original label: 道路型態大類別名稱 (a column name).
# # NOTE(review): the `colors` argument below actually uses dist_df[['死亡']] — confirm which column is intended.
# mapper_plot1 = MapperLayoutInteractive(
#     mapper_graph1,
#     colors = dist_df[['死亡']].to_numpy(),
#     cmap = 'jet',
#     # agg = np.nanmean,
#     agg = most_frequent_nonan,
#     dim = 3,
#     iterations = 30,
#     seed = 5,
#     width = 800,
#     height = 500)

# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})