In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn.cluster import DBSCAN, KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances_argmin_min
import plotly.graph_objects as go
from sklearn.decomposition import PCA

from functions import *
from chi import *
from regressionP import *
from models import *

In [2]:
data1 = pd.read_csv("./Data/NPA_TMA2_1.csv", low_memory=False)[:-2]
data2 = pd.read_csv("./Data/NPA_TMA2_2.csv", low_memory=False)[:-2]
data3 = pd.read_csv("./Data/NPA_TMA2_3.csv", low_memory=False)[:-2]
data4 = pd.read_csv("./Data/NPA_TMA2_4.csv", low_memory=False)[:-2]
dataA1 = pd.read_csv("./Data/NPA_TMA1.csv")[:-2]

dataA2 = pd.concat([data1, data2, data3, data4], ignore_index=True)

In [10]:
def preprocess(input_data, select_lst, sample = 592):
    sample_data = input_data[input_data['當事者順位'] == 1].reset_index(drop=True, inplace=False)
    dataA = sample_data[select_lst]
    
    death_injury_data = split_death_injury(dataA['死亡受傷人數'])
    dist_df = pd.concat([dataA, death_injury_data], axis=1)
    dist_df.drop(columns=['死亡受傷人數'], inplace=True)
    
    return dist_df, sample_data

# List of columns to select
select_lst = [
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者', #'道路障礙-視距品質名稱', # 和速限共線
    '道路型態大類別名稱', '事故位置大類別名稱', 
    # '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱', '道路障礙-障礙物名稱', # 分類幾乎都是無缺陷
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱', '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱', '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度', '緯度',
]

dist_dfA1 = preprocess(dataA1, select_lst, sample = 592)
dist_dfA2 = preprocess(dataA2, select_lst, sample = 11840) # 120420

    
rbind_data = pd.concat([dist_dfA1[0], dist_dfA2[0]], axis=0, ignore_index=True)

rbind_data.loc[rbind_data['受傷'] > 1, '受傷'] = 2
rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].apply(lambda x: 1 if x > 60 else 0)
# rbind_data = process_age(rbind_data)

dist_df = process_data(rbind_data)
scaler = StandardScaler()

full_dist = pd.DataFrame(scaler.fit_transform(dist_df), columns = dist_df.columns)
X1 = full_dist.drop(['受傷', '死亡', '經度', '緯度'], axis=1)

full_dist.head()

Unnamed: 0,光線名稱,道路類別-第1當事者-名稱,速限-第1當事者,道路型態大類別名稱,事故位置大類別名稱,號誌-號誌種類名稱,車道劃分設施-分向設施大類別名稱,車道劃分設施-分道設施-快車道或一般車道間名稱,車道劃分設施-分道設施-快慢車道間名稱,車道劃分設施-分道設施-路面邊線名稱,事故類型及型態大類別名稱,經度,緯度,死亡,受傷
0,0.146102,2.120706,-0.146069,1.210906,0.485564,-0.713755,-1.627801,1.771301,-1.697696,-0.917071,1.798967,-0.63422,-1.868211,13.660843,-2.803204
1,1.465217,-0.25172,-0.146069,-0.814068,-0.872635,2.771975,-1.627801,-0.822895,-1.697696,-0.917071,-0.012319,1.111877,0.974928,13.660843,-0.620846
2,-1.173013,2.911514,-0.146069,1.210906,1.164664,-0.713755,1.177412,1.122752,-1.697696,-0.917071,1.798967,-2.459704,-0.63318,13.660843,-2.803204
3,-1.173013,-0.25172,-0.146069,-0.814068,-0.872635,2.771975,-0.225194,-0.822895,0.422707,1.090428,-0.012319,0.936797,1.016777,13.660843,-0.620846
4,1.465217,2.120706,-0.146069,1.210906,1.164664,-0.713755,0.476109,-0.822895,0.422707,-0.917071,1.798967,1.832866,0.186897,13.660843,-2.803204


In [39]:
rbind_data.shape[0] == full_dist.shape[0]

True

In [15]:
lens1 = PCA(2)
lens_result = lens1.fit_transform(X1.to_numpy())

# Get the principal components and explained variance
principal_components = lens1.components_
explained_variance = lens1.explained_variance_ratio_

# Create a DataFrame for loadings (principal component coefficients)
loadings_df = pd.DataFrame(principal_components.T, columns=['PC1', 'PC2'], index=X1.columns)

# Add PCA results back to the combined DataFrame for further analysis
X1['PC1'] = lens_result[:, 0]
X1['PC2'] = lens_result[:, 1]

print("Explained Variance Ratio:", explained_variance)
print("Principal Component Loadings:")
print(loadings_df)

Explained Variance Ratio: [0.21695054 0.13478921]
Principal Component Loadings:
                              PC1       PC2
光線名稱                     0.007175 -0.110614
道路類別-第1當事者-名稱            0.023615 -0.126597
速限-第1當事者                 0.133018  0.352007
道路型態大類別名稱                0.590220 -0.152242
事故位置大類別名稱                0.591466 -0.146258
號誌-號誌種類名稱               -0.359922  0.286140
車道劃分設施-分向設施大類別名稱        -0.026578 -0.492137
車道劃分設施-分道設施-快車道或一般車道間名稱  0.241412  0.528466
車道劃分設施-分道設施-快慢車道間名稱     -0.144481 -0.429436
車道劃分設施-分道設施-路面邊線名稱      -0.258382 -0.122170
事故類型及型態大類別名稱             0.085670  0.023831


In [88]:
# plt.figure(figsize=(10, 8))
# sns.scatterplot(
#     data=X1,
#     x='PC1',
#     y='PC2',
#     hue='事故位置大類別名稱',
#     palette='Set1',
#     s=100,
#     edgecolor='w',
#     alpha=0.7
# )

# # Set plot title and labels
# plt.title('PCA Clusters by User')
# plt.xlabel('PC 1')
# plt.ylabel('PC 2')
# plt.legend(title='User ID')
# plt.grid(True)

# # Show the plot
# plt.show()

In [68]:
def find_ratio(input_data, components) :
    best_comp = {}
    for comp in range(1,components+1):   
        pca = PCA(comp).fit(input_data)
        
        best_comp[comp] = pca.explained_variance_ratio_.sum()
        
    max_comp = max(best_comp, key=best_comp.get)  # 使用 key=best_comp.get 找到最大值的鍵
    print("最佳成分數：", max_comp)
    print("解釋方差比率累計值：", best_comp[max_comp])

lens1 = find_ratio(X1.to_numpy(), 6)

最佳成分數： 6
解釋方差比率累計值： 0.7999701757701387


In [97]:
lens1 = PCA(6)
lens_result = lens1.fit_transform(X1.to_numpy())

db = DBSCAN(eps=1.7, min_samples=10).fit(lens_result)
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

In [84]:
n_clusters_

2

In [85]:
noise_mask = labels == -1
noise_rows = rbind_data[noise_mask]

noise_rows.shape

(99, 15)

In [93]:
# noise_rows.to_csv('CalculatedData/離群比較/原始.csv', index=False)

In [96]:
# origin_noise = process_data(noise_rows)
# origin_noise.drop(['經度', '緯度'], axis=1)