In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
from collections import Counter

from sklearn.datasets import make_circles
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

from tdamapper.core import MapperAlgorithm
from tdamapper.cover import CubicalCover
from tdamapper.plot import MapperLayoutInteractive
from tdamapper.clustering import FailSafeClustering
from sklearn import metrics

from functions import *

In [35]:
# The TMA2 records are split over four CSV parts that are read the same way.
# `[:-2]` drops the last two rows of every file (presumably trailing
# non-data footer lines — TODO confirm against the raw CSVs).
data1, data2, data3, data4 = (
    pd.read_csv(f"./Data/NPA_TMA2_{part}.csv", low_memory=False)[:-2]
    for part in range(1, 5)
)
dataA2 = pd.concat([data1, data2, data3, data4], ignore_index=True)

# The TMA1 records come from a single file, trimmed the same way.
dataA1 = pd.read_csv("./Data/NPA_TMA1.csv")[:-2]

In [None]:
# Columns to select, one entry per line.
#
# Candidates that are currently disabled (kept for reference):
#   Weather factors:
#     '天候名稱', '路面狀況-路面狀態名稱'
#   People:
#     '肇因研判子類別名稱-主要',
#     '肇因研判大類別名稱-主要', '當事者屬-性-別名稱', '當事者事故發生時年齡',
#     '當事者行動狀態大類別名稱', '車輛撞擊部位大類別名稱-最初',
#     '肇因研判大類別名稱-個別'  (this column is the same as the "主要" one)
select_lst = [
    # Other (road, environment, outcome, location)
    '光線名稱',
    '道路類別-第1當事者-名稱',
    '速限-第1當事者',
    '道路型態大類別名稱',
    '事故位置大類別名稱',
    '路面狀況-路面鋪裝名稱',
    '路面狀況-路面缺陷名稱',
    '道路障礙-障礙物名稱',
    '道路障礙-視距品質名稱',
    '號誌-號誌種類名稱',
    '車道劃分設施-分向設施大類別名稱',
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '事故類型及型態大類別名稱',
    '死亡受傷人數',
    '經度',
    '緯度',
]

# 景點處理

In [36]:
# NOTE(review): `cdist` is not referenced anywhere in the visible part of this
# notebook; the distances below are computed with the hand-written helpers.
from scipy.spatial.distance import cdist

# Scenic-spot table; its 'Px' / 'Py' columns are read by calculate_distances.
scenic = pd.read_csv("./Data/Scenic_Spot_C_f.csv", low_memory=False)

# `preprocess` is not defined in this notebook (presumably it comes from
# `from functions import *` — confirm). Its result is indexed with `[0]`
# further down, so element 0 is the DataFrame that gets used.
# NOTE(review): the sample sizes 592 / 5920 are magic numbers — consider
# naming them in a config cell.
dist_dfA1 = preprocess(dataA1, sample = 592)
dist_dfA2 = preprocess(dataA2, sample = 5920)

def latlon_to_xyz(lat, lon):
    """Map latitude/longitude given in degrees to points on the unit sphere.

    Args:
        lat: Latitude(s) in degrees.
        lon: Longitude(s) in degrees.

    Returns:
        Array with one row per input point and the columns (x, y, z).
    """
    phi = np.radians(lat)
    lam = np.radians(lon)
    components = (
        np.cos(phi) * np.cos(lam),  # x
        np.cos(phi) * np.sin(lam),  # y
        np.sin(phi),                # z
    )
    # Stack as rows (3, n), then flip so that each row is one point.
    return np.vstack(components).T

def spherical_dist(pos1, pos2, radius=6371):
    """Great-circle distance between every pair of unit vectors.

    Args:
        pos1: Array of unit vectors, one per row.
        pos2: Array of unit vectors, one per row.
        radius: Sphere radius; the result is expressed in the same unit.

    Returns:
        Matrix whose entry (i, j) is the distance between pos1[i] and pos2[j].
    """
    # The dot product of unit vectors is the cosine of the central angle.
    # Clipping guards arccos against values pushed just outside [-1, 1] by
    # floating-point rounding.
    cosines = np.clip(np.dot(pos1, pos2.T), -1, 1)
    return radius * np.arccos(cosines)

def calculate_distances(df, scenic_df, distance_threshold=1000):
    """Count, for every row of ``df``, the scenic spots closer than a threshold.

    The count is written to a new '景點數' column on ``df`` itself (the frame
    is modified in place) and the same frame is returned.

    Args:
        df: Frame with '緯度' (latitude) and '經度' (longitude) columns.
        scenic_df: Frame whose 'Py' column is used as latitude and 'Px' as
            longitude.
        distance_threshold: Cut-off in metres (strictly-less-than comparison).

    Returns:
        ``df`` with the '景點數' column added.
    """
    row_points = latlon_to_xyz(df['緯度'], df['經度'])
    spot_points = latlon_to_xyz(scenic_df['Py'], scenic_df['Px'])

    # spherical_dist's default radius yields kilometres; scale to metres so
    # the values are comparable with the threshold.
    metres = 1000 * spherical_dist(row_points, spot_points)

    # Row-wise count of spots that fall inside the threshold.
    df['景點數'] = (metres < distance_threshold).sum(axis=1)
    return df

# 使用函數，假設 dist_dfA1 和 scenic 已經被正確地載入
# Add the nearby-scenic-spot count to both preprocessed samples
# (element 0 of each `preprocess` result is the frame that is used).
scenic_dfA1 = calculate_distances(dist_dfA1[0], scenic)
scenic_dfA2 = calculate_distances(dist_dfA2[0], scenic)

rbind_data = pd.concat([scenic_dfA1, scenic_dfA2], axis=0, ignore_index=True)

# Coarsen the numeric columns: anything above 1 is collapsed to the level 2 ...
for capped_col in ('景點數', '受傷'):
    rbind_data.loc[rbind_data[capped_col] > 1, capped_col] = 2
# ... and the speed limit becomes a 0/1 flag for "above 60".
rbind_data['速限-第1當事者'] = rbind_data['速限-第1當事者'].map(
    lambda limit: 1 if limit > 60 else 0
)

dist_df = process_data(rbind_data)
scaler = StandardScaler()

full_dist = pd.DataFrame(scaler.fit_transform(dist_df), columns=dist_df.columns)
# Outcome and location columns are kept out of the matrix handed to Mapper.
X1 = full_dist.drop(columns=['受傷', '死亡', '經度', '緯度']).to_numpy()

In [1]:
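# NOTE: every line of this cell is commented out, but the following cell reads
# `mapper_plot1`, which is only created here. Re-enable this cell before a fresh
# "Restart Kernel -> Run All", otherwise the next cell fails with a NameError.
# When re-enabling on scikit-learn >= 1.4, spell `affinity='euclidean'` as
# `metric='euclidean'` (the `affinity` argument was removed).
# lens1 = PCA(10).fit_transform(X1)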

# mapper_algo1 = MapperAlgorithm(
#     cover = CubicalCover(
#         n_intervals = 5,
#         overlap_frac = 0.5
#     ),
#     clustering = FailSafeClustering(
#         clustering = AgglomerativeClustering(5, affinity='euclidean', linkage='ward'),
#         verbose = False)
# )

# mapper_graph1 = mapper_algo1.fit_transform(X1, lens1)

# mapper_plot1 = MapperLayoutInteractive(
#     mapper_graph1,
#     colors = dist_df[['景點數']].to_numpy(),
#     cmap = 'jet',
#     # agg = np.nanmean,
#     agg = most_frequent_nonan,
#     dim = 3,
#     iterations = 30,
#     seed = 5,
#     width = 800,
#     height = 500)

# fig_mean1 = mapper_plot1.plot()
# fig_mean1.show(config={'scrollZoom': True})

In [14]:
import re

# NOTE(review): `mapper_plot1` is only created in the commented-out cell
# above, so this cell relies on state left over from an earlier run.

# Everything below is read from the second entry of the figure's private
# `_data_objs` list (presumably the node scatter trace — confirm).
node_trace = vars(mapper_plot1._MapperLayoutInteractive__fig)['_data_objs'][1]

x, y, z = node_trace['x'], node_trace['y'], node_trace['z']
threeDimData = pd.DataFrame({'x': x, 'y': y, 'z': z})

# Each hover text carries "color: <int>", "node: <int>" and "size: <int>".
data_tuple = node_trace['text']
data = [
    {
        field: int(re.search(rf'{field}: (\d+)', hover_text).group(1))
        for field in ('color', 'node', 'size')
    }
    for hover_text in data_tuple
]
component_info = pd.DataFrame(data)

# Hover fields and coordinates are aligned by position.
full_info = pd.concat([component_info, threeDimData], axis=1)

# Per-node attributes stored on the underlying graph object.
mp_content_origin = vars(mapper_plot1._MapperLayoutInteractive__graph)['_node']
mp_content = (
    pd.DataFrame.from_dict(mp_content_origin, orient='index')
    .reset_index()
    .rename(columns={'index': 'node'})
)

full_info = pd.merge(full_info, mp_content, on=['node', 'size'], how='inner')

In [2]:
# import plotly.graph_objects as go
# # calinski_data = get_calinski_from_db(full_info, 0.05)
# calinski_data = get_calinski_from_db(full_info, 0.09)
# labels = calinski_data[3]
# db = calinski_data[2]
# n_clusters_ = calinski_data[4]

# unique_labels = set(labels)
# core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True

# def matplotlib_to_plotly(cmap, alpha=1):
#     """rgba"""
#     return f'rgba({int(cmap[0]*200)}, {int(cmap[1]*200)}, {int(cmap[2]*200)}, {alpha})'

# # colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]  
# colors = [matplotlib_to_plotly(plt.cm.Spectral(each), alpha=0.8) for each in np.linspace(0, 1, len(unique_labels))]
# fig = go.Figure()

# for k, col in zip(unique_labels, colors):
#     if k == -1:
#         # col = 'rgba(0,0,0,0)'
#         col = 'rgba(0,0,0,0)'

#     class_member_mask = labels == k

#     core_samples = full_info.iloc[:, 3:6][class_member_mask & core_samples_mask]
#     fig.add_trace(go.Scatter3d(
#         x=core_samples.iloc[:, 0],
#         y=core_samples.iloc[:, 1],
#         z=core_samples.iloc[:, 2],
#         mode='markers',
#         marker=dict(
#             size=6,
#             color=col,
#             opacity=0.8
#         ),
#         name=f'Cluster {k} Core'
#     ))

#     non_core_samples = full_info.iloc[:, 3:6][class_member_mask & ~core_samples_mask]
#     fig.add_trace(go.Scatter3d(
#         x=non_core_samples.iloc[:, 0],
#         y=non_core_samples.iloc[:, 1],
#         z=non_core_samples.iloc[:, 2],
#         mode='markers',
#         marker=dict(
#             size=6,
#             color=col,
#             opacity=0.5
#         ),
#         name=f'Cluster {k} Non-Core'
#     ))

# fig.update_layout(
#     title=f"Estimated number of clusters: {n_clusters_}",
#     margin=dict(l=0, r=0, b=0, t=0)
# )

# fig.show()

In [19]:
# NOTE(review): no active cell in this notebook assigns full_info['label'];
# it presumably comes from the commented-out clustering cell — confirm.
print(full_info['label'].unique())

# Node rows of the two clusters that are compared below.
label_0, label_1 = (
    full_info[full_info['label'] == cluster_id] for cluster_id in (0, 1)
)

def get_count_dict(input_data):
    """Count how many times each id occurs across the rows of ``input_data``.

    Each row's 'ids' value is an iterable of ids; the result tells, for every
    id, in how many of those iterables (counting repeats) it was seen.

    The previous implementation re-filtered the whole frame by node value for
    every row, which was quadratic and, if a node value occurred on several
    rows, counted the first row's ids once per occurrence. Rows are now
    consumed directly, each exactly once.

    Args:
        input_data: DataFrame with an 'ids' column of iterables.

    Returns:
        Plain dict mapping id -> occurrence count, keyed in first-seen order.
        An empty frame gives an empty dict.
    """
    count = Counter()
    for ids in input_data['ids']:
        # Explicit loop so that every element is tallied once, regardless of
        # the container type used for the ids.
        for num in ids:
            count[num] += 1

    # Callers delete keys and use dict views; hand back a plain dict.
    return dict(count)

count_0 = get_count_dict(label_0)
count_1 = get_count_dict(label_1)

# Ids that occur in nodes of both clusters.
lst = list(count_0.keys() & count_1.keys())
both_cluster = {key: count_0[key] for key in lst}

full_0 = rbind_data.loc[count_0.keys()]
full_1 = rbind_data.loc[count_1.keys()]
full_12 = full_0.loc[lst]

# The shared ids are analysed separately; they are dropped from both groups
# so that the groups satisfy the independence assumption of the chi-square test.
full_0 = full_0.drop(index=lst)
full_1 = full_1.drop(index=lst)

print(len(lst))
# Keep the count dicts in step with the frames.
for shared_id in lst:
    count_0.pop(shared_id)
    count_1.pop(shared_id)

[ 0  1 -1]
14


In [20]:
from scipy.stats import chi2_contingency

def add_count(input_data, count):
    """Attach a 'count' column to ``input_data`` from a label -> count mapping.

    The frame is modified in place and also returned, so both
    ``add_count(df, c)`` and ``df = add_count(df, c)`` work.

    The previous implementation wrote one cell at a time with ``.loc``; besides
    being slow, a key that was not an index label silently appended a new,
    otherwise-empty row. Counts are now looked up once per existing row, so
    the number of rows never changes and unknown keys are ignored.

    Args:
        input_data: DataFrame whose index labels are looked up in ``count``.
        count: Mapping (dict-like with ``.get``) from index label to count.

    Returns:
        ``input_data`` with 'count' set to the mapped value, or 0 for rows
        that have no entry in ``count``.
    """
    # Initialise first so that an empty frame still gets an integer column.
    input_data['count'] = 0

    if len(input_data):
        input_data['count'] = [count.get(label, 0) for label in input_data.index]

    return input_data

# Attach to every group the number of nodes each of its rows appears in.
full_0 = add_count(full_0, count_0)
full_1 = add_count(full_1, count_1)
full_12 = add_count(full_12, both_cluster)

# Sanity summary: total rows, per-group shapes, per-group sum of counts.
cluster_frames = (full_0, full_1, full_12)
print(sum(frame.shape[0] for frame in cluster_frames))
print(*(frame.shape for frame in cluster_frames))
print(*(frame['count'].sum() for frame in cluster_frames))

6281
(6100, 21) (167, 21) (14, 21)
30804 428 14


In [30]:
def table(colnames):
    """Relative frequency of each value of ``colnames`` within the three groups.

    Reads the notebook-level frames ``full_0``, ``full_1`` and ``full_12``.

    Args:
        colnames: Column label (or labels) passed to ``value_counts``.

    Returns:
        DataFrame indexed by the observed values with the columns 'cluster1',
        'cluster2' and 'cluster12'; a value absent from a group gets 0.
    """
    shares = [
        frame[colnames].value_counts(normalize=True)
        for frame in (full_0, full_1, full_12)
    ]

    # Side-by-side alignment on the value; gaps mean "never observed".
    combined_df = pd.concat(shares, axis=1).fillna(0)
    combined_df.columns = ['cluster1', 'cluster2', 'cluster12']

    return combined_df

# Share of each '景點數' level (nearby scenic-spot count) within the three groups.
table('景點數')

Unnamed: 0,cluster1,cluster2,cluster12
2,0.449836,0.113772,0.142857
0,0.371967,0.640719,0.357143
1,0.178197,0.245509,0.5


In [22]:
def chi_compare(c0, c1):
    """Chi-square test, column by column, of whether two groups differ.

    For every column except the last one, the value counts of ``c0`` and
    ``c1`` are put side by side as a contingency table and tested with
    ``chi2_contingency``. One line per column is printed: "可分群" when
    p <= 0.05, "不可分群" otherwise.

    Columns are paired by position, so both frames must share the same column
    order. The last column is skipped (in this notebook it is the 'count'
    column appended by ``add_count``).

    The number of columns is now taken from ``c1`` throughout; previously two
    of the three loops read the notebook-level ``full_1`` instead of the
    argument, so the function only worked for that particular frame.

    Args:
        c0: DataFrame of the first group.
        c1: DataFrame of the second group.

    Returns:
        None. Results are printed.
    """
    for i in range(c1.shape[1] - 1):
        combined_df = pd.concat(
            [c0.iloc[:, i].value_counts(), c1.iloc[:, i].value_counts()],
            axis=1,
        ).fillna(0)  # a value missing from one group was observed 0 times
        combined_df.columns = ['cluster1', 'cluster2']

        # Only the p-value (second element of the result) is needed.
        p = chi2_contingency(combined_df.values)[1]

        if p > 0.05:
            print(f"{c1.columns[i]} p值: {p} 不可分群")
        else:
            print(f"{c1.columns[i]} p值: {p} 可分群")
            
# Test each column for a difference between the two exclusive groups
# (rows shared by both clusters were removed from full_0 / full_1 earlier).
chi_compare(full_0, full_1)

光線名稱 p值: 0.011077062051727354 可分群
道路類別-第1當事者-名稱 p值: 0.0 可分群
速限-第1當事者 p值: 0.0 可分群
道路型態大類別名稱 p值: 4.000681196272407e-20 可分群
事故位置大類別名稱 p值: 3.451895007975873e-28 可分群
路面狀況-路面鋪裝名稱 p值: 1.0 不可分群
路面狀況-路面缺陷名稱 p值: 1.0 不可分群
道路障礙-障礙物名稱 p值: 1.6350386914364776e-08 可分群
道路障礙-視距品質名稱 p值: 1.0 不可分群
號誌-號誌種類名稱 p值: 3.0958831600694374e-06 可分群
車道劃分設施-分向設施大類別名稱 p值: 6.829118550875341e-91 可分群
車道劃分設施-分道設施-快車道或一般車道間名稱 p值: 4.276895093863431e-106 可分群
車道劃分設施-分道設施-快慢車道間名稱 p值: 0.000126247620814125 可分群
車道劃分設施-分道設施-路面邊線名稱 p值: 4.904939835835238e-29 可分群
事故類型及型態大類別名稱 p值: 8.09856997580718e-09 可分群
經度 p值: 0.2802971882460983 不可分群
緯度 p值: 0.3893669416369531 不可分群
死亡 p值: 2.9800179510088593e-41 可分群
受傷 p值: 1.2414727306044093e-14 可分群
景點數 p值: 2.2134058366155242e-17 可分群
