This notebook is an optimized version of TDA.ipynb. The main change is that three functions are used as the Mapper filter functions:
1. Eccentricity
2. PCA
3. KDE

In [None]:
import os

# Switch the working directory to the "utils" folder that sits next to the
# directory the notebook was started from. The relative "../..." paths and the
# plain module imports used later in this notebook are resolved from there.
current_dir = os.getcwd()
parent_dir = os.path.normpath(os.path.join(current_dir, os.pardir))
analyze_path = os.path.join(parent_dir, "utils")
os.chdir(analyze_path)

import ast
import pandas as pd
from functools import partial

import pickle
from sklearn.cluster import KMeans
from tdam.cover import CubicalCover
from tdam.clustering import FailSafeClustering
from tdam.core_old import MapperAlgorithm

In [None]:
import geopandas as gpd
from shapely import wkt

# Working CRS handed to set_crs()/to_crs() below (integer EPSG code).
TM2 = 3826

# Grid-level table holding, per cell, the list of accident indices and the GI
# results. (Original note: accidents and their indices were aggregated on a
# 0.001 grid; GI was computed from hex_grid.)
grid_gi_df = pd.read_csv('../ComputedData/Grid/grid_giV2.csv')
# Both columns are stored as text in the CSV; parse them back into objects.
grid_gi_df['accident_indices'] = grid_gi_df['accident_indices'].apply(ast.literal_eval)
grid_gi_df['geometry'] = grid_gi_df['geometry'].apply(wkt.loads)
grid_gi = gpd.GeoDataFrame(grid_gi_df, geometry='geometry').set_crs(TM2, allow_override=True)
# Reduce every geometry to its centroid so the county lookup is point-in-polygon.
grid_gi['geometry'] = grid_gi.geometry.centroid

# Which county does each grid cell belong to?
taiwan = gpd.read_file('../Data/OFiles_9e222fea-bafb-4436-9b17-10921abc6ef2/TOWN_MOI_1140318.shp')
taiwan = taiwan[(~taiwan['TOWNNAME'].isin(['旗津區', '頭城鎮', '蘭嶼鄉', '綠島鄉', '琉球鄉'])) &
                (~taiwan['COUNTYNAME'].isin(['金門縣', '連江縣', '澎湖縣']))].to_crs(TM2)
taiwan_cnty = taiwan[['COUNTYNAME', 'geometry']].dissolve(by='COUNTYNAME')
taiwan_cnty['geometry'] = taiwan_cnty.buffer(0)

# dissolve() moves COUNTYNAME into the index. Joining against reset_index()
# guarantees the join result has a real 'COUNTYNAME' column regardless of how
# the installed geopandas names the right-hand index column.
county_join = gpd.sjoin(
    grid_gi[['geometry']],
    taiwan_cnty.reset_index(),
    how='left',
    predicate='within',
)
# A centroid matching more than one polygon would duplicate its index label and
# make the aligned assignment below fail; keep the first match only.
county_join = county_join[~county_join.index.duplicated(keep='first')]
grid_gi['COUNTYNAME'] = county_join['COUNTYNAME']

# Keep only cells that contain at least one accident. reset_index() returns a
# fresh frame (old index kept as a column) instead of mutating a filtered slice.
grid_filter = grid_gi[grid_gi['accident_indices'].str.len() > 0].reset_index()

In [None]:
# from utils import read_data
# combined_data = read_data()

# grid_exploded = grid_filter.explode("accident_indices")
# mapping_df = grid_exploded[["accident_indices", "hotspot", "COUNTYNAME"]]

# combined_data = combined_data.merge(
#     mapping_df,
#     left_index=True,
#     right_on="accident_indices",
#     how="left"
# )

# combined_data = combined_data.drop(columns=["accident_indices"])
# combined_data = combined_data[~combined_data['hotspot'].isna()]
# combined_data = combined_data[~combined_data['COUNTYNAME'].isna()]

# combined_data.to_csv('../ComputedData/ForModel/combined_data_with_hotspot.csv', index=False)

In [None]:
# NOTE(review): the original note says this file is the same as all_features
# with only hotspot added — confirm against the export script.
all_features_df = pd.read_csv("../ComputedData/ForModel/all_features.csv")

# Remove every column whose name contains the accident-location category prefix.
is_location_col = all_features_df.columns.str.contains('事故位置大類別名稱')
cols1 = all_features_df.columns[is_location_col]
all_features_df = all_features_df.drop(columns=cols1)

# cols2 = all_features_df.columns[all_features_df.columns.str.contains('號誌動作')]
# cols3 = all_features_df.columns[all_features_df.columns.str.contains('hotspot')]
# all_features_df.drop(columns=cols2, inplace=True)
# all_features_df.drop(columns=cols3, inplace=True)

# The speed column is taken from the V2 feature export and kept separately.
forspeed = pd.read_csv("../ComputedData/ForModel/all_featuresV2.csv")
speed = forspeed['original_speed']
# all_features_df.drop(columns=['original_speed'], inplace=True)

# Mapper
Generate `filtered_data.csv` by running FilterforMapper.py before executing the next cell.

In [None]:
# Filter values handed to Mapper, read from filtered_data.csv; the 'pc4' and
# 'pc5' columns are discarded before fitting.
filter_full = pd.read_csv("../ComputedData/ForModel/filtered_data.csv")
filter_full = filter_full.drop(columns=['pc4', 'pc5'])

# Parameter grid. `overlap` is expressed in tenths (3 -> overlap_frac 0.3).
overlaps = [3]
intervals = [11]
detailed_results = []
silhouette_for_intervals = []

for overlap in overlaps:
    for interval in intervals:
        print(f"Processing overlap {overlap}, interval {interval}")

        cover = CubicalCover(n_intervals=interval, overlap_frac=overlap / 10)
        clustering = FailSafeClustering(KMeans(n_clusters=2, random_state=42))
        mapper_algo = MapperAlgorithm(cover=cover, clustering=clustering, n_jobs=-1)

        mapper_info = mapper_algo.fit_transform(all_features_df.to_numpy(), filter_full)

        # Element 1 of the fit result is what this notebook records as the
        # silhouette value for the run.
        silhouette = mapper_info[1]
        silhouette_for_intervals.append(silhouette)

        result = dict(
            overlap=overlap,
            interval=interval,
            silhouette=silhouette,
            mapper_info=mapper_info,
        )
        detailed_results.append(result)

        # Persist every run so the plotting cells can reload it later.
        out_path = f"../ComputedData/ForMatrixV2/o{overlap}i{interval}_test.pkl"
        with open(out_path, 'wb') as pkl_file:
            pickle.dump(result, pkl_file)

detailed_results_df = pd.DataFrame(detailed_results)

In [None]:
def get_max_categories(row, prefix=None, columns=None):
    """Return the category name(s) holding the largest value in *row*.

    Only the columns belonging to one one-hot/ratio group are compared. The
    category name is the part of the column name after the last underscore;
    ties are joined with commas in column order.

    Args:
        row: A pandas Series (one DataFrame row) indexed by column name.
        prefix: Literal substring identifying the column group. Defaults to
            the module-level ``col`` so existing ``df.apply(get_max_categories,
            axis=1)`` calls keep working.
        columns: Optional explicit list of column names to compare. Passing
            it avoids rescanning ``all_features_df.columns`` for every row.

    Returns:
        A comma-separated string of category names; empty if no value equals
        the maximum (e.g. every compared value is NaN).
    """
    if columns is None:
        if prefix is None:
            prefix = col  # module-level group prefix set by the calling cell
        # regex=False: the prefix is a literal column-name fragment.
        matches = all_features_df.columns.str.contains(prefix, regex=False)
        columns = all_features_df.columns[matches]

    values = row[list(columns)]
    max_val = values.max()
    max_cols = values[values == max_val].index
    # Keep the category name after the last underscore, comma-joined.
    return ','.join(str(name).split('_')[-1] for name in max_cols)

# Column-name prefix of the road-category group read by get_max_categories.
col = '道路類別-第1當事者-名稱'
most_common = all_features_df.apply(get_max_categories, axis=1)
all_features_df['Most common in node'] = most_common
all_features_df['Most common in node'].value_counts()

# Copy county and hotspot labels over from the grid table (aligned on index).
all_features_df['county'] = grid_filter['COUNTYNAME']
all_features_df['hotspot'] = grid_filter['hotspot']

# Names containing '市' are labelled 'City'; everything else 'County'.
all_features_df['county_city'] = [
    'City' if '市' in str(name) else 'County'
    for name in all_features_df['county']
]
# NOTE(review): any raw label containing the substring 'Hotspot' maps to
# 'Hotspot' — confirm the raw labels never read e.g. 'Not Hotspot'.
all_features_df['hotspot'] = [
    'Hotspot' if 'Hotspot' in str(label) else 'Not Hotspot'
    for label in all_features_df['hotspot']
]

# Facility flag: '1' when the YouBike count column is positive, else '0'.
# It is kept as text first so it can be concatenated into the combined label.
all_features_df['facility'] = [
    '1' if count > 0 else '0'
    for count in all_features_df['youbike_100m_count_mean']
]
all_features_df['hotspot_facility'] = all_features_df['hotspot'] + '_' + all_features_df['facility']
all_features_df['facility'] = all_features_df['facility'].astype(int)
all_features_df['original_speed'] = speed

# bn_feature is 1 when ALL of the following hold for a row, else 0:
#   - at least one of the listed road-type columns is positive,
#   - at least one of the listed signal-type columns is positive,
#   - the urban-road column is positive,
#   - original_speed is below 60,
#   - the facility flag is positive.
# Computed with vectorized masks instead of a row-wise apply; NaN compares as
# False in both forms, so the result is the same.
bn_road_type_cols = [
    '道路型態大類別名稱_單路部分',
    '道路型態大類別名稱_其他',
    '道路型態大類別名稱_圓環廣場',
    '道路型態大類別名稱_平交道',
    '道路型態大類別名稱_交岔路',
]
bn_signal_cols = [
    '號誌-號誌種類名稱_行車管制號誌(附設行人專用號誌)',
    '號誌-號誌種類名稱_行車管制號誌',
    '號誌-號誌種類名稱_閃光號誌',
    '號誌-號誌種類名稱_無號誌',
]
bn_mask = (
    all_features_df[bn_road_type_cols].gt(0).any(axis=1)
    & all_features_df[bn_signal_cols].gt(0).any(axis=1)
    & all_features_df['道路類別-第1當事者-名稱_市區道路'].gt(0)
    & all_features_df['original_speed'].lt(60)
    & all_features_df['facility'].gt(0)
)
all_features_df['bn_feature'] = bn_mask.astype(int)

# lane is 1 when every listed column is positive for a row, else 0.
# Computed with one vectorized comparison instead of a row-wise apply; NaN
# compares as False in both forms, so the result is the same.
lane_required_cols = [
    '車道劃分設施-分向設施大類別名稱_無',
    '車道劃分設施-分道設施-快車道或一般車道間名稱_未繪設車道線',
    '車道劃分設施-分道設施-快慢車道間名稱_未繪設快慢車道分隔線',
    '車道劃分設施-分道設施-路面邊線名稱_無',
    '事故類型及型態大類別名稱_人與車',
]
all_features_df['lane'] = (
    all_features_df[lane_required_cols].gt(0).all(axis=1).astype(int)
)


In [None]:
# Quick check of the conditional-probability effect: binarise the traffic
# signal column at 0.5 and count its combinations with the hotspot label.
signal_col = '號誌-號誌種類名稱_行車管制號誌'
test = all_features_df[['hotspot', signal_col]].copy()
test[signal_col] = (test[signal_col] > 0.5).astype(int)
test[['hotspot', signal_col]].value_counts()

In [None]:
from config import category_value_map
# Translation table for the categories of the `col` group. Fall back to an
# empty mapping (not the string `col`) so the item assignment below cannot fail
# with a TypeError, and work on a copy so the dict owned by `config` is not
# mutated. Assumes the stored value is a mapping — TODO confirm in config.py.
cat_en = dict(category_value_map.get(col, {}))
# Extra entry for the tie label produced when both categories share the maximum.
cat_en['村里道路,市區道路'] = 'Village road, Urban road'

In [None]:
from TrafficTDApythonUtils.plotsv2 import MapperPlotterSpring
from utils_tda import avg_label, most_common_encoded_label, cond_prob_mixed, ratio_in_data

# Which saved Mapper run to plot, and which column colours the nodes.
oi = 'o3i10'
choose = 'Most common in node'
# The pickle holds the result dict of one run (it is indexed by 'mapper_info'
# below), despite the `_df` suffix. Open it with a context manager so the file
# handle is closed.
# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
with open(f"../ComputedData/ForMatrixV2/{oi}.pkl", "rb") as pkl_file:
    detailed_results_df = pickle.load(pkl_file)

In [None]:
# Draw the loaded Mapper graph once, coloured by the column named in `choose`.
plotter_options = dict(
    seed=47,
    iterations=100,
    dim=2,
    range_lst=[-0.5, 0.5, 0.5, -0.5],
    cmap="Reds",
    encoded_label=most_common_encoded_label,
)
mapper_plotter = MapperPlotterSpring(
    detailed_results_df['mapper_info'],
    all_features_df,
    **plotter_options,
)
mapper_plotter.create_mapper_plot(choose, avg=False, size_threshold=50, plot_type='spring')
full_info, outliers = mapper_plotter.extract_data()
mapper_plotter.map_colors(threshold=0, en={})
mapper_plotter.plot(
    set_label=True,
    size=500,
    anchor=(0, 0),
    # save_path=f"../ComputedData/ForMatrixV2/Plots/{oi}.png"
)

### 這是在獲取最佳拓樸圖後要跑的結果

In [None]:
# Plot configurations consumed by the plotting loop that iterates `books`.
# Key: the column name (or tuple of column names) passed as `choose=` to
# create_mapper_plot. Value fields:
#   'label'  - callable passed to MapperPlotterSpring as `encoded_label`
#   'params' - optional keyword arguments bound to 'label' via functools.partial
#   'avg'    - forwarded as `avg=` to create_mapper_plot
#   'color'  - forwarded as `cmap=` to MapperPlotterSpring
#   'en'     - dict intended for label translation (presumably for
#              map_colors(en=...) — confirm)
# Commented-out entries are alternative plots; uncomment to enable them.
books = { 
    # 'Most common in node': {
    #     'label': most_common_encoded_label,
    #     'avg': False,
    #     'color': 'jet',
    #     'en': cat_en
    # },
    # ('hotspot', 'lane'): {
    #     'label': cond_prob_mixed,
    #     'avg': True,
    #     'params': {'a_col': 'hotspot', 'a_is': 'Hotspot',
    #                'b_col': 'lane', 'b_rule': '>0'},
    #     'color': 'Blues',
    #     'en': {}
    # },
    # ('hotspot', '號誌-號誌種類名稱_行車管制號誌'): {
    #     'label': cond_prob_mixed,
    #     'avg': True,
    #     'params': {'a_col': 'hotspot', 'a_is': 'Hotspot',
    #                'b_col': '號誌-號誌種類名稱_行車管制號誌', 'b_rule': '>0'},
    #     'color': 'Blues',
    #     'en': {}
    # },
    ('hotspot', 'facility'): {
        'label': cond_prob_mixed,
        'avg': True,
        'params': {'a_col': 'hotspot', 'a_is': 'Hotspot',
                   'b_col': 'facility', 'b_rule': '>0'},
        'color': 'Blues',
        'en': {}
    },
    # 'hotspot': {
    #     'label': ratio_in_data,
    #     'avg': True,
    #     'params': {'col': 'hotspot', 'values': 'Hotspot'},
    #     'color': 'Greens',
    #     'en': {}
    #     },
    # 'bn_feature': {
    #     'label': ratio_in_data,
    #     'avg': True,
    #     'params': {'col': 'bn_feature', 'values': 1},
    #     'color': 'Reds',
    #     'en': {}
    # },
    # ('hotspot', 'bn_feature'): {
    #     # 'label': ratio_in_data,
    #     'label': cond_prob_mixed,
    #     'avg': True,
    #     'params': {'a_col': 'hotspot', 'a_is': 'Hotspot',
    #                'b_col': 'bn_feature', 'b_rule': '>0'},
    #     'color': 'Reds',
    #     'en': {}
    # },
    # 'county_city': {
    #     'label': ratio_in_data,
    #     'avg': True,
    #     'params': {'col': 'county_city', 'values': 'City'},
    #     'color': 'Greens',
    #     'en': {}
    # },
    # 'original_speed': {
    #     'label': avg_label,
    #     'avg': True,
    #     'color': 'Greens',
    #     'en': {}
    # },
}

# Render one Mapper plot per configured entry in `books`.
for name, book in books.items():

    # Bind the entry's extra parameters (if any) to its labelling function.
    func = book['label']
    if 'params' in book:
        func = partial(func, **book['params'])

    mapper_plotter = MapperPlotterSpring(
        detailed_results_df['mapper_info'],
        all_features_df,
        seed=47, iterations=130, dim=2,
        range_lst=[-0.5, 0.5, 0.5, -0.5],
        cmap=book['color'],
        encoded_label=func
    )
    mapper_plotter.create_mapper_plot(choose=name, avg=book['avg'], size_threshold=50, plot_type='spring')
    full_info, outliers = mapper_plotter.extract_data()
    # Forward the entry's translation table; it was configured per entry but
    # never handed to map_colors. An empty dict matches the en={} call used for
    # the single-plot cell.
    mapper_plotter.map_colors(threshold=0, en=book.get('en', {}))
    mapper_plotter.plot(set_label=True, size=500, anchor=(0,0),
                        # save_path=f"../ComputedData/ForMatrixTest/47_{name}.png"
                        )