This notebook is an optimized version of the TDA.ipynb file. The main change is the use of three filter functions:
1. Eccentricity
2. PCA
3. KDE

In [1]:
import os

# Build the path of the "utils" directory that sits next to the current
# working directory (i.e. <parent of cwd>/utils).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

# Switch the process working directory to "utils". Every relative path used
# later in this notebook (e.g. "../ComputedDataV2/...") is resolved from here.
# NOTE(review): presumably this is also what makes `from utils import ...` and
# `from config import ...` importable — confirm against the project layout.
os.chdir(analyze_path)

import ast
import pandas as pd
from functools import partial

import pickle
from sklearn.cluster import KMeans
from tdam.cover import CubicalCover
from tdam.clustering import FailSafeClustering
from tdam.core_old import MapperAlgorithm

In [2]:
from utils import read_data, read_taiwan_specific

# Load the Taiwan-specific data. The second return value (`grid_filter`) is used
# below as a table with "accident_indices", "hotspot" and "COUNTYNAME" columns.
# NOTE(review): presumably one row per grid cell — confirm in utils.
taiwan, grid_filter = read_taiwan_specific(read_grid=True)

In [3]:
combined_data = read_data()

# Expand the per-grid list of accident indices into one row per accident so the
# grid-level hotspot flag and county name can be attached to each accident.
grid_exploded = grid_filter.explode("accident_indices")
mapping_df = grid_exploded.loc[:, ["accident_indices", "hotspot", "COUNTYNAME"]]

# Left-join on the accident index, drop the join key, and keep only accidents
# that received both a hotspot flag and a county name.
combined_data = (
    combined_data
    .merge(mapping_df, how="left", left_index=True, right_on="accident_indices")
    .drop(columns=["accident_indices"])
    .dropna(subset=["hotspot", "COUNTYNAME"])
)

# Optional export (disabled):
# combined_data.to_csv('../ComputedDataV2/ForModel/combined_data_with_hotspot.csv', index=False)

In [4]:
# all_featuresV1 is the all_features table with the hotspot column(s) added.
all_features_df = pd.read_csv("../ComputedDataV2/ForModel/all_featuresV1.csv")

# Column groups (selected by name pattern) that are removed from the feature
# table. All three selections are made before anything is dropped.
cols, cols2, cols3 = (
    all_features_df.columns[all_features_df.columns.str.contains(pattern)]
    for pattern in ('事故位置大類別名稱', '號誌動作', 'hotspot')
)
all_features_df = (
    all_features_df
    .drop(columns=cols)
    .drop(columns=cols2)
    .drop(columns=cols3)
)

# Take the speed column out of the feature table but keep it for later use.
speed = all_features_df.pop('original_speed')

# Mapper
Obtain `filtered_data` by running `FilterforMapper.py` before executing the next cell.

In [None]:
# Load the precomputed filter values and discard the 'pc4' and 'pc5' columns;
# the remaining columns are passed to the Mapper as its second argument.
filter_full = pd.read_csv("../ComputedDataV2/ForModel/filtered_dataV1.csv")
filter_full.drop(columns=['pc4', 'pc5'], inplace=True)

# Parameter grid. `overlap` is expressed in tenths: it is divided by 10 before
# being passed as `overlap_frac` (4 -> 0.4).
overlaps = [4]
intervals = [9]
detailed_results = []
silhouette_for_intervals = []

for overlap in overlaps:
    for interval in intervals:
        print(f"Processing overlap {overlap}, interval {interval}")
        # Mapper with a cubical cover and 2-cluster KMeans (fixed random_state
        # for repeatability), wrapped in the project's FailSafeClustering.
        mapper_algo = MapperAlgorithm(
            cover=CubicalCover(
                n_intervals=interval,
                overlap_frac=overlap / 10
            ),
            clustering=FailSafeClustering(
                KMeans(
                    n_clusters=2,
                    random_state=42
                )
            ),
            n_jobs=-1
        )

        # Feature matrix as a NumPy array, filter values as a DataFrame.
        mapper_info = mapper_algo.fit_transform(all_features_df.to_numpy(), filter_full)

        # NOTE(review): element 1 of the result is stored as the silhouette
        # score — confirm against tdam.core_old.MapperAlgorithm.fit_transform.
        silhouette_for_intervals.append(mapper_info[1])
        result = {
            "overlap": overlap,
            "interval": interval,
            "silhouette": mapper_info[1],
            "mapper_info": mapper_info
        }
        detailed_results.append(result)

        # Persist each run as o<overlap>i<interval>.pkl so later cells can
        # reload it without recomputing the Mapper.
        with open(f"../ComputedDataV2/ForMatrix/o{overlap}i{interval}.pkl", 'wb') as file:
            pickle.dump(result, file)

# One row per (overlap, interval) run. A later cell reuses this variable name
# for a single unpickled result dict.
detailed_results_df = pd.DataFrame(detailed_results)

In [5]:
# Name prefix of the one-hot columns whose dominant category is extracted.
col = '道路類別-第1當事者-名稱'


def get_max_categories(row, cols=None):
    """Return the category name(s) holding the largest value in ``row``.

    Parameters
    ----------
    row : pandas.Series
        One row of ``all_features_df``.
    cols : index-like of column labels, optional
        Columns to compare. When omitted, the columns of ``all_features_df``
        whose name contains the global ``col`` are looked up (the original
        behaviour). Passing it avoids repeating that lookup for every row.

    Returns
    -------
    str
        The part after the last underscore of every column tied for the
        maximum, joined with commas. Empty when no value equals the maximum
        (e.g. all values are NaN).
    """
    if cols is None:
        cols = all_features_df.columns[all_features_df.columns.str.contains(col)]

    values = row[cols]
    top_columns = values[values == values.max()].index
    # Keep only the category name after the last underscore, comma-joined.
    return ','.join(label.split('_')[-1] for label in top_columns)


# Resolve the candidate columns once instead of once per row; DataFrame.apply
# forwards the extra keyword argument to the function.
category_cols = all_features_df.columns[all_features_df.columns.str.contains(col)]
all_features_df['Most common in node'] = all_features_df.apply(
    get_max_categories, axis=1, cols=category_cols
)
all_features_df['Most common in node'].value_counts()

def _label_by_substring(series, needle, hit, miss):
    """Label each value ``hit`` if its string form contains ``needle``, else ``miss``."""
    return series.apply(lambda value: hit if needle in str(value) else miss)


# NOTE(review): these assignments align on the index — assumes all_features_df
# and grid_filter share the same row index; confirm.
all_features_df['county'] = grid_filter['COUNTYNAME']
all_features_df['hotspot'] = grid_filter['hotspot']
all_features_df['county_city'] = _label_by_substring(
    all_features_df['county'], '市', 'City', 'County'
)
all_features_df['hotspot'] = _label_by_substring(
    all_features_df['hotspot'], 'Hotspot', 'Hotspot', 'Not Hotspot'
)

# Facility flag: '1' when the YouBike count column is positive, otherwise '0'.
facility_code = (all_features_df['youbike_100m_count_mean'] > 0).map({True: '1', False: '0'})
all_features_df['facility'] = facility_code.astype(int)
# The combined label uses the string form of the flag, e.g. 'Hotspot_1'.
all_features_df['hotspot_facility'] = all_features_df['hotspot'] + '_' + facility_code

# Restore the speed column that was set aside when the features were loaded.
all_features_df['original_speed'] = speed

# bn_feature is 1 when ALL of the following hold for a row, otherwise 0:
#   * at least one road-type column is positive,
#   * at least one signal-type column is positive,
#   * the urban-road column is positive,
#   * original_speed is below 60,
#   * facility is positive.
# Computed with vectorized column operations instead of a row-wise apply;
# comparisons against NaN are False in both forms, so the result is unchanged.
road_type_cols = [
    '道路型態大類別名稱_單路部分',
    '道路型態大類別名稱_其他',
    '道路型態大類別名稱_圓環廣場',
    '道路型態大類別名稱_平交道',
    '道路型態大類別名稱_交岔路',
]
signal_type_cols = [
    '號誌-號誌種類名稱_行車管制號誌(附設行人專用號誌)',
    '號誌-號誌種類名稱_行車管制號誌',
    '號誌-號誌種類名稱_閃光號誌',
    '號誌-號誌種類名稱_無號誌',
]

all_features_df['bn_feature'] = (
    (all_features_df[road_type_cols] > 0).any(axis=1)
    & (all_features_df[signal_type_cols] > 0).any(axis=1)
    & (all_features_df['道路類別-第1當事者-名稱_市區道路'] > 0)
    & (all_features_df['original_speed'] < 60)
    & (all_features_df['facility'] > 0)
).astype('int64')

# 'lane' is 1 when the person-vs-vehicle accident-type column is positive,
# otherwise 0 (vectorized; a NaN value yields 0, as with the row-wise version).
#
# Lane-marking conditions that were previously part of this rule and are
# currently disabled (each required the column to be > 0):
#   '車道劃分設施-分向設施大類別名稱_無'
#   '車道劃分設施-分道設施-快車道或一般車道間名稱_未繪設車道線'
#   '車道劃分設施-分道設施-快慢車道間名稱_未繪設快慢車道分隔線'
#   '車道劃分設施-分道設施-路面邊線名稱_無'
all_features_df['lane'] = (
    all_features_df['事故類型及型態大類別名稱_人與車'] > 0
).astype('int64')


In [None]:
# Sanity check for the conditional-probability labels: binarize the
# traffic-control-signal column at 0.5 and count each (hotspot, flag) pair.
signal_col = '號誌-號誌種類名稱_行車管制號誌'
test = all_features_df[['hotspot', signal_col]].copy()
test[signal_col] = (test[signal_col] > 0.5).astype('int64')
test[['hotspot', signal_col]].value_counts()

In [None]:
from config import category_value_map
# Mapping for the category named by `col`.
# NOTE(review): assumes category_value_map[col] is a dict of
# original value -> English label — confirm in config.
# The previous fallback was `col` itself (a str), which cannot take item
# assignment; fall back to an empty dict so the line below always works.
cat_en = category_value_map.get(col)
if cat_en is None:
    cat_en = {}
# Extra label for the comma-joined name produced when two categories tie.
cat_en['村里道路,市區道路'] = 'Village road, Urban road'

In [None]:
from TrafficTDApythonUtils.plotsv2 import MapperPlotterSpring
from utils_tda import avg_label, most_common_encoded_label, cond_prob_mixed, ratio_in_data

oi = 'o4i9'
choose = 'Most common in node'

# Reload one saved Mapper run. Despite the "_df" suffix this is the result
# dict written by the Mapper cell (keys: overlap, interval, silhouette,
# mapper_info), not a DataFrame.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project. The context manager closes the file handle after loading.
with open(f"../ComputedDataV2/ForMatrix/{oi}.pkl", "rb") as file:
    detailed_results_df = pickle.load(file)

In [None]:
# Single colouring used while searching for a good layout seed:
# ratio_in_data with col='hotspot', values='Hotspot'.
books = {
    'hotspot': dict(
        label=ratio_in_data,
        avg=True,
        params=dict(col='hotspot', values='Hotspot'),
        color='Greens',
        en={},
    ),
}

c = 2

# Render the layout for seeds 50..99, one image per seed. The file name only
# contains the seed, so with several books the last one would win.
for i in range(50, 100):
    for name, book in books.items():
        # Bind the per-book parameters to the label function when provided.
        func = partial(book['label'], **book['params']) if 'params' in book else book['label']

        mapper_plotter = MapperPlotterSpring(
            detailed_results_df['mapper_info'],
            all_features_df,
            seed=i,
            iterations=50,
            dim=2,
            range_lst=[-c, c, c, -c],
            cmap=book['color'],
            encoded_label=func,
        )
        mapper_plotter.create_mapper_plot(
            choose=name,
            avg=book['avg'],
            size_threshold=50,
            plot_type='spring',
        )
        full_info, outliers = mapper_plotter.extract_data()
        mapper_plotter.map_colors(threshold=0)
        mapper_plotter.plot(
            set_label=True,
            size=500,
            anchor=(0, 0),
            save_path=f"../ComputedDataV2/plot/{i}.png",
        )

### 這是在獲取最佳拓樸圖後要跑的結果

In [None]:
def _hotspot_cond_book(b_col, color):
    """Book entry using cond_prob_mixed with a='hotspot' is 'Hotspot' and b=`b_col` > 0."""
    return {
        'label': cond_prob_mixed,
        'avg': True,
        'params': {
            'a_col': 'hotspot', 'a_is': 'Hotspot',
            'b_col': b_col, 'b_rule': '>0',
        },
        'color': color,
        'en': {},
    }


def _ratio_book(column, value, color):
    """Book entry using ratio_in_data for `column` and `value`."""
    return {
        'label': ratio_in_data,
        'avg': True,
        'params': {'col': column, 'values': value},
        'color': color,
        'en': {},
    }


# Colourings rendered on the chosen layout; insertion order is the plot order.
books = {
    ('hotspot', 'lane'): _hotspot_cond_book('lane', 'Blues'),
    ('hotspot', '號誌-號誌種類名稱_行車管制號誌'): _hotspot_cond_book(
        '號誌-號誌種類名稱_行車管制號誌', 'Blues'
    ),
    ('hotspot', 'facility'): _hotspot_cond_book('facility', 'Blues'),
    'hotspot': _ratio_book('hotspot', 'Hotspot', 'Greens'),
    'bn_feature': _ratio_book('bn_feature', 1, 'Reds'),
    ('hotspot', 'bn_feature'): _hotspot_cond_book('bn_feature', 'Reds'),
    'county_city': _ratio_book('county_city', 'City', 'Greens'),
    # avg_label takes no extra parameters, so this entry has no 'params' key.
    'original_speed': {
        'label': avg_label,
        'avg': True,
        'color': 'Greens',
        'en': {},
    },
}

c = 0.5

# Render every book on the fixed layout (seed 17). A failure in one book is
# reported and the loop moves on to the next one.
# Note: for tuple keys the file name contains the tuple's repr,
# e.g. "17_('hotspot', 'lane').png".
for name, book in books.items():
    try:
        # Bind the per-book parameters to the label function when provided.
        func = partial(book['label'], **book['params']) if 'params' in book else book['label']

        mapper_plotter = MapperPlotterSpring(
            detailed_results_df['mapper_info'],
            all_features_df,
            seed=17,
            iterations=100,
            dim=2,
            range_lst=[-c, c, c, -c],
            cmap=book['color'],
            encoded_label=func,
        )
        mapper_plotter.create_mapper_plot(
            choose=name,
            avg=book['avg'],
            size_threshold=50,
            plot_type='spring',
        )
        full_info, outliers = mapper_plotter.extract_data()
        mapper_plotter.map_colors(threshold=0)
        mapper_plotter.plot(
            set_label=True,
            size=500,
            anchor=(0, 0),
            save_path=f"../ComputedDataV2/plot/17_{name}.png",
        )
    except Exception as e:
        print(f"Error processing {name}: {e}")

## Correlation Coefficient

In [None]:
# Two cond_prob_mixed colourings (hotspot vs. lane, hotspot vs. facility)
# whose per-node values are compared in the following cells.
books = {
    ('hotspot', b_col): {
        'label': cond_prob_mixed,
        'avg': True,
        'params': {
            'a_col': 'hotspot', 'a_is': 'Hotspot',
            'b_col': b_col, 'b_rule': '>0',
        },
        'color': 'Blues',
        'en': {},
    }
    for b_col in ('lane', 'facility')
}

# Extracted plot data (`full_info`) per book, keyed by the book's key.
data_for_association = {}

for name, book in books.items():
    # Bind the per-book parameters to the label function when provided.
    func = partial(book['label'], **book['params']) if 'params' in book else book['label']

    mapper_plotter = MapperPlotterSpring(
        detailed_results_df['mapper_info'],
        all_features_df,
        seed=47,
        iterations=130,
        dim=2,
        range_lst=[-0.5, 0.5, 0.5, -0.5],
        cmap=book['color'],
        encoded_label=func,
    )
    mapper_plotter.create_mapper_plot(
        choose=name,
        avg=book['avg'],
        size_threshold=50,
        plot_type='spring',
    )
    full_info, outliers = mapper_plotter.extract_data()
    mapper_plotter.map_colors(threshold=0)
    # The figure is only displayed here; no save_path is passed.
    mapper_plotter.plot(set_label=True, size=500, anchor=(0, 0))

    data_for_association[name] = full_info

In [None]:
# 'color' values of the two colourings: x from (hotspot, lane),
# y from (hotspot, facility).
x, y = (
    data_for_association[pair]['color']
    for pair in (('hotspot', 'lane'), ('hotspot', 'facility'))
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress

# Linear regression of y on x; the fitted line is evaluated at the observed x.
slope, intercept, r_value, p_value, std_err = linregress(x, y)
line = intercept + slope * x

# Scatter of the data with the fitted line; the legend shows the r value.
# NOTE(review): the axis labels below may not match the plotted series
# (x comes from ('hotspot', 'lane'), y from ('hotspot', 'facility')) — confirm.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=x, y=y, color='#37bd6f', label='Data points', ax=ax)
ax.plot(x, line, color='#3744bd', label=f'Regression (r={r_value:.2f})')

ax.set_xlabel("hotspot")
ax.set_ylabel("combined feature")
ax.legend()
ax.grid(True)
plt.show()