This notebook optimizes TDA.ipynb. The main change is that three functions are combined as the Mapper filter function:
1. Eccentricity (L∞ centrality)
2. PCA
3. KDE

In [None]:
import os

# Change the process working directory to the "utils" folder that sits next to
# the current directory. Later cells use relative imports and "../..." paths,
# which presumably rely on this location -- confirm before moving the notebook.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

os.chdir(analyze_path)

In [None]:
import ast
import pandas as pd

import pickle
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tdam.cover import CubicalCover
from tdam.clustering import FailSafeClustering
from tdam.core_old import MapperAlgorithm

from TrafficTDApythonUtils.utils_v3 import *
from TrafficTDApythonUtils.plots import *

In [None]:
import geopandas as gpd
from shapely import wkt
from utils import read_data

# Project helper; what it loads is not visible from this notebook.
combined_data = read_data()
# EPSG code used as the common CRS below (the name suggests TWD97 / TM2 -- confirm).
TM2 = 3826
# Boundary shapefile (file name suggests township-level polygons -- confirm).
taiwan = gpd.read_file('../Data/OFiles_9e222fea-bafb-4436-9b17-10921abc6ef2/TOWN_MOI_1140318.shp')
# Drop the listed townships and counties (presumably offshore / outlying
# areas -- confirm), then reproject the remaining polygons to TM2.
taiwan = taiwan[(~taiwan['TOWNNAME'].isin(['旗津區', '頭城鎮', '蘭嶼鄉', '綠島鄉', '琉球鄉'])) & 
                (~taiwan['COUNTYNAME'].isin(['金門縣', '連江縣', '澎湖縣']))].to_crs(TM2)
# Merge the polygons into one geometry per county (COUNTYNAME becomes the index).
taiwan_cnty = taiwan[['COUNTYNAME','geometry']].dissolve(by='COUNTYNAME')
# Zero-width buffer; presumably used to repair invalid geometries left by the dissolve.
taiwan_cnty['geometry'] = taiwan_cnty.buffer(0)

# Per-region accidents and their indices, originally computed on a 0.001 grid,
# together with the GI computed from hex_grid.
grid_gi_df = pd.read_csv('../ComputedData/Grid/grid_gi.csv')
# The CSV stores these two columns as text: parse the literal back into a
# Python object and the WKT string back into a shapely geometry.
grid_gi_df['accident_indices'] = grid_gi_df['accident_indices'].apply(ast.literal_eval)
grid_gi_df['geometry'] = grid_gi_df['geometry'].apply(wkt.loads)
# Label the geometries as TM2 (set_crs assigns the CRS, it does not reproject).
grid_gi  = gpd.GeoDataFrame(grid_gi_df, geometry='geometry').set_crs(TM2, allow_override=True)
# Represent every grid cell by its centroid so the join below is point-in-polygon.
grid_gi['geometry'] = grid_gi.geometry.centroid

# Left join keeps every grid cell; cells whose centroid is within no county get NaN.
county_join = gpd.sjoin(grid_gi[['geometry']], taiwan_cnty, how='left', predicate='within')
# Assignment aligns on the index shared by grid_gi and county_join.
grid_gi['COUNTYNAME'] = county_join['COUNTYNAME']

# Keep only grid cells whose accident_indices entry is non-empty.
grid_filter = grid_gi[grid_gi['accident_indices'].str.len() > 0]
# Renumber the rows from 0; the previous index is kept as a column (drop is not set).
# NOTE(review): presumably done so rows line up with all_features_df -- confirm.
grid_filter.reset_index(inplace=True)

In [None]:
# Feature table used below as input to PCA, the centrality filter and Mapper.
# NOTE(review): assumed to hold one row per row of grid_filter -- confirm.
all_features_df = pd.read_csv("../ComputedData/ForModel/all_features_filtered.csv")

## 1. PCA

In [None]:
# Number of principal components kept as filter dimensions.
pc = 3

# Fit a single PCA estimator and reuse it for both the projection and the
# diagnostics. Fitting twice doubles the work, and the reported variance
# ratios would come from a different fit than the projection actually used.
pca = PCA(pc)
filter_pca = pca.fit_transform(all_features_df)

# Share of variance explained by each kept component, and their total.
ratios = pca.explained_variance_ratio_
print(ratios)
print(ratios.sum())

## 2. KDE

In [None]:
from sklearn.neighbors import KernelDensity

# Estimate the density in the PCA-projected space (the pc components computed above).
X = filter_pca # 3 pc
# Gaussian kernel; no bandwidth is passed, so the estimator's default is used.
kde = KernelDensity(kernel='gaussian').fit(X)

In [None]:
# numpy is used here but is only imported explicitly in a later cell.
import numpy as np

# Log-density of every sample under the fitted KDE, and the plain density.
log_density = kde.score_samples(X)
density = np.exp(log_density)

# Rank-normalize to [0, 1]. Ranks are taken on the log-density: exp() is
# monotonic so the ordering is the same, but very small densities can
# underflow to 0.0 in `density` and would then be ranked as arbitrary ties.
n_samples = len(log_density)
positions = np.argsort(np.argsort(log_density)).astype(float)
if n_samples > 1:
    rank = positions / (n_samples - 1)
else:
    # A single sample has no meaningful rank; avoid dividing by zero.
    rank = np.zeros(n_samples, dtype=float)
filter_kde = rank.reshape(-1, 1)

## 3. Centrality

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

def linf_centrality_exact(df, block_size=2000, *, metric="cosine"):
    """Return the exact L-infinity centrality (eccentricity) of every row.

    For each row of ``df`` the result is the largest pairwise distance from
    that row to any *other* row. Distances are computed block by block so that
    only a ``block_size x n`` distance matrix is held in memory at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table, one observation per row.
    block_size : int, default 2000
        Number of rows whose distances to all rows are computed per iteration.
    metric : str, default "cosine"
        Metric name forwarded to ``sklearn.metrics.pairwise_distances``
        (for example ``"cosine"`` or ``"euclidean"``). Rows are L2-normalized
        first only when the metric is ``"cosine"``; other metrics see the raw
        values.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 1)``. With a single row there is no other point,
        so its value is ``-inf``.

    Raises
    ------
    ValueError
        If ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    X = df.to_numpy(dtype=float)
    n = X.shape[0]
    if metric == "cosine":
        # Unit-length rows; normalizing for other metrics would change the distances.
        X = normalize(X, norm="l2", axis=1)

    # Farthest distance found for each row, starting from negative infinity.
    max_d = np.full(n, -np.inf, dtype=float)

    # Each row belongs to exactly one block, so its maximum is final once
    # that block has been processed.
    for start in range(0, n, block_size):
        stop = min(start + block_size, n)
        D_blk = pairwise_distances(X[start:stop], X, metric=metric)  # (b, n)
        # Mask each row's distance to itself so it cannot be picked as the max.
        D_blk[np.arange(stop - start), np.arange(start, stop)] = -np.inf
        max_d[start:stop] = D_blk.max(axis=1)

    return max_d.reshape(-1, 1)

In [None]:
# One centrality value per row of the feature table, shape (n, 1).
filter_centrality = linf_centrality_exact(all_features_df)

In [None]:
# Stack the filters column-wise: centrality (1 column), KDE rank (1 column),
# then the PCA components.
filter_full = np.concatenate([filter_centrality, filter_kde, filter_pca], axis=1)
# Display the resulting (rows, columns) shape.
filter_full.shape

# Mapper

In [None]:
# Parameter grid for the Mapper cover: every (overlap, interval) pair is run once.
overlaps = [2]
intervals = [10]
detailed_results = []
silhouette_for_intervals = []

for overlap in overlaps:
    for interval in intervals:
        print(f"Processing overlap {overlap}, interval {interval}")

        # Build the pipeline pieces one at a time; `overlap` is given in
        # tenths and converted to a fraction for the cover.
        cover_cfg = CubicalCover(n_intervals=interval, overlap_frac=overlap / 10)
        clusterer = FailSafeClustering(KMeans(n_clusters=2, random_state=42))
        mapper_algo = MapperAlgorithm(cover=cover_cfg, clustering=clusterer, n_jobs=14)

        # Features are the data to cluster; filter_full is the lens.
        mapper_info = mapper_algo.fit_transform(all_features_df.to_numpy(), filter_full)

        # The second element of the returned value is stored as the silhouette.
        sil_value = mapper_info[1]
        silhouette_for_intervals.append(sil_value)

        result = {
            "overlap": overlap,
            "interval": interval,
            "silhouette": sil_value,
            "mapper_info": mapper_info,
        }
        detailed_results.append(result)

        # Persist each run so the plotting cell can reload it without recomputing.
        out_path = f"../ComputedData/ForMatrixV2/o{overlap}i{interval}.pkl"
        with open(out_path, 'wb') as file:
            pickle.dump(result, file)

# One row per parameter combination.
detailed_results_df = pd.DataFrame(detailed_results)

In [None]:
# Display the column names of the feature table.
all_features_df.columns

In [None]:
# Display the filtered grid table.
grid_filter

In [None]:
# Attach each grid cell's county name as a label column for colouring the plot.
# The assignment aligns on the index, so this assumes all_features_df and
# grid_filter share the same 0..n-1 row order -- TODO confirm.
all_features_df['county'] = grid_filter['COUNTYNAME']

In [None]:
# Counter is needed by most_common_encoded_label and is not imported explicitly
# anywhere earlier in the notebook.
from collections import Counter


def avg_label(data):
    """Return the arithmetic mean of ``data``, or 0 when ``data`` is empty."""
    return sum(data) / len(data) if len(data) > 0 else 0


def most_common_encoded_label(data):
    """Return the most frequent item of ``data``."""
    most_common_item = Counter(data).most_common(1)[0][0]
    return most_common_item


# Alternative colouring by hotspot / YouBike facility (kept for reference):
# all_features_df['hotspot'] = grid_filter['hotspot']
# all_features_df['hotspot'] = all_features_df['hotspot'].apply(lambda x: 'Hotspot' if 'Hotspot' in str(x) else 'Not Hotspot')
# all_features_df['youbike'] = all_features_df['youbike_100m_count_mean'].apply(lambda x: 'Facility' if x>0 else 'No Facility')
# all_features_df['hotspot_youbike'] = all_features_df['hotspot'] + '_' + all_features_df['youbike']

# NOTE(review): these parameters select o3i9.pkl, while the Mapper cell in this
# notebook currently writes o2i10.pkl -- make sure the requested file exists.
overlaps = [3]
intervals = [9]
seeds = list(range(41, 300))  # candidate layout seeds; only seed 53 is plotted below
# o3i9s63

# Column of all_features_df used to colour the Mapper nodes.
# choose = 'hotspot_youbike'
choose = 'county'

for seed in [53]:
    for overlap in overlaps:
        for interval in intervals:
            # Close the file deterministically instead of leaking the handle.
            # pickle can execute arbitrary code while loading: only load files
            # produced by this project's own Mapper cell.
            with open(f"../ComputedData/ForMatrixV2/o{overlap}i{interval}.pkl", "rb") as file:
                # Despite the name, this is the result dict saved per run,
                # not a DataFrame.
                detailed_results_df = pickle.load(file)

            mapper_plotter = MapperPlotter(detailed_results_df['mapper_info'],
                                           all_features_df, seed=seed, iterations=50, dim=2,
                                           range_lst=[-0.5, 0.5, 0.5, -0.5])

            # Label every node with the most frequent value of the chosen column.
            mapper_plot = mapper_plotter.create_mapper_plot(choose, most_common_encoded_label, avg=False)
            full_info = mapper_plotter.extract_data()
            mapper_plotter.map_colors(choose, size=50, threshold=0)
            mapper_plotter.plot(choose, avg=False, set_label=True, size=500, anchor=1,
                                # save_path=f"../ComputedData/ForMatrixV2/Plots/o{overlap}i{interval}s{seed}.png"
                                )