ComputedDataV3: includes facility features (youbike, parkinglot, mrt) <br/>
ComputedDataV4: facility features removed

In [None]:
import os

# Move the working directory to the sibling ``utils`` folder (one level above
# the directory the notebook was started from). The relative ``../`` paths used
# later in the notebook are resolved from there; presumably this is also what
# lets the local helper modules be imported.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
analyze_path = os.path.join(parent_dir, "utils")
os.chdir(analyze_path)

In [None]:
import joblib
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.rcParams['font.family'] = ['Arial Unicode Ms']

from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score
import torch
from utils_nn import BinaryMLP, predict_nn
from utils_model import model_preprocess, get_importance

from utils import read_data
from config import cause_mapping
# Version tag; not referenced in the cells shown here (the model file names
# further down hard-code "V1" instead of reading this variable).
version = "V1"
# Name of the data folder whose ``ModelPerformance`` sub-folder the trained
# models are loaded from.
computeddata = 'ComputedDataV3'

In [None]:
# Load the combined dataset through the project's reader helper.
combined_data = read_data()


def map_cause(cause):
    """Return the ``cause_mapping`` category that lists *cause*.

    Categories are scanned in the mapping's iteration order and the first one
    containing the value wins. ``"Unknown"`` is returned when none matches.
    """
    hits = (group for group, members in cause_mapping.items() if cause in members)
    return next(hits, "Unknown")


# Derive a coarse cause category from the primary cause sub-category column.
combined_data["cause_group"] = combined_data["肇因研判子類別名稱-主要"].apply(map_cause)

本資料夾主要完成三個模型：
- 模型1: (一維)人車路
- 模型2: (二維)人車沒有路 (加法、乘法交互) (一維有路但沒交互&一維沒有路且沒交互)
- 模型3: (三維)人車路 (加法、乘法交互)

> 注意：需要去掉市區道路（道路類別-第1當事者-名稱）、縣市（county）這類欄位，專注在道路設計（路）、肇因（人）、車種（車）。


In [None]:
# Load the model-ready feature table.
all_features_df = pd.read_csv("../ComputedDataV2/ForModel/all_featuresV2.csv")

# Columns whose name matches any of these regex alternatives are dropped so the
# models only see road-design, cause and vehicle-type information.
excluded_pattern = '道路類別-第1當事者-名稱|county|路面狀況|道路障礙|車輛撞擊部位大類別名稱|original-speed'
is_excluded = all_features_df.columns.str.contains(excluded_pattern)
all_features_df = all_features_df[all_features_df.columns[~is_excluded]]
# all_features_df = all_features_df[all_features_df.columns[~all_features_df.columns.str.contains('道路類別-第1當事者-名稱|county|youbike|mrt|parkinglot')]] # for ComputedDataV4
print(all_features_df.shape)

# Keyword lists: a dataframe column belongs to a group when its name contains
# one of the group's keywords.
road_features = [
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',
    '車道劃分設施-分向設施大類別名稱',
    '事故類型及型態大類別名稱',
    '道路型態大類別名稱',
    '號誌-號誌種類名稱',
    '速限-第1當事者',
    'youbike_100m_count',
    'mrt_100m_count',
    'parkinglot_100m_count',
]
car_features = ['當事者區分-類別-大類別名稱-車種']
person_features = ['cause-group']

# Fail fast if any keyword matches no column at all.
for feature in road_features + car_features + person_features:
    match_found = any(feature in column for column in all_features_df.columns)
    print(f"Checking feature: {feature}, Found: {match_found}")

    assert match_found, f"Feature {feature} not found in dataframe columns"

In [None]:
# Reverse check of the keyword lists: report dataframe columns that are not
# claimed by any keyword (substring match).
defined_keywords = road_features + car_features + person_features
matched_cols = {
    col
    for col in all_features_df.columns
    if any(kw in col for kw in defined_keywords)
}

all_cols = set(all_features_df.columns)
extra_cols = all_cols - matched_cols

print(f"Dataframe 總欄位數: {len(all_cols)}")
print(f"已被關鍵字涵蓋的欄位數: {len(matched_cols)}")
print(f"額外欄位數: {len(extra_cols)}")
print("-" * 30)

if extra_cols:
    print("額外欄位:")
    for col in sorted(extra_cols):
        print(col)
else:
    print("所有欄位都已在列表")

In [None]:
# all_features_df.to_csv(f"../ComputedDataV4/ForModel/all_features.csv", index=False)

In [None]:
def _model_spec(dim, base_road, interaction_type):
    """Build one model configuration entry.

    Every entry shares the vehicle and person keyword lists and starts with an
    empty ``model`` list (a fresh list per entry) that a later cell fills with
    the loaded estimators.
    """
    return {
        'grid_filter': None,  # None is set to not include county data
        'dim': dim,
        'base_road': base_road,
        'base_vehicle': car_features,
        'base_person': person_features,
        'interaction_type': interaction_type,
        'model': [],
    }


# Model 1: single features only. Model 2: vehicle/person without road features,
# with multiplicative or additive interactions. Model 3: road/vehicle/person
# with multiplicative or additive interactions.
dct_model = {
    'model_1': _model_spec('1way', road_features, None),
    'model_2_multiply_cp': _model_spec('2way', None, 'multiply'),
    'model_2_add_cp': _model_spec('2way', None, 'add'),
    'model_3_multiply': _model_spec('mixed', road_features, 'multiply'),
    'model_3_add': _model_spec('mixed', road_features, 'add'),
}

In [None]:
def get_extracted_features(lr, rf, X_resampled_test):
    """Pick the reduced feature set used to re-fit the interaction models.

    Every single (non-interaction) feature is kept. For two-way and three-way
    interaction features, the 20 highest-ranked by absolute importance are
    taken from each model and the two selections are merged.

    Args:
        lr: Fitted model exposing ``feature_names_in_``; its feature order is
            used to align ``X_resampled_test`` before importances are computed.
        rf: Second fitted model, ranked on the same aligned matrix.
        X_resampled_test: DataFrame of features. Columns missing relative to
            ``lr.feature_names_in_`` are added and filled with 0.

    Returns:
        list: Feature names ordered as single features, then two-way, then
        three-way. Within each group the order is first-seen (LR's picks
        before RF's), so the result is reproducible across interpreter
        sessions. De-duplicating through ``set`` made it depend on string
        hashing.

    Note:
        Assumes ``get_importance`` returns ``(mapping, grouped)`` where the
        mapping is keyed by feature name with two values per feature
        (score, exp_score) -- inferred from how the result is used here.
    """
    top_k = 20
    X_sorted = X_resampled_test.reindex(columns=lr.feature_names_in_, fill_value=0)

    def get_dimension(feature_name):
        # Interaction terms are joined with ' x '; more than three factors -> 0.
        x_count = feature_name.count(' x ')
        return x_count + 1 if x_count <= 2 else 0

    def rank(model):
        # One row per feature with its absolute score and interaction order.
        importance, _ = get_importance(model, X_sorted)
        ranked = pd.DataFrame.from_dict(importance, orient='index', columns=['score', 'exp_score'])
        ranked['abs_score'] = ranked['score'].abs()
        ranked['dimension'] = ranked.index.map(get_dimension)
        return ranked

    def pick(ranked, dimension, limit=None):
        # Feature names of one interaction order, optionally only the top `limit`.
        subset = ranked[ranked['dimension'] == dimension]
        if limit is not None:
            subset = subset.sort_values('abs_score', ascending=False).head(limit)
        return subset.index.tolist()

    def merge(first, second):
        # Order-preserving union (dicts keep insertion order).
        return list(dict.fromkeys(first + second))

    df_lr_ranked = rank(lr)
    df_rf_ranked = rank(rf)

    final_dim1 = merge(pick(df_lr_ranked, 1), pick(df_rf_ranked, 1))
    final_dim2 = merge(pick(df_lr_ranked, 2, top_k), pick(df_rf_ranked, 2, top_k))
    final_dim3 = merge(pick(df_lr_ranked, 3, top_k), pick(df_rf_ranked, 3, top_k))
    final_feature_list = final_dim1 + final_dim2 + final_dim3

    print(f"Original feature: {len(final_dim1)}")
    print(f"Two-way feature: {len(final_dim2)} (LR and RF union)")
    print(f"Three-way feature: {len(final_dim3)} (LR and RF union)")
    print(f"Final model feature total: {len(final_feature_list)}")

    print("top 3 way")
    print(pick(df_lr_ranked, 3, 5))

    return final_feature_list


In [None]:
# Use the "mps" backend when torch reports it as available, otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Model name -> suffix of the saved estimator files. The three-way models are
# stored under an "_extracted" suffix (re-fit on the reduced feature set).
file_suffix_map = {
    'model_1': 'model_1',
    'model_2_add_cp': 'model_2_add_cp',
    'model_2_multiply_cp': 'model_2_multiply_cp',
    'model_3_add': 'model_3_add_extracted',
    'model_3_multiply': 'model_3_multiply_extracted',
}

for model_name, config in dct_model.items():
    print(f"\n=== Processing: {model_name} ===")

    X_train, X_test, y_train, y_test, X_resampled_test, y_resampled_test, le = model_preprocess(
        all_features_df,
        grid_filter=config['grid_filter'],
        dim=config['dim'],
        base_road=config['base_road'],
        base_vehicle=config['base_vehicle'],
        base_person=config['base_person'],
        interaction_type=config['interaction_type'],
    )

    suffix = file_suffix_map.get(model_name, model_name)
    base_path = f'../{computeddata}/ModelPerformance'
    uses_refined_features = model_name in ('model_3_add', 'model_3_multiply')

    # NOTE: joblib.load / torch.load unpickle their input; only point them at
    # files you produced yourself.
    try:
        if uses_refined_features:
            # The full (un-suffixed) LR/RF models are needed only to derive the
            # reduced feature list, which fixes the NN input width below.
            # Loading them inside the ``try`` means a missing file is reported
            # and skipped like every other missing file, instead of aborting
            # the whole loop.
            lr = joblib.load(f'{base_path}/lr_modelV1_{model_name}.pkl')
            rf = joblib.load(f'{base_path}/rf_modelV1_{model_name}.pkl')
            final_feature_list = get_extracted_features(lr, rf, X_resampled_test)
            X_train_refined = X_train[final_feature_list]

        lr_path = f'{base_path}/lr_modelV1_{suffix}.pkl'
        lr_model = joblib.load(lr_path)
        print(f"  -> Loaded LR: {lr_path}")

        rf_path = f'{base_path}/rf_modelV1_{suffix}.pkl'
        rf_model = joblib.load(rf_path)
        print(f"  -> Loaded RF: {rf_path}")

        if uses_refined_features:
            input_dim = X_train_refined.shape[1]
            print(f"  -> Using refined input dim: {input_dim}")
        else:
            input_dim = X_train.shape[1]
            print(f"  -> Using input dim: {input_dim}")

        # The network must be built with the training-time input width before
        # its saved weights can be restored.
        nn_model = BinaryMLP(in_dim=input_dim).to(device)
        nn_path = f'{base_path}/nn_modelV1_{suffix}.pt'
        nn_model.load_state_dict(torch.load(nn_path, map_location=device))
        print(f"  -> Loaded NN: {nn_path} (dim={input_dim})")

        # Order matters: later cells index this list as [LR, RF, NN].
        config['model'] = [lr_model, rf_model, nn_model]

    except FileNotFoundError as e:
        # Leave config['model'] untouched and move on to the next model.
        print(f" [ERROR] File not found: {e}")

In [None]:
## Re fit for only important features
def print_results(proba_test, classes, y_resampled_test):
    """Print evaluation metrics for predicted probabilities.

    Prints the confusion matrix and the classification report. When exactly
    two probability columns are present, ROC AUC and average-precision
    (PR AUC) scores are printed as well, using the second column as the score.

    Args:
        proba_test: 2-D array of predicted class probabilities (one column per
            class); the predicted label is the arg-max over columns.
        classes: Class names. Their count defines the label range
            ``0 .. len(classes) - 1`` and they label the report rows.
        y_resampled_test: True labels of the (resampled) test set, encoded as
            integers in that range.

    Returns:
        The confusion matrix returned by ``confusion_matrix``.
    """
    y_pred = np.argmax(proba_test, axis=1)

    print("Confusion Matrix")
    conf_matrix = confusion_matrix(y_resampled_test, y_pred, labels=range(len(classes)))
    print(conf_matrix)

    print("Classification Report")
    print(classification_report(
        y_resampled_test, y_pred, target_names=classes, digits=3
    ))

    if proba_test.shape[1] == 2:
        # Binary classification: score with the probability of class 1.
        roc_auc = roc_auc_score(y_resampled_test, proba_test[:, 1])
        print(f'ROC AUC: {roc_auc:.3f}')
        y_test_bin = label_binarize(y_resampled_test, classes=range(len(classes)))
        # NOTE(review): with a single binary target the macro and weighted
        # averages are expected to coincide -- confirm whether both are needed.
        pr_auc_macro  = average_precision_score(y_test_bin, proba_test[:, 1], average='macro')
        pr_auc_weight = average_precision_score(y_test_bin, proba_test[:, 1], average='weighted')
        print(f'PR  AUC macro: {pr_auc_macro:.3f}')
        print(f'PR  AUC wighted: {pr_auc_weight:.3f}')

    return conf_matrix

# Evaluate every loaded model on its resampled test set and collect the
# confusion matrices and feature importances per model.
all_performance = {}
all_importance = {}
for model_name, config in dct_model.items():

    print(f"training model: {model_name}")
    if len(config['model']) != 3:
        # The loading cell leaves the list empty when a model file was missing.
        print(f" [SKIP] models for {model_name} were not loaded")
        continue

    X_train, X_test, y_train, y_test, X_resampled_test, y_resampled_test, le = model_preprocess(
        all_features_df,
        grid_filter=config['grid_filter'],
        dim=config['dim'],
        base_road=config['base_road'],
        base_vehicle=config['base_vehicle'],
        base_person=config['base_person'],
        interaction_type=config['interaction_type'],
    )
    lr_model, rf_model, nn_model = config['model']

    # All three estimators of a configuration use the same columns, so the LR
    # feature order is applied to all of them.
    feature_order_lr = lr_model.feature_names_in_

    # Align the test matrix directly to the columns the fitted models expect.
    # The previous version first cut the three-way models' matrix down to a
    # freshly re-ranked top-20 subset and then re-expanded it with
    # fill_value=0. Any expected feature that fell outside the new ranking was
    # therefore silently scored as all zeros. The reindex alone already drops
    # the surplus columns.
    X_resampled_reordered = X_resampled_test.reindex(columns=feature_order_lr, fill_value=0)

    proba_test_lr = lr_model.predict_proba(X_resampled_reordered)
    proba_test_rf = rf_model.predict_proba(X_resampled_reordered)
    proba_test_nn = predict_nn(nn_model, X_resampled_reordered)

    all_performance[model_name] = {
        'lr': print_results(proba_test_lr, le.classes_, y_resampled_test),
        'rf': print_results(proba_test_rf, le.classes_, y_resampled_test),
        'nn': print_results(proba_test_nn, le.classes_, y_resampled_test),
    }

    importance_lr, importance_grouped_lr = get_importance(lr_model, X_resampled_reordered)
    importance_rf, importance_grouped_rf = get_importance(rf_model, X_resampled_reordered)

    all_importance[model_name] = {
        'lr': [importance_lr, importance_grouped_lr],
        'rf': [importance_rf, importance_grouped_rf],
    }

In [None]:
# Rebuild the train/test matrices with the three-way multiplicative
# configuration; the cells below score and map this model.
_m3_cfg = dct_model['model_3_multiply']
X_train, X_test, y_train, y_test, X_resampled_test, y_resampled_test, le = model_preprocess(
    all_features_df,
    grid_filter=_m3_cfg['grid_filter'],
    dim=_m3_cfg['dim'],
    base_road=_m3_cfg['base_road'],
    base_vehicle=_m3_cfg['base_vehicle'],
    base_person=_m3_cfg['base_person'],
    interaction_type=_m3_cfg['interaction_type'],
)

In [None]:
import geopandas as gpd
from shapely.geometry import box
from shapely import wkt
import ast
TM2 = 3826

def read_taiwan_specific(read_grid=False):
    """Load the township polygons and, optionally, the hotspot grid.

    The township shapefile is read, the listed island townships and counties
    are removed, the result is projected to ``TM2`` and clipped to a bounding
    box.

    Args:
        read_grid: When true, also read the grid CSV, tag every cell with the
            county it lies within and keep only cells that have accidents.

    Returns:
        tuple: ``(taiwan, grid_filter)``. ``grid_filter`` is ``None`` when
        ``read_grid`` is false.
    """
    excluded_towns = ['蘭嶼鄉', '綠島鄉', '琉球鄉']
    excluded_counties = ['金門縣', '連江縣', '澎湖縣']

    townships = gpd.read_file('../Data/OFiles_9e222fea-bafb-4436-9b17-10921abc6ef2/TOWN_MOI_1140318.shp')
    is_excluded = (
        townships['TOWNNAME'].isin(excluded_towns)
        | townships['COUNTYNAME'].isin(excluded_counties)
    )
    taiwan = townships[~is_excluded].to_crs(TM2)

    # Clip to a box that keeps the data's own min-x / max-y but pins the lower
    # y bound and the upper x bound to fixed coordinates.
    min_x, _, _, max_y = taiwan.total_bounds
    clipper = gpd.GeoDataFrame(
        geometry=[box(min_x, 2400000, 380000, max_y)],
        crs=taiwan.crs,
    )
    taiwan = gpd.clip(taiwan, clipper)

    if not read_grid:
        return taiwan, None

    # One (repaired via buffer(0)) polygon per county.
    county_shapes = taiwan[['COUNTYNAME', 'geometry']].dissolve(by='COUNTYNAME')
    county_shapes['geometry'] = county_shapes.buffer(0)

    # Per-cell accidents and their indices, originally computed on a 0.001
    # grid, together with the GI results computed from hex_grid.
    grid_table = pd.read_csv('../ComputedDataV2/Grid/grid_giV1.csv')
    grid_table['accident_indices'] = grid_table['accident_indices'].apply(ast.literal_eval)
    grid_table['geometry'] = grid_table['geometry'].apply(wkt.loads)
    grid_gi = gpd.GeoDataFrame(grid_table, geometry='geometry').set_crs(TM2, allow_override=True)
    grid_gi['geometry'] = grid_gi.geometry  # .centroid intentionally disabled

    # Cells on the islands removed above find no county and get a null
    # COUNTYNAME from the left join.
    joined = gpd.sjoin(grid_gi[['geometry']], county_shapes, how='left', predicate='within')
    grid_gi['COUNTYNAME'] = joined['COUNTYNAME']

    has_accidents = grid_gi['accident_indices'].str.len() > 0
    grid_filter = grid_gi[has_accidents].reset_index()
    return taiwan, grid_filter


taiwan, grid_filter = read_taiwan_specific(read_grid=True)

## Use proba

In [None]:
import geopandas as gpd

# Score every sample (train and test stacked) with the random forest of the
# three-way multiplicative model (index 1 of the [LR, RF, NN] list).
full_interaction_df = pd.concat([X_train, X_test], ignore_index=True)

choose_model = dct_model['model_3_multiply']['model'][1]
fdata = full_interaction_df.reindex(columns=choose_model.feature_names_in_, fill_value=0)
proba_test_rf = choose_model.predict_proba(fdata)

# Attach geometry, county and hotspot label by index.
# NOTE(review): ``ignore_index=True`` above renumbers the rows 0..n-1, so this
# join is positional. It is only correct if the stacked train/test rows are in
# the same order as ``grid_filter`` -- confirm against ``model_preprocess``.
grid_columns = grid_filter[['geometry', 'COUNTYNAME', 'hotspot']]
all_features_gdf = gpd.GeoDataFrame(
    full_interaction_df.join(grid_columns, how='left'),
    geometry='geometry',
    crs=grid_filter.crs,
)

all_features_gdf['risk_prob'] = proba_test_rf[:, 1]
# 1 for anything not labelled 'Not Significant', 0 otherwise.
all_features_gdf['hotspot_binary_true'] = (
    all_features_gdf['hotspot'].ne('Not Significant').astype('int64')
)
# 0 when the predicted probability is below 0.5, 1 otherwise.
all_features_gdf['risk_binary'] = (
    (~(all_features_gdf['risk_prob'] < 0.5)).astype('int64')
)

print(classification_report(all_features_gdf['hotspot_binary_true'], all_features_gdf['risk_binary']))

print(all_features_gdf[['risk_binary', 'hotspot_binary_true']].value_counts(normalize=False))

In [None]:
import contextily as ctx
from mpl_toolkits.axes_grid1 import make_axes_locatable

def plot_hotspot_map(gdf_plot, target_feature, binary=False):
    """Draw *gdf_plot* on a basemap.

    Args:
        gdf_plot: GeoDataFrame to draw; it is reprojected to EPSG:3857 first.
        target_feature: Column that drives the colour scale. Only used when
            ``binary`` is false.
        binary: When true, every geometry is drawn in plain red without a
            legend. Otherwise geometries are coloured by ``target_feature``
            and a horizontal colour bar is placed under the map.
    """
    projected = gdf_plot.to_crs(epsg=3857)
    _, ax = plt.subplots(figsize=(15, 15), dpi=150)

    if not binary:
        # Dedicated axes under the map for the colour bar.
        colorbar_ax = make_axes_locatable(ax).append_axes("bottom", size="3%", pad=0.1)
        projected.plot(
            column=target_feature,
            ax=ax,
            cmap='YlOrRd',
            cax=colorbar_ax,
            alpha=0.5,
            legend=True,
            edgecolor='none',
            legend_kwds={'label': "Risk", 'orientation': "horizontal", 'shrink': 0.6},
        )
    else:
        projected.plot(ax=ax, color='red', alpha=0.5, edgecolor='none', legend=False)

    ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)
    ax.set_axis_off()

    # To save instead of only showing:
    # output_img = "risk_map_static.png"
    # plt.savefig(output_img, bbox_inches='tight', pad_inches=0.1)
    plt.show()

In [None]:
# Column to map. Alternatives that were used before:
# target_feature = '車道劃分設施-分道設施-路面邊線名稱_無 x 當事者區分-類別-大類別名稱-車種_小客車(含客、貨兩用) x cause-group_Decision'
# target_feature = 'hotspot_binary_true'
target_feature = 'risk_prob'

# Keep rows with a positive value in the chosen column that lie in 臺中市.
has_positive_value = all_features_gdf[target_feature] > 0
in_selected_county = all_features_gdf['COUNTYNAME'] == '臺中市'
gdf_plot = all_features_gdf[has_positive_value & in_selected_county].copy()

In [None]:
# Draw the filtered subset, coloured by the `target_feature` column (non-binary mode).
plot_hotspot_map(gdf_plot, target_feature, binary=False)