In [None]:
import os

# Switch the process working directory to the `utils` folder that sits next to
# the directory this notebook is started from. Later cells import project
# modules by bare name (`utils`, `config`, `utils_model`) and use `../`-relative
# data paths, so presumably both are meant to resolve from this folder.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

# Side effect: every relative path used after this line is relative to `utils`.
os.chdir(analyze_path)

要先建立輸入到模型的資料
- 若是要分類是否是熱點，應該要以一個區域的grid為單位
- 所以建立的grid亦包含該地區的所有特徵資料，以比例顯示

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd

from utils import get_grid, calculate_gi

import matplotlib.pyplot as plt
# Set the default matplotlib font family; presumably chosen so that the
# Chinese county names used as tick labels render correctly.
# NOTE(review): this font may not be installed on every machine — confirm.
plt.rcParams['font.family'] = ['Arial Unicode Ms']

from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score, accuracy_score, f1_score, recall_score, precision_score

In [None]:
# Load the two accident tables (A1 and A2) from disk.
dataA1 = pd.read_csv('../ComputedData/Accident/DataA1_with_MYP.csv')
dataA2 = pd.read_csv('../ComputedData/Accident/DataA2_with_MYP.csv')

# Keep only rows whose party order (當事者順位) is 1, then tag each row with its
# source table and a unit accident count.
# `.loc[...]` + `.assign(...)` builds a new frame instead of writing columns
# into a boolean-filtered slice, which avoids pandas' SettingWithCopyWarning
# and the ambiguity about whether the parent frame is modified.
filtered_A2 = dataA2.loc[dataA2['當事者順位'] == 1].assign(source='A2', num_accidents=1)
filtered_A1 = dataA1.loc[dataA1['當事者順位'] == 1].assign(source='A1', num_accidents=1)

# A1 rows first, then A2, re-indexed 0..n-1.
combined_data = pd.concat([filtered_A1, filtered_A2], ignore_index=True)

# hex_grid = get_grid(combined_data, hex_size=0.01, threshold=-1)

## obtain hotspot's county

In [None]:
from shapely import wkt
import geopandas as gpd
import ast

# Load the township boundary shapefile and drop the listed townships and
# counties (presumably outlying/offshore areas that are out of scope — confirm).
taiwan = gpd.read_file('../Data/OFiles_9e222fea-bafb-4436-9b17-10921abc6ef2/TOWN_MOI_1140318.shp')
taiwan = taiwan[(~taiwan['TOWNNAME'].isin(['旗津區', '頭城鎮', '蘭嶼鄉', '綠島鄉', '琉球鄉'])) & 
                (~taiwan['COUNTYNAME'].isin(['金門縣', '連江縣', '澎湖縣']))]

# EPSG code used as the working CRS for all layers below (EPSG:3826, TWD97 / TM2 zone 121).
TM2 = 3826
# The grid CSV stores geometries as WKT text; parse them back into shapely objects.
hex_grid_raw = pd.read_csv('../ComputedData/Grid/hex_grid.csv')
hex_grid_raw['geometry'] = hex_grid_raw['geometry'].apply(wkt.loads)
# set_crs only labels the data with a CRS, it does not reproject.
# NOTE(review): assumes the stored coordinates are already in EPSG:3826 — confirm.
hex_grid = gpd.GeoDataFrame(hex_grid_raw, geometry='geometry').set_crs(TM2, allow_override=True)

# Same treatment for the Gi* result grid; `accident_indices` is stored as a
# Python-literal string and is parsed back with ast.literal_eval.
grid_gi_df = pd.read_csv('../ComputedData/Grid/grid_gi.csv')
grid_gi_df['accident_indices'] = grid_gi_df['accident_indices'].apply(ast.literal_eval)
grid_gi_df['geometry'] = grid_gi_df['geometry'].apply(wkt.loads)
grid_gi  = gpd.GeoDataFrame(grid_gi_df, geometry='geometry').set_crs(TM2, allow_override=True)

# Reproject the boundaries into the same CRS as the grids.
taiwan_tm2 = taiwan.to_crs(TM2)

# Merge township polygons into one geometry per county.
taiwan_cnty = taiwan_tm2[['COUNTYNAME','geometry']].dissolve(by='COUNTYNAME')
# buffer(0) is presumably used here to repair invalid geometries left by the dissolve.
taiwan_cnty['geometry'] = taiwan_cnty.buffer(0)
taiwan_cnty = taiwan_cnty.reset_index()

# Represent every grid cell by its centroid so that each cell is matched to
# the county polygon containing that single point.
pts = hex_grid.copy()
pts['geometry'] = pts.geometry.centroid

# Left join keeps every centroid; COUNTYNAME is NaN where no county contains it.
county_join = gpd.sjoin(
    pts[['geometry']], taiwan_cnty, how='left', predicate='within'
)[['COUNTYNAME']]

print('NaN ratio:', county_join['COUNTYNAME'].isna().mean())
# This assignment aligns on the index label, not on position.
# NOTE(review): assumes hex_grid and grid_gi have identical row order/index,
# and that no centroid matched more than one county (duplicate index labels
# would make this assignment fail) — confirm.
grid_gi['COUNTYNAME'] = county_join['COUNTYNAME']
county_join.head()

hex_grid目前包含所有事故索引，所以要回推
- 回推方式從combined_data獲取，並且計算他們的事故特徵平均

In [None]:
# Disabled: this produces duplicated rows, because a grid cell may overlap more than one county.
# grid = calculate_gi(6, hex_grid, adjacency='knn')
# grid = gpd.sjoin(hex_grid, taiwan[['COUNTYNAME', 'geometry']], how='left', predicate='intersects')
# grid.to_csv('../ComputedData/Grid/grid.csv', index=False)

## Features concat

In [None]:
from config import select_group
from utils_model import extract_features

# Only grid cells that contain at least one accident have features to extract.
grid_filter = grid_gi[grid_gi['accident_indices'].str.len() > 0]

# One feature frame per grid cell; `extract_features` is a project helper that
# receives the row position within grid_filter.
all_features_list = [
    extract_features(grid_filter, combined_data, select_group, rows)
    for rows in range(grid_filter.shape[0])
]

# Stack all cells and treat missing feature values as 0.
all_features_df = pd.concat(all_features_list, ignore_index=True).fillna(0)

# Min-max scale the count-like / speed-limit columns to [0, 1].
minmax_cols = ['mrt_100m_count_mean', 'youbike_100m_count_mean', 'parkinglot_100m_count_mean', '速限-第1當事者_mean']
col_min = all_features_df[minmax_cols].min()
col_range = all_features_df[minmax_cols].max() - col_min
# A constant column has range 0, and 0/0 would turn the whole column into NaN;
# dividing by 1 instead maps such a column to all zeros.
all_features_df[minmax_cols] = (all_features_df[minmax_cols] - col_min) / col_range.replace(0, 1)

# NOTE(review): saving is disabled, so the CSV on disk can be older than the
# frame computed here — confirm it is refreshed when the features change.
# all_features_df.to_csv("../ComputedData/ForModel/all_features.csv", index=False)

In [None]:
# Grid cells with at least one accident, re-indexed 0..n-1 so they can be
# joined positionally with the cached feature table.
has_accidents = grid_gi['accident_indices'].str.len() > 0
grid_filter = grid_gi.loc[has_accidents].reset_index(drop=True)

# Cached per-cell features written by an earlier run.
all_features_df = pd.read_csv("../ComputedData/ForModel/all_features.csv")

# Model Preprocess

In [None]:
from config import for_poly

# with county town
# The column-wise concat below aligns rows by index label. If the cached
# feature table is stale, or grid_filter still carries its original
# (non-contiguous) index, rows would be silently misaligned and filled with
# NaN, so fail loudly instead.
assert grid_filter.index.equals(all_features_df.index), (
    "grid_filter and all_features_df are not row-aligned: "
    f"{len(grid_filter)} grid rows vs {len(all_features_df)} feature rows"
)
new_grid = pd.concat([grid_filter[['hotspot', 'COUNTYNAME']], all_features_df], axis=1)

# One-hot encode the county; rows with a missing county get all-zero dummies.
county_dummies = pd.get_dummies(new_grid['COUNTYNAME'], prefix='county')
new_grid_encoded = pd.concat([new_grid.drop(['COUNTYNAME'], axis=1), county_dummies], axis=1)

# binary hotspot: any label containing 'Hotspot' becomes 'Hotspot', the rest 'Not Hotspot'
new_grid_encoded['hotspot'] = new_grid_encoded['hotspot'].apply(lambda x: 'Hotspot' if 'Hotspot' in str(x) else 'Not Hotspot')

# LabelEncoder sorts classes alphabetically: 'Hotspot' -> 0, 'Not Hotspot' -> 1.
le = LabelEncoder()
y = le.fit_transform(new_grid_encoded['hotspot'])
X = new_grid_encoded.drop(columns=['hotspot'])

###
from itertools import combinations

# For every base prefix in for_poly, collect the X columns starting with it.
groups = {}
for base in for_poly:
    groups[base] = [c for c in X.columns if c.startswith(base)]

# Only pair up columns that come from two different bases.
base_pairs = list(combinations(for_poly, 2))

new_cols = {}
for a, b in base_pairs:
    for ca in groups[a]:
        va = X[ca].values
        for cb in groups[b]:
            prod = va * X[cb].values
            # Keep the interaction only if it has at least one non-zero entry
            # (saves dimensions).
            if np.any(prod):
                new_cols[f"{ca} x {cb}"] = prod

# Append all interaction columns in one concat.
if new_cols:
    X_inter = pd.DataFrame(new_cols, index=X.index)
    X = pd.concat([X, X_inter], axis=1)
###

# Stratified 80/20 split; the label arrays are re-wrapped as Series that share
# the index of their feature frames.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
y_train = pd.Series(y_train, index=X_train.index)
y_test = pd.Series(y_test, index=X_test.index)

# with undersampling: shrink every class in the TEST set to the size of the
# smallest class.
cls_counts = y_test.value_counts()
min_count = cls_counts.min()
balanced_targets = {int(c): int(min_count) for c in cls_counts.index}
rus_test = RandomUnderSampler(sampling_strategy=balanced_targets, random_state=42)
X_resampled_test, y_resampled_test = rus_test.fit_resample(X_test, y_test)

# Class counts before and after undersampling, shown with readable labels.
code_to_label = dict(enumerate(le.classes_))
for title, labels in (("before US", y_test), ("after US", y_resampled_test)):
    print(title)
    print(pd.Series(labels).map(code_to_label).value_counts())

# LR and RF

In [None]:
# Elastic-net logistic regression with class re-weighting.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version.
lr = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    multi_class='multinomial',
    n_jobs=-1,
)
# Random forest with class re-weighting and unrestricted depth.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=1,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 5-fold cross-validated ROC AUC on the training split for each candidate.
candidates = {'Logistic': lr, 'RandomForest': rf}
for name, clf in candidates.items():
    # scoring='roc_auc_ovr_weighted' would be the multi-class alternative
    scores = cross_val_score(clf, X_train, y_train, cv=cv, n_jobs=-1, scoring='roc_auc')
    print(f'{name} CV ROC AUC: {scores.mean():.3f} ± {scores.std():.3f}')

# Refit both models on the full training split.
for clf in (lr, rf):
    clf.fit(X_train, y_train)

# Class probabilities and arg-max predictions on the undersampled test set.
proba_test_lr = lr.predict_proba(X_resampled_test)
proba_test_rf = rf.predict_proba(X_resampled_test)
y_pred_lr = proba_test_lr.argmax(axis=1)
y_pred_rf = proba_test_rf.argmax(axis=1)

In [None]:
import joblib

# Persist both fitted estimators to disk.
# NOTE(review): assumes the target directory already exists — confirm.
joblib.dump(lr, '../ComputedData/ModelPerformance/lr_model.pkl')
joblib.dump(rf, '../ComputedData/ModelPerformance/rf_model.pkl')

In [None]:
# Evaluate the logistic regression on the undersampled test set.
y_pred = y_pred_lr
proba_test = proba_test_lr
class_ids = range(len(le.classes_))

print("Confusion Matrix")
print(confusion_matrix(y_resampled_test, y_pred, labels=class_ids))

print("Classification Report")
print(classification_report(
    y_resampled_test, y_pred, target_names=le.classes_, digits=3
))

# Binary problems score with the probability of label 1 only; multi-class
# problems score with the full probability matrix (one-vs-rest).
is_binary = proba_test.shape[1] == 2
if is_binary:
    y_score = proba_test[:, 1]
    roc_auc = roc_auc_score(y_resampled_test, y_score)
else:
    y_score = proba_test
    roc_auc = roc_auc_score(y_resampled_test, y_score, average='weighted', multi_class='ovr')
print(f'ROC AUC: {roc_auc:.3f}')

# PR AUC on the binarized labels, averaged macro / weighted.
y_test_bin = label_binarize(y_resampled_test, classes=class_ids)
pr_auc_macro  = average_precision_score(y_test_bin, y_score, average='macro')
pr_auc_weight = average_precision_score(y_test_bin, y_score, average='weighted')
print(f'PR  AUC macro: {pr_auc_macro:.3f}')
print(f'PR  AUC wighted: {pr_auc_weight:.3f}')

# In the binary case the two values above are the same number, and both treat
# label 1 as the positive class. LabelEncoder sorts alphabetically, so label 1
# is 'Not Hotspot'. PR AUC is not symmetric in the positive class, so also
# report it with 'Hotspot' as positive, which is the class of interest.
class_names = list(le.classes_)
if is_binary and 'Hotspot' in class_names:
    hotspot_id = class_names.index('Hotspot')
    pr_auc_hotspot = average_precision_score(
        np.asarray(y_resampled_test) == hotspot_id, proba_test[:, hotspot_id]
    )
    print(f'PR  AUC (Hotspot as positive): {pr_auc_hotspot:.3f}')

設施平均：該地區的事故點附近平均會有幾個設施

### RandomForest Feature Importance & LogisticRegression coefficient
- group的寫法可能還要再修

In [None]:
from utils_model import get_importance

# Inspect the 'county' feature group for both fitted models via the project
# helper `get_importance` (its output format is not visible from this notebook).
# Only the return value of the last call is displayed as the cell output.
get_importance(rf, X_train, 'county')
get_importance(lr, X_train, 'county')

解釋範例：
exp(1.5741) = 4.826
在宜蘭，parkinglot 的效應比全台平均強 4.826 倍

In [None]:
# Same inspection for the logistic regression, restricted to the 'youbike' group.
get_importance(lr, X_train, 'youbike')

## Neural Network

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

# Use the Apple-silicon GPU backend (MPS) when present, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Network input width = number of feature columns.
INPUT_DIM = X_resampled_test.shape[1]
# Number of distinct encoded labels (0/1 for the binary hotspot task).
NUM_CLASSES = int(len(set(y)))

class BinaryMLP(nn.Module):
    """Fully connected classifier that returns raw class logits.

    Five hidden blocks (Linear -> ReLU -> Dropout) of widths 1024, 512, 256,
    128 and 64, followed by a final Linear layer of size ``num_classes``.

    Args:
        in_dim: number of input features.
        num_classes: number of output logits.
        drop: dropout probability used after every hidden activation.
    """

    def __init__(self, in_dim=INPUT_DIM, num_classes=NUM_CLASSES, drop=0.1):
        super().__init__()
        widths = [in_dim, 1024, 512, 256, 128, 64]
        layers = []
        # Layers are appended in the same order as a hand-written Sequential,
        # so the parameter names (net.0, net.3, ...) stay the same.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(drop)])
        layers.append(nn.Linear(widths[-1], num_classes))  # logits
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        return self.net(x)

In [None]:
# Carve a stratified 20% validation split out of the training data for the
# neural network.
nn_split = train_test_split(
    X_train, y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)
X_train_nn, X_val_nn, y_train_nn, y_val_nn = nn_split

def to_tensors(X_df, y_arr):
    """Convert features and labels into a pair of torch tensors.

    Args:
        X_df: array-like of features; converted to float32.
        y_arr: array-like of class labels; converted to int64.

    Returns:
        Tuple ``(features, labels)`` of torch tensors.
    """
    features = np.asarray(X_df, dtype=np.float32)
    labels = np.asarray(y_arr, dtype=np.int64)
    return torch.from_numpy(features), torch.from_numpy(labels)

# Build the training tensors from the inner training split only. Using the
# full X_train / y_train here would put the validation rows into the training
# data and make the validation metrics (and early stopping) meaningless.
X_train_t, y_train_t = to_tensors(X_train_nn, y_train_nn)
X_val_t, y_val_t = to_tensors(X_val_nn, y_val_nn)
X_test_t, y_test_t = to_tensors(X_resampled_test, y_resampled_test)

# Only the training loader is shuffled; evaluation order stays fixed.
train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=256, shuffle=True, drop_last=False)
val_loader = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=512, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test_t, y_test_t), batch_size=512, shuffle=False)

model = BinaryMLP().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Early-stopping state: best validation score so far, allowed number of
# non-improving epochs, current non-improving streak, and the epoch budget.
best_val = -np.inf
patience = 5
wait = 0
epochs = 20

def eval_loop(loader):
    """Run the global ``model`` over ``loader`` and compute evaluation metrics.

    The model is put in eval mode and run without gradients. Softmax
    probabilities are computed from the logits and predictions are taken by
    arg-max.

    Args:
        loader: iterable yielding ``(features, labels)`` batches.

    Returns:
        dict with keys ``acc``, ``f1``, ``recall``, ``auc``, ``conf``
        (confusion matrix), ``report`` (classification report text) and
        ``pred_y`` (predicted labels).
    """
    model.eval()
    logit_chunks, label_chunks = [], []
    with torch.no_grad():
        for xb, yb in loader:
            logit_chunks.append(model(xb.to(device)).cpu())
            label_chunks.append(yb)

    y_all = torch.cat(label_chunks)
    probs = torch.softmax(torch.cat(logit_chunks), dim=1).numpy()
    preds = probs.argmax(axis=1)

    # Two output columns -> binary averaging (sklearn's default positive
    # label); otherwise weighted averaging across classes.
    two_class = probs.shape[1] == 2
    averaging = 'binary' if two_class else 'weighted'

    acc = accuracy_score(y_all, preds)
    f1 = f1_score(y_all, preds, average=averaging)
    recall = recall_score(y_all, preds, average=averaging)
    if two_class:
        auc = roc_auc_score(y_all, probs[:, 1])
    else:
        auc = roc_auc_score(y_all, probs, multi_class='ovr', average='weighted')

    return {
        'acc': acc,
        'f1': f1,
        'recall': recall,
        'auc': auc,
        'conf': confusion_matrix(y_all, preds, labels=range(len(le.classes_))),
        'report': classification_report(y_all, preds, target_names=le.classes_, digits=3),
        'pred_y': preds,
    }

In [None]:
# Snapshot of the weights that achieved the best validation score in this run.
best_state = None

for epoch in range(1, epochs+1):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by batch size so the epoch average is per sample.
        total_loss += loss.item() * xb.size(0)

    train_loss = total_loss / len(train_loader.dataset)
    val_metrics = eval_loop(val_loader)
    print(f'Epoch {epoch:02d}/{epochs} | loss {train_loss:.4f} | '
          f'val_acc {val_metrics["acc"]:.3f} | val_f1 {val_metrics["f1"]:.3f} | val_auc {val_metrics["auc"]:.3f}')

    score_for_early = val_metrics["auc"]  # f1 could be used instead
    if score_for_early > best_val:
        best_val = score_for_early
        wait = 0
        # Keep an in-memory copy of the best weights; clone() is required
        # because state_dict() returns references to the live parameters.
        best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
    else:
        wait += 1
        if wait >= patience:
            print('Early stopping.')
            break

# Without this, the model left in memory is the one from the last epoch, which
# can be up to `patience` epochs past the best validation score.
if best_state is not None:
    model.load_state_dict(best_state)

In [None]:
# Score the network on the undersampled test set and show the per-class report.
test_metrics = eval_loop(test_loader)
print(test_metrics['report'])

# Persist the network weights (state dict only, not the whole module).
nn_state = model.state_dict()
torch.save(nn_state, '../ComputedData/ModelPerformance/nn_model.pth')

## Permutation

In [None]:
from utils_model import build_groups_from_prefix, build_groups_with_interactions, build_pair_interaction_groups, PI_ML, PI_NN

# Feature groups (including interaction columns) built from the test columns.
groups = build_groups_with_interactions(X_test.columns)

# Arguments shared by all three permutation-importance runs.
pi_kwargs = {'groups': groups, 'n_repeats': 10}

# Each run is evaluated on the original (not undersampled) test split.
print('lr')
base_lr, perm_lr = PI_ML(lr, X_test, y_test, **pi_kwargs)
print('rf')
base_rf, perm_rf = PI_ML(rf, X_test, y_test, **pi_kwargs)
print('nn')
base_nn, perm_nn = PI_NN(model, X_test, y_test, **pi_kwargs)

In [None]:
# The group ordering is derived from the LR result only (RF / NN are disabled here).
combined = pd.concat([
    perm_lr.assign(model='LR'),
    # perm_rf.assign(model='RF'),
    # perm_nn.assign(model='NN')
], ignore_index=True)

# Groups sorted by mean importance, smallest first, so the largest ends up on top.
mean_importance = combined.groupby('group')['importance'].mean()
order = mean_importance.sort_values(ascending=True).index.tolist()
ypos = np.arange(len(order))

triples = [
    (perm_lr, 'lr'),
    (perm_rf, 'rf'),
    (perm_nn, 'nn'),
]

fig, ax = plt.subplots(figsize=(12, 8))

for perm_df_i, name in triples:
    # Align every model's result to the shared group order.
    d = perm_df_i.set_index('group').reindex(order)
    ax.errorbar(
        d['importance'],
        ypos,
        xerr=d['std'],
        fmt='o',
        linewidth=2,
        capsize=5,
        label=name,
    )

ax.set_yticks(ypos)
ax.set_yticklabels(order)
ax.axvline(0.0, linestyle='--', linewidth=1)
ax.set_xlabel('Permutation importance')
ax.set_ylabel('Group')
ax.set_title('Permutation importance (LR / RF / NN)')
ax.legend()
fig.tight_layout()
plt.show()


# Hitrate
le的轉換是1為not hotspot（LabelEncoder 依字母順序編碼：0 為 Hotspot，1 為 Not Hotspot）

In [None]:
# One-hot county indicator columns.
county_cols = [col for col in X_resampled_test.columns if col.startswith('county_')]

# Test features together with true labels and LR predictions.
df_hitrate = X_resampled_test.copy()
df_hitrate['y_true'] = y_resampled_test
df_hitrate['y_pred'] = y_pred_lr


def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


hitrate = {}
for col in county_cols:
    # Rows that belong to this county.
    in_county = df_hitrate.loc[df_hitrate[col] != False]

    # With labels=[1, 0] the flattened matrix reads tn, fp, fn, tp when label 0
    # (Hotspot) is the positive class.
    tn, fp, fn, tp = confusion_matrix(
        in_county['y_true'], in_county['y_pred'], labels=[1, 0]
    ).ravel()

    precision = _ratio_or_zero(tp, tp + fp)
    recall = _ratio_or_zero(tp, tp + fn)
    accuracy = _ratio_or_zero(tp + tn, tp + tn + fp + fn)
    f1 = _ratio_or_zero(2 * (precision * recall), precision + recall)
    hitrate[col] = {
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'f1': f1,
    }

# One row per county, best F1 first, with the 'county_' prefix stripped.
hitrate_df = pd.DataFrame.from_dict(
    hitrate, orient='index', columns=['precision', 'recall', 'accuracy', 'f1']
).sort_values('f1', ascending=False)
hitrate_df['county'] = hitrate_df.index.str.replace('county_', '')

In [None]:
from utils_model import hitrate_data

# Per-county hit-rate tables for the three models, all on the undersampled test set.
test_xy = (X_resampled_test, y_resampled_test)
hitrate_lr = hitrate_data(*test_xy, y_pred_lr)
hitrate_rf = hitrate_data(*test_xy, y_pred_rf)
hitrate_nn = hitrate_data(*test_xy, test_metrics['pred_y'])

In [None]:
results = {
    'LR': hitrate_lr.copy(),
    'RF': hitrate_rf.copy(),
    'NN': hitrate_nn.copy(),
}
# Counties ordered by the NN's F1, best first; every panel uses this order.
order = results['NN'].sort_values('f1', ascending=False)['county'].tolist()

metrics = ['precision', 'recall', 'accuracy', 'f1']
pos = np.arange(len(order))
width = 0.25

# One panel per metric in a 2x2 grid, three side-by-side bars per county.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
for i, (ax, met) in enumerate(zip(axes.ravel(), metrics), 1):
    for j, (name, table) in enumerate(results.items()):
        d = table.set_index('county').reindex(order)
        ax.bar(pos + (j - 1) * width, d[met].values, width=width, label=name)
    ax.set_title(met)
    ax.set_xticks(pos)
    ax.set_xticklabels(order, rotation=45, ha='right')
    # Show the legend once, on the second panel.
    if i == 2:
        ax.legend(loc='upper right')
    ax.set_ylim(0, 1)

fig.tight_layout()
plt.show()


In [None]:
def metrics_bin(y_true, y_pred):
    """Return precision, recall, F1 and accuracy with label 0 as the positive class.

    Args:
        y_true: true labels.
        y_pred: predicted labels.

    Returns:
        dict with keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    positive = 0
    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    summary = {label: scorer(y_true, y_pred, pos_label=positive) for label, scorer in scorers}
    summary['accuracy'] = accuracy_score(y_true, y_pred)
    return summary

# Overall metrics for each model on the undersampled test set.
m_lr = metrics_bin(y_resampled_test, y_pred_lr)
m_rf = metrics_bin(y_resampled_test, y_pred_rf)
m_nn = metrics_bin(y_resampled_test, test_metrics['pred_y'])

df = pd.DataFrame([m_lr, m_rf, m_nn], index=['LR','RF','NN'])
metrics = ['precision', 'recall', 'f1', 'accuracy']
df = df[metrics]

plt.figure(figsize=(9, 5))
x = np.arange(len(metrics))
width = 0.25

# The loop variable is called `model_name`, not `model`: a top-level loop
# variable named `model` would overwrite the trained network stored in the
# global `model` with a string.
for i, model_name in enumerate(df.index):
    offsets = x + (i-1)*width
    vals = df.loc[model_name].values
    plt.bar(offsets, vals, width=width, label=model_name)
    # Print each value just above its bar.
    for xi, v in zip(offsets, vals):
        plt.text(xi, v + 0.01, f'{v:.2f}', ha='center', va='bottom', fontsize=9)

plt.xticks(x, metrics)
plt.ylim(0, 1)
plt.ylabel('Score')
plt.title('Overall Metrics on Resampled Test')
plt.legend(loc='upper right')

plt.tight_layout()
plt.show()