要先建立輸入到模型的資料
- 若是要分類是否是熱點，應該要以一個區域的grid為單位
- 所以建立的 grid 也要包含該區域的所有特徵資料：類別特徵以比例表示，數值特徵取平均

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd

from utils import get_grid, calculate_gi

from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score

In [None]:
# Load the A1 and A2 accident tables and the township boundary shapefile.
dataA1 = pd.read_csv('../ComputedData/Accident/DataA1_with_MYP.csv')
dataA2 = pd.read_csv('../ComputedData/Accident/DataA2_with_MYP.csv')
taiwan = gpd.read_file('../Data/OFiles_9e222fea-bafb-4436-9b17-10921abc6ef2/TOWN_MOI_1140318.shp')

# Remove the listed townships and counties from the boundary layer.
excluded_towns = ['旗津區', '頭城鎮', '蘭嶼鄉', '綠島鄉', '琉球鄉']
excluded_counties = ['金門縣', '連江縣', '澎湖縣']
taiwan = taiwan[(~taiwan['TOWNNAME'].isin(excluded_towns)) &
                (~taiwan['COUNTYNAME'].isin(excluded_counties))]

# Keep only rows whose party order (當事者順位) is 1
# (presumably one row per accident -- confirm against the data dictionary).
# `.copy()` gives independent frames, so the column assignments below do not
# write into a slice of dataA1/dataA2 (avoids SettingWithCopyWarning).
filtered_A2 = dataA2[dataA2['當事者順位'] == 1].copy()
filtered_A1 = dataA1[dataA1['當事者順位'] == 1].copy()

# Tag each row with its source table and a unit accident count.
filtered_A1['source'] = 'A1'
filtered_A2['source'] = 'A2'
filtered_A1['num_accidents'] = 1
filtered_A2['num_accidents'] = 1

# ignore_index=True gives the combined table a fresh 0..n-1 index.
combined_data = pd.concat([filtered_A1, filtered_A2], ignore_index=True)

# Build the hexagon grid from the combined accidents, then reproject the
# boundary layer into the grid's CRS so the two can be compared spatially.
hex_grid = get_grid(combined_data, hex_size=0.01, threshold=-1)
taiwan = taiwan.to_crs(hex_grid.crs)

# Keep only hexagons that intersect the remaining boundary polygons.
# NOTE(review): newer geopandas releases may prefer `union_all()` over
# `unary_union` -- confirm against the installed version before changing.
hex_grid = hex_grid[hex_grid.intersects(taiwan.unary_union)]

hex_grid 目前只記錄每個網格內的事故索引，所以需要回推各網格的特徵
- 回推方式：用這些索引從 combined_data 取出對應的事故資料，再計算它們的事故特徵平均（類別特徵則計算比例）

In [None]:
# Run calculate_gi on the hexagon grid (first argument 6, 'knn' adjacency).
# NOTE(review): this return value is replaced by the join below, which reads
# `hex_grid` rather than this result. A later cell selects a 'hotspot' column
# from `grid`, so calculate_gi presumably adds it to hex_grid in place -- confirm.
grid = calculate_gi(6, hex_grid, adjacency='knn')

# Attach county / township names to every hexagon via a left spatial join.
# NOTE(review): with predicate='intersects', a hexagon touching several
# townships may come back as several rows -- confirm that is intended.
_town_lookup = taiwan[['COUNTYNAME', 'TOWNNAME', 'geometry']]
grid = gpd.sjoin(
    hex_grid,
    _town_lookup,
    how='left',
    predicate='intersects',
)
grid

In [None]:
# Columns of the accident table that feed the per-grid feature extraction.
# Entries left as comments are alternatives that are currently switched off.
select_group = [
    # Weather / lighting: not considered for now.
    # "天候名稱",
    # "光線名稱",

    # Road surface and obstacles.
    "路面狀況-路面鋪裝名稱",
    "路面狀況-路面狀態名稱",
    "路面狀況-路面缺陷名稱",
    "道路障礙-障礙物名稱",
    "道路障礙-視距品質名稱",
    "道路障礙-視距名稱",

    # Traffic signals.
    "號誌-號誌種類名稱",
    "號誌-號誌動作名稱",

    # Lane division facilities.
    "車道劃分設施-分向設施大類別名稱",
    "車道劃分設施-分道設施-快車道或一般車道間名稱",
    "車道劃分設施-分道設施-快慢車道間名稱",
    "車道劃分設施-分道設施-路面邊線名稱",

    # Top-level ("大類別") categories.
    "事故類型及型態大類別名稱",
    "車輛撞擊部位大類別名稱-最初",
    "車輛撞擊部位大類別名稱-其他",
    "事故位置大類別名稱",
    "肇因研判大類別名稱-主要",
    "當事者區分-類別-大類別名稱-車種",
    "當事者行動狀態大類別名稱",
    "肇因研判大類別名稱-個別",
    "道路型態大類別名稱",

    # Sub-level ("子類別") categories: switched off.
    # "事故類型及型態子類別名稱",
    # "車道劃分設施-分向設施子類別名稱",
    # "當事者區分-類別-子類別名稱-車種",
    # "肇因研判子類別名稱-主要",
    # "車輛撞擊部位子類別名稱-最初",
    # "當事者行動狀態子類別名稱",
    # "肇因研判子類別名稱-個別",
    # "車輛撞擊部位子類別名稱-其他",
    # "事故位置子類別名稱",
    # "道路型態子類別名稱",

    # Other party / road attributes.
    "當事者屬-性-別名稱",
    "當事者事故發生時年齡",
    "速限-第1當事者",
    "道路類別-第1當事者-名稱",
    "保護裝備名稱",
    "行動電話或電腦或其他相類功能裝置名稱",
    "肇事逃逸類別名稱-是否肇逃",

    # Nearby facility counts.
    "mrt_100m_count",
    "youbike_100m_count",
    "parkinglot_100m_count",

    # A1 / A2 source flag: switched off.
    # "source",
]

In [None]:
def extract_features(
        grid, combined_data, select_group, rows
        ):
    """Build a one-row feature frame for a single grid cell.

    Parameters
    ----------
    grid : DataFrame
        Must have an ``accident_indices`` column; the entry at position
        ``rows`` is used to select rows of ``combined_data`` positionally.
    combined_data : DataFrame
        Accident table the indices point into.
    select_group : list of str
        Columns of ``combined_data`` to summarise.
    rows : int
        Positional index of the grid cell to process.

    Returns
    -------
    DataFrame
        A single row. Each object-dtype column contributes one
        ``"<column>_<value>"`` column per observed value, holding that value's
        share among the cell's accidents. Each numeric column contributes one
        ``"<column>_mean"`` column.
    """
    indices = grid['accident_indices'].iloc[rows]
    sample = combined_data.iloc[indices][select_group]

    cat_cols = sample.select_dtypes(include='object').columns
    num_cols = sample.select_dtypes(include='number').columns

    pieces = []

    # Categorical features: normalised value counts, one entry per level.
    for col in cat_cols:
        shares = sample[col].value_counts(normalize=True)
        shares.index = [f"{col}_{v}" for v in shares.index]
        pieces.append(shares)

    # Numerical features: column means.
    num_features = sample[num_cols].mean()
    num_features.index = [f"{col}_mean" for col in num_features.index]
    pieces.append(num_features)

    # One concat over all pieces: the list always holds the numeric Series, so
    # this cannot hit "No objects to concatenate" when there are no
    # categorical columns.
    all_features = pd.concat(pieces)

    return all_features.to_frame().T

In [None]:
# One single-row feature frame per grid cell, in grid order.
all_features_list = [
    extract_features(grid, combined_data, select_group, rows)
    for rows in range(grid.shape[0])
]

# Stack the rows; a feature missing from a cell becomes NaN here and is then
# replaced by 0.
all_features_df = pd.concat(all_features_list, ignore_index=True)
all_features_df = all_features_df.fillna(0)
all_features_df

In [None]:
# Variant WITH the county as a feature: put the label and county name next to
# the extracted features (both sides use a fresh 0..n-1 index, so the concat
# lines up row by row), then one-hot encode the county.
_label_and_county = grid.reset_index(drop=True)[['hotspot', 'COUNTYNAME']]
new_grid = pd.concat([_label_and_county, all_features_df], axis=1)
county_dummies = pd.get_dummies(new_grid['COUNTYNAME'], prefix='county')
new_grid_encoded = pd.concat(
    [new_grid.drop(['COUNTYNAME'], axis=1), county_dummies],
    axis=1,
)

# Variant WITHOUT the county (switched off):
# new_grid = pd.concat([grid.reset_index(drop=True)[['hotspot']], all_features_df], axis=1)
# # new_grid['hotspot'] = new_grid['hotspot'].apply(lambda x: 'Hotspot' if 'Hotspot' in str(x) else 'Not Hotspot')
# new_grid_encoded = new_grid

# Integer-encode the hotspot label; everything else is the feature matrix.
le = LabelEncoder()
le.fit(new_grid_encoded['hotspot'])
y = le.transform(new_grid_encoded['hotspot'])
X = new_grid_encoded.drop(columns=['hotspot'])

In [None]:
# Stratified 70/30 split; labels are re-wrapped as Series that share the
# index of their feature frames.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
y_train = pd.Series(y_train, index=X_train.index)
y_test = pd.Series(y_test, index=X_test.index)

# Undersample the TEST set so every class has as many rows as the rarest one.
cls_counts = y_test.value_counts()
min_count = cls_counts.min()
_target_per_class = dict.fromkeys(
    (int(c) for c in cls_counts.index), int(min_count)
)
rus_test = RandomUnderSampler(
    sampling_strategy=_target_per_class,
    random_state=42,
)
X_resampled_test, y_resampled_test = rus_test.fit_resample(X_test, y_test)

# Without undersampling (switched off):
# X_resampled_test = X_test.copy()
# y_resampled_test = y_test.copy()

# Class counts before and after undersampling, shown with the original labels.
_code_to_label = dict(enumerate(le.classes_))
for _title, _labels in (("before US", y_test), ("after US", y_resampled_test)):
    print(_title)
    print(pd.Series(_labels).map(_code_to_label).value_counts())

In [None]:
# Logistic-regression baseline with class re-weighting.
# The former `multi_class='multinomial'` argument is omitted: it is deprecated
# in recent scikit-learn releases, and the default solver already fits a
# multinomial model when the target has more than two classes.
logit_clf = Pipeline(steps=[
    ('model', LogisticRegression(
        class_weight='balanced', max_iter=1000,
        random_state=42))
])

# Random forest with class re-weighting; trees are grown without a depth limit.
rf_clf = Pipeline(steps=[
    ('model', RandomForestClassifier(
        n_estimators=300, max_depth=None, min_samples_leaf=1,
        class_weight='balanced', n_jobs=-1, random_state=42,
    ))
])

# 5-fold stratified CV on the training split, scored by weighted OvR ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, clf in [('Logistic', logit_clf), ('RandomForest', rf_clf)]:
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='roc_auc_ovr_weighted', n_jobs=-1)
    print(f'{name} CV ROC AUC: {scores.mean():.3f} ± {scores.std():.3f}')

# The random forest is used as the final model regardless of the CV scores
# printed above; the feature-importance cell reads `feature_importances_`
# from this model.
best_clf = rf_clf
best_clf.fit(X_train, y_train)

# Probabilities on the balanced test set. Column j of predict_proba belongs to
# classes_[j], so map the arg-max column through classes_ instead of assuming
# that the column position equals the encoded label.
proba_test_bal = best_clf.predict_proba(X_resampled_test)
y_pred_bal = best_clf.named_steps['model'].classes_[np.argmax(proba_test_bal, axis=1)]

In [None]:
from sklearn.preprocessing import label_binarize

# Encoded class ids in LabelEncoder order; shared by every metric below so
# rows, columns and target names always refer to the same classes.
class_ids = list(range(len(le.classes_)))

print("\nConfusion Matrix")
print(confusion_matrix(y_resampled_test, y_pred_bal, labels=class_ids))

print("\nClassification Report")
# Explicit `labels` keeps `target_names` aligned with the encoded ids even if
# a class is absent from both the true and the predicted labels.
print(classification_report(
    y_resampled_test, y_pred_bal,
    labels=class_ids, target_names=le.classes_, digits=3
))

roc_auc = roc_auc_score(
    y_resampled_test, proba_test_bal,
    multi_class='ovr', average='weighted'
)
print(f'ROC AUC: {roc_auc:.3f}')

# Multi-class PR AUC: binarise the labels (one-vs-rest), then take the
# macro / weighted average of the per-class average precision.
y_test_bin = label_binarize(y_resampled_test, classes=class_ids)
pr_auc_macro = average_precision_score(y_test_bin, proba_test_bal, average='macro')
pr_auc_weight = average_precision_score(y_test_bin, proba_test_bal, average='weighted')
print(f'PR  AUC macro: {pr_auc_macro:.3f}')
print(f'PR  AUC weighted: {pr_auc_weight:.3f}')

In [None]:
# Pull the fitted estimator out of the pipeline.
rf_model = best_clf.named_steps['model']

# Feature positions ordered from most to least important.
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Print "<feature>: <importance>" in that order.
feature_names = X_train.columns
for _name, _score in zip(feature_names[indices], importances[indices]):
    print(f"{_name}: {_score:.4f}")