本筆記本由 Model.ipynb 分支而來，用於分析各縣市之間的異質性

In [None]:
import os

# Make the sibling "utils" directory (next to the notebook's folder) the
# working directory. Later cells import from it and use "../"-relative paths.
current_dir = os.getcwd()
parent_dir, _ = os.path.split(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

os.chdir(analyze_path)

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt

# Select a font family for figures (presumably so that CJK labels render — confirm
# that the font is installed on the machine running the notebook).
plt.rcParams.update({'font.family': ['Arial Unicode Ms']})

In [None]:
# Load the two accident tables (A1 and A2) that carry the *_100m_count facility columns.
dataA1 = pd.read_csv('../ComputedData/Accident/DataA1_with_MYP.csv')
dataA2 = pd.read_csv('../ComputedData/Accident/DataA2_with_MYP.csv')

# Keep only the rows whose party rank ('當事者順位') equals 1.
# .copy() makes each result an independent frame, so the column assignments
# below do not write into a slice of dataA1/dataA2 (SettingWithCopyWarning).
filtered_A2 = dataA2[dataA2['當事者順位'] == 1].copy()
filtered_A1 = dataA1[dataA1['當事者順位'] == 1].copy()

# Tag the origin of every row and give each row a unit accident count.
filtered_A1['source'] = 'A1'
filtered_A2['source'] = 'A2'
filtered_A1['num_accidents'] = 1
filtered_A2['num_accidents'] = 1

# Stack A1 on top of A2 with a fresh 0..n-1 index.
combined_data = pd.concat([filtered_A1, filtered_A2], ignore_index=True)

In [None]:
from shapely import wkt
import geopandas as gpd
import ast

# Township-level boundaries, minus a fixed list of townships and counties.
# NOTE(review): the excluded names look like offshore islands / outlying
# counties — confirm the intent of each entry with the author.
taiwan = gpd.read_file('../Data/OFiles_9e222fea-bafb-4436-9b17-10921abc6ef2/TOWN_MOI_1140318.shp')

excluded_towns = ['旗津區', '頭城鎮', '蘭嶼鄉', '綠島鄉', '琉球鄉']
excluded_counties = ['金門縣', '連江縣', '澎湖縣']

is_excluded = (
    taiwan['TOWNNAME'].isin(excluded_towns)
    | taiwan['COUNTYNAME'].isin(excluded_counties)
)
taiwan = taiwan[~is_excluded]

# EPSG code assigned to every grid layer in this notebook.
TM2 = 3826


def _to_tm2_geoframe(table):
    """Parse the WKT 'geometry' column of `table` in place and wrap it as a GeoDataFrame in CRS `TM2`."""
    table['geometry'] = table['geometry'].apply(wkt.loads)
    return gpd.GeoDataFrame(table, geometry='geometry').set_crs(TM2, allow_override=True)


# Hexagon grid.
hex_grid_raw = pd.read_csv('../ComputedData/Grid/hex_grid.csv')
hex_grid = _to_tm2_geoframe(hex_grid_raw)

# Grid with a 'hotspot' column and, per cell, the list of accident row indices.
grid_gi_df = pd.read_csv('../ComputedData/Grid/grid_gi.csv')
# The index lists are stored as text in the CSV; turn them back into Python lists.
grid_gi_df['accident_indices'] = grid_gi_df['accident_indices'].apply(ast.literal_eval)
grid_gi = _to_tm2_geoframe(grid_gi_df)

# Reproject the township polygons to the grid CRS.
taiwan_tm2 = taiwan.to_crs(TM2)

# Merge townships into one polygon per county; buffer(0) rebuilds each geometry.
taiwan_cnty = taiwan_tm2[['COUNTYNAME','geometry']].dissolve(by='COUNTYNAME')
taiwan_cnty['geometry'] = taiwan_cnty.buffer(0)
taiwan_cnty = taiwan_cnty.reset_index()

# Represent every hexagon by its centroid so that a cell maps to a single county.
pts = hex_grid.copy()
pts['geometry'] = pts.geometry.centroid

county_join = gpd.sjoin(
    pts[['geometry']], taiwan_cnty, how='left', predicate='within'
)[['COUNTYNAME']]

# A left spatial join emits one row per match. If a centroid falls inside two
# overlapping county polygons its index label is repeated, and the column
# assignment below would fail on the duplicated index. Keep the first match.
county_join = county_join[~county_join.index.duplicated(keep='first')]

print('NaN ratio:', county_join['COUNTYNAME'].isna().mean())
# NOTE(review): this assignment aligns on the index, i.e. it assumes grid_gi
# and hex_grid list the same cells in the same row order — confirm.
grid_gi['COUNTYNAME'] = county_join['COUNTYNAME']
county_join.head()

In [None]:
def process_speed(input_data):
    """Bucket the '速限-第1當事者' (speed limit) column into 10-unit range labels.

    The column is overwritten in place with string labels such as '41-50';
    values above 110 become '110+'. Values that fall outside every bin
    (e.g. missing or negative) end up as the string 'nan'.

    Bins are closed on the right, so that each label matches its contents:
    a limit of 50 is labelled '41-50' and a limit of 10 is labelled '0-10'.
    (With left-closed bins, 50 would be reported as '51-60'.)

    Args:
        input_data: DataFrame containing a numeric '速限-第1當事者' column.

    Returns:
        The same DataFrame object, with the column replaced by its labels.
    """
    bins_speed = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, float('inf')]
    labels_speed = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100', '101-110', '110+']

    input_data['速限-第1當事者'] = pd.cut(
        input_data['速限-第1當事者'],
        bins=bins_speed,
        labels=labels_speed,
        right=True,            # (lo, hi] intervals, as the labels imply
        include_lowest=True,   # keep 0 inside the first bin
    ).astype(str)

    return input_data

In [None]:
# Cells without any accident carry no information, so drop them.
has_accidents = grid_gi['accident_indices'].str.len() > 0
grid_filter = grid_gi[has_accidents].reset_index(drop=True)
all_features_df = pd.read_csv("../ComputedData/ForModel/all_features.csv")


def _to_binary_hotspot(raw_label):
    """Collapse any label containing 'Hotspot' to 'Hotspot'; everything else to 'Not Hotspot'."""
    if 'Hotspot' in str(raw_label):
        return 'Hotspot'
    return 'Not Hotspot'


grid_filter['hotspot'] = grid_filter['hotspot'].apply(_to_binary_hotspot)
# Keep only the cells that were matched to a county.
grid_filter = grid_filter[grid_filter['COUNTYNAME'].notna()]

In [None]:
countylst = list(grid_filter['COUNTYNAME'].unique())

# The hotspot / non-hotspot subsets do not depend on the county, so build them once.
filter_hotspot = grid_filter[grid_filter['hotspot'] == 'Hotspot']
filter_not_hotspot = grid_filter[grid_filter['hotspot'] == 'Not Hotspot']

# county name -> accident rows of that county, labelled with the hotspot status of their cell
county_df = {}
for county in countylst:

    idx_hot = filter_hotspot.loc[filter_hotspot['COUNTYNAME'] == county, 'accident_indices']
    idx_nothot = filter_not_hotspot.loc[filter_not_hotspot['COUNTYNAME'] == county, 'accident_indices']

    # Flatten the per-cell index lists into one list of row labels of combined_data.
    hot_flat = [i for sublist in idx_hot for i in sublist]
    nothot_flat = [i for sublist in idx_nothot for i in sublist]

    # .assign already sets the label on each part. The former chained
    # `.loc[...]['hotspot'] = ...` writes modified a temporary copy only,
    # so they had no effect and were removed.
    county_df[county] = pd.concat([
        combined_data.loc[hot_flat].assign(hotspot="Hotspot"),
        combined_data.loc[nothot_flat].assign(hotspot="Not Hotspot")
    ])


# Check for duplicates: the two numbers match when no accident index is listed twice.
all_indices = [i for sublist in grid_filter['accident_indices'] for i in sublist]
print("總索引數量:", len(all_indices))
print("去重後數量:", len(set(all_indices)))

In [None]:
from config import select_group

# Columns used as model inputs. This list replaces the value imported above.
# Commented-out entries are candidate columns that are currently excluded.
select_group = [
    # Weather is not considered for now
    # '天候名稱', '光線名稱',

    # Road conditions
    '路面狀況-路面鋪裝名稱',
    '路面狀況-路面狀態名稱',
    '路面狀況-路面缺陷名稱',
    '道路障礙-障礙物名稱',
    '道路障礙-視距品質名稱',
    '道路障礙-視距名稱',

    # Traffic signals
    '號誌-號誌種類名稱',
    '號誌-號誌動作名稱',

    # Lane division
    '車道劃分設施-分道設施-快車道或一般車道間名稱',
    '車道劃分設施-分道設施-快慢車道間名稱',
    '車道劃分設施-分道設施-路面邊線名稱',

    # Major categories
    # '肇因研判大類別名稱-主要', '肇因研判大類別名稱-個別',  # excluded: focus on road type
    # '當事者區分-類別-大類別名稱-車種',  # excluded: focus on road type
    # '當事者行動狀態大類別名稱',  # excluded: focus on road type
    '車輛撞擊部位大類別名稱-最初',  # '車輛撞擊部位大類別名稱-其他',
    '事故類型及型態大類別名稱',
    '車道劃分設施-分向設施大類別名稱',
    '事故位置大類別名稱',
    '道路型態大類別名稱',

    # Sub-categories
    # '肇因研判子類別名稱-主要', '肇因研判子類別名稱-個別',  # excluded: focus on road type
    # '當事者區分-類別-子類別名稱-車種',  # excluded: focus on road type
    # '當事者行動狀態子類別名稱',  # excluded: focus on road type
    # '車輛撞擊部位子類別名稱-最初', '車輛撞擊部位子類別名稱-其他',  # road type largely determines the impact location, so not considered
    # '事故類型及型態子類別名稱', '車道劃分設施-分向設施子類別名稱',
    # '事故位置子類別名稱', '道路型態子類別名稱',

    # Others
    # '當事者屬-性-別名稱', '當事者事故發生時年齡',
    '速限-第1當事者',
    '道路類別-第1當事者-名稱',
    # '保護裝備名稱', '行動電話或電腦或其他相類功能裝置名稱', '肇事逃逸類別名稱-是否肇逃',

    # Facilities
    'youbike_100m_count',
    'mrt_100m_count',
    'parkinglot_100m_count',

    # A1 or A2
    # 'source',
]
# The target label and the county tag are carried along with the features.
select_group.extend(['hotspot', 'COUNTY'])

In [None]:
# Stack the per-county frames into one table, tagging each row with its county.
tagged_frames = [frame.assign(COUNTY=name) for name, frame in county_df.items()]
final_df = pd.concat(tagged_frames, ignore_index=True)


def _presence_flag(count):
    """Return '1' when the count is greater than zero, otherwise '0'."""
    return '1' if count > 0 else '0'


# Turn each facility count into a '1'/'0' presence indicator (as strings).
for facility_col in ('youbike_100m_count', 'mrt_100m_count', 'parkinglot_100m_count'):
    final_df[facility_col] = final_df[facility_col].apply(_presence_flag)

# Bucket the speed limit, then keep only the selected columns.
final_df = process_speed(final_df)
final_df = final_df[select_group]

In [None]:
# Separate the target from the features without mutating final_df, so this
# cell can be re-run safely. (An in-place drop removed 'hotspot' from final_df
# and made a second run fail with KeyError.)
hotspot_df = final_df['hotspot']
feature_df = final_df.drop(columns=['hotspot'])

# One-hot encode the categorical features; drop_first removes one level per column.
dummy_df = pd.get_dummies(feature_df, drop_first=True)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score, accuracy_score, f1_score, recall_score, precision_score

In [None]:
# Encode the hotspot label as integers; the one-hot table is the feature matrix.
le = LabelEncoder()
y = le.fit_transform(hotspot_df)
X = dummy_df

# Stratified 80/20 split.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts
# Re-attach the row index so that the labels stay aligned with the features.
y_train = pd.Series(y_train, index=X_train.index)
y_test  = pd.Series(y_test,  index=X_test.index)

# with undersampling: shrink every class of the TEST set to the size of its smallest class
cls_counts = y_test.value_counts()
min_count = cls_counts.min()
per_class_target = {int(c): int(min_count) for c in cls_counts.index}
rus_test = RandomUnderSampler(sampling_strategy=per_class_target, random_state=42)
X_resampled_test, y_resampled_test = rus_test.fit_resample(X_test, y_test)

# without undersampling
# X_resampled_test = X_test.copy()
# y_resampled_test = y_test.copy()

# Integer code -> original class name, for readable counts.
code_to_name = dict(enumerate(le.classes_))


def _named_class_counts(codes):
    """Count the rows per class, indexed by class name instead of integer code."""
    return pd.Series(codes).map(code_to_name).value_counts()


print("before US")
print(_named_class_counts(y_test))
print("after US")
print(_named_class_counts(y_resampled_test))

In [None]:
# Elastic-net logistic regression.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm that the installed version still accepts it.
lr = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    multi_class='multinomial',
)
# Random forest with fully grown trees.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=1,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

candidates = {'Logistic': lr, 'RandomForest': rf}

# 5-fold cross-validated ROC AUC on the training split.
# (Alternative scorer kept for reference: 'roc_auc_ovr_weighted'.)
for name, clf in candidates.items():
    scores = cross_val_score(
        clf, X_train, y_train, cv=cv, n_jobs=-1, scoring='roc_auc'
    )
    print(f'{name} CV ROC AUC: {scores.mean():.3f} ± {scores.std():.3f}')

# Fit both models on the whole training split.
for clf in candidates.values():
    clf.fit(X_train, y_train)

# Class probabilities on the undersampled test set; the prediction is the most probable class.
proba_test_lr = lr.predict_proba(X_resampled_test)
proba_test_rf = rf.predict_proba(X_resampled_test)
y_pred_lr = proba_test_lr.argmax(axis=1)
y_pred_rf = proba_test_rf.argmax(axis=1)

In [None]:
# Evaluate the random forest; swap in the *_lr variables to evaluate the logistic model.
y_pred = y_pred_rf
proba_test = proba_test_rf

class_ids = range(len(le.classes_))

print("Confusion Matrix")
print(confusion_matrix(y_resampled_test, y_pred, labels=class_ids))

print("Classification Report")
report = classification_report(
    y_resampled_test, y_pred, target_names=le.classes_, digits=3
)
print(report)

if proba_test.shape[1] == 2:
    # Binary case: score with the probability of class 1.
    pr_scores = proba_test[:, 1]
    roc_auc = roc_auc_score(y_resampled_test, pr_scores)
else:
    # Multi-class case: one-vs-rest ROC AUC, weighted by class support.
    pr_scores = proba_test
    roc_auc = roc_auc_score(y_resampled_test, proba_test, average='weighted', multi_class='ovr')
print(f'ROC AUC: {roc_auc:.3f}')

# PR AUC: binarize the labels (one-vs-rest), then average per class (macro / weighted).
y_test_bin = label_binarize(y_resampled_test, classes=class_ids)
pr_auc_macro  = average_precision_score(y_test_bin, pr_scores, average='macro')
pr_auc_weight = average_precision_score(y_test_bin, pr_scores, average='weighted')
print(f'PR  AUC macro: {pr_auc_macro:.3f}')
print(f'PR  AUC wighted: {pr_auc_weight:.3f}')

Odds ratio 需要嘗試