In [None]:
import os

# Make the sibling ``utils`` directory (next to the notebook's folder) the working
# directory. Presumably this is what lets the project-local imports and the
# ``../``-relative data paths used later resolve — confirm against the repo layout.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
analyze_path = os.path.join(parent_dir, "utils")
os.chdir(analyze_path)

要先建立輸入到模型的資料
- 若是要分類是否是熱點，應該要以一個區域的grid為單位
- 所以建立的 grid 亦包含該地區的所有特徵資料，以比例顯示

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
# Global matplotlib font family for every figure in this notebook.
# NOTE(review): presumably chosen because the plotted feature names are Chinese and
# need a font with CJK glyphs — confirm the font exists on the target machine.
plt.rcParams['font.family'] = ['Arial Unicode Ms']
# Alternative font setting, kept disabled:
# plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score
from utils_model import eval_loop, to_tensors
from utils import read_data

## obtain hotspot's county
這段將 grid 依照 geometry 找出各自位於哪個縣市

In [None]:
import ast
from shapely import wkt

# Load the project's combined dataset through the shared helper.
combined_data = read_data()
# EPSG code every layer below is projected to / declared in.
TM2 = 3826

# Township polygons; the listed townships and the three offshore counties are
# removed before reprojecting to TM2.
taiwan = gpd.read_file('../Data/OFiles_9e222fea-bafb-4436-9b17-10921abc6ef2/TOWN_MOI_1140318.shp')
taiwan = taiwan[(~taiwan['TOWNNAME'].isin(['旗津區', '頭城鎮', '蘭嶼鄉', '綠島鄉', '琉球鄉'])) &
                (~taiwan['COUNTYNAME'].isin(['金門縣', '連江縣', '澎湖縣']))].to_crs(TM2)

# Merge townships into one geometry per county (COUNTYNAME becomes the index),
# then run a zero-width buffer over the merged geometries.
taiwan_cnty = taiwan[['COUNTYNAME','geometry']].dissolve(by='COUNTYNAME')
taiwan_cnty['geometry'] = taiwan_cnty.buffer(0)

# Per the original note: accidents per cell and their row indices computed on the
# original 0.001 grid, with the Gi statistic computed from hex_grid.
grid_gi_df = pd.read_csv('../ComputedData/Grid/grid_gi.csv')
# Both columns are stored as text in the CSV: parse the list literal and the WKT.
grid_gi_df['accident_indices'] = grid_gi_df['accident_indices'].apply(ast.literal_eval)
grid_gi_df['geometry'] = grid_gi_df['geometry'].apply(wkt.loads)

grid_gi  = gpd.GeoDataFrame(grid_gi_df, geometry='geometry').set_crs(TM2, allow_override=True)
# Reduce every cell to its centroid so each cell is tested as a single point.
grid_gi['geometry'] = grid_gi.geometry.centroid

# NOTE(review): reading 'COUNTYNAME' from the join result relies on sjoin carrying the
# right frame's index name into the output (recent GeoPandas behaviour) — confirm the
# installed version.
county_join = gpd.sjoin(grid_gi[['geometry']], taiwan_cnty, how='left', predicate='within')

# A left spatial join emits one row per match, so a centroid that falls inside two
# overlapping county polygons would appear twice under the same index label and the
# index-aligned assignment below would fail with "cannot reindex on an axis with
# duplicate labels". Keep the first match per grid cell; with no duplicates this is
# exactly the plain assignment.
first_match = ~county_join.index.duplicated(keep='first')
grid_gi['COUNTYNAME'] = county_join.loc[first_match, 'COUNTYNAME']

In [None]:
# Share of grid cells that received no county. Per the original note these are
# offshore-island cells: their polygons were filtered out of `taiwan`, so the
# left join has nothing to match and returns NaN.
print('NaN ratio:', county_join['COUNTYNAME'].isna().mean())

# Keep only the cells that contain at least one accident
# (the matching all_features table is produced in DataPreprocess).
grid_filter = grid_gi.loc[grid_gi['accident_indices'].str.len().gt(0)].reset_index()

# all_featuresV2: the version in which outliers were replaced by the median.
all_features_df = pd.read_csv("../ComputedData/ForModel/all_featuresV2.csv")

# Drop the two highly collinear column families, one after the other.
cols = all_features_df.columns[all_features_df.columns.str.contains('事故位置大類別名稱')]
cols2 = all_features_df.columns[all_features_df.columns.str.contains('號誌動作')]
all_features_df = all_features_df.drop(columns=cols).drop(columns=cols2)

# Model Preprocess

In [None]:
# Build the model matrix: put each grid cell's county next to its feature row,
# one-hot encode the county, derive a binary target, add interaction terms,
# split 80/20 and balance the test split.
# NOTE(review): the column-wise concat aligns on the row index, so it assumes row i of
# all_features_df describes row i of grid_filter — confirm in DataPreprocess.
new_grid = pd.concat([grid_filter[['COUNTYNAME']], all_features_df], axis=1)
county_dummies = pd.get_dummies(new_grid['COUNTYNAME'], prefix='county')
new_grid_encoded = pd.concat([new_grid.drop(['COUNTYNAME'], axis=1), county_dummies], axis=1)

# binary hotspot
# Any label whose text contains 'Hotspot' is treated as the positive class.
# NOTE(review): the string 'Not Hotspot' itself contains 'Hotspot', so this only works
# if the raw labels never use that wording — verify against the CSV values.
new_grid_encoded['hotspot'] = new_grid_encoded['hotspot'].apply(lambda x: 'Hotspot' if 'Hotspot' in str(x) else 'Not Hotspot')
# `le` is only referenced by the disabled line below; the explicit mapping is used instead.
le = LabelEncoder()
# y = le.fit_transform(new_grid_encoded['hotspot'])
y = new_grid_encoded['hotspot'].map({'Not Hotspot': 0, 'Hotspot': 1}).values
X = new_grid_encoded.drop(columns=['hotspot'])

# interaction
# Project helper; what it adds is not visible here (the later cells reference
# columns named like '<feature> x youbike_100m_count_mean').
from utils_model import get_interaction
X = get_interaction(X)

# 'original_speed' is removed only after the interaction step has run.
X.drop(columns='original_speed', inplace=True)
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Re-wrap the target arrays as Series that share the feature frames' indices.
y_train = pd.Series(y_train, index=X_train.index)
y_test  = pd.Series(y_test,  index=X_test.index)

# undersampling
# Only the test split is resampled: every class is cut down to the size of the
# smallest class in y_test.
cls_counts = y_test.value_counts()
min_count = cls_counts.min()
rus_test = RandomUnderSampler(
    sampling_strategy={int(c): int(min_count) for c in cls_counts.index},
    random_state=42
)
X_resampled_test, y_resampled_test = rus_test.fit_resample(X_test, y_test)

# LR and RF

In [None]:
import joblib

# load model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts produced by this project.
lr = joblib.load('../ComputedData/ModelPerformance/lr_modelV4.pkl')

# Class probabilities for the balanced test split.
# NOTE(review): X_resampled_test must have the columns the model was fitted on — confirm.
proba_test_lr = lr.predict_proba(X_resampled_test)

In [None]:
from utils_model import print_results

# Report the LR results on the balanced test split via the project helper.
# NOTE(review): the label list is presumably indexed by class id
# (0 = 'Not Hotspot', 1 = 'Hotspot') — confirm in utils_model.print_results.
print_results(proba_test_lr, ['Not Hotspot', 'Hotspot'], y_resampled_test)

In [None]:
def get_importance(model, df, specific_col=None):
    """Map every feature name to the model's importance score, highest first.

    Parameters
    ----------
    model : fitted estimator
        For a ``LogisticRegression`` the first row of ``coef_`` (signed
        coefficients) is used; any other estimator is read through
        ``feature_importances_``. Estimators that lack
        ``feature_importances_`` fall back to ``coef_``.
    df : pandas.DataFrame
        Frame whose columns are the features the model was fitted on, in the
        same order.
    specific_col : str, optional
        When given (and non-empty), only features whose name contains this
        substring are returned.

    Returns
    -------
    dict
        ``{feature_name: score}`` with keys inserted in descending score order.

    Raises
    ------
    ValueError
        If the number of scores differs from the number of columns in ``df``;
        without this check the names would be silently paired with the wrong
        scores or fail with a bare ``IndexError``.
    """
    # The importance doesn't consider interaction terms
    if (type(model).__name__ == 'LogisticRegression'
            or not hasattr(model, 'feature_importances_')):
        # atleast_2d lets estimators with a 1-D coef_ share the same code path.
        importances = np.atleast_2d(model.coef_)[0]
    else:
        importances = np.asarray(model.feature_importances_)

    feature_names = df.columns
    if len(importances) != len(feature_names):
        raise ValueError(
            f"model exposes {len(importances)} feature scores but df has "
            f"{len(feature_names)} columns"
        )

    if specific_col:
        # Positions of the matching columns; str() tolerates non-string labels.
        sel_idx = np.asarray(
            [pos for pos, name in enumerate(feature_names) if specific_col in str(name)],
            dtype=int,
        )
        # Rank inside the subset, then translate back to positions in the full array.
        order = sel_idx[np.argsort(importances[sel_idx])[::-1]]
    else:
        order = np.argsort(importances)[::-1]

    return {feature_names[pos]: importances[pos] for pos in order}

# Importance score per training-matrix column, highest first.
# NOTE(review): `lr` is presumably a LogisticRegression, in which case these are
# signed coefficients rather than non-negative importances — confirm.
importance_lr = get_importance(lr, X_train)

In [None]:
# Collect the score of every feature whose name involves 'youbike' (county dummies
# excluded) together with its exponential.
# The rows are gathered first and the frame is built once: concatenating onto an
# empty frame inside the loop is quadratic, gives every row the index label 0, and
# leaves a column-less frame (KeyError on 'feature' below) when nothing matches.
youbike_rows = [
    {'feature': name, 'importance': score, 'exp_importance': np.exp(score)}
    for name, score in importance_lr.items()
    if ('youbike' in name) and 'county' not in name
]
df_ts_youbike = pd.DataFrame(
    youbike_rows, columns=['feature', 'importance', 'exp_importance']
)

# Strip the YouBike side of each interaction name so only the partner feature
# remains. regex=False makes this a literal replacement regardless of the pandas
# version's default for `regex`.
df_ts_youbike['feature'] = (
    df_ts_youbike['feature']
    .str.replace(' x youbike_100m_count_mean', '', regex=False)
    .str.replace('youbike_100m_count_mean x ', '', regex=False)
)
df_ts_youbike = df_ts_youbike.sort_values('exp_importance', ascending=True)

In [None]:
# Show the YouBike-related feature table (sorted by exp_importance, ascending).
df_ts_youbike

In [None]:
from config import col_translation

# Look up a label in col_translation for every feature name.
# NOTE(review): if col_translation is a dict, names missing from it become NaN;
# also, the bar chart further down labels bars with 'feature', not 'feature_en' — confirm
# which one is intended.
df_ts_youbike['feature_en'] = df_ts_youbike['feature'].map(col_translation)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Horizontal bar chart of exp(score) for the YouBike-related LR features,
# with bars shaded from light to dark green by value.
df_sorted = df_ts_youbike.sort_values('exp_importance', ascending=True)

fig, ax = plt.subplots(figsize=(10, 8))

# Scale the plotted values onto [0, 1] for the colormap lookup.
norm = mcolors.Normalize(vmin=df_sorted['exp_importance'].min(),
                         vmax=df_sorted['exp_importance'].max())
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap is available on both old and new releases.
cmap = plt.get_cmap("Greens")
colors = [cmap(norm(val)) for val in df_sorted['exp_importance']]

bars = ax.barh(df_sorted['feature'], df_sorted['exp_importance'], color=colors)

# Print each value just to the right of its bar, vertically centred.
for bar in bars:
    width = bar.get_width()
    ax.text(width + 0.02, bar.get_y() + bar.get_height()/2,
            f"{width:.3f}", va='center', fontsize=9)

ax.set_xlabel('Odds Ratio (Exp(Importance))', fontsize=12)
ax.set_title('LR Youbike Interaction Feature Importance', fontsize=14, weight='bold')
ax.grid(axis='x', linestyle='--', alpha=0.6)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

fig.tight_layout()
plt.show()
