這裡的概念是要先區分出該區域的不同情況

- 非尖峰：非熱點 → 尖峰：非熱點（Stable_Safe）
- 非尖峰：非熱點 → 尖峰：熱點（Emergent）
- 非尖峰：熱點 → 尖峰：非熱點（Dissipated）
- 非尖峰：熱點 → 尖峰：熱點（Persistent）

接著以這個情況建立四種label來帶入模型

In [None]:
import os

# Build the path of the "utils" directory that sits next to the current
# working directory (i.e. <parent of cwd>/utils).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

# Make that directory the process working directory. Every relative path used
# later in this file (e.g. "../ComputedDataV7/...") is resolved from here.
# NOTE(review): presumably this is also what makes `utils` / `utils_macro`
# importable below — confirm against the project layout.
os.chdir(analyze_path)

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from utils import get_grid, read_data, read_taiwan_specific
from utils_macro import LocalMoranAnalysis, GetisOrdGiAnalysis

# Load the project data through the helpers imported from `utils`.
# NOTE(review): the structure of the returned objects is defined in the
# `utils` module, which is not visible here — check it before relying on
# specific columns.
combined_data = read_data()
taiwan, grid_filter = read_taiwan_specific()

In [None]:
# Read the three modelling tables. The paths are relative to the working
# directory set by the earlier os.chdir call.
# NOTE(review): judging by the file names these are the peak-period,
# off-peak-period and full tables; later code combines them row by row,
# so they are presumably row-aligned — confirm.
peak = pd.read_csv("../ComputedDataV7/ForModel/final_data_peak.csv")
offpeak = pd.read_csv("../ComputedDataV7/ForModel/final_data_offpeak.csv")
final_full = pd.read_csv("../ComputedDataV7/ForModel/final_data_full.csv")

In [None]:
def is_hot(val):
    """Map a hotspot category to a binary flag.

    Returns 0 when ``val`` is missing, equals 'Not Significant', or has a
    string form containing 'Coldspot'; returns 1 for every other value.
    """
    not_hot = (
        pd.isna(val)
        or val == 'Not Significant'
        or 'Coldspot' in str(val)
    )
    return 0 if not_hot else 1

# The three tables are combined purely by row position below, so they must
# have the same number of rows; fail loudly instead of mislabelling rows.
# NOTE(review): no shared key column is visible here to check that the rows
# are also in the same order — confirm where the CSVs are produced.
if not (len(peak) == len(offpeak) == len(final_full)):
    raise ValueError(
        "peak, offpeak and final_full must have the same number of rows, got "
        f"{len(peak)}, {len(offpeak)} and {len(final_full)}"
    )

# Binary hotspot state (1 = hotspot, 0 = anything else) for each period.
peak_state = peak['hotspot'].apply(is_hot)
off_state = offpeak['hotspot'].apply(is_hot)

# Off-peak state x peak state -> one of four transition labels.
conditions = [
    (off_state == 0) & (peak_state == 0),
    (off_state == 0) & (peak_state == 1),
    (off_state == 1) & (peak_state == 0),
    (off_state == 1) & (peak_state == 1)
]
choices = ['Stable_Safe', 'Emergent', 'Dissipated', 'Persistent']

final_full['transition_label'] = np.select(conditions, choices, default='Unknown')

print(final_full['transition_label'].value_counts())

# errors='ignore' keeps this cell re-runnable after the column has been dropped.
final_full.drop(columns=['hotspot'], inplace=True, errors='ignore')

# Min-max scale the mean speed limit to [0, 1].
speed_col = '速限-第1當事者_mean'
speed_min = final_full[speed_col].min()
speed_range = final_full[speed_col].max() - speed_min
if speed_range == 0:
    # A constant column would otherwise become NaN (0 / 0); map it to 0,
    # which is what MinMaxScaler does for constant features.
    final_full[speed_col] = 0.0
else:
    final_full[speed_col] = (final_full[speed_col] - speed_min) / speed_range

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to be rescaled to the [0, 1] range.
rescale_cols = [
    'num_mrt', 'lag_num_mrt', 'num_parking', 'lag_num_parking',
    'num_youbike', 'lag_num_youbike', 'num_speed_diff', 'lag_num_speed_diff',
    'num_bus_stop', 'lag_num_bus_stop'
]

# NOTE(review): the scaler is fitted on every row of final_full, i.e. before
# the train/test split done later in this file, so the test rows contribute
# to the min/max used for scaling.
scaler = MinMaxScaler()
final_full[rescale_cols] = scaler.fit_transform(final_full[rescale_cols])

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Feature matrix: final_full without the target and the columns listed below
# (columns that are already absent are skipped).
X = final_full.drop(
    columns=['transition_label', 'hotspot', 'accident_indices', 'geometry'],
    errors='ignore',
)
# This column must exist; a missing column raises KeyError.
X = X.drop(columns=['車道劃分設施-分道設施-路面邊線名稱_無'])
y = final_full['transition_label']

# Every column whose name starts with one of these prefixes is removed.
remove_lst = [
    '路面狀況-路面鋪裝名稱', '路面狀況-路面缺陷名稱',
    '道路障礙-障礙物名稱', '道路障礙-視距品質名稱', '道路障礙-視距名稱']

excluded_prefixes = tuple(remove_lst)
cols_to_drop = [name for name in X.columns if name.startswith(excluded_prefixes)]
X = X.drop(columns=cols_to_drop)

# Randomly under-sample with a fixed seed.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

print(f"採樣前分佈: {Counter(y)}")
print(f"採樣後分佈: {Counter(y_resampled)}")

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.linear_model import LogisticRegression

# Split the original (not yet resampled) data first, so the held-out set keeps
# the real class distribution; then balance only the training portion.
# Resampling before the split makes the test set artificially balanced and
# the reported metrics unrepresentative.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, y_train = rus.fit_resample(X_train, y_train)

## LR

In [None]:
# Hyper-parameter grid for the logistic-regression search.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'newton-cg'],
    'max_iter': [1000, 1500, 2000]
}

grid_search = GridSearchCV(
    # `multi_class` is deprecated in recent scikit-learn releases. Both solvers
    # in the grid already fit a multinomial model for a multi-class target, so
    # the argument is dropped without changing the fitted model.
    LogisticRegression(),
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print(f"Best params: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

In [None]:
y_pred = best_model.predict(X_test)
plt.rcParams['font.sans-serif'] = ['PingFang TC']

# Create the axes explicitly and hand them to scikit-learn. Without `ax`,
# from_predictions opens its own figure, so a separate plt.figure(...) call
# only produces an extra blank figure and its figsize is never applied.
fig, ax = plt.subplots(figsize=(10, 8))

ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred,
    # Pin the label order so it always matches display_labels, even if a
    # class is missing from y_test / y_pred.
    labels=best_model.classes_,
    display_labels=best_model.classes_,
    cmap='Greens',
    normalize='true',
    xticks_rotation=45,
    ax=ax,
)

plt.show()

## RF

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

# 5-fold grid search scored by macro F1, using all available cores.
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

best_rf = rf_grid.best_estimator_
print(f"Best params: {rf_grid.best_params_}")

In [None]:
y_pred = best_rf.predict(X_test)

# Create the axes explicitly and hand them to scikit-learn. Without `ax`,
# from_predictions opens its own figure, so a separate plt.figure(...) call
# only produces an extra blank figure and its figsize is never applied.
fig, ax = plt.subplots(figsize=(10, 8))

ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred,
    # Pin the label order so it always matches display_labels.
    labels=best_rf.classes_,
    display_labels=best_rf.classes_,
    cmap='Purples',
    normalize='true',
    ax=ax,
)
plt.show()

print(classification_report(y_test, y_pred))

In [None]:
import shap

# Explain only the first 100 rows of the test set.
X_subset = X_test.iloc[:100, :]

explainer = shap.TreeExplainer(best_rf)
shap_values = explainer.shap_values(X_subset, approximate=True)

# One summary plot per class that occurs in y_test.
for target_label in y_test.unique():
    class_idx = list(best_rf.classes_).index(target_label)

    # The result is handled either as a list with one entry per class or as a
    # single array whose third axis selects the class.
    target_shap_values = (
        shap_values[class_idx]
        if isinstance(shap_values, list)
        else shap_values[:, :, class_idx]
    )

    plt.figure(figsize=(12, 8))
    plt.title(f"Feature Contributions for Class {target_label}")
    shap.summary_plot(target_shap_values, X_subset)

In [None]:
# Print every column of `combined_data` whose most frequent value accounts
# for more than 95% of the counted values, as a candidate for removal.
# NOTE(review): `select_group` is not defined anywhere in the visible part of
# this file; unless it is defined elsewhere this loop raises NameError.
for col in select_group:
    # value_counts(normalize=True) sorts by frequency, so .iloc[0] is the
    # share of the most common value.
    top_share = combined_data[col].value_counts(normalize=True).iloc[0]
    if top_share > 0.95:
        print(f"建議移除 {col}，其最大類別佔比達 {top_share:.2%}")

In [None]:
# Same 100-row slice the SHAP values were computed on.
X_subset = X_test.iloc[:100, :]

target_label = 'Dissipated'
class_idx = list(best_rf.classes_).index(target_label)

# The SHAP result is handled either as a list with one entry per class or as
# a single array whose third axis selects the class.
if isinstance(shap_values, list):
    target_shap = shap_values[class_idx]
else:
    target_shap = shap_values[:, :, class_idx]

# Pass the axes explicitly. Without `ax`, dependence_plot creates its own
# figure, so a separate plt.figure(figsize=...) call only leaves a blank
# figure behind and the requested size is ignored.
fig, ax = plt.subplots(figsize=(10, 6))
shap.dependence_plot(
    "num_youbike",
    target_shap,
    X_subset,
    interaction_index="num_bus_stop",
    ax=ax,
    show=False,  # rendering is done by the plt.show() call below
)
plt.show()

## MLP

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the string transition labels as integers.
le = LabelEncoder()
y_encoded = le.fit_transform(final_full['transition_label'])

# Re-run the under-sampler created earlier (`rus`) on the encoded target.
# This rebinds X_resampled / y_resampled.
# NOTE(review): this resamples all of X, so the result can contain rows that
# are also in X_test — a model fitted on it should not be scored on X_test.
X_resampled, y_resampled = rus.fit_resample(X, y_encoded)

In [None]:
from sklearn.neural_network import MLPClassifier

# Search space for the MLP.
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 25), (100, 50)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01],
    'max_iter': [2000]
}
mlp_grid = GridSearchCV(
    MLPClassifier(random_state=42, early_stopping=True),
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1
)

# Fit on the training split only, with its labels integer-encoded by `le`.
# Fitting on the whole resampled set would train on the very rows that are
# later used as X_test, leaking test data into the evaluation.
mlp_grid.fit(X_train, le.transform(y_train))

print(f"Best params: {mlp_grid.best_params_}")
best_mlp = mlp_grid.best_estimator_

In [None]:
# The MLP is trained on integer-encoded labels, so encode the test labels too.
y_test_le = le.transform(y_test)
y_pred = best_mlp.predict(X_test)

# Create the axes explicitly and hand them to scikit-learn. Without `ax`,
# from_predictions opens its own figure, so a separate plt.figure(...) call
# only produces an extra blank figure and its figsize is never applied.
fig, ax = plt.subplots(figsize=(10, 8))
ConfusionMatrixDisplay.from_predictions(
    y_test_le, y_pred,
    # Integer codes in the same order as le.classes_, so the tick labels
    # always line up with the matrix rows and columns.
    labels=le.transform(le.classes_),
    display_labels=le.classes_,
    cmap='RdPu',
    normalize='true',
    ax=ax,
)
plt.show()

In [None]:
# import shap

# background = shap.kmeans(X_train, 10)
# explainer = shap.KernelExplainer(best_mlp.predict_proba, background)

# random_samples = X_test.sample(1000, random_state=42)
# shap_values = explainer.shap_values(random_samples)

# X_test_subset = X_test.iloc[:1000, :] 

# plt.figure(figsize=(12, 8))
# shap.dependence_plot(
#     "num_youbike", 
#     shap_values[:, :, 2],
#     X_test_subset, 
#     interaction_index="num_bus_stop" 
# )