In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Read the hex-grid CSV and keep only the rows whose accident_count is positive.
hex_gi = (
    pd.read_csv('../ComputedDataV5/ForModel/base_hex_grid_spd_final.csv')
    .loc[lambda frame: frame['accident_count'] > 0]
)

In [None]:
# Histogram of log1p(<feature>) with a log-scaled count axis.
# Plot the raw column instead of `log_scaled` to see the untransformed distribution.
feature = 'count_crossing'
hex_grid_with_features_filter = hex_gi
log_scaled = np.log1p(hex_grid_with_features_filter[feature])
ax = log_scaled.hist(bins=50, log=True)
ax.set_title(f'{feature}')
plt.show()

In [None]:
from config import column_translation

# Keep only the columns named by column_translation.
# column_translation is also handed to Series.map() in plot_feature_importance, so it is
# presumably a dict keyed by column name (confirm in config). Indexing a DataFrame
# directly with a dict raises TypeError on pandas >= 2.0; list(...) yields the dict's
# keys and selects the same columns on every pandas version.
hex_gi = hex_gi[list(column_translation)]

In [None]:
from new_model import model_preprocess

# model_preprocess returns seven values. Judging by the names they are a balanced
# training set, the untouched test split, a second (X_test_rus / y_test_rus) test
# split and the feature names -- confirm against new_model.
(
    X_train_bal,
    X_test_original,
    y_train_bal,
    y_test_original,
    X_test_rus,
    y_test_rus,
    feat_names,
) = model_preprocess(hex_gi, dim='1way')

# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_bal, y_train_bal
)
y_pred_rf = rf_model.predict(X_test_original)

print(classification_report(y_test_original, y_pred_rf))

# Optional: report on the X_test_rus / y_test_rus split as well.
# y_pred_rf_rus = rf_model.predict(X_test_rus)
# print(classification_report(y_test_rus, y_pred_rf_rus))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

import platform

# The plots below use Chinese labels, so select a platform font expected to
# contain CJK glyphs. On any other platform matplotlib's font setting is left as is.
system = platform.system()
cjk_font = {
    'Darwin': 'Arial Unicode MS',
    'Windows': 'Microsoft JhengHei',
}.get(system)
if cjk_font is not None:
    plt.rcParams['font.family'] = [cjk_font]

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Rows are the actual classes and columns the predicted classes, in the label
    order produced by ``confusion_matrix``. The tick labels are hard-coded for a
    two-class (non-hotspot / hotspot) problem.
    """
    matrix = confusion_matrix(y_true, y_pred)
    tick_text = {
        'xticklabels': ['預測: 非熱點', '預測: 熱點'],
        'yticklabels': ['實際: 非熱點', '實際: 熱點'],
    }

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',  # cells hold integer counts
        cmap='Blues',
        cbar=False,
        annot_kws={"size": 14},
        ax=ax,
        **tick_text,
    )
    ax.set_title('Random Forest 混淆矩陣', fontsize=15)
    ax.set_ylabel('真實情況', fontsize=12)
    ax.set_xlabel('模型預測', fontsize=12)
    fig.tight_layout()
    plt.show()

def plot_feature_importance(model, feature_names, top_n=20):
    """Plot the highest-ranked feature importances of a fitted model as horizontal bars.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names aligned one-to-one with ``model.feature_importances_``.
        top_n: Maximum number of features to show, highest importance first.

    Bar labels come from the module-level ``column_translation`` mapping; a feature
    without an entry keeps its original name. The title reports the number of bars
    actually drawn, which is smaller than ``top_n`` when the model has fewer features.
    """
    feature_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': model.feature_importances_,
    })

    # Translate the labels, falling back to the raw feature name where no translation exists.
    feature_df['Feature_TW'] = (
        feature_df['Feature'].map(column_translation).fillna(feature_df['Feature'])
    )
    feature_df = feature_df.sort_values(by='Importance', ascending=False).head(top_n)

    # head() returns fewer than top_n rows when there are not enough features,
    # so take the count for the title from the data that is plotted.
    shown = len(feature_df)

    fig, ax = plt.subplots(figsize=(10, 8))
    # NOTE(review): seaborn >= 0.13 emits a FutureWarning for `palette` without `hue`.
    # The documented replacement (hue=<y column>, legend=False) needs seaborn >= 0.13,
    # so the call is left unchanged until the installed version is confirmed.
    sns.barplot(x='Importance', y='Feature_TW', data=feature_df, palette='Reds_r', ax=ax)

    ax.set_title(f'Random Forest - Top {shown} 關鍵風險因子', fontsize=16, fontweight='bold')
    ax.set_xlabel('特徵重要性 (Feature Importance)', fontsize=12)
    ax.set_ylabel('環境特徵', fontsize=12)
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

# Visualise the random forest's results on the original test split.
plot_confusion_matrix(y_true=y_test_original, y_pred=y_pred_rf)
plot_feature_importance(model=rf_model, feature_names=feat_names, top_n=20)