In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the hexagon grid and keep only the cells that recorded at least one
# accident (rows with accident_count == 0 are discarded).
hex_gi = (
    pd.read_csv('../ComputedDataV5/ForModel/base_hex_grid_spd_final.csv')
    .loc[lambda grid: grid['accident_count'] > 0]
)

In [None]:
# Histogram of a single feature: values are log1p-transformed before binning
# and the bin counts are drawn on a log scale (log=True).
feature = 'count_crossing'
hex_grid_with_features_filter = hex_gi
hex_grid_with_features_filter[feature].pipe(np.log1p).hist(bins=50, log=True)
# To look at the untransformed values instead, plot the column directly:
# hex_grid_with_features_filter[feature].hist(bins=50, log=True)
plt.title(feature)
plt.show()

In [None]:
# Display labels (Traditional Chinese) for the columns kept for modelling,
# grouped by the dataset each column comes from. The groups are merged below in
# their original order, so iterating `column_translation` yields the same key
# sequence as a single flat literal would.

# --- 1. From gis_osm_roads_free_1.shp (roads) ---
_road_length_labels = {
    'road_len_motorway': '國道(高速公路)長度(roads)',
    'road_len_motorway_link': '國道匝道長度(roads)',
    'road_len_trunk': '快速道路長度(roads)',
    'road_len_trunk_link': '快速道路匝道長度(roads)',
    'road_len_primary': '省道(主要幹道)長度(roads)',
    'road_len_primary_link': '省道匝道長度(roads)',
    'road_len_secondary': '縣道(次要幹道)長度(roads)',
    'road_len_secondary_link': '縣道匝道長度(roads)',
    'road_len_tertiary': '鄉道(一般道路)長度(roads)',
    'road_len_tertiary_link': '鄉道匝道長度(roads)',
    'road_len_unclassified': '無分級道路長度(roads)',
    'road_len_residential': '住宅區街道(巷弄)長度(roads)',
    'road_len_living_street': '人車共用道(生活街道)長度(roads)',
    'road_len_service': '服務道路長度(roads)',
    'road_len_pedestrian': '行人徒步區長度(roads)',
    'road_len_track': '產業道路長度(roads)',
    'road_len_busway': '公車專用道長度(roads)',
    'road_len_cycleway': '自行車道長度(roads)',
    'road_len_footway': '人行道長度(roads)',
    'road_len_path': '小徑長度(roads)',
    'road_len_steps': '階梯長度(roads)',
    'road_len_bridleway': '馬道長度(roads)',
    'road_len_unknown': '未知類型道路長度(roads)',
}

# Track roads broken down by grade (roads)
_track_grade_labels = {
    'road_len_track_grade1': '產業道路_硬鋪面(roads)',
    'road_len_track_grade2': '產業道路_混合鋪面(roads)',
    'road_len_track_grade3': '產業道路_軟混合(roads)',
    'road_len_track_grade4': '產業道路_植被壓實(roads)',
    'road_len_track_grade5': '產業道路_鬆軟泥土(roads)',
}

# --- 2. From gis_osm_traffic_free_1.shp (traffic) ---
_traffic_labels = {
    'count_traffic_signals': '交通號誌(紅綠燈)數量(traffic)',
    'count_stop': '停車標誌數量(traffic)',
    'count_crossing': '行人穿越道(斑馬線)數量(traffic)',
    'count_speed_camera': '測速照相機數量(traffic)',
    'count_parking': '路邊停車點數量(traffic)',
    'count_motorway_junction': '交流道數量(traffic)',
}

# --- 3. From gis_osm_transport_free_1.shp (transport) ---
_transport_labels = {
    'count_bus_stop': '公車站牌數量(transport)',
    'count_train_station': '火車站數量(transport)',
}

# --- 4. From gis_osm_pois_free_1.shp (pois) --- currently disabled:
# 'count_alcohol': '飲酒場所數量(pois)',
# 'count_convenience': '便利商店數量(pois)',
# 'count_school': '學校數量(pois)',

# --- 5. From external CSV data (local data) ---
_local_data_labels = {
    'count_mrt': '捷運站出口數量(mrt)',
    'count_youbike': 'YouBike站點數量(youbike)',
    'count_parking_official': '公有路外停車場數量(parkinglot)',
}

_road_network_point_labels = {
    'count_intersection': '交叉口數量(roads)',
    'count_spd_points': '速差點數量(roads)',
}

# --- 6. Model result ---
_model_result_labels = {
    'gi_category': 'gi_category',
}

column_translation = {
    **_road_length_labels,
    **_track_grade_labels,
    **_traffic_labels,
    **_transport_labels,
    **_local_data_labels,
    **_road_network_point_labels,
    **_model_result_labels,
}
# Keep only the columns that have an entry in `column_translation` (this
# includes `gi_category`). The keys are passed as an explicit list because
# pandas 2.0+ rejects `df[some_dict]` with
# "TypeError: Passing a dict as an indexer is not supported. Use a list instead."
hex_gi = hex_gi[list(column_translation)]

In [None]:
from new_model import model_preprocess

# Split / resample the data with the project helper.
# NOTE(review): judging by the names only, this presumably yields a balanced
# training split, the untouched test split, an under-sampled test split and the
# feature names — confirm against new_model.model_preprocess.
(
    X_train_bal,
    X_test_original,
    y_train_bal,
    y_test_original,
    X_test_rus,
    y_test_rus,
    feat_names,
) = model_preprocess(hex_gi, dim='1way')

# Fit a 100-tree random forest with a fixed seed; fit() returns the estimator
# itself, so construction and training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_bal, y_train_bal
)
y_pred_rf = rf_model.predict(X_test_original)

print(classification_report(y_test_original, y_pred_rf))

# Optional: the same report on the X_test_rus / y_test_rus split.
# y_pred_rf_rus = rf_model.predict(X_test_rus)
# print(classification_report(y_test_rus, y_pred_rf_rus))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

import platform
import warnings

from matplotlib import font_manager

# The figures in this notebook are labelled in Traditional Chinese, so
# matplotlib needs a font that contains CJK glyphs. The OS-specific favourite is
# tried first; the remaining names are common CJK fonts so that platforms
# without a dedicated entry (e.g. Linux) also get readable labels whenever one
# of them is installed.
system = platform.system()
_preferred_cjk_font = {
    'Darwin': 'Arial Unicode MS',
    'Windows': 'Microsoft JhengHei',
}.get(system)
_cjk_font_candidates = [
    'Noto Sans CJK TC',
    'Noto Sans TC',
    'PingFang TC',
    'Microsoft JhengHei',
    'Arial Unicode MS',
    'WenQuanYi Zen Hei',
]
if _preferred_cjk_font is not None:
    _cjk_font_candidates.insert(0, _preferred_cjk_font)

# Only select a font matplotlib has actually discovered; naming a missing
# family would make every text draw fall back with a findfont warning.
_installed_fonts = {entry.name for entry in font_manager.fontManager.ttflist}
_cjk_font = next(
    (name for name in _cjk_font_candidates if name in _installed_fonts), None
)
if _cjk_font is not None:
    plt.rcParams['font.family'] = [_cjk_font]
else:
    warnings.warn(
        'No CJK-capable font was found; Chinese labels in the figures may '
        'render as empty boxes. Install one of: '
        + ', '.join(_cjk_font_candidates)
    )

def plot_confusion_matrix(y_true, y_pred, title='Random Forest 混淆矩陣',
                          class_names=('非熱點', '熱點')):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Parameters
    ----------
    y_true, y_pred
        True and predicted labels; passed unchanged to
        ``confusion_matrix``.
    title : str, optional
        Figure title. The default keeps the original Random Forest heading.
    class_names : sequence of str, optional
        Display name of each class, in the order the matrix rows/columns are
        arranged. The default is the original pair (non-hotspot, hotspot).

    Raises
    ------
    ValueError
        If the confusion matrix does not have exactly one row/column per entry
        of ``class_names`` (e.g. only one class is present, or more than two).
        Raised before any figure is created.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Fail early with a readable message instead of letting the plotting
    # backend complain about a tick/label count mismatch.
    if cm.shape[0] != len(class_names):
        raise ValueError(
            f'confusion matrix has shape {cm.shape} but {len(class_names)} '
            f'class names were given: {list(class_names)}'
        )

    # NOTE(review): the default names assume the non-hotspot label comes first
    # in the matrix — confirm against how gi_category is encoded.
    predicted_labels = [f'預測: {name}' for name in class_names]
    actual_labels = [f'實際: {name}' for name in class_names]

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=predicted_labels,
                yticklabels=actual_labels,
                annot_kws={"size": 14})
    plt.title(title, fontsize=15)
    plt.ylabel('真實情況', fontsize=12)
    plt.xlabel('模型預測', fontsize=12)
    plt.tight_layout()
    plt.show()

def plot_feature_importance(model, feature_names, top_n=20, translation=None):
    """Plot the largest feature importances of a fitted model as horizontal bars.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    feature_names
        Feature names, in the same order and of the same length as
        ``model.feature_importances_``.
    top_n : int, optional
        Maximum number of features to show (the most important ones).
    translation : dict, optional
        Maps a feature name to its display label. Defaults to the
        notebook-level ``column_translation``. Features without an entry are
        shown under their raw name.

    Raises
    ------
    ValueError
        If ``feature_names`` and ``model.feature_importances_`` differ in length.
    """
    if translation is None:
        # Looked up at call time so the notebook-level mapping can be edited
        # without redefining this function.
        translation = column_translation

    importances = model.feature_importances_
    if len(importances) != len(feature_names):
        raise ValueError(
            f'got {len(feature_names)} feature names for '
            f'{len(importances)} feature importances'
        )

    feature_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    })

    # Translate where possible, otherwise fall back to the raw feature name.
    feature_df['Feature_TW'] = feature_df['Feature'].map(translation)
    feature_df['Feature_TW'] = feature_df['Feature_TW'].fillna(feature_df['Feature'])
    feature_df = feature_df.sort_values(by='Importance', ascending=False).head(top_n)

    # The title reports how many bars are really drawn, which is smaller than
    # top_n when the model has fewer features.
    shown = len(feature_df)

    plt.figure(figsize=(10, 8))
    # NOTE(review): recent seaborn releases warn about `palette` without `hue`;
    # left as-is because the replacement arguments depend on the seaborn version.
    sns.barplot(x='Importance', y='Feature_TW', data=feature_df, palette='Reds_r')

    plt.title(f'Random Forest - Top {shown} 關鍵風險因子', fontsize=16, fontweight='bold')
    plt.xlabel('特徵重要性 (Feature Importance)', fontsize=12)
    plt.ylabel('環境特徵', fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# Render both diagnostics for the random forest evaluated on the original test split.
plot_confusion_matrix(y_true=y_test_original, y_pred=y_pred_rf)
plot_feature_importance(model=rf_model, feature_names=feat_names, top_n=20)