ComputedDataV2/ModelPerformance/nn_modelV2.pth: 這裡是增加肇因後的模型表現<br/>
ComputedDataV2/ModelPerformance/nn_modelV3.pth: 這裡是增加三次交互的模型表現<br/>
ComputedDataV2/ModelPerformance/nn_modelV4.pth: 這裡只考慮二次交互並指定人車路的表現<br/>
ComputedDataV2/ModelPerformance/nn_modelV5.pth: 做refit，考慮所有二三維但各取20，目前存到v6篩選的版本<br/>
ComputedDataV2/ModelPerformance/nn_modelV6.pth: Refit V3內容<br/>

In [1]:
import os

# Build the path "<parent of the current working directory>/utils".
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
analyze_path = os.path.join(parent_dir, "utils")

# Make that directory the working directory. Later cells import helper modules
# by bare name and read data through "../"-relative paths, presumably relying
# on this location.
os.chdir(analyze_path)

In [2]:
import joblib
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.rcParams['font.family'] = ['Arial Unicode Ms']

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from utils_model import eval_loop, to_tensors, model_preprocess, print_results, get_importance

from utils import read_data, read_taiwan_specific
from config import cause_mapping, countycity_dct
from config_new import for_poly, group_translation

# Suffix interpolated into the model / permutation file names used below.
version = "V5"
# Top-level directory name interpolated into every "../{computeddata}/..." path.
computeddata = 'ComputedDataV2'

In [None]:
# Load the combined records through the project helper (called with no arguments).
combined_data = read_data()
# The helper is asked to read the grid as well (read_grid=True) and returns two objects.
taiwan, grid_filter = read_taiwan_specific(read_grid=True)

def map_cause(cause):
    """Return the category of ``cause_mapping`` whose collection contains *cause*.

    Categories are checked in mapping order and the first hit wins.
    ``"Unknown"`` is returned when no category contains the cause.
    """
    hits = (category for category, causes in cause_mapping.items() if cause in causes)
    return next(hits, "Unknown")

# Derive a coarse cause category from the primary cause sub-category column.
combined_data["cause_group"] = combined_data["肇因研判子類別名稱-主要"].apply(map_cause)
# Category counts; not displayed here because this is not the cell's last expression.
combined_data['cause_group'].value_counts()

# Load the precomputed feature table and let the project helper produce the
# train / test splits, a resampled test set and the label encoder `le`.
all_features_df = pd.read_csv(f"../{computeddata}/ForModel/all_featuresV2.csv")
X_train, X_test, y_train, y_test, X_resampled_test, y_resampled_test, le =\
     model_preprocess(grid_filter, all_features_df, for_poly=for_poly, dim='mixed')

## Model

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for an elastic-net logistic regression.
# NOTE: this name is reassigned in a later cell before any GridSearchCV is built.
param_grid_lr = {
    'penalty': ['elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.1, 0.5, 0.9],
    'C': [0.01, 0.1, 1, 10]
}

# Candidate hyper-parameters for the random forest (also reassigned later).
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2, 5]
}

# Shuffled, seeded 5-fold stratified splitter. In the visible cells it is only
# referenced by a commented-out cross_val_score snippet.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Baseline elastic-net logistic regression with balanced class weights.
lr = LogisticRegression(
        penalty='elasticnet', solver='saga', l1_ratio=0.5,
        class_weight='balanced', max_iter=1000, 
        random_state=42, 
        multi_class='multinomial',
        n_jobs=-1
    )
# Baseline random forest with balanced class weights and the same seed.
rf = RandomForestClassifier(
        n_estimators=300, max_depth=None, min_samples_leaf=1,
        class_weight='balanced', n_jobs=-1, random_state=42,
    )

# for name, clf in [('Logistic', lr), ('RandomForest', rf)]:
#     scores = cross_val_score(clf, X_train, y_train, cv=cv, n_jobs=-1, scoring='roc_auc')
#     print(f'{name} CV ROC AUC: {scores.mean():.3f} ± {scores.std():.3f}')

# Fit both baselines on the full (unrefined) training matrix.
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)

In [None]:
# Class probabilities of both baselines on the resampled test set.
proba_test_lr = lr.predict_proba(X_resampled_test)
proba_test_rf = rf.predict_proba(X_resampled_test)

# Report through the project helper, passing the encoder's class labels.
print_results(proba_test_lr, le.classes_, y_resampled_test)
print_results(proba_test_rf, le.classes_, y_resampled_test)

In [None]:
# get_importance returns a pair per model; the first element is used as a
# {feature: values} mapping in later cells.
importance_rf, importance_grouped_rf = get_importance(rf, X_train)
importance_lr, importance_grouped_lr = get_importance(lr, X_train)

In [None]:
# One row per feature; each dict value supplies the two columns named here.
df = pd.DataFrame.from_dict(importance_lr, orient='index', columns=['importance', 'odds_ratio'])
# Rank from the largest importance to the smallest and display.
df_sorted = df.sort_values(by='importance', ascending=False)
df_sorted

## Refit using only the important features

In [None]:
# Hard-coded to V6 because V3/V6 are the complete three-way-feature versions.
# NOTE: this rebinds `lr` and `rf`, replacing the baselines fitted above.
lr = joblib.load(f'../{computeddata}/ModelPerformance/lr_modelV6.pkl')
rf = joblib.load(f'../{computeddata}/ModelPerformance/rf_modelV6.pkl')

# Align the columns with the order the loaded model was fitted on; columns the
# current matrix lacks are filled with 0.
feature_order = lr.feature_names_in_
X_sorted = X_resampled_test.reindex(columns=feature_order, fill_value=0)

# Per-feature scores of each loaded model, plus their absolute value for ranking.
importance_lr_dict, _ = get_importance(lr, X_sorted)
df_lr_ranked = pd.DataFrame.from_dict(importance_lr_dict, orient='index', columns=['score', 'exp_score'])
df_lr_ranked['abs_score'] = df_lr_ranked['score'].abs()
importance_rf_dict, _ = get_importance(rf, X_sorted)
df_rf_ranked = pd.DataFrame.from_dict(importance_rf_dict, orient='index', columns=['score', 'exp_score'])
df_rf_ranked['abs_score'] = df_rf_ranked['score'].abs()

def get_dimension(feature_name):
    """Return the interaction order encoded in a feature name.

    The order follows the number of ``' x '`` separators: none -> 1,
    one -> 2, two -> 3. Any other count yields 0.
    """
    separators = feature_name.count(' x ')
    return {0: 1, 1: 2, 2: 3}.get(separators, 0)

# Tag every ranked feature with its interaction order (1 = single feature).
df_lr_ranked['dimension'] = df_lr_ranked.index.map(get_dimension)
df_rf_ranked['dimension'] = df_rf_ranked.index.map(get_dimension)


def _features_of_dimension(ranked, dimension, top_k=None):
    """Return the feature names of one interaction order.

    With *top_k* set, only the *top_k* names with the largest ``abs_score``
    are returned (largest first); otherwise all names in frame order.
    """
    subset = ranked[ranked['dimension'] == dimension]
    if top_k is not None:
        subset = subset.sort_values('abs_score', ascending=False).head(top_k)
    return subset.index.tolist()


def _ordered_union(first, second):
    """Return the union of two name lists, keeping first-seen order."""
    return list(dict.fromkeys(first + second))


# Keep every single feature, and the 20 strongest two-way / three-way terms per model.
lr_dim1 = _features_of_dimension(df_lr_ranked, 1)
lr_dim2 = _features_of_dimension(df_lr_ranked, 2, top_k=20)
lr_dim3 = _features_of_dimension(df_lr_ranked, 3, top_k=20)
rf_dim1 = _features_of_dimension(df_rf_ranked, 1)
rf_dim2 = _features_of_dimension(df_rf_ranked, 2, top_k=20)
rf_dim3 = _features_of_dimension(df_rf_ranked, 3, top_k=20)

# Union of both models' picks. A plain set() would order the names by string
# hash, which is randomised per interpreter session; the resulting column order
# of the refined matrices would then change from run to run and could disagree
# with the column order stored in previously saved models.
final_dim1 = _ordered_union(lr_dim1, rf_dim1)
final_dim2 = _ordered_union(lr_dim2, rf_dim2)
final_dim3 = _ordered_union(lr_dim3, rf_dim3)
final_feature_list = final_dim1 + final_dim2 + final_dim3

print(f"原始特徵保留: {len(final_dim1)} 個")
print(f"二維特徵保留: {len(final_dim2)} 個 (LR與RF聯集)")
print(f"三維特徵保留: {len(final_dim3)} 個 (LR與RF聯集)")
print(f"最終模型特徵總數: {len(final_feature_list)} 個")

print("top 3 way")
print(_features_of_dimension(df_lr_ranked, 3, top_k=5))

In [None]:
# Restrict the training matrix and the resampled test matrix to the selected features.
X_train_refined = X_train[final_feature_list]
X_test_refined = X_resampled_test[final_feature_list]

In [None]:
# Single-point grid; the wider C grid is kept commented out.
param_grid_lr = {
    # 'C': [0.1, 1, 10, 100], 
    'C': [100], 
    'l1_ratio': [0.9]
}

# Elastic-net logistic regression; l1_ratio and C are supplied by the grid.
lr_refined = LogisticRegression(
    penalty='elasticnet', solver='saga', 
    class_weight='balanced', max_iter=1000, 
    random_state=42, n_jobs=-1
)

# 5-fold search scored by ROC AUC, fitted on the refined training features.
grid_lr = GridSearchCV(lr_refined, param_grid_lr, cv=5, scoring='roc_auc', n_jobs=-1)
grid_lr.fit(X_train_refined, y_train)

best_lr_model = grid_lr.best_estimator_
print(f"LR 最佳參數: {grid_lr.best_params_}")

In [None]:
# Single-point grid; the alternative max_depth list is kept commented out.
param_grid_rf = {
    'n_estimators': [300],
    # 'max_depth': [10, 20],
    'max_depth': [20],
    'min_samples_leaf': [4]
}

rf_refined = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)

# 5-fold search scored by ROC AUC, fitted on the refined training features.
grid_rf = GridSearchCV(rf_refined, param_grid_rf, cv=5, scoring='roc_auc', n_jobs=-1)
grid_rf.fit(X_train_refined, y_train)

best_rf_model = grid_rf.best_estimator_
print(f"RF 最佳參數: {grid_rf.best_params_}")

In [None]:
# Display the refined training matrix for inspection.
X_train_refined

### Save

In [None]:
# Saving is disabled; the models are read back from disk instead.
# joblib.dump(best_lr_model, f'../{computeddata}/ModelPerformance/lr_model{version}.pkl')
# joblib.dump(best_rf_model, f'../{computeddata}/ModelPerformance/rf_model{version}.pkl')

# load model
# NOTE: this rebinds best_lr_model / best_rf_model, replacing the grid-search results.
best_lr_model = joblib.load(f'../{computeddata}/ModelPerformance/lr_model{version}.pkl')
best_rf_model = joblib.load(f'../{computeddata}/ModelPerformance/rf_model{version}.pkl')

# Align test columns with the order stored in the loaded LR model; columns that
# are missing from the current matrix are filled with 0.
model_features = best_lr_model.feature_names_in_
X_test_refined = X_test_refined.reindex(columns=model_features, fill_value=0)

# Probabilities and arg-max class indices on the resampled test set.
proba_test_lr = best_lr_model.predict_proba(X_test_refined)
proba_test_rf = best_rf_model.predict_proba(X_test_refined)
y_pred_lr = np.argmax(proba_test_lr, axis=1)
y_pred_rf = np.argmax(proba_test_rf, axis=1)

print_results(proba_test_lr, le.classes_, y_resampled_test)
print_results(proba_test_rf, le.classes_, y_resampled_test)

# Same column alignment for the training matrix before computing importances.
X_train_refined = X_train_refined.reindex(columns=model_features, fill_value=0)

# NOTE: overwrites importance_rf / importance_lr computed earlier for the baselines.
importance_rf, importance_grouped_rf = get_importance(best_rf_model, X_train_refined)
importance_lr, importance_grouped_lr = get_importance(best_lr_model, X_train_refined)

## Neural Network

In [None]:
## Neural Network
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

# Use the MPS backend when torch reports it as available, otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# INPUT_DIM = X_resampled_test.shape[1]
# Input width is the number of columns of the refined test matrix.
INPUT_DIM = X_test_refined.shape[1]
# Number of output logits of the network defined below.
NUM_CLASSES = 2

class BinaryMLP(nn.Module):
    """Fully connected classifier that returns raw logits.

    The network is five hidden ``Linear -> ReLU -> Dropout`` stages of widths
    1024, 512, 256, 128 and 64, followed by a final ``Linear`` layer with
    ``num_classes`` outputs. No softmax is applied.

    Args:
        in_dim: Number of input features.
        num_classes: Number of output logits.
        drop: Dropout probability applied after every hidden activation.
    """

    def __init__(self, in_dim=INPUT_DIM, num_classes=NUM_CLASSES, drop=0.1):
        super().__init__()
        hidden_widths = (1024, 512, 256, 128, 64)
        stages = []
        fan_in = in_dim
        for width in hidden_widths:
            stages += [nn.Linear(fan_in, width), nn.ReLU(), nn.Dropout(drop)]
            fan_in = width
        stages.append(nn.Linear(fan_in, num_classes))  # logits
        # nn.Sequential names sub-modules by position, so the parameter keys
        # are net.0.*, net.3.*, ... net.15.*.
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Pass *x* through the layer stack and return the logits."""
        return self.net(x)

In [None]:
# refit version
# Hold out 20% of the refined training data (stratified, seeded) for validation.
X_train_nn, X_val_nn, y_train_nn, y_val_nn = train_test_split(
    X_train_refined, y_train, test_size=0.2, stratify=y_train, random_state=42
)   

# Convert each split to tensors through the project helper.
X_train_t, y_train_t = to_tensors(X_train_nn, y_train_nn)
X_val_t, y_val_t = to_tensors(X_val_nn, y_val_nn)
X_test_t, y_test_t = to_tensors(X_test_refined, y_resampled_test)

# Only the training loader shuffles; evaluation loaders keep row order.
train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=256, shuffle=True, drop_last=False)
val_loader = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=512, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test_t, y_test_t), batch_size=512, shuffle=False)

model = BinaryMLP().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Early-stopping state consumed by the training loop in the next cell.
best_val = -np.inf
patience = 5
wait = 0
epochs = 20

In [None]:
# Train with early stopping on validation AUC and keep the weights of the best
# epoch. Without the snapshot, `model` would be left at the last epoch run,
# i.e. `patience` epochs past the best one whenever early stopping triggers.
best_state = None
for epoch in range(1, epochs+1):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by batch size so the epoch figure is a per-sample mean.
        total_loss += loss.item() * xb.size(0)

    train_loss = total_loss / len(train_loader.dataset)
    val_metrics = eval_loop(model, val_loader, le)
    print(f'Epoch {epoch:02d}/{epochs} | loss {train_loss:.4f} | '
          f'val_acc {val_metrics["acc"]:.3f} | val_f1 {val_metrics["f1"]:.3f} | val_auc {val_metrics["auc"]:.3f}')

    score_for_early = val_metrics["auc"]
    if score_for_early > best_val:
        best_val = score_for_early
        wait = 0
        # state_dict() hands out references to the live tensors, so clone them.
        best_state = {key: tensor.detach().clone() for key, tensor in model.state_dict().items()}
        # torch.save(model.state_dict(), 'best_model.pt')
    else:
        wait += 1
        if wait >= patience:
            print('Early stopping.')
            break

# Restore the best-epoch weights (skipped if no epoch ever improved on best_val).
if best_state is not None:
    model.load_state_dict(best_state)

In [None]:
# Saving is disabled; the weights are read back from disk instead.
# torch.save(model.state_dict(), f'../{computeddata}/ModelPerformance/nn_model{version}.pth')

# NOTE: this replaces the network trained above with a fresh instance that
# receives the weights stored for `version`.
model = BinaryMLP().to(device)
model.load_state_dict(torch.load(f'../{computeddata}/ModelPerformance/nn_model{version}.pth'))

# Evaluate on the resampled test loader and print the report entry.
test_metrics = eval_loop(model, test_loader, le)
print(test_metrics['report'])

In [None]:
# Re-slice the matrices to the selected features and put the columns in the
# order the loaded models were fitted on (missing columns are filled with 0,
# matching how the refined test matrix is aligned after the models are loaded).
# Slicing by final_feature_list alone leaves the columns in selection order,
# which need not match the models' fitted order; scikit-learn checks feature
# names and their order at predict time.
model_features = best_lr_model.feature_names_in_
X_train_refined = X_train[final_feature_list].reindex(columns=model_features, fill_value=0)
X_test_refined = X_resampled_test[final_feature_list].reindex(columns=model_features, fill_value=0)
X_test_r = X_test[final_feature_list].reindex(columns=model_features, fill_value=0)

from utils_model import build_groups_from_prefix, build_groups_with_interactions, build_pair_interaction_groups, PI_ML, PI_NN

# Feature groups are derived from the (aligned) refined column names.
groups = build_groups_with_interactions(X_test_refined.columns)

# Grouped permutation importance of the three refit models on the
# non-resampled test split, 30 repeats each.
print('lr')
base_lr, perm_lr = PI_ML(best_lr_model, X_test_r, y_test, groups=groups, n_repeats=30)
print('rf') 
base_rf, perm_rf = PI_ML(best_rf_model, X_test_r, y_test, groups=groups, n_repeats=30)
print('nn')
base_nn, perm_nn = PI_NN(model, X_test_r, y_test, groups=groups, n_repeats=30)

In [None]:
from utils_model import build_groups_from_prefix, build_groups_with_interactions, build_pair_interaction_groups, PI_ML, PI_NN

# Variant over every column of X_test, using `lr` / `rf` as currently bound.
# NOTE: running this cell overwrites base_* / perm_* from the refined variant.
groups = build_groups_with_interactions(X_test.columns)

print('lr')
base_lr, perm_lr = PI_ML(lr, X_test, y_test, groups=groups, n_repeats=10)
print('rf') 
base_rf, perm_rf = PI_ML(rf, X_test, y_test, groups=groups, n_repeats=10)
print('nn')
# NOTE(review): `model` is built with INPUT_DIM taken from the refined matrix;
# confirm it can consume the full-width X_test passed here.
base_nn, perm_nn = PI_NN(model, X_test, y_test, groups=groups, n_repeats=10)

In [None]:
# Persist the permutation results, one CSV per model, named by `version`.
perm_lr.to_csv(f'../{computeddata}/Permutation/perm_lr{version}.csv')
perm_rf.to_csv(f'../{computeddata}/Permutation/perm_rf{version}.csv')
perm_nn.to_csv(f'../{computeddata}/Permutation/perm_nn{version}.csv')

# Read them straight back. No index_col is given, so the index written by
# to_csv returns as an extra unnamed column.
perm_lr = pd.read_csv(f'../{computeddata}/Permutation/perm_lr{version}.csv')
perm_rf = pd.read_csv(f'../{computeddata}/Permutation/perm_rf{version}.csv')
perm_nn = pd.read_csv(f'../{computeddata}/Permutation/perm_nn{version}.csv')

In [None]:
# Only the LR results feed the ordering; the RF / NN rows are commented out.
combined = pd.concat([
    perm_lr.assign(model='LR'),
    # perm_rf.assign(model='RF'),
    # perm_nn.assign(model='NN')
], ignore_index=True)

# Replace group keys with their display names from group_translation.
combined["group"] = combined["group"].map(group_translation)

# Groups ordered by ascending mean importance; ypos gives one y slot per group.
order = (combined.groupby('group')['importance'].mean().sort_values(ascending=True).index.tolist())
ypos = np.arange(len(order))

In [None]:
# (frame, legend label) for each model whose permutation importance is drawn.
triples = [
    (perm_lr, 'lr'), 
    (perm_rf, 'rf'), 
    (perm_nn, 'nn')
    ]

plt.figure(figsize=(12, 8))

for perm_df_i, name in triples:
    # Translate the group labels without writing them back into the frame.
    # Mapping in place would leave perm_lr / perm_rf / perm_nn translated, and
    # running this cell again would map the translated labels once more,
    # turning every group into NaN.
    translated_group = perm_df_i["group"].map(group_translation)
    d = (perm_df_i.set_index(translated_group).reindex(order)) # align to the shared group order
    plt.errorbar(
        d['importance'],
        (ypos),
        xerr=d['std'],
        fmt='o',
        linewidth=2,
        capsize=5,
        label=name
    )

plt.yticks(ypos, order)
plt.axvline(0.0, linestyle='--', linewidth=1)
plt.xlabel('Permutation importance')
plt.ylabel('Group')
plt.title('Permutation importance (LR / RF / NN)')
plt.legend()
plt.tight_layout()
plt.show()

# Hitrate

In [None]:
from utils_model import metrics_bin, hitrate_data

# Hit-rate tables from the project helper, one per model, on the resampled test set.
hitrate_lr = hitrate_data(X_resampled_test, y_resampled_test, y_pred_lr)
hitrate_rf = hitrate_data(X_resampled_test, y_resampled_test, y_pred_rf)
hitrate_nn = hitrate_data(X_resampled_test, y_resampled_test, test_metrics['pred_y'])

# Replace county keys with the names from countycity_dct.
hitrate_lr['county'] = hitrate_lr['county'].map(countycity_dct)
hitrate_rf['county'] = hitrate_rf['county'].map(countycity_dct)
hitrate_nn['county'] = hitrate_nn['county'].map(countycity_dct)

In [None]:
# Work on copies so the hit-rate tables themselves stay untouched.
results = {
    'LR': hitrate_lr.copy(),
    'RF': hitrate_rf.copy(),
    'NN': hitrate_nn.copy(),
}
# Counties ordered by LR's f1, best first. NOTE: rebinds `order`.
order = (results['LR'].sort_values('f1', ascending=False)['county']).tolist()

metrics = ['precision', 'recall', 'accuracy', 'f1']
plt.figure(figsize=(16, 10))

# One subplot per metric in a 2x2 grid, with one bar per model for each county.
for i, met in enumerate(metrics, 1):
    ax = plt.subplot(2, 2, i)
    pos = np.arange(len(order))
    width = 0.25
    # NOTE: the loop variable `df` rebinds the notebook-level name `df`.
    for j, (name, df) in enumerate(results.items()):
        d = df.set_index('county').reindex(order)
        # (j-1)*width centres the three bars around each tick.
        ax.bar(pos + (j-1)*width, d[met].values, width=width, label=name)
    ax.set_title(met)
    ax.set_xticks(pos)
    ax.set_xticklabels(order, rotation=45, ha='right')
    # Only the second subplot carries the legend.
    if i == 2:
        ax.legend(loc='upper right')
    ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()

In [None]:
# Overall metrics per model from the project helper, on the resampled test set.
m_lr = metrics_bin(y_resampled_test, y_pred_lr)
m_rf = metrics_bin(y_resampled_test, y_pred_rf)
m_nn = metrics_bin(y_resampled_test, test_metrics['pred_y'])

# One row per model, restricted to the four plotted metrics.
df = pd.DataFrame([m_lr, m_rf, m_nn], index=['LR','RF','NN'])
metrics = ['precision', 'recall', 'f1', 'accuracy']
df = df[metrics]

plt.figure(figsize=(9, 5))
x = np.arange(len(metrics))
width = 0.25

# Grouped bars; (i-1)*width centres the three models around each metric tick.
for i, model in enumerate(df.index):
    plt.bar(x + (i-1)*width, df.loc[model].values, width=width, label=model)

plt.xticks(x, metrics)
plt.ylim(0, 1)
plt.ylabel('Score')
plt.title('Overall Metrics on Resampled Test')
plt.legend(loc='upper right')

# Print each value just above its bar.
# NOTE: the loop variable `model` rebinds the notebook-level name `model`
# (the neural network) to the last index label.
for i, model in enumerate(df.index):
    vals = df.loc[model].values
    for xi, v in zip(x + (i-1)*width, vals):
        plt.text(xi, v + 0.01, f'{v:.2f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

Start analyzing importance_lr based on the following concepts:
- human behaviour vs car
- car vs road design
- road design vs human behaviour

### Car vs Road
This applies only to models with two-way interactions.

In [None]:
from utils_model_analyze import *

In [None]:
# Car-vs-road view of the LR importances, built by the project helper.
structured_cause = car_vs_road(importance_lr)
structured_cause = pd.DataFrame(structured_cause)
# Drop rows whose index mentions the vehicle-impact feature.
structured_cause = structured_cause[~structured_cause.index.str.contains('車輛撞擊')]

plot_interaction('車種', structured_cause, filter_val=10, top_k=30)

### Human vs Road

In [None]:
# Human-vs-road view of the LR importances, built by the project helper.
structured_cause = human_vs_road(importance_lr)
structured_cause = pd.DataFrame(structured_cause)
# Drop rows whose index mentions vehicle-type or vehicle features.
structured_cause = structured_cause[~structured_cause.index.str.contains('車種')]
structured_cause = structured_cause[~structured_cause.index.str.contains('車輛')]

plot_interaction('cause-group', structured_cause, filter_val=None)

### Human vs Car

In [None]:
# Human-vs-car view of the LR importances, built by the project helper.
structured_cause = human_vs_car(importance_lr)
structured_cause = pd.DataFrame(structured_cause)

plot_interaction('cause-group', structured_cause, filter_val=None)

## This applies only to models with interactions of more than two features

In [None]:
import pandas as pd

In [None]:
# Keep features whose second value (used as the odds ratio below) exceeds 1.
important_groups = []
for k, v in importance_lr.items():
    if v[1] > 1:
        important_groups.append((k, v))

# Unpack the (importance, odds_ratio) pair into separate columns.
df = pd.DataFrame(important_groups, columns=['feature_name', 'scores'])
df['importance'] = df['scores'].apply(lambda x: x[0])
df['odds_ratio'] = df['scores'].apply(lambda x: x[1])

In [None]:
def parse_interaction(row):
    """Split an interaction feature name into road / vehicle / person labels.

    *row* is a feature name whose components are joined by ``' x '``. Each
    component, with any ``'_mean'`` removed, is classified by substring
    checks in this order: county, POI count, speed limit, road attributes,
    vehicle, cause. A slot that no component fills stays ``"無"``; when
    several components fall into the same slot, the last one wins.

    Returns:
        pd.Series: ``[road, vehicle, person, dimension]`` where *dimension*
        is the number of components.
    """
    road_keywords = ('道路', '號誌', '路面', '障礙', '事故類型', '車道', '設施')
    vehicle_keywords = ('車種', '撞擊', '當事者')
    poi_labels = (
        ('parkinglot', '停車場密度'),
        ('youbike', 'YouBike密度'),
        ('mrt', '捷運密度'),
    )

    slots = {'road': "無", 'vehicle': "無", 'person': "無"}
    components = row.split(' x ')

    for component in components:
        name = component.replace('_mean', '')
        # Text after the last underscore; the whole name when there is none.
        value = name.split('_')[-1]

        if 'county' in name:
            # e.g. county_臺北市 -> 臺北市
            slots['road'] = value
            continue

        if 'count' in name:
            # Only the known POI counts get a display label.
            slots['road'] = next((label for key, label in poi_labels if key in name), name)
            continue

        if '速限' in name:
            slots['road'] = '速限'
            continue

        if any(keyword in name for keyword in road_keywords):
            if value not in ('有', '無'):
                slots['road'] = value
                continue
            # "有" / "無" occur for several facility columns, so attach a
            # shortened form of the column name to tell them apart.
            prefix = name.split('_')[0]
            if '名稱' in prefix:
                prefix = prefix.replace('名稱', '').replace('大類別', '')
            if '-' in prefix:
                prefix = prefix.split('-')[-1]
            slots['road'] = f"{value}{prefix}"
            continue

        if any(keyword in name for keyword in vehicle_keywords):
            slots['vehicle'] = value
            continue

        if 'cause' in name:
            slots['person'] = value

    return pd.Series([slots['road'], slots['vehicle'], slots['person'], len(components)])

# Expand every feature name into its road / vehicle / person labels and order.
df[['Road', 'Vehicle', 'Person', 'Dimension']] = df['feature_name'].apply(parse_interaction)
# df = df.replace("機車", "機車與自行車")
# df = df.replace("慢車", "機車與自行車")
# '機車', '慢車', 
# Drop rows with these two vehicle labels.
df = df[~df['Vehicle'].isin(['汽車', '機車與自行車'])]
# Drop rows whose Road label has '市' or '縣' as its third character.
df = df[~df['Road'].str[2].isin(['市', '縣'])]
df = df[~((df['Vehicle'] == '人') & (df['Road'] == '車與車'))] # remove anomalies
df = df[(df['Road'] != '車與車') & (df['Road'] != '人與車')] # these sub-categories are road types too, but would be misleading
# df = df.head(30)

# Shared colour range for the plots below, from 0 to the largest remaining importance.
global_min = 0  
global_max = df['importance'].max()

In [None]:
import plotly.express as px

# Top 30 three-way interactions by importance. The ranking has to happen
# before the cut: taking head(30) first keeps whichever 30 rows come first in
# the frame and only sorts those afterwards.
df_3 = df[df['Dimension'] == 3].sort_values('importance', ascending=False).head(30)
df_plot = df_3.copy()

fig_parcats = px.parallel_categories(
    df_plot, 
    dimensions=['Road', 'Vehicle', 'Person'],
    color="importance",
    color_continuous_scale=px.colors.sequential.Inferno,
    labels={'Road':'道路設計', 'Vehicle':'車種', 'Person':'肇因'},
    # Same colour range as the other interaction plots so colours are comparable.
    range_color=[global_min, global_max]
)

fig_parcats.update_layout(
    title="三維交互作用圖",
    height=800,
    width=1500
)

fig_parcats.show()

# 2 Way interaction

In [None]:
def parallel_categories_plot(df_plot, dimensions_dct, title="二維交互作用圖"):
    """Show a parallel-categories plot coloured by the ``importance`` column.

    Args:
        df_plot: Frame holding an ``importance`` column and one column per key
            of *dimensions_dct*.
        dimensions_dct: Mapping of column name -> axis label. Its keys, in
            order, are the plotted dimensions.
        title: Figure title. Defaults to the two-way interaction title, so
            existing two-argument calls are unchanged.

    The colour range comes from the notebook-level ``global_min`` and
    ``global_max``. The figure is shown; nothing is returned.
    """
    df_plot = df_plot.sort_values('importance', ascending=False)
    fig_parcats = px.parallel_categories(
        df_plot,
        dimensions=list(dimensions_dct),
        color="importance",
        color_continuous_scale=px.colors.sequential.Inferno,
        labels=dimensions_dct,
        range_color=[global_min, global_max]
    )

    fig_parcats.update_layout(
        title=title,
        height=800,
        width=1500
    )

    fig_parcats.show()

# Rank once by importance so that every head(30) below really is a top 30.
# Cutting the unsorted frame keeps the first 30 matching rows in frame order.
ranked = df.sort_values('importance', ascending=False)

# Two-way subsets: exactly one of the three slots is left at the "無" placeholder.
df_cause_road = ranked[(ranked['Vehicle'] == '無') & (ranked['Person'] != '無') & (ranked['Road'] != '無')].head(30).copy()
df_cause_road['Interaction'] = '人 vs 路'
df_car_road = ranked[(ranked['Vehicle'] != '無') & (ranked['Person'] == '無') & (ranked['Road'] != '無')].head(30).copy()
df_car_road['Interaction'] = '車 vs 路'
df_human_car = ranked[(ranked['Vehicle'] != '無') & (ranked['Person'] != '無') & (ranked['Road'] == '無')].head(30).copy()
df_human_car['Interaction'] = '人 vs 車'

df_combined = pd.concat([df_cause_road, df_car_road, df_human_car])
# df_combined = df_combined[df_combined['importance'] > 0.001]

df_combined = df_combined.sort_values('importance', ascending=False)

dims = ['Interaction', 'Road', 'Vehicle', 'Person']
labels_map = {'Interaction': '交互類型', 'Road':'道路設計', 'Vehicle':'車種', 'Person':'肇因'}

fig_combined = px.parallel_categories(
    df_combined, 
    dimensions=dims,
    color="importance",
    color_continuous_scale=px.colors.sequential.Inferno,
    labels=labels_map,
    # Same colour range as the other interaction plots so colours are comparable.
    range_color=[global_min, global_max]
)

fig_combined.update_layout(
    title="二維交互作用圖",
    height=900,
    width=1600
)

fig_combined.show()

In [None]:
# Frequency of each value in the initial vehicle-impact category column.
print(combined_data['車輛撞擊部位大類別名稱-最初'].value_counts())

In [None]:
# Raw records matching all three conditions of one specific three-way
# interaction (lane-divider value, impact category, cause group).
combined_data[(combined_data['車道劃分設施-分道設施-快車道或一般車道間名稱'] == '禁止變換車道線(無標記)') & 
              (combined_data['車輛撞擊部位大類別名稱-最初'] == '機車與自行車') &
              (combined_data['cause_group'] == 'Posture') ]

In [None]:
# Cross-tabulate one three-way interaction column against the training labels.
col_name = '車道劃分設施-分道設施-快車道或一般車道間名稱_禁止變換車道線(無標記) x 車輛撞擊部位大類別名稱-最初_機車與自行車 x cause-group_Posture'
a = X_train_refined[[col_name]]
# NOTE(review): concat aligns on the index; assumes y_train shares X_train_refined's index.
b = pd.concat([a, y_train], axis=1)

# Binarise the column: any positive value becomes '有', everything else '無'.
b[col_name] = b[col_name].apply(lambda x: '有' if x > 0 else '無')
b.value_counts()