In [2]:
import pandas as pd
df = pd.read_csv('../../data/person_data_no_na.csv')

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


numeric_features = [
    "V_YEAR", 
    "P_AGE"
]

categorical_features = [
    "V_TYPE",
    "P_SEX",
    "P_PSN",
    "P_SAFE",
    "P_USER"
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), categorical_features),
    ]
)

model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(
        n_estimators=50,  # fewer trees
        max_depth=20,     # limit depth
        n_jobs=-1,         # use all CPU cores
    ))
])

X = df.drop("P_ISEV", axis=1)
y = df["P_ISEV"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      
    random_state=42,     
    shuffle=True         
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.42      0.48    314927
           1       0.65      0.77      0.70    442587
           2       0.25      0.01      0.01      5103

    accuracy                           0.62    762617
   macro avg       0.49      0.40      0.40    762617
weighted avg       0.61      0.62      0.61    762617



### Random undersampling of nonfatal injuries and no injuries

In [4]:
num_fatality = df[df["P_ISEV"] == 2].shape[0]
num_fatal_injury = df[df["P_ISEV"] == 2].shape[0]
df_undersampled_0 = df[df["P_ISEV"] == 0].sample(n=num_fatality * 2, random_state=42)
df_undersampled_1 = df[df["P_ISEV"] == 1].sample(n=num_fatality * 2, random_state=42)
df_balanced = pd.concat([df[df["P_ISEV"] == 2], df_undersampled_0, df_undersampled_1])

X = df_balanced.drop("P_ISEV", axis=1)
y = df_balanced["P_ISEV"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      
    random_state=42,     
    shuffle=True         
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.64      0.58     10358
           1       0.51      0.49      0.50     10147
           2       0.62      0.41      0.50      5119

    accuracy                           0.53     25624
   macro avg       0.55      0.51      0.53     25624
weighted avg       0.54      0.53      0.53     25624



### Try using sample weights combined with undersampling

In [8]:
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(
        n_estimators=50,  # fewer trees
        max_depth=20,     # limit depth
        n_jobs=-1,         # use all CPU cores
        class_weight={0:1, 1:1, 2: 2}
    ))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.61      0.57     10358
           1       0.51      0.47      0.49     10147
           2       0.54      0.48      0.51      5119

    accuracy                           0.53     25624
   macro avg       0.53      0.52      0.52     25624
weighted avg       0.53      0.53      0.53     25624



## Feature importance

In [9]:
ohe = model.named_steps["preprocess"].named_transformers_["cat"]
ohe_features = ohe.get_feature_names_out(categorical_features)

feature_names = (
    numeric_features +
    list(ohe_features)
)

In [10]:
importances = model.named_steps["rf"].feature_importances_

feature_importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

print(feature_importance_df)

  feature  importance
1   P_AGE    0.334170
5  P_SAFE    0.240642
0  V_YEAR    0.203330
6  P_USER    0.067650
2  V_TYPE    0.060709
4   P_PSN    0.049731
3   P_SEX    0.043768
