In [15]:
import pandas as pd

In [16]:
df = pd.read_csv('../../data/collision_data_no_na.csv')

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

numeric_features = [
    "C_YEAR", 
    "C_VEHS",
    "C_MNTH_sin", "C_MNTH_cos",
    "C_WDAY_sin", "C_WDAY_cos",
    "C_HOUR_sin", "C_HOUR_cos"
]

categorical_features = [
    "C_CONF",
    "C_RCFG",
    "C_WTHR",
    "C_RSUR",
    "C_RALN",
    "C_TRAF"
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), categorical_features),
    ]
)

model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(
        n_estimators=50,  # fewer trees
        max_depth=20,     # limit depth
        n_jobs=-1,         # use all CPU cores
    ))
])

X = df.drop("C_SEV", axis=1)
y = df["C_SEV"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      
    random_state=42,     
    shuffle=True         
)

df[categorical_features].nunique()

C_CONF    19
C_RCFG    11
C_WTHR     8
C_RSUR    10
C_RALN     7
C_TRAF    18
dtype: int64

In [18]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    380011
           1       0.94      0.02      0.03      6085

    accuracy                           0.98    386096
   macro avg       0.96      0.51      0.51    386096
weighted avg       0.98      0.98      0.98    386096



### Random undersampling of nonfatal collisions

In [19]:
num_positive = df[df["C_SEV"] == 1].shape[0]
df_undersampled_neg = df[df["C_SEV"] == 0].sample(n=num_positive*2, random_state=42)
df_balanced = pd.concat([df[df["C_SEV"] == 1], df_undersampled_neg])

X = df_balanced.drop("C_SEV", axis=1)
y = df_balanced["C_SEV"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      
    random_state=42,     
    shuffle=True         
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.87      0.82     12348
           1       0.65      0.47      0.55      6045

    accuracy                           0.74     18393
   macro avg       0.71      0.67      0.68     18393
weighted avg       0.73      0.74      0.73     18393



### Try using sample weights combined with undersampling

In [20]:
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(
        n_estimators=50,  # fewer trees
        max_depth=20,     # limit depth
        n_jobs=-1,         # use all CPU cores
        class_weight={0:1, 1:6}
    ))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77     12348
           1       0.54      0.62      0.58      6045

    accuracy                           0.71     18393
   macro avg       0.67      0.68      0.68     18393
weighted avg       0.72      0.71      0.71     18393



## Feature importance

In [21]:
ohe = model.named_steps["preprocess"].named_transformers_["cat"]
ohe_features = ohe.get_feature_names_out(categorical_features)

feature_names = (
    numeric_features +
    list(ohe_features)
)

In [22]:
importances = model.named_steps["rf"].feature_importances_

feature_importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

print(feature_importance_df)

       feature  importance
8       C_CONF    0.167499
0       C_YEAR    0.120577
7   C_HOUR_cos    0.087842
6   C_HOUR_sin    0.082241
13      C_TRAF    0.075278
3   C_MNTH_cos    0.065599
2   C_MNTH_sin    0.062343
4   C_WDAY_sin    0.057788
9       C_RCFG    0.053680
12      C_RALN    0.050354
1       C_VEHS    0.046231
11      C_RSUR    0.046022
10      C_WTHR    0.045296
5   C_WDAY_cos    0.039250


### Permutation importance

In [23]:
from sklearn.inspection import permutation_importance

r = permutation_importance(
    model, X, y,
    n_repeats=10,
    random_state=42
)

perm_importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": r.importances_mean # type: ignore
}).sort_values("importance", ascending=False)

print(perm_importance_df)

       feature  importance
2   C_MNTH_sin    0.133850
7   C_HOUR_cos    0.099369
13      C_TRAF    0.094707
1       C_VEHS    0.077506
3   C_MNTH_cos    0.067147
6   C_HOUR_sin    0.063209
0       C_YEAR    0.062485
12      C_RALN    0.057181
8       C_CONF    0.053401
10      C_WTHR    0.047000
9       C_RCFG    0.046329
5   C_WDAY_cos    0.045395
11      C_RSUR    0.040081
4   C_WDAY_sin    0.027813
