Classifies collisions as fatal (1) or nonfatal (2)

In [21]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [22]:
df = pd.read_csv('../../data/collision_data.csv')
df.head()

Unnamed: 0,C_YEAR,C_MNTH,C_WDAY,C_HOUR,C_SEV,C_VEHS,C_CONF,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,C_CASE
0,2005,1.0,1.0,11.0,2,1.0,4.0,2.0,4.0,5.0,3.0,,915642
1,2005,1.0,1.0,15.0,2,2.0,2.0,3.0,1.0,3.0,1.0,18.0,915794
2,2005,1.0,1.0,13.0,2,2.0,35.0,2.0,1.0,1.0,1.0,3.0,915805
3,2005,1.0,1.0,13.0,2,1.0,4.0,,4.0,4.0,3.0,,915877
4,2005,1.0,1.0,20.0,2,1.0,2.0,,1.0,3.0,3.0,,915919


In [23]:
df.columns

Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_SEV', 'C_VEHS', 'C_CONF',
       'C_RCFG', 'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'C_CASE'],
      dtype='object')

In [24]:
# transform severity into binary classification: 1 (fatal), 0 (non-fatal)
df["C_SEV"] = df["C_SEV"].map({1: 1, 2: 0})

# cyclical encoding for month/day/hour
df["C_MNTH_sin"] = np.sin(2*np.pi * df["C_MNTH"] / 12)
df["C_MNTH_cos"] = np.cos(2*np.pi * df["C_MNTH"] / 12)

df["C_WDAY_sin"] = np.sin(2*np.pi * df["C_WDAY"] / 7)
df["C_WDAY_cos"] = np.cos(2*np.pi * df["C_WDAY"] / 7)

df["C_HOUR_sin"] = np.sin(2*np.pi * df["C_HOUR"] / 24)
df["C_HOUR_cos"] = np.cos(2*np.pi * df["C_HOUR"] / 24)

# drop original month/day/hour columns
df = df.drop(columns=["C_MNTH", "C_WDAY", "C_HOUR"])

df = df.drop(columns=["C_CASE"])

In [25]:
numeric_features = [
    "C_YEAR", 
    "C_VEHS",
    "C_MNTH_sin", "C_MNTH_cos",
    "C_WDAY_sin", "C_WDAY_cos",
    "C_HOUR_sin", "C_HOUR_cos"
]

categorical_features = [
    "C_CONF",
    "C_RCFG",
    "C_WTHR",
    "C_RSUR",
    "C_RALN",
    "C_TRAF"
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [26]:
X = df.drop("C_SEV", axis=1)
y = df["C_SEV"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      
    random_state=42,     
    shuffle=True         
)

### Try default XGBoost classifier

In [27]:
model = XGBClassifier()
model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [28]:
y_pred = model.predict(X_test)

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    384006
           1       0.53      0.00      0.00      6142

    accuracy                           0.98    390148
   macro avg       0.76      0.50      0.50    390148
weighted avg       0.98      0.98      0.98    390148



### Use scale_pos_weight to tell the model to give more weight to the minority class

In [30]:
neg = sum(y_train == 0)
pos = sum(y_train == 1)
scale_pos_weight = neg / pos

model = XGBClassifier(
    max_depth=6,
    n_estimators=500,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,  # balance fatal class
    eval_metric="auc",
    use_label_encoder=False,
    random_state=42
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [31]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.74      0.84    384006
           1       0.04      0.68      0.07      6142

    accuracy                           0.73    390148
   macro avg       0.52      0.71      0.46    390148
weighted avg       0.98      0.73      0.83    390148



### Combine with random undersampling of non-fatal collisions (1:4 ratio)

In [32]:
num_positive = df[df["C_SEV"] == 1].shape[0]
df_undersampled_neg = df[df["C_SEV"] == 0].sample(n=num_positive*2, random_state=42)
df_balanced = pd.concat([df[df["C_SEV"] == 1], df_undersampled_neg])

In [33]:
X = df_balanced.drop("C_SEV", axis=1)
y = df_balanced["C_SEV"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      
    random_state=42,     
    shuffle=True         
)

In [34]:
neg = sum(y_train == 0)
pos = sum(y_train == 1)
scale_pos_weight = neg / pos

model = XGBClassifier(
    max_depth=6,
    n_estimators=500,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,  # balance fatal class
    eval_metric="auc",
    use_label_encoder=False,
    random_state=42
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [35]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.71      0.77     12476
           1       0.55      0.71      0.62      6164

    accuracy                           0.71     18640
   macro avg       0.69      0.71      0.69     18640
weighted avg       0.74      0.71      0.72     18640



## Feature importance

In [36]:
from sklearn.inspection import permutation_importance
feature_names = (
    numeric_features +
    categorical_features
)

r = permutation_importance(
    model, X, y,
    n_repeats=10,
    random_state=42
)

perm_importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": r.importances_mean # type: ignore
}).sort_values("importance", ascending=False)

print(perm_importance_df)

       feature  importance
2   C_MNTH_sin    0.108522
7   C_HOUR_cos    0.037152
1       C_VEHS    0.036572
13      C_TRAF    0.034182
5   C_WDAY_cos    0.027919
6   C_HOUR_sin    0.022933
12      C_RALN    0.022020
0       C_YEAR    0.019994
3   C_MNTH_cos    0.014940
9       C_RCFG    0.013064
4   C_WDAY_sin    0.011861
8       C_CONF    0.011808
10      C_WTHR    0.010178
11      C_RSUR    0.004988
