In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../../data/collision_data_no_na.csv')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

numeric_features = [
    "C_YEAR", 
    "C_VEHS",
    "C_MNTH_sin", "C_MNTH_cos",
    "C_WDAY_sin", "C_WDAY_cos",
    "C_HOUR_sin", "C_HOUR_cos"
]

categorical_features = [
    "C_CONF",
    "C_RCFG",
    "C_WTHR",
    "C_RSUR",
    "C_RALN",
    "C_TRAF"
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), categorical_features),
    ]
)

model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(
        n_estimators=50,  # fewer trees
        max_depth=20,     # limit depth
        n_jobs=-1,         # use all CPU cores
    ))
])

X = df.drop("C_SEV", axis=1)
y = df["C_SEV"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      
    random_state=42,     
    shuffle=True         
)

df[categorical_features].nunique()

C_CONF    19
C_RCFG    11
C_WTHR     8
C_RSUR    10
C_RALN     7
C_TRAF    18
dtype: int64

In [4]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    380011
           1       0.93      0.02      0.03      6085

    accuracy                           0.98    386096
   macro avg       0.96      0.51      0.51    386096
weighted avg       0.98      0.98      0.98    386096



### Random undersampling of nonfatal collisions

In [5]:
num_positive = df[df["C_SEV"] == 1].shape[0]
df_undersampled_neg = df[df["C_SEV"] == 0].sample(n=num_positive*2, random_state=42)
df_balanced = pd.concat([df[df["C_SEV"] == 1], df_undersampled_neg])

X = df_balanced.drop("C_SEV", axis=1)
y = df_balanced["C_SEV"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      
    random_state=42,     
    shuffle=True         
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.88      0.82     12348
           1       0.65      0.47      0.55      6045

    accuracy                           0.74     18393
   macro avg       0.71      0.67      0.68     18393
weighted avg       0.73      0.74      0.73     18393



### Try using sample weights combined with undersampling

In [10]:
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(
        n_estimators=50,  # fewer trees
        max_depth=20,     # limit depth
        n_jobs=-1,         # use all CPU cores
        class_weight={0:1, 1:6}
    ))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.75      0.77     12348
           1       0.55      0.63      0.59      6045

    accuracy                           0.71     18393
   macro avg       0.68      0.69      0.68     18393
weighted avg       0.72      0.71      0.71     18393

