In [1]:
import pandas as pd
df = pd.read_csv('../../data/collision_data_no_na.csv')

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

numeric_features = [
    "C_YEAR", 
    "C_VEHS",
    "C_MNTH_sin", "C_MNTH_cos",
    "C_WDAY_sin", "C_WDAY_cos",
    "C_HOUR_sin", "C_HOUR_cos"
]

categorical_features = [
    "C_CONF",
    "C_RCFG",
    "C_WTHR",
    "C_RSUR",
    "C_RALN",
    "C_TRAF"
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), numeric_features),  # scales to [0,1]
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("nb", MultinomialNB())
])

X = df.drop("C_SEV", axis=1)
y = df["C_SEV"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    380011
           1       0.06      0.00      0.00      6085

    accuracy                           0.98    386096
   macro avg       0.52      0.50      0.50    386096
weighted avg       0.97      0.98      0.98    386096



### Random undersampling of nonfatal collisions

In [9]:
num_positive = df[df["C_SEV"] == 1].shape[0]
df_undersampled_neg = df[df["C_SEV"] == 0].sample(n=num_positive*2, random_state=42)
df_balanced = pd.concat([df[df["C_SEV"] == 1], df_undersampled_neg])

In [10]:
X = df_balanced.drop("C_SEV", axis=1)
y = df_balanced["C_SEV"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      
    random_state=42,     
    shuffle=True         
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78     12348
           1       0.56      0.57      0.56      6045

    accuracy                           0.71     18393
   macro avg       0.67      0.67      0.67     18393
weighted avg       0.71      0.71      0.71     18393



### Try using sample weights combined with undersampling

In [12]:
from sklearn.utils.class_weight import compute_sample_weight

weights = compute_sample_weight(class_weight="balanced", y=y_train)

model.fit(X_train, y_train, nb__sample_weight=weights)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.67      0.73     12348
           1       0.51      0.70      0.59      6045

    accuracy                           0.68     18393
   macro avg       0.66      0.68      0.66     18393
weighted avg       0.72      0.68      0.69     18393

