In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LogisticRegression, PoissonRegressor
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv("../data/fleet_incidents_synthetic.csv")
df.head()

Unnamed: 0,vehicle_id,age_years,annual_mileage,usage_type,region,incident_rate_theoretical,n_incidents,total_repair_cost
0,1,1,15882,delivery,suburban,0.301764,0,0.0
1,2,11,6863,sales,suburban,0.433726,0,0.0
2,3,9,39502,sales,urban,0.499004,0,0.0
3,4,6,42565,delivery,urban,0.49513,1,539.82855
4,5,6,47493,delivery,urban,0.504986,0,0.0


In [2]:
df["any_incident"] = (df["n_incidents"] > 0).astype(int)

In [3]:
X = df[["age_years", "annual_mileage", "usage_type", "region"]]
y_logit = df["any_incident"]
y_pois = df["n_incidents"]

cat_features = ["usage_type", "region"]
num_features = ["age_years", "annual_mileage"]

In [4]:
logit_model = Pipeline(steps=[
    ("preprocess", ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(drop="first"), cat_features),
            ("num", "passthrough", num_features)
        ]
    )),
    ("model", LogisticRegression(max_iter=500))
])

X_train, X_test, y_train, y_test = train_test_split(X, y_logit, test_size=0.2, random_state=42)

logit_model.fit(X_train, y_train)

y_pred = logit_model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.76       618
           1       0.71      0.01      0.03       382

    accuracy                           0.62      1000
   macro avg       0.67      0.50      0.40      1000
weighted avg       0.66      0.62      0.48      1000

[[616   2]
 [377   5]]
