In [1]:
import os
import pickle
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Put the parent of the current working directory at the front of the module
# search path so that the `config` module can be imported from there.
root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, root_dir)

# NOTE(review): the wildcard import hides which names `config` provides;
# consider importing the required names explicitly once they are confirmed.
from config import *

# Show all columns
pd.options.display.max_columns = None

In [2]:
# Read the two prepared CSV files (in this order) into DataFrames. Both path
# names are not defined in this notebook; presumably they come from the
# wildcard `config` import.
balanced_df, x_selected_df = map(
    pd.read_csv,
    (BALANCED_DATA_PATH, X_SELECTED_TREE_PATH),
)

In [3]:
# Quick sanity check: (rows, columns) of each loaded frame, one per line.
print(
    f"Balanced data shape: {balanced_df.shape}",
    f"X selected data shape: {x_selected_df.shape}",
    sep="\n",
)

Balanced data shape: (31245, 152)
X selected data shape: (31245, 21)


In [4]:
# Separate the `accident_severity` target from the remaining feature columns.
y = x_selected_df['accident_severity']
X = x_selected_df.drop(columns='accident_severity')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the input looks like it was class-balanced before this split —
# if that involved resampling, confirm no resampled rows leak into the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Baseline multi-class XGBoost classifier: only the objective, evaluation
# metric, class count and seed are set; no other hyperparameters are passed.
n_classes = len(y.unique())  # number of distinct labels in the target
model = xgb.XGBClassifier(
    random_state=42,
    eval_metric='mlogloss',
    num_class=n_classes,
    objective='multi:softprob',
)

In [6]:
# Train the baseline classifier on the training split.
model.fit(X=X_train, y=y_train)

In [7]:
# Predict labels for the held-out test features with the baseline model.
y_pred = model.predict(X=X_test)

In [8]:
# Score the baseline predictions against the held-out test labels.
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))
# Each heading is followed by its table on the next line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
# Weighted F1: per-class F1 averaged with class support as the weight.
print(
    "F1 Score: {:.2f}%".format(
        f1_score(y_test, y_pred, average='weighted') * 100
    )
)

Accuracy: 73.36%
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      2064
           1       0.64      0.61      0.63      2122
           2       0.78      0.84      0.81      2063

    accuracy                           0.73      6249
   macro avg       0.73      0.73      0.73      6249
weighted avg       0.73      0.73      0.73      6249

Confusion Matrix:
[[1548  415  101]
 [ 444 1304  374]
 [  25  306 1732]]
F1 Score: 73.19%


In [9]:
# Base estimator for the hyperparameter search. It is configured like the
# baseline model; the searched hyperparameters are supplied by the search itself.
tune_kwargs = {
    'objective': 'multi:softprob',
    'num_class': len(y.unique()),
    'eval_metric': 'mlogloss',
    'random_state': 42,
}
model_tune = xgb.XGBClassifier(**tune_kwargs)

In [10]:
# Candidate values for each hyperparameter explored by the search.
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    subsample=[0.5, 0.7, 1.0],
    colsample_bytree=[0.5, 0.7, 1.0],
    n_estimators=[100, 200, 300],
)

In [11]:
# Randomized hyperparameter search over `param_grid`, scored by weighted F1
# with 10-fold cross-validation on all available cores.
# `random_state` pins which candidate combinations are sampled; without it
# every run draws a different set, so the reported best parameters and score
# could not be reproduced. `n_iter=10` spells out the library default
# (10 candidates x 10 folds = 100 fits).
random_search = RandomizedSearchCV(
    model_tune,
    param_grid,
    n_iter=10,
    scoring='f1_weighted',
    n_jobs=-1,
    cv=10,
    verbose=1,
    random_state=42,
)

In [12]:
# Run the cross-validated search on the training split only; the test split
# stays untouched for the final evaluation.
random_search.fit(X=X_train, y=y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [13]:
# Report the winning hyperparameter combination and its mean cross-validated
# score (the search's `scoring` metric).
best_params = random_search.best_params_
best_cv_score = random_search.best_score_
print("Best parameters found: ", best_params)
print("Best score found: ", best_cv_score)

Best parameters found:  {'subsample': 0.5, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.2, 'colsample_bytree': 0.5}
Best score found:  0.7378764010943475


In [14]:
# Evaluate the best estimator found by the search on the held-out test split.
best_model = random_search.best_estimator_
# NOTE: this rebinds `y_pred`, replacing the baseline model's predictions.
y_pred = best_model.predict(X_test)

# Per-class report first, then overall accuracy and weighted F1 as raw fractions.
print(
    classification_report(y_test, y_pred),
    f'accuracy: {accuracy_score(y_test, y_pred)}',
    f'f1_score: {f1_score(y_test, y_pred, average="weighted")}',
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.76      0.75      0.76      2064
           1       0.64      0.62      0.63      2122
           2       0.79      0.84      0.81      2063

    accuracy                           0.73      6249
   macro avg       0.73      0.73      0.73      6249
weighted avg       0.73      0.73      0.73      6249

accuracy: 0.7327572411585854
f1_score: 0.7312481207616981


In [15]:
# Export the tuned model.
# Create the target directory first so the export does not fail with
# FileNotFoundError on a fresh checkout where the directory is absent.
os.makedirs(MODELS_PATH, exist_ok=True)
model_path = os.path.join(MODELS_PATH, 'xgb_model_1.pkl')

# NOTE(review): a pickle is tied to the library versions that produced it;
# confirm the consumer loads it with matching xgboost/scikit-learn versions
# (or consider the model's native save format), and only unpickle trusted files.
with open(model_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)