# Naive Bayes

In [9]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

from pathlib import Path
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import CategoricalNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, label_binarize

In [21]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Import dataset
dataset = Path('../dataset')
df = pd.read_csv(dataset/'accidents_clean_train.csv')
df.head()

Unnamed: 0,Area_accident_occured,Types_of_Junction,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Cause_of_accident,Day_of_week,Sex_of_driver,Age_band_of_driver,Accident_severity
0,Residential areas,No junction,Daylight,2,2,Moving Backward,Monday,Male,18-30,Slight Injury
1,Office areas,No junction,Daylight,2,2,Overtaking,Monday,Male,31-50,Slight Injury
2,Recreational areas,No junction,Daylight,2,2,Changing lane to the left,Monday,Male,18-30,Serious Injury
3,Office areas,Y Shape,Darkness - lights lit,2,2,Changing lane to the right,Sunday,Male,18-30,Slight Injury
4,Industrial areas,Y Shape,Darkness - lights lit,2,2,Overtaking,Sunday,Male,18-30,Slight Injury


In [11]:
df.shape

(8210, 10)

## Prepare data

In [12]:
# Create binary 
# Add mapped column where Accident Severity is binary (Slight Injury vs Serious/Fatal)
df['Accident_slight'] = df['Accident_severity'].map(
    {'Slight Injury': 0}
    ).fillna(1).astype(int)
df['Accident_serious'] = df['Accident_severity'].map(
    {'Serious Injury': 0}
    ).fillna(1).astype(int)
df['Accident_severity_mapped'] = df['Accident_severity'].map({
    'Serious Injury': 0, 
    'Slight Injury': 1, 
    'Fatal injury': 2
    })

In [13]:
# Features
X = df.drop(['Accident_severity', 'Accident_severity_mapped', 'Accident_slight', 'Accident_serious'], axis=1)
# Convert to dummy vars
X = pd.get_dummies(X)
X.head()

Unnamed: 0,Number_of_vehicles_involved,Number_of_casualties,Area_accident_occured_ Market areas,Area_accident_occured_ Recreational areas,Area_accident_occured_ Church areas,Area_accident_occured_ Hospital areas,Area_accident_occured_ Industrial areas,Area_accident_occured_ Outside rural areas,Area_accident_occured_Office areas,Area_accident_occured_Other,...,Day_of_week_Tuesday,Day_of_week_Wednesday,Sex_of_driver_Female,Sex_of_driver_Male,Sex_of_driver_Unknown,Age_band_of_driver_18-30,Age_band_of_driver_31-50,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown
0,2,2,False,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
1,2,2,False,False,False,False,False,False,True,False,...,False,False,False,True,False,False,True,False,False,False
2,2,2,False,True,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
3,2,2,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,False,False,False
4,2,2,False,False,False,False,True,False,False,False,...,False,False,False,True,False,True,False,False,False,False


In [14]:
# Create target variables
y = df['Accident_severity_mapped']
y_slight = df['Accident_slight']
y_serious = df['Accident_serious']

## Split Data

### Accident_severity: Slight Injury, Serious Injury, Fatal

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=42)

# Train model
nb_model = CategoricalNB()
nb_model.fit(X_train, y_train)

In [23]:
# Predict and view results
y_pred = nb_model.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy:  0.8583028826634186
Confusion Matrix:
 [[   3  325    2]
 [   0 2110    1]
 [   0   21    1]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.01      0.02       330
           1       0.86      1.00      0.92      2111
           2       0.25      0.05      0.08        22

    accuracy                           0.86      2463
   macro avg       0.70      0.35      0.34      2463
weighted avg       0.87      0.86      0.80      2463



### Accident_slight: Slight Injury (0), Serious/Fatal (1)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y_slight, 
                                                    test_size=0.3, 
                                                    random_state=42)

nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)


print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy:  0.8595209094600081
Confusion Matrix:
 [[2110    1]
 [ 345    7]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      1.00      0.92      2111
           1       0.88      0.02      0.04       352

    accuracy                           0.86      2463
   macro avg       0.87      0.51      0.48      2463
weighted avg       0.86      0.86      0.80      2463



#### Accident_serious: Slight/Fatal: 0, Serious: 1

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y_serious, 
                                                    test_size=0.3, 
                                                    random_state=42)

nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)


print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy:  0.8676410881039383
Confusion Matrix:
 [[   4  326]
 [   0 2133]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.01      0.02       330
           1       0.87      1.00      0.93      2133

    accuracy                           0.87      2463
   macro avg       0.93      0.51      0.48      2463
weighted avg       0.89      0.87      0.81      2463



## Resample target variable

In [26]:
from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(random_state=42)

### Resample Accident_severity

In [27]:
X_resampled, y_resampled = oversampler.fit_resample(X, y)

print(y_resampled.value_counts())

nb_resampled = CategoricalNB()
nb_resampled.fit(X_resampled, y_resampled)

y_pred = nb_resampled.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accident_severity_mapped
1    7082
0    7082
2    7082
Name: count, dtype: int64
Accuracy:  0.5075111652456354
Confusion Matrix:
 [[ 126  128   76]
 [ 585 1124  424]
 [   0    0    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.18      0.38      0.24       330
           1       0.90      0.53      0.66      2133
           2       0.00      0.00      0.00         0

    accuracy                           0.51      2463
   macro avg       0.36      0.30      0.30      2463
weighted avg       0.80      0.51      0.61      2463



### Resample y_slight

In [87]:
X_resampled, y_resampled = oversampler.fit_resample(X, y_slight)

# Check distribution
print(y_resampled.value_counts())

Accident_slight
0    7082
1    7082
Name: count, dtype: int64


In [28]:
nb_resampled = CategoricalNB()
nb_resampled.fit(X_resampled, y_resampled)

y_pred = nb_resampled.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy:  0.5075111652456354
Confusion Matrix:
 [[ 126  128   76]
 [ 585 1124  424]
 [   0    0    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.18      0.38      0.24       330
           1       0.90      0.53      0.66      2133
           2       0.00      0.00      0.00         0

    accuracy                           0.51      2463
   macro avg       0.36      0.30      0.30      2463
weighted avg       0.80      0.51      0.61      2463



### Resample y_serious

In [29]:
X_resampled, y_resampled = oversampler.fit_resample(X, y_serious)

nb_model = CategoricalNB()
nb_model.fit(X_resampled, y_resampled)

In [31]:
y_pred1 = nb_model.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred1))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred1))
print("Classification Report:\n", classification_report(y_test, y_pred1))

Accuracy:  0.6126674786845311
Confusion Matrix:
 [[ 182  148]
 [ 806 1327]]
Classification Report:
               precision    recall  f1-score   support

           0       0.18      0.55      0.28       330
           1       0.90      0.62      0.74      2133

    accuracy                           0.61      2463
   macro avg       0.54      0.59      0.51      2463
weighted avg       0.80      0.61      0.67      2463



## AUC-ROC Score

In [22]:
y_probs = nb_model.predict_proba(X_test)

In [23]:
# Calculate AUC-ROC score
auc_score = roc_auc_score(y_test, y_probs, multi_class='ovr')
# auc_score1 = roc_auc_score(y_test, y_probs, multi_class='ovo')
print(f"AUC-ROC Score: {auc_score:.4f}")
print(f"AUC-ROC1 Score: {auc_score1:.4f}")

ValueError: y should be a 1d array, got an array of shape (2463, 2) instead.

In [24]:
y_probs
y_test

3139    0
3409    0
1374    0
1545    0
8164    0
       ..
422     0
1220    0
3203    0
3826    0
7670    0
Name: Accident_severity_mapped, Length: 2463, dtype: int32

In [25]:
y_test.unique()

array([0, 1])

In [26]:
# Binarize the output
n_classes = y_probs.shape[1]
y_test_binarized = label_binarize(y_test, classes=np.arange(n_classes))

In [27]:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves
plt.figure(figsize=(10, 8))
colors = ['aqua', 'darkorange', 'cornflowerblue']
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f'ROC curve of class {i} (area = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc="lower right")
plt.show()

IndexError: index 1 is out of bounds for axis 1 with size 1

In [98]:
print("Class distribution in y_test:")
print(pd.Series(y_test).value_counts())


Class distribution in y_test:
Accident_severity
Slight Injury     2111
Serious Injury     330
Fatal injury        22
Name: count, dtype: int64
