# Dataset Preparation

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# Import training dataset
dataset = Path('../dataset')
df = pd.read_csv(dataset/'/content/dataset/accidents_clean_train.csv')
df.head()

Unnamed: 0,Area_accident_occured,Types_of_Junction,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Cause_of_accident,Day_of_week,Sex_of_driver,Age_band_of_driver,Accident_severity
0,Residential areas,No junction,Daylight,2,2,Moving Backward,Monday,Male,18-30,Slight Injury
1,Office areas,No junction,Daylight,2,2,Overtaking,Monday,Male,31-50,Slight Injury
2,Recreational areas,No junction,Daylight,2,2,Changing lane to the left,Monday,Male,18-30,Serious Injury
3,Office areas,Y Shape,Darkness - lights lit,2,2,Changing lane to the right,Sunday,Male,18-30,Slight Injury
4,Industrial areas,Y Shape,Darkness - lights lit,2,2,Overtaking,Sunday,Male,18-30,Slight Injury


In [None]:
# Get feature columns
columns = df.columns.tolist()
print(columns)

features = ['Area_accident_occured', 'Types_of_Junction', 'Light_conditions', 'Number_of_vehicles_involved', 'Number_of_casualties', 'Cause_of_accident', 'Day_of_week', 'Sex_of_driver', 'Age_band_of_driver']

target = 'Accident_severity'

['Area_accident_occured', 'Types_of_Junction', 'Light_conditions', 'Number_of_vehicles_involved', 'Number_of_casualties', 'Cause_of_accident', 'Day_of_week', 'Sex_of_driver', 'Age_band_of_driver', 'Accident_severity']


## Convert features to categorical

In [None]:
# Convert to categorical
X = df[features]
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0,Number_of_vehicles_involved,Number_of_casualties,Area_accident_occured_ Recreational areas,Area_accident_occured_ Church areas,Area_accident_occured_ Hospital areas,Area_accident_occured_ Industrial areas,Area_accident_occured_ Outside rural areas,Area_accident_occured_Office areas,Area_accident_occured_Other,Area_accident_occured_Recreational areas,...,Day_of_week_Sunday,Day_of_week_Thursday,Day_of_week_Tuesday,Day_of_week_Wednesday,Sex_of_driver_Male,Sex_of_driver_Unknown,Age_band_of_driver_31-50,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown
0,2,2,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,2,2,False,False,False,False,False,True,False,False,...,False,False,False,False,True,False,True,False,False,False
2,2,2,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,2,2,False,False,False,False,False,True,False,False,...,True,False,False,False,True,False,False,False,False,False
4,2,2,False,False,False,True,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


## Create mapped columns of target variable

In [None]:
# Create mapped columns of target
df['Accident_slight'] = df[target].map(
    {'Slight Injury': 0}
    ).fillna(1).astype(int)
df['Accident_serious'] = df[target].map(
    {'Serious Injury': 0}
    ).fillna(1).astype(int)
df['Accident_severity_mapped'] = df[target].map({
    'Serious Injury': 0,
    'Slight Injury': 1,
    'Fatal injury': 2
    })

df.head()

Unnamed: 0,Area_accident_occured,Types_of_Junction,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Cause_of_accident,Day_of_week,Sex_of_driver,Age_band_of_driver,Accident_severity,Accident_slight,Accident_serious,Accident_severity_mapped
0,Residential areas,No junction,Daylight,2,2,Moving Backward,Monday,Male,18-30,Slight Injury,0,1,1
1,Office areas,No junction,Daylight,2,2,Overtaking,Monday,Male,31-50,Slight Injury,0,1,1
2,Recreational areas,No junction,Daylight,2,2,Changing lane to the left,Monday,Male,18-30,Serious Injury,1,0,0
3,Office areas,Y Shape,Darkness - lights lit,2,2,Changing lane to the right,Sunday,Male,18-30,Slight Injury,0,1,1
4,Industrial areas,Y Shape,Darkness - lights lit,2,2,Overtaking,Sunday,Male,18-30,Slight Injury,0,1,1


In [None]:
df.shape

(8210, 13)

## Create y_test variables for ML training

In [None]:
y = df[target]
y_mapped = df['Accident_severity_mapped']
y_slight = df['Accident_slight']
y_serious = df['Accident_serious']

## Resampling

In [None]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
oversample = RandomOverSampler(random_state=42)

X_resampled, y_resampled = oversample.fit_resample(X, y_mapped)

In [None]:
# Function to print evaluation metrics
def print_evaluation_metrics(test, pred):
    print("Accuracy: ", accuracy_score(test, pred))
    print("Confusion Matrix:\n", confusion_matrix(test, pred))
    print("Classification Report:\n", classification_report(test, pred))

def print_cross_val_scores(scores):
    print("Cross-validation scores: ", scores)
    print("Average score: ", scores.mean())

## Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)

In [None]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_mapped, test_size=0.3, random_state=42)

#Train model
log_reg_mapped = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
log_reg_mapped.fit(X_train, y_train)

#Create prediction
y_pred_mapped = log_reg_mapped.predict(X_test)

print("Logistic Regression (Slight Injury vs. Serious Injury vs. Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_mapped)
score_mapped = cross_val_score(log_reg_mapped, X_test, y_test, cv=10)
print_cross_val_scores(score_mapped)

Logistic Regression (Slight Injury vs. Serious Injury vs. Fatal Injury):
Accuracy:  0.8574908647990256
Confusion Matrix:
 [[   1  329    0]
 [   0 2111    0]
 [   0   22    0]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.01       330
           1       0.86      1.00      0.92      2111
           2       0.00      0.00      0.00        22

    accuracy                           0.86      2463
   macro avg       0.62      0.33      0.31      2463
weighted avg       0.87      0.86      0.79      2463

Cross-validation scores:  [0.85425101 0.8582996  0.85425101 0.85365854 0.85365854 0.8495935
 0.85365854 0.86178862 0.85772358 0.85772358]
Average score:  0.8554606497481979


##Slight Injury (0) vs. Serious Injury/Fatal Injury (1)

In [None]:
# Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_slight, test_size=0.3, random_state=42)


# Train model
log_reg_slight = LogisticRegression(max_iter=1000)
log_reg_slight.fit(X_train, y_train)

# Create prediction
y_pred_slight = log_reg_slight.predict(X_test)

print("Logistic Regression (Slight Injury vs. Serious/Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_slight)
score_slight = cross_val_score(log_reg_slight, X_test, y_test, cv=10)
print_cross_val_scores(score_slight)

Logistic Regression (Slight Injury vs. Serious/Fatal Injury):
Accuracy:  0.8574908647990256
Confusion Matrix:
 [[2111    0]
 [ 351    1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      1.00      0.92      2111
           1       1.00      0.00      0.01       352

    accuracy                           0.86      2463
   macro avg       0.93      0.50      0.46      2463
weighted avg       0.88      0.86      0.79      2463

Cross-validation scores:  [0.85425101 0.8582996  0.8582996  0.85365854 0.85365854 0.85365854
 0.85772358 0.86178862 0.85772358 0.85772358]
Average score:  0.8566785161778743


##Slight/Fatal Injury (1) vs. Serious Injury (0)

In [None]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_serious, test_size=0.3, random_state=42)

#Train model
log_reg_serious = LogisticRegression(max_iter=1000)
log_reg_serious.fit(X_train, y_train)

#Create prediction
y_pred_serious = log_reg_serious.predict(X_test)
print("Logistic Regression (Slight/Fatal Injury vs. Serious Injury):")
print_evaluation_metrics(y_test, y_pred_serious)

score_serious = cross_val_score(log_reg_serious, X_test, y_test, cv=10)
print_cross_val_scores(score_serious)

Logistic Regression (Slight/Fatal Injury vs. Serious Injury):
Accuracy:  0.8660170523751523
Confusion Matrix:
 [[   0  330]
 [   0 2133]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       330
           1       0.87      1.00      0.93      2133

    accuracy                           0.87      2463
   macro avg       0.43      0.50      0.46      2463
weighted avg       0.75      0.87      0.80      2463

Cross-validation scores:  [0.86234818 0.86639676 0.86639676 0.86178862 0.86178862 0.85772358
 0.86585366 0.86585366 0.86585366 0.86585366]
Average score:  0.8639857147559329


# Resample Target Data

## Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)

In [None]:
# Apply RandomOverSampler
oversample = RandomOverSampler(random_state=42)

X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_mapped)

# Check distribution
print(y_resampled.value_counts())

Accident_severity_mapped
1    7082
0    7082
2    7082
Name: count, dtype: int64


In [None]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

#Train model
log_reg_mapped = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
log_reg_mapped.fit(X_train, y_train)

#Create prediction
y_pred_mapped = log_reg_mapped.predict(X_test)


print("Logistic Regression with Oversampling (Slight Injury vs. Serious Injury vs. Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_mapped)
score_mapped = cross_val_score(log_reg_mapped, X_test, y_test, cv=10)
print_cross_val_scores(score_mapped)

Logistic Regression with Oversampling (Slight Injury vs. Serious Injury vs. Fatal Injury):
Accuracy:  0.5522434891747725
Confusion Matrix:
 [[ 851  797  519]
 [ 631 1047  438]
 [ 291  178 1622]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.39      0.43      2167
           1       0.52      0.49      0.51      2116
           2       0.63      0.78      0.69      2091

    accuracy                           0.55      6374
   macro avg       0.54      0.55      0.54      6374
weighted avg       0.54      0.55      0.54      6374

Cross-validation scores:  [0.53605016 0.56583072 0.52351097 0.55329154 0.54631083 0.54160126
 0.55259027 0.53689168 0.52590267 0.50078493]
Average score:  0.5382765018232998


##Slight Injury (0) vs. Serious Injury/Fatal Injury (1)

In [None]:
X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_slight)

#Check distribution
print(y_resampled.value_counts())


Accident_slight
0    7082
1    7082
Name: count, dtype: int64


In [None]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

#Train model
log_reg_slight = LogisticRegression(max_iter=1000)
log_reg_slight.fit(X_train, y_train)

#Create predictions
y_pred_slight = log_reg_slight.predict(X_test)


print("Logistic Regression with Oversampling (Slight Injury vs. Serious/Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_slight)
score_slight = cross_val_score(log_reg_slight, X_test, y_test, cv=10)
print_cross_val_scores(score_slight)


Logistic Regression with Oversampling (Slight Injury vs. Serious/Fatal Injury):
Accuracy:  0.5851764705882353
Confusion Matrix:
 [[1216  915]
 [ 848 1271]]
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.57      0.58      2131
           1       0.58      0.60      0.59      2119

    accuracy                           0.59      4250
   macro avg       0.59      0.59      0.59      4250
weighted avg       0.59      0.59      0.59      4250

Cross-validation scores:  [0.57411765 0.58588235 0.53882353 0.58117647 0.60235294 0.51529412
 0.63764706 0.6        0.54588235 0.59529412]
Average score:  0.5776470588235295


##Slight/Fatal Injury (1) vs. Serious Injury (0)

In [None]:
X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_serious)

#Check distribution
print(y_resampled.value_counts())


Accident_serious
1    7164
0    7164
Name: count, dtype: int64


In [None]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

#Train model
log_reg_serious = LogisticRegression(max_iter=1000)
log_reg_serious.fit(X_train, y_train)

#Create prediction
y_pred_serious = log_reg_serious.predict(X_test)


print("Logistic Regression with Oversampling (Slight/Fatal Injury vs. Serious Injury):")
print_evaluation_metrics(y_test, y_pred_serious)
score_serious = cross_val_score(log_reg_serious, X_test, y_test, cv=10)
print_cross_val_scores(score_serious)


Logistic Regression with Oversampling (Slight/Fatal Injury vs. Serious Injury):
Accuracy:  0.5787392416841126
Confusion Matrix:
 [[1266  899]
 [ 912 1222]]
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.58      0.58      2165
           1       0.58      0.57      0.57      2134

    accuracy                           0.58      4299
   macro avg       0.58      0.58      0.58      4299
weighted avg       0.58      0.58      0.58      4299

Cross-validation scores:  [0.55116279 0.6        0.59767442 0.59069767 0.55581395 0.55348837
 0.5744186  0.59069767 0.58372093 0.56410256]
Average score:  0.5761776982707215


### **Logistic Regression Summary**
**Target: Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)**

**Without Oversampling:**

Accuracy: 86%


Precision Serious Injury: 86%


Recall: 100%


**With Oversampling:**


Accuracy: 55%

Precision Serious Injury: 52%

Recall: 49%



---


**Target: Slight Injury (0) vs. Serious Injury/Fatal Injury (1)**

**Without Oversampling:**

Accuracy: 86%

Precision Serious Injury: 100%

Recall: 0%

**With Oversampling:**

Accuracy: 59%

Precision Serious Injury: 58%

Recall: 60%


---


**Target: Slight/Fatal Injury (1) vs. Serious Injury (0)**

**Without Oversampling:**

Accuracy: 87%

Precision Serious Injury: 87%

Recall: 100%

**With Oversampling:**

Accuracy: 58%

Precision Serious Injury: 58%

Recall: 57%