# Dataset Preparation

In [60]:
import warnings
warnings.filterwarnings('ignore')

In [61]:
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [62]:
# Import training dataset
# NOTE(review): the right-hand operand below is an absolute path, and pathlib's
# `/` operator discards the left operand when the right one is absolute, so
# `dataset` has no effect on the file that is actually opened. Confirm which
# location is intended and make the join relative to it.
dataset = Path('../dataset')
df = pd.read_csv(dataset/'/content/dataset/accidents_clean_train.csv')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Area_accident_occured,Types_of_Junction,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Cause_of_accident,Day_of_week,Sex_of_driver,Age_band_of_driver,Accident_severity
0,Residential areas,No junction,Daylight,2,2,Moving Backward,Monday,Male,18-30,Slight Injury
1,Office areas,No junction,Daylight,2,2,Overtaking,Monday,Male,31-50,Slight Injury
2,Recreational areas,No junction,Daylight,2,2,Changing lane to the left,Monday,Male,18-30,Serious Injury
3,Office areas,Y Shape,Darkness - lights lit,2,2,Changing lane to the right,Sunday,Male,18-30,Slight Injury
4,Industrial areas,Y Shape,Darkness - lights lit,2,2,Overtaking,Sunday,Male,18-30,Slight Injury


In [63]:
# Column inventory of the training frame, printed as a sanity check
columns = list(df.columns)
print(columns)

# Predictor columns fed to the models
features = [
    'Area_accident_occured',
    'Types_of_Junction',
    'Light_conditions',
    'Number_of_vehicles_involved',
    'Number_of_casualties',
    'Cause_of_accident',
    'Day_of_week',
    'Sex_of_driver',
    'Age_band_of_driver',
]

# Label column
target = 'Accident_severity'

['Area_accident_occured', 'Types_of_Junction', 'Light_conditions', 'Number_of_vehicles_involved', 'Number_of_casualties', 'Cause_of_accident', 'Day_of_week', 'Sex_of_driver', 'Age_band_of_driver', 'Accident_severity']


## One-hot encode categorical features

In [64]:
# One-hot encode the predictor columns; drop_first=True removes one dummy per
# encoded column.
# NOTE(review): the rendered header of this cell lists both
# 'Area_accident_occured_ Recreational areas' and
# 'Area_accident_occured_Recreational areas', i.e. labels that differ only by
# leading whitespace end up as separate dummies - consider stripping the
# labels before encoding.
X = pd.get_dummies(df[features], drop_first=True)
X.head()

Unnamed: 0,Number_of_vehicles_involved,Number_of_casualties,Area_accident_occured_ Recreational areas,Area_accident_occured_ Church areas,Area_accident_occured_ Hospital areas,Area_accident_occured_ Industrial areas,Area_accident_occured_ Outside rural areas,Area_accident_occured_Office areas,Area_accident_occured_Other,Area_accident_occured_Recreational areas,...,Day_of_week_Sunday,Day_of_week_Thursday,Day_of_week_Tuesday,Day_of_week_Wednesday,Sex_of_driver_Male,Sex_of_driver_Unknown,Age_band_of_driver_31-50,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown
0,2,2,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,2,2,False,False,False,False,False,True,False,False,...,False,False,False,False,True,False,True,False,False,False
2,2,2,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,2,2,False,False,False,False,False,True,False,False,...,True,False,False,False,True,False,False,False,False,False
4,2,2,False,False,False,True,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


## Create mapped columns of target variable

In [65]:
# Derive three integer encodings of the severity label.

# Binary: 0 for 'Slight Injury', 1 for every other value
df['Accident_slight'] = df[target].ne('Slight Injury').astype(int)

# Binary: 0 for 'Serious Injury', 1 for every other value
df['Accident_serious'] = df[target].ne('Serious Injury').astype(int)

# Three-class codes; labels absent from the mapping become NaN
df['Accident_severity_mapped'] = df[target].map(
    {'Serious Injury': 0, 'Slight Injury': 1, 'Fatal injury': 2}
)

df.head()

Unnamed: 0,Area_accident_occured,Types_of_Junction,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Cause_of_accident,Day_of_week,Sex_of_driver,Age_band_of_driver,Accident_severity,Accident_slight,Accident_serious,Accident_severity_mapped
0,Residential areas,No junction,Daylight,2,2,Moving Backward,Monday,Male,18-30,Slight Injury,0,1,1
1,Office areas,No junction,Daylight,2,2,Overtaking,Monday,Male,31-50,Slight Injury,0,1,1
2,Recreational areas,No junction,Daylight,2,2,Changing lane to the left,Monday,Male,18-30,Serious Injury,1,0,0
3,Office areas,Y Shape,Darkness - lights lit,2,2,Changing lane to the right,Sunday,Male,18-30,Slight Injury,0,1,1
4,Industrial areas,Y Shape,Darkness - lights lit,2,2,Overtaking,Sunday,Male,18-30,Slight Injury,0,1,1


In [66]:
# Shape check: (rows, columns) after the three derived target columns were added.
df.shape

(8210, 13)

## Create target (y) variables for ML training

In [67]:
# Target vectors, one per formulation of the problem:
#   y         - original string labels
#   y_mapped  - three-class integer codes
#   y_slight  - binary, 0 = slight injury
#   y_serious - binary, 0 = serious injury
y, y_mapped, y_slight, y_serious = (
    df[column]
    for column in (target, 'Accident_severity_mapped', 'Accident_slight', 'Accident_serious')
)

## Resampling

In [68]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [69]:
# Balance the three-class target by randomly duplicating minority-class rows.
# The standardized matrix is resampled here so this cell agrees with every
# later resampling cell, all of which resample X_scaled.
oversample = RandomOverSampler(random_state=42)

X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_mapped)

In [70]:
# Function to print evaluation metrics
def print_evaluation_metrics(test, pred):
    """Print accuracy, confusion matrix and classification report.

    Args:
        test: true labels.
        pred: predicted labels, aligned with ``test``.
    """
    reports = (
        ("Accuracy: ", accuracy_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    )
    # Each metric is computed and printed in turn, in the order listed.
    for heading, metric in reports:
        print(heading, metric(test, pred))

def print_cross_val_scores(scores):
    """Print the per-fold scores followed by their mean.

    Args:
        scores: array-like exposing ``.mean()`` (for example the array
            returned by ``cross_val_score``).
    """
    print("Cross-validation scores: ", scores)
    average = scores.mean()
    print("Average score: ", average)

# **Logistic Regression**

## Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)

In [71]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_mapped, test_size=0.3, random_state=42)

#Train model
log_reg_mapped = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
log_reg_mapped.fit(X_train, y_train)

#Create prediction
y_pred_mapped = log_reg_mapped.predict(X_test)

print("Logistic Regression (Slight Injury vs. Serious Injury vs. Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_mapped)
score_mapped = cross_val_score(log_reg_mapped, X_test, y_test, cv=5)
print_cross_val_scores(score_mapped)

Logistic Regression (Slight Injury vs. Serious Injury vs. Fatal Injury):
Accuracy:  0.8574908647990256
Confusion Matrix:
 [[   1  329    0]
 [   0 2111    0]
 [   0   22    0]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.01       330
           1       0.86      1.00      0.92      2111
           2       0.00      0.00      0.00        22

    accuracy                           0.86      2463
   macro avg       0.62      0.33      0.31      2463
weighted avg       0.87      0.86      0.79      2463

Cross-validation scores:  [0.85598377 0.85192698 0.85395538 0.85772358 0.85772358]
Average score:  0.8554626560464387


## Slight Injury (0) vs. Serious Injury/Fatal Injury (1)

In [72]:
# Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_slight, test_size=0.3, random_state=42)


# Train model
log_reg_slight = LogisticRegression(max_iter=1000)
log_reg_slight.fit(X_train, y_train)

# Create prediction
y_pred_slight = log_reg_slight.predict(X_test)

print("Logistic Regression (Slight Injury vs. Serious/Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_slight)
score_slight = cross_val_score(log_reg_slight, X_test, y_test, cv=5)
print_cross_val_scores(score_slight)

Logistic Regression (Slight Injury vs. Serious/Fatal Injury):
Accuracy:  0.8574908647990256
Confusion Matrix:
 [[2111    0]
 [ 351    1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      1.00      0.92      2111
           1       1.00      0.00      0.01       352

    accuracy                           0.86      2463
   macro avg       0.93      0.50      0.46      2463
weighted avg       0.88      0.86      0.79      2463

Cross-validation scores:  [0.85598377 0.85598377 0.85395538 0.8597561  0.85772358]
Average score:  0.8566805191378485


## Slight/Fatal Injury (1) vs. Serious Injury (0)

In [73]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_serious, test_size=0.3, random_state=42)

#Train model
log_reg_serious = LogisticRegression(max_iter=1000)
log_reg_serious.fit(X_train, y_train)

#Create prediction
y_pred_serious = log_reg_serious.predict(X_test)
print("Logistic Regression (Slight/Fatal Injury vs. Serious Injury):")
print_evaluation_metrics(y_test, y_pred_serious)

score_serious = cross_val_score(log_reg_serious, X_test, y_test, cv=5)
print_cross_val_scores(score_serious)

Logistic Regression (Slight/Fatal Injury vs. Serious Injury):
Accuracy:  0.8660170523751523
Confusion Matrix:
 [[   0  330]
 [   0 2133]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       330
           1       0.87      1.00      0.93      2133

    accuracy                           0.87      2463
   macro avg       0.43      0.50      0.46      2463
weighted avg       0.75      0.87      0.80      2463

Cross-validation scores:  [0.86409736 0.86409736 0.86409736 0.86788618 0.86585366]
Average score:  0.8652063853295735


# Resample Target Data

## Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)

In [74]:
# Apply RandomOverSampler
oversample = RandomOverSampler(random_state=42)

X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_mapped)

# Check distribution
print(y_resampled.value_counts())

Accident_severity_mapped
1    7082
0    7082
2    7082
Name: count, dtype: int64


In [75]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

#Train model
log_reg_mapped = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
log_reg_mapped.fit(X_train, y_train)

#Create prediction
y_pred_mapped = log_reg_mapped.predict(X_test)


print("Logistic Regression with Oversampling (Slight Injury vs. Serious Injury vs. Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_mapped)
score_mapped = cross_val_score(log_reg_mapped, X_test, y_test, cv=5)
print_cross_val_scores(score_mapped)

Logistic Regression with Oversampling (Slight Injury vs. Serious Injury vs. Fatal Injury):
Accuracy:  0.5522434891747725
Confusion Matrix:
 [[ 851  797  519]
 [ 631 1047  438]
 [ 291  178 1622]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.39      0.43      2167
           1       0.52      0.49      0.51      2116
           2       0.63      0.78      0.69      2091

    accuracy                           0.55      6374
   macro avg       0.54      0.55      0.54      6374
weighted avg       0.54      0.55      0.54      6374

Cross-validation scores:  [0.54588235 0.52862745 0.54823529 0.53882353 0.51020408]
Average score:  0.5343545418167268


## Slight Injury (0) vs. Serious Injury/Fatal Injury (1)

In [76]:
# Rebalance the slight-vs-rest target, reusing the `oversample` sampler
X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_slight)

# Check distribution: print the per-class row counts after resampling
print(y_resampled.value_counts())


Accident_slight
0    7082
1    7082
Name: count, dtype: int64


In [77]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

#Train model
log_reg_slight = LogisticRegression(max_iter=1000)
log_reg_slight.fit(X_train, y_train)

#Create predictions
y_pred_slight = log_reg_slight.predict(X_test)


print("Logistic Regression with Oversampling (Slight Injury vs. Serious/Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_slight)
score_slight = cross_val_score(log_reg_slight, X_test, y_test, cv=5)
print_cross_val_scores(score_slight)


Logistic Regression with Oversampling (Slight Injury vs. Serious/Fatal Injury):
Accuracy:  0.5851764705882353
Confusion Matrix:
 [[1216  915]
 [ 848 1271]]
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.57      0.58      2131
           1       0.58      0.60      0.59      2119

    accuracy                           0.59      4250
   macro avg       0.59      0.59      0.59      4250
weighted avg       0.59      0.59      0.59      4250

Cross-validation scores:  [0.57411765 0.55647059 0.55882353 0.61176471 0.55764706]
Average score:  0.571764705882353


## Slight/Fatal Injury (1) vs. Serious Injury (0)

In [78]:
# Rebalance the serious-vs-rest target, reusing the `oversample` sampler
X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_serious)

# Check distribution: print the per-class row counts after resampling
print(y_resampled.value_counts())


Accident_serious
1    7164
0    7164
Name: count, dtype: int64


In [79]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

#Train model
log_reg_serious = LogisticRegression(max_iter=1000)
log_reg_serious.fit(X_train, y_train)

#Create prediction
y_pred_serious = log_reg_serious.predict(X_test)


print("Logistic Regression with Oversampling (Slight/Fatal Injury vs. Serious Injury):")
print_evaluation_metrics(y_test, y_pred_serious)
score_serious = cross_val_score(log_reg_serious, X_test, y_test, cv=5)
print_cross_val_scores(score_serious)


Logistic Regression with Oversampling (Slight/Fatal Injury vs. Serious Injury):
Accuracy:  0.5787392416841126
Confusion Matrix:
 [[1266  899]
 [ 912 1222]]
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.58      0.58      2165
           1       0.58      0.57      0.57      2134

    accuracy                           0.58      4299
   macro avg       0.58      0.58      0.58      4299
weighted avg       0.58      0.58      0.58      4299

Cross-validation scores:  [0.57674419 0.58255814 0.54069767 0.57906977 0.57275902]
Average score:  0.5703657579121206


### **Logistic Regression Summary**
**Target: Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)**

**Without Oversampling:**

Accuracy: 85.75%


Precision Slight Injury (class 1): 86%


Recall: 100%


**With Oversampling:**


Accuracy: 55.22%

Precision Slight Injury (class 1): 52%

Recall: 49%



---


**Target: Slight Injury (0) vs. Serious Injury/Fatal Injury (1)**

**Without Oversampling:**

Accuracy: 85.75%

Precision Serious Injury: 100%

Recall: 0%

**With Oversampling:**

Accuracy: 58.52%

Precision Serious Injury: 58%

Recall: 60%


---


**Target: Slight/Fatal Injury (1) vs. Serious Injury (0)**

**Without Oversampling:**

Accuracy: 86.60%

Precision Slight/Fatal Injury (class 1): 87%

Recall: 100%

**With Oversampling:**

Accuracy: 57.87%

Precision Slight/Fatal Injury (class 1): 58%

Recall: 57%

# **KNN Model**

In [80]:
# Candidate neighbor counts (1 through 49) searched by every KNN grid search
param_grid = dict(n_neighbors=range(1, 50))

## Without Oversampling

## Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)

In [81]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_mapped, test_size=0.3, random_state=42)

#Train model with best number of neighbors
knn_mapped = KNeighborsClassifier()
grid_search_mapped = GridSearchCV(knn_mapped, param_grid, cv=5, scoring='accuracy')
grid_search_mapped.fit(X_train, y_train)
best_knn_mapped = grid_search_mapped.best_estimator_

#Create prediction
y_pred_mapped = best_knn_mapped.predict(X_test)

print("Best number of neighbors:", grid_search_mapped.best_params_['n_neighbors'])
print("KNN (Slight Injury vs. Serious Injury vs. Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_mapped)
score_mapped = cross_val_score(knn_mapped, X_test, y_test, cv=5)
print_cross_val_scores(score_mapped)


Best number of neighbors: 21
KNN (Slight Injury vs. Serious Injury vs. Fatal Injury):
Accuracy:  0.857084855866829
Confusion Matrix:
 [[   0  330    0]
 [   0 2111    0]
 [   0   22    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       330
           1       0.86      1.00      0.92      2111
           2       0.00      0.00      0.00        22

    accuracy                           0.86      2463
   macro avg       0.29      0.33      0.31      2463
weighted avg       0.73      0.86      0.79      2463

Cross-validation scores:  [0.84381339 0.86004057 0.81947262 0.83739837 0.8495935 ]
Average score:  0.8420636883853625


## Slight Injury (0) vs. Serious Injury/Fatal Injury (1)

In [82]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_slight, test_size=0.3, random_state=42)

#Train model with best number of neighbors
knn_slight = KNeighborsClassifier()
grid_search_slight = GridSearchCV(knn_slight, param_grid, cv=5, scoring='accuracy')
grid_search_slight.fit(X_train, y_train)
best_knn_slight = grid_search_slight.best_estimator_

#Create prediction
y_pred_slight = best_knn_slight.predict(X_test)

print("Best number of neighbors for Slight Injury vs. Serious/Fatal Injury:", grid_search_slight.best_params_['n_neighbors'])
print("KNN (Slight Injury vs. Serious/Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_slight)
score_slight = cross_val_score(best_knn_slight, X_test, y_test, cv=5)
print_cross_val_scores(score_slight)

Best number of neighbors for Slight Injury vs. Serious/Fatal Injury: 20
KNN (Slight Injury vs. Serious/Fatal Injury):
Accuracy:  0.857084855866829
Confusion Matrix:
 [[2111    0]
 [ 352    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      1.00      0.92      2111
           1       0.00      0.00      0.00       352

    accuracy                           0.86      2463
   macro avg       0.43      0.50      0.46      2463
weighted avg       0.73      0.86      0.79      2463

Cross-validation scores:  [0.85801217 0.85598377 0.85598377 0.85772358 0.85772358]
Average score:  0.8570853740991771


## Slight/Fatal Injury (1) vs. Serious Injury (0)

In [83]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_serious, test_size=0.3, random_state=42)

#Train model with best number of neighbors
knn_serious = KNeighborsClassifier()
grid_search_serious = GridSearchCV(knn_serious, param_grid, cv=5, scoring='accuracy')
grid_search_serious.fit(X_train, y_train)
best_knn_serious = grid_search_serious.best_estimator_

#Create prediction
y_pred_serious = best_knn_serious.predict(X_test)

print("Best number of neighbors for Slight/Fatal Injury vs. Serious Injury:", grid_search_serious.best_params_['n_neighbors'])
print("KNN (Slight/Fatal Injury vs. Serious Injury):")
print_evaluation_metrics(y_test, y_pred_serious)
score_serious = cross_val_score(best_knn_serious, X_test, y_test, cv=5)
print_cross_val_scores(score_serious)

Best number of neighbors for Slight/Fatal Injury vs. Serious Injury: 21
KNN (Slight/Fatal Injury vs. Serious Injury):
Accuracy:  0.8660170523751523
Confusion Matrix:
 [[   0  330]
 [   0 2133]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       330
           1       0.87      1.00      0.93      2133

    accuracy                           0.87      2463
   macro avg       0.43      0.50      0.46      2463
weighted avg       0.75      0.87      0.80      2463

Cross-validation scores:  [0.86612576 0.86612576 0.86409736 0.86585366 0.86585366]
Average score:  0.8656112402909019


# Resample Target Data

## Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)

In [84]:
# Apply RandomOverSampler
oversample = RandomOverSampler(random_state=42)

X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_mapped)

# Check distribution
print(y_resampled.value_counts())

Accident_severity_mapped
1    7082
0    7082
2    7082
Name: count, dtype: int64


In [85]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

#Train model with best number of neighbors
knn_mapped = KNeighborsClassifier()
grid_search_mapped = GridSearchCV(knn_mapped, param_grid, cv=5, scoring='accuracy')
grid_search_mapped.fit(X_train, y_train)
best_knn_mapped = grid_search_mapped.best_estimator_

#Create prediction
y_pred_mapped = best_knn_mapped.predict(X_test)


print("Best number of neighbors for Slight Injury vs. Serious Injury vs. Fatal Injury (oversampled):", grid_search_mapped.best_params_['n_neighbors'])
print("KNN with Oversampling (Slight Injury vs. Serious Injury vs. Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_mapped)
score_mapped = cross_val_score(best_knn_mapped, X_test, y_test, cv=5)
print_cross_val_scores(score_mapped)

Best number of neighbors for Slight Injury vs. Serious Injury vs. Fatal Injury (oversampled): 1
KNN with Oversampling (Slight Injury vs. Serious Injury vs. Fatal Injury):
Accuracy:  0.9287731408848446
Confusion Matrix:
 [[2090   70    7]
 [ 356 1739   21]
 [   0    0 2091]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.96      0.91      2167
           1       0.96      0.82      0.89      2116
           2       0.99      1.00      0.99      2091

    accuracy                           0.93      6374
   macro avg       0.93      0.93      0.93      6374
weighted avg       0.93      0.93      0.93      6374

Cross-validation scores:  [0.84       0.84784314 0.84       0.83215686 0.83516484]
Average score:  0.839032967032967


## Slight Injury (0) vs. Serious Injury/Fatal Injury (1)

In [86]:
# Rebalance the slight-vs-rest target, reusing the `oversample` sampler
X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_slight)

# Check distribution: print the per-class row counts after resampling
print(y_resampled.value_counts())

Accident_slight
0    7082
1    7082
Name: count, dtype: int64


In [87]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

#Train model with best number of neighbors
knn_slight = KNeighborsClassifier()
grid_search_slight = GridSearchCV(knn_slight, param_grid, cv=5, scoring='accuracy')
grid_search_slight.fit(X_train, y_train)
best_knn_slight = grid_search_slight.best_estimator_

#Create prediction
y_pred_slight = best_knn_slight.predict(X_test)

print("Best number of neighbors for Slight Injury vs. Serious/Fatal Injury (oversampled):", grid_search_slight.best_params_['n_neighbors'])
print("KNN with Oversampling (Slight Injury vs. Serious/Fatal Injury):")
print_evaluation_metrics(y_test, y_pred_slight)
score_slight = cross_val_score(best_knn_slight, X_test, y_test, cv=5)
print_cross_val_scores(score_slight)

Best number of neighbors for Slight Injury vs. Serious/Fatal Injury (oversampled): 1
KNN with Oversampling (Slight Injury vs. Serious/Fatal Injury):
Accuracy:  0.8849411764705882
Confusion Matrix:
 [[1719  412]
 [  77 2042]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.81      0.88      2131
           1       0.83      0.96      0.89      2119

    accuracy                           0.88      4250
   macro avg       0.89      0.89      0.88      4250
weighted avg       0.89      0.88      0.88      4250

Cross-validation scores:  [0.78117647 0.75294118 0.76705882 0.76       0.77882353]
Average score:  0.768


## Slight/Fatal Injury (1) vs. Serious Injury (0)

In [88]:
# Rebalance the serious-vs-rest target, reusing the `oversample` sampler
X_resampled, y_resampled = oversample.fit_resample(X_scaled, y_serious)

# Check distribution: print the per-class row counts after resampling
print(y_resampled.value_counts())


Accident_serious
1    7164
0    7164
Name: count, dtype: int64


In [89]:
#Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)


#Train model with best number of neighbors
knn_serious = KNeighborsClassifier()
grid_search_serious = GridSearchCV(knn_serious, param_grid, cv=5, scoring='accuracy')
grid_search_serious.fit(X_train, y_train)
best_knn_serious = grid_search_serious.best_estimator_

#Create predictiom
y_pred_serious = best_knn_serious.predict(X_test)

print("Best number of neighbors for Slight/Fatal Injury vs. Serious Injury (oversampled):", grid_search_serious.best_params_['n_neighbors'])
print("KNN with Oversampling (Slight/Fatal Injury vs. Serious Injury):")
print_evaluation_metrics(y_test, y_pred_serious)
score_serious = cross_val_score(best_knn_serious, X_test, y_test, cv=5)
print_cross_val_scores(score_serious)

Best number of neighbors for Slight/Fatal Injury vs. Serious Injury (oversampled): 1
KNN with Oversampling (Slight/Fatal Injury vs. Serious Injury):
Accuracy:  0.8946266573621773
Confusion Matrix:
 [[2067   98]
 [ 355 1779]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.95      0.90      2165
           1       0.95      0.83      0.89      2134

    accuracy                           0.89      4299
   macro avg       0.90      0.89      0.89      4299
weighted avg       0.90      0.89      0.89      4299

Cross-validation scores:  [0.77674419 0.76046512 0.76627907 0.78604651 0.78230501]
Average score:  0.7743679779083303


### **KNN Model Summary**
**Target: Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)**

**Without Oversampling:**

Accuracy: 85.71%


Precision Slight Injury (class 1): 86%


Recall: 100%


**With Oversampling:**


Accuracy: 92.88%

Precision Slight Injury (class 1): 96%

Recall: 82%



---


**Target: Slight Injury (0) vs. Serious Injury/Fatal Injury (1)**

**Without Oversampling:**

Accuracy: 85.71%

Precision Serious Injury: 0%

Recall: 0%

**With Oversampling:**

Accuracy: 88.49%

Precision Serious Injury: 83%

Recall: 96%


---


**Target: Slight/Fatal Injury (1) vs. Serious Injury (0)**

**Without Oversampling:**

Accuracy: 86.60%

Precision Slight/Fatal Injury (class 1): 87%

Recall: 100%

**With Oversampling:**

Accuracy: 89.46%

Precision Slight/Fatal Injury (class 1): 95%

Recall: 83%