# Random Forest

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

from pathlib import Path
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Data directory, given relative to the notebook's working directory.
dataset = Path('../dataset')

# Read accidents_clean_train.csv into a DataFrame.
df = pd.read_csv(dataset.joinpath('accidents_clean_train.csv'))

# Preview the first rows.
df.head()

Unnamed: 0,Area_accident_occured,Types_of_Junction,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Cause_of_accident,Day_of_week,Sex_of_driver,Age_band_of_driver,Accident_severity
0,Residential areas,No junction,Daylight,2,2,Moving Backward,Monday,Male,18-30,Slight Injury
1,Office areas,No junction,Daylight,2,2,Overtaking,Monday,Male,31-50,Slight Injury
2,Recreational areas,No junction,Daylight,2,2,Changing lane to the left,Monday,Male,18-30,Serious Injury
3,Office areas,Y Shape,Darkness - lights lit,2,2,Changing lane to the right,Sunday,Male,18-30,Slight Injury
4,Industrial areas,Y Shape,Darkness - lights lit,2,2,Overtaking,Sunday,Male,18-30,Slight Injury


In [4]:
# (rows, columns) of the loaded frame
df.shape

(8210, 10)

In [5]:
# Derive three encodings of the target from the raw Accident_severity labels.
severity = df['Accident_severity']

# Binary: 'Slight Injury' -> 0, every other value (including missing) -> 1
df['Accident_slight'] = severity.ne('Slight Injury').astype(int)

# Binary: 'Serious Injury' -> 0, every other value (including missing) -> 1
df['Accident_serious'] = severity.ne('Serious Injury').astype(int)

# Three-class integer code; labels not listed here become NaN
severity_codes = {
    'Serious Injury': 0,
    'Slight Injury': 1,
    'Fatal injury': 2,
}
df['Accident_severity_mapped'] = severity.map(severity_codes)

In [6]:
# Feature matrix: every column except the raw target and its derived
# encodings, with non-numeric columns expanded into indicator columns.
target_columns = [
    'Accident_severity',
    'Accident_severity_mapped',
    'Accident_slight',
    'Accident_serious',
]
X = pd.get_dummies(df.drop(columns=target_columns))
X.head()

Unnamed: 0,Number_of_vehicles_involved,Number_of_casualties,Area_accident_occured_ Market areas,Area_accident_occured_ Recreational areas,Area_accident_occured_ Church areas,Area_accident_occured_ Hospital areas,Area_accident_occured_ Industrial areas,Area_accident_occured_ Outside rural areas,Area_accident_occured_Office areas,Area_accident_occured_Other,...,Day_of_week_Tuesday,Day_of_week_Wednesday,Sex_of_driver_Female,Sex_of_driver_Male,Sex_of_driver_Unknown,Age_band_of_driver_18-30,Age_band_of_driver_31-50,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown
0,2,2,False,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
1,2,2,False,False,False,False,False,False,True,False,...,False,False,False,True,False,False,True,False,False,False
2,2,2,False,True,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
3,2,2,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,False,False,False
4,2,2,False,False,False,False,True,False,False,False,...,False,False,False,True,False,True,False,False,False,False


In [7]:
# Target vectors: the three-class severity code plus the two binary encodings
y, y_slight, y_serious = (
    df[column]
    for column in ('Accident_severity_mapped', 'Accident_slight', 'Accident_serious')
)

## Split Data

### Accident_severity: Slight injury, Serious injury, Fatal injury

In [8]:
# Hold out 30% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

# 100-tree random forest with its own fixed seed
rf_model = RandomForestClassifier(random_state=84, n_estimators=100)

# Fit on the training portion (the fitted estimator is the cell's output)
rf_model.fit(X_train, y_train)

In [9]:
# Predict on the held-out rows and summarise three-class performance.
y_pred = rf_model.predict(X_test)

# Integer codes come from Accident_severity_mapped:
#   0 = Serious Injury, 1 = Slight Injury, 2 = Fatal injury
# (note the order is not by increasing severity). Passing labels/target_names
# pins the row/column order and prints class names instead of bare integers.
severity_labels = [0, 1, 2]
severity_names = ['Serious Injury', 'Slight Injury', 'Fatal injury']

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n",
      confusion_matrix(y_test, y_pred, labels=severity_labels))
print("Classification Report:\n",
      classification_report(y_test, y_pred,
                            labels=severity_labels,
                            target_names=severity_names))

Accuracy:  0.8522127486804709
Confusion Matrix:
 [[  27  301    2]
 [  37 2071    3]
 [   1   20    1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.08      0.14       330
           1       0.87      0.98      0.92      2111
           2       0.17      0.05      0.07        22

    accuracy                           0.85      2463
   macro avg       0.48      0.37      0.38      2463
weighted avg       0.80      0.85      0.81      2463



### Accident_slight: Slight Injury (0), Serious/Fatal (1)

In [10]:
# Binary target y_slight: same 70/30 split settings and seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y_slight, test_size=0.3, random_state=42
)

# Fit a fresh 100-tree forest, then predict on the held-out rows
rf_model = RandomForestClassifier(n_estimators=100, random_state=84).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Print each metric under its heading
for heading, metric in (
    ("Accuracy: ", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))

Accuracy:  0.8534307754770605
Confusion Matrix:
 [[2068   43]
 [ 318   34]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.98      0.92      2111
           1       0.44      0.10      0.16       352

    accuracy                           0.85      2463
   macro avg       0.65      0.54      0.54      2463
weighted avg       0.81      0.85      0.81      2463



### Accident_serious: Serious Injury (0), Slight/Fatal (1)

In [11]:
# Binary target y_serious. Accident_serious is built so that
# 'Serious Injury' -> 0 and every other severity -> 1.
X_train, X_test, y_train, y_test = train_test_split(X, y_serious,
                                                    test_size=0.3,
                                                    random_state=42)

# Fit a fresh 100-tree forest on the training portion
rf_model = RandomForestClassifier(n_estimators=100, random_state=84)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Name the classes explicitly: bare 0/1 in the report is easy to read the
# wrong way round, because 0 is the *serious* class here.
serious_labels = [0, 1]
serious_names = ['Serious Injury', 'Slight/Fatal']

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n",
      confusion_matrix(y_test, y_pred, labels=serious_labels))
print("Classification Report:\n",
      classification_report(y_test, y_pred,
                            labels=serious_labels,
                            target_names=serious_names))

Accuracy:  0.8603329273244011
Confusion Matrix:
 [[  27  303]
 [  41 2092]]
Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.08      0.14       330
           1       0.87      0.98      0.92      2133

    accuracy                           0.86      2463
   macro avg       0.64      0.53      0.53      2463
weighted avg       0.81      0.86      0.82      2463



### Analysis of Results

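Overall accuracy is about 85% in all three formulations (85.2% for the three-class target, 85.3% for slight vs. serious/fatal, and 86.0% for serious vs. slight/fatal). Unfortunately, the classification reports show that the model is far more accurate at predicting slight injuries than serious or fatal injuries: in the three-class model, recall is 0.98 for slight injuries but only 0.08 for serious and 0.05 for fatal injuries. Because slight injuries make up roughly 86% of the test set (2111 of 2463), the headline accuracy is close to what always predicting "slight injury" would achieve.  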

The dataset is imbalanced: there are far more "slight injury" records than records in the other two categories.