# Random Forest

In [8]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

from pathlib import Path
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [9]:
# Directory holding the data files (relative path)
dataset = Path('../dataset')

# Read accidents_clean_train.csv and preview the first five rows
df = pd.read_csv(dataset.joinpath('accidents_clean_train.csv'))
df.head()

Unnamed: 0,Area_accident_occured,Types_of_Junction,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Cause_of_accident,Day_of_week,Sex_of_driver,Age_band_of_driver,Accident_severity
0,Residential areas,No junction,Daylight,2,2,Moving Backward,Monday,Male,18-30,Slight Injury
1,Office areas,No junction,Daylight,2,2,Overtaking,Monday,Male,31-50,Slight Injury
2,Recreational areas,No junction,Daylight,2,2,Changing lane to the left,Monday,Male,18-30,Serious Injury
3,Office areas,Y Shape,Darkness - lights lit,2,2,Changing lane to the right,Sunday,Male,18-30,Slight Injury
4,Industrial areas,Y Shape,Darkness - lights lit,2,2,Overtaking,Sunday,Male,18-30,Slight Injury


In [10]:
# Display the (rows, columns) dimensions of the loaded frame
df.shape

(8210, 10)

In [11]:
# Target: the severity label column
y = df['Accident_severity']

# Features: every remaining column, with categorical columns expanded
# into dummy/indicator variables
X = pd.get_dummies(df.drop(columns='Accident_severity'))

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Random forest with 100 trees and its own fixed seed
rf_model = RandomForestClassifier(random_state=84, n_estimators=100)

# Fit on the training partition
rf_model.fit(X_train, y_train)

In [15]:
# Predict labels for the held-out rows
y_pred = rf_model.predict(X_test)

# Compute each metric once, then report them together
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy: ", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

Accuracy:  0.8526187576126675
Confusion Matrix:
 [[   2    1   19]
 [   2   27  301]
 [   3   37 2071]]
Classification Report:
                 precision    recall  f1-score   support

  Fatal injury       0.29      0.09      0.14        22
Serious Injury       0.42      0.08      0.14       330
 Slight Injury       0.87      0.98      0.92      2111

      accuracy                           0.85      2463
     macro avg       0.52      0.38      0.40      2463
  weighted avg       0.80      0.85      0.81      2463



### Analysis of Results

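The overall accuracy indicates that the model correctly predicts accident severity for 85.3% of the test cases. However, the classification report shows that this is driven almost entirely by the majority class: recall is 0.98 for slight injuries but only 0.08 for serious injuries and 0.09 for fatal injuries. For context, always predicting "Slight Injury" would score 2111/2463 ≈ 85.7% on this test set, so accuracy alone overstates how useful the model is.  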

The dataset is imbalanced: "Slight Injury" accounts for 2,111 of the 2,463 test samples (about 86%), compared with 330 "Serious Injury" and only 22 "Fatal injury" cases. 