# Random Forest

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [5]:
# Import training dataset 
dataset = Path('../dataset')
df = pd.read_csv(dataset/'accidents_clean_train.csv')
df.head()

Unnamed: 0,Area_accident_occured,Types_of_Junction,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Cause_of_accident,Day_of_week,Sex_of_driver,Age_band_of_driver,Accident_severity
0,Residential areas,No junction,Daylight,2,2,Moving Backward,Monday,Male,18-30,Slight Injury
1,Office areas,No junction,Daylight,2,2,Overtaking,Monday,Male,31-50,Slight Injury
2,Recreational areas,No junction,Daylight,2,2,Changing lane to the left,Monday,Male,18-30,Serious Injury
3,Office areas,Y Shape,Darkness - lights lit,2,2,Changing lane to the right,Sunday,Male,18-30,Slight Injury
4,Industrial areas,Y Shape,Darkness - lights lit,2,2,Overtaking,Sunday,Male,18-30,Slight Injury


In [6]:
df.shape

(8210, 10)

In [7]:
# Get feature columns
columns = df.columns.tolist()
print(columns)

features = ['Area_accident_occured', 'Types_of_Junction', 'Light_conditions', 'Number_of_vehicles_involved', 'Number_of_casualties', 'Cause_of_accident', 'Day_of_week', 'Sex_of_driver', 'Age_band_of_driver']

target = 'Accident_severity'

['Area_accident_occured', 'Types_of_Junction', 'Light_conditions', 'Number_of_vehicles_involved', 'Number_of_casualties', 'Cause_of_accident', 'Day_of_week', 'Sex_of_driver', 'Age_band_of_driver', 'Accident_severity']


## Prepare dataset

In [8]:
# Convert to categorical
X = df[features]
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0,Number_of_vehicles_involved,Number_of_casualties,Area_accident_occured_ Recreational areas,Area_accident_occured_ Church areas,Area_accident_occured_ Hospital areas,Area_accident_occured_ Industrial areas,Area_accident_occured_ Outside rural areas,Area_accident_occured_Office areas,Area_accident_occured_Other,Area_accident_occured_Recreational areas,...,Day_of_week_Sunday,Day_of_week_Thursday,Day_of_week_Tuesday,Day_of_week_Wednesday,Sex_of_driver_Male,Sex_of_driver_Unknown,Age_band_of_driver_31-50,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown
0,2,2,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,2,2,False,False,False,False,False,True,False,False,...,False,False,False,False,True,False,True,False,False,False
2,2,2,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,2,2,False,False,False,False,False,True,False,False,...,True,False,False,False,True,False,False,False,False,False
4,2,2,False,False,False,True,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


In [9]:
# Create mapped columns of target
df['Accident_slight'] = df[target].map(
    {'Slight Injury': 0}
    ).fillna(1).astype(int)
df['Accident_serious'] = df[target].map(
    {'Serious Injury': 0}
    ).fillna(1).astype(int)
df['Accident_severity_mapped'] = df[target].map({
    'Serious Injury': 0, 
    'Slight Injury': 1, 
    'Fatal injury': 2
    })

df.head()

Unnamed: 0,Area_accident_occured,Types_of_Junction,Light_conditions,Number_of_vehicles_involved,Number_of_casualties,Cause_of_accident,Day_of_week,Sex_of_driver,Age_band_of_driver,Accident_severity,Accident_slight,Accident_serious,Accident_severity_mapped
0,Residential areas,No junction,Daylight,2,2,Moving Backward,Monday,Male,18-30,Slight Injury,0,1,1
1,Office areas,No junction,Daylight,2,2,Overtaking,Monday,Male,31-50,Slight Injury,0,1,1
2,Recreational areas,No junction,Daylight,2,2,Changing lane to the left,Monday,Male,18-30,Serious Injury,1,0,0
3,Office areas,Y Shape,Darkness - lights lit,2,2,Changing lane to the right,Sunday,Male,18-30,Slight Injury,0,1,1
4,Industrial areas,Y Shape,Darkness - lights lit,2,2,Overtaking,Sunday,Male,18-30,Slight Injury,0,1,1


In [10]:
df.shape

(8210, 13)

In [11]:
y = df[target]
y_mapped = df['Accident_severity_mapped']
y_slight = df['Accident_slight']
y_serious = df['Accident_serious']

In [12]:
%run ./naive_bayes.ipynb

['Area_accident_occured', 'Types_of_Junction', 'Light_conditions', 'Number_of_vehicles_involved', 'Number_of_casualties', 'Cause_of_accident', 'Day_of_week', 'Sex_of_driver', 'Age_band_of_driver', 'Accident_severity']
Accuracy:  0.857896873731222
Confusion Matrix:
 [[   2  327    1]
 [   0 2110    1]
 [   0   21    1]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.01      0.01       330
           1       0.86      1.00      0.92      2111
           2       0.33      0.05      0.08        22

    accuracy                           0.86      2463
   macro avg       0.73      0.35      0.34      2463
weighted avg       0.87      0.86      0.79      2463

Cross-validation scores:  [0.85801217 0.85801217 0.85395538 0.8597561  0.85772358]
Average score:  0.8574918781642177
Accuracy:  0.8591149005278116
Confusion Matrix:
 [[2111    0]
 [ 347    5]]
Classification Report:
               precision    recall  f1-score   support

 

### Accident Severity: Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)

In [14]:
# Split model
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, 
                                                    random_state=42)

# Initialize model
rf_model = RandomForestClassifier(n_estimators=100, random_state=84)

# Train
rf_model.fit(X_train, y_train)
# Predict
y_pred = rf_model.predict(X_test)
nb_report(y_test, y_pred)

Accuracy:  0.8501827040194885
Confusion Matrix:
 [[  32  297    1]
 [  46 2062    3]
 [   1   21    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.10      0.16       330
           1       0.87      0.98      0.92      2111
           2       0.00      0.00      0.00        22

    accuracy                           0.85      2463
   macro avg       0.42      0.36      0.36      2463
weighted avg       0.80      0.85      0.81      2463



In [34]:
# Split model / Unmapped
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                    random_state=42)

# Initialize model
rf_model = RandomForestClassifier(n_estimators=100, random_state=84)

# Train
rf_model.fit(X_train, y_train)
# Predict
y_pred = rf_model.predict(X_test)
nb_report(y_test, y_pred)

Accuracy:  0.8505887129516849
Confusion Matrix:
 [[   1    1   20]
 [   1   32  297]
 [   3   46 2062]]
Classification Report:
                 precision    recall  f1-score   support

  Fatal injury       0.20      0.05      0.07        22
Serious Injury       0.41      0.10      0.16       330
 Slight Injury       0.87      0.98      0.92      2111

      accuracy                           0.85      2463
     macro avg       0.49      0.37      0.38      2463
  weighted avg       0.80      0.85      0.81      2463



### Accident Severity: Slight Injury (0) vs Serious Injury/Fatal Injury (1)

In [16]:
# Split model
X_train, X_test, y_train, y_test = train_test_split(X, y_slight, test_size=0.3, 
                                                    random_state=42)

# Initialize model
rf_model_slight = RandomForestClassifier(n_estimators=100, random_state=84)

# Train
rf_model_slight.fit(X_train, y_train)
# Predict
y_pred = rf_model_slight.predict(X_test)
nb_report(y_test, y_pred)

Accuracy:  0.8518067397482745
Confusion Matrix:
 [[2058   53]
 [ 312   40]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.97      0.92      2111
           1       0.43      0.11      0.18       352

    accuracy                           0.85      2463
   macro avg       0.65      0.54      0.55      2463
weighted avg       0.81      0.85      0.81      2463



### Accident Severity: Slight/Fatal Injury (1) vs Serious Injury (0)

In [17]:
# Split model
X_train, X_test, y_train, y_test = train_test_split(X, y_serious, test_size=0.3, 
                                                    random_state=42)

# Initialize model
rf_model_serious = RandomForestClassifier(n_estimators=100, random_state=84)

# Train
rf_model_serious.fit(X_train, y_train)
# Predict
y_pred = rf_model_serious.predict(X_test)
nb_report(y_test, y_pred)

Accuracy:  0.8587088915956151
Confusion Matrix:
 [[  28  302]
 [  46 2087]]
Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.08      0.14       330
           1       0.87      0.98      0.92      2133

    accuracy                           0.86      2463
   macro avg       0.63      0.53      0.53      2463
weighted avg       0.81      0.86      0.82      2463



## Resample Target Data

### Accident Severity: Slight Injury (1) vs. Serious Injury (0) vs. Fatal Injury (2)

In [24]:
oversample = RandomOverSampler(random_state=42)

X_resampled, y_resampled = oversample.fit_resample(X, y_mapped)
# Check distribution
print(y_resampled.value_counts())

# Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.3,
                                                    random_state=42)
# Train model
rf_model_rs = RandomForestClassifier()
rf_model_rs.fit(X_train, y_train)

# Create prediction
y_pred_rs = rf_model_rs.predict(X_test)
nb_report(y_test, y_pred_rs)

score_serious = cross_val_score(rf_model_rs, X_test, y_test, cv=5)
cross_scores(score_serious)

Accident_severity_mapped
1    7082
0    7082
2    7082
Name: count, dtype: int64
Accuracy:  0.9606212739253216
Confusion Matrix:
 [[2121   39    7]
 [ 197 1911    8]
 [   0    0 2091]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      2167
           1       0.98      0.90      0.94      2116
           2       0.99      1.00      1.00      2091

    accuracy                           0.96      6374
   macro avg       0.96      0.96      0.96      6374
weighted avg       0.96      0.96      0.96      6374

Cross-validation scores:  [0.8854902  0.8745098  0.87294118 0.86745098 0.87912088]
Average score:  0.8759026071967249


### Accident Severity: Slight Injury (0) vs Serious Injury/Fatal Injury (1)

In [25]:
X_resampled, y_resampled = oversample.fit_resample(X, y_slight)
# Check distribution
print(y_resampled.value_counts())

# Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.3,
                                                    random_state=42)
# Train model
rf_model_rs = RandomForestClassifier()
rf_model_rs.fit(X_train, y_train)

# Create prediction
y_pred_rs = rf_model_rs.predict(X_test)
nb_report(y_test, y_pred_rs)

score_serious = cross_val_score(rf_model_rs, X_test, y_test, cv=5)
cross_scores(score_serious)

Accident_slight
0    7082
1    7082
Name: count, dtype: int64
Accuracy:  0.9301176470588235
Confusion Matrix:
 [[1877  254]
 [  43 2076]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.88      0.93      2131
           1       0.89      0.98      0.93      2119

    accuracy                           0.93      4250
   macro avg       0.93      0.93      0.93      4250
weighted avg       0.93      0.93      0.93      4250

Cross-validation scores:  [0.84823529 0.82588235 0.81294118 0.81882353 0.81176471]
Average score:  0.823529411764706


### Accident Severity: Slight/Fatal Injury (1) vs Serious Injury (0)

In [27]:
X_resampled, y_resampled = oversample.fit_resample(X, y_serious)
# Check distribution
print(y_resampled.value_counts())

# Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.3,
                                                    random_state=42)
# Train model
rf_model_rs = RandomForestClassifier()
rf_model_rs.fit(X_train, y_train)

# Create prediction
y_pred_rs = rf_model_rs.predict(X_test)
nb_report(y_test, y_pred_rs)

score_serious = cross_val_score(rf_model_rs, X_test, y_test, cv=5)
cross_scores(score_serious)

Accident_serious
1    7164
0    7164
Name: count, dtype: int64
Accuracy:  0.9388229820888578
Confusion Matrix:
 [[2128   37]
 [ 226 1908]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94      2165
           1       0.98      0.89      0.94      2134

    accuracy                           0.94      4299
   macro avg       0.94      0.94      0.94      4299
weighted avg       0.94      0.94      0.94      4299

Cross-validation scores:  [0.81511628 0.83604651 0.8372093  0.8255814  0.81722934]
Average score:  0.8262365649619623


In [28]:
smote = SMOTE(random_state=42)

X_resampled, y_resampled = smote.fit_resample(X, y_mapped)
# Check distribution
print(y_resampled.value_counts())

# Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.3,
                                                    random_state=42)
# Train model
rf_model_rs = RandomForestClassifier()
rf_model_rs.fit(X_train, y_train)

# Create prediction
y_pred_rs = rf_model_rs.predict(X_test)
nb_report(y_test, y_pred_rs)

score_serious = cross_val_score(rf_model_rs, X_test, y_test, cv=5)
cross_scores(score_serious)

Accident_severity_mapped
1    7082
0    7082
2    7082
Name: count, dtype: int64
Accuracy:  0.8821775964857232
Confusion Matrix:
 [[1854  235   78]
 [ 388 1700   28]
 [  12   10 2069]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.86      0.84      2167
           1       0.87      0.80      0.84      2116
           2       0.95      0.99      0.97      2091

    accuracy                           0.88      6374
   macro avg       0.88      0.88      0.88      6374
weighted avg       0.88      0.88      0.88      6374

Cross-validation scores:  [0.84       0.83137255 0.83215686 0.81803922 0.8422292 ]
Average score:  0.8327595653646073


In [30]:
rus = RandomUnderSampler(random_state=42)

X_resampled, y_resampled = rus.fit_resample(X, y_mapped)
# Check distribution
print(y_resampled.value_counts())

# Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.3,
                                                    random_state=42)
# Train model
rf_model_rs = RandomForestClassifier()
rf_model_rs.fit(X_train, y_train)

# Create prediction
y_pred_rs = rf_model_rs.predict(X_test)
nb_report(y_test, y_pred_rs)

score_serious = cross_val_score(rf_model_rs, X_test, y_test, cv=5)
cross_scores(score_serious)

Accident_severity_mapped
0    82
1    82
2    82
Name: count, dtype: int64
Accuracy:  0.5
Confusion Matrix:
 [[ 9 10  3]
 [ 7 14  3]
 [10  4 14]]
Classification Report:
               precision    recall  f1-score   support

           0       0.35      0.41      0.38        22
           1       0.50      0.58      0.54        24
           2       0.70      0.50      0.58        28

    accuracy                           0.50        74
   macro avg       0.52      0.50      0.50        74
weighted avg       0.53      0.50      0.51        74

Cross-validation scores:  [0.53333333 0.4        0.46666667 0.53333333 0.28571429]
Average score:  0.44380952380952376


In [33]:
X_resampled, y_resampled = oversample.fit_resample(X, y)
# Check distribution
print(y_resampled.value_counts())

# Split testing data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.3,
                                                    random_state=42)
# Train model
rf_model_rs = RandomForestClassifier()
rf_model_rs.fit(X_train, y_train)

# Create prediction
y_pred_rs = rf_model_rs.predict(X_test)
nb_report(y_test, y_pred_rs)

score_serious = cross_val_score(rf_model_rs, X_test, y_test, cv=5)
cross_scores(score_serious)

Accident_severity
Slight Injury     7082
Serious Injury    7082
Fatal injury      7082
Name: count, dtype: int64
Accuracy:  0.961719485409476
Confusion Matrix:
 [[2165    0    0]
 [   8 2061   24]
 [   8  204 1904]]
Classification Report:
                 precision    recall  f1-score   support

  Fatal injury       0.99      1.00      1.00      2165
Serious Injury       0.91      0.98      0.95      2093
 Slight Injury       0.99      0.90      0.94      2116

      accuracy                           0.96      6374
     macro avg       0.96      0.96      0.96      6374
  weighted avg       0.96      0.96      0.96      6374

Cross-validation scores:  [0.87921569 0.86588235 0.8854902  0.88705882 0.8822606 ]
Average score:  0.8799815310739681


## Analysis of Results

Target: Slight Injury (1) vs Serious Injury (0) vs. Fatal Injury (2)

Accuracy: 85%

Precision: 41%

Recall: 10%


Target: Slight Injury(0) vs Serious Injury/Fatal Injury (1)

Accuracy: 85%

Precision: 43%

Recall: 11%


Target: Slight/Fatal Injury (1) vs Serious Injury(0)

Accuracy: 86%

Precision: 38%

Recall: 8%

