# Accident Fatal Prediction Model

In [1]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Preprocessing

First preprocess accident data, then vehicle data. After cleaning both data then perform merge operation.

In [2]:
accident_features = [
 'Accident_Index',
 'Road_Type',
 'Time',
 'Speed_limit',
 'Junction_Detail',
 'Light_Conditions',
 'Weather_Conditions',
 'Road_Surface_Conditions',
 'Urban_or_Rural_Area',
 'Accident_Severity'
]
vehicle_features = [
    'Accident_Index',
    'Vehicle_Type',
    'Vehicle_Manoeuvre',
    'Engine_Capacity_(CC)',
    'Journey_Purpose_of_Driver',
    'Sex_of_Driver',
    'Age_Band_of_Driver',
    'Age_of_Vehicle'
]
label_column = 'Accident_Severity'

In [3]:
def read_files(filepath, cols):
    all_files = glob.glob(filepath)
    li = []
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0, usecols=cols)
        li.append(df)
    return pd.concat(li, axis=0, ignore_index=True)

accidents_df = read_files('data/accident/*.csv', accident_features)
vehicles_df = read_files('data/vehicle/*.csv', vehicle_features)

accident_features.remove('Time')
accident_features.append('Daytime')

  if (await self.run_code(code, result,  async_=asy)):


In [4]:
accidents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2287427 entries, 0 to 2287426
Data columns (total 10 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Accident_Index           object 
 1   Accident_Severity        int64  
 2   Time                     object 
 3   Road_Type                int64  
 4   Speed_limit              float64
 5   Junction_Detail          int64  
 6   Light_Conditions         int64  
 7   Weather_Conditions       int64  
 8   Road_Surface_Conditions  int64  
 9   Urban_or_Rural_Area      int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 174.5+ MB


In [5]:
vehicles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4196486 entries, 0 to 4196485
Data columns (total 8 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   Accident_Index             object
 1   Vehicle_Type               int64 
 2   Vehicle_Manoeuvre          int64 
 3   Journey_Purpose_of_Driver  int64 
 4   Sex_of_Driver              int64 
 5   Age_Band_of_Driver         int64 
 6   Engine_Capacity_(CC)       int64 
 7   Age_of_Vehicle             int64 
dtypes: int64(7), object(1)
memory usage: 256.1+ MB


### Processing Accident Data

In [6]:
# Transform task to binary classification problem (Fatal or Non-Fatal)
def convert_label(label):
    if label == 1:
        return 1
    else:
        return 0
accidents_df[label_column] = accidents_df[label_column].apply(convert_label)

In [7]:
accidents_df['Hour'] = pd.to_numeric(accidents_df['Time'].str[0:2])
accidents_df = accidents_df.dropna(subset=['Hour'])
accidents_df['Hour'] = accidents_df['Hour'].astype('int')

In [8]:
def convert_hour(hour):
    if hour >= 5 and hour < 10:
        return 1
    elif hour >= 10 and hour < 15:
        return 2
    elif hour >= 15 and hour < 19:
        return 3
    elif hour >= 19 and hour < 23:
        return 4
    else:
        return 5
accidents_df['Daytime'] = accidents_df['Hour'].apply(convert_hour)
accidents_df[['Daytime', 'Time', 'Hour']].head()

Unnamed: 0,Daytime,Time,Hour
0,3,17:42,17
1,3,17:36,17
2,5,00:15,0
3,2,10:35,10
4,4,21:13,21


In [9]:
accidents_df = accidents_df[accident_features]

In [10]:
accidents_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2287195 entries, 0 to 2287426
Data columns (total 10 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Accident_Index           object 
 1   Road_Type                int64  
 2   Speed_limit              float64
 3   Junction_Detail          int64  
 4   Light_Conditions         int64  
 5   Weather_Conditions       int64  
 6   Road_Surface_Conditions  int64  
 7   Urban_or_Rural_Area      int64  
 8   Accident_Severity        int64  
 9   Daytime                  int64  
dtypes: float64(1), int64(8), object(1)
memory usage: 191.9+ MB


In [11]:
accidents_df = accidents_df.dropna()

In [12]:
len(accidents_df)

2287158

In [13]:
accidents_df['Accident_Severity'].value_counts(normalize=True)

0    0.987017
1    0.012983
Name: Accident_Severity, dtype: float64

In [14]:
accidents_df.isna().sum()

Accident_Index             0
Road_Type                  0
Speed_limit                0
Junction_Detail            0
Light_Conditions           0
Weather_Conditions         0
Road_Surface_Conditions    0
Urban_or_Rural_Area        0
Accident_Severity          0
Daytime                    0
dtype: int64

In [16]:
accidents_df = accidents_df[
    (accidents_df['Road_Type'] != -1) &
    (accidents_df['Speed_limit'] <= 70) &
    (accidents_df['Speed_limit'] >= 30) &
    (accidents_df['Junction_Detail'] != -1) &
    (accidents_df['Light_Conditions'] != -1) &
    (accidents_df['Weather_Conditions'] != -1) &
    (accidents_df['Road_Surface_Conditions'] != -1) &
    (accidents_df['Urban_or_Rural_Area'] != -1)
]

In [17]:
accidents_df['Speed_limit'].unique()

array([30., 40., 50., 60., 70.])

In [18]:
label_enc = LabelEncoder()
accidents_df['Speed_limit'] = label_enc.fit_transform(accidents_df['Speed_limit'])
accidents_df['Speed_limit']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accidents_df['Speed_limit'] = label_enc.fit_transform(accidents_df['Speed_limit'])


0          0
1          0
2          0
3          0
4          0
          ..
2287422    3
2287423    3
2287424    3
2287425    0
2287426    3
Name: Speed_limit, Length: 2217747, dtype: int64

In [19]:
accidents_df['Accident_Severity'].value_counts(normalize=True)

0    0.986761
1    0.013239
Name: Accident_Severity, dtype: float64

### Processing Vehicle Data

In [20]:
vehicles_df[vehicles_df == -1].count()

Accident_Index                     0
Vehicle_Type                    1750
Vehicle_Manoeuvre              29693
Journey_Purpose_of_Driver      45050
Sex_of_Driver                    100
Age_Band_of_Driver            476400
Engine_Capacity_(CC)         1079663
Age_of_Vehicle               1207692
dtype: int64

In [21]:
vehicles_df.isna().sum()

Accident_Index               0
Vehicle_Type                 0
Vehicle_Manoeuvre            0
Journey_Purpose_of_Driver    0
Sex_of_Driver                0
Age_Band_of_Driver           0
Engine_Capacity_(CC)         0
Age_of_Vehicle               0
dtype: int64

In [22]:
len(vehicles_df)

4196486

In [23]:
def convert_age_of_vehicle(age):
    if age <= 5: return age
    elif 5 < age <= 10: return 6
    else: return 7
vehicles_df['Age_of_Vehicle'] = vehicles_df['Age_of_Vehicle'].apply(convert_age_of_vehicle)

In [24]:
vehicles_df = vehicles_df.dropna()
vehicles_df = vehicles_df[
    (vehicles_df['Vehicle_Type'] != -1) &
    (vehicles_df['Vehicle_Manoeuvre'] != -1) &
    (vehicles_df['Journey_Purpose_of_Driver'] != -1) &
    (vehicles_df['Sex_of_Driver'] != -1) &
    (vehicles_df['Age_Band_of_Driver'] != -1) &
    (vehicles_df['Age_of_Vehicle'] != -1) &
    (vehicles_df['Engine_Capacity_(CC)'] <= 10000)
]

In [25]:
vehicles_df

Unnamed: 0,Accident_Index,Vehicle_Type,Vehicle_Manoeuvre,Journey_Purpose_of_Driver,Sex_of_Driver,Age_Band_of_Driver,Engine_Capacity_(CC),Age_of_Vehicle
1,200501BS00002,11,4,1,1,7,8268,3
2,200501BS00003,11,17,1,1,6,8300,5
3,200501BS00003,9,2,15,1,9,1762,6
4,200501BS00004,9,18,15,2,8,1769,4
5,200501BS00005,3,18,15,1,8,85,6
...,...,...,...,...,...,...,...,...
4196481,2019984107019,19,18,1,1,4,2198,4
4196482,2019984107219,9,18,6,1,6,1997,7
4196483,2019984107219,9,18,6,1,9,2967,5
4196484,2019984107419,9,7,5,1,11,1597,6


In [26]:
df = pd.merge(accidents_df, vehicles_df, on='Accident_Index')

In [27]:
df

Unnamed: 0,Accident_Index,Road_Type,Speed_limit,Junction_Detail,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Urban_or_Rural_Area,Accident_Severity,Daytime,Vehicle_Type,Vehicle_Manoeuvre,Journey_Purpose_of_Driver,Sex_of_Driver,Age_Band_of_Driver,Engine_Capacity_(CC),Age_of_Vehicle
0,200501BS00002,3,0,6,4,1,1,1,0,3,11,4,1,1,7,8268,3
1,200501BS00003,6,0,0,4,1,1,1,0,5,11,17,1,1,6,8300,5
2,200501BS00003,6,0,0,4,1,1,1,0,5,9,2,15,1,9,1762,6
3,200501BS00004,6,0,0,1,1,1,1,0,2,9,18,15,2,8,1769,4
4,200501BS00005,6,0,0,7,1,2,1,0,4,3,18,15,1,8,85,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2569511,2019984107019,6,3,0,1,1,2,2,0,1,19,18,1,1,4,2198,4
2569512,2019984107219,6,3,3,1,1,1,2,0,3,9,18,6,1,6,1997,7
2569513,2019984107219,6,3,3,1,1,1,2,0,3,9,18,6,1,9,2967,5
2569514,2019984107419,6,0,3,1,1,1,2,0,2,9,7,5,1,11,1597,6


In [28]:
df.isna().sum()

Accident_Index               0
Road_Type                    0
Speed_limit                  0
Junction_Detail              0
Light_Conditions             0
Weather_Conditions           0
Road_Surface_Conditions      0
Urban_or_Rural_Area          0
Accident_Severity            0
Daytime                      0
Vehicle_Type                 0
Vehicle_Manoeuvre            0
Journey_Purpose_of_Driver    0
Sex_of_Driver                0
Age_Band_of_Driver           0
Engine_Capacity_(CC)         0
Age_of_Vehicle               0
dtype: int64

## Generate Training and Testing Data

In [29]:
y = df['Accident_Severity']

In [30]:
X = df.drop(columns=['Accident_Index', 'Accident_Severity'])

In [31]:
X

Unnamed: 0,Road_Type,Speed_limit,Junction_Detail,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Urban_or_Rural_Area,Daytime,Vehicle_Type,Vehicle_Manoeuvre,Journey_Purpose_of_Driver,Sex_of_Driver,Age_Band_of_Driver,Engine_Capacity_(CC),Age_of_Vehicle
0,3,0,6,4,1,1,1,3,11,4,1,1,7,8268,3
1,6,0,0,4,1,1,1,5,11,17,1,1,6,8300,5
2,6,0,0,4,1,1,1,5,9,2,15,1,9,1762,6
3,6,0,0,1,1,1,1,2,9,18,15,2,8,1769,4
4,6,0,0,7,1,2,1,4,3,18,15,1,8,85,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2569511,6,3,0,1,1,2,2,1,19,18,1,1,4,2198,4
2569512,6,3,3,1,1,1,2,3,9,18,6,1,6,1997,7
2569513,6,3,3,1,1,1,2,3,9,18,6,1,9,2967,5
2569514,6,0,3,1,1,1,2,2,9,7,5,1,11,1597,6


In [32]:
y

0          0
1          0
2          0
3          0
4          0
          ..
2569511    0
2569512    0
2569513    0
2569514    0
2569515    0
Name: Accident_Severity, Length: 2569516, dtype: int64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [34]:
y_train.value_counts(normalize=True)

0    0.986336
1    0.013664
Name: Accident_Severity, dtype: float64

In [35]:
y_test.value_counts(normalize=True)

0    0.98596
1    0.01404
Name: Accident_Severity, dtype: float64

## Model

In the fatal accident prediction task, this work will focus on following three points.
1. Pre-processing and feature selection stretagies
2. Imbalance labeled data distribution
3. Model selection

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, average_precision_score, classification_report

In [37]:
n_classes = 2

### Baseline

Using naive random forest classifier as baseline solution. The reason of choosing decision tree model is because decision tree performs significant effects on discrete and low-dimensional features, in this task the chosen features are 14 features.

In order to solve data imbalance problem, there are some techniques can help to address the problem.

1. Downsampling
2. Upsampling
3. Generating artificial data

In [39]:
# First apply naive random forest as baseline classifier, without class_weight
rf_clf = RandomForestClassifier(max_depth=10, criterion='entropy', n_jobs=-1, random_state=1, verbose=1)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Non-Fatal', 'Fatal']))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.7s finished
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   Non-Fatal       0.99      1.00      0.99    506689
       Fatal       0.00      0.00      0.00      7215

    accuracy                           0.99    513904
   macro avg       0.49      0.50      0.50    513904
weighted avg       0.97      0.99      0.98    513904



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
# First apply naive random forest as baseline classifier, with automatically adjusted class_weight
# class_weight is used to reduce the weights of major class, so as to increase the weight of minor class
rf_clf = RandomForestClassifier(max_depth=10, criterion='entropy', n_jobs=-1, random_state=1, class_weight='balanced', verbose=1)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Non-Fatal', 'Fatal']))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.9s finished


              precision    recall  f1-score   support

   Non-Fatal       0.99      0.74      0.85    506689
       Fatal       0.04      0.72      0.07      7215

    accuracy                           0.74    513904
   macro avg       0.52      0.73      0.46    513904
weighted avg       0.98      0.74      0.84    513904



### Improvements

Hence, the improvements are proposed to address the following problems:

1. Sampling strategies
2. Feature selection

#### 1. Sampling Strategies

##### SMOTE oversampling

In [41]:
# First applying method is SMOTE oversampling methods
import imblearn

X_resampled, y_resampled = imblearn.over_sampling.SMOTE().fit_resample(X_train, y_train)

In [42]:
y_train.value_counts(normalize=True)

0    0.986336
1    0.013664
Name: Accident_Severity, dtype: float64

In [43]:
y_resampled.value_counts(normalize=True)

1    0.5
0    0.5
Name: Accident_Severity, dtype: float64

In [44]:
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(
    X_resampled, y_resampled, random_state=123
)

In [45]:
# Random forest with SMOTE oversampling
rf_clf = RandomForestClassifier(criterion='entropy', n_jobs=-1, random_state=1, verbose=1)
rf_clf.fit(X_train_resampled, y_train_resampled)
y_pred = rf_clf.predict(X_test_resampled)
print(classification_report(y_test_resampled, y_pred, target_names=['Non-Fatal', 'Fatal']))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.2min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    8.4s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   22.4s finished


              precision    recall  f1-score   support

   Non-Fatal       0.99      0.96      0.97    506743
       Fatal       0.96      0.99      0.97    507020

    accuracy                           0.97   1013763
   macro avg       0.97      0.97      0.97   1013763
weighted avg       0.97      0.97      0.97   1013763



In [46]:
for i, j in zip(rf_clf.feature_importances_, X_train.columns):
    print(i, j)

0.05586023637149454 Road_Type
0.07592088409556128 Speed_limit
0.07499031744328842 Junction_Detail
0.05444402522735311 Light_Conditions
0.025696461322694288 Weather_Conditions
0.035740406244786485 Road_Surface_Conditions
0.0169353768195043 Urban_or_Rural_Area
0.04275946679856979 Daytime
0.03265139300534174 Vehicle_Type
0.10133041798777899 Vehicle_Manoeuvre
0.053110204845652244 Journey_Purpose_of_Driver
0.06635991102446694 Sex_of_Driver
0.07426495622634757 Age_Band_of_Driver
0.22615913446129993 Engine_Capacity_(CC)
0.06377680812586056 Age_of_Vehicle


##### Undersampling

Random Under Sampling

Randomly selected negative samples to equal amount of positive samples.

In [113]:
X_undsampled, y_undsampled = imblearn.under_sampling.RandomUnderSampler(random_state=1).fit_resample(X_train, y_train)

In [114]:
y_undsampled.value_counts(normalize=True)

1    0.5
0    0.5
Name: Accident_Severity, dtype: float64

In [115]:
y_undsampled.value_counts()

1    28087
0    28087
Name: Accident_Severity, dtype: int64

In [116]:
X_train_undsampled, X_test_undsampled, y_train_undsampled, y_test_undsampled = train_test_split(
    X_undsampled, y_undsampled, random_state=123
)

In [117]:
# Random forest with random under sampler
rf_clf = RandomForestClassifier(criterion='entropy', n_jobs=-1, random_state=1, verbose=1)
rf_clf.fit(X_train_undsampled, y_train_undsampled)
y_pred = rf_clf.predict(X_test_undsampled)
print(classification_report(y_test_undsampled, y_pred, target_names=['Non-Fatal', 'Fatal']))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s


              precision    recall  f1-score   support

   Non-Fatal       0.73      0.73      0.73      6986
       Fatal       0.73      0.73      0.73      7058

    accuracy                           0.73     14044
   macro avg       0.73      0.73      0.73     14044
weighted avg       0.73      0.73      0.73     14044



[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


In [118]:
for i, j in zip(rf_clf.feature_importances_, X_train.columns):
    print(i, j)

0.03153088368305859 Road_Type
0.07300793016005154 Speed_limit
0.060915909033160624 Junction_Detail
0.0338626793758107 Light_Conditions
0.03183912645832451 Weather_Conditions
0.031120909533673115 Road_Surface_Conditions
0.03406293678096145 Urban_or_Rural_Area
0.06810994977459972 Daytime
0.04126666792291967 Vehicle_Type
0.10333125829086039 Vehicle_Manoeuvre
0.06395013891014457 Journey_Purpose_of_Driver
0.020482111792990264 Sex_of_Driver
0.10528806833669153 Age_Band_of_Driver
0.2110764739883848 Engine_Capacity_(CC)
0.09015495595836846 Age_of_Vehicle


#### 2. Feature Selection

Instead of building feature engineering pipelines by statistical information, there is an alternative choice to filter irrelavant features before actual learning.

Proposed method first uses classifier with L1-regularization penalty to punish the irrelavant features contributing to prediction. It is because the L1 norm can decrease the coefficient of irrelavant features during training, then eliminate the zero-coefficient features.

In [122]:
# Borrow the datasets generated by SMOTE oversampling methods
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

clf = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", C=0.01, dual=False))),
  ('classification', RandomForestClassifier(criterion='entropy', n_jobs=-1, random_state=1, verbose=1))
])
clf.fit(X_train_resampled, y_train_resampled)
y_pred = rf_clf.predict(X_test_resampled)
print(classification_report(y_test_resampled, y_pred, target_names=['Non-Fatal', 'Fatal']))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.4min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    4.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   11.7s finished


              precision    recall  f1-score   support

   Non-Fatal       0.76      0.73      0.74    506743
       Fatal       0.74      0.77      0.75    507020

    accuracy                           0.75   1013763
   macro avg       0.75      0.75      0.75   1013763
weighted avg       0.75      0.75      0.75   1013763

