In [1]:
import numpy as np 
import pandas as pd 
import os
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import category_encoders as ce

In [2]:
df=pd.read_csv('accident_data.csv')

In [3]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, shuffle=False)

In [4]:
df_train=df_train.drop(columns=['Accident_Index','Time'])
df_test=df_test.drop(columns=['Accident_Index','Time', 'Accident_Severity'])

# Imputing categorical and numeric columns

In [5]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [6]:
df_train_new = DataFrameImputer().fit_transform(df_train)
df_test_new = DataFrameImputer().fit_transform(df_test)

In [7]:
display(df_train_new.isnull().sum().sum())
display(df_test_new.isnull().sum().sum())

0

0

In [8]:
display(df_train.head())
display(df_train_new.head())

Unnamed: 0,1st_Road_Class,1st_Road_Number,2nd_Road_Class,2nd_Road_Number,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,Junction_Control,...,Pedestrian_Crossing-Physical_Facilities,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Speed_limit,Urban_or_Rural_Area,Weather_Conditions,Year,InScotland
0,A,3218.0,,0.0,Serious,,04/01/2005,Tuesday,1.0,Data missing or out of range,...,1.0,Metropolitan Police,Wet or damp,Single carriageway,,30,Urban,Raining no high winds,2005,No
1,B,450.0,C,0.0,Slight,,05/01/2005,Wednesday,1.0,Auto traffic signal,...,5.0,Metropolitan Police,Dry,Dual carriageway,,30,Urban,Fine no high winds,2005,No
2,C,0.0,,0.0,Slight,,06/01/2005,Thursday,1.0,Data missing or out of range,...,0.0,Metropolitan Police,Dry,Single carriageway,,30,Urban,Fine no high winds,2005,No
3,A,3220.0,,0.0,Slight,,07/01/2005,Friday,1.0,Data missing or out of range,...,0.0,Metropolitan Police,Dry,Single carriageway,,30,Urban,Fine no high winds,2005,No
4,,0.0,,0.0,Slight,,10/01/2005,Monday,1.0,Data missing or out of range,...,0.0,Metropolitan Police,Wet or damp,Single carriageway,,30,Urban,Fine no high winds,2005,No


Unnamed: 0,1st_Road_Class,1st_Road_Number,2nd_Road_Class,2nd_Road_Number,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,Junction_Control,...,Pedestrian_Crossing-Physical_Facilities,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Speed_limit,Urban_or_Rural_Area,Weather_Conditions,Year,InScotland
0,A,3218.0,Unclassified,0.0,Serious,,04/01/2005,Tuesday,1.0,Data missing or out of range,...,1.0,Metropolitan Police,Wet or damp,Single carriageway,,30,Urban,Raining no high winds,2005,No
1,B,450.0,C,0.0,Slight,,05/01/2005,Wednesday,1.0,Auto traffic signal,...,5.0,Metropolitan Police,Dry,Dual carriageway,,30,Urban,Fine no high winds,2005,No
2,C,0.0,Unclassified,0.0,Slight,,06/01/2005,Thursday,1.0,Data missing or out of range,...,0.0,Metropolitan Police,Dry,Single carriageway,,30,Urban,Fine no high winds,2005,No
3,A,3220.0,Unclassified,0.0,Slight,,07/01/2005,Friday,1.0,Data missing or out of range,...,0.0,Metropolitan Police,Dry,Single carriageway,,30,Urban,Fine no high winds,2005,No
4,A,0.0,Unclassified,0.0,Slight,,10/01/2005,Monday,1.0,Data missing or out of range,...,0.0,Metropolitan Police,Wet or damp,Single carriageway,,30,Urban,Fine no high winds,2005,No


# Separating categorical and numerical data

## For df_train_new

In [9]:
# example of a ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
# define data
df_train_new_cat = df_train_new[[ '1st_Road_Class', '2nd_Road_Class',
       'Carriageway_Hazards', 'Date', 'Day_of_Week',
       'Junction_Control', 'Junction_Detail', 'Light_Conditions',
       'Local_Authority_(District)', 'Local_Authority_(Highway)',
       'LSOA_of_Accident_Location', 'Police_Force', 'Road_Surface_Conditions',
       'Road_Type', 'Special_Conditions_at_Site',
       'Urban_or_Rural_Area', 'Weather_Conditions', 'InScotland']]

df_train_new_num= df_train_new[['Did_Police_Officer_Attend_Scene_of_Accident','1st_Road_Number',
                   '2nd_Road_Number','Location_Easting_OSGR','Location_Northing_OSGR',
                    'Pedestrian_Crossing-Human_Control','Pedestrian_Crossing-Physical_Facilities'
                               ]]
     

## For df_test_new

In [11]:
# example of a ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
# define data
df_test_new_cat = df_test_new[[ '1st_Road_Class', '2nd_Road_Class',
       'Carriageway_Hazards', 'Date', 'Day_of_Week',
       'Junction_Control', 'Junction_Detail', 'Light_Conditions',
       'Local_Authority_(District)', 'Local_Authority_(Highway)',
       'LSOA_of_Accident_Location', 'Police_Force', 'Road_Surface_Conditions',
       'Road_Type', 'Special_Conditions_at_Site',
       'Urban_or_Rural_Area', 'Weather_Conditions', 'InScotland']]

df_test_new_num= df_test_new[['Did_Police_Officer_Attend_Scene_of_Accident','1st_Road_Number',
                   '2nd_Road_Number','Location_Easting_OSGR','Location_Northing_OSGR',
                    'Pedestrian_Crossing-Human_Control','Pedestrian_Crossing-Physical_Facilities'
                              ]]
     

# Encoding categorical columns

## For df_train_new_cat

In [17]:
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
df_train_cat_encoded = encoder.fit_transform(df_train_new_cat)

In [31]:
df_train_cat_encoded=pd.DataFrame(df_train_cat_encoded)
df_train_cat_encoded.columns = [ '1st_Road_Class', '2nd_Road_Class',
       'Carriageway_Hazards', 'Date', 'Day_of_Week',
       'Junction_Control', 'Junction_Detail', 'Light_Conditions',
       'Local_Authority_(District)', 'Local_Authority_(Highway)',
       'LSOA_of_Accident_Location', 'Police_Force', 'Road_Surface_Conditions',
       'Road_Type', 'Special_Conditions_at_Site',
       'Urban_or_Rural_Area', 'Weather_Conditions', 'InScotland']
df_train_cat_encoded

Unnamed: 0,1st_Road_Class,2nd_Road_Class,Carriageway_Hazards,Date,Day_of_Week,Junction_Control,Junction_Detail,Light_Conditions,Local_Authority_(District),Local_Authority_(Highway),LSOA_of_Accident_Location,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Urban_or_Rural_Area,Weather_Conditions,InScotland
0,0.0,5.0,1.0,180.0,5.0,3.0,4.0,4.0,182.0,92.0,2826.0,29.0,4.0,3.0,5.0,2.0,5.0,0.0
1,2.0,3.0,1.0,240.0,6.0,2.0,0.0,1.0,182.0,92.0,2886.0,29.0,0.0,0.0,5.0,2.0,1.0,0.0
2,3.0,5.0,1.0,300.0,4.0,3.0,4.0,1.0,182.0,92.0,2834.0,29.0,0.0,3.0,5.0,2.0,1.0,0.0
3,0.0,5.0,1.0,360.0,0.0,3.0,4.0,4.0,182.0,92.0,2817.0,29.0,0.0,3.0,5.0,2.0,1.0,0.0
4,0.0,5.0,1.0,540.0,1.0,3.0,4.0,0.0,182.0,92.0,2840.0,29.0,4.0,3.0,5.0,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786426,0.0,0.0,1.0,714.0,4.0,4.0,8.0,1.0,142.0,69.0,8179.0,35.0,4.0,0.0,9.0,2.0,1.0,0.0
786427,0.0,0.0,1.0,1765.0,4.0,4.0,7.0,4.0,225.0,119.0,8238.0,35.0,0.0,2.0,5.0,2.0,1.0,0.0
786428,0.0,5.0,1.0,1014.0,5.0,3.0,4.0,4.0,142.0,69.0,8102.0,35.0,0.0,0.0,5.0,2.0,1.0,0.0
786429,2.0,5.0,1.0,1014.0,5.0,3.0,4.0,4.0,344.0,179.0,8635.0,35.0,0.0,3.0,5.0,2.0,1.0,0.0


## For df_test_new_cat

In [32]:
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
df_test_cat_encoded = encoder.fit_transform(df_test_new_cat)

In [33]:
df_test_cat_encoded=pd.DataFrame(df_test_cat_encoded)

df_test_cat_encoded.columns = [ '1st_Road_Class', '2nd_Road_Class',
       'Carriageway_Hazards', 'Date', 'Day_of_Week',
       'Junction_Control', 'Junction_Detail', 'Light_Conditions',
       'Local_Authority_(District)', 'Local_Authority_(Highway)',
       'LSOA_of_Accident_Location', 'Police_Force', 'Road_Surface_Conditions',
       'Road_Type', 'Special_Conditions_at_Site',
       'Urban_or_Rural_Area', 'Weather_Conditions', 'InScotland']
df_test_cat_encoded

Unnamed: 0,1st_Road_Class,2nd_Road_Class,Carriageway_Hazards,Date,Day_of_Week,Junction_Control,Junction_Detail,Light_Conditions,Local_Authority_(District),Local_Authority_(Highway),LSOA_of_Accident_Location,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Urban_or_Rural_Area,Weather_Conditions,InScotland
0,2.0,5.0,1.0,404.0,5.0,4.0,0.0,1.0,319.0,168.0,7439.0,35.0,4.0,0.0,5.0,1.0,5.0,0.0
1,2.0,5.0,1.0,20.0,3.0,3.0,3.0,4.0,218.0,119.0,7245.0,35.0,4.0,3.0,5.0,1.0,0.0,0.0
2,3.0,5.0,1.0,140.0,0.0,3.0,3.0,4.0,218.0,119.0,7245.0,35.0,0.0,3.0,5.0,1.0,1.0,0.0
3,2.0,5.0,1.0,140.0,0.0,4.0,8.0,4.0,241.0,132.0,25110.0,35.0,0.0,3.0,5.0,1.0,1.0,0.0
4,0.0,0.0,1.0,404.0,5.0,4.0,6.0,1.0,338.0,179.0,7479.0,35.0,0.0,3.0,5.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262139,0.0,5.0,1.0,411.0,4.0,3.0,3.0,4.0,163.0,84.0,10021.0,34.0,0.0,3.0,5.0,0.0,1.0,1.0
262140,0.0,5.0,1.0,483.0,3.0,3.0,3.0,3.0,163.0,84.0,10021.0,34.0,2.0,3.0,5.0,0.0,1.0,1.0
262141,0.0,5.0,1.0,531.0,5.0,4.0,8.0,4.0,163.0,84.0,10021.0,34.0,2.0,3.0,5.0,0.0,1.0,1.0
262142,0.0,5.0,1.0,531.0,5.0,4.0,8.0,3.0,163.0,84.0,10021.0,34.0,4.0,3.0,5.0,0.0,1.0,1.0


# Join categorical and numeric columns back together

## Reset Index

In [34]:
df_train_new_num.reset_index(inplace=True, drop=True)
df_train_cat_encoded.reset_index(inplace=True, drop=True)

df_test_new_num.reset_index(inplace=True, drop=True)
df_test_cat_encoded.reset_index(inplace=True, drop=True)

## Join

In [35]:
dftrain_final=df_train_new_num.join(df_train_cat_encoded)
accident_severity=df_train['Accident_Severity']
dftrain_final=dftrain_final.join(accident_severity)

dftrain_final

Unnamed: 0,Did_Police_Officer_Attend_Scene_of_Accident,1st_Road_Number,2nd_Road_Number,Location_Easting_OSGR,Location_Northing_OSGR,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,1st_Road_Class,2nd_Road_Class,Carriageway_Hazards,...,Local_Authority_(Highway),LSOA_of_Accident_Location,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Urban_or_Rural_Area,Weather_Conditions,InScotland,Accident_Severity
0,1.0,3218.0,0.0,525680.0,178240.0,0.0,1.0,0.0,5.0,1.0,...,92.0,2826.0,29.0,4.0,3.0,5.0,2.0,5.0,0.0,Serious
1,1.0,450.0,0.0,524170.0,181650.0,0.0,5.0,2.0,3.0,1.0,...,92.0,2886.0,29.0,0.0,0.0,5.0,2.0,1.0,0.0,Slight
2,1.0,0.0,0.0,524520.0,182240.0,0.0,0.0,3.0,5.0,1.0,...,92.0,2834.0,29.0,0.0,3.0,5.0,2.0,1.0,0.0,Slight
3,1.0,3220.0,0.0,526900.0,177530.0,0.0,0.0,0.0,5.0,1.0,...,92.0,2817.0,29.0,0.0,3.0,5.0,2.0,1.0,0.0,Slight
4,1.0,0.0,0.0,528060.0,179040.0,0.0,0.0,0.0,5.0,1.0,...,92.0,2840.0,29.0,4.0,3.0,5.0,2.0,1.0,0.0,Slight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786426,2.0,184.0,1.0,423290.0,561490.0,0.0,0.0,0.0,0.0,1.0,...,69.0,8179.0,35.0,4.0,0.0,9.0,2.0,1.0,0.0,Slight
786427,2.0,188.0,1058.0,427050.0,566560.0,0.0,0.0,0.0,0.0,1.0,...,119.0,8238.0,35.0,0.0,2.0,5.0,2.0,1.0,0.0,Slight
786428,2.0,1.0,0.0,419430.0,563780.0,0.0,0.0,0.0,5.0,1.0,...,69.0,8102.0,35.0,0.0,0.0,5.0,2.0,1.0,0.0,Slight
786429,1.0,1260.0,0.0,433920.0,548800.0,0.0,0.0,2.0,5.0,1.0,...,179.0,8635.0,35.0,0.0,3.0,5.0,2.0,1.0,0.0,Slight


In [36]:
dftest_final=df_test_new_num.join(df_test_cat_encoded)
dftest_final

Unnamed: 0,Did_Police_Officer_Attend_Scene_of_Accident,1st_Road_Number,2nd_Road_Number,Location_Easting_OSGR,Location_Northing_OSGR,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,1st_Road_Class,2nd_Road_Class,Carriageway_Hazards,...,Local_Authority_(District),Local_Authority_(Highway),LSOA_of_Accident_Location,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Urban_or_Rural_Area,Weather_Conditions,InScotland
0,1.0,1516.0,336.0,432710.0,563070.0,0.0,0.0,2.0,5.0,1.0,...,319.0,168.0,7439.0,35.0,4.0,0.0,5.0,1.0,5.0,0.0
1,1.0,1600.0,0.0,424390.0,563600.0,0.0,1.0,2.0,5.0,1.0,...,218.0,119.0,7245.0,35.0,4.0,3.0,5.0,1.0,0.0,0.0
2,1.0,144.0,0.0,425010.0,564110.0,0.0,0.0,3.0,5.0,1.0,...,218.0,119.0,7245.0,35.0,0.0,3.0,5.0,1.0,1.0,0.0
3,2.0,1328.0,9511.0,431070.0,581390.0,0.0,0.0,2.0,5.0,1.0,...,241.0,132.0,25110.0,35.0,0.0,3.0,5.0,1.0,1.0,0.0
4,2.0,1018.0,1018.0,440100.0,557260.0,0.0,0.0,0.0,0.0,1.0,...,338.0,179.0,7479.0,35.0,0.0,3.0,5.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262139,1.0,95.0,0.0,311830.0,832450.0,0.0,0.0,0.0,5.0,1.0,...,163.0,84.0,10021.0,34.0,0.0,3.0,5.0,0.0,1.0,1.0
262140,1.0,95.0,0.0,290880.0,817170.0,0.0,0.0,0.0,5.0,1.0,...,163.0,84.0,10021.0,34.0,2.0,3.0,5.0,0.0,1.0,1.0
262141,1.0,96.0,0.0,288730.0,856520.0,0.0,0.0,0.0,5.0,1.0,...,163.0,84.0,10021.0,34.0,2.0,3.0,5.0,0.0,1.0,1.0
262142,1.0,9.0,0.0,289940.0,815260.0,0.0,0.0,0.0,5.0,1.0,...,163.0,84.0,10021.0,34.0,4.0,3.0,5.0,0.0,1.0,1.0


# Save preprocessed data for modeling

In [37]:
dftrain_final.to_csv('train_preprocessed.csv', index=False)
dftest_final.to_csv('test_preprocessed.csv', index=False)