# Data Preprocessing

## Load dataset

In [1]:
import numpy as np
import pandas as pd

In [258]:
# Read the raw dataset from a CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv('RTA Dataset.csv')

In [259]:
# Features are every column except the last one.
X = df.iloc[:, :-1]
# Target is the last column; indexing with a list ([-1]) keeps y as a
# one-column DataFrame instead of a Series.
y = df.iloc[:, [-1]]

In [282]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=61)

## Preprocessing

In [300]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer

# Rank encodings for the ordinal categorical features (raw value -> rank).
# Each list below is written in ascending rank order, starting at 0.
map_dicts = {
    'Day_of_week': dict(zip(
        ['Monday', "Tuesday", 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        [1, 2, 3, 4, 5, 6, 0],
    )),
    'Age_band_of_driver': {
        label: rank
        for rank, label in enumerate(['Under 18', '18-30', '31-50', 'Over 51'])
    },
    'Driving_experience': {
        label: rank
        for rank, label in enumerate(
            ['No Licence', 'Below 1yr', '1-2yr', '2-5yr', '5-10yr', 'Above 10yr']
        )
    },
    'Service_year_of_vehicle': {
        label: rank
        for rank, label in enumerate(
            ['Below 1yr', '1-2yr', '2-5yr', '5-10yr', 'Above 10yr']
        )
    },
    'Age_band_of_casualty': {
        label: rank
        for rank, label in enumerate(['5', 'Under 18', '18-30', '31-50', 'Over 51'])
    },
    'Casualty_severity': {'1': 1, '2': 2, '3': 3},
    'Defect_of_vehicle': {'No defect': 0, '5': 5, '7': 7},
}

# The ordinal columns are exactly the keys of map_dicts, in declaration order.
cat_col_with_order = list(map_dicts)

# Non-object columns of the training frame as it currently stands.
num_cols = X_train.select_dtypes(exclude='object').columns

# Everything that is neither ordinal nor numeric. Built from sets, so the
# order of the resulting list is arbitrary.
cat_col_without_order = list(set(X_train.columns) - set(cat_col_with_order) - set(num_cols))

def preprocessing(df, keep_index=False):
    """Clean, encode and KNN-impute a raw feature frame.

    The frame passed in is not modified; every step works on a new object.

    Steps
    -----
    1. Treat the placeholder strings 'Unknown' and 'na' as missing (NaN).
    2. Drop the 'Work_of_casuality' column.
    3. Reduce 'Time' to its hour component.
    4. Replace each column listed in ``cat_col_with_order`` by its rank from
       ``map_dicts``; values that are not keys of the mapping become NaN.
    5. Encode every remaining object column as the position of each value
       among that column's sorted distinct values, leaving NaN as NaN.
    6. Fill all NaN entries with ``KNNImputer`` (5 neighbours, uniform
       weights, 'nan_euclidean' metric).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features. Must contain 'Time', 'Work_of_casuality' and every
        column named in ``cat_col_with_order``.
    keep_index : bool, default False
        When False (the historical behaviour) the result gets a fresh
        0..n-1 index. Pass True to keep ``df``'s row labels, which is what
        you want if the result will later be index-aligned with a target
        that came out of the same ``train_test_split`` call.

    Returns
    -------
    pandas.DataFrame
        All-float frame with the label-encoded columns first, followed by
        the columns that were already numeric.

    Raises
    ------
    KeyError
        If 'Work_of_casuality' or one of the ordered columns is missing.

    Notes
    -----
    * Category codes and the imputer are fitted on the frame given to each
      call, so codes produced by separate calls (e.g. train vs. test) are
      only comparable if both frames contain the same set of categories.
    * Imputed values are computed from neighbouring rows and can be
      fractional even for label-encoded columns.
    """
    # Normalise both "missing" placeholders in one pass.
    df = df.replace(['Unknown', 'na'], np.nan)
    df = df.drop(columns=['Work_of_casuality'])
    df['Time'] = pd.to_datetime(df['Time']).dt.hour

    # Ordinal features: explicit rank lookup (unlisted values -> NaN).
    for col in cat_col_with_order:
        df[col] = df[col].map(map_dicts[col])

    cat_cols = df.select_dtypes(include='object').columns
    num_cols = df.select_dtypes(exclude='object').columns

    # Nominal features: pandas categorical codes follow the sorted distinct
    # values and mark missing entries with -1, so NaN is identified by an
    # explicit sentinel rather than by guessing which code an encoder gave it.
    code_columns = []
    for col in cat_cols:
        codes = df[col].astype('category').cat.codes.to_numpy()
        code_columns.append(np.where(codes == -1, np.nan, codes))

    if code_columns:
        cat_labels = np.column_stack(code_columns)
    else:
        # No object columns left: keep a 2-D (n_rows, 0) block so the
        # concatenation below still works.
        cat_labels = np.empty((len(df), 0))

    label_encoded = np.concatenate((cat_labels, df[num_cols].to_numpy()), axis=1)

    knn_imputer = KNNImputer(n_neighbors=5,
                             weights="uniform",
                             metric='nan_euclidean')
    imputed_data = knn_imputer.fit_transform(label_encoded)

    # Column order mirrors the concatenation above: encoded first, numeric after.
    df_clean = pd.DataFrame(
        imputed_data,
        index=df.index if keep_index else None,
        columns=cat_cols.append(num_cols),
    )

    return df_clean

In [301]:
preprocessing(X_train)

Unnamed: 0,Sex_of_driver,Educational_level,Vehicle_driver_relation,Type_of_vehicle,Owner_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,Types_of_Junction,Road_surface_type,...,Time,Day_of_week,Age_band_of_driver,Driving_experience,Service_year_of_vehicle,Defect_of_vehicle,Number_of_vehicles_involved,Number_of_casualties,Age_band_of_casualty,Casualty_severity
0,1.0,4.0,0.0,0.0,3.0,11.0,4.0,5.0,6.0,0.0,...,12.0,1.0,1.0,5.0,2.8,0.0,3.0,1.0,3.0,3.0
1,1.0,4.0,0.0,7.4,3.0,6.0,2.0,7.0,1.0,0.0,...,17.0,5.0,2.0,5.0,1.4,0.0,2.0,1.0,4.0,3.0
2,1.0,4.0,0.0,15.0,3.0,9.0,4.0,1.0,6.0,0.0,...,22.0,6.0,1.0,4.0,2.8,0.0,2.0,2.0,1.0,3.0
3,1.0,1.0,0.0,0.0,0.0,2.0,5.0,3.0,6.0,0.0,...,23.0,0.0,1.0,4.0,1.0,0.0,3.0,4.0,2.0,3.0
4,1.0,4.0,2.0,0.0,3.0,8.0,5.0,5.0,1.0,0.0,...,17.0,2.0,0.0,4.0,1.6,0.0,2.0,2.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8616,1.0,4.0,0.0,8.0,3.0,3.0,2.0,5.0,1.0,0.0,...,17.0,5.0,1.0,4.0,4.0,0.0,2.0,1.0,2.0,3.0
8617,1.0,2.8,0.0,5.0,3.0,7.0,2.0,5.0,1.0,0.0,...,17.0,1.0,3.0,2.2,2.8,0.0,2.0,1.0,2.2,2.8
8618,1.0,4.0,0.0,0.0,3.0,7.0,4.0,5.0,6.0,0.0,...,17.0,0.0,2.0,4.0,2.8,0.0,2.0,1.0,2.0,3.0
8619,1.0,4.0,0.0,9.0,3.0,7.0,2.0,5.0,6.0,2.0,...,9.0,4.0,1.0,5.0,2.6,0.0,2.0,1.0,3.0,3.0


In [263]:
# Step-by-step version of the first lines of preprocessing(), applied to
# X_train directly: placeholders -> NaN, drop one column, Time -> hour.
# NOTE(review): X_train is reassigned here, so this cell is not idempotent;
# a second run should fail at the drop because the column is already gone.
X_train = X_train.replace('Unknown', np.nan)
X_train = X_train.replace('na', np.nan)
X_train = X_train.drop(['Work_of_casuality'], axis=1)
X_train.Time = pd.to_datetime(X_train.Time).dt.hour

In [303]:
# Rank encodings for the ordinal categorical features (raw value -> rank).
# Each list below is written in ascending rank order, starting at 0.
map_dicts = {
    'Day_of_week': dict(zip(
        ['Monday', "Tuesday", 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        [1, 2, 3, 4, 5, 6, 0],
    )),
    'Age_band_of_driver': {
        label: rank
        for rank, label in enumerate(['Under 18', '18-30', '31-50', 'Over 51'])
    },
    'Driving_experience': {
        label: rank
        for rank, label in enumerate(
            ['No Licence', 'Below 1yr', '1-2yr', '2-5yr', '5-10yr', 'Above 10yr']
        )
    },
    'Service_year_of_vehicle': {
        label: rank
        for rank, label in enumerate(
            ['Below 1yr', '1-2yr', '2-5yr', '5-10yr', 'Above 10yr']
        )
    },
    'Age_band_of_casualty': {
        label: rank
        for rank, label in enumerate(['5', 'Under 18', '18-30', '31-50', 'Over 51'])
    },
    'Casualty_severity': {'1': 1, '2': 2, '3': 3},
    'Defect_of_vehicle': {'No defect': 0, '5': 5, '7': 7},
}

# The ordinal columns are exactly the keys of map_dicts, in declaration order.
cat_col_with_order = list(map_dicts)

In [306]:
list(X_train.columns)

['Time',
 'Day_of_week',
 'Age_band_of_driver',
 'Sex_of_driver',
 'Educational_level',
 'Vehicle_driver_relation',
 'Driving_experience',
 'Type_of_vehicle',
 'Owner_of_vehicle',
 'Service_year_of_vehicle',
 'Defect_of_vehicle',
 'Area_accident_occured',
 'Lanes_or_Medians',
 'Road_allignment',
 'Types_of_Junction',
 'Road_surface_type',
 'Road_surface_conditions',
 'Light_conditions',
 'Weather_conditions',
 'Type_of_collision',
 'Number_of_vehicles_involved',
 'Number_of_casualties',
 'Vehicle_movement',
 'Casualty_class',
 'Sex_of_casualty',
 'Age_band_of_casualty',
 'Casualty_severity',
 'Work_of_casuality',
 'Fitness_of_casuality',
 'Pedestrian_movement',
 'Cause_of_accident']

In [265]:
# Replace every ordinal column by its rank from map_dicts; values that are
# not keys of the mapping (including NaN) come out as NaN.
# The list of ordinal columns is named cat_col_with_order in this notebook.
for col in cat_col_with_order:
    X_train[col] = X_train[col].map(map_dicts[col])

In [266]:
X_train

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident
10069,12,1,1.0,Male,Junior high school,Employee,5.0,Automobile,Owner,,...,3,1,Going straight,,,,,Normal,Not a Pedestrian,Changing lane to the left
2894,17,5,2.0,Male,Junior high school,Employee,5.0,,Owner,,...,2,1,Going straight,Driver or rider,Female,4.0,3.0,,Not a Pedestrian,No priority to vehicle
5268,22,6,1.0,Male,Junior high school,Employee,4.0,Taxi,Owner,,...,2,2,Going straight,Pedestrian,Female,1.0,3.0,,Not a Pedestrian,No distancing
3160,23,0,1.0,Male,Elementary school,Employee,4.0,Automobile,Governmental,1.0,...,3,4,Reversing,Driver or rider,Male,2.0,3.0,Normal,Not a Pedestrian,Changing lane to the left
11707,17,2,0.0,Male,Junior high school,Owner,4.0,Automobile,Owner,,...,2,2,Moving Backward,,,,,Normal,Not a Pedestrian,No distancing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6654,17,5,1.0,Male,Junior high school,Employee,4.0,Pick up upto 10Q,Owner,4.0,...,2,1,Going straight,Driver or rider,Male,2.0,3.0,Normal,Not a Pedestrian,Driving carelessly
10256,17,1,3.0,Male,,Employee,,Lorry (41?100Q),Owner,,...,2,1,Going straight,,,,,Normal,Not a Pedestrian,Overtaking
12194,17,0,2.0,Male,Junior high school,Employee,4.0,Automobile,,,...,2,1,U-Turn,Driver or rider,Male,2.0,3.0,Normal,Not a Pedestrian,No priority to vehicle
3602,9,4,1.0,Male,Junior high school,Employee,5.0,Public (12 seats),Owner,,...,2,1,Going straight,Pedestrian,Female,3.0,3.0,Normal,Crossing from offside - masked by parked or s...,Moving Backward


In [267]:
from sklearn.preprocessing import LabelEncoder

In [268]:
# Split by dtype: object columns still have to be label-encoded, while the
# non-object columns are taken as a plain NumPy array.
cat_cols = X_train.select_dtypes(include='object').columns
num_vals = X_train.select_dtypes(exclude='object').to_numpy()

In [269]:
cat_cols

Index(['Sex_of_driver', 'Educational_level', 'Vehicle_driver_relation',
       'Type_of_vehicle', 'Owner_of_vehicle', 'Area_accident_occured',
       'Lanes_or_Medians', 'Road_allignment', 'Types_of_Junction',
       'Road_surface_type', 'Road_surface_conditions', 'Light_conditions',
       'Weather_conditions', 'Type_of_collision', 'Vehicle_movement',
       'Casualty_class', 'Sex_of_casualty', 'Fitness_of_casuality',
       'Pedestrian_movement', 'Cause_of_accident'],
      dtype='object')

In [270]:
num_vals.shape

(8621, 10)

In [271]:
# Label-encode each object column separately and collect the code arrays.
label_lst = []
for col in cat_cols:
    # nunique() ignores NaN, so this is the number of real categories.
    nan_label = X_train[col].nunique()
    le = LabelEncoder()
    labels = le.fit_transform(X_train[col])
    # Turn the code equal to nan_label back into NaN so the imputer sees it
    # as missing. NOTE(review): this relies on LabelEncoder giving NaN the
    # code right after the last real category — confirm for the installed
    # scikit-learn version.
    labels = np.where(labels==nan_label, np.nan, labels)
    label_lst.append(labels)

In [272]:
# Stack the per-column code arrays and transpose so rows are samples and
# columns are features.
cat_labels = np.array(label_lst).transpose()

In [273]:
cat_labels.shape

(8621, 20)

In [274]:
num_vals.shape

(8621, 10)

In [245]:
# Side-by-side join: label-encoded columns first, numeric columns after.
# This fixes the column order that the names must follow later on.
X_train_label_encoded = np.concatenate((cat_labels, num_vals), axis=1)

In [246]:
pd.DataFrame(X_train_label_encoded)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,1.0,4.0,0.0,0.0,3.0,11.0,4.0,5.0,6.0,0.0,...,12.0,1.0,1.0,5.0,,0.0,3.0,1.0,,
1,1.0,4.0,0.0,,3.0,6.0,2.0,7.0,1.0,0.0,...,17.0,5.0,2.0,5.0,,,2.0,1.0,4.0,3.0
2,1.0,4.0,0.0,15.0,3.0,9.0,4.0,1.0,6.0,0.0,...,22.0,6.0,1.0,4.0,,0.0,2.0,2.0,1.0,3.0
3,1.0,1.0,0.0,0.0,0.0,2.0,5.0,3.0,6.0,0.0,...,23.0,0.0,1.0,4.0,1.0,0.0,3.0,4.0,2.0,3.0
4,1.0,4.0,2.0,0.0,3.0,8.0,5.0,5.0,1.0,0.0,...,17.0,2.0,0.0,4.0,,,2.0,2.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8616,1.0,4.0,0.0,8.0,3.0,3.0,2.0,5.0,1.0,0.0,...,17.0,5.0,1.0,4.0,4.0,0.0,2.0,1.0,2.0,3.0
8617,1.0,,0.0,5.0,3.0,7.0,2.0,5.0,1.0,0.0,...,17.0,1.0,3.0,,,,2.0,1.0,,
8618,1.0,4.0,0.0,0.0,,7.0,4.0,5.0,6.0,0.0,...,17.0,0.0,2.0,4.0,,,2.0,1.0,2.0,3.0
8619,1.0,4.0,0.0,9.0,3.0,7.0,2.0,5.0,6.0,2.0,...,9.0,4.0,1.0,5.0,,,2.0,1.0,3.0,3.0


In [247]:
from sklearn.impute import KNNImputer
# Impute each missing entry from the 5 nearest rows, all weighted equally;
# the 'nan_euclidean' metric lets distances be computed between rows that
# themselves contain NaNs.
knn_imputer = KNNImputer(n_neighbors=5, 
                         weights="uniform",  
                         metric='nan_euclidean')
# Fit and transform on the same training matrix.
knn_imputer.fit(X_train_label_encoded)
knn_results = knn_imputer.transform(X_train_label_encoded)

In [248]:
# Wrap the imputed array in a DataFrame; it gets default integer row and
# column labels here, and the column names are assigned in a later cell.
X_train_clean = pd.DataFrame(knn_results)

In [249]:
X_train_clean

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,1.0,4.0,0.0,0.0,3.0,11.0,4.0,5.0,6.0,0.0,...,12.0,1.0,1.0,5.0,2.8,0.0,3.0,1.0,3.0,3.0
1,1.0,4.0,0.0,7.4,3.0,6.0,2.0,7.0,1.0,0.0,...,17.0,5.0,2.0,5.0,1.4,0.0,2.0,1.0,4.0,3.0
2,1.0,4.0,0.0,15.0,3.0,9.0,4.0,1.0,6.0,0.0,...,22.0,6.0,1.0,4.0,2.8,0.0,2.0,2.0,1.0,3.0
3,1.0,1.0,0.0,0.0,0.0,2.0,5.0,3.0,6.0,0.0,...,23.0,0.0,1.0,4.0,1.0,0.0,3.0,4.0,2.0,3.0
4,1.0,4.0,2.0,0.0,3.0,8.0,5.0,5.0,1.0,0.0,...,17.0,2.0,0.0,4.0,1.6,0.0,2.0,2.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8616,1.0,4.0,0.0,8.0,3.0,3.0,2.0,5.0,1.0,0.0,...,17.0,5.0,1.0,4.0,4.0,0.0,2.0,1.0,2.0,3.0
8617,1.0,2.8,0.0,5.0,3.0,7.0,2.0,5.0,1.0,0.0,...,17.0,1.0,3.0,2.2,2.8,0.0,2.0,1.0,2.2,2.8
8618,1.0,4.0,0.0,0.0,3.0,7.0,4.0,5.0,6.0,0.0,...,17.0,0.0,2.0,4.0,2.8,0.0,2.0,1.0,2.0,3.0
8619,1.0,4.0,0.0,9.0,3.0,7.0,2.0,5.0,6.0,2.0,...,9.0,4.0,1.0,5.0,2.6,0.0,2.0,1.0,3.0,3.0


In [275]:
# Rebuild the column names in the same order the arrays were concatenated:
# object (label-encoded) columns first, then the non-object ones.
cat_cols = X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(exclude='object').columns
cols = np.concatenate((cat_cols, num_cols))

In [278]:
# Attach the reconstructed column names to the imputed frame.
X_train_clean.columns = cols

In [279]:
X_train_clean

Unnamed: 0,Sex_of_driver,Educational_level,Vehicle_driver_relation,Type_of_vehicle,Owner_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,Types_of_Junction,Road_surface_type,...,Time,Day_of_week,Age_band_of_driver,Driving_experience,Service_year_of_vehicle,Defect_of_vehicle,Number_of_vehicles_involved,Number_of_casualties,Age_band_of_casualty,Casualty_severity
0,1.0,4.0,0.0,0.0,3.0,11.0,4.0,5.0,6.0,0.0,...,12.0,1.0,1.0,5.0,2.8,0.0,3.0,1.0,3.0,3.0
1,1.0,4.0,0.0,7.4,3.0,6.0,2.0,7.0,1.0,0.0,...,17.0,5.0,2.0,5.0,1.4,0.0,2.0,1.0,4.0,3.0
2,1.0,4.0,0.0,15.0,3.0,9.0,4.0,1.0,6.0,0.0,...,22.0,6.0,1.0,4.0,2.8,0.0,2.0,2.0,1.0,3.0
3,1.0,1.0,0.0,0.0,0.0,2.0,5.0,3.0,6.0,0.0,...,23.0,0.0,1.0,4.0,1.0,0.0,3.0,4.0,2.0,3.0
4,1.0,4.0,2.0,0.0,3.0,8.0,5.0,5.0,1.0,0.0,...,17.0,2.0,0.0,4.0,1.6,0.0,2.0,2.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8616,1.0,4.0,0.0,8.0,3.0,3.0,2.0,5.0,1.0,0.0,...,17.0,5.0,1.0,4.0,4.0,0.0,2.0,1.0,2.0,3.0
8617,1.0,2.8,0.0,5.0,3.0,7.0,2.0,5.0,1.0,0.0,...,17.0,1.0,3.0,2.2,2.8,0.0,2.0,1.0,2.2,2.8
8618,1.0,4.0,0.0,0.0,3.0,7.0,4.0,5.0,6.0,0.0,...,17.0,0.0,2.0,4.0,2.8,0.0,2.0,1.0,2.0,3.0
8619,1.0,4.0,0.0,9.0,3.0,7.0,2.0,5.0,6.0,2.0,...,9.0,4.0,1.0,5.0,2.6,0.0,2.0,1.0,3.0,3.0


In [325]:
# Columns that are neither ordinal nor numeric, i.e. the nominal categoricals.
# Built with set arithmetic, so the order of the resulting list is arbitrary.
# NOTE(review): the result depends on the current state of X_train and
# num_cols in the kernel — re-check after a fresh top-to-bottom run.
cat_col_without_order = list(set(X_train.columns) - set(cat_col_with_order) -  set(num_cols))

In [327]:
cat_col_without_order

['Type_of_collision',
 'Pedestrian_movement',
 'Weather_conditions',
 'Casualty_class',
 'Sex_of_casualty',
 'Road_surface_conditions',
 'Sex_of_driver',
 'Vehicle_movement',
 'Work_of_casuality',
 'Types_of_Junction',
 'Type_of_vehicle',
 'Vehicle_driver_relation',
 'Light_conditions',
 'Educational_level',
 'Road_allignment',
 'Cause_of_accident',
 'Fitness_of_casuality',
 'Road_surface_type',
 'Owner_of_vehicle',
 'Lanes_or_Medians',
 'Area_accident_occured']