# Data Preprocessing

## Load dataset

In [506]:
import numpy as np
import pandas as pd

In [507]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('RTA Dataset.csv')
X = df.iloc[:, :-1]
y = df.iloc[:, [-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=61)

## Encode and Impute

In [508]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.impute import KNNImputer
from fancyimpute import KNN

encoder = OrdinalEncoder()

cat_col_with_order = ['Day_of_week', 'Age_band_of_driver', 'Driving_experience', 
                  'Service_year_of_vehicle', 'Age_band_of_casualty', 
                  'Casualty_severity', 'Defect_of_vehicle']

num_cols = X_train.select_dtypes(exclude='object').columns

cat_col_without_order = list(set(X_train.columns) - set(cat_col_with_order) -  set(num_cols))

map_dicts = {
    'Day_of_week': {'Monday': 1, "Tuesday": 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 0},
    'Age_band_of_driver': {'Under 18': 0, '18-30': 1, '31-50': 2, 'Over 51': 3},
    'Driving_experience': {'No Licence': 0, 'Below 1yr': 1, '1-2yr': 2, '2-5yr': 3, '5-10yr': 4, 'Above 10yr': 5},
    'Service_year_of_vehicle': {'Below 1yr': 0, '1-2yr': 1, '2-5yr': 2, '5-10yr': 3, 'Above 10yr': 4},
    'Age_band_of_casualty': {'5': 0, 'Under 18': 1, '18-30': 2, '31-50': 3, 'Over 51': 4},
    'Casualty_severity': {'1': 1, '2': 2, '3': 3},
    'Defect_of_vehicle': {'No defect': 0, '5': 5, '7': 7}
}


def preprocess(df):
    df = df.replace('Unknown', np.nan)
    df = df.replace('na', np.nan)
    df = df.drop(['Work_of_casuality'], axis=1)    
    df.Time = pd.to_datetime(df.Time).dt.hour

    for col in cat_col_with_order:
         df[col] = df[col].map(map_dicts[col])
    
#     cat_cols = df.select_dtypes(include='object').columns    
#     num_cols = df.select_dtypes(exclude='object').columns    
#     num_vals = df.select_dtypes(exclude='object').to_numpy()
    return df

def encode(df):
    '''function to encode non-null data and replace it in the original data'''    
    
    df_enc = df.copy()
    cat_cols = df_enc.select_dtypes(include='object').columns    
    
    for col in cat_cols:
        #retains only non-null values
        nonulls = np.array(df_enc[col].dropna())
        #reshapes the data for encoding
        impute_reshape = nonulls.reshape(-1,1)
        #encode date
        impute_ordinal = encoder.fit_transform(impute_reshape)
        #Assign back encoded values to non-null values
        df_enc.loc[df_enc[col].notnull(), col] = np.squeeze(impute_ordinal)
    return df_enc

def impute(df):
    return pd.DataFrame(np.round(imputer.fit_transform(df)), columns = df.columns)


In [509]:
X_train = preprocess(X_train)

In [510]:
X_train = encode(X_train)

In [511]:
X_train = impute(X_train)

Imputing row 1/8621 with 5 missing, elapsed time: 15.656
Imputing row 101/8621 with 3 missing, elapsed time: 15.672
Imputing row 201/8621 with 6 missing, elapsed time: 15.690
Imputing row 301/8621 with 5 missing, elapsed time: 15.708
Imputing row 401/8621 with 1 missing, elapsed time: 15.725
Imputing row 501/8621 with 3 missing, elapsed time: 15.743
Imputing row 601/8621 with 6 missing, elapsed time: 15.759
Imputing row 701/8621 with 9 missing, elapsed time: 15.776
Imputing row 801/8621 with 0 missing, elapsed time: 15.796
Imputing row 901/8621 with 6 missing, elapsed time: 15.813
Imputing row 1001/8621 with 2 missing, elapsed time: 15.831
Imputing row 1101/8621 with 3 missing, elapsed time: 15.852
Imputing row 1201/8621 with 6 missing, elapsed time: 15.871
Imputing row 1301/8621 with 5 missing, elapsed time: 15.889
Imputing row 1401/8621 with 3 missing, elapsed time: 15.908
Imputing row 1501/8621 with 3 missing, elapsed time: 15.926
Imputing row 1601/8621 with 0 missing, elapsed time:

In [512]:
X_train

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident
0,12.0,1.0,1.0,1.0,4.0,0.0,5.0,0.0,3.0,3.0,...,3.0,1.0,2.0,0.0,1.0,3.0,3.0,2.0,5.0,0.0
1,17.0,5.0,2.0,1.0,4.0,0.0,5.0,7.0,3.0,1.0,...,2.0,1.0,2.0,0.0,0.0,4.0,3.0,2.0,5.0,12.0
2,22.0,6.0,1.0,1.0,4.0,0.0,4.0,15.0,3.0,3.0,...,2.0,2.0,2.0,2.0,0.0,1.0,3.0,2.0,5.0,10.0
3,23.0,0.0,1.0,1.0,1.0,0.0,4.0,0.0,0.0,1.0,...,3.0,4.0,7.0,0.0,1.0,2.0,3.0,2.0,5.0,0.0
4,17.0,2.0,0.0,1.0,4.0,2.0,4.0,0.0,3.0,1.0,...,2.0,2.0,3.0,2.0,0.0,2.0,3.0,2.0,5.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8616,17.0,5.0,1.0,1.0,4.0,0.0,4.0,8.0,3.0,4.0,...,2.0,1.0,2.0,0.0,1.0,2.0,3.0,2.0,5.0,3.0
8617,17.0,1.0,3.0,1.0,3.0,0.0,2.0,5.0,3.0,3.0,...,2.0,1.0,2.0,0.0,1.0,2.0,3.0,2.0,5.0,16.0
8618,17.0,0.0,2.0,1.0,4.0,0.0,4.0,0.0,3.0,3.0,...,2.0,1.0,10.0,0.0,1.0,2.0,3.0,2.0,5.0,12.0
8619,9.0,4.0,1.0,1.0,4.0,0.0,5.0,9.0,3.0,3.0,...,2.0,1.0,2.0,2.0,0.0,3.0,3.0,2.0,2.0,9.0
