In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## Road traffic collision data, 2012

In [2]:
# Location of the raw Excel workbook, relative to the notebook's working directory.
# NOTE(review): the file name refers to 2012, while the frame loaded from it is
# named `data_2015_5` — confirm which year is intended.
path = "./data/road_traffic_collisions/road-traffic-collisions-2012-5.xls"

In [3]:
# Load the workbook at `path` into a DataFrame and preview the first rows.
data_2015_5 = pd.read_excel(path)
data_2015_5.head()

Unnamed: 0,Reference,Date,Year,Month,Day,Day.1,Time,Severity,Road_Class,Main_rd_no,...,Vehicle_2_Location,Vehicle_ 2_Skidding,Vehicle_3_Type,V3_Manouvres,V3_From_Direction,V3_To_Direction,V3_Location,V3_Skidding,Easting,Northing
0,2412,20120103,2012,1,3,Tuesday,1140,Slight,A,1198,...,Approaching jnctn or waiting/parked,Skidded,,,,,,,525250.3192,269592.7491
1,2612,20120103,2012,1,3,Tuesday,1244,Slight,A,505,...,"Not at,or within 20m of, junction",Did not skid,,,,,,,546715.1485,246747.0387
2,2712,20120103,2012,1,3,Tuesday,1317,Serious,A,1,...,,,,,,,,,518665.261,264848.0873
3,2912,20121221,2012,12,21,Friday,15,Serious,U,0,...,Mid jnctn - on rbout or main road,Did not skid,,,,,,,519514.8238,299836.222
4,3112,20120103,2012,1,3,Tuesday,1745,Slight,U,0,...,,,,,,,,,542037.4532,296531.1881


Number of samples and features of raw data

In [4]:
# (number of rows, number of columns) of the raw frame
data_2015_5.shape

(2319, 54)

Number of duplicated samples

In [5]:
# Count rows that are exact duplicates of an earlier row.
# (`index.duplicated()` only inspects the row labels, which are always unique
# for the default RangeIndex produced by `read_excel`, so it cannot detect
# duplicated samples.)
data_2015_5.duplicated().sum()

0

Number of missing values of each feature

In [6]:
# Number of missing (NaN) entries in each column
data_2015_5.isna().sum()

Reference                      0
Date                           0
Year                           0
Month                          0
Day                            0
Day.1                          0
Time                           0
Severity                       0
Road_Class                     0
Main_rd_no                     0
Junction Detail                0
Light                          0
Weather                        0
Surface                        0
Speed_limit                    0
Cycle                          0
TWMV                           0
Child                          0
Child_Pedestrian               0
Ped                            0
Skid                           0
HGV                            0
LGV                            0
PSV                            0
School Journey                 0
Unnamed: 25                    0
Pedestrian-1_Sex            2088
Pedestrian_1_Age               0
Pedestrian_1_Severity       2088
Pedestria_1_Location        2092
Pedestrian

Some properties of numeric data

In [7]:
# Summary statistics of the numeric columns. The columns are selected with the
# public `select_dtypes` API instead of the private `_get_numeric_data` helper,
# which is not part of the documented pandas interface.
data_2015_5.select_dtypes(include='number').describe()

Unnamed: 0,Reference,Date,Year,Month,Day,Time,Main_rd_no,Speed_limit,Unnamed: 25,Pedestrian_1_Age,Number_Vehicles,Easting,Northing
count,2319.0,2319.0,2319.0,2319.0,2319.0,2319.0,2319.0,2319.0,2319.0,2319.0,2319.0,2319.0,2319.0
mean,317518.511427,20120700.0,2012.0,6.875377,15.782665,1356.081501,465.569642,43.691246,0.102199,3.079776,1.862872,533668.064991,278766.313129
std,176438.845667,345.1519,0.0,3.453561,8.763021,489.316355,522.974118,16.037635,0.31416,11.60807,0.711787,14449.117584,19654.708555
min,112.0,20120100.0,2012.0,1.0,1.0,0.0,0.0,20.0,0.0,0.0,1.0,502159.3082,238872.9626
25%,167012.0,20120420.0,2012.0,4.0,8.0,940.0,11.0,30.0,0.0,0.0,1.0,519374.21665,259722.40445
50%,326412.0,20120720.0,2012.0,7.0,16.0,1425.0,221.0,30.0,0.0,0.0,2.0,533382.5384,274887.0037
75%,472862.0,20121010.0,2012.0,10.0,23.0,1725.0,1090.0,60.0,0.0,0.0,2.0,545963.35555,298878.24125
max,904612.0,20121230.0,2012.0,12.0,31.0,2355.0,6118.0,70.0,3.0,93.0,9.0,570433.1303,316690.8274


In [8]:
# Names of the categorical columns, i.e. those stored with `object` dtype
cate_features = list(data_2015_5.select_dtypes(include='object').columns)

Some properties of the categorical features

In [9]:
# Raise the column display width to 200 so long cell contents are not truncated
pd.set_option('display.max_colwidth', 200)
def missing_ratio(df):
    """Return the percentage of missing entries in `df`, rounded to one decimal.

    `df` may be any pandas object exposing `isna().mean()`; the function name is
    used as the row label when it is passed to `DataFrame.agg`.
    """
    fraction_missing = df.isna().mean()
    return (100 * fraction_missing).round(1)
def num_values(df):
    """Return the number of distinct values in `df`, as reported by `nunique()`."""
    distinct_count = df.nunique()
    return distinct_count
def value_ratios(c):
    """Map each distinct value of `c` to its share of the entries, in percent.

    Shares come from `value_counts(normalize=True)` and are rounded to one
    decimal; the result is a plain dict keyed by value.
    """
    shares = c.value_counts(normalize=True)
    percentages = (shares * 100).round(1)
    return dict(percentages)
# Apply the three summary functions to the categorical columns. The rows of
# `infor` are labelled by function name, so the table is transposed for display
# (one row per feature).
infor = data_2015_5[cate_features].agg([missing_ratio, num_values, value_ratios])
infor.T

Unnamed: 0,missing_ratio,num_values,value_ratios
Day.1,0.0,7,"{'Friday': 17.6, 'Monday': 15.9, 'Wednesday': 15.5, 'Thursday': 14.7, 'Tuesday': 14.6, 'Saturday': 12.2, 'Sunday': 9.5}"
Severity,0.0,3,"{'Slight': 85.4, 'Serious': 13.3, 'Fatal': 1.3}"
Road_Class,0.0,6,"{'A': 45.3, 'C': 20.1, 'U': 19.0, 'B': 14.4, 'M': 0.7, 'A(M)': 0.5}"
Junction Detail,0.0,8,"{'Not at junction': 43.0, ''T'/staggered junctn': 27.3, 'Roundabout': 13.8, 'Private drive/entry': 7.1, 'Cross roads': 5.2, 'Slip road': 1.7, 'Mini-roundabout': 1.1, 'Uncoded junction': 0.9}"
Light,0.0,2,"{'Day': 74.7, 'Dark': 25.3}"
Weather,0.0,8,"{'Fine (no wind)': 71.6, 'Rain (no wind)': 11.7, 'Unknown': 10.1, 'Uncoded': 3.2, 'Raining & windy': 1.2, 'Fine & windy': 1.0, 'Fog/mist': 0.8, 'Snow (no wind)': 0.3}"
Surface,0.0,6,"{'Dry': 67.7, 'Wet/damp': 29.5, 'Frost/Ice': 1.8, 'Unknown': 0.6, 'Snow': 0.2, 'Flood >3cm': 0.1}"
Cycle,0.0,2,"{'N': 78.7, 'Y': 21.3}"
TWMV,0.0,2,"{'N': 88.1, 'Y': 11.9}"
Child,0.0,2,"{'N': 90.0, 'Y': 10.0}"


Select the features with a large missing ratio (at least 90%) and drop them.
We also drop the "Date" feature because the year, month and day are already available as separate features.

In [10]:
# Features whose missing ratio is at least 90 (the ratio is expressed in percent).
# `infor` was built from the categorical columns only, so numeric columns are
# never selected here.
top_missing_features = infor.T[infor.T['missing_ratio'] >= 90].index.to_list()
top_missing_features

['Pedestrian-1_Sex',
 'Pedestrian_1_Severity',
 'Pedestria_1_Location',
 'Pedestrian-1_Movement',
 'Pedestrian_1_Mov_To',
 'Vehicle_3_Type',
 'V3_Manouvres',
 'V3_From_Direction',
 'V3_To_Direction',
 'V3_Location',
 'V3_Skidding']

In [11]:
# NOTE(review): only "Date" is dropped here. Dropping `top_missing_features`
# (data_2015_5.drop(columns=top_missing_features)) was commented out in this
# cell, although the text above and `preprocessing` below both drop them —
# confirm which behaviour is intended.
data = data_2015_5.drop(columns=["Date"])

Use `SimpleImputer` to fill in the missing values: the column mean for numeric features and the most frequent value for categorical features.

In [12]:
# Split the remaining columns by dtype: `object` columns are treated as
# categorical, every other column as numeric.
cate_features = list(data.select_dtypes(include='object').columns)
num_features = list(data.select_dtypes(exclude='object').columns)

In [13]:
# Preview the first two rows of the numeric columns
data[num_features].head(2)

Unnamed: 0,Reference,Year,Month,Day,Time,Main_rd_no,Speed_limit,Unnamed: 25,Pedestrian_1_Age,Number_Vehicles,Easting,Northing
0,2412,2012,1,3,1140,1198,60,0,0,2,525250.3192,269592.7491
1,2612,2012,1,3,1244,505,50,0,0,2,546715.1485,246747.0387


In [14]:
# Preview the first two rows of the categorical columns
data[cate_features].head(2)

Unnamed: 0,Day.1,Severity,Road_Class,Junction Detail,Light,Weather,Surface,Cycle,TWMV,Child,...,Vehicle_2_From_Direction,Vehicle_2_To_Direction,Vehicle_2_Location,Vehicle_ 2_Skidding,Vehicle_3_Type,V3_Manouvres,V3_From_Direction,V3_To_Direction,V3_Location,V3_Skidding
0,Tuesday,Slight,A,'T'/staggered junctn,Day,Raining & windy,Wet/damp,N,N,N,...,SE,N,Approaching jnctn or waiting/parked,Skidded,,,,,,
1,Tuesday,Slight,A,Not at junction,Day,Fine (no wind),Wet/damp,N,N,N,...,Parked,Parked,"Not at,or within 20m of, junction",Did not skid,,,,,,


In [15]:
# Step lists, (name, estimator) pairs, later wrapped in a Pipeline:
# numeric NaNs are replaced using the 'mean' strategy, categorical NaNs using
# the 'most_frequent' strategy.
num_imputer = [('num', SimpleImputer(missing_values= np.nan, strategy='mean'))]
cate_imputer =[('cate', SimpleImputer(missing_values= np.nan, strategy='most_frequent'))]

In [16]:
# Route the numeric and the categorical columns through their own imputing
# pipeline. The column labels are rebuilt as numeric names followed by
# categorical names, matching the order in which the transformers are listed.
tsf = ColumnTransformer(
    [
        ('num', Pipeline(num_imputer), num_features),
        ('cate', Pipeline(cate_imputer), cate_features),
    ]
)
data = pd.DataFrame(tsf.fit_transform(data), columns=num_features + cate_features)
data.head()

Unnamed: 0,Reference,Year,Month,Day,Time,Main_rd_no,Speed_limit,Unnamed: 25,Pedestrian_1_Age,Number_Vehicles,...,Vehicle_2_From_Direction,Vehicle_2_To_Direction,Vehicle_2_Location,Vehicle_ 2_Skidding,Vehicle_3_Type,V3_Manouvres,V3_From_Direction,V3_To_Direction,V3_Location,V3_Skidding
0,2412,2012,1,3,1140,1198,60,0,0,2,...,SE,N,Approaching jnctn or waiting/parked,Skidded,Car,Slow or stopping,Parked,E,"Not at,or within 20m of, junction",Did not skid
1,2612,2012,1,3,1244,505,50,0,0,2,...,Parked,Parked,"Not at,or within 20m of, junction",Did not skid,Car,Slow or stopping,Parked,E,"Not at,or within 20m of, junction",Did not skid
2,2712,2012,1,3,1317,1,70,0,0,1,...,S,N,"Not at,or within 20m of, junction",Did not skid,Car,Slow or stopping,Parked,E,"Not at,or within 20m of, junction",Did not skid
3,2912,2012,12,21,15,0,30,0,0,2,...,S,N,Mid jnctn - on rbout or main road,Did not skid,Car,Slow or stopping,Parked,E,"Not at,or within 20m of, junction",Did not skid
4,3112,2012,1,3,1745,0,30,0,0,1,...,S,N,"Not at,or within 20m of, junction",Did not skid,Car,Slow or stopping,Parked,E,"Not at,or within 20m of, junction",Did not skid


<!-- Because ColumnTransformer returns object data, convert the numeric features back to numeric dtypes -->

Because `ColumnTransformer` returns the data as `object`, we need to convert the numeric columns back to numeric dtypes

In [17]:
# Convert the imputed numeric columns from `object` back to the smallest signed
# numeric dtype that fits. `errors='ignore'` is deprecated in pandas (since 2.2)
# and is dropped here: these columns were selected as non-object and have
# already passed through the mean imputer, so a conversion failure would point
# to a real problem and should not be silenced.
data[num_features] = data[num_features].apply(pd.to_numeric, downcast='signed')

Check data again

In [18]:
# Inspect the dtypes and non-null counts after imputation and conversion
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2319 entries, 0 to 2318
Data columns (total 53 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Reference                 2319 non-null   int32  
 1   Year                      2319 non-null   int16  
 2   Month                     2319 non-null   int8   
 3   Day                       2319 non-null   int8   
 4   Time                      2319 non-null   int16  
 5   Main_rd_no                2319 non-null   int16  
 6   Speed_limit               2319 non-null   int8   
 7   Unnamed: 25               2319 non-null   int8   
 8   Pedestrian_1_Age          2319 non-null   int8   
 9   Number_Vehicles           2319 non-null   int8   
 10  Easting                   2319 non-null   float64
 11  Northing                  2319 non-null   float64
 12  Day.1                     2319 non-null   object 
 13  Severity                  2319 non-null   object 
 14  Road_Cla

In [19]:
def preprocessing(df, infor):
    """Drop sparse columns, impute missing values and restore numeric dtypes.

    Steps:
      1. Drop every feature whose ``missing_ratio`` in ``infor`` is at least 90,
         together with the ``"Date"`` column.
      2. Impute the remaining columns: the mean for non-object columns, the
         most frequent value for object columns.
      3. Convert the non-object columns back to numeric dtypes, downcast to the
         smallest signed type that fits.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified. It must contain a ``"Date"`` column and
        every feature selected from ``infor``.
    infor : pandas.DataFrame
        Summary table with a ``missing_ratio`` row and one column per feature
        (it is transposed here before filtering).

    Returns
    -------
    pandas.DataFrame
        A new frame with the numeric columns first, then the categorical ones.
        It is built from the transformer output, so it carries a fresh default
        index rather than the index of ``df``.
    """

    def _to_numeric_or_keep(column):
        # Same fallback as the deprecated `errors='ignore'` option of
        # `pd.to_numeric`: return the column unchanged when it cannot be parsed.
        try:
            return pd.to_numeric(column, downcast='signed')
        except (ValueError, TypeError):
            return column

    top_missing_features = infor.T[infor.T['missing_ratio'] >= 90].index.to_list()
    # A single non-mutating drop replaces the former drop + in-place drop.
    new_df = df.drop(columns=[*top_missing_features, "Date"])

    cate_features = new_df.select_dtypes(include='object').columns.to_list()
    num_features = new_df.select_dtypes(exclude='object').columns.to_list()

    num_imputer = [('num', SimpleImputer(missing_values=np.nan, strategy='mean'))]
    cate_imputer = [('cate', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))]

    tsf = ColumnTransformer([
        ('num', Pipeline(num_imputer), num_features),
        ('cate', Pipeline(cate_imputer), cate_features),
    ])
    # Column labels follow the order in which the transformers are listed.
    new_df = pd.DataFrame(tsf.fit_transform(new_df), columns=[*num_features, *cate_features])

    new_df[num_features] = new_df[num_features].apply(_to_numeric_or_keep)
    return new_df