In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.metrics import roc_auc_score, roc_curve

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


pd.set_option("display.max_columns",200)

def print_value_counts(df):
    for i in df.columns:
        print(f'column: {i}')
        display(df[i].value_counts())
        print()

Steps:

1. Select relevant cols
2. Impute NA - strategies: categorical - mode and NA, numeric median (or sth else)
3. one-hot encode categorical
4. merge categorical and numeric 
5. Fit models

Contents:

- load data
- train test split
- preprocess categorical vars
- preprocess numeric vars
- run CV

## Load data

In [2]:
master_table = pd.read_pickle('data/master_table.pkl')

In [3]:
master_table.head()

Unnamed: 0,accident_id,lighting,localization,intersection_type,weather,collision_type,com,address,gps,lat,long,departament,time,year,month,hour,road_category,road_regime,no_lanes,reserved_lane,road_gradient,road_plan,road_condition,infrastructure,accident_situation,user_id,place_in_car,user_type,injury_type,sex,equipment_used,pedestrian_action,pedestrian_alone,year_of_birth,vechicle_number,y
0,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,294400,590,2005-01-05 19:00:00,2005.0,1.0,19.0,Departmental Road,Bidirectional,2.0,,Dish,Straight part,normal,,On the road,1678507,1,Driver,Light injury,Male,yes,not specified or not applicable,,1976,A01,0
1,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,294400,590,2005-01-05 19:00:00,2005.0,1.0,19.0,Departmental Road,Bidirectional,2.0,,Dish,Straight part,normal,,On the road,1678508,1,Driver,Hospitalized wounded,Female,yes,not specified or not applicable,,1968,B02,1
2,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,294400,590,2005-01-05 19:00:00,2005.0,1.0,19.0,Departmental Road,Bidirectional,2.0,,Dish,Straight part,normal,,On the road,1678509,2,Passenger,Unscathed,Male,yes,not specified or not applicable,,1964,B02,0
3,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,294400,590,2005-01-05 19:00:00,2005.0,1.0,19.0,Departmental Road,Bidirectional,2.0,,Dish,Straight part,normal,,On the road,1678510,4,Passenger,Unscathed,Male,yes,not specified or not applicable,,2004,B02,0
4,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,294400,590,2005-01-05 19:00:00,2005.0,1.0,19.0,Departmental Road,Bidirectional,2.0,,Dish,Straight part,normal,,On the road,1678511,5,Passenger,Unscathed,Male,yes,not specified or not applicable,,1998,B02,0


## EDA with filtering approach and feature selection

In [4]:
master_table['year_of_birth'] = master_table['year_of_birth'].astype(float)

In [5]:
cols_to_model = [
#  'accident_id',
 'lighting',
 'localization',
 'intersection_type',
 'weather',
 'collision_type',
#  'com',
#  'address',
#  'gps',
#  'lat',
#  'long',
#  'departament',
#  'time',
 'year',
 'month',
 'hour',
 'road_category',
 'road_regime',
 'no_lanes',
 'reserved_lane',
 'road_gradient',
 'road_plan',
 'road_condition',
 'infrastructure',
 'accident_situation',
#  'user_id',
 'place_in_car',
 'user_type',
#  'injury_type',
 'sex',
 'equipment_used',
 'pedestrian_action',
 'pedestrian_alone',
 'year_of_birth',
#  'vechicle_number',
#  'y'
]


In [6]:
master_table[cols_to_model].dtypes

lighting               object
localization           object
intersection_type      object
weather                object
collision_type         object
year                  float64
month                 float64
hour                  float64
road_category          object
road_regime            object
no_lanes              float64
reserved_lane          object
road_gradient          object
road_plan              object
road_condition         object
infrastructure         object
accident_situation     object
place_in_car           object
user_type              object
sex                    object
equipment_used         object
pedestrian_action      object
pedestrian_alone       object
year_of_birth         float64
dtype: object

TODO:

- get dummies (with na)
- 0 variance
- m

### Categorical columns

In [7]:
df_cat = master_table[cols_to_model].select_dtypes('object')
df_cat.head()

Unnamed: 0,lighting,localization,intersection_type,weather,collision_type,road_category,road_regime,reserved_lane,road_gradient,road_plan,road_condition,infrastructure,accident_situation,place_in_car,user_type,sex,equipment_used,pedestrian_action,pedestrian_alone
0,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,1,Driver,Male,yes,not specified or not applicable,
1,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,1,Driver,Female,yes,not specified or not applicable,
2,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,2,Passenger,Male,yes,not specified or not applicable,
3,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,4,Passenger,Male,yes,not specified or not applicable,
4,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,5,Passenger,Male,yes,not specified or not applicable,


Replacing NA's - for now with 'NA'

In [8]:
si = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 'NA', add_indicator=False)
temp = si.fit_transform(df_cat)
df_cat2 = pd.DataFrame(temp, columns = df_cat.columns)
df_cat2.head()

Unnamed: 0,lighting,localization,intersection_type,weather,collision_type,road_category,road_regime,reserved_lane,road_gradient,road_plan,road_condition,infrastructure,accident_situation,place_in_car,user_type,sex,equipment_used,pedestrian_action,pedestrian_alone
0,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,1,Driver,Male,yes,not specified or not applicable,
1,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,1,Driver,Female,yes,not specified or not applicable,
2,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,2,Passenger,Male,yes,not specified or not applicable,
3,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,4,Passenger,Male,yes,not specified or not applicable,
4,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,Departmental Road,Bidirectional,,Dish,Straight part,normal,,On the road,5,Passenger,Male,yes,not specified or not applicable,


One hot encoding

In [9]:
df_cat3 = pd.get_dummies(df_cat2, drop_first = True)

In [10]:
df_cat3.columns = ['d_' + col for col in list(df_cat3.columns)]

In [11]:
df_cat_out = df_cat3.copy()

In [12]:
df_cat_out.head()

Unnamed: 0,d_lighting_Night with public lighting not lit,d_lighting_Night with public lighting on,d_lighting_Night without public lighting,d_lighting_Twilight or dawn,d_localization_Out of agglomeration,d_intersection_type_Intersection in T,d_intersection_type_Intersection in X,d_intersection_type_Intersection in Y,d_intersection_type_Intersection with more than 4 branches,d_intersection_type_Level crossing,d_intersection_type_NA,d_intersection_type_Other intersection,d_intersection_type_Out of intersection,d_intersection_type_Place,d_weather_Dazzling weather,d_weather_Fog - smoke,d_weather_Heavy rain,d_weather_Light rain,d_weather_NA,d_weather_Normal,d_weather_Other,d_weather_Snow - hail,d_weather_Strong wind - storm,d_collision_type_Other collision,d_collision_type_Three or more vehicles - multiple collisions,d_collision_type_Three vehicles and more - in chain,d_collision_type_Two vehicles - by the side,d_collision_type_Two vehicles - from the rear,d_collision_type_Two vehicles - frontal,d_collision_type_Without collision,d_road_category_Departmental Road,d_road_category_Highway,d_road_category_NA,d_road_category_National Road,d_road_category_Off public network,d_road_category_Parking lot open to public traffic,d_road_category_other,d_road_regime_NA,d_road_regime_One way,d_road_regime_Separated carriageways,d_road_regime_With variable assignment channels,d_reserved_lane_Cycle Bank,d_reserved_lane_NA,d_reserved_lane_Reserved channel,d_road_gradient_Hill bottom,d_road_gradient_Hilltop,d_road_gradient_NA,d_road_gradient_Slope,d_road_plan_Curved right,"d_road_plan_In ""S""",d_road_plan_NA,d_road_plan_Straight part,d_road_condition_fat - oil,d_road_condition_flooded,d_road_condition_icy,d_road_condition_mud,d_road_condition_normal,d_road_condition_other,d_road_condition_puddles,d_road_condition_snow,d_road_condition_wet,d_infrastructure_Carrefour arranged,d_infrastructure_Exchanger or connection brace,d_infrastructure_NA,d_infrastructure_Pedestrian area,d_infrastructure_Railway,d_infrastructure_Toll zone,d_infrastructure_Underground - tunnel,d_accident_situation_On bike path,d_accident_situation_On emergency stop band,d_accident_situation_On the road,d_accident_situation_On the sidewalk,d_accident_situation_On the verge,d_place_in_car_1,d_place_in_car_2,d_place_in_car_3,d_place_in_car_4,d_place_in_car_5,d_place_in_car_6,d_place_in_car_7,d_place_in_car_8,d_place_in_car_9,d_place_in_car_NA,d_user_type_Passenger,d_user_type_Pedestrian,d_user_type_Pedestrian in rollerblade or scooter,d_sex_Male,d_equipment_used_no,d_equipment_used_yes,d_pedestrian_action_Masked,d_pedestrian_action_Meaning bumping vehicle,d_pedestrian_action_NA,d_pedestrian_action_Opposite direction of the vehicle,d_pedestrian_action_Other,d_pedestrian_action_Playing - running,d_pedestrian_action_With animal,d_pedestrian_action_not specified or not applicable,d_pedestrian_alone_In a group,d_pedestrian_alone_NA,d_pedestrian_alone_Only
0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0
1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0
3,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0
4,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0


In [13]:
df_cat_out.shape

(1876005, 100)

### Numeric columns 

In [14]:
cols_num = [
 'year',
 'month',
 'hour',
 'year_of_birth'
]

In [15]:
df_num = master_table[cols_num]
df_num.head()

Unnamed: 0,year,month,hour,year_of_birth
0,2005.0,1.0,19.0,1976.0
1,2005.0,1.0,19.0,1968.0
2,2005.0,1.0,19.0,1964.0
3,2005.0,1.0,19.0,2004.0
4,2005.0,1.0,19.0,1998.0


In [17]:
df_num.isna().mean()

year             0.016829
month            0.016829
hour             0.016829
year_of_birth    0.001253
dtype: float64

Imputing median

In [18]:
si = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=False)
temp = si.fit_transform(df_num)
df_num2 = pd.DataFrame(temp, columns = df_num.columns)
df_num2.head()

Unnamed: 0,year,month,hour,year_of_birth
0,2005.0,1.0,19.0,1976.0
1,2005.0,1.0,19.0,1968.0
2,2005.0,1.0,19.0,1964.0
3,2005.0,1.0,19.0,2004.0
4,2005.0,1.0,19.0,1998.0


Basic feature engineering - age at the time of accident

In [19]:
df_num2['user_age'] = df_num2['year'] - df_num2['year_of_birth']
df_num3 = df_num2.drop(['year_of_birth'], axis =1 )

In [20]:
df_num_out = df_num3.copy()

In [21]:
df_num_out.head()

Unnamed: 0,year,month,hour,user_age
0,2005.0,1.0,19.0,29.0
1,2005.0,1.0,19.0,37.0
2,2005.0,1.0,19.0,41.0
3,2005.0,1.0,19.0,1.0
4,2005.0,1.0,19.0,7.0


### Merging categorical and numeric columns

In [22]:
df_out = df_cat_out.join(df_num_out)

In [23]:
df_out['y'] = master_table.y

In [24]:
df_out.head()

Unnamed: 0,d_lighting_Night with public lighting not lit,d_lighting_Night with public lighting on,d_lighting_Night without public lighting,d_lighting_Twilight or dawn,d_localization_Out of agglomeration,d_intersection_type_Intersection in T,d_intersection_type_Intersection in X,d_intersection_type_Intersection in Y,d_intersection_type_Intersection with more than 4 branches,d_intersection_type_Level crossing,d_intersection_type_NA,d_intersection_type_Other intersection,d_intersection_type_Out of intersection,d_intersection_type_Place,d_weather_Dazzling weather,d_weather_Fog - smoke,d_weather_Heavy rain,d_weather_Light rain,d_weather_NA,d_weather_Normal,d_weather_Other,d_weather_Snow - hail,d_weather_Strong wind - storm,d_collision_type_Other collision,d_collision_type_Three or more vehicles - multiple collisions,d_collision_type_Three vehicles and more - in chain,d_collision_type_Two vehicles - by the side,d_collision_type_Two vehicles - from the rear,d_collision_type_Two vehicles - frontal,d_collision_type_Without collision,d_road_category_Departmental Road,d_road_category_Highway,d_road_category_NA,d_road_category_National Road,d_road_category_Off public network,d_road_category_Parking lot open to public traffic,d_road_category_other,d_road_regime_NA,d_road_regime_One way,d_road_regime_Separated carriageways,d_road_regime_With variable assignment channels,d_reserved_lane_Cycle Bank,d_reserved_lane_NA,d_reserved_lane_Reserved channel,d_road_gradient_Hill bottom,d_road_gradient_Hilltop,d_road_gradient_NA,d_road_gradient_Slope,d_road_plan_Curved right,"d_road_plan_In ""S""",d_road_plan_NA,d_road_plan_Straight part,d_road_condition_fat - oil,d_road_condition_flooded,d_road_condition_icy,d_road_condition_mud,d_road_condition_normal,d_road_condition_other,d_road_condition_puddles,d_road_condition_snow,d_road_condition_wet,d_infrastructure_Carrefour arranged,d_infrastructure_Exchanger or connection brace,d_infrastructure_NA,d_infrastructure_Pedestrian area,d_infrastructure_Railway,d_infrastructure_Toll zone,d_infrastructure_Underground - tunnel,d_accident_situation_On bike path,d_accident_situation_On emergency stop band,d_accident_situation_On the road,d_accident_situation_On the sidewalk,d_accident_situation_On the verge,d_place_in_car_1,d_place_in_car_2,d_place_in_car_3,d_place_in_car_4,d_place_in_car_5,d_place_in_car_6,d_place_in_car_7,d_place_in_car_8,d_place_in_car_9,d_place_in_car_NA,d_user_type_Passenger,d_user_type_Pedestrian,d_user_type_Pedestrian in rollerblade or scooter,d_sex_Male,d_equipment_used_no,d_equipment_used_yes,d_pedestrian_action_Masked,d_pedestrian_action_Meaning bumping vehicle,d_pedestrian_action_NA,d_pedestrian_action_Opposite direction of the vehicle,d_pedestrian_action_Other,d_pedestrian_action_Playing - running,d_pedestrian_action_With animal,d_pedestrian_action_not specified or not applicable,d_pedestrian_alone_In a group,d_pedestrian_alone_NA,d_pedestrian_alone_Only,year,month,hour,user_age,y
0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,2005.0,1.0,19.0,29.0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,2005.0,1.0,19.0,37.0,1
2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,2005.0,1.0,19.0,41.0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,2005.0,1.0,19.0,1.0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,2005.0,1.0,19.0,7.0,0


## Feature selection - removing near-zero-variance predictors

In [25]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression, RFE, VarianceThreshold

In [26]:
df_num_out.var()

year         11.797123
month        11.273194
hour         21.691832
user_age    330.088458
dtype: float64

Numerical variables are not near-zero-variance

In [27]:
variances = []

for col in df_out.columns:
    variances.append(df_out[col].var())

In [28]:
pd.DataFrame({'col': list(df_out.columns), 'var': variances}).sort_values(by = 'var').head(10)

Unnamed: 0,col,var
32,d_road_category_NA,1e-06
18,d_weather_NA,6.2e-05
10,d_intersection_type_NA,0.000135
95,d_pedestrian_action_With animal,0.000185
53,d_road_condition_flooded,0.000385
55,d_road_condition_mud,0.000437
81,d_place_in_car_9,0.000671
66,d_infrastructure_Toll zone,0.000828
78,d_place_in_car_6,0.000863
91,d_pedestrian_action_NA,0.000944


For categorical variables - some have very low variance. A better way to filter these out is probably remove variables with very low percentage.

In [29]:
df_out.select_dtypes('uint8').apply(lambda x: np.mean(x==1)).sort_values()

d_road_category_NA                                     0.000001
d_weather_NA                                           0.000062
d_intersection_type_NA                                 0.000135
d_pedestrian_action_With animal                        0.000185
d_road_condition_flooded                               0.000385
                                                         ...   
d_accident_situation_On the road                       0.881517
d_infrastructure_NA                                    0.887186
d_pedestrian_action_not specified or not applicable    0.918675
d_pedestrian_alone_NA                                  0.920466
d_reserved_lane_NA                                     0.939200
Length: 100, dtype: float64

In [30]:
perc_tre = 0.02

one_perc = df_out.select_dtypes('uint8').apply(lambda x: np.mean(x==1))
non_zero_var_cols = list(one_perc[one_perc > perc_tre][one_perc < 1 - perc_tre].index)

In [31]:
non_zero_var_cols = non_zero_var_cols + list(df_num_out.columns)

In [32]:
df_out2 = df_out[non_zero_var_cols + ['y']]

In [33]:
df_out2.head()

Unnamed: 0,d_lighting_Night with public lighting on,d_lighting_Night without public lighting,d_lighting_Twilight or dawn,d_localization_Out of agglomeration,d_intersection_type_Intersection in T,d_intersection_type_Intersection in X,d_intersection_type_Out of intersection,d_weather_Heavy rain,d_weather_Light rain,d_weather_Normal,d_collision_type_Other collision,d_collision_type_Three or more vehicles - multiple collisions,d_collision_type_Three vehicles and more - in chain,d_collision_type_Two vehicles - by the side,d_collision_type_Two vehicles - from the rear,d_collision_type_Two vehicles - frontal,d_collision_type_Without collision,d_road_category_Departmental Road,d_road_category_Highway,d_road_category_National Road,d_road_regime_NA,d_road_regime_One way,d_road_regime_Separated carriageways,d_reserved_lane_NA,d_reserved_lane_Reserved channel,d_road_gradient_NA,d_road_gradient_Slope,d_road_plan_Curved right,d_road_plan_NA,d_road_plan_Straight part,d_road_condition_normal,d_road_condition_wet,d_infrastructure_Carrefour arranged,d_infrastructure_NA,d_accident_situation_On the road,d_accident_situation_On the verge,d_place_in_car_1,d_place_in_car_2,d_place_in_car_3,d_place_in_car_4,d_place_in_car_NA,d_user_type_Passenger,d_user_type_Pedestrian,d_sex_Male,d_equipment_used_no,d_equipment_used_yes,d_pedestrian_action_not specified or not applicable,d_pedestrian_alone_NA,d_pedestrian_alone_Only,year,month,hour,user_age,y
0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,1,1,0,2005.0,1.0,19.0,29.0,0
1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,1,0,2005.0,1.0,19.0,37.0,1
2,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,1,1,1,0,2005.0,1.0,19.0,41.0,0
3,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,0,1,0,1,0,1,1,1,0,2005.0,1.0,19.0,1.0,0
4,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,1,1,1,0,2005.0,1.0,19.0,7.0,0


## Feature selection - mutual information

In [34]:
minfos_cat = []

In [35]:
for col in df_out2.select_dtypes('uint8').columns:
#     print(col)
    minfos_cat.append(mutual_info_classif(df_out2[col].values.reshape(-1,1),
                                                        df_out2["y"].values, discrete_features = True)[0])

In [36]:
minfos2= [(col, val) for col, val in zip(df_out2.select_dtypes('uint8').columns, minfos_cat)]

In [37]:
pd.DataFrame(minfos2, columns = ['column', 'mutual_info']).sort_values(by='mutual_info', ascending=False).head()

Unnamed: 0,column,mutual_info
17,d_road_category_Departmental Road,0.018954
3,d_localization_Out of agglomeration,0.01848
35,d_accident_situation_On the verge,0.012684
34,d_accident_situation_On the road,0.007342
42,d_user_type_Pedestrian,0.006981


Don't run below - takes forever and brakes everything

In [39]:
# minfos_num = []
# for col in df_out2.select_dtypes('float64').columns:
#     print(col)
#     minfos_num.append(mutual_info_regression(df_out2[col].values.reshape(-1,1),
#                                                         df_out2["y"].values))

## Train test split

For each accident in the data, there can be multiple users engaged. This means that we should control for data leakage. People from one accident should be placed together in either train or test dataset. One way to achieve this and simultanously keep distribution of y is to sample accident_id, and from that obtain train test split.


In [38]:
master_table['accident_id']

0          200500000001
1          200500000001
2          200500000001
3          200500000001
4          200500000001
               ...     
1876000    201600059431
1876001    201600059431
1876002    201600059432
1876003    201600059432
1876004    201600059432
Name: accident_id, Length: 1876005, dtype: object

In [39]:
df_out2['accident_id'] = master_table['accident_id']#.groupby('y').accident_id.count()

Test what proportion of y we need in both train and test datasets

In [40]:
df_out2.y.value_counts(normalize = True)

0    0.763189
1    0.236811
Name: y, dtype: float64

Around 23% of y is equal to 1 (seriously injured or killed).
Randomly select 30% of accident_id and check the distribution of y obtained:

In [41]:
temp = pd.DataFrame({'test_id': df_out2.accident_id.sample(frac = 0.3), 'if_test': 1}).set_index('test_id')

temp2 = df_out2.set_index('accident_id').join(temp)
temp3 = temp2.reset_index().rename({'index': 'accident_id'}, axis = 1)
temp3.head()

Unnamed: 0,accident_id,d_lighting_Night with public lighting on,d_lighting_Night without public lighting,d_lighting_Twilight or dawn,d_localization_Out of agglomeration,d_intersection_type_Intersection in T,d_intersection_type_Intersection in X,d_intersection_type_Out of intersection,d_weather_Heavy rain,d_weather_Light rain,d_weather_Normal,d_collision_type_Other collision,d_collision_type_Three or more vehicles - multiple collisions,d_collision_type_Three vehicles and more - in chain,d_collision_type_Two vehicles - by the side,d_collision_type_Two vehicles - from the rear,d_collision_type_Two vehicles - frontal,d_collision_type_Without collision,d_road_category_Departmental Road,d_road_category_Highway,d_road_category_National Road,d_road_regime_NA,d_road_regime_One way,d_road_regime_Separated carriageways,d_reserved_lane_NA,d_reserved_lane_Reserved channel,d_road_gradient_NA,d_road_gradient_Slope,d_road_plan_Curved right,d_road_plan_NA,d_road_plan_Straight part,d_road_condition_normal,d_road_condition_wet,d_infrastructure_Carrefour arranged,d_infrastructure_NA,d_accident_situation_On the road,d_accident_situation_On the verge,d_place_in_car_1,d_place_in_car_2,d_place_in_car_3,d_place_in_car_4,d_place_in_car_NA,d_user_type_Passenger,d_user_type_Pedestrian,d_sex_Male,d_equipment_used_no,d_equipment_used_yes,d_pedestrian_action_not specified or not applicable,d_pedestrian_alone_NA,d_pedestrian_alone_Only,year,month,hour,user_age,y,if_test
0,200500000001,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,1,1,0,2005.0,1.0,19.0,29.0,0,1.0
1,200500000001,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,1,1,0,2005.0,1.0,19.0,29.0,0,1.0
2,200500000001,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,1,0,2005.0,1.0,19.0,37.0,1,1.0
3,200500000001,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,1,0,2005.0,1.0,19.0,37.0,1,1.0
4,200500000001,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,1,1,1,0,2005.0,1.0,19.0,41.0,0,1.0


Finally, create training and test sets

In [42]:
test = temp3.query('if_test == 1').drop('if_test', axis =1)
train = temp3[temp3.if_test.isna()].drop('if_test', axis =1)

Checking if procedure was correct:

In [43]:
print(test.shape)
print(train.shape)

print(test.y.mean())
print(train.y.mean())

(1602813, 55)
(777375, 55)
0.21665222330989328
0.2553503778742563


In [44]:
x_train = train.drop(['y', 'accident_id'], axis = 1)
x_test = test.drop(['y', 'accident_id'], axis = 1)

y_train = train[['y']]
y_test = test[['y']]

In [45]:
x_train.head()

Unnamed: 0,d_lighting_Night with public lighting on,d_lighting_Night without public lighting,d_lighting_Twilight or dawn,d_localization_Out of agglomeration,d_intersection_type_Intersection in T,d_intersection_type_Intersection in X,d_intersection_type_Out of intersection,d_weather_Heavy rain,d_weather_Light rain,d_weather_Normal,d_collision_type_Other collision,d_collision_type_Three or more vehicles - multiple collisions,d_collision_type_Three vehicles and more - in chain,d_collision_type_Two vehicles - by the side,d_collision_type_Two vehicles - from the rear,d_collision_type_Two vehicles - frontal,d_collision_type_Without collision,d_road_category_Departmental Road,d_road_category_Highway,d_road_category_National Road,d_road_regime_NA,d_road_regime_One way,d_road_regime_Separated carriageways,d_reserved_lane_NA,d_reserved_lane_Reserved channel,d_road_gradient_NA,d_road_gradient_Slope,d_road_plan_Curved right,d_road_plan_NA,d_road_plan_Straight part,d_road_condition_normal,d_road_condition_wet,d_infrastructure_Carrefour arranged,d_infrastructure_NA,d_accident_situation_On the road,d_accident_situation_On the verge,d_place_in_car_1,d_place_in_car_2,d_place_in_car_3,d_place_in_car_4,d_place_in_car_NA,d_user_type_Passenger,d_user_type_Pedestrian,d_sex_Male,d_equipment_used_no,d_equipment_used_yes,d_pedestrian_action_not specified or not applicable,d_pedestrian_alone_NA,d_pedestrian_alone_Only,year,month,hour,user_age
14,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,1,1,1,0,2005.0,1.0,18.0,22.0
15,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,1,1,1,0,2005.0,1.0,18.0,49.0
16,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,1,1,0,2005.0,1.0,16.0,85.0
17,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0,0,0,1,1,1,0,2005.0,1.0,16.0,82.0
18,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,1,1,0,2005.0,1.0,16.0,41.0


In [48]:
# x_train, x_test, y_train, y_test = train_test_split(df_out2.drop('y', axis=1), 
#                                                     df_out2.y, 
#                                                     stratify=df_out2.y, 
#                                                     test_size=0.3, 
#                                                     random_state = 3)

### Modeling

In [46]:
cols_to_log = [
    # 'd_lighting_Night with public lighting on',
#  'd_lighting_Night without public lighting',
#  'd_lighting_Twilight or dawn',
#  'd_localization_Out of agglomeration',
#  'd_intersection_type_Intersection in T',
#  'd_intersection_type_Intersection in X',
#  'd_intersection_type_Out of intersection',
#  'd_weather_Heavy rain',
#  'd_weather_Light rain',
#  'd_weather_Normal',
#  'd_collision_type_Other collision',
#  'd_collision_type_Three or more vehicles - multiple collisions',
#  'd_collision_type_Three vehicles and more - in chain',
#  'd_collision_type_Two vehicles - by the side',
#  'd_collision_type_Two vehicles - from the rear',
#  'd_collision_type_Two vehicles - frontal',
#  'd_collision_type_Without collision',
 'd_road_category_Departmental Road',
#  'd_road_category_Highway',
#  'd_road_category_National Road',
#  'd_road_regime_NA',
#  'd_road_regime_One way',
#  'd_road_regime_Separated carriageways',
#  'd_reserved_lane_NA',
#  'd_reserved_lane_Reserved channel',
#  'd_road_gradient_NA',
#  'd_road_gradient_Slope',
#  'd_road_plan_Curved right',
#  'd_road_plan_NA',
#  'd_road_plan_Straight part',
#  'd_road_condition_normal',
#  'd_road_condition_wet',
#  'd_infrastructure_Carrefour arranged',
#  'd_infrastructure_NA',
#  'd_accident_situation_On the road',
#  'd_accident_situation_On the verge',
#  'd_place_in_car_1',
#  'd_place_in_car_2',
#  'd_place_in_car_3',
#  'd_place_in_car_4',
#  'd_place_in_car_NA',
#  'd_user_type_Passenger',
#  'd_user_type_Pedestrian',
#  'd_sex_Male',
#  'd_equipment_used_no',
#  'd_equipment_used_yes',
#  'd_pedestrian_action_not specified or not applicable',
#  'd_pedestrian_alone_NA',
#  'd_pedestrian_alone_Only',
#  'year',
#  'month',
#  'hour',
#  'user_age'
 ]

In [47]:
mod = sm.OLS(y_train["y"],x_train["d_road_category_Departmental Road"])
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.192
Model:,OLS,Adj. R-squared (uncentered):,0.192
Method:,Least Squares,F-statistic:,184300.0
Date:,"Sun, 17 May 2020",Prob (F-statistic):,0.0
Time:,02:33:15,Log-Likelihood:,-489760.0
No. Observations:,777375,AIC:,979500.0
Df Residuals:,777374,BIC:,979500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
d_road_category_Departmental Road,0.3977,0.001,429.257,0.000,0.396,0.400

0,1,2,3
Omnibus:,76684.311,Durbin-Watson:,1.655
Prob(Omnibus):,0.0,Jarque-Bera (JB):,97813.876
Skew:,0.855,Prob(JB):,0.0
Kurtosis:,2.687,Cond. No.,1.0


R^2 value is not that good enough lets try with all variables.

In [49]:

cols_to_log = [
 'd_lighting_Night with public lighting on',
 'd_lighting_Night without public lighting',
 'd_lighting_Twilight or dawn',
 'd_localization_Out of agglomeration',
 'd_intersection_type_Intersection in T',
 'd_intersection_type_Intersection in X',
 'd_intersection_type_Out of intersection',
 'd_weather_Heavy rain',
 'd_weather_Light rain',
 'd_weather_Normal',
 'd_collision_type_Other collision',
 'd_collision_type_Three or more vehicles - multiple collisions',
 'd_collision_type_Three vehicles and more - in chain',
 'd_collision_type_Two vehicles - by the side',
 'd_collision_type_Two vehicles - from the rear',
 'd_collision_type_Two vehicles - frontal',
 'd_collision_type_Without collision',
 'd_road_category_Departmental Road',
 'd_road_category_Highway',
 'd_road_category_National Road',
 'd_road_regime_NA',
 'd_road_regime_One way',
 'd_road_regime_Separated carriageways',
 'd_reserved_lane_NA',
 'd_reserved_lane_Reserved channel',
 'd_road_gradient_NA',
 'd_road_gradient_Slope',
 'd_road_plan_Curved right',
 'd_road_plan_NA',
 'd_road_plan_Straight part',
 'd_road_condition_normal',
 'd_road_condition_wet',
 'd_infrastructure_Carrefour arranged',
 'd_infrastructure_NA',
 'd_accident_situation_On the road',
 'd_accident_situation_On the verge',
 'd_place_in_car_1',
 'd_place_in_car_2',
 'd_place_in_car_3',
 'd_place_in_car_4',
 'd_place_in_car_NA',
 'd_user_type_Passenger',
 'd_user_type_Pedestrian',
 'd_sex_Male',
 'd_equipment_used_no',
 'd_equipment_used_yes',
 'd_pedestrian_action_not specified or not applicable',
 'd_pedestrian_alone_NA',
 'd_pedestrian_alone_Only',
 'year',
 'month',
 'hour',
 'user_age'
 ]

mod = sm.OLS(y_train['y'], add_constant(x_train[cols_to_log]))
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.167
Model:,OLS,Adj. R-squared:,0.167
Method:,Least Squares,F-statistic:,2939.0
Date:,"Sun, 17 May 2020",Prob (F-statistic):,0.0
Time:,02:34:51,Log-Likelihood:,-386860.0
No. Observations:,777375,AIC:,773800.0
Df Residuals:,777321,BIC:,774400.0
Df Model:,53,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3108,0.302,1.031,0.303,-0.280,0.902
d_lighting_Night with public lighting on,0.0305,0.001,23.215,0.000,0.028,0.033
d_lighting_Night without public lighting,0.0712,0.002,37.556,0.000,0.068,0.075
d_lighting_Twilight or dawn,0.0198,0.002,10.050,0.000,0.016,0.024
d_localization_Out of agglomeration,0.1497,0.001,106.733,0.000,0.147,0.152
d_intersection_type_Intersection in T,-0.0079,0.002,-3.485,0.000,-0.012,-0.003
d_intersection_type_Intersection in X,-0.0136,0.002,-6.476,0.000,-0.018,-0.010
d_intersection_type_Out of intersection,0.0273,0.002,15.161,0.000,0.024,0.031
d_weather_Heavy rain,-0.0166,0.004,-4.467,0.000,-0.024,-0.009

0,1,2,3
Omnibus:,84055.179,Durbin-Watson:,1.893
Prob(Omnibus):,0.0,Jarque-Bera (JB):,112449.046
Skew:,0.923,Prob(JB):,0.0
Kurtosis:,2.751,Cond. No.,1630000.0


R^2 value slightly improved 

In [50]:
r2 = []

for col in cols_to_log:
    mod = sm.OLS(y_train['y'], add_constant(x_train[col]))
    res = mod.fit()
    r2.append((col, res.rsquared))

In [52]:
r2_df = pd.DataFrame(r2, columns = ['col', 'r2']).sort_values('r2', ascending = False)
r2_df.head(7)

Unnamed: 0,col,r2
3,d_localization_Out of agglomeration,0.051578
17,d_road_category_Departmental Road,0.047739
35,d_accident_situation_On the verge,0.043652
34,d_accident_situation_On the road,0.02435
1,d_lighting_Night without public lighting,0.021268
44,d_equipment_used_no,0.014952
6,d_intersection_type_Out of intersection,0.014138


d_road_category_Departmental Road is still in top 5 variables.

In [54]:
# r2 = []

# for col in cols_to_log:
#     mod = sm.OLS(y_train['y'], add_constant(np.power(x_train, 1/2)[col]))
#     res = mod.fit()
#     r2.append((col, res.rsquared))
# MissingDataError: exog contains inf or nans


MissingDataError: exog contains inf or nans

In [62]:
# import statsmodels.formula.api as smf
# mod = smf.glm(y_train,x_train[cols_to_log], family=sm.families.Binomial())
# res = mod.fit()
# res.summary()

PatsyError: model is missing required outcome variables

KNN is slooow

In [49]:
# model = KNeighborsClassifier()
# a = cross_validate(model, df_out, y_train, cv=3, scoring = 'roc_auc', verbose = 1)

In [46]:
model1 = LogisticRegression(n_jobs=-1, verbose=2)
a = cross_validate(model1, x_train, y_train, cv=2, scoring = 'roc_auc', verbose = 1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   31.2s finished


In [47]:
fitted = model1.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  6.1min finished


In [48]:
roc_auc_score(y_train, fitted.predict_proba(x_train)[:,1])


0.7423474357220142

## Stepwise feature selection

In [49]:
from sklearn.svm import LinearSVC

In [50]:
model2 = LinearSVC()

In [51]:
model2.fit(x_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [52]:
from sklearn.feature_selection import RFECV, RFE

In [55]:
rfe = RFE(model2, n_features_to_select = 1)

In [None]:
rfe.fit(x_train, y_train)