In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import statsmodels.api as sm

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.metrics import roc_auc_score, roc_curve

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


def print_value_counts(df):
    for i in df.columns:
        print(f'column: {i}')
        print(df_cat2[i].value_counts())
        print()

Steps:

1. Select relevant cols
2. Impute NA - strategies: categorical - mode and NA, numeric median (or sth else)
3. one-hot encode categorical
4. merge categorical and numeric 
5. Fit models

In [2]:
master_table = pd.read_pickle('data/master_table.pkl')

In [3]:
master_table.head()

Unnamed: 0,accident_id,lighting,localization,intersection_type,weather,collision_type,com,address,gps,lat,...,place_in_car,user_type,injury_type,sex,equipment_used,pedestrian_action,pedestrian_alone,year_of_birth,vechicle_number,y
0,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,...,1,Driver,Light injury,Male,yes,not specified or not applicable,,1976,A01,0
1,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,...,1,Driver,Hospitalized wounded,Female,yes,not specified or not applicable,,1968,B02,1
2,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,...,2,Passenger,Unscathed,Male,yes,not specified or not applicable,,1964,B02,0
3,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,...,4,Passenger,Unscathed,Male,yes,not specified or not applicable,,2004,B02,0
4,200500000001,Night without public lighting,In built-up areas,Out of intersection,Normal,Two vehicles - by the side,11,CD41B,Métropole,5051500,...,5,Passenger,Unscathed,Male,yes,not specified or not applicable,,1998,B02,0


In [4]:
x_train, x_test, y_train, y_test = train_test_split(master_table, 
                                                    master_table.y, 
                                                    stratify=master_table.y, 
                                                    test_size=0.3, 
                                                    random_state = 3)

In [5]:
cols_to_model = [ 
 'lighting',
 'localization',
 'weather',
 'collision_type',
 'year',
 'month',
 'hour',
 'road_condition',
 'user_type',
 'sex',
]

cols_cat = [
 'lighting',
 'localization',
 'weather',
 'collision_type',
 'road_condition',
 'user_type',
 'sex',
]

### Categorical columns

In [6]:
df_cat = x_train[cols_cat]
df_cat.head()

Unnamed: 0,lighting,localization,weather,collision_type,road_condition,user_type,sex
1447467,Full day,In built-up areas,Normal,Other collision,normal,Driver,Male
1015436,Night with public lighting on,In built-up areas,Normal,Three or more vehicles - multiple collisions,normal,Passenger,Female
1016143,Night with public lighting on,In built-up areas,Normal,Two vehicles - by the side,normal,Driver,Male
1429402,Full day,In built-up areas,Normal,Two vehicles - by the side,normal,Driver,Male
1130602,Night with public lighting not lit,In built-up areas,Normal,Two vehicles - by the side,wet,Driver,Male


Replacing NA's - for now with 'NA'

In [7]:
si = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 'NA', add_indicator=False)
temp = si.fit_transform(df_cat)
df_cat2 = pd.DataFrame(temp, columns = df_cat.columns)
df_cat2.head()

Unnamed: 0,lighting,localization,weather,collision_type,road_condition,user_type,sex
0,Full day,In built-up areas,Normal,Other collision,normal,Driver,Male
1,Night with public lighting on,In built-up areas,Normal,Three or more vehicles - multiple collisions,normal,Passenger,Female
2,Night with public lighting on,In built-up areas,Normal,Two vehicles - by the side,normal,Driver,Male
3,Full day,In built-up areas,Normal,Two vehicles - by the side,normal,Driver,Male
4,Night with public lighting not lit,In built-up areas,Normal,Two vehicles - by the side,wet,Driver,Male


In [8]:
# print_value_counts(df_cat2)

One hot encoding

In [9]:
df_cat3 = pd.get_dummies(df_cat2, drop_first = True)

In [10]:
df_cat_out = df_cat3.copy()

In [11]:
df_cat_out.head()

Unnamed: 0,lighting_Night with public lighting not lit,lighting_Night with public lighting on,lighting_Night without public lighting,lighting_Twilight or dawn,localization_Out of agglomeration,weather_Dazzling weather,weather_Fog - smoke,weather_Heavy rain,weather_Light rain,weather_NA,...,road_condition_mud,road_condition_normal,road_condition_other,road_condition_puddles,road_condition_snow,road_condition_wet,user_type_Passenger,user_type_Pedestrian,user_type_Pedestrian in rollerblade or scooter,sex_Male
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


### Numeric columns 

In [12]:
cols_num = [
 'year',
 'month',
 'hour',
 'year_of_birth'
]

In [13]:
df_num = x_train[cols_num]
df_num.head()

Unnamed: 0,year,month,hour,year_of_birth
1447467,2013.0,4.0,0.0,1974
1015436,2010.0,7.0,23.0,1994
1016143,2010.0,10.0,11.0,1956
1429402,2013.0,7.0,13.0,1996
1130602,2011.0,12.0,17.0,1976


In [14]:
df_num.isna().mean()

year             0.016908
month            0.016908
hour             0.016908
year_of_birth    0.001256
dtype: float64

Imputing median

In [15]:
si = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=False)
temp = si.fit_transform(df_num)
df_num2 = pd.DataFrame(temp, columns = df_num.columns)
df_num2.head()

Unnamed: 0,year,month,hour,year_of_birth
0,2013.0,4.0,0.0,1974.0
1,2010.0,7.0,23.0,1994.0
2,2010.0,10.0,11.0,1956.0
3,2013.0,7.0,13.0,1996.0
4,2011.0,12.0,17.0,1976.0


Basic feature engineering - age at the time of accident

In [16]:
df_num2['user_age'] = df_num2['year'] - df_num2['year_of_birth']
df_num3 = df_num2.drop(['year_of_birth'], axis =1 )

In [17]:
df_num_out = df_num3.copy()

In [18]:
df_num_out

Unnamed: 0,year,month,hour,user_age
0,2013.0,4.0,0.0,39.0
1,2010.0,7.0,23.0,16.0
2,2010.0,10.0,11.0,54.0
3,2013.0,7.0,13.0,17.0
4,2011.0,12.0,17.0,35.0
...,...,...,...,...
1313198,2008.0,10.0,17.0,30.0
1313199,2010.0,8.0,10.0,69.0
1313200,2014.0,11.0,18.0,20.0
1313201,2009.0,2.0,16.0,44.0


### Merging categorical and numeric columns

In [19]:
df_out = df_cat_out.join(df_num_out)

In [20]:
df_out.head()

Unnamed: 0,lighting_Night with public lighting not lit,lighting_Night with public lighting on,lighting_Night without public lighting,lighting_Twilight or dawn,localization_Out of agglomeration,weather_Dazzling weather,weather_Fog - smoke,weather_Heavy rain,weather_Light rain,weather_NA,...,road_condition_snow,road_condition_wet,user_type_Passenger,user_type_Pedestrian,user_type_Pedestrian in rollerblade or scooter,sex_Male,year,month,hour,user_age
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,2013.0,4.0,0.0,39.0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,2010.0,7.0,23.0,16.0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,2010.0,10.0,11.0,54.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,2013.0,7.0,13.0,17.0
4,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,2011.0,12.0,17.0,35.0


KNN is slooow

In [23]:
# model = KNeighborsClassifier()
# a = cross_validate(model, df_out, y_train, cv=3, scoring = 'roc_auc', verbose = 1)

In [39]:
model1 = LogisticRegression(max_iter= 1000)
a = cross_validate(model1, df_out, y_train, cv=3, scoring = 'roc_auc', verbose = 1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.4min finished


In [43]:
a

{'fit_time': array([39.67123461, 66.83779168, 37.71313334]),
 'score_time': array([0.33063102, 0.36289167, 0.24838495]),
 'test_score': array([0.70140825, 0.70067348, 0.70140021])}

In [44]:
fitted = model1.fit(df_out, y_train)

In [56]:
roc_auc_score(y_train, fitted.predict_proba(df_out)[:,1])


0.7024275913960263

In [57]:
a

{'fit_time': array([39.67123461, 66.83779168, 37.71313334]),
 'score_time': array([0.33063102, 0.36289167, 0.24838495]),
 'test_score': array([0.70140825, 0.70067348, 0.70140021])}

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

In [31]:
model = GradientBoostingClassifier(verbose = 3)
a = cross_validate(model, df_out, y_train, cv=3, scoring = 'roc_auc', verbose = 1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  8.6min finished


In [33]:
a

{'fit_time': array([322.55792332, 323.35422611, 188.26634598]),
 'score_time': array([1.84972596, 1.85406041, 1.31721139]),
 'test_score': array([0.72906842, 0.72829636, 0.72953959])}

In [34]:
model.fit(df_out, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.0784            4.64m
         2           1.0655            4.64m
         3           1.0545            4.60m
         4           1.0460            4.55m
         5           1.0384            4.50m
         6           1.0324            4.47m
         7           1.0271            4.41m
         8           1.0227            4.36m
         9           1.0191            4.33m
        10           1.0159            4.31m
        11           1.0131            4.26m
        12           1.0107            4.21m
        13           1.0088            4.15m
        14           1.0066            4.11m
        15           1.0047            4.06m
        16           1.0032            4.01m
        17           1.0013            3.96m
        18           0.9999            3.90m
        19           0.9986            3.85m
        20           0.9976            3.80m
        21           0.9965            3.74m
        2

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=3,
                           warm_start=False)

In [58]:
roc_auc_score(y_train, model.predict_proba(df_out)[:,1])


0.7294279987897022

Things to manipulate with:
- NA imputation
- Model selected (and hyperparameters)
- Features present (feature elimination)


- Upsampling
- Binning of features