In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the three preprocessed tables (places, users, accidents) from pickle files under data/.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so these files must
# come from a trusted source (presumably this project's own preprocessing step - confirm).
places = pd.read_pickle('data/places_preprocessed.pkl')
users = pd.read_pickle('data/users_preprocessed.pkl')
accidents = pd.read_pickle('data/accidents_preprocessed.pkl')

In [3]:
# Build the master table by joining places and then users onto accidents,
# all keyed on accident_id (DataFrame.join defaults to a left join on the index).
master_table = (
    accidents.set_index("accident_id")
    .join(places.set_index("accident_id"))
    .join(users.set_index("accident_id"))
)

In [4]:
# Move accident_id from the index back into a regular column.
# NOTE(review): this cell reassigns master_table from itself, so re-running it without
# re-running the join cell will reset the index a second time.
master_table = master_table.reset_index()

We are trying to model the type of injury sustained by a person. Each accident can involve multiple people, so we need to handle that.

In [None]:
# Column names of the joined table, as a plain Python list, displayed for reference.
cols = master_table.columns.tolist()
cols

In [None]:
# Columns whose value distributions are tabulated in the next cell.
char_cols = [
    # accident circumstances
    'lighting', 'localization', 'intersection_type', 'weather', 'collision_type',
    # road / place description
    'road_category', 'road_regime', 'reserved_lane', 'road_gradient', 'road_plan',
    'road_condition', 'infrastructure', 'accident_situation',
    # person involved
    'user_type', 'injury_type', 'sex', 'equipment_used',
    'pedestrian_action', 'pedestrian_alone',
]

In [None]:
# One value_counts Series per column in char_cols, each named after its column,
# then placed side by side so every column of the result holds one variable's counts.
pieces = [
    master_table[col].value_counts().rename(col)
    for col in char_cols
]
df_value_counts = pd.concat(pieces, axis=1)

In [None]:
# Display the list of per-column value-count Series built in the previous cell.
# NOTE(review): the combined df_value_counts frame is not displayed in the visible cells;
# showing it instead may be what was intended - confirm.
pieces

In [None]:
# (rows, columns) of the joined master table.
master_table.shape

In [None]:
# Frequency of each injury_type category; this is the raw variable the binary target y is derived from.
master_table.injury_type.value_counts()

In [5]:
# Binary target: 1 for "Hospitalized wounded" / "Killed", 0 for "Unscathed" / "Light injury".
# Series.map leaves any injury_type value that is not a key of the dict as NaN.
master_table['y'] = master_table['injury_type'].map(
    {
        "Unscathed": 0,
        "Light injury": 0,
        "Hospitalized wounded": 1,
        "Killed": 1,
    }
)

In [None]:
# Mean of the 0/1 target, i.e. the share of rows with y == 1.
master_table.y.mean()

We use a stratified train/test split. Dependence between rows that belong to the same accident should not matter much, since each accident contributes only a few observations.

In [6]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split on the binary target so both parts keep the same share of y == 1.
# random_state is fixed so that Restart & Run All reproduces the same split (and therefore
# the same fitted coefficients and AUC values) on every run.
# Note: the feature frames still contain the 'y' column; the formula-based models below
# select it from x_train explicitly.
x_train, x_test, y_train, y_test = train_test_split(
    master_table,
    master_table.y,
    stratify=master_table.y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Left disabled: 'y' must stay in x_train because the model cells below build their
# training frames with x_train[['y', ...]] and reference y in the formula.
# x_train.drop('y', axis = 1, inplace = True)
# x_test.drop('y', axis = 1, inplace = True)

In [None]:
# (rows, columns) of the training part of the split.
x_train.shape

In [None]:
# Length of the training target; the row count should match x_train.shape[0].
y_train.shape

### First model

Logistic regression with a few features: sex, user type, lighting and year. Rows with missing values (NAs) are removed.

In [7]:
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, roc_curve

In [None]:
# Training frame for the first model: target plus four predictors, complete rows only.
temp_x_train = (
    x_train
    .loc[:, ['y', 'sex', 'user_type', 'lighting', 'year']]
    .dropna()
)

In [11]:
def use_all_cols(df):
    """Build a formula string that regresses ``y`` on every other column of ``df``.

    Parameters
    ----------
    df : DataFrame whose column labels are strings; a column labelled ``'y'``
        is treated as the response and left out of the right-hand side.

    Returns
    -------
    str
        ``"y ~ col_a + col_b + ..."`` with the predictors in ``df``'s column order.
    """
    predictors = []
    for column in df.columns:
        if column == 'y':
            continue
        predictors.append(column)
    return "y ~ " + " + ".join(predictors)

In [None]:
# Preview the formula string generated from the current training frame's columns.
use_all_cols(temp_x_train)

In [None]:
# Logistic regression: a binomial-family GLM of y on all other columns of temp_x_train.
mod = sm.GLM.from_formula(
    formula=use_all_cols(temp_x_train),
    data=temp_x_train,
    family=sm.families.Binomial(),
)
res = mod.fit()
# The last expression renders the coefficient table as the cell output.
res.summary()

In [None]:
# ROC AUC of the fitted model against temp_x_train.y, i.e. measured on the training rows
# (res.predict() is called without new data), not on the held-out test set.
roc_auc_score(temp_x_train.y, res.predict())

The drop in AUC on the test set is not bad; however, the overall performance is rather questionable.

In [8]:
import matplotlib.pyplot as plt

In [None]:
# ROC curve of the current model on the rows of temp_x_train.
fpr, tpr, _ = roc_curve(temp_x_train.y, res.predict())
lw = 2

plt.figure()
plt.plot(fpr, tpr, color='darkorange', linewidth=lw, label='ROC curve')
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], color='navy', linewidth=lw, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
# plt.show() is not called here; the figure is rendered by the notebook itself.

### Model 2

In [9]:
# Training frame for model 2: target plus ten predictors, complete rows only.
# This rebinds temp_x_train, replacing the frame used for the first model.
temp_x_train = (
    x_train
    .loc[:, [
        'y',
        'lighting', 'localization', 'weather', 'collision_type',
        'year', 'month', 'hour',
        'road_condition', 'user_type', 'sex',
    ]]
    .dropna()
)

In [12]:
# Refit the binomial GLM (logistic regression) on the model 2 training frame;
# this overwrites the mod / res objects of the first model.
mod = sm.GLM.from_formula(
    formula=use_all_cols(temp_x_train),
    data=temp_x_train,
    family=sm.families.Binomial(),
)
res = mod.fit()
# The last expression renders the coefficient table as the cell output.
res.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,1250468.0
Model:,GLM,Df Residuals:,1250433.0
Model Family:,Binomial,Df Model:,34.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-622670.0
Date:,"Wed, 08 Apr 2020",Deviance:,1245300.0
Time:,22:04:23,Pearson chi2:,1260000.0
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.6231,1.290,2.033,0.042,0.094,5.152
lighting[T.Night with public lighting not lit],0.1052,0.025,4.216,0.000,0.056,0.154
lighting[T.Night with public lighting on],0.1033,0.007,15.024,0.000,0.090,0.117
lighting[T.Night without public lighting],0.3963,0.008,50.035,0.000,0.381,0.412
lighting[T.Twilight or dawn],0.1507,0.009,16.163,0.000,0.132,0.169
localization[T.Out of agglomeration],1.0605,0.005,203.765,0.000,1.050,1.071
weather[T.Dazzling weather],0.3143,0.024,13.370,0.000,0.268,0.360
weather[T.Fog - smoke],0.0855,0.026,3.246,0.001,0.034,0.137
weather[T.Heavy rain],-0.0691,0.019,-3.607,0.000,-0.107,-0.032


In [13]:
# ROC AUC of model 2 against temp_x_train.y, i.e. measured on the training rows
# (res.predict() is called without new data), not on the held-out test set.
roc_auc_score(temp_x_train.y, res.predict())

0.7044569516018537