* ## **Import tools and primary data**

In [222]:
import pandas as pd
df1 = pd.read_csv("usagers.csv")
df2 = pd.read_csv("vehicules.csv")
df3 = pd.read_csv("caract.csv")
df4 = pd.read_csv("lieux.csv")

* ## **Merge primary data**

In [223]:
data = df1.merge(df2, on = ['id_vehicule', 'Num_Acc', 'num_veh'])
data = data.merge(df3, on = 'Num_Acc')
data = data.merge(df4, on = 'Num_Acc')
data.columns

Index(['Num_Acc', 'id_vehicule', 'num_veh', 'place', 'catu', 'grav', 'sexe',
       'an_nais', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp',
       'senc', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'occutc',
       'jour', 'mois', 'an', 'hrmn', 'lum', 'dep', 'com', 'agg', 'int', 'atm',
       'col', 'adr', 'lat', 'long', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv',
       'vosp', 'prof', 'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf',
       'infra', 'situ', 'vma'],
      dtype='object')

* ## **Missing values handling**

+ ### Dropping variables

In [224]:
# count nan values
nan_values = data.isna().sum()
# percentage of nan values ay column
nan_values = nan_values.sort_values(ascending=False) *100 / data.shape[0]
# columns with nan values greater than 20%
nan_values = nan_values.loc[nan_values > 20]
nan_values

long       100.000000
lartpc      99.648811
larrout     99.375080
occutc      98.587726
v2          92.673921
dtype: float64

In [225]:
# Copy of original data
df = data.copy()
# Drop nan_values criteria columns
df = df.drop(nan_values.to_dict().keys(), axis=1)
# Drop other column not concerned by the process
df = df.drop('lat', axis=1)
df.columns, df.shape

(Index(['Num_Acc', 'id_vehicule', 'num_veh', 'place', 'catu', 'grav', 'sexe',
        'an_nais', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp',
        'senc', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'jour', 'mois',
        'an', 'hrmn', 'lum', 'dep', 'com', 'agg', 'int', 'atm', 'col', 'adr',
        'catr', 'voie', 'v1', 'circ', 'nbv', 'vosp', 'prof', 'pr', 'pr1',
        'plan', 'surf', 'infra', 'situ', 'vma'],
       dtype='object'),
 (132977, 48))

* ### Dropping rows

In [226]:
# Drop rows with any cells with nan values
df = df.dropna(axis=0, how='any')
df.shape

(104994, 48)

===============================================================================

In [227]:
import datetime
# Create age variable from birth year 'an_nais'
df['age'] = datetime.date.today().year - df.an_nais
print('Dimension de df', df.shape)

Dimension de df (104994, 49)


In [228]:
df.sexe = df.sexe.astype('category')
df.sexe.dtypes

CategoricalDtype(categories=[1, 2], ordered=False)

In [229]:
df.columns

Index(['Num_Acc', 'id_vehicule', 'num_veh', 'place', 'catu', 'grav', 'sexe',
       'an_nais', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp',
       'senc', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'jour', 'mois',
       'an', 'hrmn', 'lum', 'dep', 'com', 'agg', 'int', 'atm', 'col', 'adr',
       'catr', 'voie', 'v1', 'circ', 'nbv', 'vosp', 'prof', 'pr', 'pr1',
       'plan', 'surf', 'infra', 'situ', 'vma', 'age'],
      dtype='object')

In [230]:
df.corr()

Unnamed: 0,Num_Acc,place,catu,grav,an_nais,trajet,secu1,secu2,secu3,locp,...,vosp,prof,pr,pr1,plan,surf,infra,situ,vma,age
Num_Acc,1.0,-0.004888,-0.005226,-0.000257,-0.004941,0.012481,0.010437,0.000525,-0.000425,0.004247,...,0.007624,-0.001343,0.00026,-0.01042,0.009125,-0.005562,-0.000184,0.006882,-0.01715,0.004941
place,-0.004888,1.0,0.909641,0.237757,0.031261,0.024174,0.399107,-0.208439,-0.004369,0.761188,...,0.030087,-0.020399,-0.011012,-0.068868,-0.028915,0.007002,0.042378,0.027651,-0.117097,-0.031261
catu,-0.005226,0.909641,1.0,0.250882,0.071126,0.036285,0.341962,-0.201225,-0.009966,0.672248,...,0.021081,-0.011289,-0.004458,-0.055405,-0.012962,0.010067,0.039536,0.022501,-0.087877,-0.071126
grav,-0.000257,0.237757,0.250882,1.0,0.101981,-0.022056,0.191538,0.056955,0.024064,0.177265,...,-0.003853,0.01372,-0.002308,-0.011859,0.043955,0.02525,-0.004132,0.01725,-0.007472,-0.101981
an_nais,-0.004941,0.031261,0.071126,0.101981,1.0,-0.029607,0.031355,0.03245,-0.013323,-0.064767,...,0.003377,0.008557,-0.006153,-0.008713,0.014542,0.01951,-0.007075,0.012289,0.017262,-1.0
trajet,0.012481,0.024174,0.036285,-0.022056,-0.029607,1.0,-0.021218,0.019704,0.007041,0.024864,...,-0.009851,0.021819,0.025725,0.03652,0.036744,-0.006914,0.028851,0.029366,0.024485,0.029607
secu1,0.010437,0.399107,0.341962,0.191538,0.031355,-0.021218,1.0,-0.130033,0.005358,0.342206,...,0.088156,-0.024575,-0.023835,-0.115089,-0.021742,-0.012958,0.040031,0.019033,-0.15844,-0.031355
secu2,0.000525,-0.208439,-0.201225,0.056955,0.03245,0.019704,-0.130033,1.0,0.091061,-0.195746,...,-0.010277,-0.002378,-0.010182,-0.045886,-0.024246,-0.010648,0.016007,-0.075366,-0.019595,-0.03245
secu3,-0.000425,-0.004369,-0.009966,0.024064,-0.013323,0.007041,0.005358,0.091061,1.0,0.02539,...,-0.007989,0.026252,0.001789,0.043828,0.048378,0.009173,-0.000337,0.026634,0.021642,0.013323
locp,0.004247,0.761188,0.672248,0.177265,-0.064767,0.024864,0.342206,-0.195746,0.02539,1.0,...,0.027438,-0.012806,-0.01191,-0.049398,-0.024151,0.005282,0.048824,0.068449,-0.128594,0.064767


In [231]:
df.var().sort_values().head()

an      5.169928e-26
v1      2.015286e-02
agg     2.138907e-01
prof    2.859882e-01
plan    3.757334e-01
dtype: float64

In [232]:
df = df.drop('an', axis=1)

In [259]:
y = df['grav']

features = ['catu','sexe','trajet',
            'catv','an_nais','mois',
            'obs','obsm','choc','manv',
            'lum','agg','int','atm','col',
            'catr','circ','vosp','prof','plan',
            'surf','infra','situ']

In [281]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score

In [266]:
X_data = pd.get_dummies(df[features].astype(str))

In [269]:
# On commence par normaliser les données :

X_N_data = normalize(X_data.values)

In [283]:
# On divise la base en bases d'entraînements et de test :

X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_N_data, y)
# On construit le modèle :

model_rf = RandomForestClassifier(n_estimators=100)
# L'entrînement commence :

model_rf.fit(X_train_rf, y_train_rf)
# On a maintenant les prédictions pour la base de test

predictions_test = model_rf.predict(X_test_rf)
# On calcul de même les prédictions pour la base train

predictions_train = model_rf.predict(X_train_rf)
# Les résultats sont calculés de cette manière :

train_acc = accuracy_score(y_train_rf, predictions_train)
print(train_acc)

test_acc = accuracy_score(y_test_rf, predictions_test)
print(test_acc)

0.9986919804432027
0.6925978132500286


In [282]:
# On redécoupe la base en train/test

X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(X_N_data, y)


# On crée le modèle :

model_boosting = GradientBoostingClassifier(loss="deviance",
    learning_rate=0.2,
    max_depth=5,
    max_features="sqrt",
    subsample=0.95,
    n_estimators=200)

# L'entraînement débute :

model_boosting.fit(X_train_xgb, y_train_xgb)

# On calcul les prédictions
predictions_test_xgb = model_boosting.predict(X_test_xgb)
predictions_train_xgb = model_boosting.predict(X_train_xgb)

# On affiche les résultats :

train_acc = accuracy_score(y_train_xgb, predictions_train_xgb)
print(train_acc)

test_acc = accuracy_score(y_test_xgb, predictions_test_xgb)
print(test_acc)

0.7263572290304147
0.6864261495676026


In [254]:
df.columns

Index(['Num_Acc', 'id_vehicule', 'num_veh', 'place', 'catu', 'grav', 'sexe',
       'an_nais', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp',
       'senc', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'jour', 'mois',
       'hrmn', 'lum', 'dep', 'com', 'agg', 'int', 'atm', 'col', 'adr', 'catr',
       'voie', 'v1', 'circ', 'nbv', 'vosp', 'prof', 'pr', 'pr1', 'plan',
       'surf', 'infra', 'situ', 'vma', 'age'],
      dtype='object')

In [None]:
df['sexe'].value_counts(sort=False) *100 / df.shape[0]

In [233]:
df['grav'].value_counts(sort=False) *100 / df.shape[0]

1    42.647199
2     1.857249
3    12.144503
4    43.351049
Name: grav, dtype: float64

In [235]:
df['lum'].value_counts(sort=False) *100 / df.shape[0]

1    66.321885
2     5.947959
3     8.671924
4     0.967674
5    18.090558
Name: lum, dtype: float64

In [236]:
df['agg'].value_counts(sort=False) *100 / df.shape[0]

1    30.997009
2    69.002991
Name: agg, dtype: float64

In [237]:
df['int'].value_counts(sort=False) *100 / df.shape[0]

1    63.621731
2    13.322666
3    10.762520
4     2.270606
5     0.599082
6     3.387813
7     1.288645
8     0.079052
9     4.667886
Name: int, dtype: float64

In [238]:
df['atm'].value_counts(sort=False) *100 / df.shape[0]

1    79.529306
2    11.444463
3     2.286797
4     0.364783
5     0.440978
6     0.239061
7     1.470560
8     3.784026
9     0.440025
Name: atm, dtype: float64

In [240]:
df['col'].value_counts(sort=True) *100 / df.shape[0]

3    31.860868
6    26.156733
2    14.487495
1     8.251900
4     7.870926
5     5.796522
7     5.575557
Name: col, dtype: float64

In [242]:
df['catr'].value_counts(sort=False) *100 / df.shape[0]

1    12.580719
2     6.937539
3    29.851230
4    46.650285
5     0.107625
6     0.600987
7     2.526811
9     0.744804
Name: catr, dtype: float64

In [250]:
# -1 problem

df['circ'].value_counts(sort=False) *100 / df.shape[0]

 4.0     0.706707
 3.0    17.488618
 1.0    20.433549
-1.0     5.439358
 2.0    55.931768
Name: circ, dtype: float64

In [246]:
df['surf'].value_counts(sort=False) *100 / df.shape[0]

 4.0     0.027621
 8.0     0.140960
 3.0     0.178105
 1.0    80.311256
-1.0     0.002857
 7.0     0.238109
 2.0    18.652494
 5.0     0.121912
 9.0     0.312399
 6.0     0.014287
Name: surf, dtype: float64

In [247]:
df['infra'].value_counts(sort=False) *100 / df.shape[0]

 0.0    82.479951
 4.0     0.445740
 8.0     0.921957
 3.0     1.592472
 1.0     1.644856
-1.0     0.070480
 7.0     0.052384
 2.0     2.152504
 5.0     5.860335
 9.0     3.884031
 6.0     0.895289
Name: infra, dtype: float64

In [253]:
df['catv'].value_counts(sort=False) *100 / df.shape[0]

0      0.190487
1      4.005943
2      2.729680
3      0.985771
7     63.239804
10     7.055641
13     0.348591
14     0.656228
15     0.690516
16     0.030478
80     0.197154
17     0.561937
20     0.202869
21     0.120007
30     3.277330
31     1.663905
32     2.478237
33     6.724194
34     1.374364
35     0.009524
99     0.399070
36     0.069528
37     1.169591
38     0.279064
39     0.027621
40     0.177153
41     0.010477
42     0.036193
43     0.509553
50     0.658133
60     0.120959
Name: catv, dtype: float64

In [234]:
df.columns

Index(['Num_Acc', 'id_vehicule', 'num_veh', 'place', 'catu', 'grav', 'sexe',
       'an_nais', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp',
       'senc', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'jour', 'mois',
       'hrmn', 'lum', 'dep', 'com', 'agg', 'int', 'atm', 'col', 'adr', 'catr',
       'voie', 'v1', 'circ', 'nbv', 'vosp', 'prof', 'pr', 'pr1', 'plan',
       'surf', 'infra', 'situ', 'vma', 'age'],
      dtype='object')

# Questions
## 1. Comment les modalites avec les petites representations vont affecter le modele?
## 2. Quels sont les algos de classifications qu'on peut encore essayer pour ameliorer?
## 3. Prisen en comptes des suggestions se l'article, comment s'y prendre? (https://larevueia.fr/xgboost-vs-random-forest-predire-la-gravite-dun-accident-de-la-route/)
## 4. Suggestions?