In [188]:
import math 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression


caracteristiques = pd.read_csv('data/caracteristiques-2019.csv',
                                sep=';', index_col='Num_Acc', 
                                usecols=['Num_Acc','dep', 'lum', 'agg', 'atm','col'], 
                                dtype={'dep' : str})
usagers = pd.read_csv('data/usagers-2019.csv',
                        sep=';', index_col='Num_Acc',
                        usecols=['Num_Acc', 'sexe', 'an_nais', 'grav', 'secu1', 'secu2', 'secu3'])

lieux = pd.read_csv('data/lieux-2019.csv',
                    sep=';', index_col='Num_Acc',
                    usecols=['Num_Acc', 'catr','circ', 'nbv', 'plan', 'surf'])

vehicules = pd.read_csv('data/vehicules-2019.csv',
                        sep=';', index_col='Num_Acc',
                        usecols=['Num_Acc', 'catv' ])

donnes_brut = pd.read_csv('data/donnes.csv', index_col='dep')

In [189]:
# Variable agg (0 : Accident hors agglo, 1 : Accident en Agglo )
caracteristiques['agglo'] = caracteristiques['agg']-1
# Variable nuit (0 : Accident de jour, 1 : Accident de nuit)
caracteristiques['nuit'] = np.where(caracteristiques['lum'] == 1, 0, 1)
# Variable atm_extreme :
#   0 : atmosphère normale lors de l'accident = [normale, pluie légère, temps éblouissant, temps couvert]
#   1 : atmosphère extreme lors de l'accident = [pluie forte, neige/grele, brouillard/fumée, vent fort/tempete]
caracteristiques['atm_extreme'] = np.where((caracteristiques['atm'] < 2) | (caracteristiques['atm'] > 6),0, 1)
# Varriable col_front (0 : collision non frontale, 1: collision frontale)
caracteristiques['col_front'] = np.where(caracteristiques['col'] == 1,1,0)


# Variable auto : (0 : Accident hors autoroute, 1: Accident sur autouroute)
lieux['auto'] = np.where(lieux['catr'] == 1, 1, 0)
# Variable natio (0 : Accident hors nationale, 1: Accident sur nationale)
lieux['natio'] = np.where(lieux['catr'] == 2, 1, 0)
# Variable depart : (0 : Accident hors departementale, 1: Accident sur départementale)
lieux['depart'] = np.where(lieux['catr'] == 3, 1, 0)
# Variable bidirec (0 : Accident sur route non bidirectionnelle, 1 : Accident sur route bidirectionnelle)
lieux['bidirec'] = np.where(lieux['circ'] == 2, 1, 0)
# Variable virage (0 : Accident ayant eu lieu sur un plan rectiligne, 1 : Accident ayant eu lieu sur un plan non rectiligne)
lieux['virage'] = np.where(lieux['plan'] > 1, 1, 0)
# Variable surface_normale (0 : Etat de la surface normale, 1 : Etat de la surface non normale)
lieux ['surface_normale'] = np.where(lieux['surf'] == 1, 1, 0)




# Calcul age
usagers['age'] = 2019 - usagers['an_nais'] 
# 
usagers['femme'] = np.where(usagers['sexe'] == 2, 1, 0)
usagers['homme'] = np.where(usagers['sexe'] == 1, 1, 0)
# 0 : pas mort, 1 : mort
usagers['mort'] = np.where(usagers['grav'] == 2, 1, 0)


df = pd.DataFrame()
# Moyenne d'age par accidents
df['age_moy'] = usagers.age.groupby('Num_Acc').mean()
# Nombre de femmes et homme par accidents
df['femme'] = usagers.femme.groupby('Num_Acc').sum()
df['homme'] = usagers.homme.groupby('Num_Acc').sum()
df['usagers'] = usagers.groupby('Num_Acc').size()
# Nombre de morts par accidents
df['nb_mort'] = usagers.mort.groupby('Num_Acc').sum()


caracteristiques = caracteristiques.join([df, lieux])

data = pd.DataFrame()
# Nombre de morts par départements
data['morts'] = caracteristiques.groupby('dep')['nb_mort'].sum()
# Nombre d'accidents par départements
data['accidents'] = caracteristiques.groupby('dep').size()
# Nombre d'accidents qui ont eu lieu en agglomération
data['agglo'] = caracteristiques.groupby('dep')['agglo'].sum()
# Nombre d'accidents qui ont eu lieu pendant la nuit
data['nuit'] = caracteristiques.groupby('dep')['nuit'].sum()
# Nombre d'accidents ayant eu lieu sous conditions atmosphériques extremes
data['atm_extreme'] = caracteristiques.groupby('dep')['atm_extreme'].sum()
# Nombre d'accidents en collision frontale
data['col_front'] = caracteristiques.groupby('dep')['col_front'].sum()
# Nombre d'accidents impliquant des femmes
data['femme'] = caracteristiques.groupby('dep')['femme'].sum()
# Nombre d'accidents impliquant des hommes
data['homme'] = caracteristiques.groupby('dep')['homme'].sum()
# Nombre d'accidentés
data['usagers'] = caracteristiques.groupby('dep')['usagers'].sum()
# Nombre moyen d'accidentés par accidents
data['usagers_moy'] = caracteristiques.groupby('dep')['usagers'].mean()
# Age moyen des accidentés
data['age_moy'] = caracteristiques.groupby('dep')['age_moy'].mean()
# Nombre d'accidents sur l'autoroute
data['auto'] = caracteristiques.groupby('dep')['auto'].sum()
# Nombre d'accidents sur nationale
data['natio'] = caracteristiques.groupby('dep')['natio'].sum()
# Nombre d'accidents sur départementale
data['depart'] = caracteristiques.groupby('dep')['depart'].sum()
# Nombre d'accidents sur une voie bidirectionelle
data['bidirec'] = caracteristiques.groupby('dep')['bidirec'].sum()
# Nombre d'accidents ayant eu lieu dans des virages
data['virage'] = caracteristiques.groupby('dep')['virage'].sum()
# Nombre de voies moyen par accidents
data['nbv_moy'] = caracteristiques.groupby('dep')['nbv'].mean()
# Nombre d'accidents ayant eu lieu sur une route avec surface normale
data['surface_normale'] = caracteristiques.groupby('dep')['surface_normale'].sum()

data = data.drop(data.index[96:])
display(data)

Unnamed: 0_level_0,morts,accidents,agglo,nuit,atm_extreme,col_front,femme,homme,usagers,usagers_moy,age_moy,auto,natio,depart,bidirec,virage,nbv_moy,surface_normale
dep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,34,474,179,175,93,64,394,750,1144,2.413502,39.942207,57,0,337,359,139,2.111814,353
10,22,384,265,104,56,31,337,546,883,2.299479,38.920244,13,7,143,266,70,2.088542,301
11,36,318,161,107,35,31,285,491,776,2.440252,40.915240,16,5,155,248,73,2.088050,267
12,18,194,75,40,28,33,175,275,450,2.319588,45.864519,11,12,111,159,78,1.989691,156
13,103,2997,1992,1087,195,226,2267,4736,7003,2.336670,38.872651,736,42,470,1499,507,2.386720,2737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,30,1460,748,500,273,83,1055,2329,3384,2.317808,37.982585,305,389,233,556,225,2.876712,1070
92,25,2446,2018,817,371,102,1487,3817,5304,2.168438,39.033907,350,94,1242,1081,256,2.608749,1923
93,30,3130,2327,1047,574,177,1942,5167,7109,2.271246,36.287971,745,0,1358,1321,384,3.204473,2404
94,27,2681,2066,899,408,140,1795,4250,6045,2.254756,38.282028,576,148,1064,1215,312,2.565461,2113


In [190]:
"""reg = sm.ols(formula='morts ~ col_front', data=data).fit()
print(reg.summary())"""

"reg = sm.ols(formula='morts ~ col_front', data=data).fit()\nprint(reg.summary())"

In [191]:
for i in data.iloc[:, 2:]:
    x = data[i]
    X = sm.add_constant(x)

    reg = sm.OLS(data['morts'],X).fit()
    print(reg.summary())


                            OLS Regression Results                            
Dep. Variable:                  morts   R-squared:                       0.072
Model:                            OLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     7.251
Date:                Sun, 06 Nov 2022   Prob (F-statistic):            0.00839
Time:                        16:46:18   Log-Likelihood:                -417.08
No. Observations:                  96   AIC:                             838.2
Df Residuals:                      94   BIC:                             843.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         31.0076      2.184     14.201      0.0

In [192]:
disp = pd.DataFrame(columns=['var', 'coef', 'p_val_coef' 'R', 'p_val_Fstat'])
for i in data.iloc[:, 2:]:
    name = data[i].name
    x = data[i]
    X = sm.add_constant(x)
    reg = sm.OLS(data['morts'],X).fit()
    disp = disp.append({'var' : name, 'coef' : reg.params[name], 'p_val_coef' : reg.pvalues[name], 'R' : reg.rsquared, 'p_val_Fstat' : reg.f_pvalue}, ignore_index=True)
    #print(f'{data[i].name} R = {reg.rsquared} prob(F-stat) ={reg.f_pvalue}')
display(disp)

  disp = disp.append({'var' : name, 'coef' : reg.params[name], 'p_val_coef' : reg.pvalues[name], 'R' : reg.rsquared, 'p_val_Fstat' : reg.f_pvalue}, ignore_index=True)
  disp = disp.append({'var' : name, 'coef' : reg.params[name], 'p_val_coef' : reg.pvalues[name], 'R' : reg.rsquared, 'p_val_Fstat' : reg.f_pvalue}, ignore_index=True)
  disp = disp.append({'var' : name, 'coef' : reg.params[name], 'p_val_coef' : reg.pvalues[name], 'R' : reg.rsquared, 'p_val_Fstat' : reg.f_pvalue}, ignore_index=True)
  disp = disp.append({'var' : name, 'coef' : reg.params[name], 'p_val_coef' : reg.pvalues[name], 'R' : reg.rsquared, 'p_val_Fstat' : reg.f_pvalue}, ignore_index=True)
  disp = disp.append({'var' : name, 'coef' : reg.params[name], 'p_val_coef' : reg.pvalues[name], 'R' : reg.rsquared, 'p_val_Fstat' : reg.f_pvalue}, ignore_index=True)
  disp = disp.append({'var' : name, 'coef' : reg.params[name], 'p_val_coef' : reg.pvalues[name], 'R' : reg.rsquared, 'p_val_Fstat' : reg.f_pvalue}, ignore_index=True

Unnamed: 0,var,coef,p_val_coefR,p_val_Fstat,p_val_coef,R
0,agglo,0.007371,,0.008390548,0.008390548,0.071615
1,nuit,0.026372,,0.0001659737,0.0001659737,0.140723
2,atm_extreme,0.041355,,0.01090424,0.01090424,0.066963
3,col_front,0.353133,,1.202649e-15,1.202649e-15,0.495905
4,femme,0.017129,,6.436411e-06,6.436411e-06,0.195604
5,homme,0.005631,,0.0002774846,0.0002774846,0.131808
6,usagers,0.004295,,9.919417e-05,9.919417e-05,0.149592
7,usagers_moy,53.033075,,0.0009538313,0.0009538313,0.110176
8,age_moy,-3.254614,,0.0001241448,0.0001241448,0.145734
9,auto,0.04592,,0.001518617,0.001518617,0.101962
