In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import dump, load


In [3]:
# repertoire (relatif) des ficheirs de données
data_path = '../../Data'
models_path = '../../Models'


In [4]:
# récupération du modèle de clustering pour la géolocalisation
clusteringModel = load("{}/clustering_geoloc.joblib".format(models_path))

In [5]:
#chargement des données
annees = [str(x) for x in np.arange(2019,2023)]

carac = pd.DataFrame()
lieux = pd.DataFrame()
usagers = pd.DataFrame()
vehicules = pd.DataFrame()
for annee in annees : 
    carac = pd.concat([carac, pd.read_csv("{}/caracteristiques-{}.csv".format(data_path, annee), sep=';', dtype=object)])
    lieux = pd.concat([lieux, pd.read_csv("{}/lieux-{}.csv".format(data_path, annee), sep=';', dtype=object)])
    usagers = pd.concat([usagers, pd.read_csv("{}/usagers-{}.csv".format(data_path, annee), dtype=object,sep=';')])
    vehicules = pd.concat([vehicules, pd.read_csv("{}/vehicules-{}.csv".format(data_path, annee), sep=';', dtype=object)])


In [6]:
#correction identifiant accident
carac['Num_Acc'] = carac['Num_Acc'].fillna(carac['Accident_Id'])
carac = carac.drop('Accident_Id', axis = 1)

#jointure carac-lieux
df = pd.merge(left = carac, right = lieux, left_on = 'Num_Acc', right_on='Num_Acc')

#jointure df-vehicules
df = pd.merge(left=df, right=vehicules, left_on='Num_Acc', right_on='Num_Acc')

#jointure df-usagers
df = pd.merge(left=df, right=usagers, right_on='id_vehicule', left_on='id_vehicule')

#suppression des identifiants
df = df.drop(['Num_Acc_x', 'id_vehicule', 'num_veh_x', 'Num_Acc_y','num_veh_y', 'id_usager'], axis = 1)

In [14]:
#recodage de la variable d'Interêt
df['grav'] = df['grav'].replace({'1':'0','4':'1','3':'2','2':'3', '-1':'np.nan'})


In [15]:
#suppression des lignes sans variable d'Interet
df = df.dropna(subset=['grav'])

In [16]:
X = df.drop('grav', axis = 1)
y = df['grav']

In [18]:
class RecodeMissingValues(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None

    def fit(self, X, y = None) :
        return self

    def transform(self, X) :
        return X.replace({'-1':np.nan,
                            -1:np.nan,
                            ' -1':np.nan,
                            '#ERREUR':np.nan
                        })



In [19]:
recode_missing_values = RecodeMissingValues()

X = recode_missing_values.fit_transform(X)


In [32]:
class RecodeSecu(BaseEstimator, TransformerMixin) :
    def __init(self):
        return None
    def fit(self, X, y = None) :
        return self
    def transform(self, X) :
        x = X.copy()
        #secu  : un champ binaire par équipement
        for i in range(1,10) :
            x['secu_'+str(i)] = ((x['secu1']==i) | (x['secu2']==i) | (x['secu3']==i))*1
        x = x.drop(['secu1','secu2','secu3'], axis = 1)
        return x

In [42]:
class DropColumns(BaseEstimator, TransformerMixin) :
    def __init__(self, columns_list):
        self.columns = columns_list
        return None
    def fit(self, X, y = None) :
        return self
    def transform(self, X) :
        return X.drop(self.columns, axis = 1)

In [45]:
drop_columns = DropColumns(['adr', 'voie','v1', 'v2', 'lartpc', 'larrout', 'occutc', 'locp', 'actp', 'etatp', 'pr','pr1', 'senc', 'dep'])
drop_columns.fit_transform(X)

Unnamed: 0,jour,mois,an,hrmn,lum,dep,com,agg,int,atm,...,trajet,secu_1,secu_2,secu_3,secu_4,secu_5,secu_6,secu_7,secu_8,secu_9
0,30,11,2019,01:30,4,93,93053,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,30,11,2019,01:30,4,93,93053,1,1,1,...,5,0,0,0,0,0,0,0,0,0
2,30,11,2019,01:30,4,93,93053,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,30,11,2019,02:50,3,93,93066,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,28,11,2019,15:15,1,92,92036,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494177,1,01,2022,08:40,1,81,81099,1,3,1,...,5,0,0,0,0,0,0,0,0,0
494178,1,01,2022,08:40,1,81,81099,1,3,1,...,5,0,0,0,0,0,0,0,0,0
494179,1,01,2022,08:40,1,81,81099,1,3,1,...,5,0,0,0,0,0,0,0,0,0
494180,1,03,2022,16:55,1,41,41018,2,1,1,...,1,0,0,0,0,0,0,0,0,0


In [35]:
recode_secu = RecodeSecu()
X = recode_secu.fit_transform(X)

In [36]:
class DropNa(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None
    
    def fit(self, X, y = None) :
        return self
    
    def transform(self, X) :
        return X.dropna(how = 'any', axis=0)
    


In [46]:
class ConvertCoords(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None
    
    def fit(self, X, y = None) :
        return self
    
    def transform(self, X) :
        x = X.copy()
        x['lat']= x['lat'].str.replace(',','.').astype('float')
        x['long']= x['long'].str.replace(',','.').astype('float')

        return x


In [50]:
class RecodeDatime(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None
    
    def fit(self, X, y = None) :
        return self
    
    def transform(self, X) :
        x = X.copy()
        #créneau horaire
        x['hh'] = x['hrmn'].str[:2].astype('int')
        
        #conversin mois
        x['mois'] = x['mois'].astype(int)

        #ajout d'une variable nbinaire weekend
        x['weekend']=(pd.to_datetime(dict(day=x['jour'], month=x['mois'].astype(int), year=x['an'])).dt.weekday>=5)*1

        #calcul de l'age de la victime dans l'année de l'accident
        x['age'] = x['an'].astype(int) - x['an_nais'].astype(int)

        #suppression des variables inutiles
        x = x.drop(['jour','an', 'an_nais','hrmn'], axis = 1)

        return x

In [1]:
class Geolocalization(BaseEstimator, TransformerMixin) :
    def __init__(self, clustering_model) :
        self.clusteringModel = clustering_model
        return None
    
    def fit(self, X, y=None) :
        return self
    
    def transform(self, X) :
        x = X.copy()
        x['geoloc'] = self.clusteringModel.transform(x[['lat','long']])
        x = x.drop(['lat','long'], axis = 1)
        return x


NameError: name 'BaseEstimator' is not defined

In [None]:
class RecodeAges(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None
    
    def fit(self, X, y=None) :
        return self
    
    def transform(self, X) :
        x = X.copy()
        bins = [0,10,18,25,45,65,999]
        labels=[0,1,2,3,4,5]
        x['classe_age'] = pd.cut(x['age'], bins=bins, labels=labels, include_lowest=True)
        x = x.drop('age', axis = 1)
        return x

0         4
1         4
2         1
3         4
4         1
         ..
494177    1
494178    3
494179    4
494180    3
494181    1
Name: grav, Length: 494182, dtype: object

In [55]:
y.replace({4:9})

0         4
1         4
2         1
3         4
4         1
         ..
494177    1
494178    3
494179    4
494180    3
494181    1
Name: grav, Length: 494182, dtype: object