In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import dump, load

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline


In [2]:
# repertoire (relatif) des ficheirs de données
data_path = '../../Data'
models_path = '../../Models'


In [3]:
# récupération du modèle de clustering pour la géolocalisation
clusteringModel = load("{}/clustering_geoloc.joblib".format(models_path))
scaler = load("{}/scaler.joblib".format(models_path))

In [4]:
df = pd.read_csv('{0}/accidents_merge.zip'.format(data_path), compression=dict(method='zip', archive_name='accidents_merge.csv'))
df = df.drop(['Num_Acc_x', 'id_vehicule', 'num_veh_x', 'Num_Acc_y','num_veh_y', 'id_usager'], axis = 1)
XX = df.drop('grav', axis = 1)
yy = df['grav']

  df = pd.read_csv('{0}/accidents_merge.zip'.format(data_path), compression=dict(method='zip', archive_name='accidents_merge.csv'))


In [5]:
def replaceByNan(df) :
    df =  df.replace({'-1':np.nan,
                            -1:np.nan,
                            ' -1':np.nan,
                            '#ERREUR':np.nan
                        })
    return df

class RecodeMissingValues(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None

    def fit(self, X, y = None) :
        return self

    def transform(self, X) :
        x = X.copy()
        return replaceByNan(x)



In [6]:
def addSecu(ds) :
    for i in range(1,10) :
        ds['secu_'+str(i)] = ((ds['secu1']==str(i)) | (ds['secu2']==str(i)) | (ds['secu3']==str(i)))*1
    ds = ds.drop(['secu1','secu2','secu3'], axis = 1)
    return ds

class RecodeSecu(BaseEstimator, TransformerMixin) :
    def __init__(self):
        return None
    
    def fit(self, X, y = None) :
        return self
    
    def transform(self, X) :
        #secu  : un champ binaire par équipement
        x = X.copy()
        return addSecu(x)



In [7]:
class DropColumns(BaseEstimator, TransformerMixin) :
    def __init__(self, columns_list):
        self.columns = columns_list
        return None
    def fit(self, X, y = None) :
        return self
    def transform(self, X) :
        x = X.copy()
        x = x.drop(self.columns, axis = 1)
        return x

In [8]:
class DropNa(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None
    
    def fit(self, X, y = None) :
        return self
    
    def transform(self, X) :
        x= X.copy()
        x = x.dropna(how = 'any', axis=0)
        return x
    


In [9]:
class ConvertCoords(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None
    
    def fit(self, X, y = None) :
        return self
    
    def transform(self, X) :
        x = X.copy()
        x['lat']= x['lat'].str.replace(',','.').astype('float')
        x['long']= x['long'].str.replace(',','.').astype('float')

        return x


In [10]:
def recodeDateTime(ds) :
    ds['hh'] = ds['hrmn'].str[:2].astype('int')
        
    #conversin mois
    ds['mois'] = ds['mois'].astype(int)

    #ajout d'une variable nbinaire weekend
    ds['weekend']=(pd.to_datetime(dict(day=ds['jour'], month=ds['mois'].astype(int), year=ds['an'])).dt.weekday>=5)*1

    #calcul de l'age de la victime dans l'année de l'accident
    ds['age'] = ds['an'].astype(int) - ds['an_nais'].astype(int)

    #suppression des variables inutiles
    ds = ds.drop(['jour','an', 'an_nais','hrmn'], axis = 1)

    return ds

class RecodeDatetime(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None
    
    def fit(self, X, y = None) :
        return self
    
    def transform(self, X) :
        x = X.copy()
        
        x = recodeDateTime(x)

        return x

In [11]:
def addGeoloc(df, model) :
    df['geoloc'] = model.predict(df[['lat','long']])
    df = df.drop(['lat','long'], axis = 1) 
    return df                              


class Geolocalization(BaseEstimator, TransformerMixin) :
    def __init__(self, clustering_model) :
        self.clusteringModel = clustering_model
        return None
    
    def fit(self, X, y=None) :
        return self
    
    def transform(self, X) :
        x= X.copy()
        x = addGeoloc(x, self.clusteringModel)
        return x


In [12]:
def classesAges(ds) :
    bins = [0,9,17,24,44,64,999]
    labels=[0,1,2,3,4,5]
    ds['classe_age'] = pd.cut(ds['age'], bins=bins, labels=labels, include_lowest=True)
    ds = ds.drop('age', axis = 1)
    return ds

class RecodeAges(BaseEstimator, TransformerMixin) :
    def __init__(self) :
        return None
    
    def fit(self, X, y=None) :
        return self
    
    def transform(self, X) :
        x = X.copy()
        x = classesAges(x)
        return x

In [13]:
def binarisation(ds, cols) :

    ds[cols] = ds[cols].astype(int)

    encoder = OneHotEncoder(sparse_output=False)
    transformer = make_column_transformer((encoder, cols), remainder='passthrough', verbose_feature_names_out=False) 
    transformed=transformer.fit_transform(ds)
    ds = pd.DataFrame(transformer.transform(ds), columns=transformer.get_feature_names_out(), index=ds.index)

    return ds


class Binarisation(BaseEstimator, TransformerMixin) :
    def __init__(self, cat_cols) :
        self.catCols = cat_cols
        return None
    
    def fit(self, X, y=None) :
        return self
    
    def transform(self, X) :
        x = X.copy()
        
        x = binarisation(x, self.catCols)

        return x

In [29]:
def scaling(ds, cols, scaler) :
    ds[cols] = scaler.transform(ds[cols])
    
    return ds

class Scaling(BaseEstimator, TransformerMixin) :
    def __init__(self, num_cols, scaler) :
        self.scaler = scaler
        self.numCols = num_cols
        return None
    
    def fit(self, X, y=None) :
        return self
    
    def transform(self, X) :
        x = X.copy()
        x = scaling(x, self.numCols, self.scaler)
        return x

In [15]:
cat_cols=['mois', 'lum', 'agg', 'int', 'atm', 'col', 'catr', 'circ', 'vosp', 'prof', 'plan', 
          'surf', 'infra', 'situ', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 
          'place', 'catu', 'trajet','sexe','hh','weekend','geoloc','classe_age']
num_cols = ['nbv','vma']

In [30]:
pipe = Pipeline(
    steps = [
        ('Recodage Valeurs Manquantes', RecodeMissingValues()),
        ('Recodage des équipements de sécurité',RecodeSecu()),
        ('Suppression Colonnes',DropColumns(['adr', 'voie','v1', 'v2', 'lartpc', 'larrout', 'occutc', 'locp', 'actp', 'etatp', 'pr','pr1', 'senc', 'dep','com'])),
        ('Suppression des lignes avec données manquantes',DropNa()),
        ('Conversion des coordonnées géographiques', ConvertCoords()),
        ('Recodage des variables temporelles', RecodeDatetime()),
        ('Clusterisation des géolocalisation',Geolocalization(clusteringModel)),
        ('Tranches d\'ages',RecodeAges()),
        ('Binarisation des variables catégorielle',Binarisation(cat_cols)),
        ('Standardisation des variables numériques', Scaling(num_cols, scaler))
    ]
    
)

In [31]:
X1=XX.copy()
X1 = X1.loc[:100]

In [32]:
X1

Unnamed: 0,jour,mois,an,hrmn,lum,dep,com,agg,int,atm,...,catu,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,30,11,2019,01:30,4,93,93053,1,1,1,...,2,2,2002.0,0,1,0,-1,-1,-1,-1
1,30,11,2019,01:30,4,93,93053,1,1,1,...,1,2,1993.0,5,1,0,-1,-1,-1,-1
2,30,11,2019,01:30,4,93,93053,1,1,1,...,1,1,1959.0,0,1,0,-1,-1,-1,-1
3,30,11,2019,02:50,3,93,93066,1,1,1,...,1,2,1994.0,0,1,0,-1,-1,-1,-1
4,28,11,2019,15:15,1,92,92036,1,1,1,...,1,1,1996.0,0,1,0,-1,-1,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,28,11,2019,12:10,1,51,51454,2,1,2,...,1,2,1986.0,1,1,0,-1,-1,0,-1
97,28,11,2019,06:55,5,51,51454,2,1,1,...,1,1,1965.0,4,8,7,-1,-1,0,-1
98,28,11,2019,19:12,5,51,51108,2,1,1,...,1,1,1998.0,4,1,0,-1,-1,0,-1
99,28,11,2019,19:12,5,51,51108,2,1,1,...,3,2,2002.0,9,0,-1,-1,1,3,1


In [33]:
#print(XX['lat'])

X2 = pipe.transform(X1)


  df =  df.replace({'-1':np.nan,


In [28]:
scaling(X2, num_cols, scaler)

Unnamed: 0,mois_11,lum_1,lum_2,lum_3,lum_4,lum_5,agg_1,agg_2,int_1,int_2,...,vma,secu_1,secu_2,secu_3,secu_4,secu_5,secu_6,secu_7,secu_8,secu_9
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.312586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.312586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.312586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.312586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.127232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,-0.502061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,-0.502061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,-0.502061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,-0.502061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
X2

Unnamed: 0,mois_11,lum_1,lum_2,lum_3,lum_4,lum_5,agg_1,agg_2,int_1,int_2,...,vma,secu_1,secu_2,secu_3,secu_4,secu_5,secu_6,secu_7,secu_8,secu_9
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.312586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.312586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.312586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.312586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.127232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,-0.502061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,-0.502061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,-0.502061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,-0.502061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
dump(pipe, "{}/pipeline.joblib".format(models_path))

['../../Models/pipeline.joblib']