In [1]:
import configparser
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing

In [2]:
# Notebook configuration: the random seed and every input/output path are read
# from an INI file so that no location is hard-coded in the cells below.
CONFIG_FILE = 'config3.ini'

config = configparser.ConfigParser()
# ConfigParser.read() does not raise when the file is missing or unreadable: it
# skips it and returns the list of files it actually parsed. Fail here with an
# explicit message instead of an obscure NoSectionError on the first lookup.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(
        f"Configuration file {CONFIG_FILE!r} was not found or could not be read"
    )

# Seed passed to train_test_split so the split is reproducible.
RANDOM_STATE = config.getint('Default', 'RANDOM_STATE')
# Data directory (read for completeness; not referenced by the visible cells).
DIR_DATA = config.get('Paths', 'DIR_DATA')
# Inputs: pickled DataFrame and pickled label dictionary.
PATH_DATAFRAME = config.get('Paths', 'PATH_DATAFRAME')
# Outputs: fitted preprocessor, transformed splits and the schema description.
PATH_DATA_PREPROCESSOR = config.get('Paths', 'PATH_DATA_PREPROCESSOR')
PATH_PROCESSED_DATA = config.get('Paths', 'PATH_PROCESSED_DATA')
PATH_PROCESSED_SCHEMA = config.get('Paths', 'PATH_PROCESSED_SCHEMA')
PATH_LABEL_DICT = config.get('Paths', 'PATH_LABEL_DICT')

In [3]:
# Load the two inputs of this notebook from their pickle files.
# NOTE: pickle.load can execute arbitrary code while deserialising, so these
# paths must only point at files produced by a trusted source.
with open(PATH_DATAFRAME, "rb") as dataframe_file:
    data: pd.DataFrame = pickle.load(dataframe_file)

with open(PATH_LABEL_DICT, "rb") as label_dict_file:
    categories_map: dict = pickle.load(label_dict_file)

In [4]:
# Display the label dictionary loaded from PATH_LABEL_DICT
# (integer label code -> category name, as shown in the output below).
categories_map

{0: 'Citadine',
 1: 'Haut de gamme',
 2: 'Familliale',
 3: 'Economique',
 4: 'Moyen'}

In [6]:
# Preview the first 10 rows of the loaded DataFrame.
data.head(10)

Unnamed: 0,age,sexe,taux,nbEnfantsAcharge,situationFamiliale,2emevoiture,categorie_label,categorie
0,49,F,914,1,En Couple,False,2,Familliale
1,18,M,563,4,En Couple,False,1,Haut de gamme
2,82,M,417,0,Célibataire,False,3,Economique
3,72,M,442,4,En Couple,False,1,Haut de gamme
4,41,M,592,0,Célibataire,False,4,Moyen
5,54,M,438,0,En Couple,False,4,Moyen
6,30,F,736,1,En Couple,False,2,Familliale
7,56,M,1325,2,En Couple,False,1,Haut de gamme
8,73,M,532,0,Célibataire,False,3,Economique
9,31,M,206,0,En Couple,False,3,Economique


In [7]:
# Show the dtype of each column to tell numeric from categorical features.
data.dtypes

age                    int64
sexe                  object
taux                   int64
nbEnfantsAcharge       int64
situationFamiliale    object
2emevoiture             bool
categorie_label        int32
categorie             object
dtype: object

In [8]:
# Train/test split.
# Both target columns are removed from the feature frame: "categorie" is the
# textual form of "categorie_label", so leaving it in X would leak the target
# into the features as soon as a downstream step stops dropping unused columns.
X = data.drop(columns=["categorie_label", "categorie"])
y = data["categorie_label"]

# 20% of the rows are held out for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [9]:
# Preprocessing definition.
# Columns listed here are the only ones kept: ColumnTransformer's default
# remainder="drop" discards every other column of the input frame.
categorical_features = ["situationFamiliale","2emevoiture"]
numeric_features = ["taux","nbEnfantsAcharge","age"]

data_preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that never appears in the rows
        # used for fitting (rare value only present in the test split, or new
        # value at inference time) is encoded as all zeros instead of making
        # transform() raise, which is the encoder's default behaviour.
        (
            "onehot",
            preprocessing.OneHotEncoder(handle_unknown="ignore"),
            categorical_features,
        ),
        # Numeric columns are forwarded unchanged.
        ("passthrough", "passthrough", numeric_features),
    ]
)

In [10]:
# Fit the preprocessor on the training split only, so nothing is learnt
# from the held-out test rows.
data_preprocessor.fit(X_train)

In [11]:
# Apply the already-fitted preprocessor to both splits (train first, then
# test); nothing is re-fitted here.
X_train_transformed, X_test_transformed = (
    data_preprocessor.transform(split) for split in (X_train, X_test)
)

In [12]:
# Schema describing the processed data: which input columns were treated as
# categorical / numeric, plus the label dictionary loaded earlier.
schema = dict(
    features_names=dict(
        categorical=categorical_features,
        numeric=numeric_features,
    ),
    categories_map=categories_map,
)

# Artifacts to persist, written in this order:
#   1. the fitted data_preprocessor object,
#   2. the preprocessed data as (X_train, X_test, y_train, y_test),
#   3. the schema (feature name lists and categories map).
artifacts = (
    (PATH_DATA_PREPROCESSOR, data_preprocessor),
    (PATH_PROCESSED_DATA, (X_train_transformed, X_test_transformed, y_train, y_test)),
    (PATH_PROCESSED_SCHEMA, schema),
)

for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)