In [None]:
%autosave 0
import os
import glob
import numpy 
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import numpy as np 
import tensorflow as tf
from pandas import read_csv
from matplotlib import pyplot
from pandas.plotting import scatter_matrix 
from numpy import set_printoptions
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRFRegressor, XGBRegressor
from sklearn.compose import ColumnTransformer
from datetime import datetime
from mlflow import log_metric, log_param

# Display options: NumPy printing precision 0, pandas display precision 2.
set_printoptions(precision=0)
pd.set_option("display.precision", 2)
# NOTE(review): pd.option_context(...) only builds a context manager; used outside a `with`
# block this statement appears to have no effect on display settings — confirm the intent
# (pd.set_option may have been meant, and '{:0f}' may have been meant as '{:.0f}').
pd.option_context('display.float_format', '{:0f}'.format);

# Seed TensorFlow and NumPy so that runs are repeatable.
seed = 15
tf.random.set_seed(seed)
# Older TensorFlow API, kept for reference: tf.set_random_seed(seed)
numpy.random.seed(seed)

# création du dataset final

récupération de tous les fichiers csv commençant par dataset_final 

In [None]:
# Collect every partial dataset export matching dataset_final_*.csv.
# glob returns paths in arbitrary order, so sort them: the concatenation order (and therefore
# the seeded train/validation split further down) must not depend on the filesystem listing.
extension = 'csv'
all_filenames = sorted(glob.glob('../csv/dataset_final_*.{}'.format(extension)))

concaténation de tous les fichiers récupérés afin de n'en obtenir qu'un seul

In [None]:
# Read every collected CSV file and stack the rows into a single DataFrame.
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
# Write the merged dataset without the row index, as UTF-8 with a BOM ('utf-8-sig').
combined_csv.to_csv( "../csv/dataset_final.csv", index=False, encoding='utf-8-sig')

# préprocessing du dataset

récupération du fichier csv concaténé précédemment

In [None]:
# Load the concatenated CSV into a DataFrame, using its first column as the index.
df_train_full = read_csv('../csv/dataset_final.csv', index_col=0)
# Preview the first rows.
df_train_full.head()

mise en forme du dataframe

In [None]:
# Drop the title and publication-date columns, which the preprocessing pipeline does not use.
# errors='ignore' keeps this cell idempotent: because it reassigns its own input, re-running it
# after the columns are already gone used to raise KeyError.
df_train_full = df_train_full.drop(columns=['titre','publish_date'], errors='ignore')
print(df_train_full.shape)
df_train_full.head(5)

In [None]:
# Summary statistics of the dataset columns.
df_train_full.describe()

split du dataframe en input et output

In [None]:
# Separate the features from the target: 'output' is the value to predict,
# every remaining column is a candidate feature.
X_trainfull = df_train_full.drop(columns=['output'])
Y_trainfull = df_train_full['output']

In [None]:
# Sanity check: target and feature frames should have the same number of rows.
print(Y_trainfull.shape)
print(X_trainfull.shape)

Récupération de la liste des villes, des départements et des régions

In [None]:
# Distinct cities, departments and regions present in the feature set.
list_ville = X_trainfull["city"].unique()
list_departement = X_trainfull["departement"].unique()
list_region = X_trainfull["region"].unique()

In [None]:
# Wrap each list of distinct values in a single-column DataFrame.
df_ville = pd.DataFrame(list_ville, columns = ['list_ville'])
df_departement = pd.DataFrame(list_departement, columns = ['list_departement'])
df_region = pd.DataFrame(list_region, columns = ['list_region'])

frames = [df_ville, df_departement, df_region]

# Place the three columns side by side; if the lists have different lengths, the shorter
# columns are padded with missing values. The result is written to the templates folder.
result = pd.concat(frames, axis=1)
result.to_csv('../templates/list_html.csv', index=False)

# Traitement des variables catégorielles

Sélection de toutes les colonnes concernant les variables catégorielles

In [None]:
# Names of the categorical feature columns.
# NOTE(review): this list is not referenced by the cells visible in this notebook.
cat_columns = ['city', 'departement', 'region', 'type']

création des pipelines onehotencoder afin de transformer les variables catégorielles en quantitatives

In [None]:
# One-hot encoding of 'city', with the category list fixed to the values present in X_trainfull.
city_pipe = Pipeline([('one', OneHotEncoder(categories=[X_trainfull['city'].unique()]))])

In [None]:
# One-hot encoding of 'departement', with the category list fixed to the values present in X_trainfull.
departement_pipe = Pipeline([('one', OneHotEncoder(categories=[X_trainfull['departement'].unique()]))])

In [None]:
# One-hot encoding of 'region', with the category list fixed to the values present in X_trainfull.
region_pipe = Pipeline([('one', OneHotEncoder(categories=[X_trainfull['region'].unique()]))])

In [None]:
# One-hot encoding of 'type', with the category list fixed to the values present in X_trainfull.
type_pipe = Pipeline([('one', OneHotEncoder(categories=[X_trainfull['type'].unique()]))])

# Traitement des valeurs manquantes

Détection de la présence de valeurs manquantes

In [None]:
# For each column, report whether it contains at least one missing value.
df_train_full.isna().any()

Sélection des colonnes de valeurs quantitatives

In [None]:
# Column groups, one list per imputation strategy used below.
# NOTE(review): boolean_columns is defined here but is not passed to the ColumnTransformer
# built later in this notebook — confirm whether these columns should be used as features.
boolean_columns = ['pool', 'cellar', 'garage']
area_column = ['living_area_m2']
ground_column = ['lot_size_m2']
nb_room_column = ['nb_room']
nb_bedroom_column = ['nb_bedroom']

Création des pipelines gérant les valeurs manquantes dans les colonnes de valeurs quantitatives

In [None]:
# Missing living areas are replaced by the median of the column.
area_pipe = Pipeline([('imputer', SimpleImputer(strategy='median'))])

In [None]:
# Missing lot sizes are replaced by the constant 0.
ground_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value=0))])

In [None]:
# Missing room counts are replaced by the constant 1.
room_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value=1))])

In [None]:
# Missing bedroom counts are replaced by the constant 1.
bedroom_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value=1))])

# Construction du pipeline de pré-traitement

In [None]:
# Route each feature column to its dedicated encoding / imputation pipeline.
# NOTE(review): 'pool', 'cellar' and 'garage' (boolean_columns) are not listed below and no
# `remainder` is given, so with ColumnTransformer's default they are presumably dropped
# before reaching the model — confirm this is intended.
preprocess_pipe = ColumnTransformer(transformers=[
    ('city', city_pipe, ['city']),
    ('departement', departement_pipe, ['departement']),
    ('region', region_pipe, ['region']),
    ('type', type_pipe, ['type']),
    ('area', area_pipe, area_column),
    ('ground', ground_pipe, ground_column),
    ('room', room_pipe, nb_room_column),
    ('bedroom', bedroom_pipe, nb_bedroom_column),
])

# Choix du modèle et intégration dans la pipeline

Création de la fonction calculant l'erreur absolue moyenne en pourcentage (MAPE)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values; each one is used as a divisor.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100 and rounded to 2 decimals.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return round(relative_errors.mean() * 100, 2)

Création de la fonction de test de différents modèles

In [None]:
def mae_model(preprocess_pipe, model):
    """Fit ``model`` behind ``preprocess_pipe`` and score it on a train/validation split.

    The notebook-level ``X_trainfull`` / ``Y_trainfull`` are split 80/20 using the
    notebook-level ``seed``. Despite the function name, the score is the MAPE computed by
    ``mean_absolute_percentage_error``, rounded to 0 decimals.

    Parameters
    ----------
    preprocess_pipe : transformer
        Preprocessing step placed in front of the estimator.
    model : estimator
        Regressor to evaluate.

    Returns
    -------
    tuple
        ``(train_score, validation_score)``.
    """
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_trainfull, Y_trainfull, test_size=0.2, random_state=seed
    )

    pipeline = Pipeline(steps=[('pp', preprocess_pipe), ('model', model)])
    pipeline.fit(X_train, Y_train)

    # Score the training subset first, then the validation subset.
    scores = []
    for features, target in ((X_train, Y_train), (X_val, Y_val)):
        predictions = pipeline.predict(features)
        scores.append(round(mean_absolute_percentage_error(target, predictions), 0))

    mae_train, mae_val = scores
    return mae_train, mae_val

Liste des différents modèles

In [None]:
# Candidate regressors, all created with their default hyper-parameters.
lin = LinearRegression()
# LogisticRegression is left out of the comparison: log = LogisticRegression()
tree = DecisionTreeRegressor()
rf = RandomForestRegressor()
bag = BaggingRegressor()
xgb = XGBRegressor()
xgb_rf = XGBRFRegressor()

In [None]:
# Evaluation order; positions in this list are the x positions in the comparison plot.
models = [lin, tree, rf, bag, xgb, xgb_rf]

calcul du MAPE

In [None]:
# Evaluate every candidate and collect its (train score, validation score) pair.
errors = []
for model in models:
    errors.append(mae_model(preprocess_pipe, model))

print(errors)

In [None]:
# Compare the training and validation MAPE of each candidate model.
plt.plot(errors);
# Label the x axis with the estimator class names instead of bare list positions, and name
# both axes so the figure can be read on its own.
plt.xticks(range(len(models)), [type(m).__name__ for m in models], rotation=45);
plt.xlabel('Modèle');
plt.ylabel('MAPE (%)');
plt.grid();
plt.legend(['MAPE sur train', 'MAPE sur val']);

D'après le graphique précédent, nous choisissons de conserver le 5ᵉ modèle de la liste `models`, à savoir XGBRegressor

# Génération du modèle retenu

In [None]:
# Fresh, unfitted instance of the retained estimator (default hyper-parameters).
model = XGBRegressor()

In [None]:
# Final pipeline: preprocessing followed by the retained estimator.
full_pipe = Pipeline(steps=[('pp', preprocess_pipe), ('model', model)])

In [None]:
# Train on the complete dataset (no validation hold-out).
full_pipe.fit(X_trainfull, Y_trainfull);

## exportation du modèle

In [None]:
# suppression du fichier si un ancien modèle existe
if os.path.exists('../models/full_pipe.file'):
    os.remove('../models/full_pipe.file')
else:
    print("Impossible de supprimer le fichier car il n'existe pas")

In [None]:
# Timestamped archive name plus a fixed "latest" name for the exported pipeline.
# str(datetime.today()) contains spaces and colons, which are invalid in Windows file names
# and awkward in shells; use an explicit, filesystem-safe and still sortable format instead.
model_file_name = "../models/full_pipe_" + datetime.today().strftime("%Y-%m-%d_%H-%M-%S") + ".file"
last_model_name = "../models/full_pipe.file"

In [None]:
# Persist the fitted pipeline twice: a timestamped archive copy and the "latest" file.
# The context managers guarantee that each file is flushed and closed, even if pickling fails.
with open(model_file_name, "wb") as model_file:
    pickle.dump(full_pipe, model_file)
with open(last_model_name, "wb") as model_file:
    pickle.dump(full_pipe, model_file)

# Prédictions

## chargement du modèle

In [None]:
# Reload the most recently exported pipeline; the context manager closes the file handle.
# pickle.load can execute arbitrary code contained in the file, so only load model files
# produced by this project.
with open(last_model_name, "rb") as model_file:
    full_pipe = pickle.load(model_file)

In [None]:
# Check the type of the reloaded object.
type(full_pipe)

## prédiction sur trainfull complet

In [None]:
# Display the feature frame used for the predictions below.
X_trainfull

In [None]:
# Predict on the same data the pipeline was fitted on (training error only).
y_train_pred = full_pipe.predict(X_trainfull)

In [None]:
# scikit-learn returns MAPE as a fraction (0.10 means 10 %). Convert it to percent before
# rounding so the value is not collapsed to 0 or 1, and print it with a percent sign
# instead of a currency unit.
mape_train = round(metrics.mean_absolute_percentage_error(Y_trainfull, y_train_pred) * 100, 2)
print('MAPE sur train : ', mape_train, '%')

In [None]:
# Export the predictions and the true targets to two CSV files, both without the row index.
trainfull_pred = pd.DataFrame(y_train_pred)
trainfull_pred.to_csv('../csv/trainfull_pred.csv', index=False)
Y_trainfull.to_csv('../csv/Y_trainfull.csv', index=False)

## prédiction sur valeur unique

In [None]:
# Build a one-row DataFrame with the same columns as X_trainfull to test a single prediction.
# NOTE(review): the last three values are the strings 'True', not booleans — confirm that
# this matches the dtype of the corresponding training columns.
new_value = pd.DataFrame(data = [
    ['Bordeaux', 'Gironde', 'Aquitaine', 'appartement', 54, 12, 1, 1, 'True', 'True', 'True']
    ], index = ['test'], columns = X_trainfull.columns)

# Name the index 'ref' and display the row.
new_value.index.name = 'ref'
new_value

In [None]:
# Predict the target for the single hand-built row.
full_pipe.predict(new_value)

# Enregistrement des résultats si voulus (mettre log = True)
-> accessible sous localhost:3000

In [None]:
# Log the dataset shape and the training error of the exported model to MLflow.
log_param("X_trainfull.shape", X_trainfull.shape)
# `mae_train` only exists as a local variable inside mae_model(), so referencing it here
# raised NameError; the notebook-level training error is `mape_train`. The metric key is
# left unchanged.
log_metric("MAE sur train", mape_train)