# Summary

Dans ce notebook, nous allons mettre en pratique les algorithmes de régression sur un jeu de données relatif aux prix de biens immobiliers à Ames, Iowa.

Voici un résumé des **étapes du travail**:


**Outils:**
- **Collecte et exploration des données**: pandas, polars, missingno, ydata-profiling, seaborn, plotly, ...
- **Modélisation**: lazypredict, sklearn, xgboost, lightgbm, catboost, yellowbrick, ...
- **Optimisation des hyperparamètres**: gridsearch (sklearn), optuna, hyperopt, ...
- **Déploiement**: mlflow, fastapi, evidently, heroku, pythonanywhere, azure webapp, ...

In [1]:
# Optional one-off environment setup, kept commented out so a normal run does not
# reinstall packages. NOTE(review): if this is ever re-enabled, prefer `%pip install`
# with pinned versions so the install targets the running kernel's environment.
#!pip -q install missingno optuna pendulum loguru ydata_profiling yellowbrick mlflow xgboost tensorflow pycaret boto3

# Librairies

In [43]:
# reload modules before executing user code.
%reload_ext autoreload
%autoreload 2

import boto3
import os
import sys
import warnings
from pathlib import Path
from typing import Dict, Union

import dill
import matplotlib.pyplot as plt
import missingno as msno
import mlflow
import mlflow.sklearn
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import pendulum
import seaborn as sns
import boto3


from botocore.exceptions import NoCredentialsError
from loguru import logger
from mlflow.models import infer_signature
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (r2_score,
                             mean_squared_error,
                             mean_absolute_percentage_error,
                             max_error,
                            )
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
#from ydata_profiling import ProfileReport
from yellowbrick.regressor import ResidualsPlot
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Put the parent of the current working directory on the import path so that the
# `settings` and `src` packages imported just below can be resolved (presumably
# the project root when the notebook is launched from its own folder — confirm).
# The membership check keeps the cell idempotent: re-running it no longer appends
# a duplicate entry to sys.path each time.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)
from settings.params import (DATA_DIR_INPUT,
                             DATA_DIR_OUTPUT,
                             MODEL_PARAMS,
                             REPORT_DIR,
                             RUNS_DIR,
                             TIMEZONE,
                             MODEL_DIR,
                             MODEL_NAME
                            )
from src.make_dataset import load_data
from src.trainer import define_pipeline, eval_metrics,train_models
from src.optimizer import optimize_model
from src.utils import (filter_variables_by_completion_rate, 
                       remove_single_modality_categorical_variables,
                       split_dataset,
                       save_object_with_dill,
                       load_dataset
                      )

# Notebook-wide display preferences.
# pandas: do not cap the number of columns shown when rendering a DataFrame.
pd.set_option("display.max_columns", None)
# scikit-learn: render estimators as diagrams and print every parameter,
# not only the ones that differ from their defaults.
set_config(
    print_changed_only=False,
    display="diagram",
)

# Settings

In [3]:
# Layout of every log record. The `!UTC` suffix on the time token makes loguru
# print timestamps in UTC rather than local time.
log_fmt = (
    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS!UTC}</green> | <level>{level: <8}</level> | "
    "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - {message}"
)

# A single handler: all records go to stderr using the layout above.
stderr_handler = {"sink": sys.stderr, "format": log_fmt}
log_config = {"handlers": [stderr_handler]}
logger.configure(handlers=log_config["handlers"])


[1]

In [4]:
# Timestamp of this notebook run, taken in the timezone configured in the project settings.
EXECUTION_DATE = pendulum.now(tz=TIMEZONE)
logger.info(f"Execution date: {EXECUTION_DATE}")

# Record which data directories this run reads from and writes to.
directories_message = f"\nData input directory : {DATA_DIR_INPUT}\nData output directory: {DATA_DIR_OUTPUT}"
logger.info(directories_message)

# Model parameters read from the `FEATURES` and `TARGET` entries of the project settings.
FEATURES, TARGET_NAME = MODEL_PARAMS["FEATURES"], MODEL_PARAMS["TARGET"]

[32m2023-08-22 20:15:53.527[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - Execution date: 2023-08-22T20:15:53.527250+00:00
[32m2023-08-22 20:15:53.528[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - 
Data input directory : /Users/ahmadou-bamba/Desktop/MLOPS_PROJECT/notebooks/data/input
Data output directory: /Users/ahmadou-bamba/Desktop/MLOPS_PROJECT/notebooks/data/output


# Data Collection

In [5]:
# Load the dataset registered under the name "cleaned_data" through the project helper
# (presumably the output of an earlier cleaning step — confirm in src.utils.load_dataset).
data = load_dataset("cleaned_data")

In [6]:
# Preview the first rows to sanity-check the loaded columns and values.
# NOTE(review): the rendered preview contains an `Unnamed: 0` column, which usually means
# an index was written to the stored file — confirm it is never used as a feature.
data.head()

Unnamed: 0,mssubclass,mszoning,lotfrontage,lotarea,street,lotshape,landcontour,utilities,lotconfig,landslope,neighborhood,condition1,condition2,bldgtype,housestyle,overallqual,overallcond,yearbuilt,yearremodadd,roofstyle,roofmatl,exterior1st,exterior2nd,masvnrtype,masvnrarea,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfinsf1,bsmtfintype2,bsmtfinsf2,bsmtunfsf,totalbsmtsf,heating,heatingqc,centralair,electrical,1stflrsf,2ndflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,kitchenqual,totrmsabvgrd,functional,fireplaces,fireplacequ,garagetype,garageyrblt,garagefinish,garagecars,garagearea,garagequal,garagecond,paveddrive,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,miscval,mosold,yrsold,saletype,salecondition,saleprice,building_age,remodel_age
0,60.0,RL,65.0,8450.0,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2003.0,2003.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856.0,854.0,0.0,1710.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,8.0,Typ,0.0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0.0,61.0,0.0,0.0,0.0,0.0,0.0,2.0,2008.0,WD,Normal,208500.0,5.0,5.0
1,20.0,RL,80.0,9600.0,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6.0,8.0,1976.0,1976.0,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262.0,0.0,0.0,1262.0,0.0,1.0,2.0,0.0,3.0,1.0,TA,6.0,Typ,1.0,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,WD,Normal,181500.0,31.0,31.0
2,60.0,RL,68.0,11250.0,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2001.0,2002.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920.0,866.0,0.0,1786.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,6.0,Typ,1.0,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0.0,42.0,0.0,0.0,0.0,0.0,0.0,9.0,2008.0,WD,Normal,223500.0,7.0,6.0
3,70.0,RL,60.0,9550.0,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7.0,5.0,1915.0,1970.0,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961.0,756.0,0.0,1717.0,1.0,0.0,1.0,0.0,3.0,1.0,Gd,7.0,Typ,1.0,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0.0,35.0,272.0,0.0,0.0,0.0,0.0,2.0,2006.0,WD,Abnorml,140000.0,91.0,36.0
4,60.0,RL,84.0,14260.0,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8.0,5.0,2000.0,2000.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145.0,1053.0,0.0,2198.0,1.0,0.0,2.0,1.0,4.0,1.0,Gd,9.0,Typ,1.0,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192.0,84.0,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,WD,Normal,250000.0,8.0,8.0


# Modeling

## Train / test split

In [7]:
# Split `data` into train/test feature matrices and target vectors using the project helper.
# NOTE(review): the split ratio, random seed and feature selection are decided inside
# src.utils.split_dataset and are not visible from this notebook.
X_train, X_test, y_train, y_test = split_dataset(data)

[32m2023-08-22 20:15:53.985[0m | [1mINFO    [0m | [36msrc.utils[0m:[36msplit_dataset[0m:[36m96[0m - 
x train: (1168, 20)
Y train: (1168,) 
X test: (292, 20)
Y test: (292,)


In [8]:
# Column names grouped by dtype family; they are handed to the training helper
# further down the notebook.
# NOTE(review): both lists are computed from the full `data` frame, so at this
# point the numeric list still contains the target column.
numerical_features = data.select_dtypes(include=["number"]).columns
categorical_features = data.select_dtypes(include=["object"]).columns

In [9]:
# Numerical predictor columns: every numeric column of `data` except the target.
# The list is derived from `data` rather than from the previous value of
# `numerical_features`, which keeps the cell idempotent: the former self-referential
# drop raised a KeyError as soon as the cell was executed a second time.
# A missing 'saleprice' column still raises, as before.
numerical_features = data.select_dtypes(include="number").columns.drop('saleprice')

## Pipeline-modeling-training

In [10]:
# Basic-auth credentials for the MLflow tracking server, read from the environment.
ec2_user = os.getenv("MLFLOW_SERVER_USERNAME")
ec2_pwd = os.getenv("MLFLOW_SERVER_PASSWORD")

# SECURITY: AWS keys must never be written into the notebook. They are expected in
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY (or another boto3 credential source such
# as ~/.aws/credentials) and are only read here, never assigned.
# The key pair that used to be hardcoded in this cell must be treated as leaked:
# revoke and rotate it in AWS IAM.
_expected_env = (
    "MLFLOW_SERVER_USERNAME",
    "MLFLOW_SERVER_PASSWORD",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
)
_missing_env = [name for name in _expected_env if not os.getenv(name)]
if _missing_env:
    # Warn instead of raising: credentials may legitimately come from another source.
    warnings.warn(
        "Environment variables not set: "
        + ", ".join(_missing_env)
        + ". MLflow tracking or S3 access may fail unless credentials are provided another way."
    )

In [11]:
# Build the MLflow tracking URI with the basic-auth credentials embedded as userinfo.
# NOTE(review): the template contains two "@" separators ("...{1}@ec2-user@ec2-3-...");
# the extra "ec2-user@" looks unintended and may be parsed as part of the password — confirm.
# NOTE(review): the credentials are interpolated without URL-encoding, so characters such
# as "@", ":" or "/" in the username or password would corrupt the URI.
# NOTE(review): the host name is hardcoded; consider moving it to the project settings.
tracking_uri = "http://{0}:{1}@ec2-user@ec2-3-253-117-137.eu-west-1.compute.amazonaws.com:5000".format(ec2_user,ec2_pwd)

In [12]:
# Point the MLflow client at the remote tracking server identified by `tracking_uri`.
mlflow.set_tracking_uri(tracking_uri)

In [13]:
# Name of the MLflow experiment and of the artifact sub-folder used for logged models.
exp_name = "house-pricing"
artifact_path = "model"

# Artifact root used when the experiment has to be created. This name was previously
# referenced without ever being defined, so the creation branch raised a NameError on
# a tracking server where the experiment did not exist yet.
# NOTE(review): the value mirrors the artifact URIs of existing runs shown in the
# training logs (s3://mlflow010/<run_id>/artifacts) — confirm the bucket.
s3_bucket = "s3://mlflow010"

# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(exp_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(exp_name, artifact_location=s3_bucket)
else:
    experiment_id = experiment.experiment_id

In [14]:
# Turn on MLflow autologging for the supported libraries that are installed
# (the cell output reports sklearn, statsmodels and tensorflow).
mlflow.autolog()

2023/08/22 22:15:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/08/22 22:15:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2023/08/22 22:15:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.


In [15]:
# Baseline training run: fit the candidate models on the raw target
# (no target transformer) and keep the metrics / run ids returned per model.
baseline_inputs = (
    data,
    X_train, y_train,
    X_test, y_test,
    categorical_features, numerical_features,
    artifact_path, experiment_id,
)
model_results = train_models(*baseline_inputs, target_transformer=False)

[32m2023-08-22 20:15:57.499[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mdefine_pipeline[0m:[36m97[0m - Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                         

                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                          ...`
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                            ...`
         steps=[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_val

                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                            ...`
                          init=None, learning_rate=0.1, loss='squared_error',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_samples_leaf=1,
                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                          n_estimators=10, n_iter_no_change=None,
                          random_state=None, s...`
         steps=[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               

In [16]:
# Second training run: same inputs as the baseline, but with the target
# transformer switched on, so both variants can be compared afterwards.
transformed_target_inputs = (
    data,
    X_train, y_train,
    X_test, y_test,
    categorical_features, numerical_features,
    artifact_path, experiment_id,
)
model_with_transformer_results = train_models(*transformed_target_inputs, target_transformer=True)

[32m2023-08-22 20:17:03.766[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mdefine_pipeline[0m:[36m97[0m - Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                         

Created version '5' of model 'LinearRegressionModel'.
[32m2023-08-22 20:19:37.901[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mdefine_pipeline[0m:[36m97[0m - Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing

Registered model 'RandomForestModel' already exists. Creating a new version of this model...
2023/08/22 22:20:09 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestModel, version 16
Created version '16' of model 'RandomForestModel'.
[32m2023-08-22 20:20:10.346[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mdefine_pipeline[0m:[36m97[0m - Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                 

[32m2023-08-22 20:20:22.613[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mtrain_models[0m:[36m151[0m - Model: GradientBoosting
[32m2023-08-22 20:20:22.614[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mtrain_models[0m:[36m152[0m - run_id: 0680f3e9e30140b88be9af85a1f95ec8
[32m2023-08-22 20:20:22.615[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mtrain_models[0m:[36m153[0m - version tag value: v1
[32m2023-08-22 20:20:22.616[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mtrain_models[0m:[36m154[0m - --
[32m2023-08-22 20:20:22.991[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mtrain_models[0m:[36m155[0m - default artifacts URI: 's3://mlflow010/0680f3e9e30140b88be9af85a1f95ec8/artifacts'
[32m2023-08-22 20:20:22.992[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mtrain_models[0m:[36m156[0m - Train: {'rmse': 51355.10265433455, 'mae': 31668.180330759056, 'r2': 0.6013470529184028, 'max_error': 456864.1660949373}
[32m2023-08-22 20:20:22.993[0m | 

## Évaluation des modèles

In [17]:
# Display, for each model trained without the target transformer, its train/test
# metrics and the id of the MLflow run that produced them.
model_results

{'LinearRegression': {'train_metrics': {'rmse': 36410.10552514463,
   'mae': 22319.738710703285,
   'r2': 0.7996120276150427,
   'max_error': 514087.1837477549},
  'test_metrics': {'rmse': 29546.5868377435,
   'mae': 22713.028277810656,
   'r2': 0.826853355865427,
   'max_error': 120920.51953596156},
  'run_id': '5fb61144eced4379a3078a58e53beb43'},
 'RandomForest': {'train_metrics': {'rmse': 15351.950811982224,
   'mae': 9267.995662100457,
   'r2': 0.9643749964625764,
   'max_error': 125975.0},
  'test_metrics': {'rmse': 26649.378791438958,
   'mae': 19757.64931506849,
   'r2': 0.8591445615489602,
   'max_error': 108200.0},
  'run_id': '1e855d674bc24e27b969d44cc6a155f0'},
 'GradientBoosting': {'train_metrics': {'rmse': 46085.0315137492,
   'mae': 31662.62326456293,
   'r2': 0.6789685700228276,
   'max_error': 367874.59311958746},
  'test_metrics': {'rmse': 40145.52182316525,
   'mae': 30655.853107624855,
   'r2': 0.6803507757454754,
   'max_error': 151109.82026537543},
  'run_id': '6ff

In [21]:
# Same display for the models trained with the target transformer enabled,
# to compare against the baseline results.
model_with_transformer_results

{'LinearRegression': {'train_metrics': {'rmse': 46590.163952345385,
   'mae': 20821.966861990466,
   'r2': 0.6718924279285152,
   'max_error': 1208155.0889966583},
  'test_metrics': {'rmse': 26560.806990463974,
   'mae': 19418.81318247887,
   'r2': 0.8600792991019943,
   'max_error': 140966.15195289956},
  'run_id': 'f284e065d11d4586b958496de18b4e3b'},
 'RandomForest': {'train_metrics': {'rmse': 14719.883311422936,
   'mae': 8421.127475851947,
   'r2': 0.9672480991825624,
   'max_error': 125782.67973907443},
  'test_metrics': {'rmse': 26569.516036382833,
   'mae': 19171.354932266353,
   'r2': 0.8599875266201196,
   'max_error': 112438.08824016832},
  'run_id': '07000a3b0cc54b8ab945bd0d5232ca50'},
 'GradientBoosting': {'train_metrics': {'rmse': 51355.10265433455,
   'mae': 31668.180330759056,
   'r2': 0.6013470529184028,
   'max_error': 456864.1660949373},
  'test_metrics': {'rmse': 41656.99418017475,
   'mae': 29615.243399228453,
   'r2': 0.6558281872530622,
   'max_error': 182010.1337

In [18]:
# Human-readable report of the baseline results: one section per model,
# train metrics first, then test metrics, each value rounded to 4 decimals.
metric_sections = (
    ("Train Metrics:", "train_metrics"),
    ("Test Metrics:", "test_metrics"),
)
for model_name, metrics_dict in model_results.items():
    print(f"Model: {model_name}")
    for section_title, section_key in metric_sections:
        print(section_title)
        for label, score in metrics_dict[section_key].items():
            print(f"  {label}: {score:.4f}")
    print("\n")

Model: LinearRegression
Train Metrics:
  rmse: 36410.1055
  mae: 22319.7387
  r2: 0.7996
  max_error: 514087.1837
Test Metrics:
  rmse: 29546.5868
  mae: 22713.0283
  r2: 0.8269
  max_error: 120920.5195


Model: RandomForest
Train Metrics:
  rmse: 15351.9508
  mae: 9267.9957
  r2: 0.9644
  max_error: 125975.0000
Test Metrics:
  rmse: 26649.3788
  mae: 19757.6493
  r2: 0.8591
  max_error: 108200.0000


Model: GradientBoosting
Train Metrics:
  rmse: 46085.0315
  mae: 31662.6233
  r2: 0.6790
  max_error: 367874.5931
Test Metrics:
  rmse: 40145.5218
  mae: 30655.8531
  r2: 0.6804
  max_error: 151109.8203




## Tuning

In [49]:
# Fetch from MLflow the RandomForest model logged by the baseline training run;
# it is the starting point for hyperparameter tuning.
rf_model_run_id = model_results['RandomForest']['run_id']
rf_model_uri = f"runs:/{rf_model_run_id}/model"
best_model = mlflow.sklearn.load_model(rf_model_uri)

In [None]:
# Search space for the tuning step. Keys follow scikit-learn's
# `<component>__<parameter>` convention, here addressing a component named
# `estimator` (presumably the final step of the loaded pipeline — confirm).
param_dist = dict([
    ("estimator__n_estimators", [10, 15, 20, 25, 30, 40]),
    ("estimator__max_depth", [None, 10, 20, 30, 35, 40]),
    ("estimator__min_samples_split", [2, 3, 4, 5, 10]),
])

In [44]:
# Tune the loaded model over `param_dist` with the project's optimisation helper.
# Search settings: 100 sampled candidates, 3-fold cross-validation, fixed seed,
# all available cores.
search_settings = {
    "n_iter": 100,
    "cv": 3,
    "random_state": 42,
    "n_jobs": -1,
}
best_estimator, best_params = optimize_model(
    X_train, y_train, best_model, param_dist, **search_settings
)

















Best Parameters: {'estimator__n_estimators': 30, 'estimator__min_samples_split': 4, 'estimator__max_depth': None}
Test Metrics for Optimized Model:
  rmse: 26277.0559
  mae: 18592.1340
  r2: 0.8631
  max_error: 113246.7520


## Best Model performances

In [45]:
# Score the tuned estimator on the held-out test set.
y_test_pred = best_estimator.predict(X_test)
test_metrics = eval_metrics(y_test, y_test_pred)

# Report the selected hyperparameters followed by each test metric (4 decimals).
print("Best Parameters:", best_params)
print("Test Metrics for Optimized Model:")
for label in test_metrics:
    score = test_metrics[label]
    print(f"  {label}: {score:.4f}")

Best Parameters: {'estimator__n_estimators': 30, 'estimator__min_samples_split': 4, 'estimator__max_depth': None}
Test Metrics for Optimized Model:
  rmse: 26277.0559
  mae: 18592.1340
  r2: 0.8631
  max_error: 113246.7520


## Déploiement

In [50]:
# Save the tuned model locally with dill. The file name is the execution date
# (YYYYMMDD) followed by the configured model name, inside the model directory.
date_stamp = EXECUTION_DATE.strftime("%Y%m%d")
model_path_name = Path(MODEL_DIR) / f"{date_stamp}-{MODEL_NAME}"
logger.info(f"Best model path: {model_path_name}")
save_object_with_dill(object_path=model_path_name, object_to_save=best_estimator)

[32m2023-08-23 11:31:44.578[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - Best model path: /Users/ahmadou-bamba/Desktop/MLOPS_PROJECT/notebooks/models/20230822-model_house_pricing.dill


In [51]:
# Destination of the upload: bucket name and key prefix inside that bucket.
s3_bucket_name = "mlflow010"
s3_destination_path = "best_model/"

# AWS credentials are taken from the environment (None when a variable is unset).
aws_access_key = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# S3 client built with those explicit credentials.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
)

# Upload the locally saved model; a missing file or missing credentials is reported
# with a message instead of a traceback, any other error propagates.
s3_object_key = s3_destination_path + "best_model.dill"
try:
    s3_client.upload_file(str(model_path_name), s3_bucket_name, s3_object_key)
except FileNotFoundError:
    print("The file was not found")
except NoCredentialsError:
    print("Credentials not available")
else:
    print("Upload successful")

Upload successful
