# Summary

Dans ce notebook, nous allons mettre en pratique les algorithmes de régression sur un jeu de données relatif aux prix de biens immobiliers à Ames, Iowa.

Voici un résumé des **étapes du travail**:


**Outils:**
- **Collecte et exploration des données**: pandas, polars, missingno, ydata-profiling, seaborn, plotly, ...
- **Modélisation**: lazypredict, sklearn, xgboost, lightgbm, catboost, yellowbrick, ...
- **Optimisation des hyperparamètres**: gridsearch (sklearn), optuna, hyperopt, ...
- **Déploiement**: mlflow, fastapi, evidently, heroku, pythonanywhere, azure webapp, ...

In [1]:
#!pip -q install missingno optuna pendulum loguru ydata_profiling yellowbrick mlflow xgboost tensorflow pycaret boto3

# Librairies

In [8]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell execution, so edits to project code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import boto3
import os
import sys
from pathlib import Path
from typing import Dict, Union


import dill
import matplotlib.pyplot as plt
import missingno as msno
import mlflow
import mlflow.sklearn
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import pendulum
import seaborn as sns


from loguru import logger
from mlflow.models import infer_signature
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (r2_score,
                             mean_squared_error,
                             mean_absolute_percentage_error,
                             max_error,
                            )
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
#from ydata_profiling import ProfileReport
from yellowbrick.regressor import ResidualsPlot


#sys.path.append(str(Path.cwd().parent))
from settings.params import (DATA_DIR_INPUT,
                             DATA_DIR_OUTPUT,
                             MODEL_PARAMS,
                             REPORT_DIR,
                             RUNS_DIR,
                             TIMEZONE,
                             MODEL_DIR,
                             MODEL_NAME
                            )
from src.make_dataset import load_data
from src.trainer import define_pipeline, eval_metrics

from src.utils import (filter_variables_by_completion_rate, 
                       remove_single_modality_categorical_variables,
                       split_dataset,
                       save_object_with_dill,
                       load_dataset
                      )

# Show scikit-learn estimators as HTML diagrams and print every parameter
# (not only the ones that differ from the defaults) in text representations.
set_config(print_changed_only=False, display="diagram")
# Never truncate the column list when a wide DataFrame is displayed.
pd.options.display.max_columns = None

ImportError: attempted relative import beyond top-level package

In [None]:
# Show the kernel's current working directory (IPython automagic for %pwd).
# NOTE(review): looks like a leftover check for the import path problem — confirm it is still needed.
pwd

# Settings

In [3]:
# Loguru record layout: coloured timestamp (the "!UTC" suffix renders it in
# UTC), padded level name, then origin (module:function:line) and the message.
log_fmt = (
    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS!UTC}</green> | <level>{level: <8}</level> | "
    "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - {message}"
)
# A single handler that writes formatted records to standard error.
log_config = dict(handlers=[dict(sink=sys.stderr, format=log_fmt)])
logger.configure(**log_config)


[1]

In [4]:
# Timestamp of this notebook run, expressed in the project-configured timezone.
EXECUTION_DATE = pendulum.now(tz=TIMEZONE)

logger.info(f"Execution date: {EXECUTION_DATE}")
logger.info(f"\nData input directory : {DATA_DIR_INPUT}\nData output directory: {DATA_DIR_OUTPUT}")

# Model parameters: feature list and target column name from the project settings.
FEATURES, TARGET_NAME = MODEL_PARAMS["FEATURES"], MODEL_PARAMS["TARGET"]

[32m2023-08-20 11:10:44.649[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - Execution date: 2023-08-20T11:10:44.649896+00:00
[32m2023-08-20 11:10:44.651[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - 
Data input directory : /Users/ahmadou-bamba/Desktop/TP_house_pricing_CHEIKH_AHMADOU_BAMBA_DIOP/data/input
Data output directory: /Users/ahmadou-bamba/Desktop/TP_house_pricing_CHEIKH_AHMADOU_BAMBA_DIOP/data/output


# Data Collection

In [5]:
# Load the dataset registered under the name "cleaned_data" through the
# project helper src.utils.load_dataset.
# NOTE(review): presumably the output of an earlier cleaning notebook — confirm.
data = load_dataset("cleaned_data")

In [6]:
# Preview the first rows to check that columns and values were loaded as expected.
data.head()

Unnamed: 0,mssubclass,mszoning,lotfrontage,lotarea,street,lotshape,landcontour,utilities,lotconfig,landslope,neighborhood,condition1,condition2,bldgtype,housestyle,overallqual,overallcond,yearbuilt,yearremodadd,roofstyle,roofmatl,exterior1st,exterior2nd,masvnrtype,masvnrarea,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfinsf1,bsmtfintype2,bsmtfinsf2,bsmtunfsf,totalbsmtsf,heating,heatingqc,centralair,electrical,1stflrsf,2ndflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,kitchenqual,totrmsabvgrd,functional,fireplaces,fireplacequ,garagetype,garageyrblt,garagefinish,garagecars,garagearea,garagequal,garagecond,paveddrive,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,miscval,mosold,yrsold,saletype,salecondition,saleprice,building_age,remodel_age
0,60.0,RL,65.0,8450.0,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2003.0,2003.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856.0,854.0,0.0,1710.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,8.0,Typ,0.0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0.0,61.0,0.0,0.0,0.0,0.0,0.0,2.0,2008.0,WD,Normal,208500.0,5.0,5.0
1,20.0,RL,80.0,9600.0,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6.0,8.0,1976.0,1976.0,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262.0,0.0,0.0,1262.0,0.0,1.0,2.0,0.0,3.0,1.0,TA,6.0,Typ,1.0,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,WD,Normal,181500.0,31.0,31.0
2,60.0,RL,68.0,11250.0,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2001.0,2002.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920.0,866.0,0.0,1786.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,6.0,Typ,1.0,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0.0,42.0,0.0,0.0,0.0,0.0,0.0,9.0,2008.0,WD,Normal,223500.0,7.0,6.0
3,70.0,RL,60.0,9550.0,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7.0,5.0,1915.0,1970.0,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961.0,756.0,0.0,1717.0,1.0,0.0,1.0,0.0,3.0,1.0,Gd,7.0,Typ,1.0,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0.0,35.0,272.0,0.0,0.0,0.0,0.0,2.0,2006.0,WD,Abnorml,140000.0,91.0,36.0
4,60.0,RL,84.0,14260.0,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8.0,5.0,2000.0,2000.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145.0,1053.0,0.0,2198.0,1.0,0.0,2.0,1.0,4.0,1.0,Gd,9.0,Typ,1.0,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192.0,84.0,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,WD,Normal,250000.0,8.0,8.0


# Modeling

## Train / test split

In [32]:
# Split the data into train and test features/targets with the project helper
# src.utils.split_dataset, which also logs the resulting shapes.
X_train, X_test, y_train, y_test = split_dataset(data)

[32m2023-08-20 11:10:54.135[0m | [1mINFO    [0m | [36msrc.utils[0m:[36msplit_dataset[0m:[36m93[0m - 
x train: (1168, 20)
Y train: (1168,) 
X test: (292, 20)
Y test: (292,)


In [33]:
# Column names of the full dataset grouped by dtype family:
# object columns are treated as categorical, numeric columns as numerical.
categorical_features = data.select_dtypes(include=["object"]).columns
numerical_features = data.select_dtypes(include=["number"]).columns

In [34]:
# Exclude the target column from the numerical column list.
# errors="ignore" keeps the cell idempotent: running it again once "saleprice"
# has already been removed no longer raises a KeyError.
numerical_features = numerical_features.drop("saleprice", errors="ignore")

In [35]:
# Candidate regressors, all with default hyperparameters: a constant-prediction
# baseline, an ordinary least squares model and a random forest.
dummy_model, linear_model, ensemble_model = (
    DummyRegressor(),
    LinearRegression(),
    RandomForestRegressor(),
)

In [36]:
#Path(RUNS_DIR).mkdir(parents=True, exist_ok=True)

## Pipeline-modeling-training

In [37]:
# MLflow tracking server credentials, read from the environment so they are
# never stored in the notebook. Each value is None when its variable is unset.
ec2_user = os.environ.get("MLFLOW_SERVER_USERNAME")
ec2_pwd = os.environ.get("MLFLOW_SERVER_PASSWORD")

In [118]:
# Host and port of the remote MLflow tracking server.
mlflow_host = "ec2-3-253-117-137.eu-west-1.compute.amazonaws.com:5000"

# Basic-auth credentials belong in the userinfo part of the URL
# (http://user:password@host). Exactly one "@" must separate userinfo from the
# host: an extra "ec2-user@" segment would be folded into the password by URL
# parsers. When the credentials are not set, connect without userinfo instead
# of sending the literal strings "None:None".
if ec2_user and ec2_pwd:
    tracking_uri = "http://{0}:{1}@{2}".format(ec2_user, ec2_pwd, mlflow_host)
else:
    tracking_uri = "http://{0}".format(mlflow_host)

In [119]:
# Point the MLflow client at the remote tracking server for all following calls.
mlflow.set_tracking_uri(tracking_uri)

In [121]:
# Name of the MLflow experiment and sub-path under which the model is logged.
exp_name = "house-pricing"
artifact_path = "mlflow_model"

# Artifact root used when the experiment has to be created. It must be defined
# here, otherwise the creation branch fails with a NameError on a fresh kernel.
# NOTE(review): value taken from the artifact URI reported by previous runs
# ("s3://mlflow010/<run_id>/artifacts") — confirm it is the intended bucket.
s3_bucket = "s3://mlflow010"

# Reuse the experiment when it already exists, create it otherwise.
experiment = mlflow.get_experiment_by_name(exp_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(exp_name, artifact_location=s3_bucket)
else:
    experiment_id = experiment.experiment_id

In [122]:
# Enable MLflow autologging for every supported library that is installed
# (the captured output reports sklearn, tensorflow and statsmodels).
mlflow.autolog()

2023/08/20 17:29:42 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/08/20 17:29:42 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2023/08/20 17:29:42 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


In [129]:
# Train, evaluate and register the house-price model inside a single MLflow run.
with mlflow.start_run(
    # "%M" is minutes; "%m" is the month and must not appear in the time part.
    run_name=f"{EXECUTION_DATE.strftime('%Y%m%d_%H%M%S')}-house_pricing",
    experiment_id=experiment_id,
    tags={"version": "v1", "priority": "P1"},
    description="house price modeling",
) as mlf_run:

    print(f"run_id: {mlf_run.info.run_id}")
    print(f"version tag value: {mlf_run.data.tags.get('version')}")
    print("--")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

    # Number of trees in the forest, logged explicitly as a run parameter.
    estimators = 10
    mlflow.log_param("n_estimators", estimators)

    # Model definition. The sale price is a continuous target, so the final
    # estimator must be a regressor: a classifier would treat every distinct
    # price as its own class.
    reg = define_pipeline(
        numerical_transformer=[SimpleImputer(strategy="median"), RobustScaler()],
        categorical_transformer=[
            SimpleImputer(strategy="constant", fill_value="undefined"),
            OneHotEncoder(drop="if_binary", handle_unknown="ignore"),
        ],
        target_transformer=False,
        estimator=RandomForestRegressor(n_estimators=estimators),
    )

    reg.fit(X_train, y_train)

    # Evaluate metrics on both splits.
    y_train_pred = reg.predict(X_train)
    y_test_pred = reg.predict(X_test)
    train_metrics = eval_metrics(y_train, y_train_pred)
    test_metrics = eval_metrics(y_test, y_test_pred)

    # Log out metrics.
    logger.info(f"Train: {train_metrics}")
    logger.info(f"Test: {test_metrics}")

    # Infer the model signature from the features the pipeline was fitted on:
    # only the columns of X_train, with numerical columns keeping their numeric
    # dtype. Object columns are cast to str so that every value in them is a
    # string.
    # NOTE(review): presumably needed because missing values in object columns
    # prevent MLflow type inference — confirm with the installed MLflow version.
    object_columns = X_train.select_dtypes(include="object").columns
    X_train_df = X_train.astype({column: str for column in object_columns})

    signature = infer_signature(model_input=X_train_df, model_output=y_train_pred)

    # Log the metrics of each split under a "<split>_<metric>" name.
    for group_name, set_metrics in [("train", train_metrics), ("test", test_metrics)]:
        for metric_name, metric_value in set_metrics.items():
            mlflow.log_metric(f"{group_name}_{metric_name}", metric_value)

    # Log the fitted pipeline and register it in the model registry.
    mlflow.sklearn.log_model(
        reg,
        artifact_path=artifact_path,
        signature=signature,
        registered_model_name="RandomForestModel",
    )

run_id: f4bb8d179f2d4713beb00d4c1f664e38
version tag value: v1
--
default artifacts URI: 's3://mlflow010/f4bb8d179f2d4713beb00d4c1f664e38/artifacts'


[32m2023-08-20 19:24:43.570[0m | [1mINFO    [0m | [36msrc.trainer[0m:[36mdefine_pipeline[0m:[36m83[0m - Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                         

## Évaluation du modèle

In [None]:
# afficher les performances des modèles déjà entraînés

## Tuning

In [None]:
# check for the best parameters, implement early stopping also

## Performance analysis

In [None]:
# analyse the model params

## Model validation

In [None]:
# save the best params and test the model on test set

## Business performances

In [None]:
# make predictions on new data

## Data drift

In [2]:
# evaluate data drift, fight it

# Data leakage: attention à la fuite de données
#Les variables importantes alors qu'elles ne seront pas disponibles, à priori lors de la prédiction (futures données)
#exemple: salecondition

## Déploiement

In [None]:
# sauvegarde du modèle en local: dill

In [None]:
# Destination of the serialized model: "<YYYYMMDD>-<MODEL_NAME>" inside MODEL_DIR.
model_file_name = f'{EXECUTION_DATE.strftime("%Y%m%d")}-{MODEL_NAME}'
model_path_name = Path(MODEL_DIR) / model_file_name

In [None]:
# Display the resolved path so the target location can be checked before saving.
model_path_name

In [None]:
# Persist the fitted pipeline locally with the project helper
# src.utils.save_object_with_dill. `reg` is only defined once the training
# cell above has been run.
save_object_with_dill(object_to_save=reg, object_path=model_path_name)