# Librairies

In [1]:

from loguru import lgger
from mlflow.models import infer_signature
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder

sys.path.append(str(Path.cwd().parent))
from settings.params import (DATA_DIR_INPUT,
                             DATA_DIR_OUTPUT,
                             MODEL_DIR,
                             MODEL_PARAMS,
                             REPORT_DIR,
                             TIMEZONE,
                             SEED
                            )
from src.make_dataset import load_data
#from src.trainer import Trainer
#from src.utils import save_object_with_dill


set_config(display="diagram", print_changed_only=False)
pd.set_option("display.max_columns", None)

In [None]:
# Column_to_lower
column_to_lower = True

# Settings

In [2]:
EXECUTION_DATE = pendulum.now(tz=TIMEZONE)

logger.info(f"Execution date: {EXECUTION_DATE}")

logger.info(f"\nData input directory : {DATA_DIR_INPUT}\nData output directory: {DATA_DIR_OUTPUT}")

# model parameters
FEATURES = MODEL_PARAMS.get("FEATURES")
TARGET_NAME = MODEL_PARAMS["TARGET"]

[32m2023-07-20 16:50:14.318[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mExecution date: 2023-07-20T16:50:14.318171+00:00[0m
[32m2023-07-20 16:50:14.319[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1m
Data input directory : /Users/ahmadou-bamba/Desktop/TP_house_pricing_CHEIKH_AHMADOU_BAMBA_DIOP/data/input
Data output directory: /Users/ahmadou-bamba/Desktop/TP_house_pricing_CHEIKH_AHMADOU_BAMBA_DIOP/data/output[0m


# Data collection

In [3]:
data = load_data(dataset_name="house_prices", column_to_lower=True)

[32m2023-07-20 16:50:14.400[0m | [1mINFO    [0m | [36msrc.make_dataset[0m:[36mload_data[0m:[36m24[0m - [1m
Args: dataset name: house_prices 
column to lower: True
[32m2023-07-20 16:50:14.567[0m | [1mINFO    [0m | [36msrc.make_dataset[0m:[36mload_data[0m:[36m30[0m - [1mShape of raw input features: (1460, 81)[0m
[32m2023-07-20 16:50:14.568[0m | [1mINFO    [0m | [36msrc.make_dataset[0m:[36mload_data[0m:[36m31[0m - [1mFull description of the dataset
Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

MSSubClass: Identifies the type of

In [4]:
data.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,landslope,neighborhood,condition1,condition2,bldgtype,housestyle,overallqual,overallcond,yearbuilt,yearremodadd,roofstyle,roofmatl,exterior1st,exterior2nd,masvnrtype,masvnrarea,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfinsf1,bsmtfintype2,bsmtfinsf2,bsmtunfsf,totalbsmtsf,heating,heatingqc,centralair,electrical,1stflrsf,2ndflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,kitchenqual,totrmsabvgrd,functional,fireplaces,fireplacequ,garagetype,garageyrblt,garagefinish,garagecars,garagearea,garagequal,garagecond,paveddrive,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2003.0,2003.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856.0,854.0,0.0,1710.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,8.0,Typ,0.0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0.0,61.0,0.0,0.0,0.0,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6.0,8.0,1976.0,1976.0,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262.0,0.0,0.0,1262.0,0.0,1.0,2.0,0.0,3.0,1.0,TA,6.0,Typ,1.0,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2001.0,2002.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920.0,866.0,0.0,1786.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,6.0,Typ,1.0,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0.0,42.0,0.0,0.0,0.0,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7.0,5.0,1915.0,1970.0,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961.0,756.0,0.0,1717.0,1.0,0.0,1.0,0.0,3.0,1.0,Gd,7.0,Typ,1.0,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0.0,35.0,272.0,0.0,0.0,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8.0,5.0,2000.0,2000.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145.0,1053.0,0.0,2198.0,1.0,0.0,2.0,1.0,4.0,1.0,Gd,9.0,Typ,1.0,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192.0,84.0,0.0,0.0,0.0,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   float64
 1   mssubclass     1460 non-null   float64
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   float64
 5   street         1460 non-null   object 
 6   alley          91 non-null     object 
 7   lotshape       1460 non-null   object 
 8   landcontour    1460 non-null   object 
 9   utilities      1460 non-null   object 
 10  lotconfig      1460 non-null   object 
 11  landslope      1460 non-null   object 
 12  neighborhood   1460 non-null   object 
 13  condition1     1460 non-null   object 
 14  condition2     1460 non-null   object 
 15  bldgtype       1460 non-null   object 
 16  housestyle     1460 non-null   object 
 17  overallqual    1460 non-null   float64
 18  overallc

In [6]:
data = data.assign(building_age=lambda dfr: dfr.yrsold - dfr.yearbuilt,
                   remodel_age=lambda dfr: dfr.yrsold - dfr.yearremodadd
                  )
data.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,landslope,neighborhood,condition1,condition2,bldgtype,housestyle,overallqual,overallcond,yearbuilt,yearremodadd,roofstyle,roofmatl,exterior1st,exterior2nd,masvnrtype,masvnrarea,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfinsf1,bsmtfintype2,bsmtfinsf2,bsmtunfsf,totalbsmtsf,heating,heatingqc,centralair,electrical,1stflrsf,2ndflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,kitchenqual,totrmsabvgrd,functional,fireplaces,fireplacequ,garagetype,garageyrblt,garagefinish,garagecars,garagearea,garagequal,garagecond,paveddrive,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice,building_age,remodel_age
0,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2003.0,2003.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856.0,854.0,0.0,1710.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,8.0,Typ,0.0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0.0,61.0,0.0,0.0,0.0,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0,5.0,5.0
1,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6.0,8.0,1976.0,1976.0,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262.0,0.0,0.0,1262.0,0.0,1.0,2.0,0.0,3.0,1.0,TA,6.0,Typ,1.0,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0,31.0,31.0
2,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7.0,5.0,2001.0,2002.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920.0,866.0,0.0,1786.0,1.0,0.0,2.0,1.0,3.0,1.0,Gd,6.0,Typ,1.0,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0.0,42.0,0.0,0.0,0.0,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0,7.0,6.0
3,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7.0,5.0,1915.0,1970.0,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961.0,756.0,0.0,1717.0,1.0,0.0,1.0,0.0,3.0,1.0,Gd,7.0,Typ,1.0,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0.0,35.0,272.0,0.0,0.0,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0,91.0,36.0
4,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8.0,5.0,2000.0,2000.0,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145.0,1053.0,0.0,2198.0,1.0,0.0,2.0,1.0,4.0,1.0,Gd,9.0,Typ,1.0,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192.0,84.0,0.0,0.0,0.0,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0,8.0,8.0


# Hold out

In [7]:
x_train, x_test, y_train, y_test = train_test_split(data.loc[:, FEATURES],
                                                    data.loc[:, TARGET_NAME],
                                                    test_size=MODEL_PARAMS["TEST_SIZE"],
                                                    random_state=SEED,
                                                   )

# Modeling

In [8]:
def eval_metrics(y_actual: Union[pd.DataFrame, pd.Series, np.ndarray],
                 y_pred: Union[pd.DataFrame, pd.Series, np.ndarray]
                 ) -> Dict[str, float]:
    """ Compute evaluation metrics

    Args:
        y_actual: Ground truth (correct) target values
        y_pred: Estimated target values.

    Returns:
        Dict[str, float]: dictionary of evaluation metrics.
            Expected keys are: "rmse", "mae", "r2", "max_error"

    """
    # Root mean squared error
    rmse = mean_squared_error(y_actual, y_pred, squared=False)
    # mean absolute error
    mae = mean_absolute_error(y_actual, y_pred)
    # R-squared: coefficient of determination
    r2 = r2_score(y_actual, y_pred)
    # max error: maximum value of absolute error (y_actual - y_pred)
    maxerror = max_error(y_actual, y_pred)
    return {"rmse": rmse,
            "mae": mae,
            "r2": r2,
            "max_error": maxerror
           }

In [9]:
mlflow.get_tracking_uri()

'file:///Users/ahmadou-bamba/Desktop/TP_house_pricing_CHEIKH_AHMADOU_BAMBA_DIOP/notebooks/mlruns'

In [10]:
def define_pipeline(numerical_transformer: list,
                    categorical_transformer: list,
                    target_transformer,
                    estimator: Pipeline,
                    **kwargs: dict) -> Pipeline:
    """ Define pipeline for modeling

    Args:
        **kwargs:

    Returns:
        Pipeline: sklearn pipeline
    """
    numerical_transformer = make_pipeline(*numerical_transformer)

    categorical_transformer = make_pipeline(*categorical_transformer)

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, make_column_selector(dtype_include=["number"])),
            ("cat", categorical_transformer, make_column_selector(dtype_include=["object", "bool"])),
        ],
        remainder="drop",  # non-specified columns are dropped
        verbose_feature_names_out=False,  # will not prefix any feature names with the name of the transformer
    )
    # Append regressor to preprocessing pipeline.
    # Now we have a full prediction pipeline.
    if target_transformer:
        model_pipe = Pipeline(steps=[("preprocessor", preprocessor),
                                     ("estimator", TransformedTargetRegressor(regressor=estimator,
                                                                              func=np.log,
                                                                              inverse_func=np.exp))])
    
    
    else:
        
        model_pipe = Pipeline(steps=[("preprocessor", preprocessor), ("estimator", estimator)])
        
    logger.info(f"{model_pipe}")
    return model_pipe

In [16]:
# Model definition
reg = define_pipeline(numerical_transformer=[SimpleImputer(strategy="median"),
                                             RobustScaler()],
                      categorical_transformer=[SimpleImputer(strategy="constant", fill_value="undefined"),
                                               OneHotEncoder(drop="if_binary", handle_unknown="ignore")],
                      target_transformer=False,
                      estimator=RandomForestClassifier(n_estimators=30)
                 )

reg

[32m2023-07-20 16:53:44.609[0m | [1mINFO    [0m | [36m__main__[0m:[36mdefine_pipeline[0m:[36m39[0m - [1mPipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                        

In [17]:
reg.fit(x_train, y_train)

# Evaluate Metrics
y_train_pred = reg.predict(x_train)
y_test_pred = reg.predict(x_test)
train_metrics = eval_metrics(y_train , y_train_pred)
test_metrics = eval_metrics(y_test , y_test_pred)

# log out metrics
logger.info(f"Train: {train_metrics}")
logger.info(f"Test: {test_metrics}")

[32m2023-07-20 16:53:46.544[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTrain: {'rmse': 327.3377931463427, 'mae': 13.618150684931507, 'r2': 0.999982965763578, 'max_error': 8300.0}[0m
[32m2023-07-20 16:53:46.545[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mTest: {'rmse': 41357.12709594487, 'mae': 27588.931506849316, 'r2': 0.7313224254038865, 'max_error': 194500.0}[0m


In [19]:
# Create nested runs
# Create an experiment if not exists
exp_name = "house-price"
experiment = mlflow.get_experiment_by_name(exp_name)
if not experiment:
    experiment_id = mlflow.create_experiment(exp_name)
else:
    experiment_id = experiment.experiment_id    

In [27]:
x_train

Unnamed: 0,bsmtfinsf1,bsmtunfsf,condition2,exterqual,foundation,garagecars,garagetype,heating,heatingqc,housestyle,lotarea,masvnrarea,masvnrtype,miscfeature,mssubclass,overallqual,saletype,street,totalbsmtsf,building_age,remodel_age
676,0.0,1095.0,Norm,TA,BrkTil,3.0,2Types,GasW,Fa,2Story,9600.0,0.0,,,70.0,4.0,WD,Pave,1095.0,106.0,56.0
990,1074.0,322.0,Norm,Gd,PConc,3.0,Attchd,GasA,Ex,2Story,9452.0,423.0,BrkFace,,60.0,8.0,WD,Pave,1396.0,9.0,8.0
71,565.0,280.0,Norm,TA,CBlock,2.0,Detchd,GasA,TA,1Story,7599.0,0.0,,,20.0,4.0,WD,Pave,845.0,25.0,1.0
979,651.0,470.0,Norm,TA,PConc,2.0,Detchd,GasA,TA,1Story,8816.0,0.0,,,20.0,5.0,WD,Pave,1121.0,46.0,46.0
946,748.0,102.0,Norm,TA,CBlock,1.0,Attchd,GasA,TA,SLvl,8163.0,128.0,BrkFace,,80.0,5.0,WD,Pave,1144.0,47.0,47.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,578.0,426.0,Norm,TA,CBlock,2.0,Attchd,GasA,Ex,1Story,11241.0,180.0,BrkFace,Shed,20.0,6.0,WD,Pave,1004.0,40.0,40.0
58,0.0,1410.0,Norm,Ex,PConc,3.0,BuiltIn,GasA,Ex,2Story,13682.0,1031.0,BrkFace,,60.0,10.0,New,Pave,1410.0,0.0,0.0
277,120.0,744.0,Norm,TA,CBlock,2.0,Detchd,GasA,Ex,1Story,19138.0,0.0,,,20.0,4.0,WD,Pave,864.0,59.0,59.0
255,0.0,975.0,Norm,Gd,PConc,2.0,BuiltIn,GasA,Ex,2Story,8738.0,302.0,BrkFace,,60.0,7.0,WD,Pave,975.0,7.0,7.0


In [37]:
# Useful for multiple runs (only doing one run in this sample notebook)
with mlflow.start_run(run_name=f"{EXECUTION_DATE.strftime('%Y%m%d_%H%m%S')}-house_price",
                      experiment_id=experiment_id,
                      tags={"version": "v1", "priority": "P1"},
                      description="house price modeling",) as mlf_run:
    print(f"run_id: {mlf_run.info.run_id}")
    print(f"version tag value: {mlf_run.data.tags.get('version')}")
    print("--")

    # Select number of estimator
    estimators = 10 #int(input("Estimator(s): "))
    mlflow.log_param("n_estimators", estimators)
    # Model definition
    reg = define_pipeline(numerical_transformer=[SimpleImputer(strategy="median"),
                                                 RobustScaler()],
                          categorical_transformer=[SimpleImputer(strategy="constant", fill_value="undefined"),
                                                   OneHotEncoder(drop="if_binary", handle_unknown="ignore")],
                          estimator=RandomForestClassifier(n_estimators=estimators),
                          target_transformer=False
                     )

    reg.fit(x_train, y_train)

    # Evaluate Metrics
    y_train_pred = reg.predict(x_train)
    y_test_pred = reg.predict(x_test)
    train_metrics = eval_metrics(y_train , y_train_pred)
    test_metrics = eval_metrics(y_test , y_test_pred)

    # log out metrics
    logger.info(f"Train: {train_metrics}")
    logger.info(f"Test: {test_metrics}")
    
    # Infer model signature
    predictions = reg.predict(x_train)
    #signature = infer_signature(x_train, predictions)

    # Log parameter, metrics, and model to MLflow
    for group_name, set_metrics in [("train", train_metrics),
                                    ("test", test_metrics),
                                   ]:
        for metric_name, metric_value in set_metrics.items():
            mlflow.log_metric(f"{group_name}_{metric_name}", metric_value)
    #mlflow.sklearn.log_model(reg, "model", signature=signature)

run_id: 75d5f9a532c243d4bdd595b88f3a831f
version tag value: v1
--
Estimator(s): 10


[32m2023-07-20 17:11:54.770[0m | [1mINFO    [0m | [36m__main__[0m:[36mdefine_pipeline[0m:[36m39[0m - [1mPipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                        

In [31]:
categorical_features = x_train.select_dtypes(include="object").columns

In [32]:
categorical_features 

Index(['condition2', 'exterqual', 'foundation', 'garagetype', 'heating',
       'heatingqc', 'housestyle', 'masvnrtype', 'miscfeature', 'saletype',
       'street'],
      dtype='object')

In [34]:
x_train.loc[:, categorical_features] = x_train.loc[:, categorical_features].astype(str)

In [38]:
!mlflow ui --host 0.0.0.0 --port 5002

[2023-07-20 17:12:03 +0000] [75719] [INFO] Starting gunicorn 20.1.0
[2023-07-20 17:12:03 +0000] [75719] [INFO] Listening at: http://0.0.0.0:5002 (75719)
[2023-07-20 17:12:03 +0000] [75719] [INFO] Using worker: sync
[2023-07-20 17:12:03 +0000] [75728] [INFO] Booting worker with pid: 75728
[2023-07-20 17:12:03 +0000] [75729] [INFO] Booting worker with pid: 75729
[2023-07-20 17:12:03 +0000] [75732] [INFO] Booting worker with pid: 75732
[2023-07-20 17:12:03 +0000] [75733] [INFO] Booting worker with pid: 75733
^C
[2023-07-20 17:57:53 +0000] [75719] [INFO] Handling signal: int
[2023-07-20 17:57:53 +0000] [75729] [INFO] Worker exiting (pid: 75729)
[2023-07-20 17:57:53 +0000] [75728] [INFO] Worker exiting (pid: 75728)
[2023-07-20 17:57:53 +0000] [75733] [INFO] Worker exiting (pid: 75733)
[2023-07-20 17:57:53 +0000] [75732] [INFO] Worker exiting (pid: 75732)


## Performance analysis

# Save model

In [None]:
model_name = Path(MODEL_DIR, f'{EXECUTION_DATE.strftime("%Y%m%d")}-best-model.dill')
model_name

In [None]:
# sauvargade du modèle en local: dill, joblib (sklearn), pickle, pycaret
model_path_name = Path(MODEL_DIR, f'{EXECUTION_DATE.strftime("%Y%m%d")}-best-model.dill')
save_object_with_dill(object_to_save=reg, object_path=model_path_name)