**Regression Project using PyCaret Python Library**<br>
By: Jose German

# Install requirements

In [1]:
# install pycaret; full installation including dependencies
#pip install pycaret[full]

# install pycaret plus all dependencies
# Visual Studio Code recommends using %pip instead of !pip
#%pip install pycaret # <--------------------------------------

In [2]:
# Install the analysis components of PyCaret, which are needed by the interpret_model function used later in this notebook

#%pip install pycaret[analysis] # <--------------------------------------

In [3]:
# Install MLFlow
# This library manages the ML project; it is also used to track parameters, metrics, hyperparameters, feature parameters, code versions, etc.
#%pip install mlflow # <--------------------------------------

In [4]:
# Gradio library used by PyCaret to create a simple web application

#%pip install gradio
#%pip install gradio==3.50 # <--------------------------------------

In [5]:
# Pydantic library used by PyCaret to build a simple API

#%pip install pydantic==1.10.12 # <--------------------------------------

# Downgraded from pydantic 2.6.0, which produced too many annotation type errors


In [6]:
# May be required to build the dashboard

##%pip install explainerdashboard # <--------------------------------------

# Imports

In [1]:
# Check the installed PyCaret version; it should be 3.0 or newer.
# The bare expression on the last line makes the notebook display the version string.
import pycaret
pycaret.__version__

'3.2.0'

In [2]:
import pandas as pd

# Loading dataset

In [3]:
# Load the CSV dataset into a pandas DataFrame.
# The CSV location can be overridden with the HOUSEPRICE_CSV environment variable so the
# notebook can run on machines other than the author's; when the variable is not set,
# the original author-local path is used unchanged.
import os

#housedata = pd.read_csv('/content/new_train.csv')
HOUSEDATA_CSV = os.environ.get(
    'HOUSEPRICE_CSV',
    'C:\\Users\\joseg\\Documents\\GitHub\\csml1000-project1\\houseprices_modified.csv',
)  # csv path
housedata = pd.read_csv(HOUSEDATA_CSV)

# Preview the first rows
housedata.head()

Unnamed: 0,LotFrontageSF,LotAreaSF,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMaterial,...,GrLivArea,GarageType,GarageYrBlt,GarageCars,GarageSF,GarageQual,WoodDeckSF,OpenPorchSF,MiscFeature,SalePrice
0,65.0,8450,Single Family Detached,2 Story,7,5,2003,2003,Gable,Standard Composite Shingle,...,1710,Attached,2003,2,548,3,0,61,,208500
1,80.0,9600,Single Family Detached,1 Story,6,8,1976,1976,Gable,Standard Composite Shingle,...,1262,Attached,1976,2,460,3,298,0,,181500
2,68.0,11250,Single Family Detached,2 Story,7,5,2001,2002,Gable,Standard Composite Shingle,...,1786,Attached,2001,2,608,3,0,42,,223500
3,60.0,9550,Single Family Detached,2 Story,7,5,1915,1970,Gable,Standard Composite Shingle,...,1717,Detached,1998,3,642,3,0,35,,140000
4,84.0,14260,Single Family Detached,2 Story,8,5,2000,2000,Gable,Standard Composite Shingle,...,2198,Attached,2000,3,836,3,192,84,,250000


# Initial setup

In [5]:
# Initial setup of the regression ML project.
# The commented-out setup call below additionally logs the experiment (log_experiment=True,
# via MLflow) under the experiment name 'houseprice1'.

from pycaret.regression import *
#reg1 = setup(housedata, target='SalePrice', session_id=123, log_experiment=True, experiment_name='houseprice1')

# Same setup as the commented-out line above, but without experiment logging, so MLflow is not needed
reg1 = setup(housedata, target='SalePrice', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(1456, 32)"
4,Transformed data shape,"(1456, 68)"
5,Transformed train set shape,"(1019, 68)"
6,Transformed test set shape,"(437, 68)"
7,Numeric features,23
8,Categorical features,8
9,Rows with missing values,22.9%


# Compare model baselines
The `compare_models` function trains and evaluates the performance of all the estimators available in the model library using cross-validation. The output is a scoring grid with the average cross-validated scores.

In [6]:
# Compare the available estimators using 5 cross-validation folds and keep the returned model
best_model = compare_models(fold=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,16078.4104,576565664.6589,23807.9448,0.906,0.1309,0.0949,0.106
lightgbm,Light Gradient Boosting Machine,16885.0677,654045127.8792,25373.7972,0.8931,0.1377,0.0992,0.112
et,Extra Trees Regressor,17289.0104,670899361.3481,25635.0656,0.8912,0.1423,0.1043,0.142
rf,Random Forest Regressor,17793.3676,730145650.3652,26831.1587,0.8804,0.1463,0.1067,0.224
br,Bayesian Ridge,19485.8968,783138693.6237,27726.7077,0.8717,0.2112,0.1209,0.034
ridge,Ridge Regression,19837.4666,789259008.9103,27858.1809,0.8705,0.2185,0.1237,0.038
llar,Lasso Least Angle Regression,19888.8712,793248849.7724,27927.8244,0.8699,0.2148,0.1242,0.036
lasso,Lasso Regression,19906.1714,794467455.3906,27947.7355,0.8697,0.2154,0.1244,0.296
lr,Linear Regression,19942.6637,796220207.686,27981.6395,0.8694,0.2152,0.1246,0.456
en,Elastic Net,20160.9474,902852107.798,29729.4959,0.8523,0.2077,0.1234,0.044


`compare_models` uses all estimators in the model library except models with `Turbo=False`. `models()` will list all available models.

In [7]:
# Show best model
best_model

# Analyze model
The `plot_model` function analyzes the performance of the trained model on the test set.

In [13]:
# Check help for available plots
#help(plot_model)

In [8]:
# Lets plot residuals
#plot_model(best_model, plot='residuals')

In [9]:
# Lets plot the errors
#plot_model(best_model, plot='error')

In [10]:
# Lets plot feature importance
#plot_model(best_model, plot='feature')

# Create model

The Gradient Boosting Regressor (`gbr`) is used here, as it was the best model in the comparison above.

In [11]:
# Train the model selected by compare_models, using the default fold=10
gbr = create_model(best_model)

# Alternative: create a model by its ID with a specific number of folds
#lightgbm = create_model('lightgbm', fold=3)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,17967.4292,723030319.7446,26889.2231,0.9026,0.1501,0.1043
1,18080.8352,694439971.5315,26352.229,0.8638,0.1396,0.1006
2,15900.0473,594451534.4195,24381.3768,0.8904,0.1337,0.0986
3,13803.9277,313128950.5808,17695.45,0.928,0.1248,0.0966
4,15148.2626,473264664.8364,21754.647,0.9217,0.1404,0.1008
5,15488.2358,577398022.0034,24029.1078,0.8808,0.1323,0.0925
6,19755.5878,1084628543.1915,32933.6992,0.8738,0.1362,0.0998
7,19945.1114,972821606.7218,31190.0883,0.8797,0.1616,0.1139
8,14781.0791,470430359.5946,21689.4066,0.9156,0.1062,0.0792
9,14933.9479,419761283.9333,20488.0766,0.9149,0.1057,0.0815


`compare_models` allows specific models to be compared.

In [12]:
# Compare only the three models listed in `include` (lightgbm, gbr, br)
compare_regression_models = compare_models(include = ['lightgbm','gbr','br'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,16580.4464,632335525.6557,24740.3304,0.8971,0.1331,0.0968,0.121
lightgbm,Light Gradient Boosting Machine,16885.5229,659093465.2598,25343.0882,0.8916,0.1358,0.0985,0.122
br,Bayesian Ridge,19492.7425,780725497.6323,27633.1188,0.8703,0.2075,0.1206,0.039


In [13]:
# Show best model from the top 3 specified in previous cell
compare_regression_models

By default `compare_models` returns the single best-performing model (as seen in the above cell) based on a defined metric.

In [14]:
# Get top 3 model based on MAE
#best_mae_models_top3 = compare_models(sort = 'MAE', n_select=3)

In [15]:
#best_mae_models_top3

# Tune hyperparameters

In [16]:
#tuned_lightgbm = tune_model(lightgbm, n_iter=50, optimize='MAE')

# Default parameters
#tuned_gbr = tune_model(gbr)

In [17]:
# Show tuned hyperparameters
#tuned_gbr

In [26]:
#help(tune_model)

# Ensemble model
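The `ensemble_model` function ensembles a given estimator; the ensembling method can be chosen with the `method` parameter (for example `method='Boosting'`).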

In [18]:
#gbr_dt = ensemble_model(gbr, n_estimators=50)

In [19]:
#gbr_boosted = ensemble_model(gbr, method='Boosting')

In [29]:
# Help will show additional parameters for ensemble_model
#help(ensemble_model)

# Blend models
`blend_models` function trains a `VotingRegressor` for select models passed in the `estimator_list` parameter.

In [20]:
# Top 3 models
#best_mae_models_top3

In [21]:
# Blend top 3 models
#blend_models(best_mae_models_top3)

In [32]:
#help(blend_models)

# Stack models
The stack_models function trains a meta-model over select estimators passed in the estimator_list parameter.

In [22]:
#stack_models(best_mae_models_top3)

In [34]:
#help(stack_models)

# Interpret model
Requires the PyCaret analysis components (`pycaret[analysis]`) listed in the Install requirements section.

In [24]:
#help(interpret_model)

In [25]:
# This function only supports tree-based models, e.g. lightgbm, dt, et, rf

#interpret_model(gbr)

In [26]:
#interpret_model(gbr, plot='correlation')

In [27]:
#interpret_model(gbr, plot='reason', observation=12)

# Get Leaderboard
Returning the leaderboard of all trained models in current setup.

In [23]:
#leadbrd = get_leaderboard()
#leadbrd

# AutoML
Function returns the best model out of all trained models in the current setup based on the optimize parameter.

In [24]:
#automl()

# The model chosen by automl() is the same one that was selected as the best model in the code above

# Dashboard
Dashboard function generates the interactive dashboard for a trained model.

In [32]:
#using lightgbm; some pycaret function don't support gbr

#dashboard(gbr, display_format='inline')

# Predict model
Predict model function allows us to review the actual 'SalePrice' with the 'prediction_label'. Actual prices are very close to predicted.

In [25]:
# Generate predictions with the trained gbr model (no new data is passed to predict_model)
# and preview the first rows, which include the 'prediction_label' column
pred_holdouts = predict_model(gbr)
pred_holdouts.head()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,16042.49,608976514.6424,24677.4495,0.8864,0.132,0.0944


Unnamed: 0,LotFrontageSF,LotAreaSF,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMaterial,...,GarageType,GarageYrBlt,GarageCars,GarageSF,GarageQual,WoodDeckSF,OpenPorchSF,MiscFeature,SalePrice,prediction_label
666,65.0,8125,Single Family Detached,1 Story,6,5,1994,1998,Gable,Standard Composite Shingle,...,Attached,1994,2,575,3,224,42,,193500,198303.622832
633,64.0,6979,Duplex,Split Foyer,6,5,1980,1980,Gable,Standard Composite Shingle,...,Detached,1980,2,576,3,264,56,Shed,144000,141679.370083
161,110.0,13688,Single Family Detached,2 Story,9,5,2003,2004,Gable,Standard Composite Shingle,...,Built-In,2003,3,726,3,400,0,,412500,398114.791978
147,,9505,Single Family Detached,2 Story,7,5,2001,2001,Gable,Standard Composite Shingle,...,Built-In,2001,2,434,3,144,48,,222500,227695.478263
18,66.0,13695,Single Family Detached,1 Story,5,5,2004,2004,Gable,Standard Composite Shingle,...,Detached,2004,2,576,3,0,102,,159000,151102.687279


In [28]:
# Saving the model object with pickle.
#
# The save_model(..., 'houseprice_best_model') call later in this notebook writes
# 'houseprice_best_model.pkl', which is the same file this cell used to write, so the
# pickle produced here was silently overwritten. A distinct file name keeps both artifacts.
# NOTE(review): this pickles the model object held in `best_model` only; presumably it does
# not include the preprocessing pipeline that save_model stores - confirm before using it
# for inference on raw data.

import pickle

with open('houseprice_best_estimator.pkl', 'wb') as estimator_file:
    pickle.dump(best_model, estimator_file)

In [27]:
gbr

# Create app
Create a basic gradio app for inference

In [43]:
#help(create_app)

In [35]:
# Creation of the app is working; however, I think that my dataset contains too many features. Will consider removing some of the less useful ones.

#create_app(gbr)


# Create API
Function takes an input model and creates a POST API for inference.

In [36]:
# Create API
create_api(gbr, api_name = 'housing_price_api')

API successfully created. This function only creates a POST API, it doesn't run it automatically. To run your API, please run this command --> !python housing_price_api.py


In [46]:
# Run API
!python housing_price_api.py

# NameError: name 'nan' is not defined - this error was being generated; I opened housing_price_api.py and edited the data line to removed the nan valued on some features.

# Alternately you can open and run the code in housing_price_api.py file to get more info.
"""
INFO:     Started server process [22896]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:80 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [22896]
"""

Transformation Pipeline and Model Successfully Loaded


Traceback (most recent call last):
  File "c:\Users\joseg\Documents\GitHub\PyCaret-Regression-Python\housing_price_api.py", line 16, in <module>
    input_model = create_model("housing_price_api_input", **{'MSSubClass': 'SPLIT FOYER', 'MSZoning': 'RL', 'LotFrontage': 57.0, 'LotArea': 8846, 'Alley': nan, 'LandContour': 'Lvl', 'LotConfig': 'CulDSac', 'LandSlope': 0, 'BldgType': '1Fam', 'HouseStyle': 'SFoyer', 'OverallQual': 5, 'OverallCond': 5, 'YearBuilt': 1996, 'YearRemodAdd': 1996, 'RoofStyle': 'Gable', 'RoofMatl': 'CompShg', 'MasVnrType': 'None', 'MasVnrArea': 0, 'ExterQual': 4, 'ExterCond': 3, 'Foundation': 'PConc', 'BsmtQual': 4, 'BsmtCond': 3, 'BsmtExposure': 2, 'BsmtFinType1': 6, 'BsmtFinSF1': 298, 'BsmtFinType2': 1, 'BsmtFinSF2': 0, 'BsmtUnfSF': 572, 'TotalBsmtSF': 870, 'Heating': 'GasA', 'HeatingQC': 5, 'CentralAir': 1, 'Electrical': 5, '1stFlrSF': 914, '2ndFlrSF': 0, 'GrLivArea': 914, 'GarageType': 'Detchd', 'GarageYrBlt': 1998, 'GarageFinish': 1, 'GarageCars': 2, 'GarageArea'

'\nINFO:     Started server process [22896]\nINFO:     Waiting for application startup.\nINFO:     Application startup complete.\nINFO:     Uvicorn running on http://127.0.0.1:80 (Press CTRL+C to quit)\nINFO:     Shutting down\nINFO:     Waiting for application shutdown.\nINFO:     Application shutdown complete.\nINFO:     Finished server process [22896]\n'

# Finalize model
The `finalize_model` function trains the provided model on the entire dataset, including the hold-out set.

In [37]:
final_best = finalize_model(best_model)

In [38]:
final_best

# Save/Load model
Save the transformation pipeline and the trained model object as a pickle file.

In [39]:
# Save the model.
# The finalized model (`final_best`, returned by finalize_model and trained on the entire
# dataset including the hold-out set) is the one that should be persisted; the original
# call saved the non-finalized `best_model`, discarding the finalize step.
# The output is written to 'houseprice_best_model.pkl'.
save_model(final_best, 'houseprice_best_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['LotFrontageSF', 'LotAreaSF',
                                              'OverallQual', 'OverallCond',
                                              'YearBuilt', 'YearRemodAdd',
                                              'ExterQual', 'ExterCond',
                                              'BsmtQual', 'BsmtCond',
                                              'BsmtFinSF', 'TotalBsmtSF',
                                              'HeatingQC', 'CentralAir',
                                              '1stFlrSF', '2ndFlrSF',
                                              'GrLivArea', 'GarageYrBlt',
                                              'GarageCars', 'GarageSF',
                                              'Gar...
                  TransformerWrapper(include=['BldgType', 'HouseStyle',
                                              'RoofStyle', 'Roo