In [4]:
import pycaret

import pandas as pd
import numpy as np

from pycaret.utils import version
version()

'2.3.10'

In [5]:
import json
import time

class Timer:
    def __enter__(self, *args, **kwargs):
        self.tick = time.time()
        return self
    
    def __exit__(self, *args, **kwargs):
        self.elapsed = time.time() - self.tick
        
benchmark_list = []

In [3]:
# Begin Boiler Plate Code, will have few quick sample commands to run at the end

# Data file is from https://archive.ics.uci.edu/ml/machine-learning-databases
dataset = pd.read_csv('data/YearPredictionMSD.txt')

In [6]:
# Fixing attribute labels
categorical_feature_names = ['Year']
numeric_feature_names = []
for x in range(1,13):
    numeric_feature_names.append('t_avg_' + str(x)) # These attributes are timbre averages
for x in range(1,79):
    numeric_feature_names.append('t_cov_' + str(x)) # These attributes are timbre covariances
dataset.columns = [*categorical_feature_names, *numeric_feature_names]

dataset.head()

Unnamed: 0,Year,t_avg_1,t_avg_2,t_avg_3,t_avg_4,t_avg_5,t_avg_6,t_avg_7,t_avg_8,t_avg_9,...,t_cov_69,t_cov_70,t_cov_71,t_cov_72,t_cov_73,t_cov_74,t_cov_75,t_cov_76,t_cov_77,t_cov_78
0,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
1,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
2,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
3,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903
4,2001,50.54767,0.31568,92.35066,22.38696,-25.5187,-19.04928,20.67345,-5.19943,3.63566,...,6.59753,-50.69577,26.02574,18.9443,-0.3373,6.09352,35.18381,5.00283,-11.02257,0.02263


In [7]:
# Split Dataset into Training and Test
df = dataset[:463716]
unseen_df = dataset[463716:515346]
unseen_df.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(df.shape))
print('Unseen Data For Predictions: ' + str(unseen_df.shape))

Data for Modeling: (463716, 91)
Unseen Data For Predictions: (51628, 91)


In [7]:
from pycaret.regression import *

exp_reg = setup(data=df, target='Year', session_id=123, normalize=True, use_gpu=False, 
numeric_features=numeric_feature_names, silent=True, fold=3)

IntProgress(value=0, description='Processing: ', max=3)

KeyboardInterrupt: 

In [19]:
with Timer() as elapsed:
    best_models = compare_models(exclude = ['ransac'], n_select = 3)
    
benchmark_payload = {}
benchmark_payload["function"] = "compare models"
benchmark_payload["model"] = "all"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,6.3215,81.4899,9.027,0.3193,0.0045,0.0032,28.678
et,Extra Trees Regressor,6.4573,81.554,9.0306,0.3188,0.0045,0.0032,167.33
rf,Random Forest Regressor,6.526,84.0191,9.1661,0.2982,0.0046,0.0033,433.299
gbr,Gradient Boosting Regressor,6.5796,86.9085,9.3224,0.2741,0.0047,0.0033,849.201
lr,Linear Regression,6.8081,91.5166,9.5663,0.2356,0.0048,0.0034,1.387
ridge,Ridge Regression,6.8081,91.5166,9.5663,0.2356,0.0048,0.0034,0.174
br,Bayesian Ridge,6.8083,91.5165,9.5663,0.2356,0.0048,0.0034,2.949
lar,Least Angle Regression,6.8089,91.5292,9.567,0.2355,0.0048,0.0034,0.221
knn,K Neighbors Regressor,6.8613,92.6931,9.6275,0.2258,0.0048,0.0034,476.197
huber,Huber Regressor,6.5292,96.0938,9.8026,0.1974,0.0049,0.0033,16.472


In [8]:
# TODO run the GPU ones first, to get them completed since they'll run far more quickly...
# Create Model (Linear Regression)

with Timer() as elapsed:
    lr = create_model('lr', fold = 5)

benchmark_payload = {}
benchmark_payload["function"] = "create model cpu"
benchmark_payload["model"] = "lr"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

NameError: name 'create_model' is not defined

In [None]:
# Tune the Linear Regression Model
with Timer() as elasped:
    tuned_lr = tune_model(lr)
    
benchmark_payload = {}
benchmark_payload["function"] = "tune model cpu"
benchmark_payload["model"] = "lr"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

In [None]:
# Ensemble a Model

In [None]:
# Blender Ensemble Model
with Timer() as elapsed:
    # Train individual models to blend

    # excluding xgboost since cannot run on GPU for comparison
    # xgboost = create_model('xgboost', verbose = False)
    lr = create_model('lr', verbose = False)
    knn = create_model('knn', verbose = False)
    
    # Blend individual models
    blender = blend_models(estimator_list = [lr, knn])
    
benchmark_payload = {}
benchmark_payload["function"] = "ensemble - blending cpu"
benchmark_payload["model"] = "Blending lr, knn"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

In [None]:
# Stacking Ensemble Model
with Timer() as elapsed:
    stacker = stack_models(best_models)

benchmark_payload = {}
benchmark_payload["function"] = "ensemble - stacking cpu"
benchmark_payload["model"] = "best_models cpu"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

In [None]:
# Plot Error
plot_model(blender, plot = 'error')
plot_model(stacker, plot = 'error')

In [None]:
# Predict on Hold-Out Sample
with Timer() as elapsed:
    predict_model(stacker);
    
benchmark_payload = {}
benchmark_payload["function"] = "predict model cpu"
benchmark_payload["model"] = "stacker"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

In [None]:
# Finalize Stacker Ensemble Model and predict with it
with Timer() as elapsed:
    final_stacker = finalize_model(stacker)
    
benchmark_payload = {}
benchmark_payload["function"] = "finalize model cpu"
benchmark_payload["model"] = "stacker"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)
with Timer() as elapsed:
    predict_model(final_stacker);

benchmark_payload = {}
benchmark_payload["function"] = "predict model cpu"
benchmark_payload["model"] = "final stacker"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

In [None]:
# Predict on Unseen Data with the Stack Ensemble Model
with Timer() as elapsed:
    unseen_predictions = predict_model(final_stacker, data=unseen_df)
    unseen_predictions.head()
    
benchmark_payload = {}
benchmark_payload["function"] = "predict on unseen cpu"
benchmark_payload["model"] = "final stacker"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

In [9]:
# Try GPU
from pycaret.regression import *

exp_reg = setup(data=df, target='Year', session_id=123, normalize=True, use_gpu=True, 
numeric_features=numeric_feature_names, silent=True, fold=3)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Year
2,Original Data,"(463716, 91)"
3,Missing Values,False
4,Numeric Features,90
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(324601, 90)"


In [10]:
# Exclude all non-GPU models and copy to rest
non_gpu_models = ['ransac', 'huber', 'par', 'ada', 'omp', 'llar']
with Timer() as elapsed:
    best_models = compare_models(exclude=non_gpu_models, n_select=3)
    
benchmark_payload = {}
benchmark_payload["function"] = "compare models gpu"
benchmark_payload["model"] = "all"
benchmark_payload["processor"] = "gpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,6.3258,81.5683,9.0315,0.3187,0.0045,0.0032,22.0367
et,Extra Trees Regressor,6.5117,82.7475,9.0966,0.3088,0.0046,0.0033,127.61
rf,Random Forest Regressor,6.5695,84.9802,9.2185,0.2902,0.0046,0.0033,341.2233
gbr,Gradient Boosting Regressor,6.5846,87.0026,9.3275,0.2733,0.0047,0.0033,613.0867
lr,Linear Regression,6.8083,91.5227,9.5667,0.2355,0.0048,0.0034,1.31
ridge,Ridge Regression,6.8084,91.5227,9.5667,0.2355,0.0048,0.0034,0.2567
br,Bayesian Ridge,6.8087,91.5226,9.5667,0.2355,0.0048,0.0034,2.8333
lar,Least Angle Regression,6.8109,91.552,9.5683,0.2353,0.0048,0.0034,0.2967
knn,K Neighbors Regressor,6.9753,95.4457,9.7696,0.2028,0.0049,0.0035,1275.55
en,Elastic Net,7.5717,105.6523,10.2787,0.1175,0.0052,0.0038,1.0833


In [11]:
# Create Model (Linear Regression)

with Timer() as elapsed:
    lr = create_model('lr', fold=5)

benchmark_payload = {}
benchmark_payload["function"] = "create model"
benchmark_payload["model"] = "lr"
benchmark_payload["processor"] = "gpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.8459,92.2263,9.6035,0.2296,0.0048,0.0034
1,6.792,91.1726,9.5484,0.2365,0.0048,0.0034
2,6.8086,91.7453,9.5784,0.2331,0.0048,0.0034
3,6.8222,92.044,9.594,0.2383,0.0048,0.0034
4,6.7727,90.4165,9.5088,0.2403,0.0048,0.0034
Mean,6.8083,91.5209,9.5666,0.2356,0.0048,0.0034
Std,0.025,0.6577,0.0344,0.0038,0.0,0.0


In [12]:
# Tune the Linear Regression Model
with Timer() as elasped:
    tuned_lr = tune_model(lr)
    
benchmark_payload = {}
benchmark_payload["function"] = "tune model"
benchmark_payload["model"] = "lr"
benchmark_payload["processor"] = "gpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.8277,91.8777,9.5853,0.2312,0.0048,0.0034
1,6.7952,91.3209,9.5562,0.2362,0.0048,0.0034
2,6.8021,91.3695,9.5587,0.2392,0.0048,0.0034
Mean,6.8083,91.5227,9.5667,0.2355,0.0048,0.0034
Std,0.014,0.2518,0.0132,0.0033,0.0,0.0


In [13]:
# Ensemble a Model

In [14]:
# Blender Ensemble Model
with Timer() as elapsed:
    #train individual models to blend
    # xgboost is CPU only
    # xgboost = create_model('xgboost', verbose = False)
    lr = create_model('lr', verbose=False)
    knn = create_model('knn', verbose=False)
    
    #blend individual models
    blender = blend_models(estimator_list = [lr, knn])
    
benchmark_payload = {}
benchmark_payload["function"] = "ensemble - blending"
benchmark_payload["model"] = "Blending lr, knn"
benchmark_payload["processor"] = "gpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

IntProgress(value=0, description='Processing: ', max=6)

Unnamed: 0,Fold,MAE,MSE,RMSE,R2,RMSLE,MAPE


In [None]:
# TODO Fix crash of Python kernel on training
#       of stacked model, and uncomment/run the rest

# Stacking Ensemble Model
# with Timer() as elapsed:
#     stacker = stack_models(best_models)

# benchmark_payload = {}
# benchmark_payload["function"] = "ensemble - stacking"
# benchmark_payload["model"] = "best_models gpu"
# benchmark_payload["processor"] = "gpu"
# benchmark_payload["fit_time"] = elapsed.elapsed

# benchmark_list.append(benchmark_payload)

In [None]:
# Plot Error of Blender & Stacking Models
plot_model(blender, plot = 'error')
# plot_model(stacker, plot = 'error')

IntProgress(value=0, description='Processing: ', max=5)

KeyboardInterrupt: 

<Figure size 576x396 with 0 Axes>

In [None]:
# Predict on Hold-Out Sample
with Timer() as elapsed:
    predict_model(blender);
    
benchmark_payload = {}
benchmark_payload["function"] = "predict model"
benchmark_payload["model"] = "blender"
benchmark_payload["processor"] = "gpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

In [None]:
# Finalize Blender Ensemble Model and predict with it
with Timer() as elapsed:
    final_blender = finalize_model(blender)
    
benchmark_payload = {}
benchmark_payload["function"] = "finalize model"
benchmark_payload["model"] = "blender"
benchmark_payload["processor"] = "gpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)
with Timer() as elapsed:
    predict_model(final_blender);

benchmark_payload = {}
benchmark_payload["function"] = "predict model"
benchmark_payload["model"] = "final stacker"
benchmark_payload["processor"] = "gpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

In [None]:
# Predict on Unseen Data with the Stack Ensemble Model
with Timer() as elapsed:
    unseen_predictions = predict_model(final_blender, data=unseen_df)
    unseen_predictions.head()
    
benchmark_payload = {}
benchmark_payload["function"] = "predict on unseen"
benchmark_payload["model"] = "final blender"
benchmark_payload["processor"] = "gpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

In [None]:
# Write Times to File
outpath = "outputLocal/pycaret_benchmarksCPUvsGPU_justGPU20220501.json"

with open(outpath, "a") as fh:
    fh.write(json.dumps(benchmark_list))
    fh.write("\n")

In [None]:
# EXAMPLE - Sample Compare_models
from pycaret.regression import *
exp_reg = setup(data=df, target ='Year', session_id=632, normalize=True, use_gpu=True)

top3 = compare_models(exclude=non_gpu_models, n_select=3)