In [3]:
import pycaret

import pandas as pd
import numpy as np

from pycaret.utils import version
version()

'2.3.10'

In [4]:
import json
import time

class Timer:
    def __enter__(self, *args, **kwargs):
        self.tick = time.time()
        return self
    
    def __exit__(self, *args, **kwargs):
        self.elapsed = time.time() - self.tick
        
benchmark_list = []

In [10]:
# Begin Boiler Plate Code, will have few quick sample commands to run at the end

# Data file is from https://archive.ics.uci.edu/ml/machine-learning-databases
dataset = pd.read_csv('data/YearPredictionMSD.txt')

In [11]:
# Fixing attribute labels
categorical_feature_names = ['Year']
numeric_feature_names = []
for x in range(1,13):
    numeric_feature_names.append('t_avg_' + str(x)) # These attributes are timbre averages
for x in range(1,79):
    numeric_feature_names.append('t_cov_' + str(x)) # These attributes are timbre covariances
dataset.columns = [*categorical_feature_names, *numeric_feature_names]

dataset.head()

Unnamed: 0,Year,t_avg_1,t_avg_2,t_avg_3,t_avg_4,t_avg_5,t_avg_6,t_avg_7,t_avg_8,t_avg_9,...,t_cov_69,t_cov_70,t_cov_71,t_cov_72,t_cov_73,t_cov_74,t_cov_75,t_cov_76,t_cov_77,t_cov_78
0,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
1,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
2,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
3,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903
4,2001,50.54767,0.31568,92.35066,22.38696,-25.5187,-19.04928,20.67345,-5.19943,3.63566,...,6.59753,-50.69577,26.02574,18.9443,-0.3373,6.09352,35.18381,5.00283,-11.02257,0.02263


In [12]:
# Split Dataset into Training and Test
df = dataset[:463716]
unseen_df = dataset[463716:515346]
unseen_df.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(df.shape))
print('Unseen Data For Predictions: ' + str(unseen_df.shape))

Data for Modeling: (463716, 91)
Unseen Data For Predictions: (51628, 91)


In [18]:
from pycaret.regression import *

exp_reg = setup(data=df, target='Year', session_id=123, normalize=True, use_gpu=True, 
numeric_features=numeric_feature_names, silent=True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Year
2,Original Data,"(463716, 91)"
3,Missing Values,False
4,Numeric Features,90
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(324601, 90)"


In [19]:
with Timer() as elapsed:
    best_models = compare_models(exclude = ['ransac'], n_select = 3)
    
benchmark_payload = {}
benchmark_payload["function"] = "compare models"
benchmark_payload["model"] = "all"
benchmark_payload["processor"] = "cpu"
benchmark_payload["fit_time"] = elapsed.elapsed

benchmark_list.append(benchmark_payload)

IntProgress(value=0, description='Processing: ', max=89)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lr,Linear Regression,6.8081,91.5166,9.5663,0.2356,0.0048,0.0034,1.387
ridge,Ridge Regression,6.8081,91.5166,9.5663,0.2356,0.0048,0.0034,0.174
br,Bayesian Ridge,6.8083,91.5165,9.5663,0.2356,0.0048,0.0034,2.949
lar,Least Angle Regression,6.8089,91.5292,9.567,0.2355,0.0048,0.0034,0.221
huber,Huber Regressor,6.5292,96.0938,9.8026,0.1974,0.0049,0.0033,16.472
omp,Orthogonal Matching Pursuit,7.031,97.0601,9.8518,0.1893,0.005,0.0035,0.181
en,Elastic Net,7.5716,105.6498,10.2785,0.1175,0.0052,0.0038,2.489
lasso,Lasso Regression,7.6678,107.9925,10.3918,0.098,0.0052,0.0039,0.602
llar,Lasso Least Angle Regression,8.1659,119.7255,10.9418,-0.0,0.0055,0.0041,0.188
par,Passive Aggressive Regressor,11.6849,264.4097,16.1819,-1.2102,0.0081,0.0059,1.403


In [None]:
# EXAMPLES - Sample Commands
