## PyCaret CPU vs. GPU Benchmarking
-------

## Import Libraries

In [None]:
import pycaret

import pandas as pd
import numpy as np

import time

from pycaret.utils import version
version()

## Timing

In [None]:
import json
import time

class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Usage::

        with Timer() as t:
            do_work()
        print(t.elapsed)  # seconds, as a float

    Attributes:
        tick: Clock reading taken when the block is entered.
        elapsed: Seconds between entering and leaving the block. Set on
            exit, including when the body raises.
    """

    def __enter__(self, *args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed (or go negative) if the system clock is
        # adjusted while the body is running, unlike time.time().
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        self.elapsed = time.perf_counter() - self.tick
        # Implicitly returns None, so exceptions raised in the body propagate.


# Accumulates one payload dict per timed step; written to disk at the end.
benchmark_list = []

## Get Data

The dataset we used can be found [here](https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD).

In [None]:
# The UCI YearPredictionMSD file ships without a header row. header=None keeps
# the first record as data instead of consuming it as column labels; the
# columns are given proper names right after loading.
dataset = pd.read_csv('YearPredictionMSd.txt', header=None)

In [None]:
# Assign readable column labels: the target first, then the timbre features.
names = ['Year']
names += [f't_avg_{idx}' for idx in range(1, 13)]  # 12 timbre averages
names += [f't_cov_{idx}' for idx in range(1, 79)]  # 78 timbre covariances
dataset.columns = names

dataset.head()

Withhold the tail of the original dataset (every row after the first 463,716, roughly 51,600 records) to be used for predictions (not to be confused with train/test split).

In [None]:
# Positional split: the first 463,716 rows are used for modeling; the rows from
# position 463,716 up to (at most) 515,346 are held back as unseen data and
# re-indexed from zero.
df = dataset.iloc[:463716]
unseen_df = dataset.iloc[463716:515346].reset_index(drop=True)

print(f'Data for Modeling: {df.shape}')
print(f'Unseen Data For Predictions: {unseen_df.shape}')

### Set up Environment in PyCaret

To record CPU times, keep `use_gpu=False`, and to record GPU times, set it to `True`. Be sure to update the labels in the timing module at the end of each cell to match what's being recorded.

In [None]:
from pycaret.regression import *
# Initialise the PyCaret regression experiment on the modeling data.
# Flip use_gpu to True when recording GPU timings.
exp_reg = setup(
    data=df,
    target='Year',
    session_id=123,
    normalize=True,
    use_gpu=False,
)

## Compare All Models

Not all models can be run on GPU, so even when `use_gpu=True`, those that cannot be run on GPU will automatically be run on CPU. To compare the times of only those models that can be run on GPU, `exclude = ['ransac', 'huber', 'par', 'ada', 'omp', 'llar']`.

In [None]:
# Time a full model comparison and keep the three best models.
with Timer() as elapsed:
    best_models = compare_models(exclude=['ransac'], n_select=3)

# Record the measurement; update "processor" to match the setup() run.
benchmark_payload = {
    "function": "compare models",
    "model": "all",
    "processor": "cpu",
    "fit_time": elapsed.elapsed,
}
benchmark_list.append(benchmark_payload)

## Create Models

Here we can time the fitting of an individual model. Linear regression is used as an example.

In [None]:
# Time fitting a single linear-regression model with 5-fold cross-validation.
with Timer() as elapsed:
    lr = create_model('lr', fold=5)

# Record the measurement; update "processor" to match the setup() run.
benchmark_payload = {
    "function": "create model",
    "model": "lr",
    "processor": "cpu",
    "fit_time": elapsed.elapsed,
}
benchmark_list.append(benchmark_payload)


## Tune Models

Here we can time the tuning of a model we've created.

In [None]:
# Time hyperparameter tuning of the linear-regression model created earlier.
# The timer must be bound to `elapsed`: the original bound it to the misspelt
# name `elasped`, so the payload below silently reused the duration left over
# from the previous timed cell.
with Timer() as elapsed:
    tuned_lr = tune_model(lr)

# Record the measurement; update "processor" to match the setup() run.
benchmark_payload = {
    "function": "tune model",
    "model": "lr",
    "processor": "cpu",
    "fit_time": elapsed.elapsed,
}
benchmark_list.append(benchmark_payload)


## Ensemble a Model

### Blending

In [None]:
# Time blending end to end: fitting the three base models plus the blend.
with Timer() as elapsed:
    # Base learners that feed the blender.
    xgboost = create_model('xgboost', verbose=False)
    lr = create_model('lr', verbose=False)
    knn = create_model('knn', verbose=False)

    # Combine the base learners into a single blended model.
    blender = blend_models(estimator_list=[xgboost, lr, knn])

# Record the measurement; update "processor" to match the setup() run.
benchmark_payload = {
    "function": "ensemble - blending",
    "model": "xgboost, lr, knn",
    "processor": "cpu",
    "fit_time": elapsed.elapsed,
}
benchmark_list.append(benchmark_payload)


### Stacking

In [None]:
# Time stacking of the best models selected by compare_models.
with Timer() as elapsed:
    stacker = stack_models(best_models)

# Record the measurement; update the labels to match the setup() run.
benchmark_payload = {
    "function": "ensemble - stacking",
    "model": "best_models cpu",
    "processor": "cpu",
    "fit_time": elapsed.elapsed,
}
benchmark_list.append(benchmark_payload)

## Plot Error

In [None]:
# Prediction-error plot for the blended model.
plot_model(blender, plot='error')

In [None]:
# Prediction-error plot for the stacked model.
plot_model(stacker, plot='error')

## Predict on Hold-Out Sample

In [None]:
# Time prediction with the stacked model (no explicit data argument is passed).
with Timer() as elapsed:
    predict_model(stacker)

# Record the measurement; update "processor" to match the setup() run.
benchmark_payload = {
    "function": "predict model",
    "model": "stacker",
    "processor": "cpu",
    "fit_time": elapsed.elapsed,
}
benchmark_list.append(benchmark_payload)


## Finalize Model

In [None]:
# Time finalizing the stacked model.
with Timer() as elapsed:
    final_stacker = finalize_model(stacker)

# Record the measurement; update "processor" to match the setup() run.
benchmark_payload = {
    "function": "finalize model",
    "model": "stacker",
    "processor": "cpu",
    "fit_time": elapsed.elapsed,
}
benchmark_list.append(benchmark_payload)


In [None]:
# Time prediction with the finalized stacked model.
with Timer() as elapsed:
    predict_model(final_stacker)

# Record the measurement; update "processor" to match the setup() run.
benchmark_payload = {
    "function": "predict model",
    "model": "final stacker",
    "processor": "cpu",
    "fit_time": elapsed.elapsed,
}
benchmark_list.append(benchmark_payload)


## Predict on Unseen Data

In [None]:
# Time prediction on the rows that were withheld from modeling.
with Timer() as elapsed:
    # The withheld frame is named `unseen_df` where it is created; the
    # original referenced `data_unseen`, which is never defined in this
    # notebook and raised a NameError.
    unseen_predictions = predict_model(final_stacker, data=unseen_df)

# Record the measurement; update "processor" to match the setup() run.
benchmark_payload = {
    "function": "predict on unseen",
    "model": "final stacker",
    "processor": "cpu",
    "fit_time": elapsed.elapsed,
}
benchmark_list.append(benchmark_payload)

# Preview the predictions outside the timed region so the measurement covers
# only predict_model. As the cell's last expression, the preview is shown when
# this runs as a notebook cell.
unseen_predictions.head()


## Write Times to File

In [None]:
# Append this run's timings as one JSON array per line, so results from
# successive runs accumulate in the same file.
outpath = "pycaret_benchmarksCPU.json"

serialized = json.dumps(benchmark_list)
with open(outpath, "a") as fh:
    fh.write(f"{serialized}\n")