In [None]:
!pip install --ignore-installed pycaret[full]

In [1]:
# import libraries
import boto3, os
from sagemaker import get_execution_role
import pandas as pd
from dotenv import load_dotenv
from load_data import load_data



my_region = boto3.session.Session().region_name

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/lanre.bakare/Library/Application Support/sagemaker/config.yaml


In [2]:
load_dotenv(".env")
role = get_execution_role()

model_name = os.getenv("model_name").replace(" ", "").lower()
model_features = [x.strip() for x in os.environ["features"].split(",")]
target = os.getenv("target")
print(model_name, model_features, target)

bucket= os.getenv("bucket")
data_key = os.getenv("key")
data_location = 's3://{}/{}'.format(bucket, data_key)
print(data_location)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/lanre.bakare/Library/Application Support/sagemaker/config.yaml
randomforestregressor ['Bakerloo', 'dayOfWeek', 'hour', 'minute'] Bakerloo10
s3://streaming-data-platform-ml-data/bakerloo.csv


In [3]:
data = load_data(data_location)
df = data.copy()
df.shape

(7127, 5)

In [4]:
print(df.shape)
# Randomly shuffle the DataFrame
df_shuffled = df.sample(frac=1).reset_index(drop=True)
train_size = int(0.8 * len(df))
train_data = df_shuffled[:train_size]
test_data = df_shuffled[train_size:]
target = os.getenv("target")
print(train_data.shape, test_data.shape)

(7127, 5)
(5701, 5) (1426, 5)


In [5]:
from pycaret.regression import *
s = setup(data = train_data, target = target, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Bakerloo10
2,Target type,Regression
3,Original data shape,"(5701, 5)"
4,Transformed data shape,"(5701, 5)"
5,Transformed train set shape,"(3990, 5)"
6,Transformed test set shape,"(1711, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


In [6]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.7298,1.3365,1.1472,0.976,0.0726,0.0515,0.083
lightgbm,Light Gradient Boosting Machine,0.7569,1.4215,1.1818,0.9744,0.0786,0.0549,0.376
xgboost,Extreme Gradient Boosting,0.7425,1.4669,1.2037,0.9737,0.0739,0.0503,0.011
rf,Random Forest Regressor,0.6771,1.4767,1.2061,0.9735,0.0683,0.0404,0.05
et,Extra Trees Regressor,0.6476,1.5881,1.2482,0.9715,0.0682,0.0374,0.038
gbr,Gradient Boosting Regressor,0.9047,1.8765,1.3628,0.9661,0.0931,0.0685,0.022
knn,K Neighbors Regressor,0.934,2.0091,1.4089,0.9638,0.0853,0.0605,0.005
dt,Decision Tree Regressor,0.7622,2.4,1.5398,0.9568,0.0877,0.0431,0.004
lar,Least Angle Regression,1.1612,2.854,1.6835,0.9485,0.134,0.0978,0.004
lr,Linear Regression,1.1612,2.854,1.6835,0.9485,0.134,0.0978,0.25


In [7]:
# Users are allowed to create their own model if they want to.
# This uses the default hyperparameters to train the model which you can then choose to tune as you want.
# In order to tune hyperparameters, the tune_model function is used
xgboost = create_model('xgboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.7688,1.2348,1.1112,0.9772,0.0642,0.0503
1,0.7469,1.7322,1.3161,0.9687,0.0779,0.0502
2,0.7588,1.3599,1.1662,0.9763,0.0846,0.053
3,0.7299,1.764,1.3281,0.9706,0.0802,0.0535
4,0.7436,1.3235,1.1504,0.977,0.0726,0.0513
5,0.7181,1.1049,1.0511,0.9773,0.0589,0.0426
6,0.7315,1.3288,1.1527,0.9797,0.0864,0.0572
7,0.6792,1.1702,1.0817,0.9796,0.066,0.0495
8,0.7365,1.3602,1.1663,0.9735,0.067,0.0463
9,0.8116,2.2906,1.5135,0.9566,0.0813,0.0493


In [8]:
tuned_model = tune_model(xgboost)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.73,1.0913,1.0447,0.9799,0.0682,0.0496
1,0.6925,1.4998,1.2247,0.9729,0.0639,0.0444
2,0.7208,1.0845,1.0414,0.9811,0.0842,0.0497
3,0.746,1.7133,1.3089,0.9714,0.0809,0.0548
4,0.7262,1.2174,1.1034,0.9789,0.0789,0.0488
5,0.7195,0.9949,0.9975,0.9796,0.0563,0.0432
6,0.7027,1.2512,1.1186,0.9809,0.0743,0.051
7,0.7059,1.0849,1.0416,0.9811,0.0666,0.0482
8,0.7618,1.4619,1.2091,0.9715,0.0687,0.0472
9,0.8173,2.3243,1.5246,0.956,0.086,0.0528


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [9]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [10]:
predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.7352,1.3652,1.1684,0.9767,0.0727,0.0498


Unnamed: 0,Bakerloo,dayOfWeek,hour,minute,Bakerloo10,prediction_label
2676,11,3,23,23,14.0,14.079502
724,21,6,9,58,21.0,20.438036
4595,22,0,5,50,17.0,17.121785
1112,26,0,18,45,25.0,24.671917
4064,21,3,21,21,21.0,19.621684
...,...,...,...,...,...,...
3854,21,5,19,56,20.0,20.496424
4481,18,5,11,35,17.0,18.502632
467,2,6,5,38,2.0,2.128208
3551,26,3,15,21,26.0,25.657446


In [11]:
final_best_model = finalize_model(best)
print(final_best_model)

Pipeline(memory=Memory(location=None),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['Bakerloo', 'dayOfWeek', 'hour',
                                             'minute'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent'))),
                ('actual_estimator',
                 <catboost.core.CatBoostRegressor object at 0x2a4d77eb0>)])


In [12]:
final_prediction = predict_model(final_best_model, data=test_data)
final_prediction.head()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.6701,0.8838,0.9401,0.9847,0.0643,0.0483


Unnamed: 0,Bakerloo,dayOfWeek,hour,minute,Bakerloo10,prediction_label
5701,25,1,9,28,23.0,25.247768
5702,22,6,17,1,21.0,21.697097
5703,27,0,7,29,27.0,27.847999
5704,8,1,23,27,13.0,12.945861
5705,24,3,5,53,21.0,20.488496


In [13]:
from pycaret.utils.generic import check_metric
check_metric(final_prediction.Bakerloo10, final_prediction.prediction_label, 'RMSE')

0.9401