In [1]:
# import libraries
import boto3, os, sagemaker
from sagemaker import get_execution_role
import pandas as pd
from dotenv import load_dotenv
from load_data import load_data



# AWS region of the default boto3 session (taken from the local AWS config/env).
my_region = boto3.Session().region_name

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/lanre.bakare/Library/Application Support/sagemaker/config.yaml


In [2]:
# Load run configuration from the local .env file and resolve the SageMaker role.
load_dotenv(".env")
role = get_execution_role()


def _require_env(name):
    """Return the value of the environment variable `name`.

    Raises:
        ValueError: if the variable is unset or blank. Failing here gives a
            clear message instead of an AttributeError on None or a bogus
            "s3://None/None" location further down the notebook.
    """
    value = os.getenv(name)
    if value is None or not value.strip():
        raise ValueError(
            "Required environment variable {!r} is missing or empty (check .env)".format(name)
        )
    return value


# Model settings: normalised model name, comma-separated feature list, target column.
model_name = _require_env("model_name").replace(" ", "").lower()
# Blank entries (e.g. from a trailing comma) are dropped rather than kept as '' features.
model_features = [col.strip() for col in _require_env("features").split(",") if col.strip()]
target = _require_env("target")
print(model_name, model_features, target)

# S3 location of the training data.
bucket = _require_env("bucket")
data_key = _require_env("key")
data_location = 's3://{}/{}'.format(bucket, data_key)
print(data_location)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/lanre.bakare/Library/Application Support/sagemaker/config.yaml
randomforestregressor ['Bakerloo', 'dayOfWeek', 'hour', 'minute'] Bakerloo10
s3://streaming-data-platform-ml-data/bakerloo.csv


In [3]:
# Fetch the dataset from the S3 location built above, using the project helper
# imported from the local `load_data` module.
data = load_data(data_location)
# Work on a copy so the freshly loaded `data` object is left untouched.
df = data.copy()
# Last expression: display the (rows, columns) shape as a quick sanity check.
df.shape

(7127, 5)

In [4]:
print(df.shape)
# Shuffle with a fixed seed so the 80/20 split is reproducible across
# Restart & Run All (123 matches the session_id passed to PyCaret's setup).
df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
train_size = int(0.8 * len(df_shuffled))
# First 80% of the shuffled rows for training, the remainder held back for the
# final evaluation of the finalized model.
train_data = df_shuffled.iloc[:train_size]
test_data = df_shuffled.iloc[train_size:]
# `target` is already set in the configuration cell, so it is not re-read here.
# The split must partition the data: no rows lost, none duplicated.
assert len(train_data) + len(test_data) == len(df)
print(train_data.shape, test_data.shape)

(7127, 5)
(5701, 5) (1426, 5)


In [5]:
# NOTE(review): the star import supplies setup, compare_models, create_model,
# tune_model, evaluate_model, predict_model and finalize_model to the cells
# below; explicit imports in the top import cell would be clearer.
from pycaret.regression import *

# Initialise the PyCaret regression experiment on the training split.
# session_id fixes the experiment's random seed.
s = setup(
    data=train_data,
    target=target,
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Bakerloo10
2,Target type,Regression
3,Original data shape,"(5701, 5)"
4,Transformed data shape,"(5701, 5)"
5,Transformed train set shape,"(3990, 5)"
6,Transformed test set shape,"(1711, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


In [6]:
# Train and score PyCaret's candidate regressors and keep the top-ranked one.
# In the recorded run the comparison table lists CatBoost first, and later
# cells report `best` as "CatBoost Regressor".
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.761,1.5231,1.2232,0.9733,0.0775,0.0553,0.083
lightgbm,Light Gradient Boosting Machine,0.7719,1.6066,1.258,0.9717,0.0822,0.056,0.352
xgboost,Extreme Gradient Boosting,0.7727,1.7076,1.2973,0.9699,0.0775,0.0531,0.01
rf,Random Forest Regressor,0.7176,1.7378,1.3104,0.9692,0.0752,0.0433,0.048
et,Extra Trees Regressor,0.6832,1.8696,1.3598,0.9669,0.0772,0.0407,0.039
gbr,Gradient Boosting Regressor,0.9099,2.0048,1.407,0.9647,0.0957,0.0693,0.023
knn,K Neighbors Regressor,0.9422,2.2504,1.4903,0.9602,0.0972,0.0654,0.005
ridge,Ridge Regression,1.1656,2.9173,1.7018,0.9487,0.136,0.0996,0.004
lr,Linear Regression,1.1656,2.9173,1.7018,0.9487,0.136,0.0996,0.251
br,Bayesian Ridge,1.1655,2.9173,1.7018,0.9487,0.136,0.0995,0.004


In [7]:
# A specific model can also be created directly instead of taking the winner
# of compare_models(). create_model() trains it with default hyperparameters
# and reports per-fold metrics (10 folds in the recorded output); use
# tune_model() afterwards to tune the hyperparameters.
xgboost = create_model('xgboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8778,2.5094,1.5841,0.9513,0.0897,0.0571
1,0.7497,1.4027,1.1844,0.9734,0.0642,0.0472
2,0.8379,1.7212,1.3119,0.9732,0.0769,0.052
3,0.7279,1.6574,1.2874,0.9723,0.0757,0.0512
4,0.7921,1.8883,1.3742,0.9676,0.0819,0.0582
5,0.7858,1.8806,1.3713,0.9678,0.082,0.059
6,0.7552,1.4741,1.2141,0.9724,0.0801,0.0531
7,0.7097,2.0992,1.4489,0.9679,0.0999,0.0626
8,0.7106,0.9512,0.9753,0.9832,0.053,0.0447
9,0.7799,1.4924,1.2216,0.9701,0.0719,0.0461


In [8]:
# Hyperparameter search for the XGBoost model created above (the recorded run
# fitted 10 folds for each of 10 candidates).
# NOTE(review): `tuned_model` is not used by any later cell visible here; the
# remaining cells evaluate and finalize `best` instead. Confirm this is intended.
tuned_model = tune_model(xgboost)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8472,2.1218,1.4566,0.9588,0.081,0.0552
1,0.7045,0.9529,0.9762,0.982,0.0591,0.0467
2,0.7981,1.4684,1.2118,0.9772,0.0745,0.0485
3,0.713,1.577,1.2558,0.9736,0.077,0.0506
4,0.8049,1.8156,1.3474,0.9689,0.0767,0.0535
5,0.7649,1.6723,1.2932,0.9714,0.0805,0.0505
6,0.7573,1.5471,1.2438,0.971,0.0848,0.0526
7,0.719,2.1572,1.4687,0.967,0.1044,0.0622
8,0.7417,1.0135,1.0067,0.9821,0.0562,0.045
9,0.7743,1.2851,1.1336,0.9743,0.0709,0.0462


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [9]:
# Open PyCaret's interactive plot selector (an ipywidgets control) for `best`.
# The widget output is not reproducible from a static copy of the notebook.
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [10]:
# Score `best` without passing explicit data; presumably this uses the hold-out
# split that setup() carved out of train_data (confirm against the PyCaret docs).
# Displays a metrics row and the rows with an added `prediction_label` column.
predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.722,1.0145,1.0072,0.9815,0.0691,0.0524


Unnamed: 0,Bakerloo,dayOfWeek,hour,minute,Bakerloo10,prediction_label
2676,28,2,17,18,29.0,28.340562
724,21,4,10,2,22.0,21.400840
4595,22,2,20,33,22.0,21.953724
1112,20,5,20,56,22.0,20.763917
4064,30,1,7,19,26.0,27.549349
...,...,...,...,...,...,...
3854,19,6,11,43,19.0,19.483324
4481,21,2,14,19,20.0,21.297148
467,24,0,19,58,23.0,22.984974
3551,27,0,6,10,22.0,23.546255


In [11]:
# Finalize the selected model; presumably this refits the pipeline on all the
# data given to setup(), including its internal hold-out (confirm against the
# PyCaret docs).
final_best_model = finalize_model(best)
# Print the resulting pipeline (imputers + estimator) for inspection.
print(final_best_model)

Pipeline(memory=Memory(location=None),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['Bakerloo', 'dayOfWeek', 'hour',
                                             'minute'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent'))),
                ('actual_estimator',
                 <catboost.core.CatBoostRegressor object at 0x2a2931cf0>)])


In [12]:
# Score the finalized pipeline on the 20% test split that was held back before
# setup(), i.e. rows that were not part of the data passed to PyCaret.
final_prediction = predict_model(final_best_model, data=test_data)
# Show the first rows: the original columns plus `prediction_label`.
final_prediction.head()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.6871,0.9515,0.9754,0.984,0.0626,0.0487


Unnamed: 0,Bakerloo,dayOfWeek,hour,minute,Bakerloo10,prediction_label
5701,24,1,11,27,22.0,23.198145
5702,27,0,7,3,28.0,27.787597
5703,12,5,5,6,9.0,9.187563
5704,18,1,22,51,18.0,18.335765
5705,23,4,9,58,21.0,22.898735


In [13]:
# NOTE(review): this import would be better placed in the notebook's top import cell.
from pycaret.utils.generic import check_metric

# RMSE of the finalized model on the held-back test split.
# The actual values are read via the configured `target` column rather than a
# hard-coded attribute name, so the check keeps working when the target in
# .env changes. `prediction_label` is the column added by predict_model().
check_metric(final_prediction[target], final_prediction["prediction_label"], 'RMSE')

0.9754