In [1]:
!pip install pycaret
!pip install python-dotenv

Collecting pycaret
  Obtaining dependency information for pycaret from https://files.pythonhosted.org/packages/d5/54/d575af389203fc27d6c6cf7d60c4e67fcabfda4bc8e84271c8a396bd4a03/pycaret-3.1.0-py3-none-any.whl.metadata
  Downloading pycaret-3.1.0-py3-none-any.whl.metadata (16 kB)
Collecting pandas<2.0.0,>=1.3.0 (from pycaret)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scipy~=1.10.1 (from pycaret)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scikit-learn<1.3.0,>=1.0 (from pycaret)
  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [

<h3>1. Introduction </h3>
This template notebook will give users (data scientists and data engineers) the opportunities to train and deploy regression machine learning models with ease, without having to write all of the code from scratch. <br> 
There are a few requirements for the user anyway, such as;
<li>the data location</li> 
<li>the target (dependent) variable in your dataset and</li>
<li>what kind of machine learning algorithm you will be performing. </li>

All of these values will be entered in terraform and will be automatically applied in the notebook. 

<h3>2. Imports</h3>

The libraries that are required for this model notebook are imported below 

In [2]:
import boto3, os, tarfile
from sagemaker import get_execution_role
from dotenv import load_dotenv
from load_data import load_data
from split_data import split_data
import importlib

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/kong.nopwattanapong/Library/Application Support/sagemaker/config.yaml


<h3>3. Loading Data</h3>
Here the user is required to specify the location of the data that they will like to use for prediction. An helper function is used to load the data from S3. 

<em>Note: Your data needs to be an s3 bucket.</em>

In [3]:
# Variables Setup Stage
load_dotenv(".env")
role = get_execution_role()

# Env variables
data_location_s3 = os.getenv("data_location_s3")
algorithm_choice = os.getenv("algorithm_choice")
target = os.getenv("target")
endpoint_name = os.getenv("endpoint_name")
model_name = os.getenv("model_name")
data_location = 's3://{}'.format(data_location_s3)
pycaret_ecr_name = os.getenv("pycaret_ecr_name")
instance_type = os.getenv("instance_type")

print(data_location_s3, algorithm_choice, target, endpoint_name, model_name, data_location, pycaret_ecr_name, instance_type)

streaming-data-platform-ml-data/bakerloo.csv regression Bakerloo10 lanre-trial-endpoint lanre-test ethan_class.csv


<h3>4. Read and display a sample of data</h3>

In [4]:
# Data Option 1: Use Data from S3
df = load_data(data_location)
df.head() 

Unnamed: 0,age,job,education,default,balance,housing,loan,y
0,58,2,3,0,2143,1,0,0
1,44,3,2,0,29,1,0,0
2,33,1,2,0,2,1,1,0
3,47,7,4,0,1506,1,0,0
4,33,12,4,0,1,0,0,0


## Importing Pycaret

<h3>5. Data Exploration</h3>

In [5]:
# Splitting data and shuffle
train_data, test_data = split_data(df, shuffle=True)
print(train_data, test_data)

       age  job  education  default  balance  housing  loan  y
0       41    1          3        0      285        0     1  0
1       43    5          2        0      620        0     1  0
2       41    7          4        0       19        0     1  0
3       47    3          2        0     3519        0     1  0
4       26   11          4        0      826        0     0  0
...    ...  ...        ...      ...      ...      ...   ... ..
36163   45    1          3        0     1752        1     1  0
36164   26    4          2        0      483        1     0  0
36165   40    5          2        0        0        1     0  0
36166   30   11          3        0       34        1     0  0
36167   39    3          2        0        0        1     0  0

[36168 rows x 8 columns]        age  job  education  default  balance  housing  loan  y
36168   57    5          2        0     -752        1     0  0
36169   41    7          1        0     1428        1     1  0
36170   86    8          2   

In [3]:
# Import Pycaret Module
pycaret = importlib.import_module(f"pycaret.{algorithm_choice}")

In [5]:
# Initializes the experiment in PyCaret and creates the transformation pipeline based on all the parameters passed in the function. 
pycaret.setup(data=train_data, target=target, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Bakerloo10
2,Target type,Regression
3,Original data shape,"(5701, 5)"
4,Transformed data shape,"(5701, 5)"
5,Transformed train set shape,"(3990, 5)"
6,Transformed test set shape,"(1711, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


<pycaret.regression.oop.RegressionExperiment at 0x298fb3460>

<h3>6. Feature Engineering and Model Training</h3>

Here we are using the pycaret automl tool to train the model. The automl tool tries a number of machine learning algorithms depending on the type of machine learning problem you are trying to solve <br>
(regression, classification or time series). The automl tool then selects the best model based on the accuracy metrics of the tried models. 

In [8]:
# Trains and evaluates the performance of all estimators available in the model library using cross-validation.
bestModel = pycaret.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.7156,1.1151,1.0493,0.98,0.0696,0.0511,0.156
xgboost,Extreme Gradient Boosting,0.7168,1.2249,1.1007,0.978,0.0704,0.0482,0.012
lightgbm,Light Gradient Boosting Machine,0.7418,1.2384,1.1046,0.9779,0.0792,0.0546,0.381
rf,Random Forest Regressor,0.6697,1.3131,1.132,0.9764,0.0682,0.0402,0.06
et,Extra Trees Regressor,0.6358,1.3532,1.1523,0.9757,0.0677,0.0377,0.046
gbr,Gradient Boosting Regressor,0.8952,1.6712,1.2876,0.9702,0.0926,0.0685,0.024
knn,K Neighbors Regressor,0.918,1.8918,1.3678,0.9663,0.0936,0.0647,0.007
dt,Decision Tree Regressor,0.7462,2.1759,1.4596,0.9608,0.089,0.0422,0.005
lar,Least Angle Regression,1.1629,2.7473,1.6545,0.9508,0.1358,0.0973,0.004
lr,Linear Regression,1.1629,2.7473,1.6545,0.9508,0.1358,0.0973,0.285


<h3>7. Model Evaluation: </h3>

Here we evaluate the performance of the best model, getting some visual representation of hyperparameters, features and other important details about the selected model.


In [9]:
# Evaluate model: Display a UI for analyzing Hyperparameters, Confusion Matrix, Class Report, etc.
pycaret.evaluate_model(bestModel)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

<h3>8. Saving Model for future predition</h3>

Here we are using a function that saves the model to s3. 

In [9]:
# Save model
final_model = pycaret.finalize_model(bestModel)
pycaret.save_model(final_model, 'final_best_model')

Pipeline(memory=Memory(location=None),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['Bakerloo', 'dayOfWeek', 'hour',
                                             'minute'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent'))),
                ('actual_estimator',
                 <catboost.core.CatBoostRegressor object at 0x29ddcb310>)])
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Bakerloo', 'dayOfWeek', 'hour',
                                              'minute'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('actual_estimator',
                  <catboost.core.CatBoostRegressor object at 0x29ddcb310>)]),
 'final_best_model.pkl')

In [None]:
# Convert model into tar file and upload to S3

with tarfile.open('final_best_model.tar.gz', 'w:gz') as tar:
    tar.add('final_best_model.pkl')


s3 = boto3.client('s3')
s3.upload_file('final_best_model.tar.gz', model_name, 'final_best_model.tar.gz')

<h3>9. Deploying the model endpoints</h3> 

Here we use a function that creates the model endpoint in sagemaker. 

In [14]:
# Deploy Model
from sagemaker.model import Model
model_data = f's3://{model_name}/final_best_model.tar.gz'
# Move model to s3. Member must satisfy regular expression pattern: ^(https|s3)://([^/]+)/?(.*)$
model = Model(
    image_uri='135544376709.dkr.ecr.eu-west-1.amazonaws.com/mlops-pycaret-repo:latest',  # The ECR image you pushed
    model_data=model_data,  # Location of your serialized model
    role=role
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    endpoint_name=endpoint_name
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
-------!None


# Clean up stage
## Remove Endpoint and Endpoint Config

In [13]:
# Create a low-level SageMaker service client.
my_region = boto3.session.Session().region_name
sagemaker_client = boto3.client('sagemaker', region_name=my_region)

# Delete endpoint
sagemaker_client.delete_endpoint(EndpointName=endpoint_name)

# Delete endpoint configuration
sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name)

{'ResponseMetadata': {'RequestId': '3b9dc2e2-8025-49f1-a2f3-0e5d8526d097',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '3b9dc2e2-8025-49f1-a2f3-0e5d8526d097',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Tue, 07 Nov 2023 09:15:10 GMT'},
  'RetryAttempts': 0}}