In [1]:
!pip install --ignore-installed pycaret[full]
!pip install python-dotenv

zsh:1: no matches found: pycaret[full]
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [None]:
import boto3, os, tarfile
from sagemaker import get_execution_role
from dotenv import load_dotenv
from load_data import load_data

# Env variable setup
### This stage sets up the environment variables: `data_location_s3`, `algorithm_choice`, `target` and `endpoint_name`
The user can either:
1. use the variables passed down from Terraform (Option 1)
2. hard-code the variables inside the notebook (Option 2)

In [None]:
# Variables Setup Stage
# Load settings from the local ".env" file, then fetch the SageMaker execution role.
load_dotenv(".env")
role = get_execution_role()

# Env variables (Choose 1 Method)

# Option 1: running inside a SageMaker Notebook Instance, where Terraform supplies
# the settings as environment variables.
target = os.getenv("target")
algorithm_choice = os.getenv("algorithm_choice")
endpoint_name = os.getenv("endpoint_name")
data_location_s3 = os.getenv("data_location_s3")
# os.getenv returns None for an unset variable, which would render here as "s3://None".
data_location = f"s3://{data_location_s3}"

In [None]:
# Option 2: local notebook run - the settings are hard-coded here instead of being
# read from the environment.
endpoint_name = 'lanre-trial-endpoint'
target = 'Class variable'
# Used as the pycaret submodule name (pycaret.<algorithm_choice>),
# e.g. classification, regression, time_series.
algorithm_choice = 'classification'

## Importing Pycaret

In [5]:
import importlib

# Resolve the pycaret submodule selected by `algorithm_choice` and bind it to the
# name `pycaret`, so the following cells can call pycaret.setup(), pycaret.create_model(), ...
pycaret = importlib.import_module("pycaret.{}".format(algorithm_choice))

# Data Processing Stage

### Stage for data processing: upload data => Shuffle => Split data between training and testing

The user can either:
1. load data from an S3 bucket (Option 1)
2. use a dataset provided by PyCaret (Option 2)

In [8]:
# Data Option 1: Use Data from S3
data = load_data(data_location)
df = data.copy()  # work on a copy so the loaded frame stays untouched

# Shuffle every row into a random order. The seed is fixed so that re-running the
# notebook yields the same train/test split.
df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

# The first 80% of the shuffled rows become the training set; the remaining 20%
# are held back as the test set.
train_size = int(0.8 * len(df_shuffled))
train_data = df_shuffled.iloc[:train_size]
test_data = df_shuffled.iloc[train_size:]

In [6]:
# Data Option 2: Use Data from Pycaret
# Loads the bundled 'diabetes' sample dataset as the training data; its label column
# is "Class variable", matching the hard-coded `target` of env Option 2.
# NOTE(review): unlike Data Option 1, this path does not define `test_data`.
from pycaret.datasets import get_data
train_data = get_data('diabetes')

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# Initialise the experiment on the training data for the chosen target column.
# session_id is pinned to a constant - presumably the random seed, so repeated runs match.
pycaret.setup(
    session_id=123,
    target=target,
    data=train_data,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Class variable
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(537, 9)"
6,Transformed test set shape,"(231, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x17b369850>

# Creating Model

### This stage allows the user to create the model

The user has three options for creating the model; <b>please choose one of the three options</b>
1. Let PyCaret choose the best model (Option 1)
2. Choose an algorithm and use its default hyperparameters (Option 2)
3. Choose an algorithm and customise its hyperparameters (Option 3)

In [10]:
# Option 1: let pycaret choose the best model for you
# compare_models() evaluates the available estimators and displays a comparison table.
# NOTE(review): the optimize stage further down tunes `customParamModel`, so `bestModel`
# is only used if that cell is changed to reference it.
bestModel = pycaret.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7689,0.8047,0.5602,0.7208,0.6279,0.4641,0.4736,0.175
ridge,Ridge Classifier,0.767,0.0,0.5497,0.7235,0.6221,0.4581,0.469,0.004
lda,Linear Discriminant Analysis,0.767,0.8055,0.555,0.7202,0.6243,0.4594,0.4695,0.005
rf,Random Forest Classifier,0.7485,0.7911,0.5284,0.6811,0.5924,0.415,0.4238,0.026
nb,Naive Bayes,0.7427,0.7955,0.5702,0.6543,0.6043,0.4156,0.4215,0.004
gbc,Gradient Boosting Classifier,0.7373,0.7917,0.555,0.6445,0.5931,0.4013,0.4059,0.015
ada,Ada Boost Classifier,0.7372,0.7799,0.5275,0.6585,0.5796,0.3926,0.4017,0.012
et,Extra Trees Classifier,0.7299,0.7788,0.4965,0.6516,0.5596,0.3706,0.3802,0.021
qda,Quadratic Discriminant Analysis,0.7282,0.7894,0.5281,0.6558,0.5736,0.3785,0.391,0.004
lightgbm,Light Gradient Boosting Machine,0.7133,0.7645,0.5398,0.6036,0.565,0.3534,0.358,0.214


In [13]:
# Option 2: Create model from algorithm you want (for example Decision Tree)
# 'dt' is the estimator ID for the decision tree; no hyperparameters are passed,
# so the estimator is built with its defaults.
customModel = pycaret.create_model('dt')
# Print the fitted estimator to show the hyperparameters that were used.
print(customModel)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7222,0.6774,0.5263,0.625,0.5714,0.3682,0.3711
1,0.7037,0.6872,0.6316,0.5714,0.6,0.3656,0.3668
2,0.7407,0.7038,0.5789,0.6471,0.6111,0.4176,0.419
3,0.5926,0.5053,0.2105,0.3636,0.2667,0.0116,0.0125
4,0.7778,0.7684,0.7368,0.6667,0.7,0.5242,0.5259
5,0.6296,0.594,0.4737,0.4737,0.4737,0.188,0.188
6,0.6296,0.5699,0.3684,0.4667,0.4118,0.1469,0.1491
7,0.8302,0.777,0.6111,0.8462,0.7097,0.594,0.6098
8,0.6604,0.6079,0.4444,0.5,0.4706,0.2219,0.2227
9,0.6415,0.6206,0.5556,0.4762,0.5128,0.2319,0.2336


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       random_state=123, splitter='best')


In [15]:
# Option 3: Create model from Algorithm you want + configure hyper parameter yourself
# Extra keyword arguments (here max_depth and min_samples_split) are applied to the
# estimator, as the printed DecisionTreeClassifier repr confirms.
customParamModel = pycaret.create_model('dt', max_depth = 5, min_samples_split = 4) # Any number of hyperparameters can be passed as keyword arguments here
# Print the fitted estimator to confirm the custom hyperparameters were applied.
print(customParamModel)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8333,0.8008,0.5263,1.0,0.6897,0.5902,0.647
1,0.6667,0.7023,0.5263,0.5263,0.5263,0.2692,0.2692
2,0.7222,0.7729,0.4211,0.6667,0.5161,0.335,0.3524
3,0.6111,0.5887,0.3158,0.4286,0.3636,0.0928,0.095
4,0.8148,0.8782,0.6316,0.8,0.7059,0.5735,0.582
5,0.6852,0.6707,0.5263,0.5556,0.5405,0.3014,0.3016
6,0.6481,0.685,0.2632,0.5,0.3448,0.1349,0.1479
7,0.8302,0.8587,0.5,1.0,0.6667,0.5691,0.6307
8,0.6415,0.6675,0.4444,0.4706,0.4571,0.1899,0.19
9,0.6604,0.669,0.3889,0.5,0.4375,0.1997,0.2029


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=4, min_weight_fraction_leaf=0.0,
                       random_state=123, splitter='best')


# Optimize Stage
### The user can choose to optimize their model in this stage

This can include:
1. Automatic hyperparameter tuning
2. Tuning based on the metric the user wants to focus on (Accuracy? Prec.? MAE? MRE?) (TBC)
3. Choosing the tuner's search algorithm (TBC)
4. Blending models

In [17]:
# (Optional) You can also let pycaret tune model for you
# This tunes the Option 3 model (`customParamModel`); pass `bestModel` or `customModel`
# instead if one of the other model-creation options was chosen.
tuned_model = pycaret.tune_model(customParamModel)
# Print the tuned estimator to show the hyperparameters selected by the search.
print(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8519,0.8135,0.6842,0.8667,0.7647,0.6588,0.6686
1,0.7593,0.694,0.4737,0.75,0.5806,0.4236,0.4456
2,0.7593,0.7782,0.8421,0.6154,0.7111,0.5132,0.5318
3,0.7037,0.6511,0.4737,0.6,0.5294,0.3175,0.3223
4,0.8333,0.7632,0.5263,1.0,0.6897,0.5902,0.647
5,0.6296,0.582,0.4211,0.4706,0.4444,0.168,0.1685
6,0.7222,0.6654,0.4737,0.6429,0.5455,0.352,0.3605
7,0.7358,0.6246,0.2778,0.8333,0.4167,0.2973,0.3725
8,0.6604,0.5675,0.2778,0.5,0.3571,0.1512,0.1633
9,0.717,0.6643,0.5,0.6,0.5455,0.3424,0.3454


Fitting 10 folds for each of 10 candidates, totalling 100 fits
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=1, max_features=1.0, max_leaf_nodes=None,
                       min_impurity_decrease=0.01, min_samples_leaf=6,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       random_state=123, splitter='best')


In [18]:
# Evaluate model: View Hyperparameters, Confusion Matrix, Class Report, etc.
# Renders an interactive widget with selectable plot types, so the output only
# appears in a live notebook session.
pycaret.evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# Saving Model and Deployment Stage

In [19]:
# Save model
# Finalize the tuned model; the printed result is the full preprocessing + estimator Pipeline.
final_model = pycaret.finalize_model(tuned_model)
print(final_model)
# Persist the pipeline under the base name 'final_best_model'; the packaging cell
# that follows expects to find it as 'final_best_model.pkl'.
pycaret.save_model(final_model, 'final_best_model')

Pipeline(memory=Memory(location=None),
         steps=[('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['Number of times pregnant',
                                             'Plasma glucose concentration a 2 '
                                             'hours in an oral glucose '
                                             'tolerance test',
                                             'Diastolic blood pressure (mm Hg)',
                                             'Triceps skin fold thickness (mm)',
                                             '2-Hour serum insulin (mu U/ml)',
                                             'Body mass index (weight in '
                                             'kg/(height in m)^2)',
                                             'Diabetes pedigre...
                                    transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
                ('actual_estima

In [None]:
# Package the saved model file as a gzip-compressed tarball, then upload the
# archive to the S3 bucket from which the deployment step reads it.
with tarfile.open(name='final_best_model.tar.gz', mode='w:gz') as tar:
    tar.add('final_best_model.pkl')

s3 = boto3.client('s3')
s3.upload_file(
    Filename='final_best_model.tar.gz',
    Bucket='mlops-feature-engineering',
    Key='final_best_model.tar.gz',
)

In [None]:
# Deploy Model
from sagemaker.model import Model

# Location of the uploaded model archive. It must be an S3/HTTPS URI: the value has
# to satisfy the regular expression pattern ^(https|s3)://([^/]+)/?(.*)$
model_data = 's3://mlops-feature-engineering/final_best_model.tar.gz'

model = Model(
    role=role,
    model_data=model_data,  # serialized model archive
    image_uri='135544376709.dkr.ecr.eu-west-1.amazonaws.com/mlops-pycaret-repo:latest',  # the ECR image you pushed
)

# Create the endpoint, backed by a single ml.m4.xlarge instance.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

# Clean up stage
## Remove Endpoint and Endpoint Config

In [None]:
# Resolve the AWS region from the default boto3 session (environment / AWS config).
# `my_region` was previously referenced without ever being defined in this notebook,
# which made this cell fail with a NameError.
my_region = boto3.session.Session().region_name

# Create a low-level SageMaker service client.
sagemaker_client = boto3.client('sagemaker', region_name=my_region)

# Delete endpoint
sagemaker_client.delete_endpoint(EndpointName=endpoint_name)

# Delete endpoint configuration
# NOTE(review): assumes the endpoint configuration shares the endpoint's name - confirm
# against how the deploy step names it.
sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name)