In [None]:
%%sh
pip install pycaret
pip install python-dotenv
pip install ydata-profiling
pip install shap
pip -q install --upgrade stepfunctions

In [6]:
import os
from sagemaker import get_execution_role
from dotenv import load_dotenv
from load_data import load_data
from split_data import split_data
import importlib
from save_model_to_s3 import save_model_to_s3
from deploy_model_endpoint import deploy_model
from finalize_and_save_model import finalize_and_save_model
from delete_sagemaker_endpoint import delete_sagemaker_endpoint
from ydata_profiling import ProfileReport

In [7]:
import stepfunctions
import uuid
import logging

from stepfunctions import steps
from stepfunctions.steps import TrainingStep, ModelStep
from stepfunctions.inputs import ExecutionInput
from stepfunctions.workflow import Workflow

stepfunctions.set_stream_logger(level=logging.INFO)

In [10]:
# Variables Setup Stage
load_dotenv(".env")
role = get_execution_role()

# Env variables
data_location_s3 = os.getenv("data_location_s3")
algorithm_choice = os.getenv("algorithm_choice")
target = os.getenv("target")
endpoint_name = os.getenv("endpoint_name")
model_name = os.getenv("model_name")
data_location = "s3://{}".format(data_location_s3)
instance_type = os.getenv("instance_type")
model_instance_count = int(os.getenv("model_instance_count"))
image_uri = os.getenv("ecr_repo_uri")
tuning_metric = os.getenv("tuning_metric")

print(
    data_location_s3,
    algorithm_choice,
    target,
    endpoint_name,
    model_name,
    data_location,
    instance_type,
    image_uri,
    tuning_metric,
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/lanre.bakare/Library/Application Support/sagemaker/config.yaml
streaming-data-platform-ml-data/ethan_data.csv classification y classification-proba-endpoint banking-classification s3://streaming-data-platform-ml-data/ethan_data.csv ml.m4.xlarge None AUC


In [11]:
lambda_role = "arn:aws:iam::135544376709:role/banking-classification-sagemaker-role"

In [12]:
# Load data from S3
df = load_data(data_location)
df.head()

Unnamed: 0,age,job,education,default,balance,housing,loan,y
0,32,7,2,1,-238,1,0,0
1,34,4,2,0,-478,1,1,0
2,32,3,2,0,266,1,0,0
3,36,7,2,1,13,0,1,0
4,23,11,2,0,486,0,0,0


In [13]:
# Split and shuffle data
train_data, test_data = split_data(df, shuffle=True)
print(train_data, test_data)

       age  job  education  default  balance  housing  loan  y
0       37    7          2        0      561        1     0  0
1       44    4          2        0      132        0     0  0
2       27    5          2        0     1347        0     0  1
3       51    7          2        0     3370        1     0  0
4       42    8          1        0      518        0     0  1
...    ...  ...        ...      ...      ...      ...   ... ..
44649   52    5          1        0       14        0     1  0
44650   31    1          3        0        0        1     0  0
44651   40    3          2        0     3077        0     0  1
44652   32    3          3        0     1246        1     0  0
44653   42    7          2        0      413        1     0  1

[44654 rows x 8 columns]        age  job  education  default  balance  housing  loan  y
44654   37   11          3        0        0        0     0  1
44655   41    7          1        0      141        1     0  1
44656   27    5          2   

In [14]:
# Import Pycaret library depending on the algorithm choice
pycaret = importlib.import_module(f"pycaret.{algorithm_choice}")

In [15]:
# Initialize data in PyCaret with all the defined parameters
pycaret.setup(data=train_data, target=target, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,y
2,Target type,Binary
3,Original data shape,"(44654, 8)"
4,Transformed data shape,"(44654, 8)"
5,Transformed train set shape,"(31257, 8)"
6,Transformed test set shape,"(13397, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x29c489cc0>

In [17]:
# SageMaker expects unique names for each job, model and endpoint.
# If these names are not unique the execution will fail. Pass these
# dynamically for each execution using placeholders.
execution_input = ExecutionInput(
    schema={"JobName": str, "ModelName": str, "EndpointName": str}
)

In [16]:
# Train and evaluate the performance of all estimators available in the model library using cross-validation.
bestModel = pycaret.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7999,0.8784,0.8229,0.7857,0.8039,0.6,0.6007,0.306
xgboost,Extreme Gradient Boosting,0.7817,0.8594,0.7998,0.7707,0.785,0.5635,0.5639,0.052
dt,Decision Tree Classifier,0.7815,0.7881,0.8085,0.766,0.7866,0.5631,0.564,0.026
et,Extra Trees Classifier,0.7777,0.8633,0.7977,0.7659,0.7814,0.5554,0.556,0.221
catboost,CatBoost Classifier,0.7723,0.8476,0.7883,0.7626,0.7752,0.5446,0.545,1.116
lightgbm,Light Gradient Boosting Machine,0.7651,0.8406,0.7838,0.7544,0.7687,0.5302,0.5307,0.505
knn,K Neighbors Classifier,0.7602,0.8399,0.8606,0.7157,0.7815,0.5207,0.5317,0.02
gbc,Gradient Boosting Classifier,0.7437,0.8145,0.7698,0.7304,0.7495,0.4874,0.4882,0.189
ada,Ada Boost Classifier,0.7271,0.7979,0.7558,0.7135,0.734,0.4544,0.4553,0.071
ridge,Ridge Classifier,0.72,0.0,0.7727,0.6978,0.7333,0.4403,0.4429,0.008


In [20]:
training_step = steps.TrainingStep(
    "Train Step",
    estimator=pycaret.setup(data=train_data, target=target, session_id=123),
    data={"train": bestModel},
    job_name=execution_input["JobName"],
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,y
2,Target type,Binary
3,Original data shape,"(44654, 8)"
4,Transformed data shape,"(44654, 8)"
5,Transformed train set shape,"(31257, 8)"
6,Transformed test set shape,"(13397, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


AttributeError: 'ClassificationExperiment' object has no attribute 'prepare_workflow_for_training'