In [None]:
%%sh
pip install pycaret
pip install python-dotenv
pip install ydata-profiling
pip install shap
pip -q install --upgrade stepfunctions

In [34]:
import os
from sagemaker import get_execution_role
from dotenv import load_dotenv
from load_data import load_data
from split_data import split_data
import importlib
from save_model_to_s3 import save_model_to_s3
from deploy_model_endpoint import deploy_model
from finalize_and_save_model import finalize_and_save_model
from delete_sagemaker_endpoint import delete_sagemaker_endpoint
from ydata_profiling import ProfileReport
import boto3

In [35]:
import stepfunctions
import uuid
import logging

from stepfunctions import steps
from stepfunctions.steps import TrainingStep, ModelStep
from stepfunctions.inputs import ExecutionInput
from stepfunctions.workflow import Workflow
from sagemaker.estimator import Estimator

# Enable the stepfunctions SDK's stream logger at INFO level.
stepfunctions.set_stream_logger(level=logging.INFO)

In [44]:
# Variables Setup Stage
# Load settings from the local .env file, then resolve the SageMaker execution role.
load_dotenv(".env")
role = get_execution_role()

# Env variables (os.getenv returns None for any name that is not set)
data_location_s3 = os.getenv("data_location_s3")
algorithm_choice = os.getenv("algorithm_choice")
target = os.getenv("target")
endpoint_name = os.getenv("endpoint_name")
model_name = os.getenv("model_name")
data_location = "s3://{}".format(data_location_s3)
instance_type = os.getenv("instance_type")

# int(None) fails with an opaque TypeError, so report the missing setting by name.
model_instance_count_raw = os.getenv("model_instance_count")
if model_instance_count_raw is None:
    raise ValueError(
        "Environment variable 'model_instance_count' is not set; define it in .env"
    )
model_instance_count = int(model_instance_count_raw)

image_uri = os.getenv("ecr_repo_uri")
tuning_metric = os.getenv("tuning_metric")

# Echo the resolved settings so a missing value (printed as None) is easy to spot.
print(
    data_location_s3,
    algorithm_choice,
    target,
    endpoint_name,
    model_name,
    data_location,
    instance_type,
    image_uri,
    tuning_metric,
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/lanre.bakare/Library/Application Support/sagemaker/config.yaml
streaming-data-platform-ml-data/ethan_data.csv classification y classification-proba-endpoint banking-classification s3://streaming-data-platform-ml-data/ethan_data.csv ml.m4.xlarge None AUC


In [45]:
# IAM role ARN kept as a literal rather than read from .env.
# NOTE(review): presumably consumed by a later Lambda/workflow step — confirm where
# it is used, and consider moving it into .env with the other settings.
lambda_role = "arn:aws:iam::135544376709:role/banking-classification-sagemaker-role"

In [46]:
# Load data from S3
# `data_location` is the s3:// URI built from the `data_location_s3` setting.
df = load_data(data_location)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,age,job,education,default,balance,housing,loan,y
0,32,7,2,1,-238,1,0,0
1,34,4,2,0,-478,1,1,0
2,32,3,2,0,266,1,0,0
3,36,7,2,1,13,0,1,0
4,23,11,2,0,486,0,0,0


In [52]:
# Split and shuffle data
# NOTE(review): no seed is passed here; whether the split is reproducible across
# runs depends on split_data's implementation — confirm.
train_data, test_data = split_data(df, shuffle=True)
# Print both shapes as a quick check on the split sizes.
print(train_data.shape, test_data.shape)

(44654, 8) (11164, 8)


In [53]:
# Path prefix and file names used to build the train/test CSV paths.
prefix = "step_function"
FILE_TRAIN = "train.csv"
FILE_TEST = "test.csv"

In [54]:
# Build the S3 object keys for the train and test CSVs.
# S3 keys always use "/" as the separator, so join explicitly rather than with
# os.path.join, which uses the local OS separator (a backslash on Windows).
train_s3_file = "/".join((prefix, FILE_TRAIN))
test_s3_file = "/".join((prefix, FILE_TEST))
print(train_s3_file, test_s3_file)

step_function/train.csv step_function/test.csv


In [55]:
from io import StringIO

# Upload the train and test splits to Amazon S3 as CSV objects.

bucket = "streaming-data-platform-ml-data"
# One resource handle is enough for both uploads.
s3_resource = boto3.resource("s3")

for split_frame, split_key in (
    (train_data, train_s3_file),
    (test_data, test_s3_file),
):
    # Serialise into a fresh in-memory buffer (without the index column) and
    # send the resulting text as the object body.
    csv_buffer = StringIO()
    split_frame.to_csv(csv_buffer, index=False)
    put_response = s3_resource.Object(bucket, split_key).put(
        Body=csv_buffer.getvalue()
    )

# Show the response of the last (test) upload as the cell output.
put_response

{'ResponseMetadata': {'RequestId': 'VFQ7SPE0YD19TXCY',
  'HostId': '1dDe1vnrpJuG1WWTIasKpLCgdEhpKeDUG4JrhGEN+Zs5/sX7UwjsRSIKZZP6vKg0zuyD84TKhNg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '1dDe1vnrpJuG1WWTIasKpLCgdEhpKeDUG4JrhGEN+Zs5/sX7UwjsRSIKZZP6vKg0zuyD84TKhNg=',
   'x-amz-request-id': 'VFQ7SPE0YD19TXCY',
   'date': 'Tue, 09 Jan 2024 14:44:35 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"cb73ebe627aa4e76d9f8c72d31e6053c"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 1},
 'ETag': '"cb73ebe627aa4e76d9f8c72d31e6053c"',
 'ServerSideEncryption': 'AES256'}

In [17]:
# Every SageMaker job, model and endpoint needs a unique name; a repeated name
# makes the execution fail. The names are therefore declared as placeholders
# and supplied dynamically for each execution.
execution_input_schema = {
    "JobName": str,
    "ModelName": str,
    "EndpointName": str,
}
execution_input = ExecutionInput(schema=execution_input_schema)

In [None]:
# SageMaker Estimator configured with the classification image and train.py as
# its entry point.
# NOTE(review): the image URI and instance type are literals here even though
# `image_uri` and `instance_type` are read from .env earlier — confirm which
# source should win.
pycaret_estimator = Estimator(
    image_uri="135544376709.dkr.ecr.eu-west-1.amazonaws.com/mlops-classification-repo:latest",
    # Must be the role resolved by get_execution_role(), not the literal string "role".
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    entry_point="train.py",
)

# Forward the algorithm choice and target settings as training hyperparameters.
pycaret_estimator.set_hyperparameters(algorithm_choice=algorithm_choice, target=target)

In [20]:
# Workflow step that runs training with `pycaret_estimator`. The job name is
# taken from the execution input so each execution can supply its own.
# NOTE(review): the "train" channel points at `data_location` (the source file
# loaded earlier), not at the train split uploaded under `train_s3_file` —
# confirm this is intended.
training_inputs = {"train": data_location}
training_step = TrainingStep(
    state_id="Train Step",
    estimator=pycaret_estimator,
    job_name=execution_input["JobName"],
    data=training_inputs,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,y
2,Target type,Binary
3,Original data shape,"(44654, 8)"
4,Transformed data shape,"(44654, 8)"
5,Transformed train set shape,"(31257, 8)"
6,Transformed test set shape,"(13397, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


AttributeError: 'ClassificationExperiment' object has no attribute 'prepare_workflow_for_training'