In [87]:
%%sh
pip install pycaret
pip install python-dotenv
pip install ydata-profiling
pip install shap
pip -q install --upgrade stepfunctions

Collecting pydantic>=2 (from ydata-profiling)
  Using cached pydantic-2.5.3-py3-none-any.whl.metadata (65 kB)




Using cached pydantic-2.5.3-py3-none-any.whl (381 kB)
Installing collected packages: pydantic
  Attempting uninstall: pydantic
    Found existing installation: pydantic 1.10.13
    Uninstalling pydantic-1.10.13:
      Successfully uninstalled pydantic-1.10.13


[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastapi 0.95.2 requires pydantic!=1.7,!=1.7.1,!=1.7.2,!=1.7.3,!=1.8,!=1.8.1,<2.0.0,>=1.6.2, but you have pydantic 2.5.3 which is incompatible.[0m[31m


Successfully installed pydantic-2.5.3


[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.6.4 requires pydantic>=2, but you have pydantic 1.10.13 which is incompatible.[0m[31m
[0m

In [88]:
import os
from sagemaker import get_execution_role
from dotenv import load_dotenv
from load_data import load_data
from split_data import split_data
import importlib
from save_model_to_s3 import save_model_to_s3
from deploy_model_endpoint import deploy_model
from finalize_and_save_model import finalize_and_save_model
from delete_sagemaker_endpoint import delete_sagemaker_endpoint
# from ydata_profiling import ProfileReport
import boto3

In [89]:
import stepfunctions
import uuid
import logging

from stepfunctions import steps
from stepfunctions.steps import TrainingStep, ModelStep
from stepfunctions.inputs import ExecutionInput
from stepfunctions.workflow import Workflow
from sagemaker.estimator import Estimator
import sagemaker

# Turn on the stepfunctions SDK's stream logger at INFO level so its messages
# show up in the cell outputs.
stepfunctions.set_stream_logger(level=logging.INFO)

In [90]:
# Variables Setup Stage
# Read settings from the local .env file and resolve the execution role that
# is handed to the estimator later on.
load_dotenv(".env")
role = get_execution_role()


def _require_env(name):
    """Return the environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            "Required setting '{}' is missing; define it in .env or the environment.".format(
                name
            )
        )
    return value


# Env variables
# data_location_s3 and model_instance_count are validated up front: left unset
# they would otherwise surface as the bogus URI "s3://None" and as an opaque
# TypeError from int(None) respectively. The remaining settings keep their
# original behaviour (None when unset).
data_location_s3 = _require_env("data_location_s3")
algorithm_choice = os.getenv("algorithm_choice")
target = os.getenv("target")
endpoint_name = os.getenv("endpoint_name")
model_name = os.getenv("model_name")
data_location = "s3://{}".format(data_location_s3)
instance_type = os.getenv("instance_type")
model_instance_count = int(_require_env("model_instance_count"))
image_uri = os.getenv("ecr_repo_uri")
tuning_metric = os.getenv("tuning_metric")

# Echo the resolved configuration so the run is self-describing.
print(
    data_location_s3,
    algorithm_choice,
    target,
    endpoint_name,
    model_name,
    data_location,
    instance_type,
    image_uri,
    tuning_metric,
)

streaming-data-platform-ml-data/ethan_data.csv classification y classification-proba-endpoint banking-classification s3://streaming-data-platform-ml-data/ethan_data.csv ml.m4.xlarge 135544376709.dkr.ecr.eu-west-1.amazonaws.com/mlops-classification-repo:latest AUC


In [91]:
# Load data from S3
# data_location is the "s3://..." URI assembled in the variables-setup cell.
df = load_data(data_location)
# Last expression of the cell: display the first rows of the loaded data.
df.head()

Unnamed: 0,age,job,education,default,balance,housing,loan,y
0,32,7,2,1,-238,1,0,0
1,34,4,2,0,-478,1,1,0
2,32,3,2,0,266,1,0,0
3,36,7,2,1,13,0,1,0
4,23,11,2,0,486,0,0,0


In [92]:
# Shuffle the loaded data and split it into a train part and a test part,
# then report the shape of each part.
train_data, test_data = split_data(df, shuffle=True)
print(*(part.shape for part in (train_data, test_data)))

(44654, 8) (11164, 8)


In [93]:
# Key prefix under which the splits are stored, and the object names of the
# train and test CSV files.
prefix = "step_function"
FILE_TRAIN, FILE_TEST = "train.csv", "test.csv"

In [94]:
# S3 object keys always use "/" as the separator, so build them explicitly
# instead of with os.path.join, whose separator depends on the host OS.
train_s3_file = "{}/{}".format(prefix, FILE_TRAIN)
test_s3_file = "{}/{}".format(prefix, FILE_TEST)
print(train_s3_file, test_s3_file)

step_function/train.csv step_function/test.csv


In [95]:
from io import StringIO

# Upload the train and test splits to Amazon S3 as CSV objects.

bucket = "streaming-data-platform-ml-data"
s3_resource = boto3.resource("s3")  # one resource handle serves both uploads

# Derive the object keys from prefix / FILE_TRAIN / FILE_TEST instead of reading
# train_s3_file / test_s3_file: those names are rebound to full "s3://..." URIs
# further down, so a re-run of this cell would otherwise upload to a wrong key.
uploads = (
    (train_data, "{}/{}".format(prefix, FILE_TRAIN)),
    (test_data, "{}/{}".format(prefix, FILE_TEST)),
)

for frame, key in uploads:
    csv_buffer = StringIO()
    frame.to_csv(csv_buffer, index=False)  # the index is not written
    put_response = s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue())

# Last expression of the cell: show the response of the final (test split) upload.
put_response

{'ResponseMetadata': {'RequestId': '2ADC725091V4FQWW',
  'HostId': 'HEtw7LkcGF1au8TzaVmW/6TQawypeULLUF0UfbzbZs1RYa/t0a7OELqiFJF3lE9MH8P9aphdfOg5p8Lpa3rs5A==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'HEtw7LkcGF1au8TzaVmW/6TQawypeULLUF0UfbzbZs1RYa/t0a7OELqiFJF3lE9MH8P9aphdfOg5p8Lpa3rs5A==',
   'x-amz-request-id': '2ADC725091V4FQWW',
   'date': 'Fri, 12 Jan 2024 16:12:23 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"26e13fd6a8eb41e45ebbb10131fa9dfd"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"26e13fd6a8eb41e45ebbb10131fa9dfd"',
 'ServerSideEncryption': 'AES256'}

In [96]:
# Full S3 URIs of the uploaded splits. They are built from the constants
# rather than from the previous values of train_s3_file / test_s3_file, so
# running this cell more than once cannot stack a second "s3://<bucket>/"
# in front of an already-complete URI.
train_s3_file = "s3://{}/{}/{}".format(bucket, prefix, FILE_TRAIN)
test_s3_file = "s3://{}/{}/{}".format(bucket, prefix, FILE_TEST)
print(train_s3_file, test_s3_file)

s3://streaming-data-platform-ml-data/step_function/train.csv s3://streaming-data-platform-ml-data/step_function/test.csv


In [97]:
# Job, model and endpoint names have to be unique in SageMaker; a reused name
# makes the execution fail. They are therefore declared as placeholders here
# and supplied dynamically for every execution.
placeholder_schema = dict.fromkeys(("JobName", "ModelName", "EndpointName"), str)
execution_input = ExecutionInput(schema=placeholder_schema)

In [102]:
# Estimator that runs train.py (from the pycaret_image_files directory of the
# project repository, branch aws-ml-model-retraining) inside the custom
# training image.
#
# A first Estimator configured with container_entry_point="train.py" used to
# be built here and was overwritten straight away by this one, so it never
# reached the training step; that dead definition has been removed.
#
# image_uri comes from the variables-setup cell (ecr_repo_uri); the recorded
# output of that cell shows the same URI that was previously hard-coded here.
pycaret_estimator = Estimator(
    image_uri=image_uri,
    source_dir="pycaret_image_files",
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    entry_point="train.py",
    git_config={
        "repo": "https://github.com/konradbachusz/AWS-MLOps-module",
        "branch": "aws-ml-model-retraining",
    },
)

# Forward the algorithm choice and the target column name as hyperparameters.
pycaret_estimator.set_hyperparameters(algorithm_choice=algorithm_choice, target=target)

In [103]:
# Show the execution role obtained from get_execution_role() in the setup cell.
print(role)

arn:aws:iam::135544376709:role/banking-classification-sagemaker-role


In [104]:
# Workflow state that runs the estimator's training job. Only the "train"
# channel is supplied, and the job name is filled in from the execution input
# placeholder at run time.
train_channels = {"train": train_s3_file}
training_step = steps.TrainingStep(
    "Train Step",
    estimator=pycaret_estimator,
    job_name=execution_input["JobName"],
    data=train_channels,
)