In [1]:
!mkdir -p code

In [2]:

ARTIFACT_DATA = "artifacts"




In [3]:
import os
import json
import boto3
import sagemaker
import sagemaker.session

from sagemaker import utils
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput, CreateModelInput, TransformInput
from sagemaker.model import Model
from sagemaker.transformer import Transformer

from sagemaker.model_metrics import MetricsSource, ModelMetrics, FileSource
from sagemaker.drift_check_baselines import DriftCheckBaselines
from sagemaker.processing import (
    ProcessingInput,
    ProcessingOutput,
    ScriptProcessor,
)
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet

from sagemaker.workflow.parameters import (
    ParameterBoolean,
    ParameterInteger,
    ParameterString,
)
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.steps import (
    ProcessingStep,
    TrainingStep,
    CreateModelStep,
    TransformStep,
)
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.pipeline_context import PipelineSession

# Importing new steps and helper functions

from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.clarify_check_step import (
    DataBiasCheckConfig,
    ClarifyCheckStep,
    ModelBiasCheckConfig,
    ModelPredictedLabelConfig,
    ModelExplainabilityCheckConfig,
    SHAPConfig,
)
from sagemaker.workflow.quality_check_step import (
    DataQualityCheckConfig,
    ModelQualityCheckConfig,
    QualityCheckStep,
)
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.functions import Join
from sagemaker.model_monitor import DatasetFormat, model_monitoring
from sagemaker.clarify import BiasConfig, DataConfig, ModelConfig

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Fetched defaults config from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [10]:
region = sagemaker.Session().boto_region_name
sm_client = boto3.client("sagemaker")
boto_session = boto3.Session(region_name=region)
sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)
pipeline_session = PipelineSession()
prefix = "model-monitor-clarify-step-pipeline"

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [21]:
print(sagemaker_session.sagemaker_config)

{'SchemaVersion': '1.0', 'SageMaker': {'PythonSDK': {'Modules': {'Session': {'DefaultS3Bucket': 'sagemaker-domain-dev', 'DefaultS3ObjectKeyPrefix': 'lauren-sso'}, 'RemoteFunction': {'S3KmsKeyId': 'arn:aws:kms:us-east-2:642693618675:key/fcc40d6b-e6e3-40e8-8128-d7fc136539f7', 'S3RootUri': 's3://sagemaker-domain-dev/lauren-sso', 'VolumeKmsKeyId': 'arn:aws:kms:us-east-2:642693618675:key/fcc40d6b-e6e3-40e8-8128-d7fc136539f7'}}}, 'TransformJob': {'DataCaptureConfig': {'KmsKeyId': 'arn:aws:kms:us-east-2:642693618675:key/fcc40d6b-e6e3-40e8-8128-d7fc136539f7'}, 'TransformOutput': {'KmsKeyId': 'arn:aws:kms:us-east-2:642693618675:key/fcc40d6b-e6e3-40e8-8128-d7fc136539f7'}, 'TransformResources': {'VolumeKmsKeyId': 'arn:aws:kms:us-east-2:642693618675:key/fcc40d6b-e6e3-40e8-8128-d7fc136539f7'}}, 'Pipeline': {'Tags': [{'Key': 'sagemaker:user-profile-arn', 'Value': 'arn:aws:sagemaker:us-east-2:642693618675:user-profile/d-zuyfohg0ma5d/lauren-sso'}, {'Key': 'domain_name', 'Value': 'sagemaker-domain-dev'

In [6]:
role = sagemaker.get_execution_role()
base_job_prefix = "bike-predictions"
model_package_group_name = "bike-share-model-group"
pipeline_name = "bikepredictionpipeline"

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [15]:
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.t3.medium")
model_approval_status = ParameterString(
    name="ModelApprovalStatus", default_value="PendingManualApproval"
)
# The dataset used here is the open source Abalone dataset that can be found
# here - https://archive.ics.uci.edu/ml/datasets/abalone
input_data = ParameterString(
    name="InputDataUrl",
    default_value=f"s3://{pipeline_session.default_bucket()}/{pipeline_session.default_bucket_prefix}/day.csv",
)

In [16]:
%%writefile code/preprocess.py

import os
import boto3
import numpy as np
import pandas as pd
import time

import argparse
import logging
import pathlib
import requests
import tempfile



logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())


if __name__ == "__main__":
    logger.debug("Starting preprocessing.")
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-data", type=str, required=True)
    args = parser.parse_args()    
    
    base_dir = "/opt/ml/processing"
    pathlib.Path(f"{base_dir}/data").mkdir(parents=True, exist_ok=True)
    input_data = args.input_data
    bucket = input_data.split("/")[2]
    key = "/".join(input_data.split("/")[3:])

    logger.info("Downloading data from bucket: %s, key: %s", bucket, key)
    fn = f"{base_dir}/data/day.csv"
    s3 = boto3.resource("s3")
    s3.Bucket(bucket).download_file(key, fn)

    logger.debug("Reading downloaded data.")
    bike = pd.read_csv(fn, sep=',')
    os.unlink(fn)
    
    #Rename the columns
    bike.rename(columns={'instant':'rec_id','dteday':'datetime','yr':'year','mnth':'month','weathersit':'weather_condition',
                       'hum':'humidity','cnt':'total_count'},inplace=True)

    #Type casting the datetime and numerical attributes to category

    bike['datetime']=pd.to_datetime(bike.datetime, format="%d-%m-%Y")
    bike['season']=bike.season.astype('category')
    bike['year']=bike.year.astype('category')
    bike['month']=bike.month.astype('category')
    bike['holiday']=bike.holiday.astype('category')
    bike['weekday']=bike.weekday.astype('category')
    bike['workingday']=bike.workingday.astype('category')
    bike['weather_condition']=bike.weather_condition.astype('category')

    #TODO - Add quality check to test for Nulls
    
    #create dataframe for outliers
    wind_hum=pd.DataFrame(bike,columns=['windspeed','humidity'])
     #Cnames for outliers                     
    cnames=['windspeed','humidity']       
                      
    for i in cnames:
        q75,q25=np.percentile(wind_hum.loc[:,i],[75,25]) # Divide data into 75%quantile and 25%quantile.
        iqr=q75-q25 #Inter quantile range
        min=q25-(iqr*1.5) #inner fence
        max=q75+(iqr*1.5) #outer fence
        wind_hum.loc[wind_hum.loc[:,i]<min,:i]=np.nan  #Replace with NA
        wind_hum.loc[wind_hum.loc[:,i]>max,:i]=np.nan  #Replace with NA
    #Imputating the outliers by mean Imputation
    wind_hum['windspeed']=wind_hum['windspeed'].fillna(wind_hum['windspeed'].mean())
    wind_hum['humidity']=wind_hum['humidity'].fillna(wind_hum['humidity'].mean())

    #Replacing the imputated windspeed
    bike['windspeed']=bike['windspeed'].replace(wind_hum['windspeed'])
    #Replacing the imputated humidity
    bike['humidity']=bike['humidity'].replace(wind_hum['humidity'])
    
    #Create a new dataset 
    features=bike[['season','month','year','weekday','holiday','workingday','weather_condition','humidity','temp','windspeed']]
    #categorical attributes
    cat_attributes=['season','holiday','workingday','weather_condition','year']
    encoded_features=pd.get_dummies(features,columns=cat_attributes)
    logger.info(f"Shape of transfomed dataframe:: {encoded_features.shape}")

    
    pd.DataFrame(encoded_features).to_csv(f"{base_dir}/processed/data.csv", header=False, index=False)



Overwriting code/preprocess.py


In [22]:
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    instance_type="ml.t3.medium",
    instance_count=processing_instance_count,
    base_job_name=f"{base_job_prefix}/xgboost-bike-predictions-preprocess",
    sagemaker_session=pipeline_session,
    role=role,
)
processor_args = sklearn_processor.run(
    outputs=[
        ProcessingOutput(output_name="processed", 
                         source="/opt/ml/processing/processed",
                         destination=f"s3://{pipeline_session.default_bucket()}/{pipeline_session.default_bucket_prefix}/processed")
    ],
    code="code/preprocess.py",
    arguments=["--input-data", input_data],
)
step_process = ProcessingStep(name="PreprocessBikeData", step_args=processor_args)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.ProcessingOutputConfig.KmsKeyId
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.ProcessingResources.ClusterConfig.VolumeKmsKeyId




# Define pipeline steps for model inference

In [23]:
import boto3

# Initialize the SageMaker client
sagemaker_client = boto3.client('sagemaker')

# Define the model package group name
model_package_group_name = 'bike-share-model-group-rf'

# List the model packages with the 'Approved' status
response = sagemaker_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    ModelApprovalStatus='Approved',
    SortBy='CreationTime',
    SortOrder='Descending'
)

# Get the latest approved model package
latest_approved_model = response['ModelPackageSummaryList'][0]
model_package_arn = latest_approved_model['ModelPackageArn']
model_version = latest_approved_model['ModelPackageVersion']
model_name = model_package_group_name + "-v" + str(model_version)

# Describe the model package to get configuration
model_package_details = sagemaker_client.describe_model_package(ModelPackageName=model_package_arn)

model_inference_specification = model_package_details['InferenceSpecification']['Containers'][0]
print(f"Latest approved model package ARN: {model_name}")
print(f"Model inference specification: {model_inference_specification}")

Latest approved model package ARN: bike-share-model-group-rf-v1
Model inference specification: {'Image': '257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.7-1', 'ImageDigest': 'sha256:dc8a4cde38c9404662c39e33b8477595ba86632b9de313408ac5238b316997cb', 'ModelDataUrl': 's3://sagemaker-domain-dev/artifacts/demo-xgboost-model/model', 'ModelDataETag': 'ec3fa86c00a63ef6ad9035802dca6c43'}


# Testing model inference and container in local studio instance
Reference documentation for local mode: https://docs.aws.amazon.com/sagemaker/latest/dg/studio-updated-local.html

In [26]:
from sagemaker.local import LocalSession

local_sagemaker_session = LocalSession(boto_session=boto_session)
local_sagemaker_session.config = {'local': {'local_code': True}}

local_model = Model(
    image_uri=model_inference_specification['Image'],
    name="BikeModel",
    model_data=f"{model_inference_specification['ModelDataUrl']}",
    sagemaker_session=local_sagemaker_session,
    role=role,
    # entry_point="inference.py",
    # env={"SAGEMAKER_PROGRAM": "inference.py",
    # "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/",
    # },
)

local_model.create(instance_type="ml.m5.large")

#Define the transformer
local_transformer = Transformer(
    model_name=local_model.name,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    output_path=f"s3://{pipeline_session.default_bucket()}/BikePredictions",
    accept="text/csv",
    assemble_with="Line",
    sagemaker_session=local_sagemaker_session,
    strategy="MultiRecord", 
)

local_transformer.transform(
    data=f's3://{pipeline_session.default_bucket()}/{pipeline_session.default_bucket_prefix}/bikepredictionpipeline/j3xr2zx180ik/PreprocessBikeData/output/processed/data.csv', 
    data_type='S3Prefix', 
    content_type="text/csv",
    split_type="Line",
    join_source="Input",
    wait=True,
    logs=True,
    )


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


INFO:sagemaker:Creating model with name: BikeModel
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.


sagemaker.config INFO - Applied value from config key = SageMaker.TransformJob.TransformResources.VolumeKmsKeyId
sagemaker.config INFO - Applied value from config key = SageMaker.TransformJob.TransformOutput.KmsKeyId


INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2025-02-25-17-11-16-475
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.local.image:'Docker Compose' found using Docker CLI.
INFO:sagemaker.local.image:serving
INFO:sagemaker.local.image:creating hosting dir in /home/sagemaker-user/tmp/tmpu6v3u23h
https://docs.docker.com/engine/reference/commandline/login/#credential-stores

INFO:sagemaker.local.image:docker command: docker pull 257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.7-1


Login Succeeded


INFO:sagemaker.local.image:image pulled: 257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.7-1
INFO:sagemaker.local.image:docker compose file: 
services:
  sagemaker-local:
    command: serve
    container_name: phel0vo2s6-sagemaker-local
    environment:
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    image: 257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.7-1
    network_mode: sagemaker
    stdin_open: true
    tty: true
    volumes:
    - /home/sagemaker-user/tmp/tmp8al9geqp:/opt/ml/model
version: '2.3'

INFO:sagemaker.local.image:docker command: docker compose -f /home/sagemaker-user/tmp/tmpu6v3u23h/docker-compose.yaml up --build --abort-on-container-exit
INFO:sagemaker.local.entities:Checking if serving container is up, attempt: 5
INFO:sagemaker.local.entities:Container still not up, got: -1


Attaching to phel0vo2s6-sagemaker-local
phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:03:INFO] No GPUs detected (normal if no gpus installed)
phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:03:INFO] No GPUs detected (normal if no gpus installed)
phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:03:INFO] nginx config: 
phel0vo2s6-sagemaker-local  | worker_processes auto;
phel0vo2s6-sagemaker-local  | daemon off;
phel0vo2s6-sagemaker-local  | pid /tmp/nginx.pid;
phel0vo2s6-sagemaker-local  | error_log  /dev/stderr;
phel0vo2s6-sagemaker-local  | 
phel0vo2s6-sagemaker-local  | worker_rlimit_nofile 4096;
phel0vo2s6-sagemaker-local  | 
phel0vo2s6-sagemaker-local  | events {
phel0vo2s6-sagemaker-local  |   worker_connections 2048;
phel0vo2s6-sagemaker-local  | }
phel0vo2s6-sagemaker-local  | 
phel0vo2s6-sagemaker-local  | http {
phel0vo2s6-sagemaker-local  |   include /etc/nginx/mime.types;
phel0vo2s6-sagemaker-local  |   default_type application/octet-stream;
phel0vo2s6-sagemaker-local  |   

INFO:sagemaker.local.entities:Checking if serving container is up, attempt: 10


phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:06:INFO] No GPUs detected (normal if no gpus installed)
phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:06:INFO] Loading the model from /opt/ml/model/model
phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:06:INFO] Model objective : reg:squarederror
phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:06:INFO] No GPUs detected (normal if no gpus installed)
phel0vo2s6-sagemaker-local  | 127.0.0.1 - - [25/Feb/2025:17:13:06 +0000] "GET /ping HTTP/1.1" 200 0 "-" "python-urllib3/1.26.19"
phel0vo2s6-sagemaker-local  | 127.0.0.1 - - [25/Feb/2025:17:13:06 +0000] "GET /execution-parameters HTTP/1.1" 200 84 "-" "python-urllib3/1.26.19"
phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:06:INFO] No GPUs detected (normal if no gpus installed)
phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:06:INFO] Loading the model from /opt/ml/model/model
phel0vo2s6-sagemaker-local  | [2025-02-25:17:13:06:INFO] Model objective : reg:squarederror
.

# Define pipeline inference steps and create model

In [29]:
pipeline_model = Model(
    image_uri=model_inference_specification['Image'],
    name=model_name,
    model_data=f"{model_inference_specification['ModelDataUrl']}.tar.gz",
    sagemaker_session=sagemaker_session,
    role=role,
    # entry_point="inference.py",
    # env={"SAGEMAKER_PROGRAM": "inference.py",
    # "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/",
    # },
)

pipeline_model.create(instance_type="ml.m5.large")


INFO:sagemaker:Creating model with name: bike-share-model-group-rf-v1


In [30]:
# Define the transformer
transformer = Transformer(
    model_name=model_name,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    output_path=f"s3://{pipeline_session.default_bucket()}/{pipeline_session.default_bucket_prefix}/BikePredictions",
    accept="text/csv",
    assemble_with="Line",
)

data=step_process.properties.ProcessingOutputConfig.Outputs["processed"].S3Output.S3Uri
# data='s3://sagemaker-us-east-2-233603974917/bikepredictionpipeline/j3xr2zx180ik/PreprocessBikeData/output/processed' 


step_transform = TransformStep(
    name=f"Transform",
    transformer=transformer,
    inputs=TransformInput(
        data=data,
        # input_filter="$[1:]",
        join_source="Input",
        # output_filter="$[0,-1]",
        content_type="text/csv",
        split_type="Line",
    ),
    
    
)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.TransformJob.TransformResources.VolumeKmsKeyId
sagemaker.config INFO - Applied value from config key = SageMaker.TransformJob.TransformOutput.KmsKeyId


In [31]:
# pipeline instance
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_count,
        training_instance_type,
        model_approval_status,
        input_data,
    ],
    steps=[
        step_process,
        step_transform,
    ],
    sagemaker_session=pipeline_session,
)

In [32]:
import json

definition = json.loads(pipeline.definition())
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceCount',
   'Type': 'Integer',
   'DefaultValue': 1},
  {'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.t3.medium'},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'},
  {'Name': 'InputDataUrl',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-domain-dev/lauren-sso/day.csv'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'PreprocessBikeData',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'VolumeKmsKeyId': 'arn:aws:kms:us-east-2:642693618675:key/fcc40d6b-e6e3-40e8-8128-d7fc136539f7',
      'InstanceType': 'ml.t3.medium',
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '25775804481

In [33]:
pipeline.upsert(role_arn=role)


sagemaker.config INFO - Applied value(s) from config key = SageMaker.Pipeline.Tags




{'PipelineArn': 'arn:aws:sagemaker:us-east-2:642693618675:pipeline/bikepredictionpipeline',
 'ResponseMetadata': {'RequestId': '4b1116c5-adf8-4d95-9bf1-6c2573bd9590',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4b1116c5-adf8-4d95-9bf1-6c2573bd9590',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '90',
   'date': 'Tue, 25 Feb 2025 17:14:38 GMT'},
  'RetryAttempts': 0}}

In [34]:
execution = pipeline.start()