# SageMaker Pipeline

### Define Libraries

In [1]:
import sagemaker
# from sagemaker import xgboost
from sagemaker.estimator import Estimator as xgb_Estimator
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import ParameterString, ParameterInteger, ParameterFloat
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionGreaterThan
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.functions import Join
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.fail_step import FailStep
# from sagemaker.model import Model

from sagemaker.sklearn.processing import SKLearnProcessor

import os

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


### Define variables

In [2]:
# Sessions: `sagemaker_session` is used for direct lookups (region, default
# bucket), while `pipeline_session` is handed to the processors and estimator
# that are turned into pipeline steps further down.
sagemaker_session = sagemaker.Session()
pipeline_session = PipelineSession()
sagemaker_role = sagemaker.get_execution_role()

aws_region = sagemaker_session.boto_session.region_name

# XGBoost 1.5-1 container image for the session's region.
image_uri = image_uris.retrieve(
    framework="xgboost",
    region=aws_region,
    version="1.5-1",
    py_version="py3",
)

# S3 layout: every artifact of this notebook lives under s3_base_url.
s3_prefix = "xgb-pipeline-lam-01"
s3_bucket = sagemaker_session.default_bucket()
s3_base_url = f"s3://{s3_bucket}/{s3_prefix}"

# Show the resolved values as the cell output.
s3_bucket, s3_prefix, s3_base_url, aws_region, image_uri

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


('sagemaker-us-east-1-678467581510',
 'xgb-pipeline-lam-01',
 's3://sagemaker-us-east-1-678467581510/xgb-pipeline-lam-01',
 'us-east-1',
 '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1')

### Define the pipeline parameters

In [3]:
# Pipeline parameters: named values with defaults that can be overridden when
# an execution is started, without redefining the pipeline.

# Instance settings for the processing jobs.
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge")
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# Instance type for the training job. `instance_type` and
# `training_instance_type` deliberately refer to the SAME parameter object:
# two separate objects sharing the name "TrainingInstanceType" could end up
# with different defaults while the pipeline only declares one of them.
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
instance_type = training_instance_type

# Not referenced by any step visible in this notebook.
batch_instance_type = ParameterString(name="BatchInstanceType", default_value="ml.m5.xlarge")

# Approval status assigned to the model package when it is registered.
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")

# S3 locations: raw input CSV and the training job's output path.
input_data_uri = ParameterString(name="InputDataUri", default_value=f"s3://{s3_bucket}/{s3_prefix}/data/data.csv")
output_path = ParameterString(name="OutputPath", default_value=f"s3://{s3_bucket}/{s3_prefix}/output")

# Accuracy the evaluated model has to exceed to be registered.
accuracy_threshold = ParameterFloat(name="AccuracyThreshold", default_value=0.85)

# Plain string (not a pipeline parameter): model package group used at registration.
model_package_group_name = "xgb-Model-01"

### Step 0: Get Data

In [4]:
import boto3

# Download the synthetic churn dataset from the SageMaker example-files bucket
# of the session's region into ./data, which is where the following cells read
# and write the dataset.
os.makedirs("data", exist_ok=True)  # make sure the target directory exists on a fresh checkout
s3 = boto3.client("s3")
s3.download_file(
    f"sagemaker-example-files-prod-{sagemaker_session.boto_region_name}",
    "datasets/tabular/synthetic/churn.txt",
    "./data/churn.txt",
)

In [5]:
import pandas as pd
# Load the churn dataset from ./data/churn.txt into a DataFrame.
data = pd.read_csv("./data/churn.txt")

In [6]:
# Write the dataset to ./data/data.csv.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default — confirm code/preprocess.py expects that column, otherwise pass
# index=False.
data.to_csv("./data/data.csv")

In [7]:
# Move data to S3
local_path = "./data/data.csv"
# Build the S3 URI with string formatting rather than os.path.join: S3 URIs
# always use "/", whereas os.path.join uses the local OS path separator.
destination_path = f"{s3_base_url}/data"
s3_input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=local_path,
    desired_s3_uri=destination_path,
)
# Display the URI of the uploaded object.
s3_input_data_uri

's3://sagemaker-us-east-1-678467581510/xgb-pipeline-lam-01/data/data.csv'

In [8]:
# Display the InputDataUri pipeline parameter so its default can be compared
# with the URI returned by the upload.
input_data_uri

ParameterString(name='InputDataUri', parameter_type=<ParameterTypeEnum.STRING: 'String'>, default_value='s3://sagemaker-us-east-1-678467581510/xgb-pipeline-lam-01/data/data.csv')

### Step 1: Data Preprocessing

In [9]:
# Processor that runs the preprocessing script in the scikit-learn container.
# It is bound to the PipelineSession, so run() below yields step arguments for
# the pipeline instead of launching a job right away.
sklearn_processor = SKLearnProcessor(
    framework_version="1.2-1",
    instance_type=processing_instance_type,
    instance_count=1,
    role=sagemaker_role,
    sagemaker_session=pipeline_session,
    base_job_name="xgb_pipeline",
)

# Each split is written to its OWN S3 prefix. If all three outputs shared one
# destination, the S3Uri of the "train" output (used as a training channel
# downstream) would also contain the validation and test files.
processing_output_base = f"s3://{s3_bucket}/{s3_prefix}/processing"

procesor_args = sklearn_processor.run(
    inputs=[
        ProcessingInput(source=input_data_uri, destination="/opt/ml/processing/input")
    ],
    outputs=[
        ProcessingOutput(
            output_name=split,
            source=f"/opt/ml/processing/{split}",
            destination=f"{processing_output_base}/{split}",
        )
        for split in ("train", "validation", "test")
    ],
    code="./code/preprocess.py"  # your custom preprocessing script
)

step_process = ProcessingStep(name="PreprocessData", step_args=procesor_args)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


### Step 2: Training

In [10]:
# Estimator for the XGBoost training job. It reuses `image_uri` resolved in the
# variables cell so that training and evaluation run on the same container
# image instead of looking the image up a second time.
xgb_estimator = xgb_Estimator(
    image_uri=image_uri,
    entry_point="code/train.py",  # your custom training script
    instance_type=training_instance_type,
    instance_count=1,
    role=sagemaker_role,
    output_path=output_path,
    sagemaker_session=pipeline_session,
    base_job_name="TrainXGBoostModel",
    hyperparameters={
        "objective": "binary:logistic",
        "num_round": 50,
        "max_depth": 5,
        "eta": 0.2,
        "gamma": 4,
        "min_child_weight": 6,
        "subsample": 0.8,
    },
)

# The training channels are wired to the S3 URIs of the preprocessing step's
# "train" and "validation" outputs; referencing the step properties also makes
# the training step depend on the preprocessing step.
processing_outputs = step_process.properties.ProcessingOutputConfig.Outputs
train_args = xgb_estimator.fit(
    inputs={
        "train": TrainingInput(processing_outputs["train"].S3Output.S3Uri, content_type="text/csv"),
        "validation": TrainingInput(processing_outputs["validation"].S3Output.S3Uri, content_type="text/csv"),
    }
)

step_train = TrainingStep(name="TrainXGBoostModel", step_args=train_args)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


### Step 3: Model Evaluation

In [11]:
# sklearn_processor_eval = SKLearnProcessor(
#     framework_version="1.2-1",
#     instance_type=processing_instance_type,
#     instance_count=1,
#     role=sagemaker_role,
#     sagemaker_session=pipeline_session,
#     base_job_name= "Xgb_Evaluation",
# )

# eval_args = sklearn_processor_eval.run(
#     inputs=[
#         ProcessingInput(source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination="/opt/ml/processing/model" ),
#         ProcessingInput(source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri, destination="/opt/ml/processing/test"),
#     ],
#     outputs=[ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation")],
#     code="code/evaluate.py"  # your custom evaluation script
# )

# evaluation_report = PropertyFile(
#     name="EvaluationReport", output_name="evaluation", path="evaluation.json",
# )

# step_eval = ProcessingStep (name="EvaluateModel", step_args=eval_args, property_files=[evaluation_report])



In [12]:
# Processor for the evaluation script. It runs on the XGBoost image resolved in
# the variables cell and is bound to the PipelineSession like the other steps.
script_processor_eval = ScriptProcessor(
    base_job_name="Xgb_Evaluation",
    role=sagemaker_role,
    sagemaker_session=pipeline_session,
    image_uri=image_uri,
    command=["python3"],  # interpreter used to launch the script
    instance_type=processing_instance_type,
    instance_count=1,
)

In [13]:
# Inputs of the evaluation job: the trained model artifact and the test split
# produced by the preprocessing step.
model_artifact_input = ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination="/opt/ml/processing/model",
)
test_split_input = ProcessingInput(
    source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
    destination="/opt/ml/processing/test",
)

# Output of the evaluation job: whatever the script writes to the evaluation
# directory is uploaded to the evaluation prefix in S3.
evaluation_output = ProcessingOutput(
    output_name="evaluation",
    source="/opt/ml/processing/evaluation",
    destination=f"s3://{s3_bucket}/{s3_prefix}/evaluation",
)

eval_args = script_processor_eval.run(
    code="code/evaluate.py",  # your custom evaluation script
    inputs=[model_artifact_input, test_split_input],
    outputs=[evaluation_output],
)

# Declares evaluation.json inside the "evaluation" output as a property file so
# that later steps can read values out of it.
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

step_eval = ProcessingStep(
    name="EvaluateModel",
    step_args=eval_args,
    property_files=[evaluation_report],
)

### Model Registry Step

In [14]:
# Registers the trained model artifact as a model package in the model package
# group, with the approval status taken from the pipeline parameter.
step_register = RegisterModel(
    name="MyModel",
    estimator=xgb_estimator,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    # Each supported instance type is listed once.
    inference_instances=["ml.t2.medium"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    # TODO: no model_metrics are attached to the registered model package.
)

### Condition Step

In [15]:
# Terminal step for the "else" branch of the accuracy check. The branch is
# taken when accuracy is NOT greater than the threshold, hence "<=". Join
# already inserts the separating space, so the text fragment has no trailing
# space of its own.
step_fail = FailStep(
    name="AccuracyFailed",
    error_message=Join(on=" ", values=["Execution failed due to model accuracy <=", accuracy_threshold]),
)

In [16]:
# Accuracy value read from the evaluation report of the evaluation step. The
# JSON path has to match the structure of the report written by evaluate.py.
reported_accuracy = JsonGet(
    step_name=step_eval.name,
    property_file=evaluation_report,
    json_path="binary_classification_metrics.accuracy.value",
)

# NOTE(review): despite the `lte` in its name, this is a greater-than condition
# (accuracy > threshold); the name is kept because the condition step uses it.
cond_lte = ConditionGreaterThan(left=reported_accuracy, right=accuracy_threshold)

In [17]:
# Branch on the accuracy condition: register the model when it holds,
# otherwise end the execution with the fail step.
on_pass = [step_register]
on_fail = [step_fail]

step_cond = ConditionStep(
    name="Check-Accuracy",
    conditions=[cond_lte],
    if_steps=on_pass,
    else_steps=on_fail,
)

### Define Pipeline Parameters, Steps and Conditions

In [18]:
# Assemble the pipeline. `parameters` lists the parameter objects that the
# steps actually reference; the training instance type is declared through
# `training_instance_type`, the same object the estimator was built with,
# rather than through a second object that merely shares its name.
pipeline = Pipeline(
    name="xgb-pipeline",
    sagemaker_session=sagemaker_session,
    parameters=[
        processing_instance_type,
        training_instance_type,
        model_approval_status,
        input_data_uri,
        output_path,
        accuracy_threshold,
    ],
    # The register and fail steps are reached through the condition step.
    steps=[step_process, step_train, step_eval, step_cond],
)

In [19]:
# Create the pipeline, or update it if it already exists, using the notebook's
# execution role. The service response is shown as the cell output.
pipeline.upsert(role_arn=sagemaker_role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:678467581510:pipeline/xgb-pipeline',
 'ResponseMetadata': {'RequestId': 'cc476ce3-baab-4946-be38-4a4142993245',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'cc476ce3-baab-4946-be38-4a4142993245',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '80',
   'date': 'Sun, 24 Nov 2024 01:55:23 GMT'},
  'RetryAttempts': 0}}

In [20]:
# Start an execution with the default parameter values. The returned execution
# object is only displayed here, not stored in a variable.
pipeline.start()

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:678467581510:pipeline/xgb-pipeline/execution/rcx5c0lzjzii', sagemaker_session=<sagemaker.session.Session object at 0x7ff60895cdd0>)