# XGBoost SageMaker Pipeline with EMR

In [353]:
import sagemaker
from sagemaker import image_uris
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import CacheConfig, ProcessingStep, TrainingStep
from sagemaker.processing import ScriptProcessor
from sagemaker.workflow.pipeline_context import PipelineSession

In [354]:
# Sessions, region and IAM roles shared by the cells below.
sagemaker_session = sagemaker.Session()
# Region is read from the default session's underlying boto3 session.
aws_region = sagemaker_session.boto_session.region_name
# Role given to every ScriptProcessor and to pipeline.upsert().
sagemaker_role = sagemaker.get_execution_role()
# Session object handed to the processors as their sagemaker_session.
pipeline_session = PipelineSession()
# Role ARN forwarded to the job-submission containers as the EMR_ROLE env var.
emr_role = "arn:aws:iam::047922237497:role/EMRServerlessExecutionRole-01"

In [355]:
# S3 locations used throughout the notebook.
s3_bucket = "aamlops2024"
s3_prefix = "xgboost-pipeline-emr"
# Object key (relative to the bucket) passed to the processing containers
# through their S3_KEY / APP_INFO_KEY environment variables.
app_info_key = "/".join((s3_prefix, "emr-tracking", "app_info.json"))
# Base S3 URI of this project's prefix.
script_uri = "s3://" + s3_bucket + "/" + s3_prefix


In [356]:
# Look up the SageMaker XGBoost container image for the session's region.
# NOTE: image_uri is reassigned by the cells that follow, so this value is
# not the one the ScriptProcessor objects are built with.
image_uri = image_uris.retrieve(
    version="1.5-1",      # container version
    py_version="py3",     # logged as unnecessary for this framework (see output)
    framework="xgboost",
    region=aws_region,
)
image_uri

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1'

In [357]:
# Hard-coded ECR URI of a processing container image (account 763104351884,
# us-east-1). The next cell assigns image_uri again.
image_uri = (
    "763104351884.dkr.ecr.us-east-1.amazonaws.com"
    "/sagemaker-processing-container:latest"
)
image_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/sagemaker-processing-container:latest'

In [358]:
# Resolve the SageMaker scikit-learn container image; this is the image_uri
# that every ScriptProcessor defined below is constructed with.
# The region is taken from the active session (aws_region) rather than a
# hard-coded "us-east-1", so the image registry matches the region the
# processing jobs are launched in.
image_uri = image_uris.retrieve(
    framework="sklearn",
    region=aws_region,
    version="1.2-1",
    instance_type="ml.m5.large",  # same instance type the processors request
)
image_uri

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3'

In [359]:
# Define cache configuration
# Step caching settings, passed as cache_config to the steps that opt in.
# expire_after is an ISO 8601 duration: "P30D" means 30 days.
cache_config = CacheConfig(expire_after="P30D", enable_caching=True)

## 0. Create EMR Application step

In [360]:
# Processor used by the Create_EMR_Application step. The container is told
# which bucket/key to use for the app-info JSON via environment variables.
script_processor = ScriptProcessor(
    role=sagemaker_role,
    sagemaker_session=pipeline_session,
    image_uri=image_uri,
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.large",
    base_job_name="xgb_pipeline_emr",
    env=dict(S3_BUCKET=s3_bucket, S3_KEY=app_info_key),
)

In [375]:
# First pipeline step: runs ./code/create_emr_app.py with the processor
# currently bound to script_processor.
# NOTE(review): step caching is enabled here, so a cache hit presumably skips
# the script entirely — confirm that is intended for a step that creates an
# external resource.
create_emr_app_step = ProcessingStep(
    processor=script_processor,
    cache_config=cache_config,
    code="./code/create_emr_app.py",
    name="Create_EMR_Application",
)

## 1. Preprocessing

### 1.1 Submit Preprocessing Job

In [362]:
# Upload the Spark preprocessing script to the project's code/ prefix in S3.
# The returned S3 URI is later passed to the submit container as ENTRY_POINT.
destination_path = "s3://{}/{}/code".format(s3_bucket, s3_prefix)
local_path = "./code/spark_preprocessing.py"
preprocessing_entry_point_s3uri = sagemaker.s3.S3Uploader.upload(
    desired_s3_uri=destination_path,
    local_path=local_path,
)
preprocessing_entry_point_s3uri

's3://aamlops2024/xgboost-pipeline-emr/code/spark_preprocessing.py'

In [363]:
# Processor for the step that submits the preprocessing job. Everything the
# submit script needs (app-info location, Spark entry point, EMR role, job
# type) is supplied through environment variables.
script_processor = ScriptProcessor(
    role=sagemaker_role,
    sagemaker_session=pipeline_session,
    image_uri=image_uri,
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.large",
    base_job_name="xgb_pipeline_emr_submit_job",
    env={
        "APP_INFO_KEY": app_info_key,
        "S3_BUCKET": s3_bucket,
        "S3_PREFIX": s3_prefix,
        "ENTRY_POINT": preprocessing_entry_point_s3uri,
        "EMR_ROLE": emr_role,
        "JOB_TYPE": "preprocessing",
    },
)

In [364]:
# Step that runs code/submit_emr_job_preprocessing.py; the target region is
# passed to the script on its command line.
submit_preprocess_step = ProcessingStep(
    processor=script_processor,
    name="xgb-emr-Submit_Preprocessing",
    code="code/submit_emr_job_preprocessing.py",
    job_arguments=["--region", "us-east-1"],
)
# Must run after the EMR application step.
submit_preprocess_step.add_depends_on([create_emr_app_step])

### 1.2 Wait for Preprocessing Completion

In [365]:
# Processor for the step that waits on the preprocessing job. It gets the
# app-info location and job type, but no entry point or EMR role.
# NOTE(review): base_job_name reuses the "submit_job" prefix even though this
# processor backs a wait step — looks like copy/paste; confirm before renaming.
script_processor = ScriptProcessor(
    role=sagemaker_role,
    sagemaker_session=pipeline_session,
    image_uri=image_uri,
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.large",
    base_job_name="xgb_pipeline_emr_submit_job",
    env={
        "APP_INFO_KEY": app_info_key,
        "S3_BUCKET": s3_bucket,
        "S3_PREFIX": s3_prefix,
        "JOB_TYPE": "preprocessing",
    },
)

In [366]:
# Step that runs code/wait_for_emr_job.py for the preprocessing job.
wait_preprocess_step = ProcessingStep(
    processor=script_processor,
    name="Wait_Preprocessing_Job",
    code="code/wait_for_emr_job.py",
    job_arguments=["--region", "us-east-1"],
)
# Only start waiting once the preprocessing job has been submitted.
wait_preprocess_step.add_depends_on([submit_preprocess_step])

## 2. Model Training

### 2.1 Submit Training Job

In [367]:
# Upload the Spark training script to the project's code/ prefix in S3.
# The returned S3 URI is later passed to the submit container as ENTRY_POINT.
destination_path = "s3://{}/{}/code".format(s3_bucket, s3_prefix)
local_path = "./code/spark_training.py"
training_entry_point_s3uri = sagemaker.s3.S3Uploader.upload(
    desired_s3_uri=destination_path,
    local_path=local_path,
)
training_entry_point_s3uri

's3://aamlops2024/xgboost-pipeline-emr/code/spark_training.py'

In [368]:
# Processor for the step that submits the training job. Same shape as the
# preprocessing submit processor, but pointing at the training entry point
# and tagged with JOB_TYPE "training".
script_processor = ScriptProcessor(
    role=sagemaker_role,
    sagemaker_session=pipeline_session,
    image_uri=image_uri,
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.large",
    base_job_name="xgb_pipeline_emr_submit_job",
    env={
        "APP_INFO_KEY": app_info_key,
        "EMR_ROLE": emr_role,
        "ENTRY_POINT": training_entry_point_s3uri,
        "S3_BUCKET": s3_bucket,
        "S3_PREFIX": s3_prefix,
        "JOB_TYPE": "training",
    },
)


In [369]:
# Step that runs code/submit_emr_job_training.py; the target region is passed
# to the script on its command line.
submit_train_step = ProcessingStep(
    processor=script_processor,
    name="Submit_Training_Job",
    code="code/submit_emr_job_training.py",
    job_arguments=["--region", "us-east-1"],
)
# Training is submitted only after the preprocessing wait step has finished.
submit_train_step.add_depends_on([wait_preprocess_step])

### 2.2 Wait for Training Completion

In [370]:
# Processor for the step that waits on the training job. It gets the app-info
# location and job type, but no entry point or EMR role.
# NOTE(review): base_job_name reuses the "submit_job" prefix even though this
# processor backs a wait step — looks like copy/paste; confirm before renaming.
script_processor = ScriptProcessor(
    role=sagemaker_role,
    sagemaker_session=pipeline_session,
    image_uri=image_uri,
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.large",
    base_job_name="xgb_pipeline_emr_submit_job",
    env={
        "APP_INFO_KEY": app_info_key,
        "S3_BUCKET": s3_bucket,
        "S3_PREFIX": s3_prefix,
        "JOB_TYPE": "training",
    },
)

In [371]:
# Step that runs ./code/wait_for_emr_job.py for the training job.
# No cache_config is attached, matching Wait_Preprocessing_Job: with step
# caching on, a rerun whose step arguments are unchanged is served from the
# cache and the wait script never executes, so the pipeline would report
# success without waiting on the training job that was just submitted.
wait_train_step = ProcessingStep(
    name="Wait_Training_Job",
    processor=script_processor,
    code="./code/wait_for_emr_job.py",
    job_arguments=[
        "--region", "us-east-1",
    ],
)
# Only start waiting once the training job has been submitted.
wait_train_step.add_depends_on([submit_train_step])

## 3. Pipeline

In [372]:
# Assemble the pipeline. Execution order is determined by the add_depends_on()
# links declared on the steps; the list simply follows that same order.
pipeline = Pipeline(
    steps=[
        create_emr_app_step,     # create the EMR application
        submit_preprocess_step,  # submit preprocessing
        wait_preprocess_step,    # wait for preprocessing
        submit_train_step,       # submit training
        wait_train_step,         # wait for training
    ],
    name="xgboost-pipeline-emr-v1-0",
)

In [373]:
# Create the pipeline definition in SageMaker, or update it if it already
# exists, using the notebook's execution role. The response (shown below)
# includes the pipeline ARN.
pipeline.upsert(role_arn=sagemaker_role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:047922237497:pipeline/xgboost-pipeline-emr-v1-0',
 'ResponseMetadata': {'RequestId': 'fe13a818-1b00-4419-892a-5b0d1a2a09af',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fe13a818-1b00-4419-892a-5b0d1a2a09af',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '93',
   'date': 'Fri, 04 Apr 2025 05:51:52 GMT'},
  'RetryAttempts': 0}}

In [374]:
# Start an execution of the upserted pipeline; the returned object (shown
# below) carries the execution ARN.
pipeline.start()

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:047922237497:pipeline/xgboost-pipeline-emr-v1-0/execution/7tgthtxh17em', sagemaker_session=<sagemaker.session.Session object at 0x7ff20730eb10>)