In [1]:
import sagemaker
# IAM execution-role ARN handed to the SageMaker jobs defined in this notebook.
role = "arn:aws:iam::282698011778:role/service-role/AmazonSageMaker-ExecutionRole-20250701T225193"

# SDK session object (same class as sagemaker.session.Session).
sagemaker_session = sagemaker.Session()

In [2]:
from sagemaker.workflow.parameters import ParameterString, ParameterInteger

# Pipeline parameters: named values with defaults that the pipeline steps
# reference instead of hard-coded literals.

# S3 URI of the input data. The default is the same dataset object that the
# other cells of this notebook read, so every cell points at one location.
input_data = ParameterString(
    name="InputDataUrl",
    default_value="s3://mlops-testing-bucket-kulsin/sample-data.csv",
)

# Number of instances for the processing job.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)

# Number of instances for the training job.
training_instance_count = ParameterInteger(
    name="TrainingInstanceCount",
    default_value=1,
)

In [4]:
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

# Processor that runs the preprocessing script in the scikit-learn container.
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type="ml.m5.large",
    # Driven by the ProcessingInstanceCount pipeline parameter rather than a
    # literal, so the declared parameter actually controls the job size.
    # NOTE: a Pipeline that includes this step should list
    # `processing_instance_count` (and `input_data`) in its `parameters`.
    instance_count=processing_instance_count,
    base_job_name="sklearn-preprocessing",  # prefix that helps identify the processing job
)

# Processing step: one input location in, "train" and "validation" outputs out.
step_process = ProcessingStep(
    name="SampleProcessing",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(
            source=input_data,
            destination="/opt/ml/processing/input",
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/output/train",
        ),
        ProcessingOutput(
            output_name="validation",
            source="/opt/ml/processing/output/validation",
        ),
    ],
    code="processing_script.py",  # This script should exist in your directory
)


In [7]:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.workflow.steps import TrainingStep
from sagemaker.inputs import TrainingInput
from sagemaker.session import Session

# Session used below to resolve the region and the default bucket.
sagemaker_session = Session()

# Execution role for the training job.
role = "arn:aws:iam::282698011778:role/service-role/AmazonSageMaker-ExecutionRole-20250701T225193"

# NOTE: `training_instance_count` is deliberately not reassigned in this cell.
# It is the ParameterInteger ("TrainingInstanceCount") declared with the other
# pipeline parameters; rebinding the name to a plain int would discard that
# parameter and leave one name meaning two different things.
# A Pipeline that includes this step should list it in its `parameters`.

# S3 path of the training data.
train_data_path = "s3://mlops-testing-bucket-kulsin/sample-data.csv"

# XGBoost estimator pinned to a specific container version.
xgboost_estimator = Estimator(
    image_uri=sagemaker.image_uris.retrieve(
        "xgboost",
        sagemaker_session.boto_region_name,
        version="1.7-1",
    ),
    role=role,
    instance_count=training_instance_count,
    instance_type="ml.m5.xlarge",
    volume_size=30,
    max_run=3600,
    input_mode="File",
    output_path=f"s3://{sagemaker_session.default_bucket()}/output",
    # Use the same session that resolved the region and bucket above.
    sagemaker_session=sagemaker_session,
    # The built-in XGBoost algorithm requires `num_round`; 100 is only a
    # starting value and should be tuned for the dataset.
    hyperparameters={"num_round": 100},
)

# Training step with a single "train" channel read from train_data_path.
step_train = TrainingStep(
    name="SampleTraining",
    estimator=xgboost_estimator,
    inputs={
        "train": TrainingInput(
            s3_data=train_data_path,
            content_type="csv",
        )
    },
)

In [None]:
import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.inputs import TrainingInput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.sklearn import SKLearn
from sagemaker.workflow.parameters import ParameterString, ParameterInteger
from sklearn.model_selection import train_test_split
import os

# Resolve both helper scripts against the current working directory
base_dir = os.getcwd()
processing_script_path, training_script_path = (
    os.path.join(base_dir, script_name)
    for script_name in ("processing_script.py", "train.py")
)

# IAM role for the jobs, and the SageMaker session the pipeline is bound to
role = "arn:aws:iam::282698011778:role/service-role/AmazonSageMaker-ExecutionRole-20250701T225193"
sagemaker_session = sagemaker.Session()

# Pipeline parameter holding the S3 URI of the input dataset
input_data = ParameterString(
    default_value="s3://mlops-testing-bucket-kulsin/sample-data.csv",
    name="InputData",
)

# Processor that runs the preprocessing script in the scikit-learn container
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    base_job_name='sklearn-processing'
)

# Processing step: reads the InputData location and emits "train" and "test" outputs
step_process = ProcessingStep(
    name="PreprocessData",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(
            # The ParameterString is passed as-is: it is already a
            # string-typed pipeline variable, so no conversion is needed.
            source=input_data,
            destination="/opt/ml/processing/input"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/train"
        ),
        ProcessingOutput(
            output_name="test",
            source="/opt/ml/processing/test"
        )
    ],
    code=processing_script_path
)

# scikit-learn estimator that runs train.py as its entry point
sklearn_estimator = SKLearn(
    role=role,
    entry_point=training_script_path,
    framework_version="0.23-1",
    instance_count=1,
    instance_type="ml.m5.large",
    base_job_name='sklearn-train',
)

# Training step: each input channel is fed from the matching output of the
# processing step (channel "training" <- output "train", "testing" <- "test").
step_train = TrainingStep(
    name="TrainModel",
    estimator=sklearn_estimator,
    inputs={
        channel: TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[output_name].S3Output.S3Uri
        )
        for channel, output_name in (("training", "train"), ("testing", "test"))
    },
)

# Assemble the processing and training steps into a pipeline whose only
# parameter is the input data location
pipeline = Pipeline(
    sagemaker_session=sagemaker_session,
    name="SampleDataAnalysisPipeline",
    steps=[step_process, step_train],
    parameters=[input_data],
)

# Register the pipeline definition under the execution role
pipeline.upsert(role_arn=role)

# Launch an execution and report its ARN
execution = pipeline.start()
print(f"Pipeline execution started with ARN: {execution.arn}")
