In [None]:
# Initialize SageMaker session and role
# One Session object is created here; later cells read its region and
# default bucket and hand it to the Pipeline.
sagemaker_session = sagemaker.session.Session()
# Pass the session explicitly so the role lookup reuses it instead of
# building a second, implicit default session behind the scenes.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

In [None]:
# Pipeline parameters
# Each object pairs a parameter name with the default used when an
# execution does not supply its own value.

# S3 location of the CSV fed to the processing step.
input_data = ParameterString(
    name="InputDataUrl",
    default_value="s3://mlops-testing-bucket-kulsin/sample-data.csv",
)

# Number of instances for the processing job.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)

# Number of instances for the training job.
training_instance_count = ParameterInteger(
    name="TrainingInstanceCount",
    default_value=1,
)

In [None]:
# Define sklearn processor
# The processor is bound to the notebook's shared session (the same one the
# Pipeline receives) rather than being left to create its own default session.
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type="ml.m5.large",
    instance_count=processing_instance_count,  # pipeline parameter, resolved per execution
    sagemaker_session=sagemaker_session,
)

# Define processing step
# Input: the object at `input_data` is made available under
# /opt/ml/processing/input inside the container.
# Outputs: two named outputs ("train", "validation") collected from the
# container paths below; the training step looks them up by these names.
step_process = ProcessingStep(
    name="SampleProcessing",
    processor=sklearn_processor,
    inputs=[
        sagemaker.processing.ProcessingInput(
            source=input_data,
            destination="/opt/ml/processing/input",
        )
    ],
    outputs=[
        sagemaker.processing.ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/output/train",
        ),
        sagemaker.processing.ProcessingOutput(
            output_name="validation",
            source="/opt/ml/processing/output/validation",
        ),
    ],
    # This script should exist in your directory.
    # NOTE(review): presumably it writes its splits to the two output paths
    # declared above -- confirm against processing_script.py.
    code="processing_script.py",
)


In [None]:
# Define XGBoost estimator
# The image region and the output bucket are both derived from
# `sagemaker_session`, so the estimator is bound to that same session instead
# of being left to create a separate default one.
xgboost_estimator = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", sagemaker_session.boto_region_name, version="1.2-1"),
    role=role,
    instance_count=training_instance_count,  # pipeline parameter, resolved per execution
    instance_type="ml.m5.large",
    output_path=f"s3://{sagemaker_session.default_bucket()}/output",
    sagemaker_session=sagemaker_session,
    hyperparameters={
        "objective": "binary:logistic",
        "num_round": 100,
        "max_depth": 5,
        "eta": 0.2,
        "eval_metric": "error",
    },
)

# Define training step
# Both channels are wired to the S3 URIs of the processing step's "train" and
# "validation" outputs, which also makes this step depend on `step_process`.
step_train = TrainingStep(
    name="SampleTraining",
    estimator=xgboost_estimator,
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="csv",
        ),
        "validation": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["validation"].S3Output.S3Uri,
            content_type="csv",
        ),
    },
)

In [None]:
# Create the pipeline
# Registers the three parameters and the two steps under one pipeline
# definition, using the notebook's shared session.
pipeline = Pipeline(
    name="SampleDataAnalysisPipeline",
    parameters=[input_data, processing_instance_count, training_instance_count],
    steps=[step_process, step_train],
    sagemaker_session=sagemaker_session,
)

# Create or update pipeline
pipeline.upsert(role_arn=role)

# Execute pipeline (you can remove this from notebook if running only definition from CI/CD)
# The parameter name and value are read from the `input_data` parameter object
# rather than repeated as literals, so this call cannot drift out of sync with
# the declaration in the parameters cell.
execution = pipeline.start(parameters={input_data.name: input_data.default_value})
print(f"Pipeline execution ARN: {execution.arn}")