In [7]:
from sagemaker import image_uris
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

image_uri = image_uris.retrieve(framework="sklearn", region=region, version="1.2-1")
sagemaker_session = sagemaker.session.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()

In [6]:



# Set up the processing job
bucket = "sagemaker-ap-southeast-1-619071320705"
input_s3 = f"s3://{bucket}/input-data/raw.csv"
output_s3 = f"s3://{bucket}/output-data/cleaned.csv"
script_s3 = f"s3://{bucket}/pipeline-scripts/preprocess.py"

# Create an SKLearnProcessor
processor = SKLearnProcessor(
    framework_version="1.2-1",
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",  # Use a smaller instance if needed
    #image_uri=image_uri  # Use SageMaker's prebuilt image
)

# Create a ProcessingStep
processing_step = ProcessingStep(
    name="DataCleaningStep",
    processor=processor,
    inputs=[
        ProcessingInput(source=input_s3, destination="/opt/ml/processing/input"),
        ProcessingInput(source=script_s3, destination="/opt/ml/processing/code"),
    ],
    outputs=[
        ProcessingOutput(source="/opt/ml/processing/output", destination=output_s3)
    ],
    code="/opt/ml/processing/code/preprocess.py",
    arguments=[
        "--input", "/opt/ml/processing/input/raw.csv",
        "--output", "/opt/ml/processing/output/cleaned.csv"
    ]
)

TypeError: ProcessingStep.__init__() got an unexpected keyword argument 'arguments'

In [None]:
from sagemaker.workflow.pipeline import Pipeline

pipeline = Pipeline(
    name="SimpleDataCleaningPipeline",
    steps=[processing_step],
    sagemaker_session=sagemaker.Session()
)

# Execute the pipeline
pipeline.upsert(role_arn=role)
pipeline.start()