### Libraries

In [1]:
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


### Build a pipeline step with Processor, Input, Output

In [1]:
def create_pre_processing_step(
    s3_input_file: str = 's3://ktzouvan-trading-point-sagemaker-poc/datasets/marketing/bank-additional-full.csv',
    s3_output_path: str = 's3://ktzouvan-trading-point-sagemaker-poc/datasets/marketing/',
    instance_type: str = "ml.m5.large",
    instance_count: int = 1,
):
    """Build the "pre-processing" SageMaker pipeline step.

    The step runs ``b1_pre_processing.py`` on an ``SKLearnProcessor``. The
    script receives ``--input_path``, ``--input_file`` and ``--output_path``
    arguments that point at the container-side locations configured here.

    Called with no arguments, the step is configured exactly as before the
    parameters were introduced.

    Args:
        s3_input_file: S3 URI of the single dataset object to make available
            to the job. Its last path component is forwarded to the script as
            ``--input_file``, so the two can never drift apart.
        s3_output_path: S3 URI prefix that receives whatever the script
            writes to the container output directory.
        instance_type: Instance type for the processing job.
        instance_count: Number of instances for the processing job.

    Returns:
        The configured ``ProcessingStep`` named ``"pre-processing"``.

    Raises:
        ValueError: If ``s3_input_file`` ends with ``/`` and therefore does
            not name an object.
    """
    # Container-side directories, shared by the input/output definitions and
    # by the arguments passed to the script.
    container_input_dir = "/opt/ml/processing/input"
    container_output_dir = "/opt/ml/processing/output"

    # Derive the file name from the URI instead of repeating it as a literal.
    input_file_name = s3_input_file.rsplit('/', 1)[-1]
    if not input_file_name:
        raise ValueError(
            f"s3_input_file must point to an object, not a prefix: {s3_input_file!r}"
        )

    # inputs
    # NOTE(review): with "ShardedByS3Key" and instance_count > 1, a single
    # S3 object would presumably reach only one instance - confirm before
    # scaling out.
    dataset_processing_input = ProcessingInput(
        source=s3_input_file,
        destination=container_input_dir,
        s3_input_mode="File",
        s3_data_distribution_type="ShardedByS3Key",
    )

    # outputs
    splits_processing_output = ProcessingOutput(
        source=container_output_dir,
        destination=s3_output_path,
    )

    # processor
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        role=get_execution_role(),
        instance_type=instance_type,
        instance_count=instance_count,
        base_job_name='sm-tp-poc-pre-process',
    )

    pre_processing_step = ProcessingStep(
        name="pre-processing",
        processor=sklearn_processor,
        inputs=[dataset_processing_input],
        outputs=[splits_processing_output],
        job_arguments=[
            '--input_path', container_input_dir,
            '--input_file', input_file_name,
            '--output_path', container_output_dir,
        ],
        code="b1_pre_processing.py",
    )

    return pre_processing_step

### Step definition

In [3]:
# Uncomment to build the step and show it as the cell output for inspection.
#pre_processing_step = create_pre_processing_step()
#pre_processing_step