In [5]:
%%writefile preprocess.py
# import subprocess
# import sys

# def install(package):
#     subprocess.check_call([sys.executable, "-q", "-m", "pip", "install", package])
# def upgrade(package):
#     subprocess.check_call([sys.executable, "-q", "-m", "pip", "install", package, '--upgrade'])
    
# upgrade('pandas==1.3.5')
# upgrade('numpy')
# upgrade('pyarrow')
# install('category_encoders')

import numpy as np
import pandas as pd
import os
import warnings
import joblib
import argparse
from sklearn.preprocessing import OneHotEncoder

# import category_encoders as ce

# Categorical columns that get one-hot encoded; every other column passes through unchanged.
cat_cols = ['zip_agg Customer Subtype', 'zip_agg Customer main type']


def split_dataset(df, train_end_frac=0.7, validate_end_frac=0.9, seed=1420):
    """Shuffle ``df`` and cut it into train / validation / test frames.

    Args:
        df: Full dataset as a DataFrame.
        train_end_frac: Cumulative fraction of rows at which the train split ends.
        validate_end_frac: Cumulative fraction of rows at which the validation
            split ends; the remaining rows form the test split.
        seed: ``random_state`` passed to ``DataFrame.sample`` so the shuffle is
            reproducible.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames. Each frame keeps the
        row labels of ``df``.
    """
    shuffled = df.sample(frac=1, random_state=seed)
    train_end = int(train_end_frac * len(df))
    validate_end = int(validate_end_frac * len(df))
    return (
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:validate_end],
        shuffled.iloc[validate_end:],
    )


def one_hot_encode(encoder, frame, columns):
    """Replace ``columns`` of ``frame`` with their one-hot encoding.

    Args:
        encoder: An already fitted encoder exposing ``transform`` (returning a
            sparse matrix with ``toarray``) and ``get_feature_names_out`` or the
            older ``get_feature_names``.
        frame: DataFrame containing ``columns``.
        columns: Names of the categorical columns to encode.

    Returns:
        A new DataFrame with the encoded columns first, followed by the
        remaining columns of ``frame``, carrying the same index as ``frame``.
    """
    encoded = encoder.transform(frame[columns]).toarray()

    # Older scikit-learn releases only provide get_feature_names; newer ones
    # only provide get_feature_names_out. Both yield "<column>_<category>".
    if hasattr(encoder, 'get_feature_names_out'):
        feature_names = list(encoder.get_feature_names_out(columns))
    else:
        feature_names = list(encoder.get_feature_names(columns))

    # Reuse the frame's own index: the split frames are shuffled, so a fresh
    # 0..n-1 index would pair encoded rows with the wrong feature rows (and an
    # index join would silently drop rows whose label is out of range).
    encoded_df = pd.DataFrame(encoded, columns=feature_names, index=frame.index)
    return pd.concat([encoded_df, frame.drop(columns=columns)], axis=1)


if __name__ == '__main__':
    input_path = '/opt/ml/processing/input'
    output_path = '/opt/ml/processing/output'

    # exist_ok tolerates pre-created directories while still surfacing real
    # failures (e.g. permission errors) instead of swallowing every exception.
    for subdir in ('train', 'validate', 'test', 'encoder'):
        os.makedirs(os.path.join(output_path, subdir), exist_ok=True)

    print("Reading data")
    df = pd.read_csv(os.path.join(input_path, 'full_data.csv'))

    print('splitting data')
    train_data, validation_data, test_data = split_dataset(df)

    print('Preprocessing data')
    # Fit on the training split only; categories that appear solely in the
    # validation/test splits are encoded as all-zero rows instead of raising.
    sk_encoder = OneHotEncoder(handle_unknown='ignore')
    sk_encoder.fit(train_data[cat_cols])

    # All three splits go through the same transform so they share one schema.
    train_data = one_hot_encode(sk_encoder, train_data, cat_cols)
    validation_data = one_hot_encode(sk_encoder, validation_data, cat_cols)
    test_data = one_hot_encode(sk_encoder, test_data, cat_cols)

    # ce_encoder = ce.OneHotEncoder(cols=cat_cols, use_cat_names=True, handle_missing='return_nan')
    # trnsfrmd_df = ce_encoder.fit_transform(df)

    print('Saving dataframe')
    train_data.to_parquet(os.path.join(output_path, 'train', 'train_feats.parquet'))
    test_data.to_parquet(os.path.join(output_path, 'test', 'test_feats.parquet'))
    validation_data.to_parquet(os.path.join(output_path, 'validate', 'validate_feats.parquet'))

    print('Saving preprocessor joblib')
    encoder_name = 'preprocessor.joblib'
    joblib.dump(sk_encoder, os.path.join(output_path, 'encoder', encoder_name))

Overwriting preprocess.py


In [6]:
import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString)

from sagemaker.workflow.functions import Join
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.pipeline import Pipeline

# --- SageMaker session and execution context ---
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()

# --- Storage layout and naming ---
bucket = session.default_bucket()
prefix = "1_ins_dataset"
pipeline_name = "InsExample"  # SageMaker Pipeline name
# NOTE(review): this name contains spaces; confirm the model registry accepts
# it before it is used to create a model package group.
model_package_group_name = "Insurance Co Example"  # Model name in model registry
framework_version = "0.23-1"  # passed to SKLearnProcessor below

# --- Pipeline parameters (overridable per execution) ---
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.t3.medium",
)

# S3 location of the cleaned CSV that the processing step takes as input.
input_uri = Join(
    on="/",
    values=[f"s3://{bucket}", prefix, "clean", "full_data.csv"],
)

# Tags attached to the pipeline when it is upserted.
tags = [
    {"Key": key, "Value": value}
    for key, value in (("DATASET", "InsCOIL"), ("SOURCE", "UCI"))
]

# Processor that runs the preprocessing script in the scikit-learn container.
sklearn_processor = SKLearnProcessor(
    base_job_name="ins-example-job",
    framework_version=framework_version,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    role=role,
)

# Processing step: runs preprocess.py on the input CSV and uploads each
# container output directory to s3://<bucket>/<prefix>/final/<channel>.
preprocess_step = ProcessingStep(
    name="preprocess",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(source=input_uri, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(
            output_name=channel,
            source=f"/opt/ml/processing/output/{channel}",
            destination=Join(
                on="/",
                values=[f"s3://{bucket}", prefix, "final", channel],
            ),
        )
        # Same channels, in the same order, as the directories preprocess.py writes.
        for channel in ("train", "test", "validate", "encoder")
    ],
    code="preprocess.py",
)

# Assemble the pipeline from its single preprocessing step and expose the
# instance type/count as execution-time parameters.
pipeline = Pipeline(
    name=pipeline_name,
    steps=[preprocess_step],
    parameters=[processing_instance_type, processing_instance_count],
)

# Create the pipeline definition, or update it if it already exists.
pipeline.upsert(role_arn=role, tags=tags)

# Start an execution; the returned execution handle is the cell's output.
pipeline.start(execution_display_name="InsClean-preprocess2")

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it.


_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:707031497630:pipeline/insexample/execution/1donc66rpim8', sagemaker_session=<sagemaker.session.Session object at 0x7f051da70b10>)