# Code to Replicate Preprocessor as an Estimator Failure

In [2]:
import pandas as pd
import numpy as np
import datetime
import random

import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString
)

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.functions import Join
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.sklearn.estimator import SKLearn

from sagemaker.workflow.pipeline import Pipeline

import os
from sklearn.model_selection import train_test_split
from time import gmtime, strftime, sleep
import boto3
import joblib

In [3]:
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = 'custom_preprocessing'

timestamp_suffix = strftime("%Y-%m-%d-%H%M%S", gmtime())
folder_name = prefix + '-' + timestamp_suffix
prefix_path = f's3://{bucket}/{folder_name}'

## Create sample data

In [4]:
tf_vals = ['true', 'false', np.nan, '1', '0']
onehot_vals = [np.nan, 'purple', 'orange', 'purple', 'blue']

date_vals = []
for _ in range(4):
    date = datetime.date(2022, random.randint(1, 12), random.randint(1, 31))
    date_vals.append(date)
date_vals.append(np.nan)

float_vals = [3, 8.0, 2, np.nan, 4.0]
list_max_vals = ['3,0,9,4,2', np.nan, '0,2,3,9,8,4', '4', '5,4,3']
list_nunique_vals = ['apple,orange,grape', '0,9,8,3,4,3,3,4,9', np.nan, '4,4,4,4,4', 'pineapple']
descstat_vals = ['9,2,8,3,4', '1', '7,8,9,2,3,4', np.nan, '34']
multi_label_vals = ['apple,orange,grape', 'pineapple,grape,strawberry', np.nan, 'blueberry', 'grapefruit,apple']
drop_vals = [np.nan, 3, 6, 1, np.nan]
x_rand = list(range(5))

sample_df = pd.DataFrame({
    'true_false':tf_vals,
    'one_hot':onehot_vals,
    'dates':date_vals,
    'floats':float_vals,
    'max_of_list':list_max_vals,
    'nunique_of_list':list_nunique_vals,
    'desc_stats':descstat_vals,
    'multi_label':multi_label_vals,
    'random_col':drop_vals,
    'other':x_rand})
sample_df

Unnamed: 0,true_false,one_hot,dates,floats,max_of_list,nunique_of_list,desc_stats,multi_label,random_col,other
0,true,,2022-07-10,3.0,30942.0,"apple,orange,grape",92834.0,"apple,orange,grape",,0
1,false,purple,2022-05-07,8.0,,098343349,1.0,"pineapple,grape,strawberry",3.0,1
2,,orange,2022-10-01,2.0,23984.0,,789234.0,,6.0,2
3,1,purple,2022-10-12,,4.0,44444,,blueberry,1.0,3
4,0,blue,,4.0,543.0,pineapple,34.0,"grapefruit,apple",,4


In [7]:
sample_df.to_csv('data/sample.csv')
train_input = session.upload_data('data/sample.csv', bucket=bucket, key_prefix=folder_name)

## Write Preprocessing Script

In [8]:
%%writefile preprocess.py
import subprocess
import sys

def upgrade(package):
    subprocess.check_call([sys.executable, "-q", "-m", "pip", "install", package, '--upgrade'])
    
upgrade('pandas==1.3.5')
upgrade('numpy')
upgrade('pyarrow')

import numpy as np
import pandas as pd
import boto3
# import logging
import os
import warnings
import joblib
import argparse

env_parser = argparse.ArgumentParser()
env_parser.add_argument('--INPUT_FEATURES_SIZE', type=int, dest='INPUT_FEATURES_SIZE')
    
env_args = env_parser.parse_args()
INPUT_FEATURES_SIZE = env_args.INPUT_FEATURES_SIZE

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

warnings.simplefilter("once")

true_false = ['true_false']
one_hot = ['one_hot']
date_cols = ['dates']
float_cols = ['floats']
max_of_list = ['max_of_list']
count_unique = ['nunique_of_list']
desc_stat_cols = ['desc_stats']
list_to_labels = ['multi_label']
drop_cols = ['random_col']

class TrueFalseTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        print('Running TrueFalseTransformer')
        X.fillna('-1', inplace=True)
        X = X.replace({'true':'1', 'false':'0'})
        X = X.apply(pd.to_numeric, args=('coerce',))
        return X

    def get_feature_names(self):
        return self._col_names

class OneHotTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._filler = 'ml_empty'
        self._col_names = None
        self._encoder = None
        self._transformer = None
        self._transformed_feats = []

    def fit(self, X, y=None):
        self._col_names = X.dropna(axis=1, how='all').columns
        X = X[self._col_names].fillna(self._filler)
        self._encoder = OneHotEncoder(handle_unknown='ignore')
        self._transformer = self._encoder.fit(X)
        self._transformed_feats = self._transformer.get_feature_names_out()
        return self

    def transform(self, X, y=None):
        print('Running OneHotTransformer')
        X = self._transformer.transform(X[self._col_names])
        return X

    def get_feature_names(self):
        return self._transformed_feats

class DateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print('Running DateTransformer')
        temp_df = pd.DataFrame(index=X.index.copy())

        for col in X.columns:
            X[col] = pd.to_datetime(X[col])
            temp_df[f'{col}-month'] = X[col].dt.month.astype(float)
            temp_df[f'{col}-day_of_week'] = X[col].dt.dayofweek.astype(float)
            temp_df[f'{col}-hour'] = X[col].dt.hour.astype(float)
            temp_df[f'{col}-day_of_month'] = X[col].dt.day.astype(float)
            temp_df[f'{col}-is_month_start'] = X[col].dt.is_month_start.astype(int)
            temp_df[f'{col}-is_month_end'] = X[col].dt.is_month_end.astype(int)
        self._col_names = list(temp_df.columns)
        return temp_df

    def get_feature_names(self):
        return self._col_names

class FloatTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        print('Running FloatTransformer')
        for col in self._col_names:
            if X[col].dtype == 'string':
                X[col] = X[col].astype(float)
        return X

    def get_feature_names(self):
        return self._col_names

class ListMaxTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        print('Running ListMaxTransformer')
        temp_df = pd.DataFrame(index=X.index)
        for col in self._col_names:
            if X[col].dtype == 'string':
                X[col].fillna('-1', inplace=True)
                X[col] = X[col].str.split(pat=',').apply(set).apply(list)
            temp_series = X[col].explode()
            temp_series = temp_series.replace({'true':'1', 'false':'0'}).fillna('-1').apply(pd.to_numeric, args=('coerce',))
            temp_series = temp_series.groupby(temp_series.index).max()
            temp_df = temp_df.merge(temp_series, left_index=True, right_index=True, how='outer')
        temp_df = temp_df.fillna(0)
        return temp_df

    def get_feature_names(self):
        return self._col_names

class ListNuniqueTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        print('Running ListNuniqueTransformer')
        temp_df = pd.DataFrame(index=X.index)
        for col in self._col_names:
            if X[col].dtype == 'string':
                X[col] = X[col].dropna().str.split(pat=',').apply(set).apply(list)
            temp_series = X[col].explode()
            temp_series = temp_series.groupby(temp_series.index).nunique()
            temp_df = temp_df.merge(temp_series, left_index=True, right_index=True, how='outer')
        temp_df = temp_df.fillna(0)
        return temp_df

    def get_feature_names(self):
        return self._col_names

class DescStatTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print('Running DescStatTransformer')
        temp_df = pd.DataFrame(index=X.index)
        for col in X.columns:
            if X[col].dtype == 'string':
                X[col].fillna('-1', inplace=True)
                X[col] = X[col].str.split(pat=',').apply(set).apply(list)
            temp_series = X[col].explode()
            temp_series = temp_series.fillna('-1').apply(pd.to_numeric, args=('coerce',))
            temp_series = temp_series.groupby(temp_series.index).agg(['min', 'max', 'mean', 'std', 'nunique'])
            temp_series.columns = [f'{col}-{x}' for x in temp_series.columns]
            temp_df = temp_df.merge(temp_series, left_index=True, right_index=True, how='outer')
        temp_df = temp_df.fillna(0)
        self._col_names = list(temp_df.columns)
        return temp_df

    def get_feature_names(self):
        return self._col_names

class MultilabelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._filler = 'ml_empty'
        self._encoder = None
        self._transformer = None
        self._col_names = None

    def fit(self, X, y=None):
        X = X.fillna(self._filler).str.split(pat=',').apply(set).apply(list)
        self._encoder = MultiLabelBinarizer()
        self._encoder.fit(X)
        self._col_names = [X.name + '_' + x for x in self._encoder.classes_]
        return self

    def transform(self, X, y=None):
        print('Running MultilabelTransformer')
        X = X.fillna(self._filler).str.split(pat=',').apply(set).apply(list)
        trans_array = self._encoder.transform(X)
        df = pd.DataFrame(trans_array, columns=self._col_names, index=X.index)        
        return df

    def get_feature_names(self):
        return self._col_names
    
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    args = parser.parse_args()
    
    input_files = [os.path.join(args.train, file) for file in os.listdir(args.train)]
    if len(input_files) == 0:
        raise ValueError(('There are no files in {}.\n' +
                          'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                          'the data specification in S3 was incorrectly specified or the role specified\n' +
                          'does not have permission to access the data.').format(args.train, "train"))
    
    file_types = [x.split('.')[-1] for x in os.listdir(args.train)]
    type_list = list(set(file_types))
    if len(type_list) != 1:
           raise ValueError(('There are multiple file types or no files in {}.\n' +
                             'Please submit a single file or single file type.\n' +
                             'Accepted file types are csv, parquet, and json.').format(args.train))
    else:
        file_type = ''.join(type_list)
        if file_type == 'csv':
            raw_data = [ pd.read_csv(file) for file in input_files ]
        elif file_type == 'parquet':
            raw_data = [ pd.read_parquet(file) for file in input_files ]
        elif file_type == 'json':
            raw_data = [ pd.read_json(file) for file in input_files ]
        else:
            print('File type {} not accepted. Please use csv, parquet, or json'.format(file_type))
    
    train_data = pd.concat(raw_data)
    
    # print(train_data.head())
    
    preprocessor = ColumnTransformer([
        ('drop_cols', 'drop', drop_cols),
        ('truefalse', TrueFalseTransformer(), true_false),
        ('onehot', OneHotTransformer(), one_hot),
        ('dates', DateTransformer(), date_cols),
        ('floats', FloatTransformer(), float_cols),
        ('listmax', ListMaxTransformer(), max_of_list),
        ('nunique', ListNuniqueTransformer(), count_unique),
        ('descstats', DescStatTransformer(), desc_stat_cols),
        ('multilabel', MultilabelTransformer(), 'multi_label')],
        remainder='passthrough')

    print('Preprocessing data')
    preprocessor.fit(train_data)

    print('Saving preprocessor joblib')
    encoder_name = 'preprocessor.joblib'
    joblib.dump(preprocessor, os.path.join(args.model_dir, encoder_name))
    
    print('Defining and saving selected feature names')
    transform_col_list = drop_cols + true_false + one_hot + date_cols + float_cols + max_of_list + count_unique + desc_stat_cols + list_to_labels

    step_list = ['truefalse',
                 'onehot',
                 'dates',
                 'floats',
                 'listmax',
                 'nunique',
                 'descstats',
                 'multilabel']
    
    feature_names = []
    
    for step in step_list:
        print(step)
        item = preprocessor.named_transformers_[step].get_feature_names()
        if type(item) == list:
            feature_names = feature_names + item
        elif type(item) == str:
            feature_names = feature_names + [item]
        else:
            print('get_feature_names produced something other than a list or string')
            print(type(item))
            
    remainder_cols = list(train_data.drop(transform_col_list, axis=1).columns)
    feature_names = feature_names + remainder_cols
    
    joblib.dump(feature_names, os.path.join(args.model_dir, "feature_names.joblib"))

    print("Selected features are: {}".format(feature_names))
    
    
def input_fn(input_data, content_type):
    '''Parse input data payload
    
    Accepts csv, parquet, or json file types'''
    
    print('Running input function')
    
    if content_type == 'text/csv':
        df = pd.read_csv(StringIO(input_data))
        return df
    elif content_type == 'application/x-parquet':
        df = pd.read_parquet(input_data)
    elif content_type == 'application/json':
        df = pd.read_json(input_data)
    else:
        raise ValueError("{} not supported by script".format(content_type))
        
def output_fn(prediction, accept):
    '''Format prediction output.
    
    The default accept/content-type between containers for serial inference is JSON.
    We also want to set the ContentType or mimetype as the same value as accept so the next
    container can read the response payload correctly.
    '''
    
    print('Running output function')
    
    if accept == 'application/json':
        instances = []
        for row in prediction.tolist():
            instances.append({'features': row})
            
        json_output = {'instances': instances}
        
        return worker.Response(json.dumps(json_output), mimetype=accept)
    elif accept == 'text/csv':
        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)
    else:
        raise RuntimeException('{} accept type is not supported by this script')
        
def predict_fn(input_data, model):
    '''Preprocess input data
    
    The default predict_fn uses .predict(), but our model is a preprocessor
    so we want to use .transform().
    '''
    
    feat_names = joblib.load(os.path.join(model_dir, 'selected_feature_names.joblib'))
    INPUT_FEATURES_SIZE = len(feat_names)
    
    print('Running predict_function')
    
    print('Input data shape at predict_fn: {}'.format(input_data.shape))
    if input_data.shape[1] == INPUT_FEATURES_SIZE:
        features = model.transform(input_data)
        return features
    elif input_data.shape[1] == INPUT_FEATURES_SIZE + 1:
        # this assumes the target is the last column
        features = model.transform(input_data.iloc[:, :-1])
        # # This assumes the target is the first column
        # features = model.transform(input_data.iloc[:, 1:])
        return np.insert(features, 0, input_data[label_column], axis=1)
        # What format should this be in? csv, json?
        # Should I add the column names here?
    
def model_fn(model_dir):
    '''Deserialize fitted model'''
    
    print('Running model function')
    
    preprocessor = joblib.load(os.path.join(model_dir, 'model.joblib'))
    return preprocessor

Writing preprocess.py


## Train Preprocessor

In [11]:
script_path = "preprocess.py"
model_output_path = os.path.join('s3://', bucket, folder_name, "components")

sklearn_transformer = SKLearn(
    entry_point=script_path,
    role=role,
    output_path=model_output_path,
    instance_type="ml.m5.large",
    sagemaker_session=None,
    framework_version="1.0-1",
    py_version="py3",
    tags=tags
)

In [12]:
sklearn_transformer.fit({"train": train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2023-02-22-17-27-36-005


2023-02-22 17:27:36 Starting - Starting the training job...
2023-02-22 17:27:52 Starting - Preparing the instances for training......
2023-02-22 17:29:04 Downloading - Downloading input data
2023-02-22 17:29:04 Training - Downloading the training image.....[34m2023-02-22 17:29:49,303 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-02-22 17:29:49,307 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-22 17:29:49,315 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-02-22 17:29:49,533 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-22 17:29:49,545 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-22 17:29:49,556 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-22 17:29:49,565 sagemaker-training-to

In [14]:
transformer_prefix = os.path.join(folder_name,
                                  "components",
                                  sklearn_transformer.latest_training_job.job_name,
                                  "output",
                                  "model.tar.gz")

session.download_data(path='./', bucket=bucket, key_prefix=transformer_prefix)

In [15]:
!tar xvzf model.tar.gz

preprocessor.joblib
feature_names.joblib


In [16]:
feature_list = list(joblib.load("feature_names.joblib"))
print(feature_list)

['true_false', 'dates-month', 'dates-day_of_week', 'dates-hour', 'dates-day_of_month', 'dates-is_month_start', 'dates-is_month_end', 'floats', 'max_of_list', 'nunique_of_list', 'desc_stats-min', 'desc_stats-max', 'desc_stats-mean', 'desc_stats-std', 'desc_stats-nunique', 'multi_label_apple', 'multi_label_blueberry', 'multi_label_grape', 'multi_label_grapefruit', 'multi_label_ml_empty', 'multi_label_orange', 'multi_label_pineapple', 'multi_label_strawberry', 'Unnamed: 0', 'other']


In [17]:
joblib.load("preprocessor.joblib")

AttributeError: Can't get attribute 'TrueFalseTransformer' on <module '__main__'>