In [3]:
import pandas as pd
import numpy as np
import datetime
import random

import sagemaker
import sagemaker.session

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString
)

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.functions import Join
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.sklearn.estimator import SKLearn
# import sagemaker_containers

from sagemaker.workflow.pipeline import Pipeline

import os
from sklearn.model_selection import train_test_split
from time import gmtime, strftime, sleep
import boto3
import joblib

In [4]:
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = 'custom_preprocessing'

timestamp_suffix = strftime("%Y-%m-%d-%H%M%S", gmtime())
folder_name = prefix + '-' + timestamp_suffix
prefix_path = f's3://{bucket}/{folder_name}'

In [5]:
tags = [
    {"Key": "PLATFORM", "Value": "FO-ML"},
    {"Key": "BUSINESS_REGION", "Value": "GLOBAL"},
    {"Key": "BUSINESS_UNIT", "Value": "MOBILITY"},
    {"Key": "CLIENT", "Value": "MULTI_TENANT"}
   ]

## Create sample data

In [6]:
tf_vals = ['true', 'false', np.nan, '1', '0']
onehot_vals = [np.nan, 'purple', 'orange', 'purple', 'blue']

date_vals = []
for _ in range(4):
    date = datetime.date(2022, random.randint(1, 12), random.randint(1, 31))
    date_vals.append(date)
date_vals.append(np.nan)

float_vals = [3, 8.0, 2, np.nan, 4.0]
list_max_vals = ['3,0,9,4,2', np.nan, '0,2,3,9,8,4', '4', '5,4,3']
list_nunique_vals = ['apple,orange,grape', '0,9,8,3,4,3,3,4,9', np.nan, '4,4,4,4,4', 'pineapple']
descstat_vals = ['9,2,8,3,4', '1', '7,8,9,2,3,4', np.nan, '34']
multi_label_vals = ['apple,orange,grape', 'pineapple,grape,strawberry', np.nan, 'blueberry', 'grapefruit,apple']
drop_vals = [np.nan, 3, 6, 1, np.nan]
x_rand = list(range(5))

sample_df = pd.DataFrame({
    'true_false':tf_vals,
    'one_hot':onehot_vals,
    'dates':date_vals,
    'floats':float_vals,
    'max_of_list':list_max_vals,
    'nunique_of_list':list_nunique_vals,
    'desc_stats':descstat_vals,
    'multi_label':multi_label_vals,
    'random_col':drop_vals,
    'other':x_rand})
sample_df

Unnamed: 0,true_false,one_hot,dates,floats,max_of_list,nunique_of_list,desc_stats,multi_label,random_col,other
0,true,,2022-11-20,3.0,30942.0,"apple,orange,grape",92834.0,"apple,orange,grape",,0
1,false,purple,2022-02-03,8.0,,098343349,1.0,"pineapple,grape,strawberry",3.0,1
2,,orange,2022-01-12,2.0,23984.0,,789234.0,,6.0,2
3,1,purple,2022-03-02,,4.0,44444,,blueberry,1.0,3
4,0,blue,,4.0,543.0,pineapple,34.0,"grapefruit,apple",,4


In [7]:
sample_df.to_csv('sample.csv')
train_input = session.upload_data('sample.csv', bucket=bucket, key_prefix=folder_name)

## Train Preprocessor

In [8]:
script_path = "processor_script.py"
model_output_path = os.path.join('s3://', bucket, folder_name, "components")

sklearn_transformer = SKLearn(
    entry_point=script_path,
    role=role,
    output_path=model_output_path,
    instance_type="ml.m5.large",
    sagemaker_session=None,
    framework_version="1.0-1",
    py_version="py3",
    tags=tags,
    dependencies=['transformers.py']
)

In [9]:
sklearn_transformer.fit({"train": train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2023-02-27-15-35-25-466


2023-02-27 15:35:25 Starting - Starting the training job...
2023-02-27 15:35:43 Starting - Preparing the instances for training......
2023-02-27 15:36:32 Downloading - Downloading input data...
2023-02-27 15:37:03 Training - Downloading the training image...
2023-02-27 15:37:38 Training - Training image download completed. Training in progress.[34m2023-02-27 15:37:42,430 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-02-27 15:37:42,432 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-27 15:37:42,440 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-02-27 15:37:42,651 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-27 15:37:42,662 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-27 15:37:42,672 sagemaker-training-toolkit INFO     No GPUs dete

In [10]:
transformer_prefix = os.path.join(folder_name,
                                  "components",
                                  sklearn_transformer.latest_training_job.job_name,
                                  "output",
                                  "model.tar.gz")

session.download_data(path='./', bucket=bucket, key_prefix=transformer_prefix)

In [11]:
!tar xvzf model.tar.gz

preprocessor.joblib
feature_names.joblib


In [12]:
feature_list = list(joblib.load("feature_names.joblib"))
print(feature_list)

['true_false', 'one_hot_blue', 'one_hot_ml_empty', 'one_hot_orange', 'one_hot_purple', 'dates-month', 'dates-day_of_week', 'dates-hour', 'dates-day_of_month', 'dates-is_month_start', 'dates-is_month_end', 'floats', 'max_of_list', 'nunique_of_list', 'desc_stats-min', 'desc_stats-max', 'desc_stats-mean', 'desc_stats-std', 'desc_stats-nunique', 'multi_label_apple', 'multi_label_blueberry', 'multi_label_grape', 'multi_label_grapefruit', 'multi_label_ml_empty', 'multi_label_orange', 'multi_label_pineapple', 'multi_label_strawberry', 'Unnamed: 0', 'other']


In [13]:
joblib.load("preprocessor.joblib")



ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('drop_cols', 'drop', ['random_col']),
                                ('truefalse', TrueFalseTransformer(),
                                 ['true_false']),
                                ('onehot', OneHotTransformer(), ['one_hot']),
                                ('dates', DateTransformer(), ['dates']),
                                ('floats', FloatTransformer(), ['floats']),
                                ('listmax', ListMaxTransformer(),
                                 ['max_of_list']),
                                ('nunique', ListNuniqueTransformer(),
                                 ['nunique_of_list']),
                                ('descstats', DescStatTransformer(),
                                 ['desc_stats']),
                                ('multilabel', MultilabelTransformer(),
                           

# Not tested yet

In [14]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
transformer_output = os.path.join("s3://", bucket, folder_name, "Feature_selection_output/")
transformer = sklearn_transformer.transformer(
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=transformer_output,
    assemble_with="Line",
    accept="text/csv",
    tags=tags
)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2023-02-27-15-45-43-093


In [None]:
# Preprocess training input
transformer.transform(train_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_train = transformer.output_path

INFO:sagemaker:Creating transform job with name: sagemaker-scikit-learn-2023-02-27-15-45-44-604


............................[34m2023-02-27 15:50:18,135 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-27 15:50:18,139 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-27 15:50:18,140 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
  