In [2]:
import sagemaker
import os
import pandas as pd

from sklearn.model_selection import train_test_split

In [3]:
# SageMaker session, region and execution role used by the cells further down.
session = sagemaker.session.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()
# bucket = session.default_bucket()
# Hard-coded project bucket; the commented line above is the session-default alternative.
bucket = 'asurion-ml-experimentation'
# Key prefix under which this notebook keeps its data, model artifacts and job outputs.
prefix = 'custom_preprocessing'

In [4]:
# Resource tags in the Key/Value list form expected by SageMaker `tags=` arguments.
_TAG_VALUES = {
    "PLATFORM": "FO-ML",
    "BUSINESS_REGION": "GLOBAL",
    "BUSINESS_UNIT": "MOBILITY",
    "CLIENT": "MULTI_TENANT",
}
tags = [{"Key": tag_key, "Value": tag_value} for tag_key, tag_value in _TAG_VALUES.items()]

In [5]:
# Load the full raw dataset. The location is derived from `bucket`/`prefix` so it follows
# the configuration cell instead of repeating the bucket and prefix as a literal.
raw_data_uri = "s3://{}/{}/data/1_full_data.csv".format(bucket, prefix)
df = pd.read_csv(raw_data_uri)
df.head()

Unnamed: 0,zip_agg_customer_subtype,zip_agg_number_of_houses,zip_agg_avg_size_household,zip_agg_avg_age,zip_agg_customer_main_type,zip_agg_roman_catholic,zip_agg_protestant,zip_agg_other_religion,zip_agg_no_religion,zip_agg_married,...,nbr_private_accident_ins_policies,nbr_family_accidents_ins_policies,nbr_disability_ins_policies,nbr_fire_policies,nbr_surfboard_policies,nbr_boat_policies,nbr_bicycle_policies,nbr_property_ins_policies,nbr_ss_ins_policies,nbr_mobile_home_policies
0,Lower class large families,1,3,2,Family with grown ups,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,0
1,Mixed small town dwellers,1,2,2,Family with grown ups,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,0
2,Mixed small town dwellers,1,2,2,Family with grown ups,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,0
3,"Modern, complete families",1,3,3,Average Family,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,0
4,Large family farms,1,4,2,Farmers,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,0


In [12]:
# Name of the label column (used below for stratification and as the Autopilot target).
label_col = 'nbr_mobile_home_policies'
# String-valued columns; the preprocessing script lists the same two for one-hot encoding.
cat_feats = ['zip_agg_customer_subtype', 'zip_agg_customer_main_type']
# INPUT_FEATURES_SIZE = 85
# Total number of columns in the raw frame; requires `df` from the load cell to exist.
nbr_cols = df.shape[1]

NameError: name 'df' is not defined

In [7]:
# Hold out 10% of the rows, stratified on the label; the fixed seed makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.1,
    random_state=12,
    stratify=df[label_col],
)

In [43]:
%%writefile custom_preprocess.py
import pandas as pd
import numpy as np

import time
import sys
from io import StringIO
import os
import shutil

import argparse
import csv
import joblib
import json

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sagemaker_containers.beta.framework import (
    content_types, encoders, env, modules, transformer, worker)

# Number of feature columns (label excluded) that predict_fn expects in a payload.
INPUT_FEATURES_SIZE = 85
# Name of the label column; predict_fn puts it first in the output when the payload carries it.
label_column = 'nbr_mobile_home_policies'
# Columns handed to the OneHotEncoder in the training block.
cat_feats = ['zip_agg_customer_subtype', 'zip_agg_customer_main_type']

if __name__ == '__main__':
    # Training entry point: fit the one-hot ColumnTransformer on every file in the
    # train channel, then save the fitted transformer and the output column names
    # into the model directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--label_col', type=str, default='nbr_mobile_home_policies')
    args = parser.parse_args()

    # `cat_feats` is the module-level list defined above; it is not redefined here.

    input_files = [ os.path.join(args.train, file) for file in os.listdir(args.train) ]
    if len(input_files) == 0:
        raise ValueError(('There are no files in {}.\n' +
                          'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                          'the data specification in S3 was incorrectly specified or the role specified\n' +
                          'does not have permission to access the data.').format(args.train, "train"))

    raw_data = [ pd.read_csv(file) for file in input_files ]
    concat_data = pd.concat(raw_data)

    if args.label_col in concat_data.columns:
        # Honour --label_col (previously parsed but ignored) so the split does not
        # depend on where the label sits in the file.
        train_y = concat_data[args.label_col]
        train_X = concat_data.drop(columns=[args.label_col])
    else:
        # Backward-compatible fallback: treat the last column as the label.
        train_y = concat_data.iloc[:, -1]
        train_X = concat_data.iloc[:, :-1]

    col_transformer = ColumnTransformer([
            ('encoder', OneHotEncoder(), cat_feats)],
        remainder='passthrough')

    col_transformer.fit(train_X, train_y)

    joblib.dump(col_transformer, os.path.join(args.model_dir, "model.joblib"))

    print("saved model!")

    # get_feature_names() is deprecated; get_feature_names_out(cat_feats) yields the same
    # "<column>_<category>" names directly, without rewriting "x0"/"x1" prefixes by hand.
    encoder = col_transformer.named_transformers_['encoder']
    one_hot_cols = [str(name) for name in encoder.get_feature_names_out(cat_feats)]

    # Output layout of the transformer: encoded columns first, then the passthrough
    # columns in their original order.
    passthrough_cols = list(train_X.drop(cat_feats, axis=1).columns)
    feature_names = one_hot_cols + passthrough_cols

    joblib.dump(feature_names, os.path.join(args.model_dir, "selected_feature_names.joblib"))

    print("Selected features are: {}".format(feature_names))
    
def input_fn(input_data, content_type):
    '''Parse the request payload into a pandas DataFrame.

    Args:
        input_data: Request body, as text or bytes.
        content_type: MIME type of the payload. Supported values are
            'text/csv', 'application/x-parquet' and 'application/json'.

    Returns:
        pandas.DataFrame built from the payload.

    Raises:
        ValueError: if content_type is not one of the supported types.
    '''
    from io import BytesIO

    print('Running input function')

    is_bytes = isinstance(input_data, (bytes, bytearray))

    if content_type == 'text/csv':
        buffer = BytesIO(input_data) if is_bytes else StringIO(input_data)
        return pd.read_csv(buffer)
    if content_type == 'application/x-parquet':
        # Parquet is binary; give pandas a file-like object when raw bytes arrive.
        source = BytesIO(input_data) if is_bytes else input_data
        return pd.read_parquet(source)
    if content_type == 'application/json':
        # Wrap in-memory payloads so they are parsed as content, not as a path.
        if is_bytes:
            source = BytesIO(input_data)
        elif isinstance(input_data, str):
            source = StringIO(input_data)
        else:
            source = input_data
        return pd.read_json(source)

    raise ValueError("{} not supported by script".format(content_type))
        
def output_fn(prediction, accept):
    '''Serialize the transformed features into the requested response format.

    The response mimetype is set to `accept` so the next container in a serial
    inference pipeline can read the payload correctly.

    Args:
        prediction: Array-like of transformed rows; must provide .tolist() for JSON.
        accept: Requested MIME type, 'application/json' or 'text/csv'.

    Returns:
        worker.Response carrying the encoded payload with mimetype == accept.

    Raises:
        RuntimeError: if accept is not a supported type.
    '''

    print('Running output function')

    if accept == 'application/json':
        instances = [{'features': row} for row in prediction.tolist()]
        json_output = {'instances': instances}

        return worker.Response(json.dumps(json_output), mimetype=accept)
    if accept == 'text/csv':
        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)

    # The original raised the non-existent `RuntimeException` (a NameError at runtime)
    # and never filled in the placeholder.
    raise RuntimeError('{} accept type is not supported by this script'.format(accept))
        
def predict_fn(input_data, model):
    '''Apply the fitted preprocessor to the parsed payload.

    The model is a preprocessor, so .transform() is called rather than .predict().

    Args:
        input_data: DataFrame with either INPUT_FEATURES_SIZE feature columns, or
            those columns plus the `label_column` column.
        model: Fitted transformer returned by model_fn.

    Returns:
        The transformed features. When the payload includes the label, the label
        values are inserted as the first output column.

    Raises:
        ValueError: if the payload has neither of the two expected column counts.
    '''

    print('Running predict_function')

    print('Input data shape at predict_fn: {}'.format(input_data.shape))
    n_columns = input_data.shape[1]

    if n_columns == INPUT_FEATURES_SIZE:
        return model.transform(input_data)

    if n_columns == INPUT_FEATURES_SIZE + 1:
        # Remove the label by name, matching how it is read back on the next line,
        # rather than assuming it is the last column.
        features = model.transform(input_data.drop(columns=[label_column]))
        return np.insert(features, 0, input_data[label_column], axis=1)

    # Previously this case fell through and returned None, which only failed later
    # in output_fn with an unrelated error.
    raise ValueError(
        'Expected {} or {} input columns but received {}'.format(
            INPUT_FEATURES_SIZE, INPUT_FEATURES_SIZE + 1, n_columns))
    
def model_fn(model_dir):
    '''Load the fitted preprocessor stored as model.joblib inside model_dir.'''

    print('Running model function')

    model_path = os.path.join(model_dir, 'model.joblib')
    return joblib.load(model_path)

Overwriting custom_preprocess.py


In [9]:
# Write the training split to disk and upload it to S3 for the estimator's "train" channel.
WORK_DIRECTORY = "data"
# to_csv does not create missing directories, so make sure the folder exists first.
os.makedirs(WORK_DIRECTORY, exist_ok=True)

train_path = "{}/{}".format(WORK_DIRECTORY, "train.csv")
train.to_csv(train_path, index=False)

train_input = session.upload_data(
    path=train_path,
    bucket=bucket,
    key_prefix="{}/{}".format(prefix, "training_data"),
)

In [44]:
from sagemaker.sklearn.estimator import SKLearn

script_path = "custom_preprocess.py"
# S3 URIs always use "/", so format the string instead of relying on os.path.join.
model_output_path = "s3://{}/{}/preprocessing_model/".format(bucket, prefix)

# Training job that runs custom_preprocess.py to fit and save the preprocessor.
sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    role=role,
    output_path=model_output_path,
    instance_type="ml.m5.large",
    # Reuse the notebook's session so the estimator and the upload/download calls agree.
    sagemaker_session=session,
    framework_version="1.0-1",
    py_version="py3",
    tags=tags,
)

sklearn_preprocessor.fit({"train": train_input})

2023-02-03 18:07:07 Starting - Starting the training job...
2023-02-03 18:07:34 Starting - Preparing the instances for trainingProfilerReport-1675447626: InProgress
......
2023-02-03 18:08:34 Downloading - Downloading input data...
2023-02-03 18:08:54 Training - Downloading the training image...
2023-02-03 18:09:34 Uploading - Uploading generated training model[34m2023-02-03 18:09:28,343 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-02-03 18:09:28,347 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-03 18:09:28,355 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-02-03 18:09:28,547 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-03 18:09:28,558 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-03 18:09:28,569 sagemaker-training-toolkit INFO

In [45]:
# S3 key of the training job's model archive. Keys are "/"-separated on every
# platform, so join explicitly instead of using os.path.join.
key_prefix = "/".join(
    [
        prefix,
        "preprocessing_model",
        sklearn_preprocessor.latest_training_job.job_name,
        "output",
        "model.tar.gz",
    ]
)
session.download_data(path="./", bucket=bucket, key_prefix=key_prefix)

In [46]:
# Unpack the downloaded archive into the working directory (model.joblib and selected_feature_names.joblib).
!tar xvzf model.tar.gz

selected_feature_names.joblib
model.joblib


In [47]:
import joblib

# Output column names saved by the preprocessing job: one-hot columns first, then passthrough columns.
# NOTE(review): joblib.load unpickles the file — only load artifacts produced by our own jobs.
feature_list = list(joblib.load("selected_feature_names.joblib"))
print(feature_list)

['zip_agg_customer_subtype_Affluent senior apartments', 'zip_agg_customer_subtype_Affluent young families', 'zip_agg_customer_subtype_Career and childcare', "zip_agg_customer_subtype_Couples with teens 'Married with children'", "zip_agg_customer_subtype_Dinki's (double income no kids)", 'zip_agg_customer_subtype_Etnically diverse', 'zip_agg_customer_subtype_Family starters', 'zip_agg_customer_subtype_Fresh masters in the city', 'zip_agg_customer_subtype_High Income, expensive child', 'zip_agg_customer_subtype_High status seniors', 'zip_agg_customer_subtype_Large family farms', 'zip_agg_customer_subtype_Large family, employed child', 'zip_agg_customer_subtype_Large religous families', 'zip_agg_customer_subtype_Low income catholics', 'zip_agg_customer_subtype_Lower class large families', 'zip_agg_customer_subtype_Middle class families', 'zip_agg_customer_subtype_Mixed apartment dwellers', 'zip_agg_customer_subtype_Mixed rurals', 'zip_agg_customer_subtype_Mixed seniors', 'zip_agg_customer

In [17]:
# First five training rows, kept as a small sample for trial runs.
trial = train.head(5)

Unnamed: 0,zip_agg_customer_subtype,zip_agg_number_of_houses,zip_agg_avg_size_household,zip_agg_avg_age,zip_agg_customer_main_type,zip_agg_roman_catholic,zip_agg_protestant,zip_agg_other_religion,zip_agg_no_religion,zip_agg_married,...,nbr_private_accident_ins_policies,nbr_family_accidents_ins_policies,nbr_disability_ins_policies,nbr_fire_policies,nbr_surfboard_policies,nbr_boat_policies,nbr_bicycle_policies,nbr_property_ins_policies,nbr_ss_ins_policies,nbr_mobile_home_policies
7038,"Young, low educated",1,2,2,Living well,1,0,2,6,5,...,0,0,0,0,0,0,0,0,0,0
1530,Religious elderly singles,1,2,5,Retired and Religeous,2,5,2,1,3,...,0,0,0,0,0,0,0,0,0,0
9427,Family starters,1,3,3,Average Family,1,4,1,4,6,...,0,0,0,0,0,0,0,0,0,0
7843,Affluent young families,1,3,2,Average Family,0,4,0,5,9,...,0,0,0,1,0,0,0,0,0,0
2156,Mixed small town dwellers,1,3,3,Family with grown ups,0,7,1,1,7,...,0,0,0,1,0,0,0,0,0,0


In [18]:
# trial = train.iloc[:5,:]

# trial.to_csv("data/trial.csv", index=False)

# WORK_DIRECTORY = "data"

# trial_input = session.upload_data(
#     path="{}/{}".format(WORK_DIRECTORY, "trial.csv"),
#     bucket=bucket,
#     key_prefix="{}/{}".format(prefix, "trial_data"),
# )

In [48]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
# S3 URIs always use "/", so format the string instead of relying on os.path.join.
transformer_output = "s3://{}/{}/Feature_selection_output/".format(bucket, prefix)
transformer = sklearn_preprocessor.transformer(
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=transformer_output,
    assemble_with="Line",
    accept="text/csv",
)

In [49]:
# # Preprocess trial input
# transformer.transform(trial_input, content_type="text/csv")
# print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
# transformer.wait()
# preprocessed_train = transformer.output_path

...........................
[34m2023-02-03 18:14:41,061 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-03 18:14:41,064 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-03 18:14:41,065 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
  

In [50]:
# Preprocess training input
transformer.transform(train_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_train = transformer.output_path

............................[34m2023-02-03 18:47:12,375 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-03 18:47:12,378 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-02-03 18:47:12,379 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
  

## Autopilot portion

In [9]:
import joblib

# Same load as in the preprocessing section, repeated so this section can run on its own
# once selected_feature_names.joblib has been extracted locally.
feature_list = list(joblib.load("selected_feature_names.joblib"))
print(feature_list)

['zip_agg_customer_subtype_Affluent senior apartments', 'zip_agg_customer_subtype_Affluent young families', 'zip_agg_customer_subtype_Career and childcare', "zip_agg_customer_subtype_Couples with teens 'Married with children'", "zip_agg_customer_subtype_Dinki's (double income no kids)", 'zip_agg_customer_subtype_Etnically diverse', 'zip_agg_customer_subtype_Family starters', 'zip_agg_customer_subtype_Fresh masters in the city', 'zip_agg_customer_subtype_High Income, expensive child', 'zip_agg_customer_subtype_High status seniors', 'zip_agg_customer_subtype_Large family farms', 'zip_agg_customer_subtype_Large family, employed child', 'zip_agg_customer_subtype_Large religous families', 'zip_agg_customer_subtype_Low income catholics', 'zip_agg_customer_subtype_Lower class large families', 'zip_agg_customer_subtype_Middle class families', 'zip_agg_customer_subtype_Mixed apartment dwellers', 'zip_agg_customer_subtype_Mixed rurals', 'zip_agg_customer_subtype_Mixed seniors', 'zip_agg_customer

In [10]:
# Number of output feature columns (the label is not part of this list).
len(feature_list)

132

In [7]:
# S3 key prefix of the transform output. Keys are "/"-separated on every platform,
# so build the prefix with string formatting instead of os.path.join.
output_prefix = "{}/{}".format(prefix, "Feature_selection_output/")

# Download everything under the prefix, then read the transformed training file.
# The output carries no header row, hence header=None.
session.download_data(path="./", bucket=bucket, key_prefix=output_prefix)
df_new = pd.read_csv("train.csv.out", header=None)

In [13]:
label_col = 'nbr_mobile_home_policies'

# The transform output was read without a header. For labelled input the preprocessing
# script's predict_fn inserts the label as the first column, so the label name comes first.
df_new.columns = [label_col] + feature_list
df_new.columns

Index(['nbr_mobile_home_policies',
       'zip_agg_customer_subtype_Affluent senior apartments',
       'zip_agg_customer_subtype_Affluent young families',
       'zip_agg_customer_subtype_Career and childcare',
       'zip_agg_customer_subtype_Couples with teens 'Married with children'',
       'zip_agg_customer_subtype_Dinki's (double income no kids)',
       'zip_agg_customer_subtype_Etnically diverse',
       'zip_agg_customer_subtype_Family starters',
       'zip_agg_customer_subtype_Fresh masters in the city',
       'zip_agg_customer_subtype_High Income, expensive child',
       ...
       'nbr_life_ins', 'nbr_private_accident_ins_policies',
       'nbr_family_accidents_ins_policies', 'nbr_disability_ins_policies',
       'nbr_fire_policies', 'nbr_surfboard_policies', 'nbr_boat_policies',
       'nbr_bicycle_policies', 'nbr_property_ins_policies',
       'nbr_ss_ins_policies'],
      dtype='object', length=133)

In [14]:
# Preview the relabelled, one-hot encoded training frame.
df_new.head()

Unnamed: 0,nbr_mobile_home_policies,zip_agg_customer_subtype_Affluent senior apartments,zip_agg_customer_subtype_Affluent young families,zip_agg_customer_subtype_Career and childcare,zip_agg_customer_subtype_Couples with teens 'Married with children',zip_agg_customer_subtype_Dinki's (double income no kids),zip_agg_customer_subtype_Etnically diverse,zip_agg_customer_subtype_Family starters,zip_agg_customer_subtype_Fresh masters in the city,"zip_agg_customer_subtype_High Income, expensive child",...,nbr_life_ins,nbr_private_accident_ins_policies,nbr_family_accidents_ins_policies,nbr_disability_ins_policies,nbr_fire_policies,nbr_surfboard_policies,nbr_boat_policies,nbr_bicycle_policies,nbr_property_ins_policies,nbr_ss_ins_policies
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Write the preprocessed frame (with header) to disk and upload it as the Autopilot input.
WORK_DIRECTORY = "data"
# to_csv does not create missing directories, so make sure the folder exists first.
os.makedirs(WORK_DIRECTORY, exist_ok=True)

# One path for both the write and the upload, so the two cannot drift apart.
train_new_path = "{}/{}".format(WORK_DIRECTORY, "train_new.csv")
df_new.to_csv(train_new_path, index=False)

train_new_input = session.upload_data(
    path=train_new_path,
    bucket=bucket,
    key_prefix="{}/{}".format(prefix, "training_data_new"),
)

In [19]:
# Input: the S3 prefix holding the re-uploaded preprocessed CSV, with `label_col` as target.
training_data_uri = "s3://{}/{}/training_data_new".format(bucket, prefix)
s3_data_source = {"S3DataType": "S3Prefix", "S3Uri": training_data_uri}
input_data_config = [
    {"DataSource": {"S3DataSource": s3_data_source}, "TargetAttributeName": label_col}
]

# Output: where the Autopilot job writes its results.
autopilot_output_uri = "s3://{}/{}/autopilot_job_output".format(bucket, prefix)
output_data_config = {"S3OutputPath": autopilot_output_uri}

# MaxCandidates is set to 50 for a shorter run time; adjust both limits for your use case.
completion_criteria = {"MaxCandidates": 50, "MaxAutoMLJobRuntimeInSeconds": 1800}
AutoML_Job_Config = {"CompletionCriteria": completion_criteria}

In [None]:
from time import gmtime, strftime, sleep
import boto3

# One boto3 session serves both the region lookup and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name

sm = boto_session.client(service_name="sagemaker", region_name=region)
# The day-hour-minute-second suffix keeps job names distinct between runs.
timestamp_suffix = strftime("%d-%H-%M-%S", gmtime())

auto_ml_job_name = "automl-blog" + timestamp_suffix
print("AutoMLJobName: " + auto_ml_job_name)

sm.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=AutoML_Job_Config,
    RoleArn=role,
    # Apply the same resource tags that the preprocessing training job carries.
    Tags=tags,
)

AutoMLJobName: automl-blog06-17-45-52
