In [1]:
# TODO: add logic to save the transformed data to the repo.
# TODO: clean up this notebook.

from time import gmtime, strftime

import boto3
import sagemaker
import joblib
import pandas as pd
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
from sagemaker.image_uris import retrieve

The CSV splits (X_train, X_val, ..., y_train, y_val, ...) must not contain header rows.

# Train featurizer locally

In [None]:
# Run featurizer.py in a local shell against data/split/X_train.csv, passing
# schema.yml as the raw schema, "rings" as the target column, and the current
# directory as the model directory.
! python3 featurizer.py --raw-schema-path "schema.yml" --target-col "rings" --X-train-path "data/split/" --X-train-file "X_train.csv" --model-dir ./

# Train featurizer on separate instance

In [2]:
# High-level SageMaker session; the region is read from its boto session.
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Role handed to every estimator and model created in this notebook.
role = sagemaker.get_execution_role()

# Low-level SageMaker client, used for describe_* calls.
sm_boto3 = boto3.client('sagemaker')

# S3 bucket that receives the schema, the data splits and the training output.
bucket = "kyle-abalone"

In [None]:
# Display the configured bucket name as a quick sanity check.
bucket

In [3]:
def _upload(local_path, key_prefix):
    """Upload ``local_path`` to ``bucket`` under ``key_prefix``.

    Returns whatever ``sess.upload_data`` returns for the uploaded file.
    """
    return sess.upload_data(path=local_path, bucket=bucket, key_prefix=key_prefix)


# The raw schema sits directly under the project prefix.
raw_schema_path = _upload('schema.yml', 'abalone')

# The four split files share one local directory and one S3 prefix; they are
# uploaded in the order listed.
X_train_path, y_train_path, X_test_path, y_test_path = (
    _upload('data/split/' + filename, 'abalone/data/split')
    for filename in ('X_train.csv', 'y_train.csv', 'X_test.csv', 'y_test.csv')
)

In [9]:
FRAMEWORK_VERSION = "0.23-1"
script_path = "featurizer.py"

# Estimator settings gathered in one mapping so they can be read at a glance.
# source_dir='./' makes the current directory the source bundle for the job.
featurizer_settings = dict(
    entry_point=script_path,
    source_dir='./',
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.c4.xlarge",
    role=role,
    sagemaker_session=sess,
)

featurizer = SKLearn(**featurizer_settings)

In [10]:
# Input channels for the featurizer job: the training features and the raw
# schema file, both uploaded to S3 earlier in this notebook.
featurizer_channels = {
    'train': X_train_path,
    'raw_schema_path': raw_schema_path,
}

# Block until the job finishes and stream its logs into the cell output.
featurizer.fit(featurizer_channels, wait=True, logs=True)

2021-06-24 22:03:48 Starting - Starting the training job...
2021-06-24 22:04:12 Starting - Launching requested ML instancesProfilerReport-1624572227: InProgress
.........
2021-06-24 22:05:33 Starting - Preparing the instances for training.........
2021-06-24 22:07:13 Downloading - Downloading input data
2021-06-24 22:07:13 Training - Downloading the training image...
2021-06-24 22:07:43 Uploading - Uploading generated training model[34m2021-06-24 22:07:38,819 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-06-24 22:07:38,822 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-06-24 22:07:38,832 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-06-24 22:07:39,301 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting pyyaml==5.4.1
  Down

# Build batch transformer to transform X_train and X_test using featurizer

In [None]:
# Load the locally saved featurizer artifact (presumably written by the local
# featurizer.py run with --model-dir ./ -- verify).
#
# It is bound to its own name on purpose: later cells use `featurizer` as the
# SKLearn estimator (`featurizer.latest_training_job`, `featurizer.transformer`),
# so rebinding that name here would break a top-to-bottom run.
#
# NOTE: joblib.load unpickles the file; only load artifacts you trust.
local_featurizer = joblib.load('model.joblib')

In [None]:
# The split CSVs are expected to have no header row (see the note near the top
# of this notebook), so header=None stops pandas from consuming the first data
# row as column names. TODO confirm the local copy really is header-less.
X_train = pd.read_csv('data/split/X_train.csv', header=None)
# TODO: run the locally loaded featurizer's transform() on X_train as a sanity check.

In [None]:
# Preview the first rows of the locally loaded training features.
X_train.head()

In [6]:
# Fetch the description of the featurizer's most recent training job.
featurizer_job_name = featurizer.latest_training_job.name
artifact = sm_boto3.describe_training_job(TrainingJobName=featurizer_job_name)

In [None]:
# Display the describe_training_job response for inspection.
artifact

In [11]:
# Batch-transform settings: one ml.m5.xlarge instance, CSV accepted back,
# output records assembled line by line.
transformer_options = {
    "instance_count": 1,
    "instance_type": "ml.m5.xlarge",
    "assemble_with": "Line",
    "accept": "text/csv",
}

transformer = featurizer.transformer(**transformer_options)

In [12]:
def _run_batch_transform(s3_input):
    """Run one CSV batch-transform job on ``s3_input`` and wait for it.

    Returns ``transformer.output_path`` as it stands once the job has finished.
    """
    transformer.transform(s3_input, content_type="text/csv")
    transformer.wait()
    return transformer.output_path


# Featurize the training split first, then the test split.
transformed_X_train = _run_batch_transform(X_train_path)
transformed_X_test = _run_batch_transform(X_test_path)

................................[34m2021-06-24 22:13:44,761 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-06-24 22:13:44,763 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2021-06-24 22:13:44,761 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2021-06-24 22:13:44,763 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-06-24 22:13:44,764 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_bod

In [21]:
# Display the S3 location recorded for the transformed training features.
# (A commented-out copy of the schema.yml upload used to sit here; that upload
# is already performed earlier in the notebook.)
transformed_X_train

's3://sagemaker-us-east-1-926521026587/sagemaker-scikit-learn-2021-06-24-22-08-40-316'

# Train linear learner on separate instance

In [14]:
# Look up the linear-learner container image URI for the session's region.
ll_image = retrieve(framework="linear-learner", region=region)

In [16]:
# Linear-learner training output goes to
# s3://<bucket>/ll_training_output/ll_model
s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = "s3://" + "/".join(
    (bucket, s3_ll_output_key_prefix, "ll_model")
)

In [22]:
# Training channel: the featurizer's batch-transform output, read as CSV and
# fully replicated to the training instance.
# NOTE(review): this prefix holds the transformed features only -- confirm the
# CSV carries the target column in the layout linear-learner expects.
ll_train_data = sagemaker.inputs.TrainingInput(
    s3_data=transformed_X_train,
    content_type="text/csv",
    s3_data_type="S3Prefix",
    distribution="FullyReplicated",
)
data_channels = {"train": ll_train_data}

# Generic estimator wrapping the linear-learner image: a single ml.m4.2xlarge
# instance, volume_size=20, max_run=3600, File input mode.
ll_estimator = sagemaker.estimator.Estimator(
    image_uri=ll_image,
    role=role,
    instance_count=1,
    instance_type="ml.m4.2xlarge",
    volume_size=20,
    max_run=3600,
    input_mode="File",
    output_path=s3_ll_output_location,
    sagemaker_session=sess,
)

# feature_dim is hard-coded; presumably the number of columns the featurizer
# emits -- verify against the transformed data.
ll_estimator.set_hyperparameters(
    feature_dim=9,
    predictor_type="regressor",
    mini_batch_size=32,
)

In [None]:
# Train the linear learner on the prepared channel; block until the job is
# done and stream its logs into the cell output.
ll_estimator.fit(
    inputs=data_channels,
    wait=True,
    logs=True,
)

2021-06-24 22:39:50 Starting - Starting the training job...
2021-06-24 22:40:00 Starting - Launching requested ML instancesProfilerReport-1624574390: InProgress
........

In [None]:
# UTC timestamp so each run produces a distinct pipeline model name.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Wrap each trained estimator as a model object for the pipeline.
featurizer_model = featurizer.create_model()
linear_learner_model = ll_estimator.create_model()

model_name = "inference-pipeline-" + timestamp_prefix

# Models are listed in inference order: featurizer first, then the linear
# learner. The list previously named `scikit_learn_inferencee_model`, which is
# never defined and raised NameError; `featurizer_model` is the model built above.
sm_model = PipelineModel(
    name=model_name, role=role, models=[featurizer_model, linear_learner_model]
)