# SKLearn Custom Model

In this example we use a different kind of container. It is not a pre-defined model; instead, it is a container with a specific ML library installed that allows you to execute a custom script.

## Step One - Create a Training Script

We need to define a model that can run on Sagemaker training hardware.

Detailed guidance is available here: https://sagemaker.readthedocs.io/en/stable/using_sklearn.html#preparing-the-scikit-learn-training-script

In this example we use SKLearn pipelines to include some data processing before training the model.
Key Points

* We write this script into the src directory
* This directory already contains our Custom Classes
* We use the standard import statement to include them:

    import UnknownCategoryFlagger as ucf


In [8]:
%%writefile src/sklearn_pipeline_training_script.py

import argparse
from sklearn.externals import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from sklearn.compose import ColumnTransformer
#, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer, StandardScaler, OneHotEncoder

import UnknownCategoryFlagger as ucf
import DataFrameCoercer as dfc

# inference functions ---------------
def model_fn(model_dir):
    """Load and return the fitted model stored in *model_dir*.

    Reads the ``model.joblib`` file located directly inside ``model_dir``
    (the same file name the training section of this script writes) and
    returns whatever object ``joblib.load`` deserializes from it.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)


if __name__ == '__main__':
    # Training entry point: parse the command-line arguments, fit a
    # preprocessing + RandomForest pipeline on the training CSV, print the
    # validation AUC and save the fitted pipeline as model.joblib.

    #------------------------------- parsing input parameters (from command line)
    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # RandomForest hyperparameters
    parser.add_argument('--n_estimators', type=int, default=150)
    parser.add_argument('--min_samples_leaf', type=int, default=20)
    parser.add_argument('--max_depth', type=int, default=9)

    # Data, model, and output directories (defaults are read from the SM_* environment variables)
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train_dir', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test_dir', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train_file', type=str, default='train.csv')
    parser.add_argument('--test_file', type=str, default='validation.csv')
    # Whitespace-separated feature names; empty means "every column except the target"
    parser.add_argument('--features', type=str, default='')
    # The target column has no usable default, so it is mandatory: without it the
    # script would only fail later with an unrelated-looking ValueError/KeyError.
    parser.add_argument('--target_variable', type=str, required=True)

    # parse_known_args: arguments this script does not declare are ignored
    args, _ = parser.parse_known_args()

    # The directory defaults are None when the matching environment variable is
    # not set; report that up front instead of crashing in os.path.join
    # (for model_dir that crash would only happen after training finished).
    for dir_option in ('model_dir', 'train_dir', 'test_dir'):
        if getattr(args, dir_option) is None:
            parser.error('--' + dir_option + ' was not given and no default is'
                         ' available from the environment')

    #------------------------------- data preparation
    print('reading data')
    train_df = pd.read_csv(os.path.join(args.train_dir, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test_dir, args.test_file))

    features = args.features.split()
    if not features:
        # No explicit feature list: use all training columns but the target
        features = list(train_df.columns)
        features.remove(args.target_variable)

    print('building training and testing datasets')
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[args.target_variable]
    y_test = test_df[args.target_variable]

    # Column groups are decided from the dtypes pandas inferred for the training data
    numeric_cols = list(X_train.select_dtypes(include="number").columns)
    categorical_cols = list(X_train.select_dtypes(exclude="number").columns)

    #------------------------------- setup the preprocessing
    print('preprocesser setup')

    coercer = Pipeline([
        ("coerce", dfc.DataFrameCoercer())
    ])

    # NOTE(review): unknown_gen is constructed but never added to `model`
    # below - confirm whether it was meant to be a pipeline step.
    unknown_gen = Pipeline([
        ("unknown", ucf.UnknownCategoryFlagger())
    ])

    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler()
    )

    categorical_transformer = make_pipeline(
        SimpleImputer(strategy='constant', fill_value='missing'),
        OneHotEncoder(handle_unknown='ignore')
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_cols),
            ("cat", categorical_transformer, categorical_cols)
        ]
    )

    #------------------------------- model training
    print('training model')
    rfcl = RandomForestClassifier(
        n_estimators=args.n_estimators,
        min_samples_leaf=args.min_samples_leaf,
        max_depth=args.max_depth,
        n_jobs=-1)

    model = Pipeline(steps=[
        ('coerce', coercer),
        ('preprocessor', preprocessor),
        ('rf', rfcl)
    ])

    model.fit(X_train, y_train)

    #-------------------------------  model testing
    print('testing model')

    test_preds = model.predict_proba(X_test)
    # Column 1 of predict_proba is scored - assumes a binary target; TODO confirm
    roc_auc = roc_auc_score(y_test, test_preds[:, 1])
    # print() puts a separator space after the literal, so the log line has
    # TWO spaces before the number; any log-parsing regex must allow for that.
    print("Validation AUC: ", roc_auc)

    #------------------------------- save model
    # Make sure the output directory exists (e.g. a fresh local 'model/' folder)
    os.makedirs(args.model_dir, exist_ok=True)
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print('model saved at ' + path)

Overwriting src/sklearn_pipeline_training_script.py



Local training

Script arguments allow us to keep any SageMaker-specific configuration out of the script, so it can also be run locally.

Note: This script relies on scikit-learn version 0.22 (certain functions it uses, such as `sklearn.externals.joblib`, were deprecated and are no longer available in 0.23).


In [9]:
! python src/sklearn_pipeline_training_script.py \
    --n_estimators 100 \
    --min_samples_leaf 5 \
    --model_dir 'model/' \
    --train_dir 'data/' \
    --test_dir 'data/' \
    --train_file 'train.csv' \
    --test_file 'validation.csv' \
    --target_variable 'Churn'

extracting arguments
reading data
building training and testing datasets
preprocesser setup
training model
testing model
Validation AUC:  0.8392602002735712
model saved at model/model.joblib


## Train and Deploy using Sagemaker

To train this model on Sagemaker we need to instantiate a Sagemaker training job and pass it the script defined above.

Key Training Points

* We use the sagemaker.sklearn.estimator class to create the training job.
* We are retrieving the S3 path to the data from our previously defined config
* We provide a new param to the training job: source_dir='src'. This tells the job where to find all source code (including the entry point script).

Key Deployment Points

* We use the sagemaker.sklearn.model class to instantiate the model using the artifact created by the training job.
* We provide a new param to the model: source_dir='src' so that it can find the corresponding source code.

In this example we write out the command into a new script (so that it can be run locally or via a master RUN script).


In [14]:
%%writefile RUN_Sagemaker_02b_Build.py

# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
import sagemaker
import boto3
import sys

# Name of the label column; forwarded to the training script as a hyperparameter.
target = "Churn"

boto_session = boto3.Session()
sgmk_session = sagemaker.Session()
sm_boto3 = boto_session.client("sagemaker")
sgmk_role = sagemaker.get_execution_role()

train_path_s3 = 's3://telco-churn-seoul/xgboost-example/train.csv'
test_path_s3 = 's3://telco-churn-seoul/xgboost-example/validation.csv'

sklearn_estimator = SKLearn(
    entry_point='sklearn_pipeline_training_script.py',
    role=sgmk_role,
    source_dir='src',  # directory holding the entry point and the custom classes it imports
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version='0.20.0',
    base_job_name='rf-scikit',
    sagemaker_session=sgmk_session,
    metric_definitions=[
        # The training script logs via print("Validation AUC: ", value), which
        # emits two spaces before the number; \s* accepts any amount of
        # whitespace so the metric is actually captured.
        {'Name': 'AUC', 'Regex': r'Validation AUC:\s*([0-9.]+).*$'},
    ],
    hyperparameters={
        'n_estimators': 100,
        'min_samples_leaf': 8,
        'target_variable': target,
    },
    max_run=20*60,  # Maximum allowed active runtime (in seconds)
    use_spot_instances=True,  # Use spot instances to reduce cost
    max_wait=30*60,  # Maximum clock time (including spot delays)
)

# Channel name -> S3 location of the data for that channel.
data_dict = {'train': train_path_s3, 'test': test_path_s3}

# wait=True blocks until the training job has finished, so no separate wait
# on latest_training_job is needed afterwards.
sklearn_estimator.fit(data_dict, wait=True)

# Look up where the finished job stored its model artifact.
model_artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact saved at:', model_artifact)

# Model object built from the training artifact; source_dir/entry_point point
# at the same script used for training.
model = SKLearnModel(
    model_data=model_artifact,
    framework_version='0.20.0',
    py_version='py3',
    role=sgmk_role,
    source_dir='src',
    entry_point='sklearn_pipeline_training_script.py',
    sagemaker_session=sgmk_session,
)


Overwriting RUN_Sagemaker_02b_Build.py


In [None]:
!python RUN_Sagemaker_02b_Build.py
