# Build a Custom scikit-learn Model

In this notebook we demonstrate what is required to train and deploy a custom scikit-learn model on Amazon SageMaker infrastructure.


## Create a training script

We need to define a model, in the form of a training script, that can run on SageMaker training hardware.

Detailed guidance is available in the SageMaker Python SDK documentation: https://sagemaker.readthedocs.io/en/stable/using_sklearn.html#preparing-the-scikit-learn-training-script


In [4]:
%%writefile sklearn_training_script.py

import argparse
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib


# inference functions ---------------
def model_fn(model_dir):
    """Load the fitted classifier used for inference.

    Reads ``model.joblib`` from ``model_dir`` (the file name this script
    writes at the end of training) and returns the deserialized object.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)


if __name__ == '__main__':
    # Command-line entry point: parse the arguments, load the train/test CSV
    # files, fit a RandomForestClassifier, print the validation AUC and save
    # the fitted model as "model.joblib" inside --model_dir.

    #------------------------------- parsing input parameters (from command line)
    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # RandomForest hyperparameters
    parser.add_argument('--n_estimators', type=int, default=150)
    parser.add_argument('--min_samples_leaf', type=int, default=20)
    parser.add_argument('--max_depth', type=int, default=9)
    # Optional seed for a reproducible forest; the default (None) keeps the
    # previous, unseeded behaviour.
    parser.add_argument('--random_state', type=int, default=None)

    # Data, model, and output directories. Each default is read from an SM_*
    # environment variable and is therefore None when that variable is unset.
    directory_options = (
        ('model_dir', 'SM_MODEL_DIR'),
        ('train_dir', 'SM_CHANNEL_TRAIN'),
        ('test_dir', 'SM_CHANNEL_TEST'),
    )
    for option, env_var in directory_options:
        parser.add_argument('--' + option, type=str, default=os.environ.get(env_var))
    parser.add_argument('--train_file', type=str, default='train.csv')
    parser.add_argument('--test_file', type=str, default='validation.csv')
    parser.add_argument('--features', type=str, default='')  # explicitly name which features to use
    # The target column cannot be guessed, so fail at parse time if it is missing.
    parser.add_argument('--target_variable', type=str, required=True)  # explicitly name the column to be used as target

    # parse_known_args tolerates extra, unrecognised arguments instead of exiting.
    args, _ = parser.parse_known_args()

    # Without this check a missing directory only surfaces later as a
    # TypeError from os.path.join(None, ...).
    for option, env_var in directory_options:
        if getattr(args, option) is None:
            parser.error('--{0} was not given and {1} is not set'.format(option, env_var))

    #------------------------------- data preparation
    print('reading data')
    train_df = pd.read_csv(os.path.join(args.train_dir, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test_dir, args.test_file))

    # --features is a whitespace-separated list of column names; when it is
    # empty every training column except the target is used.
    features = args.features.split()
    if not features:
        features = [col for col in train_df.columns if col != args.target_variable]

    # Report missing columns by name rather than as a bare pandas KeyError.
    for split_name, frame in (('train', train_df), ('test', test_df)):
        missing = [col for col in features + [args.target_variable]
                   if col not in frame.columns]
        if missing:
            raise ValueError('{0} data is missing columns: {1}'.format(split_name, missing))

    print('building training and testing datasets')
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[args.target_variable]
    y_test = test_df[args.target_variable]

    #------------------------------- model training
    print('training model')
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        min_samples_leaf=args.min_samples_leaf,
        max_depth=args.max_depth,
        random_state=args.random_state,
        n_jobs=-1)

    model.fit(X_train, y_train)

    #-------------------------------  model testing
    print('testing model')

    # Column 1 of predict_proba is scored against the labels; this assumes a
    # binary target -- TODO confirm for other datasets.
    test_preds = model.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test, test_preds[:,1])
    # print() puts a separator after the label, so the log line reads
    # "Validation AUC:  <value>" with two spaces; log-scraping patterns must
    # allow for that.
    print("Validation AUC: ", roc_auc)

    #------------------------------- save model
    # Create the output directory when needed so a first local run does not
    # fail at the dump step.
    if args.model_dir:
        os.makedirs(args.model_dir, exist_ok=True)
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print('model saved at ' + path)

Writing sklearn_training_script.py



#### Local training

Script arguments allow us to keep any SageMaker-specific configuration out of the script, so the same script can also be run locally.

**Note** This script imports `joblib` from `sklearn.externals`, which was deprecated in scikit-learn 0.21 and removed in 0.23, so a local run needs scikit-learn 0.22 or earlier.


In [81]:
# Run the training script locally, passing the directories on the command line
# instead of relying on the SM_* environment variables. The data paths are
# relative to the directory this cell runs in.
! python sklearn_training_script.py \
    --n_estimators 100 \
    --min_samples_leaf 3 \
    --model_dir 'model/' \
    --train_dir '../../data/partitioned/' \
    --test_dir '../../data/partitioned/' \
    --train_file 'train.csv' \
    --test_file 'validation.csv' \
    --features "discharge_disposition_id admission_source_id time_in_hospital num_lab_procedures num_medications number_outpatient number_emergency number_inpatient number_diagnoses"\
    --target_variable 'readmitted'


extracting arguments
reading data
building training and testing datasets
training model
testing model
Validation AUC:  0.6760684929883138
model saved at model/model.joblib


## Create a script to execute training

To build the model we run the training script on SageMaker infrastructure using the SKLearn Estimator:
https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html

Write out a script for this (that can be run locally or via the master RUN script)

In [5]:
%%writefile RUN_Sagemaker_02_Build.py

# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
import sagemaker
import sys

sys.path.append("../../")
import utils.config as cfg
import utils.models as mods

# Project configuration: names, role and clients used for this training run.
# Not every value read here is used further down in this script.
config = cfg.get_config()
region = config['region']
target = config['target']
bucket_name = config['bucket_name']
bucket_prefix = config['bucket_prefix']
sgmk_session = config['sgmk_session']
sgmk_role = config['sgmk_role']
sm_boto3 = config['sm_boto3']

# S3 locations of the training and validation partitions.
train_path_s3 = cfg.get_s3_path('train')
test_path_s3 = cfg.get_s3_path('validation')

# Columns the model is trained on; handed to the training script as one
# whitespace-separated string, which the script splits again.
FEATURES = [
    'discharge_disposition_id',
    'admission_source_id',
    'time_in_hospital',
    'num_lab_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
]

sklearn_estimator = SKLearn(
    entry_point='sklearn_training_script.py',
    role=sgmk_role,
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version='0.20.0',
    base_job_name='rf-scikit',
    metric_definitions=[
        # The training script prints "Validation AUC:  <value>" with TWO
        # spaces (print() adds a separator after the label), so the pattern
        # must accept one or more spaces or the metric is never captured.
        { 'Name': 'AUC', 'Regex': 'Validation AUC: +([0-9.]+).*$' },
    ],
    hyperparameters={
        'n_estimators': 100,
        'min_samples_leaf': 3,
        'target_variable': target,
        'features': ' '.join(FEATURES),
    },
    max_run=20*60,  # Maximum allowed active runtime (in seconds)
    use_spot_instances=True,  # Use spot instances to reduce cost
    max_wait=30*60,  # Maximum clock time (including spot delays)
)

# Channel name -> S3 location. The training script reads these channels
# through SM_CHANNEL_TRAIN and SM_CHANNEL_TEST.
data_dict = {'train':train_path_s3, 'test': test_path_s3}

# wait=True blocks until the training job has finished, so no separate wait
# call on the job is needed afterwards.
sklearn_estimator.fit(data_dict, wait=True)

# Look up where the finished job stored its model archive.
training_job_description = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name)
model_artifact = training_job_description['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact saved at:', model_artifact)


Writing RUN_Sagemaker_02_Build.py


### Execute it

In [7]:
# Run the build script written by the previous cell.
# NOTE(review): the `cd` target is relative to the kernel's working directory,
# whereas other cells use paths such as ../../data that assume the notebook's
# own directory -- confirm which working directory is intended. Because the two
# commands are joined with `;`, python still runs if the `cd` fails.
!cd amazon-sagemaker-workbench-demo/experiments/02_SKLearn/;python RUN_Sagemaker_02_Build.py

2021-03-01 22:55:39 Starting - Starting the training job...
2021-03-01 22:55:42 Starting - Launching requested ML instancesProfilerReport-1614639339: InProgress
......
2021-03-01 22:56:54 Starting - Preparing the instances for training.........
2021-03-01 22:58:38 Downloading - Downloading input data
2021-03-01 22:58:38 Training - Training image download completed. Training in progress..[34m2021-03-01 22:58:38,518 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-03-01 22:58:38,520 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-01 22:58:38,530 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-03-01 22:58:38,690 botocore.utils INFO     IMDS ENDPOINT: http://169.254.169.254/[0m
[34m2021-03-01 22:58:38,819 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-01 22:58:41,845 sagemaker-training

In [5]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
import sagemaker
import sys

sys.path.append("../../")
import utils.config as cfg
import utils.models as mods

# Execution role comes from the shared project configuration.
config = cfg.get_config()
sgmk_role = config['sgmk_role']

# S3 location of the trained model archive to deploy.
# NOTE(review): this URI is hard-coded (it embeds a specific account, region
# and job name) rather than taken from the training run above -- confirm it is
# the artifact you intend to serve.
model_artifact="s3://sagemaker-us-east-2-320389841409/rf-scikit-2020-12-29-05-12-04-781/output/model.tar.gz"

# Model definition: the archive plus the script that supplies model_fn.
model_settings = {
    'model_data': model_artifact,
    'framework_version': '0.20.0',
    'py_version': 'py3',
    'role': sgmk_role,
    'entry_point': 'sklearn_training_script.py',
}
model = SKLearnModel(**model_settings)

# Create a real-time endpoint backed by a single instance.
endpoint_settings = {
    'instance_type': 'ml.c5.large',
    'initial_instance_count': 1,
}
predictor = model.deploy(**endpoint_settings)


-------------!

In [6]:
# Print the predictor object returned by model.deploy().
print(predictor)

<sagemaker.sklearn.model.SKLearnPredictor object at 0x7f4b73b5df50>


In [7]:
# Hand the deployed model to the project helper utils.models.register.
# NOTE(review): the first two arguments look like a short name and a
# description; what register() does with them is not visible in this
# notebook -- check utils/models.py.
mods.register("SKLearn_RF", "Random Forest Classifier", model_artifact, predictor)

'Done'

In [9]:
import pandas as pd
# Read the test partition from the local data directory (the path is relative
# to the directory this cell runs in) and preview the first rows.
test_df = pd.read_csv('../../data/partitioned/test.csv')
test_df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,319655078,87697485,Caucasian,Male,[50-60),?,Clinic Referral,1,1,1,...,No,No,No,No,No,No,No,No,No,0
1,46247880,9346356,Caucasian,Female,[60-70),?,Physician Referral,1,7,8,...,No,No,No,No,No,No,No,No,No,0
2,85492566,24242400,Caucasian,Female,[60-70),?,Transfer from a Skilled Nursing Facility (SNF),1,17,3,...,No,Down,No,No,No,No,No,Ch,Yes,0
3,238261572,90486225,Caucasian,Male,[80-90),?,Clinic Referral,1,1,5,...,No,Steady,No,No,No,No,No,Ch,Yes,0
4,138396858,47461050,Caucasian,Male,[60-70),?,Physician Referral,1,1,1,...,No,No,No,No,No,No,No,No,No,1


In [10]:
# Sending the complete frame fails: in the recorded run the endpoint answered
# with HTTP 500 (ModelError). Presumably the model rejects the columns it was
# not trained on -- TODO confirm in the endpoint's CloudWatch logs.
# The error is caught and displayed so that "Run All" can continue to the
# later cell that sends only the feature columns.
try:
    preds = predictor.predict(test_df)
except Exception as err:  # broad on purpose: the concrete class comes from botocore, which this notebook does not import
    preds = None
    print("{0}: {1}".format(type(err).__name__, str(err)[:500]))

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from model with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://us-east-2.console.aws.amazon.com/cloudwatch/home?region=us-east-2#logEventViewer:group=/aws/sagemaker/Endpoints/sagemaker-scikit-learn-2020-12-29-05-23-01-157 in account 320389841409 for more information.

In [11]:

# Columns sent to the endpoint: the same names that the build script passes
# as the `features` hyperparameter.
features = [
    "discharge_disposition_id",
    "admission_source_id",
    "time_in_hospital",
    "num_lab_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
]

# Keep only those columns, then preview the result.
X_test = test_df.loc[:, features]
X_test.head()


Unnamed: 0,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,1,1,1,23,17,2,0,0,6
1,1,7,8,71,7,0,0,0,3
2,1,17,3,26,12,0,2,0,9
3,1,1,5,50,27,0,0,0,9
4,1,1,1,6,9,1,0,0,9


In [18]:
# Send only the feature columns to the endpoint. The SKLearnPredictor does the
# serialization from pandas for us.
preds2 = predictor.predict(X_test)


In [14]:
# Display the predictions returned by the endpoint.
preds2

array([0, 0, 1, ..., 0, 1, 0])

In [15]:
# Store the predictions next to the labels. This mutates test_df in place, so
# the frame no longer matches what the earlier test_df.head() cell displayed.
test_df["preds"]= preds2

In [19]:
# Compare the true label with the predicted one for the first 50 rows.
test_df[["readmitted", "preds"]].head(50)

Unnamed: 0,readmitted,preds
0,0,0
1,0,0
2,0,1
3,0,0
4,1,0
5,0,0
6,0,0
7,1,0
8,0,0
9,1,0
