## Resources for Reference
- https://github.com/SpencerStaub/CloudComputingG10/blob/master/JG_SageMaker_Demo/jgsagedemo.ipynb

## Preparation

In [1]:
# AWS SDK and SageMaker helpers used by this cell
import boto3
import re
from sagemaker import get_execution_role

# S3 bucket that holds the project data, and the key prefix for this notebook's files
bucket = 'cloudcomputinggroup10data'
prefix = 'sagemaker/xgboost'

# IAM role returned by get_execution_role(); passed as RoleArn to the training jobs later
role = get_execution_role()

# A single boto3 session supplies both the SageMaker client (used for the
# hyperparameter tuning job) and the current region name
aws_session = boto3.Session()
smclient = aws_session.client('sagemaker')
region = aws_session.region_name

In [2]:
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
from sagemaker.predictor import csv_serializer    # Converts strings for HTTP POST requests on inference
import plotly.express as px
import csv
import io
from io import StringIO

## Data (Refer to XGBoost_EDA.ipynb for more)

"The dataset contains transactions made by credit cards in September 2013 by european cardholders. 
This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependent cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise."

In [3]:
# Object key of the raw credit-card CSV inside the bucket
data_key = 'creditcardraw.csv'

# Full S3 URI of the file: s3://<bucket>/<key>
s3_uri_template = 's3://{}/{}'
data_location = s3_uri_template.format(bucket, data_key)

# Load the CSV into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer=data_location)
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Training, Validation, and Test Split

In [4]:
# Shuffle every row with a fixed seed so the split is reproducible
shuffled_df = df.sample(frac=1, random_state=1729)

# Row positions where the 70% training slice and the following 20% validation slice end
train_end = int(0.7 * len(shuffled_df))
validation_end = int(0.9 * len(shuffled_df))

# Positional slices give the same three pieces np.split produced (first 70%,
# next 20%, last 10%) without calling np.split on a DataFrame, which newer
# pandas releases flag as deprecated.
train_data = shuffled_df.iloc[:train_end]
validation_data = shuffled_df.iloc[train_end:validation_end]
test_data = shuffled_df.iloc[validation_end:]

# Every row must land in exactly one split
assert len(train_data) + len(validation_data) + len(test_data) == len(df)

In [5]:
# (rows, columns) of the training split
train_data.shape

(199364, 31)

In [6]:
# (rows, columns) of the validation split
validation_data.shape

(56962, 31)

In [7]:
# (rows, columns) of the test split
test_data.shape

(28481, 31)

In [8]:
# (rows, columns) of the full dataset, for comparison with the three splits
df.shape

(284807, 31)

"Amazon SageMaker's XGBoost container expects data in the libSVM or CSV data format. For this example, we'll stick to CSV. Note that the first column must be the target variable and the CSV should not include headers. Also, notice that although repetitive it's easiest to do this after the train|validation|test split rather than before. This avoids any misalignment issues due to random reordering."

In [9]:
# Training and validation files: 'Class' label moved to the first column,
# written without a header row or index
for split_frame, csv_name in ((train_data, 'train.csv'), (validation_data, 'validation.csv')):
    label_column = split_frame['Class']
    feature_columns = split_frame.drop(['Class'], axis=1)
    pd.concat([label_column, feature_columns], axis=1).to_csv(csv_name, index=False, header=False)

# The labelled test split is written with its columns in their existing order
# (no reordering); it is kept for model evaluation later
test_data.to_csv('test_with_label.csv', index=False, header=False)

Now we'll copy the files to S3 for Amazon SageMaker's managed training to pick up.

In [10]:
# One bucket handle is reused for all three uploads
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

# Local file name -> folder under `prefix` that receives it
uploads = {
    'train.csv': 'train',
    'validation.csv': 'validation',
    'test_with_label.csv': 'test',
}

# Send the train, validation and test CSVs to S3.
# Keys are joined with '/' explicitly: S3 keys use forward slashes, whereas
# os.path.join uses the separator of whatever OS runs the notebook.
for local_file, folder in uploads.items():
    s3_key = '{}/{}/{}'.format(prefix, folder, local_file)
    s3_bucket.Object(s3_key).upload_file(local_file)

## Hyperparameter Tuning Setup
We will tune four hyperparameters:

*eta*: Step size shrinkage used in updates to prevent overfitting. After each boosting step, you can directly get the weights of new features. The eta parameter actually shrinks the feature weights to make the boosting process more conservative.

*alpha*: L1 regularization term on weights. Increasing this value makes models more conservative.

*min_child_weight*: Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, the building process gives up further partitioning. In linear regression models, this simply corresponds to a minimum number of instances needed in each node. The larger the value, the more conservative the algorithm is.

*max_depth*: Maximum depth of a tree. Increasing this value makes the model more complex and likely to be overfitted.

In [11]:
from time import gmtime, strftime, sleep


def _param_range(name, min_value, max_value):
    """Build one parameter-range entry; bounds are passed as strings, as everywhere in this config."""
    return {"MaxValue": max_value, "MinValue": min_value, "Name": name}


# Job name = fixed prefix + current UTC day-of-month, hour, minute and second
job_timestamp = strftime("%d-%H-%M-%S", gmtime())
tuning_job_name = 'xgboost-tuningjob-' + job_timestamp

print(tuning_job_name)

# Search ranges for the hyperparameters being tuned.
# NOTE(review): the eta range starts at 0 -- confirm such small learning rates are worth searching.
continuous_ranges = [
    _param_range("eta", "0", "1"),
    _param_range("min_child_weight", "1", "10"),
    _param_range("alpha", "0", "2"),
]
integer_ranges = [
    _param_range("max_depth", "1", "10"),
]

# Up to 20 training jobs in total, at most 3 running at the same time
resource_limits = {
    "MaxNumberOfTrainingJobs": 20,
    "MaxParallelTrainingJobs": 3,
}

# The tuning job tries to maximize AUC on the validation channel
tuning_objective = {
    "MetricName": "validation:auc",
    "Type": "Maximize",
}

# Full tuning configuration handed to create_hyper_parameter_tuning_job
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": continuous_ranges,
        "IntegerParameterRanges": integer_ranges,
    },
    "ResourceLimits": resource_limits,
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": tuning_objective,
}

xgboost-tuningjob-09-19-08-17


Then we configure the training jobs the hyperparameter tuning job will launch by defining a JSON object that specifies the following information:
- The container image for the algorithm (XGBoost)
- The input configuration for the training and validation data
- Configuration for the output of the algorithm
- The values of any algorithm hyperparameters that are not tuned in the tuning job (StaticHyperparameters)
- The type and number of instances to use for the training jobs
- The stopping condition for the training jobs

Again, since we are using the built-in XGBoost algorithm here, it emits two predefined metrics, validation:auc and train:auc, and we elected to monitor validation:auc, as you can see above.

In [12]:
from sagemaker.amazon.amazon_estimator import get_image_uri


def _csv_channel(channel_name, s3_uri):
    """Build one InputDataConfig entry: an uncompressed CSV channel read from an S3 prefix."""
    return {
        "ChannelName": channel_name,
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
            "S3DataSource": {
                "S3DataDistributionType": "FullyReplicated",
                "S3DataType": "S3Prefix",
                "S3Uri": s3_uri
            }
        }
    }


# Container image for the built-in XGBoost algorithm, version 0.90-1, in this region
training_image = get_image_uri(region, 'xgboost', '0.90-1')

# S3 prefixes holding the training and validation CSVs. Both end with '/' so
# that each prefix selects only its own folder and cannot also match a sibling
# prefix that merely starts with the same characters.
s3_input_train = 's3://{}/{}/train/'.format(bucket, prefix)
s3_input_validation = 's3://{}/{}/validation/'.format(bucket, prefix)

# Template for every training job the tuning job launches
training_job_definition = {
    "AlgorithmSpecification": {
      "TrainingImage": training_image,
      "TrainingInputMode": "File"
    },
    "InputDataConfig": [
      _csv_channel("train", s3_input_train),
      _csv_channel("validation", s3_input_validation)
    ],
    # Model artifacts are written under <prefix>/output in the same bucket
    "OutputDataConfig": {
      "S3OutputPath": "s3://{}/{}/output".format(bucket,prefix)
    },
    "ResourceConfig": {
      "InstanceCount": 1,
      "InstanceType": "ml.m4.xlarge",
      "VolumeSizeInGB": 10
    },
    "RoleArn": role,
    # Hyperparameters held fixed across all training jobs (not tuned).
    # NOTE(review): rate_drop looks like a dart-booster setting and
    # tweedie_variance_power a tweedie-objective setting; with
    # objective=binary:logistic they presumably have no effect -- confirm
    # against the XGBoost parameter docs before removing them.
    "StaticHyperParameters": {
      "eval_metric": "auc",
      "num_round": "100",
      "objective": "binary:logistic",
      "rate_drop": "0.3",
      "tweedie_variance_power": "1.4"
    },
    # Stop any single training job after 43200 seconds (12 hours)
    "StoppingCondition": {
      "MaxRuntimeInSeconds": 43200
    }
}

## Hyperparameter Tuning Execution
Now we can launch a hyperparameter tuning job by calling the create_hyper_parameter_tuning_job API. After the hyperparameter tuning job is created, we can go to the SageMaker console to track its progress until it is completed.

In [13]:
# Request payload: the job's name, its search configuration, and the template
# used for each training job it launches
create_request = {
    'HyperParameterTuningJobName': tuning_job_name,
    'HyperParameterTuningJobConfig': tuning_job_config,
    'TrainingJobDefinition': training_job_definition,
}

# Submit the tuning job; the API response is shown as the cell output
smclient.create_hyper_parameter_tuning_job(**create_request)

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-2:796592541871:hyper-parameter-tuning-job/xgboost-tuningjob-09-19-08-17',
 'ResponseMetadata': {'RequestId': 'f194556f-6dd4-487f-b363-04e011f075cb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f194556f-6dd4-487f-b363-04e011f075cb',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '130',
   'date': 'Thu, 09 Apr 2020 19:16:55 GMT'},
  'RetryAttempts': 0}}


Let's run a quick check of the hyperparameter tuning job's status to make sure it started successfully.

In [21]:
# Fetch the tuning job's description and show only its status field
job_description = smclient.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)
job_description['HyperParameterTuningJobStatus']

'Completed'

## Track hyperparameter tuning job progress
After you launch a tuning job, you can see its progress by calling the describe_hyper_parameter_tuning_job API. Its output is a JSON object that contains information about the current state of the tuning job. You can call list_training_jobs_for_hyper_parameter_tuning_job to see a detailed list of the training jobs that the tuning job launched.

In [22]:
# Re-run this cell whenever you want a fresh snapshot of the tuning job
tuning_job_result = smclient.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)

# Warn if the job is in any state other than 'Completed'
status = tuning_job_result['HyperParameterTuningJobStatus']
if not status == 'Completed':
    print('Reminder: the tuning job has not been completed.')

# Number of training jobs that have finished so far
status_counters = tuning_job_result['TrainingJobStatusCounters']
job_count = status_counters['Completed']
print("%d training jobs have completed" % job_count)

# Objective settings as recorded on the job: the metric's name, and whether it
# is being minimized (anything other than 'Maximize' counts as minimize)
objective_settings = tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']
objective_name = objective_settings['MetricName']
is_minimize = objective_settings['Type'] != 'Maximize'

20 training jobs have completed


## Fetch all results from Tuning as DataFrame
This code below helps us identify the hyperparameters that lead to the model with the highest AUC on the validation set.

In [28]:
tuner = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)

# Results of the tuning job's training jobs as a DataFrame
full_df = tuner.dataframe()

# Columns shown in the summary table at the end of the cell
summary_columns = ['FinalObjectiveValue', 'TrainingJobName', 'alpha', 'eta', 'max_depth', 'min_child_weight']

if len(full_df) > 0:
    # Keep only jobs that reported a real objective value (drops -inf and NaN)
    hyper_df = full_df[full_df['FinalObjectiveValue'] > -float('inf')]
else:
    # No jobs at all yet: use an empty frame with the expected columns so that
    # hyper_df is always defined and the final display line cannot fail
    hyper_df = pd.DataFrame(columns=summary_columns)

if len(hyper_df) > 0:
    # Best job first: descending when maximizing, ascending when minimizing
    hyper_df = hyper_df.sort_values('FinalObjectiveValue', ascending=is_minimize)
    print("Number of training jobs with valid objective: %d" % len(hyper_df))
    print({"lowest": hyper_df['FinalObjectiveValue'].min(), "highest": hyper_df['FinalObjectiveValue'].max()})
    # Don't truncate TrainingJobName. None is the current "no limit" value;
    # pandas versions that predate it only accept the legacy -1.
    try:
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        pd.set_option('display.max_colwidth', -1)
else:
    print("No training jobs have reported valid results yet.")

# Top five jobs by objective value (an empty table when there is nothing to show)
hyper_df[summary_columns].head(5)

Number of training jobs with valid objective: 20
{'lowest': 0.5, 'highest': 0.9939529895782471}


Unnamed: 0,FinalObjectiveValue,TrainingJobName,alpha,eta,max_depth,min_child_weight
10,0.993953,xgboost-tuningjob-09-19-08-17-010-40215031,1.14071,0.273211,10.0,10.0
5,0.992975,xgboost-tuningjob-09-19-08-17-015-c4d2b6be,1.14071,0.253211,10.0,10.0
11,0.99271,xgboost-tuningjob-09-19-08-17-009-b323b732,0.930444,0.271125,10.0,9.387145
9,0.992651,xgboost-tuningjob-09-19-08-17-011-afe2b3fa,0.553879,0.300025,10.0,10.0
17,0.992531,xgboost-tuningjob-09-19-08-17-003-b4b58b00,0.460258,0.330548,9.0,9.795314
