In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
import boto3
import re
from datetime import datetime

import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig
from sagemaker.model_monitor import DataCaptureConfig, DatasetFormat, DefaultModelMonitor
from sagemaker.s3 import S3Uploader, S3Downloader

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

%cd /root/predicting-coronavirus
#%pip install mpu
import src.data_import as di
import src.data_tools as dt

%matplotlib inline
pd.set_option('display.max_columns', 500)  
pd.set_option('display.max_rows', 500)   

/root/predicting-coronavirus
Note: you may need to restart the kernel to use updated packages.


In [23]:
sess = boto3.Session()
sm = sess.client('sagemaker')
role = sagemaker.get_execution_role()

In [24]:
account_id = sess.client('sts', region_name=sess.region_name).get_caller_identity()["Account"]
bucket = 'sagemaker-studio-{}-{}'.format(sess.region_name, account_id)
prefix = 'capstone2'

try:
    if sess.region_name == "us-east-1":
        sess.client('s3').create_bucket(Bucket=bucket)
    else:
        sess.client('s3').create_bucket(Bucket=bucket, 
                                        CreateBucketConfiguration={'LocationConstraint': sess.region_name})
except Exception as e:
    print("Looks like you already have a bucket of this name. That's good. Uploading the data files...")

# Return the URLs of the uploaded file, so they can be reviewed or used elsewhere
s3url = S3Uploader.upload('data/train.csv', 's3://{}/{}/{}'.format(bucket, prefix,'merged'))
s3url = S3Uploader.upload('data/test.csv', 's3://{}/{}/{}'.format(bucket, prefix,'merged'))
print(s3url)


s3://sagemaker-studio-us-east-1-752222400982/capstone2/merged/test.csv


In [25]:
from sagemaker.amazon.amazon_estimator import get_image_uri
docker_image_name = get_image_uri(boto3.Session().region_name, 'xgboost', repo_version='1.0-1')

In [26]:
s3_input_train = sagemaker.s3_input(s3_data='s3://sagemaker-studio-us-east-1-752222400982/capstone2/merged/train.csv', content_type='csv')
s3_input_test = sagemaker.s3_input(s3_data='s3://sagemaker-studio-us-east-1-752222400982/capstone2/merged/test.csv', content_type='csv')


In [27]:
sess = sagemaker.session.Session()

create_date = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
coronavirus_xgb_experiment = Experiment.create(experiment_name="predicting-coronavirus-cases-xgboost-{}".format(create_date), 
                                              description="Using xgboost to predict coronavirus cases", 
                                              sagemaker_boto_client=boto3.client('sagemaker'))

In [17]:
hyperparams = {"max_depth":5,
               "subsample":0.5,
               "num_round":30,
               "eta":0.1,
               "gamma":4,
               "min_child_weight":6,
               "verbosity":0,
               "alpha":1
              } #,
               #"objective":'binary:logistic'}

In [30]:
#min_child_weights = [1, 2, 4, 8, 10]
#max_depths = [1, 2, 4, 8, 10]
min_child_weights = [4, 8]
max_depths = [4, 8]


for weight in min_child_weights:
    for depth in max_depths:
        print('LAUNCHING FOR CHILD: {} AND DEPTH: {}'.format(weight, depth))
        hyperparams = {"max_depth":depth,
                   "subsample":0.5,
                   "num_round":30,
                   "eta":0.1,
                   "gamma":4,
                   "min_child_weight":weight,
                   "verbosity":1,
                   "alpha":1
                  } 
        trial = Trial.create(trial_name="algorithm-mode-trial-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())), 
                             experiment_name=coronavirus_xgb_experiment.experiment_name,
                             sagemaker_boto_client=boto3.client('sagemaker'))

        xgb = sagemaker.estimator.Estimator(image_name=docker_image_name,
                                            role=role,
                                            hyperparameters=hyperparams,
                                            train_instance_count=1, 
                                            train_use_spot_instances=True,
                                            train_max_run=300,
                                            train_max_wait=600,
                                            train_instance_type='ml.m4.xlarge',
                                            output_path='s3://{}/{}/output'.format(bucket, prefix),
                                            base_job_name="xgboost-coronavirus",
                                            sagemaker_session=sess)

        xgb.fit({'train': s3_input_train,
                 'validation': s3_input_test}, 
                experiment_config={
                    "ExperimentName": coronavirus_xgb_experiment.experiment_name, 
                    "TrialName": trial.trial_name,
                    "TrialComponentDisplayName": "min-child-{}-max-depth-{}".format(weight, depth),
                }
               )    

INFO:sagemaker:Creating training-job with name: xgboost-coronavirus-2020-05-13-18-18-35-275


LAUNCHING FOR CHILD: 4 AND DEPTH: 4
2020-05-13 18:18:35 Starting - Starting the training job...
2020-05-13 18:18:38 Starting - Launching requested ML instances......
2020-05-13 18:19:50 Starting - Preparing the instances for training......
2020-05-13 18:20:39 Downloading - Downloading input data...
2020-05-13 18:21:18 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[18:21:45] 620x19 matrix with 11780 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m


INFO:sagemaker:Creating training-job with name: xgboost-coronavirus-2020-05-13-18-22-18-313


Training seconds: 76
Billable seconds: 27
Managed Spot Training savings: 64.5%
LAUNCHING FOR CHILD: 4 AND DEPTH: 8
2020-05-13 18:22:18 Starting - Starting the training job...
2020-05-13 18:22:22 Starting - Launching requested ML instances......
2020-05-13 18:23:29 Starting - Preparing the instances for training......
2020-05-13 18:24:28 Downloading - Downloading input data...
2020-05-13 18:25:15 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[18:25:17] 620x19 matrix with 11780 entries loaded from /opt/ml/input/data/train

INFO:sagemaker:Creating training-job with name: xgboost-coronavirus-2020-05-13-18-26-01-225


Training seconds: 60
Billable seconds: 32
Managed Spot Training savings: 46.7%
LAUNCHING FOR CHILD: 8 AND DEPTH: 4
2020-05-13 18:26:01 Starting - Starting the training job...
2020-05-13 18:26:03 Starting - Launching requested ML instances......
2020-05-13 18:27:11 Starting - Preparing the instances for training...
2020-05-13 18:27:58 Downloading - Downloading input data...
2020-05-13 18:28:22 Training - Downloading the training image...
2020-05-13 18:29:02 Uploading - Uploading generated training model
2020-05-13 18:29:02 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input 

INFO:sagemaker:Creating training-job with name: xgboost-coronavirus-2020-05-13-18-29-13-346


Training seconds: 64
Billable seconds: 34
Managed Spot Training savings: 46.9%
LAUNCHING FOR CHILD: 8 AND DEPTH: 8
2020-05-13 18:29:13 Starting - Starting the training job...
2020-05-13 18:29:18 Starting - Launching requested ML instances......
2020-05-13 18:30:23 Starting - Preparing the instances for training......
2020-05-13 18:31:24 Downloading - Downloading input data...
2020-05-13 18:32:07 Training - Downloading the training image...
2020-05-13 18:32:40 Uploading - Uploading generated training model
2020-05-13 18:32:40 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV inp

In [39]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

trial = Trial.create(trial_name="algorithm-mode-trial-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())), 
                     experiment_name=coronavirus_xgb_experiment.experiment_name,
                     sagemaker_boto_client=boto3.client('sagemaker'))

xgb = sagemaker.estimator.Estimator(image_name=docker_image_name,
                                    role=role,
                                    train_instance_count=1, 
                                    train_use_spot_instances=True,
                                    train_max_run=300,
                                    train_max_wait=600,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    base_job_name="xgboost-coronavirus",
                                    sagemaker_session=sess)

xgb.set_hyperparameters(num_rounds=30, 
                        objective='reg:squarederror',
                        verbosity=1,
                        eta=0.1,
                        gamma=4)

hp_ranges = {
    "max_depth":IntegerParameter(1,10),
    "subsample":ContinuousParameter(0.3,0.8),
#    "eta":ContinuousParameter(0.03,0.2),
    "min_child_weight":IntegerParameter(2,10),
    "alpha":ContinuousParameter(0, 2)
}

tuner = HyperparameterTuner(
    xgb,
    'validation:rmse',
    hp_ranges,
    objective_type='Minimize',
    max_jobs=40,
    max_parallel_jobs=8
)

tuner.fit({'train': s3_input_train,
         'validation': s3_input_test}, 
                experiment_config={
                    "ExperimentName": coronavirus_xgb_experiment.experiment_name, 
                    "TrialName": trial.trial_name,
                    "TrialComponentDisplayName": "HPO Job"
                })



INFO:root:_TuningJob.start_new!!!
INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-xgboost-200513-2009


In [49]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

exp = HyperparameterTuningJobAnalytics(
    sagemaker_session=sess,
    hyperparameter_tuning_job_name=tuner.latest_tuning_job.name)


In [50]:
df = exp.dataframe()
df

Unnamed: 0,alpha,max_depth,min_child_weight,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,1.756401,3.0,4.0,0.611268,sagemaker-xgboost-200513-2009-008-64b62890,Failed,,2020-05-13 20:11:59+00:00,2020-05-13 20:13:03+00:00,64.0
1,0.18891,1.0,4.0,0.438257,sagemaker-xgboost-200513-2009-007-5752f380,Failed,,2020-05-13 20:12:13+00:00,2020-05-13 20:13:15+00:00,62.0
2,1.788627,6.0,7.0,0.794133,sagemaker-xgboost-200513-2009-006-c4dd4385,Stopped,,2020-05-13 20:12:23+00:00,2020-05-13 20:13:19+00:00,56.0
3,1.684216,3.0,4.0,0.39989,sagemaker-xgboost-200513-2009-005-66a7c3f3,Failed,,2020-05-13 20:12:22+00:00,2020-05-13 20:13:16+00:00,54.0
4,1.565004,10.0,2.0,0.77801,sagemaker-xgboost-200513-2009-004-1d3f3c8c,Failed,,2020-05-13 20:12:01+00:00,2020-05-13 20:12:58+00:00,57.0
5,1.870262,8.0,3.0,0.363637,sagemaker-xgboost-200513-2009-003-10f1c8a9,Failed,,2020-05-13 20:11:57+00:00,2020-05-13 20:13:03+00:00,66.0
6,0.409231,2.0,5.0,0.749939,sagemaker-xgboost-200513-2009-002-4c2d926b,Stopped,,2020-05-13 20:12:22+00:00,2020-05-13 20:13:20+00:00,58.0
7,0.953319,7.0,6.0,0.554174,sagemaker-xgboost-200513-2009-001-502bf6dc,Failed,,2020-05-13 20:12:08+00:00,2020-05-13 20:13:17+00:00,69.0


In [None]:
df.sort_values('FinalObjectiveValue', ascending=False)