In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local `src` package imported below) are reloaded before each
# cell executes and edits are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
import boto3
import re
from datetime import datetime

import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig
from sagemaker.model_monitor import DataCaptureConfig, DatasetFormat, DefaultModelMonitor
from sagemaker.s3 import S3Uploader, S3Downloader

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

%cd /root/predicting-coronavirus
#%pip install mpu
import src.data_import as di
import src.data_tools as dt

%matplotlib inline
pd.set_option('display.max_columns', 500)  
pd.set_option('display.max_rows', 500)   

/root/predicting-coronavirus


In [3]:
# Default boto3 session: source of the region name and of the service clients.
sess = boto3.Session()

# Low-level SageMaker API client created from that session.
sm = sess.client(service_name='sagemaker')

# Execution role of this notebook; passed to the estimators created later.
role = get_execution_role()

In [13]:
# Derive a per-account, per-region bucket name so the notebook does not depend
# on one specific AWS account.
s3_client = sess.client('s3')
account_id = sess.client('sts', region_name=sess.region_name).get_caller_identity()["Account"]
bucket = 'sagemaker-studio-{}-{}'.format(sess.region_name, account_id)
prefix = 'capstone2'

try:
    # us-east-1 buckets are created without a LocationConstraint; every other
    # region has to name itself explicitly.
    if sess.region_name == "us-east-1":
        s3_client.create_bucket(Bucket=bucket)
    else:
        s3_client.create_bucket(Bucket=bucket,
                                CreateBucketConfiguration={'LocationConstraint': sess.region_name})
except s3_client.exceptions.BucketAlreadyOwnedByYou:
    # The only failure that really means "the bucket is already there and ours".
    print("Looks like you already have a bucket of this name. That's good. Uploading the data files...")
except Exception as e:
    # Stay best-effort (the bucket may still be usable, e.g. when the role lacks
    # s3:CreateBucket), but report the real cause instead of claiming the
    # bucket already exists.
    print("Could not create bucket '{}' ({}). Attempting the upload anyway...".format(bucket, e))

# Keep the URL of each uploaded file, so they can be reviewed or used elsewhere.
train_s3url = S3Uploader.upload('data/train3k.csv', 's3://{}/{}/{}'.format(bucket, prefix, 'merged'))
val_s3url = S3Uploader.upload('data/val3k.csv', 's3://{}/{}/{}'.format(bucket, prefix, 'merged'))
# `s3url` previously ended up holding the validation URL; keep that binding
# for any cell that still refers to it.
s3url = val_s3url
print(train_s3url)
print(val_s3url)


s3://sagemaker-studio-us-east-1-752222400982/capstone2/merged/val3k.csv


In [9]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Resolve the container image URI of the built-in XGBoost algorithm
# (repository version 1.0-1) for the current session's region.
docker_image_name = get_image_uri(
    region_name=boto3.Session().region_name,
    repo_name='xgboost',
    repo_version='1.0-1',
)

In [14]:
# CSV input channels for training and validation. The URIs are built from the
# `bucket` and `prefix` computed earlier in the notebook instead of a
# hard-coded account/region, so they always point at the files this notebook
# uploaded under '<prefix>/merged/'.
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/merged/train3k.csv'.format(bucket, prefix),
                                    content_type='csv')
s3_input_test = sagemaker.s3_input(s3_data='s3://{}/{}/merged/val3k.csv'.format(bucket, prefix),
                                   content_type='csv')


In [15]:
# SageMaker SDK session handed to the estimators and analytics objects below.
smsess = sagemaker.session.Session()

# UTC timestamp suffix, so every run of this cell creates a uniquely named experiment.
create_date = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

coronavirus_xgb_experiment = Experiment.create(
    sagemaker_boto_client=boto3.client('sagemaker'),
    experiment_name="predicting-coronavirus-cases-xgboost-{}".format(create_date),
    description="Using xgboost to predict coronavirus cases",
)

In [16]:
# Fixed XGBoost hyperparameters for a single training trial.
# "objective" is deliberately left unset (an earlier 'binary:logistic'
# setting is disabled), so the algorithm's own default objective applies.
hyperparams = dict(
    max_depth=12,
    subsample=0.7,
    num_round=50,
    eta=0.1,
    gamma=4,
    min_child_weight=8,
    verbosity=0,
    alpha=1,
)

In [17]:
# Register a trial under the experiment; the UTC timestamp keeps its name unique.
trial = Trial.create(
    sagemaker_boto_client=boto3.client('sagemaker'),
    experiment_name=coronavirus_xgb_experiment.experiment_name,
    trial_name="algorithm-mode-trial-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())),
)

# Built-in XGBoost container on one ml.m4.xlarge spot instance.
# train_max_run / train_max_wait are time limits in seconds
# (per the SageMaker SDK Estimator documentation).
xgb = sagemaker.estimator.Estimator(
    image_name=docker_image_name,
    role=role,
    sagemaker_session=smsess,
    base_job_name="xgboost-coronavirus",
    hyperparameters=hyperparams,
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
    train_use_spot_instances=True,
    train_max_run=300,
    train_max_wait=600,
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

# Launch training with both data channels and record the job as a
# trial component of the trial created above.
xgb.fit(
    inputs={
        'train': s3_input_train,
        'validation': s3_input_test,
    },
    experiment_config={
        "ExperimentName": coronavirus_xgb_experiment.experiment_name,
        "TrialName": trial.trial_name,
        "TrialComponentDisplayName": "Training-new-data",
    },
)

INFO:sagemaker:Creating training-job with name: xgboost-coronavirus-2020-05-14-22-10-39-400


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.m4.xlarge for spot training job usage' is 20 Instances, with current utilization of 20 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.

In [18]:
# Fixed XGBoost hyperparameters for a shallower, more heavily subsampled trial.
# "objective" is deliberately left unset (an earlier 'binary:logistic'
# setting is disabled), so the algorithm's own default objective applies.
hyperparams = dict(
    max_depth=6,
    subsample=0.41,
    num_round=50,
    eta=0.1,
    gamma=4,
    min_child_weight=6,
    verbosity=0,
    alpha=1,
)

In [19]:
# New, uniquely named (UTC-timestamped) trial inside the current experiment.
trial = Trial.create(
    sagemaker_boto_client=boto3.client('sagemaker'),
    experiment_name=coronavirus_xgb_experiment.experiment_name,
    trial_name="algorithm-mode-trial-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())),
)

# Single-instance spot training job using the XGBoost container image and
# the `hyperparams` dict currently defined in the notebook.
xgb = sagemaker.estimator.Estimator(
    image_name=docker_image_name,
    role=role,
    sagemaker_session=smsess,
    base_job_name="xgboost-coronavirus",
    hyperparameters=hyperparams,
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
    train_use_spot_instances=True,
    train_max_run=300,
    train_max_wait=600,
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

# Fit on the train channel, evaluate on the validation channel, and attach
# the job to the trial under a descriptive display name.
xgb.fit(
    inputs={
        'train': s3_input_train,
        'validation': s3_input_test,
    },
    experiment_config={
        "ExperimentName": coronavirus_xgb_experiment.experiment_name,
        "TrialName": trial.trial_name,
        "TrialComponentDisplayName": "Training-6-6-o-3k",
    },
)

INFO:sagemaker:Creating training-job with name: xgboost-coronavirus-2020-05-14-16-32-26-234


2020-05-14 16:32:27 Starting - Starting the training job...
2020-05-14 16:32:29 Starting - Launching requested ML instances......
2020-05-14 16:33:37 Starting - Preparing the instances for training......
2020-05-14 16:34:56 Downloading - Downloading input data
2020-05-14 16:34:56 Training - Downloading the training image...
2020-05-14 16:35:28 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[16:35:30] 2284x7 matrix with 15988 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34mINFO:

In [13]:
# NOTE(review): this cell carries an out-of-sequence execution count and
# defines no hyperparameters of its own -- it trains with whatever
# `hyperparams` dict is live in the kernel when it runs.

# Fresh trial (UTC-timestamped name) under the current experiment.
trial = Trial.create(
    sagemaker_boto_client=boto3.client('sagemaker'),
    experiment_name=coronavirus_xgb_experiment.experiment_name,
    trial_name="algorithm-mode-trial-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())),
)

# XGBoost estimator on one ml.m4.xlarge spot instance, writing model
# artifacts below '<prefix>/output' in the notebook's bucket.
xgb = sagemaker.estimator.Estimator(
    image_name=docker_image_name,
    role=role,
    sagemaker_session=smsess,
    base_job_name="xgboost-coronavirus",
    hyperparameters=hyperparams,
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
    train_use_spot_instances=True,
    train_max_run=300,
    train_max_wait=600,
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

# Start the training job and link it to the trial created above.
xgb.fit(
    inputs={
        'train': s3_input_train,
        'validation': s3_input_test,
    },
    experiment_config={
        "ExperimentName": coronavirus_xgb_experiment.experiment_name,
        "TrialName": trial.trial_name,
        "TrialComponentDisplayName": "Training-6-6-o-3k",
    },
)

INFO:sagemaker:Creating training-job with name: xgboost-coronavirus-2020-05-14-16-11-17-276


2020-05-14 16:11:17 Starting - Starting the training job...
2020-05-14 16:11:19 Starting - Launching requested ML instances......
2020-05-14 16:12:28 Starting - Preparing the instances for training......
2020-05-14 16:13:23 Downloading - Downloading input data...
2020-05-14 16:14:01 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[16:14:23] 1586x7 matrix with 11102 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[16:14:23] 529x7 matrix with 370

In [73]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Trial record for this tuning run (UTC-timestamped, hence unique).
trial = Trial.create(
    sagemaker_boto_client=boto3.client('sagemaker'),
    experiment_name=coronavirus_xgb_experiment.experiment_name,
    trial_name="algorithm-mode-trial-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())),
)

# Base estimator for the tuner. No hyperparameters are passed here; the
# static ones are set right below and the rest are searched by the tuner.
xgb = sagemaker.estimator.Estimator(
    image_name=docker_image_name,
    role=role,
    sagemaker_session=smsess,
    base_job_name="xgboost-coronavirus",
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
    train_use_spot_instances=True,
    train_max_run=300,
    train_max_wait=600,
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

# Hyperparameters held constant across every tuning job.
xgb.set_hyperparameters(
    objective='reg:squarederror',
    num_round=30,
    eta=0.1,
    gamma=4,
    verbosity=3,
)

# Search space. `eta` is not tuned: it stays at the static 0.1 set above
# (a ContinuousParameter(0.03, 0.2) range for it is disabled).
hp_ranges = {
    "max_depth": IntegerParameter(1, 10),
    "subsample": ContinuousParameter(0.3, 0.8),
    "min_child_weight": IntegerParameter(2, 10),
    "alpha": ContinuousParameter(0, 2),
}

# Minimize validation RMSE over at most 40 training jobs, 8 at a time.
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name='validation:rmse',
    hyperparameter_ranges=hp_ranges,
    objective_type='Minimize',
    max_jobs=40,
    max_parallel_jobs=8,
)

# NOTE(review): confirm that HyperparameterTuner.fit in the installed SDK
# actually forwards `experiment_config` to the tuning job -- TODO verify.
tuner.fit(
    {
        'train': s3_input_train,
        'validation': s3_input_test,
    },
    experiment_config={
        "ExperimentName": coronavirus_xgb_experiment.experiment_name,
        "TrialName": trial.trial_name,
        "TrialComponentDisplayName": "HPO Job",
    },
)



INFO:root:_TuningJob.start_new!!!
INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-xgboost-200513-2044


In [91]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

# Analytics handle for the most recent tuning job started through `tuner`.
exp = HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name=tuner.latest_tuning_job.name,
    sagemaker_session=smsess,
)


In [92]:
# Load the tuning job's results into a DataFrame: one row per training job,
# with its sampled hyperparameters, status, FinalObjectiveValue and timings.
df = exp.dataframe()
# Bare last expression so the notebook renders the table.
df

Unnamed: 0,alpha,max_depth,min_child_weight,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,1.545182,6.0,10.0,0.798604,sagemaker-xgboost-200513-2044-040-10d5151c,Completed,247.352875,2020-05-13 21:02:32+00:00,2020-05-13 21:03:41+00:00,69.0
1,1.890641,5.0,8.0,0.74239,sagemaker-xgboost-200513-2044-039-b3e60a03,Completed,250.867279,2020-05-13 21:02:35+00:00,2020-05-13 21:03:45+00:00,70.0
2,1.998815,10.0,10.0,0.63711,sagemaker-xgboost-200513-2044-038-c43b83a6,Completed,259.645355,2020-05-13 21:02:07+00:00,2020-05-13 21:03:18+00:00,71.0
3,2.0,10.0,10.0,0.63211,sagemaker-xgboost-200513-2044-037-71d635c8,Completed,258.642578,2020-05-13 21:01:48+00:00,2020-05-13 21:02:48+00:00,60.0
4,1.78151,6.0,7.0,0.668233,sagemaker-xgboost-200513-2044-036-ed77d57d,Completed,253.777039,2020-05-13 21:01:37+00:00,2020-05-13 21:02:40+00:00,63.0
5,1.504654,6.0,7.0,0.743062,sagemaker-xgboost-200513-2044-035-84a7745e,Completed,254.547256,2020-05-13 21:02:04+00:00,2020-05-13 21:03:16+00:00,72.0
6,1.983658,6.0,7.0,0.768086,sagemaker-xgboost-200513-2044-034-7f330160,Completed,250.140045,2020-05-13 21:02:12+00:00,2020-05-13 21:03:19+00:00,67.0
7,0.314142,4.0,6.0,0.789845,sagemaker-xgboost-200513-2044-033-8ae0bebd,Completed,266.831818,2020-05-13 21:01:12+00:00,2020-05-13 21:02:24+00:00,72.0
8,0.109796,7.0,6.0,0.8,sagemaker-xgboost-200513-2044-032-76c2f047,Completed,260.272217,2020-05-13 20:59:08+00:00,2020-05-13 21:00:08+00:00,60.0
9,0.195245,1.0,2.0,0.787099,sagemaker-xgboost-200513-2044-031-1f169c80,Completed,312.82373,2020-05-13 20:58:32+00:00,2020-05-13 20:59:39+00:00,67.0


In [None]:
# The tuning objective is validation:rmse with objective_type='Minimize',
# so lower FinalObjectiveValue is better: sort ascending to list the
# best-performing training jobs first.
df.sort_values('FinalObjectiveValue', ascending=True)