## Creating S3 Buckets

In [1]:
import sagemaker
import boto3
#from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.session import s3_input, Session
import numpy as np                                
import pandas as pd
import urllib
import os

In [2]:
my_region = boto3.session.Session().region_name # This gives you your region

bucket_name = 'tutorialbucke-demo'
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    else: 
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully') # this is how you explicitly add the location constraint
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


In [3]:
# stores the model in this defined path
prefix = 'xgboost-sagemaker'
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://tutorialbucke-demo/xgboost-sagemaker/output


## Loading Data into S3

We will first divide our data into train and test. Then we will load it into S3.
An important step to keep in mind while using SageMaker is that, the in-built algorithms in the SageMaker expects the dependent feature to be the first column of our dataset. So if your dataset’s first column is not that of the dependent feature, make sure that you change it.

In [4]:
try:
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)

try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [5]:
model_data.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [6]:
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [7]:
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)

boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [8]:
pd.concat([test_data['y_yes'], test_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('test.csv', index=False, header=False)

boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

## Retrieving the XGBoost image container

In [9]:
#container = get_image_uri(boto3.Session().region_name,'xgboost', repo_version='latest')
container = image_uris.retrieve('xgboost', boto3.Session().region_name,'latest')

In [10]:
hyperparameters = {'max_depth':'5',
                   'eta':'0.2',
                   'gamma':'4',
                   'min_child_weight':'6',
                   'subsample':'0.7',
                   'objective':'binary:logistic',
                   'num_round':50
                  }

In [11]:
estimator = sagemaker.estimator.Estimator(image_uri = container,
                                         hyperparameters = hyperparameters,
                                         role = sagemaker.get_execution_role(),
                                         train_instance_count = 1,
                                         train_instance_type = 'ml.m4.xlarge',
                                         train_volume_size = 5,
                                         output_path = output_path,
                                         train_use_spot_instance = True,
                                            )

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
estimator.fit({'train':s3_input_train, 'validation':s3_input_test})

2021-07-16 14:18:36 Starting - Starting the training job...
2021-07-16 14:18:59 Starting - Launching requested ML instancesProfilerReport-1626445116: InProgress
......
2021-07-16 14:19:59 Starting - Preparing the instances for training......
2021-07-16 14:21:07 Downloading - Downloading input data...
2021-07-16 14:21:19 Training - Downloading the training image..

In [None]:
# import boto3
# import sagemaker
# from sagemaker.xgboost.estimator import XGBoost
# from sagemaker.session import Session
# from sagemaker.inputs import TrainingInput
# # initialize hyperparameters
# hyperparameters = {
#         "max_depth":"5",
#         "eta":"0.2",
#         "gamma":"4",
#         "min_child_weight":"6",
#         "subsample":"0.7",
#         "verbosity":"1",
#         "objective":"reg:linear",
#         "num_round":"50"}

# # set an output path where the trained model will be saved
# bucket = sagemaker.Session().default_bucket()
# prefix = 'DEMO-xgboost-as-a-framework'
# output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'abalone-xgb-framework')

# # construct a SageMaker XGBoost estimator
# # specify the entry_point to your xgboost training script
# estimator = XGBoost(entry_point = "your_xgboost_abalone_script.py", 
#                     framework_version='1.2-2',
#                     hyperparameters=hyperparameters,
#                     role=sagemaker.get_execution_role(),
#                     instance_count=1,
#                     instance_type='ml.m5.2xlarge',
#                     output_path=output_path)

# # define the data type and paths to the training and validation datasets
# content_type = "libsvm"
# train_input = TrainingInput("s3://{}/{}/{}/".format(bucket, prefix, 'train'), content_type=content_type)
# validation_input = TrainingInput("s3://{}/{}/{}/".format(bucket, prefix, 'validation'), content_type=content_type)

# # execute the XGBoost training job
# estimator.fit({'train': train_input, 'validation': validation_input})

## Deployment

In [None]:
xgb_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

## Prediction

In [None]:
from sagemaker.serializers import CSVSerializer

test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array
xgb_predictor.serializer = CSVSerializer() # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

## Evaluation

In [None]:
cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))

## Clean Up

In [None]:
xgb_predictor.delete_endpoint()
#xgb_predictor.delete_model()

In [None]:
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()