In [2]:
'''Importing Important Libraries
Steps To Be Followed
Importing necessary Libraries
Creating S3 bucket
Mapping train And Test Data in S3
Mapping The path of the models in S3'''

import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session

In [19]:
# Name of the S3 bucket used for the training data and the model artifacts.
bucket_name = 'jibinbanks3'

# Region configured for the current boto3 session; the bucket is created there.
boto_session = boto3.session.Session()
region = boto_session.region_name
print('My region : ', region)

My region :  us-east-2


In [20]:
# Create the S3 bucket in the session's region.
# Best-effort: any failure (e.g. the bucket already exists) is reported, not raised.
s3 = boto3.resource('s3')
try:
    if region == 'us-east-1':
        # us-east-1 is the S3 default region and must not be passed as a
        # LocationConstraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly; use the detected
        # region instead of a hard-coded one so the success message below is
        # only printed when a bucket was actually requested.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region},
        )
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


In [21]:
# Key prefix under which every object of this experiment is stored in the bucket.
prefix = 'xgboost-as-a-built-in-algo'

# S3 location handed to the estimator as the destination for the trained model.
output_path = f's3://{bucket_name}/{prefix}/output'

In [23]:
#Mapping Train and Test Data in S3
import pandas as pd
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so the call below would only work if some other library
# happened to import it first.
import urllib.request
import numpy as np

# Public copy of the cleaned (one-hot encoded) bank marketing dataset.
DATA_URL = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"

# Step 1: download the CSV next to the notebook (best-effort, errors are printed).
try:
    urllib.request.urlretrieve(DATA_URL, "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    # Distinct message so a download failure is not confused with a parse failure.
    print('Data download error: ',e)

# Step 2: read the local CSV; the first column of the file is used as the index.
try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [25]:
# In SageMaker we do not need separate X_train / X_test / y_train / y_test
# objects as in a typical ML workflow; the data is only split into a train
# set and a test set.
# The dataset is already one-hot encoded. The target feature is y_yes, so the
# redundant y_no column is dropped.
# According to the AWS documents, the target (dependent) variable has to be
# moved to the first column.
# After reshaping, each split is written to a local CSV (no index, no header)
# and uploaded to our S3 bucket.

# Train Test split: shuffle reproducibly, then cut at 70 %.
# Plain .iloc slicing is used instead of np.split, which on a DataFrame goes
# through the deprecated DataFrame.swapaxes.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(model_data))
train_data = shuffled_data.iloc[:split_at]
test_data = shuffled_data.iloc[split_at:]
print(train_data.shape, test_data.shape)


def _upload_channel(split_df, channel):
    """Write one data split as a CSV, upload it to S3 and describe it as a training input.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Split containing the 'y_yes' and 'y_no' columns plus the features.
    channel : str
        Channel name ('train' or 'test'); used for the local file name
        '<channel>.csv' and for the S3 folder '<prefix>/<channel>/'.

    Returns
    -------
    sagemaker.TrainingInput
        CSV input pointing at 's3://<bucket_name>/<prefix>/<channel>'.
    """
    local_file = '{}.csv'.format(channel)
    features = split_df.drop(['y_no', 'y_yes'], axis=1)
    # Target first, then the features.
    pd.concat([split_df['y_yes'], features], axis=1).to_csv(local_file, index=False, header=False)
    # S3 keys always use '/', so build the key explicitly instead of with
    # os.path.join (which is OS dependent, and `os` is not imported here).
    s3_key = '{}/{}/{}'.format(prefix, channel, local_file)
    boto3.Session().resource('s3').Bucket(bucket_name).Object(s3_key).upload_file(local_file)
    return sagemaker.TrainingInput(
        s3_data='s3://{}/{}/{}'.format(bucket_name, prefix, channel),
        content_type='csv',
    )


### Saving Train And Test Into Buckets
s3_input_train = _upload_channel(train_data, 'train')
s3_input_test = _upload_channel(test_data, 'test')

(28831, 61) (12357, 61)


In [26]:
# Mapping the path of the model image:
# look up the URI of the built-in XGBoost container image for the session's
# region. Change `version` to select a different XGBoost release.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version="1.2-1",
)

In [27]:
# Hyperparameters passed to the XGBoost training job.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",  # the target is a binary class
    num_round=50,
)

In [30]:
# Configure the SageMaker estimator that runs the XGBoost container:
# one ml.m5.2xlarge spot instance, with the model written to `output_path`.
estimator_settings = dict(
    image_uri=container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_type='ml.m5.2xlarge',
    instance_count=1,
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)
estimator = sagemaker.estimator.Estimator(**estimator_settings)

In [31]:
# Start the training job: the train split feeds the 'train' channel and the
# test split is supplied as the 'validation' channel.
data_channels = {'train': s3_input_train, 'validation': s3_input_test}
estimator.fit(data_channels)

2022-06-18 11:59:55 Starting - Starting the training job...
2022-06-18 11:59:57 Starting - Launching requested ML instancesProfilerReport-1655553595: InProgress
......
2022-06-18 12:01:25 Starting - Preparing the instances for training......
2022-06-18 12:02:24 Downloading - Downloading input data
2022-06-18 12:02:24 Training - Downloading the training image.....[34m[2022-06-18 12:03:02.882 ip-10-0-101-234.us-east-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delim

In [32]:
# Deploy the trained estimator to a hosted endpoint (a single ml.m4.xlarge
# instance); the returned predictor is used to score the test data.
xgb_predictor = estimator.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)


-------!

In [33]:
# Score the test split through the deployed endpoint.
from sagemaker.serializers import CSVSerializer

# Features only: both one-hot target columns are removed before inference.
test_data_array = test_data.drop(columns=['y_no', 'y_yes']).values

# Rows are sent to the endpoint serialized as CSV.
xgb_predictor.serializer = CSVSerializer()
raw_response = xgb_predictor.predict(test_data_array)
predictions = raw_response.decode('utf-8')

# Parse the comma-separated response into a float array.
# NOTE(review): the [1:] slice discards the first character of the response
# before parsing - confirm that character is never a significant digit.
predictions_array = np.fromstring(predictions[1:], sep=',')
print(predictions_array.shape)

(12357,)


In [34]:
# Display the parsed prediction scores (NumPy abbreviates long arrays in the repr).
predictions_array

array([0.05214286, 0.05660191, 0.05096195, ..., 0.03436061, 0.02942475,
       0.03715819])

In [36]:
'''It returns values between 0–1, thus we have to round it to either 0 or 1 
(with default is 0.5 as threshold) and then compare it to target column test_data[‘y_yes’]'''

# Cross-tabulate observed labels (rows) against rounded predictions (columns).
observed_labels = test_data['y_yes']
predicted_labels = np.round(predictions_array)
cm = pd.crosstab(
    index=observed_labels,
    columns=predicted_labels,
    rownames=['Observed'],
    colnames=['Predicted'],
)

In [40]:
# Unpack the confusion matrix by position: first row/column vs second row/column.
# NOTE(review): assumes cm is 2x2, i.e. both classes occur among the observed
# and the predicted labels - confirm before reusing with another model.
tn, fp = cm.iloc[0, 0], cm.iloc[0, 1]
fn, tp = cm.iloc[1, 0], cm.iloc[1, 1]

# Overall classification rate in percent: correct predictions over all predictions.
n_correct = tp + tn
n_total = tp + tn + fp + fn
p = n_correct / n_total * 100

In [41]:
# Column totals: how many rows were predicted as each class. The percentages
# in the table below are relative to these totals.
predicted_no = tn + fn
predicted_yes = tp + fp

overall_line = "\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p)
header_line = "{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase")
no_purchase_row = "{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
    "No Purchase", tn / predicted_no * 100, tn, fp / predicted_yes * 100, fp
)
purchase_row = "{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
    "Purchase", fn / predicted_no * 100, fn, tp / predicted_yes * 100, tp
)

print(overall_line)
print(header_line)
print("Observed")
# print the confusion matrix rows
print(no_purchase_row)
print(purchase_row)


Overall Classification Rate: 89.7%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10785)    34% (151)
Purchase        9% (1124)     66% (297) 



In [None]:
# The confusion matrix shows an overall classification accuracy of 89.7%.

In [42]:
# Clean up to avoid potential billing costs: delete the hosted endpoint and
# remove every object from the S3 bucket. The bucket itself and the notebook
# instance are not deleted by this cell.
# `endpoint_name` replaces the `endpoint` attribute, which was renamed in
# sagemaker>=2 and only survives as a deprecated alias.
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint_name)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


[{'ResponseMetadata': {'RequestId': 'TH871S7Q1FBXBTWA',
   'HostId': '+XoOTOjAtcKIxV6upJCtYebJuQl+UtcbmRB2N/OT6XQeDGvoB7VZh26UhCVeFpLfaKqnyiBBwhc=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': '+XoOTOjAtcKIxV6upJCtYebJuQl+UtcbmRB2N/OT6XQeDGvoB7VZh26UhCVeFpLfaKqnyiBBwhc=',
    'x-amz-request-id': 'TH871S7Q1FBXBTWA',
    'date': 'Sat, 18 Jun 2022 12:25:45 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/train/train.csv'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-06-18-11-59-55-395/rule-output/ProfilerReport-1655553595/profiler-output/profiler-reports/LoadBalancing.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-06-18-11-59-55-395/rule-output/ProfilerReport-1655553595/profiler-output/profiler-reports/StepOutlier.json'},
   {'Key': 'xgboost-as-a-built-in-algo/ou