1. Import libraries
2. Creaate S3 Bucket
3. Map training and test data in S3
4. Map the path of the models in S3

In [1]:
import sagemaker
import boto3
from sagemaker.session import Session
import os

In [2]:
bucket_name = "bankapp-1"

my_region = boto3.session.Session().region_name
print(my_region)

us-east-2


In [3]:
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-2':
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': 'us-east-2'})
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ', e)

S3 error:  An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [4]:
# set an output path where the trained model will be saved
prefix = 'xgboost-as-a-built-in-algo'
output_path = os.path.join("s3://", bucket_name, prefix, "output")
print(output_path)

s3://bankapp-1/xgboost-as-a-built-in-algo/output


#### Downloading sample data and Storing in S3

In [5]:
import pandas as pd
import urllib

In [6]:
# download from data online 
raw_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
try:
    urllib.request.urlretrieve(raw_url, "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ', e)
    
# load the data
try:
    df = pd.read_csv("./bank_clean.csv", index_col=0)
    print('Success: Data loaded into dataframe')
except Exception as e:
    print('Dataframe load error: ', e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe


In [7]:
df.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


#### Train Test Split

In [8]:
# Train Test Split 
# - Separate train and test data and store in S3 for future use
import numpy as np
train_data, test_data = np.split(df.sample(frac=1, random_state=100), \
                                 [int(0.7 * len(df))])
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


Note: To use Sagemaker, dependent variable has to be the first feature.

In [9]:
# Save Train and Test data into S3 buckets
# Train data

# move dependent variable as the first column
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)],
              axis=1)\
    .to_csv('train.csv', index=False, header=False)

In [10]:
# upload train data to S3
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train', 'train.csv'))\
               .upload_file('train.csv')
# create a path to get the data from the S3 bucket
s3_train_data_path = os.path.join('s3://', bucket_name, prefix, 'train')
s3_input_train = sagemaker.TrainingInput(s3_data=s3_train_data_path, content_type='csv')

In [11]:
# Do the same for test data

pd.concat([test_data['y_yes'], test_data.drop(['y_yes', 'y_no'], axis=1)], axis=1)\
            .to_csv('test.csv', index=False, header=False)

In [12]:
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test', 'test.csv'))\
     .upload_file('train.csv')

s3_test_data_path = os.path.join('s3://', bucket_name, prefix, 'test')
s3_input_test = sagemaker.TrainingInput(s3_data=s3_test_data_path, content_type='csv')

#### Building and Training Xgbost model - Inbuilt Algorithm

In [13]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container
# specify the repo_version depnding on your preference.
container = sagemaker.image_uris.retrieve('xgboost', 
                                          boto3.Session().region_name, 
                                          version='1.3-1')

In [14]:
# initialize hyperparameters


hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":"50"
        }


In [15]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=output_path,
                                         use_spot_instances=True,
                                         max_run=300,
                                         max_wait=600
                                         )

In [16]:
estimator.fit({'train': s3_input_train, 'validation': s3_input_test})

2022-01-06 19:59:02 Starting - Starting the training job...
2022-01-06 19:59:25 Starting - Launching requested ML instancesProfilerReport-1641499142: InProgress
...
2022-01-06 19:59:52 Starting - Preparing the instances for training.........
2022-01-06 20:01:26 Downloading - Downloading input data...
2022-01-06 20:02:02 Training - Training image download completed. Training in progress.
2022-01-06 20:02:02 Uploading - Uploading generated training model.[34m[2022-01-06 20:01:58.814 ip-10-0-178-238.us-east-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2022-01-06:20:01:58:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2022-01-06:20:01:58:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2022-01-06:20:01:58:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-01-06:20:01:58:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2022-

#### Deploy Machine Learning Model

In [17]:
xgb_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

------!

#### Prediction of Test Data

In [24]:
from sagemaker.serializers import CSVSerializer

test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values
xgb_predictor.serializer = CSVSerializer()
predictions = xgb_predictor.predict(test_data_array, initial_args={'ContentType': 'text/csv'}).decode('utf-8')
predictions_array = np.fromstring(predictions[1:], sep=',')
print(predictions_array.shape)

(12357,)


In [25]:
predictions_array

array([0.33076534, 0.32295951, 0.09651386, ..., 0.04343579, 0.06987029,
       0.42867687])

In [26]:
confusion_matrix = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), 
                              rownames=['Observed'], colnames=['Predicted'])

true_n = confusion_matrix.iloc[0, 0]
false_n = confusion_matrix.iloc[1, 0]
true_p = confusion_matrix.iloc[1, 1]
false_p = confusion_matrix.iloc[0, 1]
accuracy = (true_p + true_n) / (true_p + true_n + false_p + false_n) *100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", accuracy))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", true_n/(true_n+false_n)*100, true_n, false_p/(true_p+false_p)*100, false_p))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", false_n/(true_n+false_n)*100, false_n, true_p/(true_p+false_p)*100, true_p))



Overall Classification Rate: 89.9%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10821)    40% (193)
Purchase        9% (1056)     60% (287) 



#### Deleting The Endpoints

In [32]:
estimator.delete_endpoint()
# sagemaker.predictor.Predictor.delete_endpoint(xgb_predictor.endpoint)
boto3.resource('s3').Bucket(bucket_name).objects.all().delete()

The function delete_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


[{'ResponseMetadata': {'RequestId': '31EF48KRQGY67ZAY',
   'HostId': 'v5VvjxPPugYjpFVVN7M7FDiWXGZrZj0RBlmTpTLANM3JxgJkYwb5XZGMNSf2/m25YHhywcEmMh8=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'v5VvjxPPugYjpFVVN7M7FDiWXGZrZj0RBlmTpTLANM3JxgJkYwb5XZGMNSf2/m25YHhywcEmMh8=',
    'x-amz-request-id': '31EF48KRQGY67ZAY',
    'date': 'Thu, 06 Jan 2022 20:37:41 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-01-06-19-59-02-435/rule-output/ProfilerReport-1641499142/profiler-output/profiler-reports/GPUMemoryIncrease.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-01-06-03-09-15-338/profiler-output/system/training_job_end.ts'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2022-01-06-19-57-35-877/rule-output/ProfilerReport-1641499055/profiler-outpu