In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role
# Execution role of this notebook; it is handed to the estimator further down.
role = get_execution_role()
# Key prefix (inside the bucket) under which training data and model output are stored.
prefix = 'sagemaker/demo-xgboost'
# each region has its XGBoost container
containers = {'eu-central-1': '813361260812.dkr.ecr.eu-central-1.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}
my_region = boto3.session.Session().region_name # set the region of the instance

# Fail fast with an actionable message: without this check an unlisted region
# surfaces as a bare KeyError (or a TypeError when no region is configured at all).
if my_region not in containers:
    raise KeyError("No XGBoost container is configured for region {!r}; add it to `containers`. "
                   "Known regions: {}".format(my_region, ', '.join(sorted(containers))))

print("Great! - your SageMaker Instance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint to make inference requests.")

Great! - your SageMaker Instance is in the eu-central-1 region. You will use the 813361260812.dkr.ecr.eu-central-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint to make inference requests.


In [2]:
# Fetch the census CSV (derived from the publicly available census data in the
# UCI Machine Learning Repository) from your own S3 bucket and parse it with pandas.
from io import StringIO

bucket_name = 'raz-eu-central-1-tutorial' # place the adult_census.csv file in a bucket in your account
object_key = 'adult_census.csv'
s3 = boto3.resource('s3')

# Pull the whole object into memory, decode it as UTF-8 text, then let pandas parse it.
csv_obj = s3.Object(bucket_name, object_key)
csv_bytes = csv_obj.get()['Body'].read()
csv_string = csv_bytes.decode('utf-8')

# NOTE(review): the preview below lists an 'Unnamed: 0' header. If that is a saved
# row index being read back as a regular column, it will end up as a model feature;
# confirm against the file and consider index_col=0.
raw_data = pd.read_csv(StringIO(csv_string))
raw_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y_no,y_yes
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0,1


In [3]:
# One-hot encode the string-valued columns (workclass, education, ...); the numeric
# columns are carried over unchanged.
# dtype is pinned to the historical default (uint8). pandas 2.0 changed the default
# dummy dtype to bool, which to_csv would write as True/False instead of 0/1 when the
# training file is produced later in this notebook.
model_data = pd.get_dummies(raw_data, dtype=np.uint8)
model_data.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,y_no,y_yes,workclass_ 0,workclass_ Federal-gov,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,77516,13,2174,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Randomize the data and split it between train and test datasets on a 70% 30% split respectively
TRAIN_FRACTION = 0.7   # share of rows that goes to the training set
SPLIT_SEED = 1729      # fixed seed so the shuffle (and therefore the split) is reproducible

shuffled_data = model_data.sample(frac=1, random_state=SPLIT_SEED)
split_at = int(TRAIN_FRACTION * len(shuffled_data))

# Positional slices give the same two frames np.split produced, without going through
# DataFrame.swapaxes, which np.split relies on and which newer pandas deprecates.
train_data = shuffled_data.iloc[:split_at]
test_data = shuffled_data.iloc[split_at:]
print(train_data.shape, test_data.shape)

(22792, 110) (9768, 110)


In [7]:
# Reformat the header and first column of the training data,
# save the new train dataset to your S3 bucket as train.csv and load the data from the S3 bucket

# Label (y_yes) goes first, followed by every feature column; no header row and no
# index column are written.
train_features = train_data.drop(['y_no', 'y_yes'], axis=1)
train_csv = pd.concat([train_data['y_yes'], train_features], axis=1)
train_csv.to_csv('train.csv', index=False, header=False)

# S3 keys are always '/'-separated, so build the key explicitly rather than with
# os.path.join, which uses the local OS path separator.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the S3 prefix that now holds train.csv.
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [8]:
# Open a SageMaker session, build an Estimator around the region's XGBoost
# container image, and attach the model's hyperparameters.
session_sm = sagemaker.Session()

# Model artifacts produced by training are written under this S3 location.
output_location = 's3://{}/{}/output'.format(bucket_name, prefix)

xgb = sagemaker.estimator.Estimator(
    containers[my_region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=session_sm,
)

# NOTE(review): rate_drop and tweedie_variance_power look like settings for the dart
# booster / a tweedie objective; confirm whether they have any effect with
# objective='binary:logistic'.
hyperparameters = dict(
    alpha=1.436043930551528,
    rate_drop=0.3,
    tweedie_variance_power=1.4,
    max_depth=3,
    eta=0.7750488867396172,
    min_child_weight=1.4300264673838223,
    objective='binary:logistic',
    num_round=100,
)
xgb.set_hyperparameters(**hyperparameters)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [9]:
# With the data uploaded and the estimator configured, launch the training job.
# Only a 'train' channel is supplied; no validation channel is passed.
training_channels = {'train': s3_input_train}
xgb.fit(training_channels)

2020-11-13 14:45:47 Starting - Starting the training job...
2020-11-13 14:45:50 Starting - Launching requested ML instances......
2020-11-13 14:46:53 Starting - Preparing the instances for training......
2020-11-13 14:48:13 Downloading - Downloading input data
2020-11-13 14:48:13 Training - Downloading the training image...
2020-11-13 14:48:32 Training - Training image download completed. Training in progress.
Arguments: train
[2020-11-13:14:48:33:INFO] Running standalone xgboost training.
[2020-11-13:14:48:33:INFO] Path /opt/ml/input/data/validation does not exist!
[2020-11-13:14:48:33:INFO] File size need to be processed in the node: 4.91mb. Available memory size in the node: 8477.6mb
[2020-11-13:14:48:33:INFO] Determined delimiter of CSV input is ','
[14:48:33] S3DistributionType set as FullyReplicated
[14:48:33] 22792x108 matrix with 2461536 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,

In [10]:
# Host the trained model behind a real-time endpoint and keep the returned
# predictor object for sending inference requests.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!

In [11]:
# Predict whether census participants in the test dataset earned more than 50K
# Feature matrix only: both label columns are removed before sending rows to the endpoint.
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!

# Remove only surrounding whitespace/brackets before parsing. Blindly slicing off the
# first character would corrupt the first score whenever it does not start with '0'
# (e.g. '1.2e-05' would become '.2e-05').
scores_text = predictions.strip().strip('[]')
predictions_array = np.fromstring(scores_text, sep=',') # and turn the prediction into an array
print(predictions_array.shape)

(9768,)


In [12]:
# Evaluate the performance and accuracy of the model
# Scores are rounded to hard 0/1 labels; both sides are cast to int so the
# confusion-matrix labels are plain 0/1 and can be matched by the reindex below.
observed_labels = test_data['y_yes'].astype(int)
predicted_labels = np.round(predictions_array).astype(int)
cm = pd.crosstab(index=observed_labels, columns=predicted_labels, rownames=['Observed'], colnames=['Predicted'])

# Force a full 2x2 layout. If the model predicts a single class (or the test set
# holds a single class) crosstab returns fewer rows/columns and the positional
# lookups below would raise IndexError; missing cells are filled with 0 instead.
cm = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

# Rows are observed labels, columns are predicted labels.
tn = cm.iloc[0, 0]
fn = cm.iloc[1, 0]
tp = cm.iloc[1, 1]
fp = cm.iloc[0, 1]
p = (tp + tn) / (tp + tn + fp + fn) * 100  # overall accuracy in percent

# NOTE(review): the y_yes == 0 row is labelled "Under 50K" and the y_yes == 1 row
# "Over 50K"; confirm this matches how y_yes is encoded in adult_census.csv.
# The percentages are column-wise: each cell's share of that *predicted* class.
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "Under 50K", "Over 50K"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Under 50K", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Over 50K", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 87.1%

Predicted      Under 50K      Over 50K
Observed
Under 50K      78% (1575)    11% (825)
Over 50K        22% (439)     89% (6929) 



In [None]:
# Clean-up: delete the hosted endpoint once you are done with it.
endpoint_name = xgb_predictor.endpoint
sagemaker.Session().delete_endpoint(endpoint_name)