In [1]:
import math

import boto3
import sagemaker
from sagemaker.serializers import LibSVMSerializer

In [2]:
# SageMaker session; used in this notebook for the default bucket and the S3 uploads.
sess = sagemaker.Session()

# IAM role that SageMaker uses to access AWS resources (S3, CloudWatch) on your behalf.
role = sagemaker.get_execution_role()
print(role)

# The session's default bucket. Replace with your own bucket name if needed.
bucket = sess.default_bucket()
print(bucket)

# Key prefix under which the data is stored. Replace it if needed.
prefix = "xgboost/abalone"

arn:aws:iam::962148432162:role/service-role/AmazonSageMaker-ExecutionRole-20201126T084214
sagemaker-us-east-1-962148432162


In [3]:
! aws s3 cp s3://aws-mls-c01/sagemaker/xgboost/abalone.test ./data/
! aws s3 cp s3://aws-mls-c01/sagemaker/xgboost/abalone.train ./data/
! aws s3 cp s3://aws-mls-c01/sagemaker/xgboost/abalone.validation ./data/

download: s3://aws-mls-c01/sagemaker/xgboost/abalone.test to data/abalone.test
download: s3://aws-mls-c01/sagemaker/xgboost/abalone.train to data/abalone.train
download: s3://aws-mls-c01/sagemaker/xgboost/abalone.validation to data/abalone.validation


In [4]:
# Region of the default boto3 session; passed to the image lookup in the next cell.
region_name = boto3.session.Session().region_name

In [5]:
# Look up the URI of the built-in XGBoost training image for this region.
# NOTE(review): "latest" is an alias, not a pinned version; consider pinning an
# explicit version for reproducibility (confirm which image it resolves to).
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region_name,
    version="latest",
)

In [6]:
# S3 key prefixes for the two input channels.
train_channel = f"{prefix}/train"
validation_channel = f"{prefix}/validation"

# Push the local training and validation files to the bucket under their channel prefix.
sess.upload_data(
    path="./data/abalone.train",
    bucket=bucket,
    key_prefix=train_channel,
)
sess.upload_data(
    path="./data/abalone.validation",
    bucket=bucket,
    key_prefix=validation_channel,
)

# S3 prefix URIs that the training job reads each channel from.
s3_train_data = f"s3://{bucket}/{train_channel}"
s3_validation_data = f"s3://{bucket}/{validation_channel}"

In [7]:
# S3 location passed to the estimator as its output_path.
s3_output_location = f"s3://{bucket}/{prefix}/output"

In [8]:
# Hyperparameters forwarded to the XGBoost container; every value is passed as a string.
xgb_hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:linear",
    "num_round": "50",
    "verbosity": "2",
}

# Training job definition: a single ml.m5.2xlarge instance reading its input in
# File mode, with model artifacts written under s3_output_location.
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    volume_size=5,
    max_run=3600,
    input_mode="File",
    output_path=s3_output_location,
    hyperparameters=xgb_hyperparameters,
)

In [9]:
# Both channels share the same settings: libsvm content, read from an S3 prefix,
# fully replicated to the training instance.
channel_options = {
    "distribution": "FullyReplicated",
    "content_type": "libsvm",
    "s3_data_type": "S3Prefix",
}

train_data = sagemaker.inputs.TrainingInput(s3_data=s3_train_data, **channel_options)
validation_data = sagemaker.inputs.TrainingInput(s3_data=s3_validation_data, **channel_options)

# Channel name -> input definition, as handed to Estimator.fit().
data_channels = {"train": train_data, "validation": validation_data}

In [10]:
%%time
xgb_model.fit(inputs=data_channels, logs=True)

2021-10-16 04:59:17 Starting - Starting the training job...
2021-10-16 04:59:19 Starting - Launching requested ML instancesProfilerReport-1634360357: InProgress
...
2021-10-16 05:00:16 Starting - Preparing the instances for training............
2021-10-16 05:02:08 Downloading - Downloading input data
2021-10-16 05:02:08 Training - Downloading the training image.[34mArguments: train[0m
[34m[2021-10-16:05:02:20:INFO] Running standalone xgboost training.[0m
[34m[2021-10-16:05:02:20:INFO] File size need to be processed in the node: 0.21mb. Available memory size in the node: 23781.52mb[0m
[34m[05:02:20] S3DistributionType set as FullyReplicated[0m
[34m[05:02:20] 2923x9 matrix with 23384 entries loaded from /opt/ml/input/data/train[0m
[34m[05:02:20] S3DistributionType set as FullyReplicated[0m
[34m[05:02:20] 626x9 matrix with 5008 entries loaded from /opt/ml/input/data/validation[0m
[34m[05:02:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 2 prune


2021-10-16 05:02:49 Uploading - Uploading generated training model
2021-10-16 05:02:49 Completed - Training job completed
Training seconds: 41
Billable seconds: 41
CPU times: user 385 ms, sys: 34.9 ms, total: 420 ms
Wall time: 3min 46s


In [11]:
%%time

from sagemaker.serializers import LibSVMSerializer

xgboost_classifier = xgb_model.deploy(
    initial_instance_count=1, instance_type="ml.m5.xlarge", serializer=LibSVMSerializer()
)

-----------!CPU times: user 174 ms, sys: 10.4 ms, total: 184 ms
Wall time: 5min 32s


In [12]:
# Load the held-out test records, one libsvm-formatted line per list entry.
# The context manager closes the file handle as soon as the lines are read
# (the handle was previously left open for the rest of the kernel session).
# Blank lines are skipped so that no empty payload is ever sent for inference.
with open("./data/abalone.test", "r") as test_file:
    test_list = [line for line in test_file if line.strip()]

In [13]:
def print_pred(payload):
    """Score one libsvm-formatted record and print its label next to the prediction.

    Args:
        payload: A single line of the test file. Its first whitespace-separated
            token is taken as the true label; the whole line is sent to the
            endpoint unchanged.

    Prints:
        One tab-separated line with the label, the prediction rounded up to the
        next integer, and the raw model output in parentheses.
    """
    # The endpoint replies with bytes holding the predicted value as text.
    raw_result = xgboost_classifier.predict(payload).decode("utf-8")

    # Round the continuous prediction up to a whole number.
    result = math.ceil(float(raw_result))

    # The label is the first field of the libsvm line.
    fields = payload.strip(" ").split()
    label = fields[0]

    print(f"Label: {label}\tPrediction: {result}\t({raw_result})")
    

# Send every test record to the endpoint, one request per record, and print each result.
for record in test_list:
    print_pred(record)

Label: 12	Prediction: 13	(12.576863288879395)
Label: 9	Prediction: 10	(9.881879806518555)
Label: 7	Prediction: 7	(6.384933948516846)
Label: 9	Prediction: 10	(9.381988525390625)
Label: 9	Prediction: 9	(8.309389114379883)
Label: 8	Prediction: 7	(6.6915411949157715)
Label: 11	Prediction: 10	(9.264090538024902)
Label: 10	Prediction: 10	(9.276705741882324)
Label: 8	Prediction: 9	(8.248900413513184)
Label: 9	Prediction: 11	(10.2130765914917)
Label: 11	Prediction: 11	(10.908291816711426)
Label: 10	Prediction: 12	(11.518393516540527)
Label: 13	Prediction: 12	(11.912405967712402)
Label: 8	Prediction: 9	(8.330268859863281)
Label: 6	Prediction: 9	(8.289459228515625)
Label: 5	Prediction: 7	(6.861783027648926)
Label: 10	Prediction: 13	(12.219161033630371)
Label: 11	Prediction: 12	(11.193463325500488)
Label: 8	Prediction: 8	(7.889063358306885)
Label: 23	Prediction: 16	(15.704513549804688)
Label: 9	Prediction: 9	(8.505152702331543)
Label: 11	Prediction: 11	(10.511245727539062)
Label: 13	Prediction: 1

Label: 10	Prediction: 11	(10.077371597290039)
Label: 8	Prediction: 9	(8.910690307617188)
Label: 8	Prediction: 9	(8.405914306640625)
Label: 6	Prediction: 7	(6.456258296966553)
Label: 8	Prediction: 10	(9.785928726196289)
Label: 18	Prediction: 12	(11.146017074584961)
Label: 8	Prediction: 8	(7.7582316398620605)
Label: 10	Prediction: 14	(13.092689514160156)
Label: 10	Prediction: 10	(9.030548095703125)
Label: 11	Prediction: 10	(9.654655456542969)
Label: 9	Prediction: 10	(9.141240119934082)
Label: 9	Prediction: 11	(10.289562225341797)
Label: 12	Prediction: 11	(10.670577049255371)
Label: 8	Prediction: 9	(8.46324348449707)
Label: 12	Prediction: 13	(12.898691177368164)
Label: 11	Prediction: 15	(14.20477294921875)
Label: 9	Prediction: 11	(10.965547561645508)
Label: 16	Prediction: 16	(15.588629722595215)
Label: 9	Prediction: 9	(8.897963523864746)
Label: 5	Prediction: 4	(3.744978904724121)
Label: 8	Prediction: 9	(8.151074409484863)
Label: 8	Prediction: 7	(6.7047953605651855)
Label: 8	Prediction: 11

Label: 9	Prediction: 11	(10.259461402893066)
Label: 13	Prediction: 10	(9.775040626525879)
Label: 7	Prediction: 8	(7.5151519775390625)
Label: 11	Prediction: 10	(9.190499305725098)
Label: 10	Prediction: 11	(10.911022186279297)
Label: 9	Prediction: 13	(12.119057655334473)
Label: 15	Prediction: 13	(12.162152290344238)
Label: 10	Prediction: 12	(11.51169204711914)
Label: 8	Prediction: 9	(8.71969223022461)
Label: 9	Prediction: 9	(8.731522560119629)
Label: 3	Prediction: 5	(4.964925765991211)
Label: 10	Prediction: 10	(9.733503341674805)
Label: 11	Prediction: 11	(10.41551685333252)
Label: 11	Prediction: 12	(11.248092651367188)
Label: 10	Prediction: 10	(9.847087860107422)
Label: 10	Prediction: 12	(11.017168998718262)
Label: 10	Prediction: 10	(9.672565460205078)
Label: 11	Prediction: 13	(12.146878242492676)
Label: 9	Prediction: 7	(6.84602689743042)
Label: 9	Prediction: 11	(10.252008438110352)
Label: 17	Prediction: 15	(14.268035888671875)
Label: 4	Prediction: 5	(4.708862781524658)
Label: 15	Predict

Label: 9	Prediction: 9	(8.765152931213379)
Label: 8	Prediction: 9	(8.01453685760498)
Label: 13	Prediction: 9	(8.99057388305664)
Label: 8	Prediction: 9	(8.36758041381836)
Label: 9	Prediction: 11	(10.254919052124023)
Label: 16	Prediction: 16	(15.729795455932617)
Label: 11	Prediction: 12	(11.293742179870605)
Label: 19	Prediction: 13	(12.743020057678223)
Label: 8	Prediction: 11	(10.751090049743652)
Label: 9	Prediction: 13	(12.531196594238281)
Label: 8	Prediction: 9	(8.02219009399414)
Label: 13	Prediction: 14	(13.161118507385254)
Label: 13	Prediction: 15	(14.617571830749512)
Label: 11	Prediction: 11	(10.83185863494873)
Label: 8	Prediction: 7	(6.834247589111328)
Label: 14	Prediction: 15	(14.962894439697266)
Label: 8	Prediction: 12	(11.171463966369629)
Label: 11	Prediction: 11	(10.009407043457031)
Label: 8	Prediction: 10	(9.545341491699219)
Label: 12	Prediction: 13	(12.956452369689941)
Label: 13	Prediction: 9	(8.398655891418457)
Label: 11	Prediction: 11	(10.833467483520508)
Label: 20	Predicti

In [14]:
# Tear down the hosted endpoint (and its endpoint configuration) now that scoring is done.
xgboost_classifier.delete_endpoint(delete_endpoint_config=True)