In [7]:
# Imports: AWS SDK, SageMaker SDK, and the data-science stack used below
import boto3
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer

In [8]:
# Open a SageMaker session and look up the execution role through the SDK helper.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# All inputs and outputs go to the session's default S3 bucket.
bucket = sagemaker_session.default_bucket()

In [10]:
# Read the heart dataset from the local CSV file and preview the first rows.
heart_csv_path = "data/heart.csv"
dataset = pd.read_csv(heart_csv_path)
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [11]:
# Move the label column ('target') to the front and keep every other column
# in its existing order, so the label ends up in column 0 of the exported CSVs.
feature_columns = [name for name in dataset.columns if name != 'target']
dataset = dataset[['target'] + feature_columns]
dataset.head()

Unnamed: 0,target,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,1,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,1,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,1,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,1,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [20]:
# Shuffle once with a fixed seed so the split is reproducible, then cut the rows
# 70% / 20% / 10% into train / validation / test. Explicit iloc slices produce the
# same partitions as np.split without going through the DataFrame code path that
# newer pandas versions deprecate.
shuffled = dataset.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled))
validation_end = int(0.9 * len(shuffled))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]  # kept in memory only; used later for evaluation

# Create the output folder if needed so the cell also works on a fresh checkout.
os.makedirs('xgboostdata', exist_ok=True)

# Export without header or index: only the raw values are written.
train_data.to_csv('xgboostdata/train.csv', header=False, index=False)
validation_data.to_csv('xgboostdata/validation.csv', header=False, index=False)

In [21]:
# Local folder that holds the exported train/validation CSV files.
data_dir = 'xgboostdata'

# Key prefix under which this project's files are stored in the bucket.
prefix = 'xgb-heart-disease'

# Upload the whole folder to S3 and keep the value returned by the SDK.
input_data = sagemaker_session.upload_data(
    path=data_dir,
    bucket=bucket,
    key_prefix=prefix,
)

In [22]:
# Describe the two CSV inputs (train / validation) by their S3 locations.
train_uri = 's3://{}/{}/train'.format(bucket, prefix)
validation_uri = 's3://{}/{}/validation'.format(bucket, prefix)

s3_input_train = sagemaker.s3_input(s3_data=train_uri, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validation_uri, content_type='csv')
print(s3_input_validation)

<sagemaker.inputs.s3_input object at 0x7f9ad5bfe4a8>


In [23]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Session, region and low-level SageMaker client for the training job.
sess = sagemaker.Session()
region = boto3.Session().region_name
smclient = boto3.Session().client('sagemaker')

# Look up the XGBoost training image for the current region.
# NOTE(review): the cell output hints at pinning a version such as '0.90-1'
# instead of 'latest' -- confirm compatibility before changing it.
container = get_image_uri(region, 'xgboost', repo_version='latest')

# Training output goes under the same bucket/prefix as the inputs.
output_location = 's3://{}/{}/output'.format(bucket, prefix)
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=sess,
)

# Logistic objective for the binary target, 25 boosting rounds, learning rate 0.1.
hyperparameters = {
    'eta': 0.1,
    'objective': 'binary:logistic',
    'num_round': 25,
}
xgb.set_hyperparameters(**hyperparameters)

# Launch training with both channels.
channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb.fit(channels)

	get_image_uri(region, 'xgboost', '0.90-1').


2020-04-30 16:07:31 Starting - Starting the training job...
2020-04-30 16:07:32 Starting - Launching requested ML instances...
2020-04-30 16:08:31 Starting - Preparing the instances for training......
2020-04-30 16:09:18 Downloading - Downloading input data...
2020-04-30 16:09:50 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-04-30:16:10:09:INFO] Running standalone xgboost training.[0m
[34m[2020-04-30:16:10:09:INFO] File size need to be processed in the node: 0.01mb. Available memory size in the node: 8510.98mb[0m
[34m[2020-04-30:16:10:09:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:10:09] S3DistributionType set as FullyReplicated[0m
[34m[16:10:09] 212x13 matrix with 2756 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-04-30:16:10:10:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:10:09] S3DistributionType set as FullyReplicated[0m
[34m[16:10:10] 60x13 matrix with 780 e

In [25]:
# Deploy the trained estimator on a single ml.m4.xlarge instance and keep the
# returned predictor for the scoring cells below.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)



-------------!

In [26]:
# Send request payloads as CSV and leave responses undeserialized: predict()
# below decodes the raw response bytes itself.
xgb_predictor.serializer = csv_serializer
xgb_predictor.content_type = 'text/csv'
xgb_predictor.deserializer = None

def predict(data, rows=500):
    """Score ``data`` against ``xgb_predictor`` in batches.

    Args:
        data: 2-D array-like of rows to score; it is split along axis 0.
        rows: Target batch size. The number of batches is
            ``int(len(data) / rows + 1)``, so each batch holds at most
            ``rows`` rows.

    Returns:
        A 1-D numpy float array parsed from the comma-separated text the
        predictor returns for each batch, concatenated in batch order.
    """
    batch_count = int(data.shape[0] / float(rows) + 1)
    # Each response is raw bytes containing comma-separated values.
    responses = [
        xgb_predictor.predict(batch).decode('utf-8')
        for batch in np.array_split(data, batch_count)
    ]
    return np.fromstring(','.join(responses), sep=',')

# Score the held-out test rows. Column 0 is the label, so only the remaining
# feature columns are sent to the endpoint.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the supported
# replacement and matches the call used for the classification report.
predictions = predict(test_data.to_numpy()[:, 1:])
predictions



array([0.59925812, 0.76485759, 0.07130074, 0.58078784, 0.93727863,
       0.84140337, 0.59413058, 0.91342354, 0.12541868, 0.08993977,
       0.90113848, 0.05482947, 0.5820263 , 0.58455676, 0.8286227 ,
       0.05482947, 0.90104854, 0.91773498, 0.31685248, 0.71138388,
       0.05482947, 0.10896374, 0.28237435, 0.05482947, 0.90231144,
       0.94149286, 0.92065221, 0.60266513, 0.08270461, 0.80308229,
       0.87824637])

In [29]:
# Turn the model scores into hard 0/1 class labels using a 0.5 cut-off.
predictions = np.where(np.asarray(predictions) >= 0.5, 1, 0).tolist()

In [32]:
from sklearn.metrics import confusion_matrix, classification_report

# Compare the true labels (column 0 of the held-out test rows) with the
# thresholded predictions and print per-class precision / recall / F1.
y_true = test_data.to_numpy()[:, 0]
report = classification_report(y_true, predictions)
print(report)

              precision    recall  f1-score   support

         0.0       0.91      0.71      0.80        14
         1.0       0.80      0.94      0.86        17

   micro avg       0.84      0.84      0.84        31
   macro avg       0.85      0.83      0.83        31
weighted avg       0.85      0.84      0.84        31

