In [22]:
# Define IAM role
import boto3
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer

In [23]:
# sagemaker session, role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 bucket name
bucket = sagemaker_session.default_bucket()

In [24]:
dataset = pd.read_csv("data/heart.csv")
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [25]:
dataset = pd.concat([dataset['target'], dataset.drop(['target'], axis=1)], axis=1) 
dataset.head()

Unnamed: 0,target,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,1,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,1,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,1,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,1,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [26]:
dataset=np.random.shuffle(dataset)
train_data = dataset.iloc[:, :243]
test_data = dataset.iloc[:, :60]

KeyError: 185

In [27]:
# train_data, test_data = np.split(dataset.sample(frac=1, random_state=1729), [int(0.7 * len(dataset)), int(0.9 * len(dataset))])
train_data.to_csv('xgboostdata/train.csv', header=False, index=False)
# validation_data.to_csv('xgboostdata/validation.csv', header=False, index=False)

In [28]:
# should be the name of directory you created to save your features data
data_dir = 'xgboostdata'

# set prefix, a descriptive name for a directory  
prefix = 'xgb-heart-disease'

# upload all data to S3
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [29]:
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation'.format(bucket, prefix), content_type='csv')

In [30]:
from sagemaker.amazon.amazon_estimator import get_image_uri
sess = sagemaker.Session()
region = boto3.Session().region_name    
smclient = boto3.Session().client('sagemaker')

container = get_image_uri(region, 'xgboost', repo_version='latest')
xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)
xgb.set_hyperparameters(eta=0.1,
                        objective='binary:logistic',
                        num_round=25)

xgb.fit({'train': s3_input_train})

	get_image_uri(region, 'xgboost', '0.90-1').


2020-05-03 22:40:27 Starting - Starting the training job...
2020-05-03 22:40:29 Starting - Launching requested ML instances...
2020-05-03 22:41:26 Starting - Preparing the instances for training......
2020-05-03 22:42:18 Downloading - Downloading input data...
2020-05-03 22:42:54 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2020-05-03:22:42:55:INFO] Running standalone xgboost training.[0m
[34m[2020-05-03:22:42:55:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2020-05-03:22:42:55:INFO] File size need to be processed in the node: 0.01mb. Available memory size in the node: 8502.84mb[0m
[34m[2020-05-03:22:42:55:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:42:55] S3DistributionType set as FullyReplicated[0m
[34m[22:42:55] 303x13 matrix with 3939 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[22:42:55] src/tree/updater_prune.cc:74: tree pruning end, 1 

In [31]:
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-------------!

In [32]:
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = None

def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

predictions = predict(test_data.values[:, 1:])
predictions

array([0.79937273, 0.69498521, 0.9521271 , 0.93001759, 0.78808826,
       0.88171422, 0.91167301, 0.72530186, 0.86979073, 0.86611092,
       0.87413114, 0.90548766, 0.94162464, 0.86976618, 0.86020094,
       0.93265831, 0.78930396, 0.86021256, 0.91693586, 0.90865093,
       0.70463598, 0.9445619 , 0.88568211, 0.71594715, 0.77080637,
       0.91416365, 0.85236323, 0.8793174 , 0.87771994, 0.88516223,
       0.90627211, 0.76254314, 0.94756854, 0.80676991, 0.74551898,
       0.88661402, 0.94166851, 0.8599785 , 0.88285774, 0.86024761,
       0.87522215, 0.94339192, 0.61406761, 0.88348287, 0.91607571,
       0.91607571, 0.94756854, 0.90175515, 0.88888711, 0.93141061,
       0.94005603, 0.72454208, 0.56433654, 0.91086644, 0.90935522,
       0.83463389, 0.92428154, 0.90828204, 0.94144791, 0.50621516,
       0.90669817, 0.79361242, 0.92512584, 0.8410852 , 0.86256105,
       0.92381734, 0.82321912, 0.95043397, 0.94756854, 0.84082013,
       0.80968142, 0.86908805, 0.94756854, 0.8692221 , 0.95043

In [33]:
predictions = [1 if n >= 0.5 else 0 for n in predictions]

In [34]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
# Show classification report
print(classification_report(test_data.to_numpy()[:,0], predictions))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       138
         1.0       0.98      0.98      0.98       165

   micro avg       0.98      0.98      0.98       303
   macro avg       0.98      0.98      0.98       303
weighted avg       0.98      0.98      0.98       303

