In [62]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#Sagemaker specific libraries
import sagemaker 
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer
from sagemaker.session import s3_input, Session


In [39]:
bucket_name = 'diabetesbucket10' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
my_region = boto3.session.Session().region_name # set the region of the instance
print(my_region)

us-east-1


In [48]:
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


<h3>Downloading The Dataset And Storing in S3<h3>

In [63]:
import pandas as pd
import urllib
try:
    urllib.request.urlretrieve ("https://www.kaggle.com/datasets/mathchi/diabetes-data-set/download", "diabetes.csv")
    print('Success: downloaded diabetes.csv.')
except Exception as e:
    print('Data load error: ',e)

try:
    df = pd.read_csv('./diabetes.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
    print(df.head(5))
except Exception as e:
    print('Data load error: ',e)

Success: Data loaded into dataframe.
             Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
Pregnancies                                                         
6                148             72             35        0  33.6   
1                 85             66             29        0  26.6   
8                183             64              0        0  23.3   
1                 89             66             23       94  28.1   
0                137             40             35      168  43.1   

             DiabetesPedigreeFunction  Age  Outcome  
Pregnancies                                          
6                               0.627   50        1  
1                               0.351   31        0  
8                               0.672   32        1  
1                               0.167   21        0  
0                               2.288   33        1  


In [64]:
df.columns

Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

<h3> Remove outliers <h3>

In [65]:
def remove_outliers(df):
    return df[
              (df['Glucose'] >= 37.125) & 
              (df['Glucose'] <= 202.125) & 
              (df['BloodPressure'] >= 35.000) & 
              (df['BloodPressure'] <= 107.000) & 
              (df['BMI'] >= 13.350) & 
              (df['BMI'] <= 50.550) 
              ]

In [66]:
df = remove_outliers(df)

In [67]:
train_data = df[:500]
test_data = df[501:]

<h3> Storing Train and test data in S3 bucket <h3>

In [68]:
pd.concat([train_data['Outcome'], train_data.drop(['Outcome'], axis=1)],axis=1).to_csv('train.csv', index=False, header=False)

In [69]:
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')

In [70]:
s3_input_train = sagemaker.TrainingInput(s3_data=f's3://{bucket_name}/{prefix}/train', content_type='csv')

In [71]:
pd.concat([test_data['Outcome'], test_data.drop(['Outcome'], axis=1)],axis=1).to_csv('test.csv', index=False, header=False)

In [72]:
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')

In [73]:
s3_input_test = sagemaker.TrainingInput(s3_data=f's3://{bucket_name}/{prefix}/test', content_type='csv')

<h3> loading inbuilt Xgboost algo using Image_URI <h3>

In [74]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
container = get_image_uri(boto3.Session().region_name,
                          'xgboost', 
                          repo_version='1.0-1')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [75]:
# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
        }

In [77]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.2xlarge', 
                                          train_volume_size=5, # 5 GB 
                                          output_path=output_path,
                                          train_use_spot_instances=True,
                                          train_max_run=300,
                                          train_max_wait=600)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


<h3> fitting Xgboost model <h3>

In [78]:
estimator.fit({'train': s3_input_train,'validation': s3_input_test})

2022-04-11 04:36:30 Starting - Starting the training job...
2022-04-11 04:36:54 Starting - Launching requested ML instancesProfilerReport-1649651790: InProgress
.........
2022-04-11 04:38:15 Starting - Preparing the instances for training......
2022-04-11 04:39:17 Downloading - Downloading input data...
2022-04-11 04:39:59 Training - Training image download completed. Training in progress.
2022-04-11 04:39:59 Uploading - Uploading generated training model[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:

In [79]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

---------!

<h3>Prediction of the Test Data<h3>

In [81]:
from sagemaker.predictor import csv_serializer
test_data_array = test_data.drop(['Outcome'], axis=1).values #load the data into an array
# xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(208,)


In [82]:
predictions_array

array([0.25837237, 0.88907993, 0.07490061, 0.05733732, 0.31068006,
       0.04407503, 0.05078761, 0.08358879, 0.10220534, 0.24982712,
       0.29974994, 0.10301007, 0.32218549, 0.85109955, 0.03148065,
       0.09582039, 0.1280203 , 0.02273622, 0.0803147 , 0.31999657,
       0.7500093 , 0.32090965, 0.14054459, 0.08024   , 0.13389057,
       0.04272237, 0.3726334 , 0.17845099, 0.16516468, 0.19404469,
       0.23200519, 0.85694027, 0.47961697, 0.03445728, 0.23990381,
       0.2252503 , 0.47498739, 0.03313931, 0.32064769, 0.12032562,
       0.93575692, 0.76879472, 0.1044215 , 0.45365161, 0.07218668,
       0.75411201, 0.71582121, 0.11474105, 0.72983932, 0.07110569,
       0.10026456, 0.20030557, 0.64835614, 0.35435724, 0.72213125,
       0.04264032, 0.44472352, 0.02594095, 0.12379211, 0.89755249,
       0.86328948, 0.4563975 , 0.79393119, 0.03570756, 0.1482055 ,
       0.03096173, 0.46561995, 0.28894666, 0.05981681, 0.8612572 ,
       0.05825692, 0.11510886, 0.08971408, 0.03275129, 0.29855

In [85]:
cm = pd.crosstab(index=test_data['Outcome'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No diabetes", "diabetes"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No diabetes", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("diabetes", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 78.4%

Predicted      No diabetes    diabetes
Observed
No diabetes    81% (123)    29% (16)
diabetes        19% (29)     71% (40) 



<h3>Deleting The Endpoints <h3>

In [86]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


[{'ResponseMetadata': {'RequestId': 'ABQCCEQR8B85E8N7',
   'HostId': '5/1Qm3+rBxXgBJf5xsziW3dNI5Uyeqq9P7jpgXojm/gdH80LAqbEoMvu3ZpZZmaTDx//ks1dzxA=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': '5/1Qm3+rBxXgBJf5xsziW3dNI5Uyeqq9P7jpgXojm/gdH80LAqbEoMvu3ZpZZmaTDx//ks1dzxA=',
    'x-amz-request-id': 'ABQCCEQR8B85E8N7',
    'date': 'Mon, 11 Apr 2022 04:50:59 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'diabetes-problem/test/test.csv'},
   {'Key': 'diabetes-problem/output/sagemaker-xgboost-2022-04-11-04-36-30-107/profiler-output/system/training_job_end.ts'},
   {'Key': 'diabetes-problem/output/sagemaker-xgboost-2022-04-11-04-36-30-107/profiler-output/framework/training_job_end.ts'},
   {'Key': 'diabetes-problem/train/train.csv'},
   {'Key': 'diabetes-problem/output/sagemaker-xgboost-2022-04-11-04-36-30-107/output/model.tar.gz'},
   {'Key': 'd