In [1]:
import sagemaker
from sagemaker import get_execution_role

import numpy as np
import pandas as pd

import boto3
import re

print('Libraries_loaded')

Libraries_loaded


In [2]:
bucket_name = 'awspracsagemaker'#  bucket name 

train_data = r'diabetespred/training/'
val_data = r'diabetespred/validation/' 

s3_model_output_location = r's3://{0}/diabetespred/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,train_data)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name,val_data)

In [3]:
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)

s3://awspracsagemaker/diabetespred/model
s3://awspracsagemaker/diabetespred/training/
s3://awspracsagemaker/diabetespred/validation/


In [10]:
data = 's3://awspracsagemaker/diabetespred/diabetes.csv'

In [11]:
df = pd.read_csv(data)
df.shape

(768, 9)

In [12]:
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
cols

['Outcome',
 'Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [13]:
df = df[cols]
df.head()

Unnamed: 0,Outcome,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,6,148,72,35,0,33.6,0.627,50
1,0,1,85,66,29,0,26.6,0.351,31
2,1,8,183,64,0,0,23.3,0.672,32
3,0,1,89,66,23,94,28.1,0.167,21
4,1,0,137,40,35,168,43.1,2.288,33


In [14]:
train = df[:629]
val = df[629:]

In [15]:
train.shape

(629, 9)

In [16]:
val.shape

(139, 9)

In [17]:
train.head()

Unnamed: 0,Outcome,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,6,148,72,35,0,33.6,0.627,50
1,0,1,85,66,29,0,26.6,0.351,31
2,1,8,183,64,0,0,23.3,0.672,32
3,0,1,89,66,23,94,28.1,0.167,21
4,1,0,137,40,35,168,43.1,2.288,33


In [18]:
train.isnull().sum()

Outcome                     0
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [19]:
np_train = train.values
np_val = val.values

In [20]:
type(np_train)

numpy.ndarray

In [21]:
np.savetxt("train.csv", np_train, delimiter=",")

In [22]:
np.savetxt("val.csv", np_val, delimiter=",")

In [23]:
def write_to_s3(filename, bucket, key):
    with open(filename,'rb') as f: 
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

In [24]:
write_to_s3('train.csv', 
            bucket_name,
            train_data + 'train.csv')

write_to_s3('val.csv',
            bucket_name,
            val_data + 'val.csv')

In [25]:
sess = sagemaker.Session()

In [26]:
role = get_execution_role()
role

'arn:aws:iam::753154596552:role/service-role/AmazonSageMaker-ExecutionRole-20200904T164660'

In [27]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(
    sess.boto_region_name,
    "xgboost", 
    "latest")

print(' SageMaker XGBoost Info :\n{} ({})'.format(container, sess.boto_region_name))

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').


 SageMaker XGBoost Info :
811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest (us-east-1)


In [34]:
estimator = sagemaker.estimator.Estimator(
    container,
    role, 
    train_instance_count=1, 
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name ='v1-xgboost-diabetes')

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [35]:
estimator.set_hyperparameters(max_depth=7,
                              objective="binary:logistic",num_round=1000)

In [36]:
estimator.hyperparameters()

{'max_depth': 7, 'objective': 'binary:logistic', 'num_round': 100}

In [37]:
training_input_config = sagemaker.session.s3_input(
    s3_data=s3_training_file_location,
    content_type='csv',
    s3_data_type='S3Prefix')

validation_input_config = sagemaker.session.s3_input(
    s3_data=s3_validation_file_location,
    content_type='csv',
    s3_data_type='S3Prefix'
)

data_channels = {'train': training_input_config, 'validation': validation_input_config}

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [38]:
print(training_input_config.config)
print(validation_input_config.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://awspracsagemaker/diabetespred/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://awspracsagemaker/diabetespred/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


In [39]:
estimator.fit(data_channels)

2020-09-04 18:59:53 Starting - Starting the training job...
2020-09-04 18:59:55 Starting - Launching requested ML instances......
2020-09-04 19:01:13 Starting - Preparing the instances for training......
2020-09-04 19:02:11 Downloading - Downloading input data...
2020-09-04 19:02:44 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-09-04:19:03:05:INFO] Running standalone xgboost training.[0m
[34m[2020-09-04:19:03:05:INFO] File size need to be processed in the node: 0.16mb. Available memory size in the node: 8484.73mb[0m
[34m[2020-09-04:19:03:05:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:03:05] S3DistributionType set as FullyReplicated[0m
[34m[19:03:05] 629x8 matrix with 5032 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-09-04:19:03:05:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:03:05] S3DistributionType set as FullyReplicated[0m
[34m[19:03:05] 139x8 matrix with 111

In [40]:
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'v1-xgboost-diabetes')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!

In [41]:
data = [4,9.4,0.5,2,0,2,1,2]

In [43]:
from sagemaker.predictor import csv_serializer, json_deserializer
predictor.serializer = csv_serializer
predictor.deserializer = None

In [44]:
result = predictor.predict(data)

In [45]:
np.round(float(result))

0.0