In [2]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer  
import io

# Define IAM role
role = get_execution_role()


prefix = 'sagemaker/DEMO-xgboost-dm'
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'} # each region has its XGBoost container
my_region = boto3.session.Session().region_name # set the region of the instance
print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-west-2 region. You will use the 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [3]:
bucket_name = ' ------ ' # <--- CHANGE - to be the bucket name
s3 = boto3.resource('s3')
my_region = boto3.session.Session().region_name # set the region of the instance

try:
    if  my_region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    else: 
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 error:  An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [4]:
try:
    s3.Bucket(bucket_name).download_file('', 'covid.csv')
except Exception as e:
    print('Data load error: ',e)

try:
    covid_df = pd.read_csv('./covid.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Data load error:  Parameter validation failed:
Invalid length for parameter Key, value: 0, valid range: 1-inf
Success: Data loaded into dataframe.


In [5]:
covid_df.columns


Index(['sex', 'patient_type', 'entry_date', 'date_symptoms', 'date_died',
       'intubed', 'pneumonia', 'age', 'pregnancy', 'diabetes', 'copd',
       'asthma', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular',
       'obesity', 'renal_chronic', 'tobacco', 'contact_other_covid',
       'covid_res', 'icu'],
      dtype='object')

In [6]:
columns_to_drop_above_97=[
       'intubed', 'pneumonia', 'pregnancy', 'diabetes', 'copd',
       'asthma', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular',
       'obesity', 'renal_chronic', 'tobacco', 'contact_other_covid',
       'covid_res', 'icu']

for col in columns_to_drop_above_97:
    covid_df = covid_df[covid_df[col]<97]
    

In [7]:
# reset index after tweaking dataframe
covid_df = covid_df.reset_index(drop=True)

length_of_lesser_data = covid_df.loc[covid_df.intubed == 1, 'intubed'].count()


In [8]:
# drop data to make the amounts equal

length_of_same_data = covid_df.loc[covid_df.intubed == 2, 'intubed'].count()
length = length_of_same_data - length_of_lesser_data


covid_df = covid_df.drop(covid_df.intubed[covid_df.intubed.eq(2)].sample(length).index )


In [9]:
# check they are equal 
length_of_lesser_data = covid_df.loc[covid_df.intubed == 1, 'intubed'].count()
print(length_of_lesser_data)

length_of_more_data = covid_df.loc[covid_df.intubed == 2, 'intubed'].count()
print(length_of_more_data)

2561
2561


In [10]:
# did they die and add this as a column 
covid_df['Died?'] = covid_df['date_died'].apply(lambda x: 1 if x == '9999-99-99' else 2)


In [11]:
# Select feature columns and column of target prediction
columns_to_train=['sex', 'age','pneumonia', 'diabetes', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular',
       'obesity']
column_class=["intubed"]


In [12]:
import sagemaker.amazon.common as smac

split_train = int(len(covid_df) * 0.8)
split_test = int(len(covid_df) * 0.2)
print(split_train)
print(split_test)

# shuffle data
covid_df = covid_df.sample(frac=1, random_state=1729).reset_index(drop=True)

x_train = covid_df[columns_to_train][split_test:split_train].to_numpy().astype('float32')
y_train = np.where(covid_df["intubed"][split_test:split_train].to_numpy() == 1, 0, 1).astype('float32')
x_test = covid_df[columns_to_train][:split_test].to_numpy().astype('float32')
y_test = np.where(covid_df["intubed"][:split_test] == 1, 0, 1).astype('float32')

print(len(y_train)) 
print(len(y_test)) 

4097
1024
3073
1024


In [14]:
train_buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(train_buf, x_train, y_train)
train_buf.seek(0)

val_buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(val_buf, x_test, y_test)
val_buf.seek(0)

bucket = 'wwcode-data-hack-bridgeclub' # <--- CHANGE - to be the bucket name
prefix = 'wwcode-covid'                # <--- CHANGE - to be folder name
train_key = 'training-data'         
val_key = 'val-data'      

boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', train_key)).upload_fileobj(train_buf)
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, train_key)
print('uploaded training data location: {}'.format(s3_train_data))

boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'val', val_key)).upload_fileobj(val_buf)
s3_val_data = 's3://{}/{}/val/{}'.format(bucket, prefix, val_key)
print('uploaded training data location: {}'.format(s3_val_data))

In [16]:
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))


In [105]:
import boto3
import sagemaker

from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

sess = sagemaker.Session()

linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.m4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sess)
linear.set_hyperparameters(feature_dim=len(columns_to_train),
                           predictor_type='binary_classifier',
                           mini_batch_size=200)

linear.fit({'train': s3_train_data, 'validation': s3_val_data})

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-08-12 18:59:50 Starting - Starting the training job...
2020-08-12 18:59:54 Starting - Launching requested ML instances.........
2020-08-12 19:01:28 Starting - Preparing the instances for training......
2020-08-12 19:02:39 Downloading - Downloading input data......
2020-08-12 19:03:42 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[08/12/2020 19:03:46 INFO 139816794826560] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_sched

### Turn it into an endpoint 

In [106]:
predictor = linear.deploy(initial_instance_count=1,
                             instance_type='ml.t2.medium',
                             endpoint_name = ' ---------- ') # <--- INSERT - add endpoint name

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!