In [None]:
import sagemaker
import boto3
import os 
 
# Execution role of the notebook instance; it is handed to the estimator later on.
role = sagemaker.get_execution_role()

# Where this notebook keeps its data in S3: the session's default bucket plus a key prefix.
prefix = 'sagemaker/xgboost-whitewinequality'
bucket = sagemaker.Session().default_bucket()

In [None]:
# Download the white wine quality dataset from the UCI repository straight into the
# notebook's working directory (wget -N uses timestamping, so an up-to-date local copy
# is not fetched again).
!wget -N https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv

In [None]:
# Peek at the first lines of the raw file before loading it with pandas.
!head winequality-white.csv

In [None]:
import numpy as np  # For matrix operations and numerical processing
import pandas as pd # For munging tabular data

In [None]:
# Display options: show every column, but cap the rows so output stays on one page.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)

# The downloaded file is ';'-separated, hence the explicit `sep`.
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
data = pd.read_csv('winequality-white.csv', sep=';')
data.head(5)

In [None]:
data.shape # (number of rows, number of columns)

In [None]:
# Target column the model is trained to predict: the white wine quality score.
# The notebook fits a regression on it with XGBoost (booster='gbtree', objective='reg:linear').
label = 'quality'

In [None]:
# Move the label column to the front; every other column keeps its relative order.
# NOTE(review): presumably required because the built-in XGBoost CSV format reads the
# first column as the target — confirm against the algorithm docs.
colIdx = data.columns.get_loc(label)
original_order = list(data.columns)
cols = [original_order[colIdx], *original_order[:colIdx], *original_order[colIdx + 1:]]
modified_data = data[cols]
modified_data.head(5)

In [None]:
# Shuffle all rows once with a fixed seed (123) for reproducibility, then cut the
# shuffled frame into 70% train / 20% validation / 10% test.
# https://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.DataFrame.sample.html
shuffled_data = modified_data.sample(frac=1, random_state=123)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

# Positional slicing yields the same three partitions as np.split(df, [a, b]) but does
# not go through DataFrame.swapaxes, which recent pandas releases deprecate.
train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

# Save each split as a header-less CSV; change the precision in fmt as needed.
for split_file, split_frame in (("train.csv", train_data),
                                ("validation.csv", validation_data),
                                ("test.csv", test_data)):
    np.savetxt(split_file, split_frame, delimiter=",", fmt='%1.4f')

In [None]:
 # List the CSV files in the working directory (the download plus the saved splits).
 !ls -l *.csv

In [None]:
# Upload each split to s3://<bucket>/<prefix>/<channel>/<channel>.csv.
# S3 keys always use '/', so they are built with an explicit separator rather than
# os.path.join (whose separator depends on the operating system).
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for channel in ('train', 'validation', 'test'):
    local_file = '{}.csv'.format(channel)
    s3_key = '{}/{}/{}'.format(prefix, channel, local_file)
    s3_bucket.Object(s3_key).upload_file(local_file)

In [None]:
# Input channels for training: each one points at the S3 "folder" holding that split.
# Both URIs end with '/' so the prefix only matches objects inside that folder
# (a bare '.../train' prefix would also match sibling keys such as '.../train-old/').
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')
s3_data = {'train': s3_input_train, 'validation': s3_input_validation}

In [None]:
# image_uris is a top-level module of the SageMaker SDK; import it from there rather
# than relying on it being re-exported by sagemaker.amazon.amazon_estimator.
from sagemaker import image_uris

sess = sagemaker.Session()
region = boto3.Session().region_name

# URI of the built-in XGBoost container image for the current region.
# NOTE(review): 'latest' is believed to resolve to a legacy XGBoost build; consider pinning
# an explicit version, but confirm the hyperparameters (e.g. objective) still validate first.
container = image_uris.retrieve('xgboost', region, 'latest')

# Generic estimator wrapping that container: one ml.m4.2xlarge instance, data delivered in
# File mode, model artifacts written under the notebook's S3 prefix.
xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    instance_count=1,
                                    instance_type='ml.m4.2xlarge',
                                    input_mode="File",
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

In [None]:
# Baseline hyperparameters set on the estimator. eta, max_depth and min_child_weight
# are also given search ranges in `hyperparameter_ranges` for the tuning job.
# NOTE(review): scale_pos_weight normally targets class imbalance — confirm it is
# intended for this regression objective.
fixed_hyperparameters = dict(
    objective='reg:linear',
    num_round=100,
    booster='gbtree',
    eta=0.2,
    max_depth=5,
    min_child_weight=6,
    verbose=1,
    gamma=4,
    subsample=0.7,
    early_stopping_rounds=10,
    scale_pos_weight=8,
)
xgb.set_hyperparameters(**fixed_hyperparameters)

In [None]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter

# Search space for the tuning job: one range per tuned hyperparameter.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(0, 1),
    min_child_weight=ContinuousParameter(1, 10),
    alpha=ContinuousParameter(0, 2),
    max_depth=IntegerParameter(2, 8),
)

In [None]:
# Tuning objective: the 'validation:rmse' metric, to be minimized.
objective_metric_name, objective_type = 'validation:rmse', 'Minimize'

In [None]:
from sagemaker.tuner import HyperparameterTuner

# Tuning job definition: at most 10 training jobs, run one at a time.
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type=objective_type,
    max_jobs=10,
    max_parallel_jobs=1,
)

In [None]:
# Launch the tuning job on the train and validation channels.
tuning_channels = dict(train=s3_input_train, validation=s3_input_validation)
tuner.fit(tuning_channels)

In [None]:
sagemaker = boto3.Session().client(service_name='sagemaker') 

# Get tuning job name
job_name = tuner.latest_tuning_job.job_name
print(job_name)

sagemaker.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=job_name)['HyperParameterTuningJobStatus']

In [None]:
# Deploying the best model
tuning_job_result = sagemaker.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=job_name)
best_model_name = tuning_job_result['BestTrainingJob']['TrainingJobName']
print(best_model_name)

import time
timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())
endpoint_name = best_model_name + '-ep-' + timestamp
print(endpoint_name)

In [None]:
tuner.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', endpoint_name=endpoint_name)

In [None]:
#Predicting the first 10 samples with the best model
smrt = boto3.client('sagemaker-runtime')

# Predict samples from the validation set
payload = validation_data[:10].drop(['quality'], axis=1) 
payload = payload.to_csv(header=False, index=False).rstrip()

print(payload)

In [None]:
response = smrt.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=payload.encode('utf8'),
    ContentType='text/csv')

print(response['Body'].read())

In [None]:
#And last but not least, let's not run unecessary costs in the Cloud; so here's a handy script to delete the endpoint:
sagemaker.delete_endpoint(EndpointName=endpoint_name)