## Setup

In [None]:
# Install the libraries this notebook needs via sc.install_pypi_package.
# The SageMaker SDK is pinned to the 1.x line because the cells below rely on
# its v1 API (image_name, train_instance_count, sagemaker.session.s3_input,
# sagemaker.content_types), which is not available in version 2 and later.
sc.install_pypi_package("boto3")
sc.install_pypi_package("sagemaker==1.72.0")

In [None]:
# User-specific settings -- adjust these for your own account before running.

# IAM role handed to the SageMaker estimator.
sagemaker_execution_role = 'arn:aws:iam::883624334343:role/service-role/AmazonSageMaker-ExecutionRole-20190906T093404'

# S3 location (s3a:// scheme) that the dataset is read from and written to.
source_bucket = 's3a://emr-lab-income-dataset/'

# AWS region used for the boto3 session and the training image URIs.
region = 'us-west-2'

In [None]:
import boto3
import sagemaker

# Build the SageMaker session on top of a boto3 session bound to `region`.
# `region` comes from the user-specific parameters cell and is deliberately not
# re-assigned here, so editing that cell is enough to switch regions.
boto_sess = boto3.Session(region_name=region)
sage_sdk_session = sagemaker.Session(boto_session=boto_sess)

# Bucket that the SageMaker session reports as its default.
bucket = sage_sdk_session.default_bucket()

print('A SageMaker session was initiated! You are using {} as your S3 bucket for intermediate files.'.format(bucket))

## Loading the Data

We will use the abalone data set from the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Abalone).

   Each attribute is listed below with its name, data type, unit of
   measurement, and a brief description.  The number of rings is the value to
   predict, either as a continuous value (regression) or as a class label
   (classification).

	Name			Data Type		Meas.	Description
	----			---------		-----	-----------
	Rings			integer					+1.5 gives the age in years
	Length			continuous		mm		Longest shell measurement
	Diameter		continuous		mm		perpendicular to length
	Height			continuous		mm		with meat in shell
	Whole weight	continuous		grams	whole abalone
	Shucked weight	continuous		grams	weight of meat
	Viscera weight	continuous		grams	gut weight (after bleeding)
	Shell weight	continuous		grams	after being dried
	Male			integer			1/0 	1 encodes true, 0 false
	Female			integer			1/0 	1 encodes true, 0 false
	Infant			integer			1/0 	1 encodes true, 0 false

In [None]:
# Read the cleaned CSV files from S3 into a Spark DataFrame; the first row is
# treated as the header and column types are inferred.
abaloneData = spark.read.load(
    '{}clean/'.format(source_bucket),
    header=True,
    inferSchema=True,
    format='csv',
)

# Show the resulting schema, then preview the first five rows.
abaloneData.printSchema()
abaloneData.show(5)

In [None]:
# Split the DataFrame into training (80%) and validation (20%) data.
# A fixed seed makes the split repeatable when the notebook is re-run against
# the same input data.
split_seed = 42
trainData, testData = abaloneData.randomSplit([.8, .2], seed=split_seed)

s3_train_emr = source_bucket + 'train/'
s3_test_emr = source_bucket + 'test/'
data_format = 'csv'

# Save both splits to S3 for later training by SageMaker; any output left at
# these prefixes by a previous run is overwritten.
trainData.write.save(s3_train_emr, format=data_format, mode='overwrite')
testData.write.save(s3_test_emr, format=data_format, mode='overwrite')

print('Train dataset saved in {} format to {}!'.format(data_format, s3_train_emr))
print('Test dataset saved in {} format to {}!'.format(data_format, s3_test_emr))

## Training and Hosting a Machine Learning Model in SageMaker

In [None]:
# Regularization strength shared by both algorithms' hyperparameters
# ('wd' for Linear Learner, 'lambda' for XGBoost).
l2 = 1

# Algorithm to train. The same key selects the container image and the
# hyperparameters below, so the two cannot get out of sync.
algorithm = 'XGBoost'

# Training container images, resolved for the configured region.
training_images = {'LinearLearner': '174872318107.dkr.ecr.{}.amazonaws.com/linear-learner:1'.format(region),
                  'XGBoost': '433757028032.dkr.ecr.{}.amazonaws.com/xgboost:latest'.format(region)}

# feature_dim is the column count minus one -- presumably every column except
# the Rings label; confirm against the schema printed above.
linear_hyperparams = {'feature_dim': len(abaloneData.columns) - 1,
                      'predictor_type': 'regressor',
                      'loss': 'squared_loss',
                      'wd': l2}

xg_boost_hyperparams = {'num_round': 100,
                        'lambda': l2,
                        'objective': 'reg:linear'}

hyperparams = {'LinearLearner': linear_hyperparams,
               'XGBoost': xg_boost_hyperparams}

# Configure the training job. The original image lookup had a misplaced quote
# (training_images['XGBoost]'), which made this cell a SyntaxError.
estimator = sagemaker.estimator.Estimator(
            image_name=training_images[algorithm],
            role=sagemaker_execution_role,
            train_instance_count=1,
            train_instance_type='ml.m5.large',
            output_path=None,
            output_kms_key=None,
            base_job_name=None,
            sagemaker_session=sage_sdk_session,
            hyperparameters=hyperparams[algorithm],
            train_use_spot_instances=False,
            train_max_wait=None)

In [None]:
# Convert the s3a:// URI used for the Spark write into an s3:// URI before
# handing it to SageMaker.
s3_train = s3_train_emr.replace('s3a://', 's3://')

# Describe the training data as a CSV input channel and start training on it.
train_channel = sagemaker.session.s3_input(
    s3_data=s3_train,
    content_type='text/csv',
)
estimator.fit(inputs={'train': train_channel})

In [None]:
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON

# Deploy the trained model behind an endpoint on a single ml.m5.xlarge
# instance. Requests use the CSV serializer and content type; responses go
# through the JSON deserializer.
predictor = estimator.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
    content_type=CONTENT_TYPE_CSV,
    serializer=csv_serializer,
    deserializer=json_deserializer,
)

## Inference


How well did our algorithm perform? Let's send one sample record to the endpoint and look at the predicted number of rings.

In [None]:
# One sample record as comma-separated values -- presumably the ten feature
# columns in table order, without the Rings label; confirm against the schema.
payload = '0.44, 0.365, 0.125, 0.516, 0.2155, 0.114, 0.155, 1, 0, 0'

# Send the record to the endpoint and display what comes back.
response = predictor.predict(data=payload)
print(response)

## Clean-up

Once you complete the lab, take down the SageMaker resources that were created.

In [None]:
# Tear down the endpoint that estimator.deploy() created.
predictor.delete_endpoint()