## Setup

In [None]:
# Install the AWS SDKs onto the cluster for this notebook session.
sc.install_pypi_package("boto3");
# Pin the SageMaker SDK to the last 1.x release: the cells below use the v1 API
# (`image_name`, `train_instance_count`, `sagemaker.session.s3_input`), which
# SDK 2.x renamed or removed, so an unpinned install breaks the notebook.
sc.install_pypi_package('sagemaker==1.72.0');

In [None]:
# User-specific parameters: adjust these three values for your own account before running.

# IAM role ARN handed to the SageMaker estimator.
sagemaker_execution_role = 'arn:aws:iam::883624334343:role/service-role/AmazonSageMaker-ExecutionRole-20190906T093404'

# s3a:// prefix that Spark reads the input dataset from.
source_bucket = 's3a://emr-lab-income-dataset/'

# AWS region used for the boto3 session and for the training image URIs.
region = 'us-west-2'

In [None]:
import boto3
import sagemaker

# Build a boto3 session for the configured region and wrap it in a SageMaker SDK session,
# so every SageMaker call below goes to that region.
boto_sess = boto3.Session(region_name=region)
sage_sdk_session = sagemaker.Session(boto_session=boto_sess)

# The session's default bucket holds the intermediate train/test/inference files.
bucket = sage_sdk_session.default_bucket()

print(
    'A SageMaker session was initiated! You are using {} as your S3 bucket for intermediate files.'.format(bucket)
)

## Loading the Data

We will use the abalone data set from the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Abalone).

   Given is the attribute name, attribute type, the measurement unit and a
   brief description.  The number of rings is the value to predict: either
   as a continuous value or as a classification problem.

	Name			Data Type		Meas.	Description
	----			---------		-----	-----------
	Rings			integer					+1.5 gives the age in years
	Length			continuous		mm		Longest shell measurement
	Diameter		continuous		mm		perpendicular to length
	Height			continuous		mm		with meat in shell
	Whole weight	continuous		grams	whole abalone
	Shucked weight	continuous		grams	weight of meat
	Viscera weight	continuous		grams	gut weight (after bleeding)
	Shell weight	continuous		grams	after being dried
	Male			integer			1/0 	1 encodes true, 0 false
	Female			integer			1/0 	1 encodes true, 0 false
	Infant			integer			1/0 	1 encodes true, 0 false

In [None]:
# Read the cleaned CSV files from S3 into a Spark DataFrame.
# The first row supplies the column names and column types are inferred from the data.
abaloneData = (
    spark.read
    .format('csv')
    .options(inferSchema=True, header=True)
    .load(source_bucket + 'clean/')
)
abaloneData.show(5)

In [None]:
# Split the dataframe into training and validation data.
# A fixed seed makes the 75/25 split repeatable across re-runs instead of changing every time.
trainData, testData = abaloneData.randomSplit([.75,.25], seed=42)

s3_train_emr = 's3a://'+ bucket + '/train/'
s3_test_emr = 's3a://'+ bucket + '/test/'
data_format = 'csv'

# Stop the Hadoop output committer from adding an empty `_SUCCESS` marker next to the
# data files: SageMaker reads every object under the prefix, and the marker has been
# observed to make the training job fail. This setting applies to all later writes in
# this Spark session.
spark.sparkContext._jsc.hadoopConfiguration().set(
    'mapreduce.fileoutputcommitter.marksuccessfuljobs', 'false'
)

# Save the data to S3 for later training by SageMaker.
# No header row is written, so the files contain only data records.
trainData.write.save(s3_train_emr, format=data_format, mode='overwrite')
testData.write.save(s3_test_emr, format=data_format, mode='overwrite')

print('Training dataset saved in {} format to {}!'.format(data_format, s3_train_emr))
print('Testing dataset saved in {} format to {}!'.format(data_format, s3_test_emr))

## Training a Machine Learning Model in SageMaker

In [None]:
# Algorithm to train. The value must be a key of both `training_images` and `hyperparams`.
model = 'XGBoost'
#model = 'LinearLearner'

# Regularization strengths, shared by both algorithms' hyperparameters below.
l1 = 1
l2 = 1

# Training container image per algorithm.
# NOTE(review): the registry account IDs are hard-coded; presumably they are only
# valid for the region this lab targets -- confirm before changing `region`.
training_images = dict(
    LinearLearner='174872318107.dkr.ecr.{}.amazonaws.com/linear-learner:latest'.format(region),
    XGBoost='433757028032.dkr.ecr.{}.amazonaws.com/xgboost:latest'.format(region),
)

# Linear Learner: every column except one (the label) counts as a feature.
linear_hyperparams = {
    'feature_dim': len(abaloneData.columns) - 1,
    'predictor_type': 'regressor',
    'loss': 'squared_loss',
    'wd': l2,
    'l1': l1,
}

# XGBoost: 100 boosting rounds with the same L1/L2 penalties.
xgboost_hyperparams = {
    'num_round': 100,
    'lambda': l2,
    'objective': 'reg:linear',
    'alpha': l1,
}

# Lookup table so the estimator cell can select hyperparameters by `model`.
hyperparams = dict(
    LinearLearner=linear_hyperparams,
    XGBoost=xgboost_hyperparams,
)

In [None]:
# Configure the training job for the algorithm selected by `model`.
estimator = sagemaker.estimator.Estimator(
    # What to run: container image and its hyperparameters.
    image_name=training_images[model],
    hyperparameters=hyperparams[model],
    # Where to run it: a single ml.m5.large training instance.
    train_instance_type='ml.m5.large',
    train_instance_count=1,
    # Who runs it: the execution role and the region-bound SDK session.
    role=sagemaker_execution_role,
    sagemaker_session=sage_sdk_session,
)

In [None]:
# TODO: the first Spark write to S3 creates a `_SUCCESS` file under the prefix, which
# throws an error when SageMaker is called on that prefix. Pointing the channel at the
# data file directly does work.

# SageMaker takes s3:// URIs, whereas Spark wrote the data using the s3a:// scheme.
s3_train = s3_train_emr.replace('s3a://', 's3://')

# Describe the training data as a CSV channel and start the training job.
train_channel = sagemaker.session.s3_input(s3_train, content_type='text/csv')
estimator.fit(inputs={'train': train_channel})

## Inference Results From SageMaker Batch Transform

How well did our algorithm perform?

In [None]:
# S3 prefix for the batch transform output. It has to exist before the transformer is
# created because it is passed as `output_path`; without this line the cell raises
# NameError when the notebook is run top to bottom.
s3_inference = 's3://' + bucket + '/inference/'

# Batch transformer backed by the trained model: one ml.m5.large instance, several
# records per request, CSV output assembled line by line.
transformer = estimator.transformer(
    instance_count = 1,
    instance_type = 'ml.m5.large',
    strategy = 'MultiRecord',
    output_path = s3_inference,
    assemble_with= 'Line',
    accept='text/csv')

In [None]:
# Prefix the transform output lands in (the transformer's `output_path`). Built from the
# bucket name rather than by substring-replacing 'train', which would also rewrite any
# 'train' occurring inside the bucket name.
s3_inference = 's3://' + bucket + '/inference/'

# Test data location as an s3:// URI; Spark wrote it using the s3a:// scheme.
# `s3_test` was previously never defined, so this cell raised NameError.
s3_test = s3_test_emr.replace('s3a://', 's3://')

# Run batch inference over the test set and block until the job finishes.
# input_filter '$[1:]' sends every column except the first (the label) to the model;
# join_source='Input' keeps the original record alongside the prediction in the output.
transformer.transform(
    data=s3_test,
    content_type='text/csv',
    split_type='Line',
    input_filter='$[1:]',
    join_source='Input',
    wait=True
)

In [None]:
# Read the batch transform output back from S3 into a Spark DataFrame.
# The files have no header row, so Spark assigns the column names itself.
inference_data = (
    spark.read
    .format('csv')
    .options(inferSchema=True, header=False)
    .load(s3_inference)
)
inference_data.show(5)

In [None]:
# The first column is taken as the true ring count and the last column as the
# model's prediction.
rings = inference_data.schema.names[0]
predicted_rings = inference_data.schema.names[-1]

# Root-mean-square error between the true and predicted values.
SQL_RMSE = 'SELECT SQRT(AVG(POWER({}-{}, 2))) AS RMSE FROM inference'.format(rings, predicted_rings)

# createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
# It can also be re-run safely because it overwrites any existing "inference" view.
inference_data.createOrReplaceTempView("inference")
test = spark.sql(SQL_RMSE)
test.show()

## Wrap-Up
Congratulations! You processed data in Apache Spark on EMR, trained a machine learning model in Amazon SageMaker, and scored the held-out test data with a SageMaker batch transform job! Feel free to try different combinations of models and hyperparameters to see if you can reduce your model's RMSE.