## Setup

In [None]:
# Display the version reported by the Spark context (`sc`) this notebook is attached to.
sc.version

In [2]:
# Install the PyPI packages that the following cells import, in this order.
for pypi_name in ("boto3", "sagemaker"):
    sc.install_pypi_package(pypi_name)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting boto3
  Using cached https://files.pythonhosted.org/packages/9f/59/ba5611e8d79b066fc010726ba7342707d2b5a351dde7b0a0b2b42a5bdd4b/boto3-1.10.12-py2.py3-none-any.whl
Collecting botocore<1.14.0,>=1.13.12
  Using cached https://files.pythonhosted.org/packages/23/d3/4ab10a8a6cbd8a9544cb59c16c3e52288aa6bfcf26fdd0124edb2733ba2b/botocore-1.13.12-py2.py3-none-any.whl
Collecting s3transfer<0.3.0,>=0.2.0
  Using cached https://files.pythonhosted.org/packages/16/8a/1fc3dba0c4923c2a76e1ff0d52b305c44606da63f718d14d3231e21c51b0/s3transfer-0.2.1-py2.py3-none-any.whl
Collecting docutils<0.16,>=0.10
  Using cached https://files.pythonhosted.org/packages/22/cd/a6aa959dca619918ccb55023b4cb151949c64d4d5d55b3f4ffd7eee0c6e8/docutils-0.15.2-py3-none-any.whl
Collecting python-dateutil<2.8.1,>=2.1; python_version >= "2.7"
  Using cached https://files.pythonhosted.org/packages/41/17/c62faccbfbd163c7f57f3844689e3a78bae1f403648a6afb1d0866d87fbb/python_dateutil-2.8.0-py2.py3-none-any.whl
Collecting urllib

In [3]:
import boto3
import sagemaker

# AWS region used for the boto3 session below.
region = 'us-west-2'

# Build the SageMaker SDK session on top of a boto3 session bound to that region;
# `sage_sdk_session` is handed to the estimator later in the notebook.
boto_sess = boto3.session.Session(region_name=region)
sage_sdk_session = sagemaker.session.Session(boto_session=boto_sess)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Loading the Data

We will use the abalone data set from the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Abalone).

   Given is the attribute name, attribute type, the measurement unit and a
   brief description.  The number of rings is the value to predict: either
   as a continuous value or as a classification problem.

	Name		Data Type	Meas.	Description
	----		---------	-----	-----------
	Sex		nominal			M, F, and I (infant)
	Length		continuous	mm	Longest shell measurement
	Diameter	continuous	mm	perpendicular to length
	Height		continuous	mm	with meat in shell
	Whole weight	continuous	grams	whole abalone
	Shucked weight	continuous	grams	weight of meat
	Viscera weight	continuous	grams	gut weight (after bleeding)
	Shell weight	continuous	grams	after being dried
	Rings		integer			+1.5 gives the age in years

In [4]:
# Load the cleaned abalone CSV files from S3; the first row is the header and
# Spark infers the column types.
abaloneData = (
    spark.read
    .format('csv')
    .options(inferSchema=True, header=True)
    .load('s3a://emr-lab-income-dataset/Clean/')
)

# Force the three shell dimensions to double, whatever type was inferred.
for dimension in ('Length', 'Diameter', 'Height'):
    abaloneData = abaloneData.withColumn(dimension, abaloneData[dimension].cast('double'))

# Print the schema as loaded, i.e. before the columns are reordered below.
abaloneData.printSchema()

# Keep the modelling columns with 'Rings' first, drop rows containing nulls,
# and preview the first five rows.
model_columns = [
    'Rings',
    'Length', 'Diameter', 'Height',
    'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight',
    'Male', 'Female', 'Infant',
]
abaloneData = abaloneData.select(model_columns).dropna()
abaloneData.show(n=5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- Length: double (nullable = true)
 |-- Diameter: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Whole_weight: double (nullable = true)
 |-- Shucked_weight: double (nullable = true)
 |-- Viscera_weight: double (nullable = true)
 |-- Shell_weight: double (nullable = true)
 |-- Male: integer (nullable = true)
 |-- Female: integer (nullable = true)
 |-- Infant: integer (nullable = true)
 |-- Rings: integer (nullable = true)

+-----+------+--------+------+------------+--------------+--------------+------------+----+------+------+
|Rings|Length|Diameter|Height|Whole_weight|Shucked_weight|Viscera_weight|Shell_weight|Male|Female|Infant|
+-----+------+--------+------+------------+--------------+--------------+------------+----+------+------+
|   15| 0.455|   0.365| 0.095|       0.514|        0.2245|         0.101|        0.15|   1|     0|     0|
|    7|  0.35|   0.265|  0.09|      0.2255|        0.0995|        0.0485|        0.07|   1|     0|     0|
|    9|  0.53| 

In [5]:
# Split the dataframe into training and validation data.
# A fixed seed makes the 80/20 split reproducible across kernel restarts and re-runs.
trainData, testData = abaloneData.randomSplit([.8, .2], seed=42)

# Save the data to S3 for later training by SageMaker.
# TODO: there is an unresolved issue with randomSplit, so for development purposes the
# full abaloneData frame is written to the 'test/' prefix. Once that is fixed, write
# trainData to 's3a://emr-lab-income-dataset/train/' and testData to
# 's3a://emr-lab-income-dataset/test/' instead.
abaloneData.write.save('s3a://emr-lab-income-dataset/test/', format='csv', mode='overwrite')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Training container images, keyed by algorithm name.
training_images = {
    'LinearLearner': '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest',
    'XGBoost': '174872318107.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
}

# feature_dim is the column count minus one
# (presumably every column except the 'Rings' target — confirm).
hyperparams = {
    'feature_dim': len(abaloneData.columns) - 1,
    'predictor_type': 'regressor',
}

# IAM role passed to the estimator.
sagemaker_execution_role = 'arn:aws:iam::883624334343:role/service-role/AmazonSageMaker-ExecutionRole-20190906T093404'

# Collect the estimator settings in one place, then build the estimator from them.
estimator_settings = dict(
    image_name=training_images['LinearLearner'],
    role=sagemaker_execution_role,
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    output_path=None,
    output_kms_key=None,
    base_job_name=None,
    sagemaker_session=sage_sdk_session,
    hyperparameters=hyperparams,
    train_use_spot_instances=False,
    train_max_wait=None,
)
estimator = sagemaker.estimator.Estimator(**estimator_settings)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Train on the CSV part files that Spark wrote under the 'test/' prefix.
# The part file name contains an identifier that differs between writes, and the save
# cell rewrites this prefix with mode='overwrite', so a hard-coded file name goes stale
# on re-run. Matching on the 'part-' key prefix picks up whatever part files are
# currently there while skipping other objects in the folder (e.g. Spark's marker files).
s3_train_data = 's3://{}/{}/part-'.format('emr-lab-income-dataset', 'test')
train_channel = sagemaker.session.s3_input(
    s3_train_data,
    content_type='text/csv',
    s3_data_type='S3Prefix',  # treat s3_train_data as a key-name prefix, not a single object
)
estimator.fit({'train': train_channel})

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2019-11-07 18:57:02 Starting - Starting the training job...
2019-11-07 18:57:08 Starting - Launching requested ML instances......
2019-11-07 18:58:07 Starting - Preparing the instances for training...
2019-11-07 18:59:01 Downloading - Downloading input data......
2019-11-07 19:00:03 Training - Training image download completed. Training in progress.
2019-11-07 19:00:03 Uploading - Uploading generated training model.Docker entrypoint called with argument(s): train
[11/07/2019 18:59:59 INFO 140320482404160] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step'

In [12]:
# Deploy the trained estimator to one ml.m5.xlarge instance and keep the returned handle.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

--------------------------------------------------------------------------------------!

In [None]:
# Write the training split to S3 as CSV, replacing any previous output at that location.
trainData.write.mode('overwrite').format('csv').save('s3a://emr-lab-income-dataset/train/')

In [None]:
# Write test: save the full abaloneData frame to S3 as CSV, replacing any previous output.
# NOTE(review): the target sits under the 'Clean/' prefix that the load cell reads from —
# confirm this output cannot be picked up as input on a re-run.
abaloneData.write.mode('overwrite').format('csv').save('s3a://emr-lab-income-dataset/Clean/abaloneData_writeTest.csv')

## Training and Hosting a Model

## Inference


How well did the algorithm perform? Let us compare the predicted ring counts with the actual values for a few abalone and manually inspect the results:

Since we don't need to make any more inferences, now we delete the endpoint:

In [None]:
# Delete the endpoint
# The endpoint created by estimator.deploy() stays up until it is removed explicitly,
# so tear it down once no more inferences are needed.
predictor.delete_endpoint()