### Challenge Lab

**Instructions**: Build a model using a Built-in Algorithm that performs better than the model we have trained in Lab 008. Use the Automated Model Tuning capability of SageMaker to help you find the best model.

In [2]:
import pandas as pd
import numpy as np

In [3]:
np.set_printoptions(suppress=True, precision=2)
pd.set_option('display.float_format', lambda x: '%.10f' % x)

In [4]:
df = pd.read_csv("files/001.csv")
df

Unnamed: 0,property,floors,price
0,a,3,1000
1,b,4,1500
2,c,6,3000
3,d,8,3200
4,e,2,1200
5,f,9,4000
6,g,10,4200
7,h,11,5000
8,i,12,5500
9,j,15,7000


In [13]:
from sklearn.model_selection import train_test_split

# X = df.drop('target', axis=1)
X = df['floors']

X_train, X_val, y_train, y_val = train_test_split(X.values, df['price'].values, test_size=0.20, random_state=0)

In [14]:
X_train

array([16,  2, 10, 10,  2,  6,  9, 11, 15, 11,  1,  5,  8,  3, 12,  8])

In [19]:
# y_train = y_train.values.reshape(-1, 1)
y_train

array([8000,  800, 4200, 4400, 1200, 3000, 4000, 4500, 7000, 5000,  500,
       1800, 3200, 1000, 4800, 2800])

In [12]:
import io
import sagemaker
import sagemaker.amazon.common as smac

sesh = sagemaker.Session()
bucket = sesh.default_bucket()

# convert the training data to a recordIO-wrapped protobuf format used by the Amazon SageMaker algorithms

# init memory buffer in RAM where data will be written
buf = io.BytesIO()

# write train data (numpy array) into a dense tensor 
# smac.write_numpy_to_dense_tensor(buf, X_train.astype("float32"), y_train.astype("float32"))
smac.write_numpy_to_dense_tensor(buf, X_train.reshape(-1, 1).astype("float32"), y_train.astype("float32"))

# sets the file's current position at the offset (0)
# i.e. set the reference point to the beginning of the memory buffer
buf.seek(0)

0

In [15]:
import os

# upload training data to s3
key = "009-train-data"
prefix = "sagemaker/009"

boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", key)).upload_fileobj(buf)
s3_train_data = f"s3://{bucket}/{prefix}/train/{key}"

print(f"uploaded training data location: {s3_train_data}")

uploaded training data location: s3://sagemaker-us-east-1-305262579855/sagemaker/009/train/009-train-data


In [16]:
# convert the validation data to a recordIO-wrapped protobuf format used by the Amazon SageMaker algorithms 

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, X_val.astype("float32"), y_val.astype("float32"))
buf.seek(0)

0

In [17]:
# upload validation data to s3

key = "009-valid-data"
prefix = "sagemaker/009"

boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "valid", key)).upload_fileobj(buf)
s3_valid_data = f"s3://{bucket}/{prefix}/valid/{key}"

print(f"uploaded validation data location: {s3_train_data}")

uploaded validation data location: s3://sagemaker-us-east-1-305262579855/sagemaker/009/train/009-train-data


In [19]:
import boto3
import sagemaker
from sagemaker import tuner
from sagemaker import get_execution_role

role = get_execution_role()
sesh = sagemaker.Session()
bucket = sesh.default_bucket()
region = boto3.Session().region_name    
client = boto3.Session().client('sagemaker')

In [20]:
from sagemaker.image_uris import retrieve

container = retrieve('linear-learner', region, version="1")
container

'382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1'

In [290]:
!pip install 'sagemaker[local]' --upgrade



In [291]:
!sudo service docker restart

Stopping docker: [60G[[0;32m  OK  [0;39m]
Starting docker:	.[60G[[0;32m  OK  [0;39m]


In [292]:
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    sagemaker_session=sesh
)

In [293]:
estimator.set_hyperparameters(
    predictor_type='regressor',
    normalize_data=True,
    optimizer="adam",
)

In [294]:
hyperparameter_ranges = {
    'learning_rate': tuner.ContinuousParameter(1e-5, 1),
    'mini_batch_size': tuner.IntegerParameter(2, 50),
    'l1': tuner.ContinuousParameter(1e-7, 1),
    'use_bias': tuner.CategoricalParameter([True, False]),
    'wd': tuner.ContinuousParameter(1e-7, 1)
}

In [295]:
objective_metric_name = 'validation:objective_loss'

In [296]:
hyperparameter_tuner = tuner.HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges, 
    objective_type='Minimize',
    max_jobs=6,
    max_parallel_jobs=3
)

In [297]:
hyperparameter_tuner.fit(
    {'train': s3_train_data, 'validation': s3_valid_data},
    include_cls_metadata=False,
    wait=False
)

job_name = hyperparameter_tuner.latest_tuning_job.job_name

response = client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=job_name
)

response['HyperParameterTuningJobStatus']

'InProgress'

In [298]:
from pprint import pprint

pprint(response)

{'CreationTime': datetime.datetime(2021, 5, 23, 5, 38, 45, 798000, tzinfo=tzlocal()),
 'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-1:305262579855:hyper-parameter-tuning-job/linear-learner-210523-0538',
 'HyperParameterTuningJobConfig': {'HyperParameterTuningJobObjective': {'MetricName': 'validation:objective_loss',
                                                                        'Type': 'Minimize'},
                                   'ParameterRanges': {'CategoricalParameterRanges': [{'Name': 'use_bias',
                                                                                       'Values': ['True',
                                                                                                  'False']}],
                                                       'ContinuousParameterRanges': [{'MaxValue': '1',
                                                                                      'MinValue': '1e-05',
                                            

In [299]:
hyperparameter_tuner.__dict__

{'estimator': <sagemaker.estimator.Estimator at 0x7f1e98bc1c18>,
 'objective_metric_name': 'validation:objective_loss',
 '_hyperparameter_ranges': {'learning_rate': <sagemaker.parameter.ContinuousParameter at 0x7f1e9a2bb320>,
  'mini_batch_size': <sagemaker.parameter.IntegerParameter at 0x7f1e9a2bb2b0>,
  'l1': <sagemaker.parameter.ContinuousParameter at 0x7f1e9a2bb390>,
  'use_bias': <sagemaker.parameter.CategoricalParameter at 0x7f1e9a2bb400>,
  'wd': <sagemaker.parameter.ContinuousParameter at 0x7f1e9a2bb470>},
 'metric_definitions': None,
 'estimator_dict': None,
 'objective_metric_name_dict': None,
 '_hyperparameter_ranges_dict': None,
 'metric_definitions_dict': None,
 'static_hyperparameters_dict': None,
 'strategy': 'Bayesian',
 'objective_type': 'Minimize',
 'max_jobs': 6,
 'max_parallel_jobs': 3,
 'tags': None,
 'base_tuning_job_name': None,
 '_current_job_name': 'linear-learner-210523-0538',
 'latest_tuning_job': <sagemaker.tuner._TuningJob at 0x7f1e989f0860>,
 'warm_start_c

In [300]:
tuning_job_name = response['HyperParameterTuningJobName']
%store tuning_job_name

tuning_job_name

Stored 'tuning_job_name' (str)


'linear-learner-210523-0538'

In [301]:
from time import sleep

while response['HyperParameterTuningJobStatus'] == 'InProgress':
    response = client.describe_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=job_name
    )

    print(response['HyperParameterTuningJobStatus'])
    sleep(60)

InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


In [302]:
%store -r tuning_job_name

from sagemaker import HyperparameterTuningJobAnalytics

def latest_df():
    analytics = HyperparameterTuningJobAnalytics(tuning_job_name)

    return analytics.dataframe()    

latest_df().sort_values('FinalObjectiveValue', ascending=True)

Unnamed: 0,l1,learning_rate,mini_batch_size,use_bias,wd,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
4,2.1728e-06,1.8174e-05,48.0,False,5.577e-07,linear-learner-210523-0538-002-ead60ab7,Completed,1417753133056.0,2021-05-23 05:42:03+00:00,2021-05-23 05:43:14+00:00,71.0
5,0.0001906496,2.02289e-05,32.0,False,6.268e-07,linear-learner-210523-0538-001-2020a949,Completed,1399963516928.0,2021-05-23 05:41:44+00:00,2021-05-23 05:43:06+00:00,82.0
1,4.794e-07,3.02293e-05,16.0,True,1.6953e-06,linear-learner-210523-0538-005-aed1272a,Completed,817506222080.0,2021-05-23 05:46:32+00:00,2021-05-23 05:47:57+00:00,85.0
0,2.8044e-06,2.94158e-05,15.0,True,3.38504e-05,linear-learner-210523-0538-006-8f9dcda7,Completed,816711860224.0,2021-05-23 05:46:46+00:00,2021-05-23 05:47:52+00:00,66.0
3,0.0060171603,0.0004517723,16.0,True,1.3127e-06,linear-learner-210523-0538-003-53c880f5,Completed,509812834304.0,2021-05-23 05:41:48+00:00,2021-05-23 05:42:51+00:00,63.0
2,0.0721390683,0.0790390138,13.0,True,3.85064e-05,linear-learner-210523-0538-004-a0c9210f,Completed,18176030720.0,2021-05-23 05:46:16+00:00,2021-05-23 05:47:31+00:00,75.0


In [5]:
import sagemaker

sesh = sagemaker.Session()

linear = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    sagemaker_session=sesh
)

linear.set_hyperparameters(
    predictor_type="regressor",
    normalize_data=True,
    optimizer="adam",
    mini_batch_size=13,
    l1=0.0721390683,
    learning_rate=0.0790390138,
    use_bias=True,
    wd=0.0000385064
)

In [21]:
linear.fit(inputs={"train": s3_train_data, "validation": s3_valid_data})

2021-05-24 06:45:06 Starting - Starting the training job...
2021-05-24 06:45:30 Starting - Launching requested ML instancesProfilerReport-1621838706: InProgress
......
2021-05-24 06:46:30 Starting - Preparing the instances for training.........
2021-05-24 06:47:50 Downloading - Downloading input data...
2021-05-24 06:48:33 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/24/2021 06:48:38 INFO 139870522902336] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_

In [33]:
# predictor = linear.deploy(initial_instance_count=1, instance_type='local')

linear_predictor = linear.deploy(initial_instance_count=1, instance_type="ml.t2.medium")
print(f"\ncreated endpoint: {linear_predictor.endpoint_name}")

---------------------!
created endpoint: linear-learner-2021-05-24-08-13-29-583


In [34]:
from sagemaker.serializers import CSVSerializer

linear_predictor.serializer = CSVSerializer()
result = linear_predictor.predict(X_train)

In [35]:
import json

res = json.loads(result.decode())
res['predictions'][0]

{'score': 93212.5625}

In [36]:
y_pred_len = len(res['predictions'])
y_pred_scores = []

for i in range(y_pred_len):
    y_pred_scores.append(res['predictions'][i].get('score'))

In [37]:
# init output DF
output_df = pd.DataFrame()

# populate DF
output_df['Actual'] = y_train
output_df['Predicted'] = y_pred_scores

output_df

Unnamed: 0,Actual,Predicted
0,9600,93212.5625000000
1,634000,698611.4375000000
2,848400,874587.1875000000
3,612800,647857.3750000000
4,456000,515334.6875000000
...,...,...
211,120800,204517.5468750000
212,112000,141272.5312500000
213,480000,533859.7500000000
214,605600,712833.3750000000


In [38]:
linear_predictor.delete_endpoint()
print(f"deleted {linear_predictor.endpoint_name} successfully!")

deleted linear-learner-2021-05-24-08-13-29-583 successfully!
