In [2]:
import numpy as np
import boto3
import sagemaker
import io
import sagemaker.amazon.common as smac
import os
import pandas as pd

# Load the bike share dataset (day.csv) from S3.
# The CSV is based on the publicly available bike share data from the ML
# repository curated by the University of California, Irvine.
from io import StringIO

# Where the CSV lives; place the day.csv file in a bucket in your own account.
bucket = 'raz-eu-central-1-tutorial'
object_key = 'sagemaker/machine-learning-exam/day.csv'

# Fetch the object body and decode it as UTF-8 text.
s3 = boto3.resource('s3')
csv_obj = s3.Object(bucket, object_key)
csv_bytes = csv_obj.get()['Body'].read()
csv_string = csv_bytes.decode('utf-8')

# Parse the text into a pandas DataFrame and preview the first rows.
dataset = pd.read_csv(StringIO(csv_string))
dataset.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [3]:
# Remove the hyphens from the date strings ('2011-01-01' -> '20110101') so the
# 'dteday' column contains digits only.
# `assign` returns a new frame with the column replaced, leaving the column
# order unchanged.
dteday_digits = dataset['dteday'].str.replace("-", "")
dataset = dataset.assign(dteday=dteday_digits)
dataset.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,20110101,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,20110102,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,20110103,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,20110104,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,20110105,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [4]:
# Shuffle the rows reproducibly (fixed random_state) and split them between
# train and test datasets on a 70% / 30% split respectively.
# Positional slicing is used instead of np.split: np.split on a DataFrame is
# routed through DataFrame.swapaxes, which newer pandas releases deprecate.
shuffled_dataset = dataset.sample(frac=1, random_state=1729)
train_row_count = int(0.7 * len(dataset))
train_data = shuffled_dataset.iloc[:train_row_count]
test_data = shuffled_dataset.iloc[train_row_count:]
print(train_data.shape, test_data.shape)

(511, 16) (220, 16)


In [24]:
# Build the training feature matrix and the label vector as float32 arrays.
# NOTE(review): 'dteday' holds values such as 20110101, which exceed 2**24, so
# float32 cannot represent every date exactly -- confirm this column is wanted
# as a feature.
feature_columns = [
    'instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
    'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
]
feature_dataset = train_data[feature_columns]
features = feature_dataset.values.astype('float32')

# The label is the 'cnt' column; keep both the 2-D column form (`labels`) and
# the squeezed vector form (`labels_vec`).
label_dataset = train_data[['cnt']]
labels = label_dataset.values.astype('float32')
labels_vec = labels.squeeze()

In [25]:
# Serialize the training features and labels with
# sagemaker.amazon.common.write_numpy_to_dense_tensor (protobuf) into an
# in-memory buffer, then upload that buffer to S3.
buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(buffer, features, labels_vec)
buffer.seek(0)  # rewind so the upload reads from the first byte

prefix = 'realestate'
key = 'linearregression'

# Build the object key with '/' explicitly. S3 keys are not filesystem paths:
# os.path.join would use '\\' on Windows, so the uploaded object would no
# longer match the s3:// URI handed to the training job. Deriving both from
# the same key keeps them in sync.
train_object_key = '{}/train/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(buffer)
s3_training_data_location = 's3://{}/{}'.format(bucket, train_object_key)
print('training dataset will be uploaded to: {}'.format(s3_training_data_location))

training dataset will be uploaded to: s3://raz-eu-central-1-tutorial/realestate/train/linearregression


In [26]:
# S3 location (same bucket and prefix as the training data) that is passed to
# the estimator as its output path.
output_location = 's3://' + bucket + '/' + prefix + '/output'
print('model artifacts will be uploaded to: ' + output_location)

model artifacts will be uploaded to: s3://raz-eu-central-1-tutorial/realestate/output


In [27]:
# Look up the Linear Learner container image for the region of the current
# boto3 session.
# NOTE: this is the SageMaker Python SDK v1 helper; the SDK itself warns that
# it is deprecated in favor of a different API in v2.
from sagemaker.amazon.amazon_estimator import get_image_uri

aws_region = boto3.Session().region_name
linear_container = get_image_uri(aws_region, 'linear-learner')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [29]:
# Train the model
from sagemaker import get_execution_role

# Execution role of this notebook, reused for the training job.
role = get_execution_role()

sagemaker_session = sagemaker.Session()

# Provide the container, role, instance type and model output location
linear = sagemaker.estimator.Estimator(linear_container,
                                       role=role,
                                       train_instance_count=1,
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sagemaker_session)

# Provide the number of features and the predictor_type.
# feature_dim must equal the number of feature columns, so it is taken from
# the prepared feature matrix instead of being hard-coded; that way it cannot
# drift out of sync with the column list used during data preparation.
linear.set_hyperparameters(feature_dim=features.shape[1],
                           mini_batch_size=4,
                           predictor_type='regressor')

# Train the model on the training data uploaded to S3. Only the 'train'
# channel is supplied; no validation channel is passed to the job.
linear.fit({'train': s3_training_data_location})

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-11-13 18:04:38 Starting - Starting the training job...
2020-11-13 18:04:40 Starting - Launching requested ML instances......
2020-11-13 18:05:47 Starting - Preparing the instances for training.........
2020-11-13 18:07:12 Downloading - Downloading input data...
2020-11-13 18:07:55 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/13/2020 18:08:19 INFO 139679959119680] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init

In [None]:
# Deploy the trained estimator behind a named endpoint and keep the returned
# predictor for inference.
deploy_settings = {
    'initial_instance_count': 1,
    'instance_type': 'ml.c4.xlarge',
    'endpoint_name': 'bikeshare-sagemaker-regression-v2',
}
linear_predictor = linear.deploy(**deploy_settings)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: linear-learner-2020-11-13-18-04-38-473


--------

In [32]:
# Score every test row against the deployed endpoint and collect the
# predicted and actual counts.
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are sent as CSV; responses are parsed as JSON.
linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

# Same feature columns, in the same order, as used for the training features.
test_feature_columns = [
    'instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
    'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
]
test_feature_dataset = test_data[test_feature_columns]

test_actuals = test_data['cnt'].values.astype('float32')
test_features = test_feature_dataset.values.astype('float32')

predictions = []
actuals = []
# One request per row; the score is read from the first entry of the
# response's 'predictions' list.
for feature_row, actual in zip(test_features, test_actuals):
    prediction = linear_predictor.predict(feature_row)
    score = prediction['predictions'][0]['score']
    predictions.append(score)
    actuals.append(actual)
    print('prediction: ', score, '\t\tactual: ', str(actual))

prediction:  1434.75 		actual:  801.0
prediction:  4526.875 		actual:  5217.0
prediction:  6714.25 		actual:  7767.0
prediction:  5827.0 		actual:  6852.0
prediction:  2056.5 		actual:  2209.0
prediction:  6624.25 		actual:  6290.0
prediction:  5409.125 		actual:  4792.0
prediction:  1713.625 		actual:  1865.0
prediction:  5868.75 		actual:  5668.0
prediction:  3618.875 		actual:  4492.0
prediction:  5096.125 		actual:  4367.0
prediction:  2249.625 		actual:  2402.0
prediction:  6108.0 		actual:  3846.0
prediction:  3904.0 		actual:  4788.0
prediction:  3168.875 		actual:  3190.0
prediction:  3984.375 		actual:  3005.0
prediction:  7536.75 		actual:  7286.0
prediction:  2331.125 		actual:  2132.0
prediction:  6911.0 		actual:  5464.0
prediction:  4981.0 		actual:  4985.0
prediction:  5431.25 		actual:  6304.0
prediction:  5541.0 		actual:  5532.0
prediction:  6997.125 		actual:  8009.0
prediction:  2104.75 		actual:  2999.0
prediction:  4583.625 		actual:  4338.0
prediction:  5173.25 	

In [33]:
# Cosine similarity between the actual and predicted counts, as a percentage.
# NOTE: cosine similarity only measures how closely the two vectors point in
# the same direction; it does not measure the size of the prediction errors,
# so treat it as a rough agreement score rather than a regression error
# metric (consider RMSE / MAE for that).
from numpy import dot
from numpy.linalg import norm
tolerance = 1e-10
# Floor the denominator at `tolerance` so that an all-zero (or empty) vector
# yields 0 instead of a division by zero / NaN. For non-degenerate inputs the
# floor is never reached and the result is unchanged.
denominator = max(norm(actuals) * norm(predictions), tolerance)
accuracy = (dot(actuals, predictions) / denominator) * 100
print('accuracy: ', accuracy)

accuracy:  98.6166473968964


In [12]:
# Delete the deployed endpoint now that predictions have been collected.
endpoint_name = linear_predictor.endpoint
sagemaker.Session().delete_endpoint(endpoint_name)