# Factorization Machine - Cloud Training

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

## Define IAM Role for AWS

In [2]:
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [3]:
# Bucket and object keys for the RecordIO training/test files (edit bucket_name for your account)
bucket_name = 's3-2-ml-sagemaker'
training_file_key = 'movie/user_movie_train.recordio'
test_file_key = 'movie/user_movie_test.recordio'

# Full s3:// URIs: where the model artifacts are written and where the two data files live
s3_model_output_location = 's3://' + bucket_name + '/movie/model'
s3_training_file_location = 's3://' + bucket_name + '/' + training_file_key
s3_test_file_location = 's3://' + bucket_name + '/' + test_file_key

### Read Dimensions

Number of unique users + Number of unique movies in dataset

In [4]:
# Placeholder value; replaced below by the number stored in the dimension file
dim_movie = 0

# The file holds a single integer (written when the training file was produced)
with open(r'ml-latest-small/movie_dimension.txt', 'r') as dimension_file:
    dimension_text = dimension_file.read()
    dim_movie = int(dimension_text)

In [5]:
dim_movie  # data check: display the value read from movie_dimension.txt

10334

In [6]:
# Show the resolved S3 locations, one per line: model output, training file, test file
for s3_location in (s3_model_output_location,
                    s3_training_file_location,
                    s3_test_file_location):
    print(s3_location)

s3://s3-2-ml-sagemaker/movie/model
s3://s3-2-ml-sagemaker/movie/user_movie_train.recordio
s3://s3-2-ml-sagemaker/movie/user_movie_test.recordio


## Writing and reading from S3 Bucket

Ref: [Boto3 Read Docs](http://boto3.readthedocs.io/en/latest/guide/s3.html)

In [7]:
def write_to_s3(filename, bucket, key):
    """Upload a local file to an S3 object.

    Parameters:
        filename: path of the local file to upload.
        bucket: name of the destination S3 bucket.
        key: object key to write inside the bucket.

    Returns:
        Whatever ``upload_fileobj`` returns for the upload call.
    """
    # Binary mode: the file content is streamed to S3 as raw bytes
    with open(filename, 'rb') as source:
        s3 = boto3.Session().resource('s3')
        target_object = s3.Bucket(bucket).Object(key)
        return target_object.upload_fileobj(source)

In [8]:
# Upload the local training RecordIO file to its S3 key
write_to_s3(filename=r'ml-latest-small/user_movie_train.recordio',
            bucket=bucket_name,
            key=training_file_key)

In [9]:
# Upload the local test RecordIO file to its S3 key
write_to_s3(filename=r'ml-latest-small/user_movie_test.recordio',
            bucket=bucket_name,
            key=test_file_key)

## Training Algorithm Docker Image

### AWS Maintains a separate image for every region and algorithm

This allows quick deployment of machines that are ready for the task.

Registry Path for algorithms by SageMaker
[Latest Dockers](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html)

"Algorithms that are parallelizable can be deployed on multiple compute instances for distributed training. For the Training Image and Inference Image Registry Path column, use the :1 version tag to ensure that you are using a stable version of the algorithm. You can reliably host a model trained using an image with the :1 tag on an inference image that has the :1 tag. Using the :latest tag in the registry path provides you with the most up-to-date version of the algorithm, but might cause problems with backward compatibility. Avoid using the :latest tag for production purposes." - [AWS SageMaker Docs](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html)

- The first run will use only one Docker container.
- The next run will allow more, say a six-pack.

In [10]:
# Factorization Machines training image per region.
# Each region has its own ECR registry account, e.g.
#   us-east-2 : 404615174143.dkr.ecr.us-east-2.amazonaws.com
#   us-east-1 : 382416733822.dkr.ecr.us-east-1.amazonaws.com
# NOTE: these paths use the :latest tag; the AWS guidance quoted in this notebook
# recommends the :1 tag for production use.
containers = {
    region: '{0}.dkr.ecr.{1}.amazonaws.com/factorization-machines:latest'.format(account, region)
    for region, account in (
        ('us-east-2', '404615174143'),
        ('us-east-1', '382416733822'),
        ('us-west-1', '632365934929'),
        ('us-west-2', '174872318107'),
    )
}

In [11]:
# Execution role used below when constructing the Estimator
role = get_execution_role()

In [12]:
# This role carries the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.
print(role)

arn:aws:iam::399426528351:role/service-role/AmazonSageMaker-ExecutionRole-20200203T173955


## Build Model

In [13]:
# SageMaker session object; passed to the Estimator as sagemaker_session
sess = sagemaker.Session()

In [14]:
# Build the Estimator for the training job:
#  - the algorithm container image is looked up for the region of the current boto3 session
#  - train_instance_count / train_instance_type: how many instances to use for
#    (distributed) training and what type of machine to use
#  - output_path: where the trained model artifacts are stored
#  - base_job_name: optional name for the training job
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
training_image = containers[boto3.Session().region_name]

estimator = sagemaker.estimator.Estimator(
    training_image,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='fm-movie-v4',
)

In [15]:

# Original set of hyperparameters. With these, the test RMSE was 0.89.
# The MovieLens dataset has since been updated and these parameters no longer produce
# quality predictions: with the new dataset, the test RMSE is around 1.9.
# Refer to the next cell for the new settings.
# The call is wrapped in a string literal so it is kept for reference but not executed.
'''
estimator.set_hyperparameters(feature_dim=dim_movie,
                              num_factors=8,
                              predictor_type='regressor', 
                              mini_batch_size=1000,
                              epochs=100)
'''

"\nestimator.set_hyperparameters(feature_dim=dim_movie,\n                              num_factors=8,\n                              predictor_type='regressor', \n                              mini_batch_size=1000,\n                              epochs=100)\n"

In [16]:
# New configuration after model tuning.
# Refer to the Hyperparameter Tuning section on how to optimize hyperparameters.
tuned_hyperparameters = {
    'feature_dim': dim_movie,
    'num_factors': 8,
    'predictor_type': 'regressor',
    'mini_batch_size': 994,
    'epochs': 91,
    'bias_init_method': 'normal',
    'bias_lr': 0.21899531189430518,
    'factors_init_method': 'normal',
    'factors_lr': 5.357593337770278e-05,
    'linear_init_method': 'normal',
    'linear_lr': 0.00021524948053767607,
}
estimator.set_hyperparameters(**tuned_hyperparameters)

In [17]:
# Display the hyperparameters currently set on the estimator
estimator.hyperparameters()

{'feature_dim': 10334,
 'num_factors': 8,
 'predictor_type': 'regressor',
 'mini_batch_size': 994,
 'epochs': 91,
 'bias_init_method': 'normal',
 'bias_lr': 0.21899531189430518,
 'factors_init_method': 'normal',
 'factors_lr': 5.357593337770278e-05,
 'linear_init_method': 'normal',
 'linear_lr': 0.00021524948053767607}

## Train the Model

In [18]:
# Train with the new hyperparameters, feeding the 'train' and 'test' channels from S3.
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
training_channels = {
    'train': s3_training_file_location,
    'test': s3_test_file_location,
}
estimator.fit(training_channels)

2020-05-07 21:10:45 Starting - Starting the training job...
2020-05-07 21:10:46 Starting - Launching requested ML instances......
2020-05-07 21:11:47 Starting - Preparing the instances for training...
2020-05-07 21:12:40 Downloading - Downloading input data...
2020-05-07 21:13:14 Training - Downloading the training image...
2020-05-07 21:13:33 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[05/07/2020 21:13:35 INFO 139783205549888] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'li

## Deploy the Model

In [19]:
# Deploy the trained model to an endpoint named 'fm-movie-v4', hosted on one ml.m4.xlarge instance.
# NOTE(review): no endpoint cleanup is visible in this part of the notebook - confirm it is deleted later.
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'fm-movie-v4')

-------------!

## Run Predictions

### Dense and Sparse Formats

Ref: [Common Data Formats for Inference](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html)

In [20]:
import json
from sagemaker.predictor import json_deserializer

def fm_sparse_serializer(data, feature_dim=None):
    """Serialize rows of active feature indices into the sparse JSON request format.

    Each row becomes one entry of ``instances`` with the row's indices as
    ``keys``, a value of 1 for every key, and a one-element ``shape``.

    Parameters:
        data: iterable of rows; each row is a 1-D sequence of feature indices
            (numpy array, list or tuple).
        feature_dim: total feature dimension written into ``shape``. When
            omitted, the notebook-level ``dim_movie`` is used, so the function
            can still be assigned directly as a single-argument serializer.

    Returns:
        A JSON string of the form
        ``{"instances": [{"data": {"features": {"keys": [...], "shape": [...], "values": [...]}}}]}``.
    """
    if feature_dim is None:
        # Fall back to the dimension loaded earlier in the notebook
        feature_dim = dim_movie

    instances = []
    for row in data:
        # np.asarray lets plain lists/tuples through as well as numpy arrays;
        # tolist() yields native Python numbers that json can encode
        keys = np.asarray(row).tolist()
        # Every listed index is an active feature with value 1
        values = [1] * len(keys)
        instances.append({'data': {'features': {'keys': keys, 'shape': [feature_dim], 'values': values}}})
    return json.dumps({'instances': instances})

In [21]:
# Send requests as JSON built by fm_sparse_serializer and decode responses with json_deserializer
predictor.content_type = 'application/json'
predictor.serializer = fm_sparse_serializer
predictor.deserializer = json_deserializer

In [22]:
# Preview the JSON payload produced for a single row with two active feature indices
# (presumably one user index and one movie index - confirm against the data preparation step)
fm_sparse_serializer([np.array([341,1416])])

'{"instances": [{"data": {"features": {"keys": [341, 1416], "shape": [10334], "values": [1, 1]}}}]}'

In [23]:
# Let's test with a few entries from the test file.
# The movie dataset is updated regularly, so instead of hard-coding a user ID and movie ID,
# use actual values from the file.

# Each row is in this format: ['2.5', '426:1', '943:1']
# ActualRating, UserID, MovieID

with open(r'ml-latest-small/user_movie_test.svm','r') as f:
    # zip() against range(3) caps the loop at three rows and simply stops early when the
    # file is shorter, instead of indexing into the empty result of a readline() at EOF.
    for i, line in zip(range(3), f):
        rating = line.split()
        if len(rating) < 3:
            # Blank or incomplete row: there is no user/movie pair to predict for
            continue
        print(rating)
        print ('***Actual Rating:',rating[0])
        # UserID, MovieID: the part before ':' in each 'index:value' token
        userID = rating[1].split(':')[0]
        movieID = rating[2].split(':')[0]
        predicted_rating = predictor.predict([np.array([int(userID),int(movieID)])])
        print('***Predicted Rating:', predicted_rating)

['4', '561:1', '2822:1']
***Actual Rating: 4
***Predicted Rating: {'predictions': [{'score': 3.306558609008789}]}
['3.5', '473:1', '2600:1']
***Actual Rating: 3.5
***Predicted Rating: {'predictions': [{'score': 3.339083433151245}]}
['4.5', '361:1', '2548:1']
***Actual Rating: 4.5
***Predicted Rating: {'predictions': [{'score': 4.247658729553223}]}
