# Movie recommendation with Factorization Machines on Amazon SageMaker

In this notebook, we are going to build a simple movie recommendation model with Factorization machine using Amazon Sagemaker

In [1]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix

Dataset Description including number of users/moview are in official [Movielens website](https://grouplens.org/datasets/movielens/)

In [3]:
sess   = sagemaker.Session()
bucket = sess.default_bucket()                     
prefix = 'sagemaker/movielens'
region = boto3.Session().region_name
role = 'arn:aws:iam::570447867175:role/SageMakerNotebookRole' # pass your IAM role name

print('Sagemaker session :', sess)
print('S3 bucket :', bucket)
print('Prefix :', prefix)
print('Region selected :', region)
print('IAM role :', role)

Sagemaker session : <sagemaker.session.Session object at 0x000002F746292D48>
S3 bucket : sagemaker-us-west-2-570447867175
Prefix : sagemaker/movielens
Region selected : us-west-2
IAM role : arn:aws:iam::570447867175:role/SageMakerNotebookRole


In [4]:
nbUsers=943
nbMovies=1682
nbFeatures=nbUsers+nbMovies

nbRatingsTrain=90570
nbRatingsTest=9430

In [6]:
# For each user, build a list of rated movies.
# We'd need this to add random negative samples.
moviesByUser = {}
for userId in range(nbUsers):
    moviesByUser[str(userId)]=[]

with open('./data/ml-100k/ua.base','r') as f:
    samples=csv.reader(f,delimiter='\t')
    for userId,movieId,rating,timestamp in samples:
        moviesByUser[str(int(userId)-1)].append(int(movieId)-1) 

In [7]:
def loadDataset(filename, lines, columns):
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((lines, columns)).astype('float32')
    # Labels are stored in a vector
    Y = []
    line=0
    with open(filename,'r') as f:
        samples=csv.reader(f,delimiter='\t')
        for userId,movieId,rating,timestamp in samples:
            X[line,int(userId)-1] = 1
            X[line,int(nbUsers)+int(movieId)-1] = 1
            if int(rating) >= 4:
                Y.append(1)
            else:
                Y.append(0)
            line=line+1
            
    Y=np.array(Y).astype('float32')
    return X,Y

In [9]:
X_train, Y_train = loadDataset('./data/ml-100k/ua.base', nbRatingsTrain, nbFeatures)
X_test, Y_test = loadDataset('./data/ml-100k/ua.test',nbRatingsTest,nbFeatures)

In [10]:
print(X_train.shape)
print(Y_train.shape)

assert X_train.shape == (nbRatingsTrain, nbFeatures)
assert Y_train.shape == (nbRatingsTrain, )
zero_labels = np.count_nonzero(Y_train)

print("Training labels: %d zeros, %d ones" % (zero_labels, nbRatingsTrain-zero_labels))

print(X_test.shape)
print(Y_test.shape)

assert X_test.shape  == (nbRatingsTest, nbFeatures)
assert Y_test.shape  == (nbRatingsTest, )
zero_labels = np.count_nonzero(Y_test)
print("Test labels: %d zeros, %d ones" % (zero_labels, nbRatingsTest-zero_labels))

(90570, 2625)
(90570,)
Training labels: 49906 zeros, 40664 ones
(9430, 2625)
(9430,)
Test labels: 5469 zeros, 3961 ones


# 3. Convert to protobuf and upload data to S3 bucket

In [11]:
train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train3')

test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test3')

output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)

In [12]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)
    
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)    
test_data  = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)    
  
print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

s3://sagemaker-us-west-2-570447867175/sagemaker/movielens/train3/train.protobuf
s3://sagemaker-us-west-2-570447867175/sagemaker/movielens/test3/test.protobuf
Output: s3://sagemaker-us-west-2-570447867175/sagemaker/movielens/output


# 4. Start Training

In [15]:
# Specify Docker Contatiner
container = sagemaker.amazon.amazon_estimator.get_image_uri(
    boto3.Session().region_name, "factorization-machines", "latest")
print(container)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest


In [18]:
from sagemaker.estimator import Estimator
# https://sagemaker.readthedocs.io/en/stable/estimators.html

## ==== ##
# Make sure to use a non-GPU instance, because Training with a sparse data set is only possible on a non-GPU instance.
## ==== ##

fm = Estimator(container,                             # The contatiner that contatins algorithm
               role = role,                           # your IAM role
               train_instance_count=1,                # Instance requirements
               train_instance_type='ml.c4.xlarge' ,   # Instance type (GPU instances like 'ml.p2.xlarge' would not work in this case)
               output_path=output_prefix,
               sagemaker_session=sess)

fm.set_hyperparameters(feature_dim=nbFeatures,
                      predictor_type='binary_classifier',
                      num_factors=64,
                      epochs=10)

fm.fit({'train': train_data, 'test': test_data})



2020-06-18 23:58:24 Starting - Starting the training job...
2020-06-18 23:58:27 Starting - Launching requested ML instances......
2020-06-18 23:59:41 Starting - Preparing the instances for training......
2020-06-19 00:00:56 Downloading - Downloading input data
2020-06-19 00:00:56 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[06/19/2020 00:01:14 INFO 140432659142464] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metric': u'', 

### We should be able to check the logs of how Sagemaker takes cares of training like above

# 5. Model Deploy 

You can easily deploy the model with **.deploy** method. This creates a RESTful HTTP endpoint that can be intergrated to any of the applications that we are trying to use. You can also check whether the model is created in your Sagemaker console.

In [None]:
from time import strftime, gmtime
timestamp = strftime('%d-%H-%M-%S', gmtime())

fm_predictor = fm.deploy(
    endpoint_name = 'movielens-{}'.format(timestamp),
    initial_instance_count=1,
    instance_type='ml.p2.xlarge')

# 6. Predicting with the test set

In [None]:
result = fm_predictor.predict(X_test[1000:1010].toarray())
print(result)