# Movie recommendation on Amazon SageMaker with Factorization Machines

### 1.Download the dataset

In [2]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip

--2023-01-22 22:56:32--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-01-22 22:56:33 (14.1 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

### 2.Inspect the dataset

In [3]:
%cd ml-100k
!shuf ua.base -o ua.base.shuffled
!head -10 ua.base.shuffled

/root/matrix-factor/ml-100k
65	237	4	879217320
773	92	4	888540041
392	632	5	891039015
870	50	3	875050865
674	289	2	887763151
406	971	3	879793328
276	975	3	874836629
819	246	4	884012614
437	418	3	880141084
865	625	1	880235099


In [4]:
!head -10 ua.test

1	20	4	887431883
1	33	4	878542699
1	61	4	878542420
1	117	3	874965739
1	155	2	878542201
1	160	4	875072547
1	171	5	889751711
1	189	3	888732928
1	202	5	875072442
1	265	4	878542441


In [5]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix

### 3. Build training set and test set

In [6]:
nbUsers=943
nbMovies=1682
nbFeatures=nbUsers+nbMovies

nbRatingsTrain=90570
nbRatingsTest=9430

In [10]:
# For each user, build a list of rated movies.
# We'd need this to add random negative samples.
moviesByUser = {}
for userId in range(nbUsers):
    moviesByUser[str(userId)]=[]

with open('ua.base.shuffled','r') as f:
    samples=csv.reader(f,delimiter='\t')
    for userId,movieId,rating,timestamp in samples:
        moviesByUser[str(int(userId)-1)].append(int(movieId)-1) 
        
moviesByUser

{'0': [110,
  92,
  123,
  141,
  4,
  173,
  143,
  171,
  2,
  134,
  214,
  184,
  71,
  6,
  178,
  55,
  69,
  147,
  119,
  198,
  43,
  21,
  48,
  90,
  49,
  1,
  59,
  262,
  257,
  166,
  135,
  75,
  102,
  7,
  111,
  53,
  12,
  268,
  87,
  241,
  176,
  226,
  52,
  45,
  26,
  108,
  82,
  27,
  155,
  253,
  30,
  246,
  129,
  54,
  118,
  190,
  266,
  66,
  221,
  76,
  81,
  29,
  195,
  256,
  233,
  46,
  202,
  239,
  252,
  112,
  152,
  196,
  140,
  88,
  243,
  177,
  236,
  8,
  180,
  197,
  121,
  164,
  179,
  149,
  65,
  67,
  234,
  73,
  163,
  153,
  139,
  96,
  91,
  106,
  40,
  44,
  231,
  42,
  210,
  57,
  174,
  251,
  205,
  167,
  115,
  62,
  20,
  38,
  227,
  148,
  187,
  213,
  261,
  142,
  161,
  242,
  237,
  182,
  144,
  189,
  156,
  130,
  194,
  103,
  126,
  157,
  185,
  229,
  78,
  206,
  238,
  93,
  186,
  36,
  137,
  18,
  208,
  181,
  114,
  56,
  101,
  70,
  259,
  212,
  25,
  270,
  86,
  47,
  84,
  172,
  151,

In [11]:
def loadDataset(filename, lines, columns):
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((lines, columns)).astype('float32')
    # Labels are stored in a vector
    Y = []
    line=0
    with open(filename,'r') as f:
        samples=csv.reader(f,delimiter='\t')
        for userId,movieId,rating,timestamp in samples:
            X[line,int(userId)-1] = 1
            X[line,int(nbUsers)+int(movieId)-1] = 1
            if int(rating) >= 4:
                Y.append(1)
            else:
                Y.append(0)
            line=line+1
            
    Y=np.array(Y).astype('float32')
    return X,Y

In [12]:
X_train, Y_train = loadDataset('ua.base.shuffled', nbRatingsTrain, nbFeatures)
X_test, Y_test = loadDataset('ua.test',nbRatingsTest,nbFeatures)

In [15]:
print(X_train.shape)
print(Y_train.shape)
assert X_train.shape == (nbRatingsTrain, nbFeatures)
assert Y_train.shape == (nbRatingsTrain, )
zero_labels = np.count_nonzero(Y_train)
print("Training labels: %d zeros, %d ones" % (zero_labels, nbRatingsTrain-zero_labels))

print(X_test.shape)
print(Y_test.shape)
assert X_test.shape  == (nbRatingsTest, nbFeatures)
assert Y_test.shape  == (nbRatingsTest, )
zero_labels = np.count_nonzero(Y_test)
print("Test labels: %d zeros, %d ones" % (zero_labels, nbRatingsTest-zero_labels))

(90570, 2625)
(90570,)
Training labels: 49906 zeros, 40664 ones
(9430, 2625)
(9430,)
Test labels: 5469 zeros, 3961 ones


### 4. Convert to protobuf and save to S3

In [18]:
sess = sagemaker.Session()

bucket = sess.default_bucket()
prefix = 'sagemaker/fm-movielens'

train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train3')

test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test3')

output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)

In [19]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)
    
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)    
test_data  = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)    
  
print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

s3://sagemaker-us-west-2-376678947624/sagemaker/fm-movielens/train3/train.protobuf
s3://sagemaker-us-west-2-376678947624/sagemaker/fm-movielens/test3/test.protobuf
Output: s3://sagemaker-us-west-2-376678947624/sagemaker/fm-movielens/output


### 5. Run training job

In [20]:
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}

In [21]:
fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                   get_execution_role(), 
                                   train_instance_count=1, 
                                   train_instance_type='ml.m4.xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

fm.set_hyperparameters(feature_dim=nbFeatures,
                      predictor_type='binary_classifier',
                      mini_batch_size=1000,
                      num_factors=64,
                      epochs=100)

fm.fit({'train': train_data, 'test': test_data})

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2023-01-22 23:14:36 Starting - Starting the training job...ProfilerReport-1674429275: InProgress
...
2023-01-22 23:15:19 Starting - Preparing the instances for training......
2023-01-22 23:16:32 Downloading - Downloading input data...
2023-01-22 23:16:59 Training - Downloading the training image............
2023-01-22 23:19:00 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[01/22/2023 23:19:05 INFO 140499074012992] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_init_sigma': '0.01', 'linear_init_method': 'normal', 'linear_init_sigma': '0

### 6. Deploy model

In [30]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

class FMSerializer(JSONSerializer):
    def serialize(self, data):
        js = {'instances': []}
        for row in data:
            js['instances'].append({'features': row.tolist()})
        return json.dumps(js)

In [31]:
fm_predictor = fm.deploy(instance_type='ml.t2.medium', initial_instance_count=1, 
    serializer=FMSerializer(),
    deserializer=JSONDeserializer())
# ml.c4.xlarge

--------------------!

### 7. Run predictions

In [32]:
result = fm_predictor.predict(X_test[1000:1010].toarray())
print(result)
print (Y_test[1000:1010])

{'predictions': [{'score': 0.6806391477584839, 'predicted_label': 1.0}, {'score': 0.20273490250110626, 'predicted_label': 0.0}, {'score': 0.2453707605600357, 'predicted_label': 0.0}, {'score': 0.6332478523254395, 'predicted_label': 1.0}, {'score': 0.543194591999054, 'predicted_label': 1.0}, {'score': 0.16069695353507996, 'predicted_label': 0.0}, {'score': 0.4053014814853668, 'predicted_label': 0.0}, {'score': 0.49777984619140625, 'predicted_label': 0.0}, {'score': 0.35441723465919495, 'predicted_label': 0.0}, {'score': 0.1341874748468399, 'predicted_label': 0.0}]}
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
