In this tutorial, we build a simple matrix factorization model on the MovieLens 100K dataset using TensorFlow Recommenders (TFRS). We can then use this model to recommend movies for a given user.

### Import TFRS
First, install and import TFRS:

In [2]:
# One-time setup: uncomment to install the dependencies. Prefer `%pip` over
# `!pip` so the packages are installed into the running kernel's environment.
# %pip install tensorflow-recommenders
# %pip install --upgrade tensorflow-datasets

In [4]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

### Read the data

In [5]:
# Ratings data: one record per rating event, carrying user, movie and rating
# features. Only the 'train' split is requested.
ratings = tfds.load('movielens/100k-ratings', split='train')

In [17]:
# Check what kind of object tfds.load returned.
type(ratings)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [30]:
# Number of rating records in the dataset.
ratings.cardinality().numpy()

100000

In [24]:
# Peek at the first rating record, converted to NumPy / Python values.
next(ratings.as_numpy_iterator())

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}

In [20]:
# Features of all the available movies (genres, id and title per record).
# Only the 'train' split is requested.
movies = tfds.load('movielens/100k-movies', split='train')

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /home/aman/tensorflow_datasets/movielens/100k-movies/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling movielens-train.tfrecord...:   0%|          | 0/1682 [00:00<?, ? examples/s]

[1mDataset movielens downloaded and prepared to /home/aman/tensorflow_datasets/movielens/100k-movies/0.1.0. Subsequent calls will reuse this data.[0m


In [26]:
# Peek at the first movie record, converted to NumPy / Python values.
next(movies.as_numpy_iterator())

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}

In [32]:
# Number of movie records, read from the dataset's cardinality.
movies.cardinality().numpy()

1682

In [33]:
# Same count via len(); works here because the dataset's size is known.
len(movies)

1682

In [34]:
# Show one movie record as raw tf.Tensor values ...
for movie_record in movies.take(1):
    print(movie_record)

# ... then the first 20 records converted to NumPy values.
for movie_record in movies.take(20).as_numpy_iterator():
    print(movie_record)

{'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>, 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1681'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>}
{'movie_genres': array([4]), 'movie_id': b'1681', 'movie_title': b'You So Crazy (1994)'}
{'movie_genres': array([4, 7]), 'movie_id': b'1457', 'movie_title': b'Love Is All There Is (1996)'}
{'movie_genres': array([1, 3]), 'movie_id': b'500', 'movie_title': b'Fly Away Home (1996)'}
{'movie_genres': array([0]), 'movie_id': b'838', 'movie_title': b'In the Line of Duty 2 (1987)'}
{'movie_genres': array([7]), 'movie_id': b'1648', 'movie_title': b'Niagara, Niagara (1997)'}
{'movie_genres': array([5]), 'movie_id': b'547', 'movie_title': b"Young Poisoner's Handbook, The (1995)"}
{'movie_genres': array([7]), 'movie_id': b'387', 'movie_title': b'Age of Innocence, The (1993)'}
{'movie_genres': array([7]), 'movie_id': b'1495', 'movie_title': b'Flirt (1995)'}
{'movie_genres': array([7

In [36]:
# Keep only the two features used downstream: the movie title and the user id.
ratings = ratings.map(
    lambda rating: {
        'movie_title': rating['movie_title'],
        'user_id': rating['user_id'],
    }
)
next(ratings.as_numpy_iterator())

{'movie_title': b"One Flew Over the Cuckoo's Nest (1975)", 'user_id': b'138'}

In [37]:
# Reduce every movie record to just its title.
# NOTE(review): this rebinds `movies` to a dataset of bare title strings, so
# re-running this cell without reloading `movies` first presumably fails
# (there is no 'movie_title' key left to index) - confirm.
movies = movies.map(lambda movie: movie['movie_title'])

# First five titles as tf.Tensor objects ...
for title in movies.take(5):
    print(title)

# ... and the same five as raw byte strings.
for title in movies.take(5).as_numpy_iterator():
    print(title)

tf.Tensor(b'You So Crazy (1994)', shape=(), dtype=string)
tf.Tensor(b'Love Is All There Is (1996)', shape=(), dtype=string)
tf.Tensor(b'Fly Away Home (1996)', shape=(), dtype=string)
tf.Tensor(b'In the Line of Duty 2 (1987)', shape=(), dtype=string)
tf.Tensor(b'Niagara, Niagara (1997)', shape=(), dtype=string)
b'You So Crazy (1994)'
b'Love Is All There Is (1996)'
b'Fly Away Home (1996)'
b'In the Line of Duty 2 (1987)'
b'Niagara, Niagara (1997)'


In [58]:
# Despite its name, `ratings_map` is a dataset of bare user ids (one per
# rating). It is the input used to adapt the user-id vocabulary.
ratings_map = ratings.map(lambda x: x['user_id'])

Build vocabularies to convert user ids and movie titles into integer indices for embedding layers:

In [60]:
# Vocabulary layer that turns raw user-id strings into integer indices.
# Uses the stable `tf.keras.layers.StringLookup` path; the
# `experimental.preprocessing` alias is deprecated.
# `mask_token=None`: no vocabulary slot is reserved for a mask value.
user_ids_vocab = tf.keras.layers.StringLookup(mask_token=None)
# Learn the vocabulary from the user ids seen in the ratings.
user_ids_vocab.adapt(ratings_map)

In [62]:
user_ids_vocab

<keras.layers.preprocessing.string_lookup.StringLookup at 0x7f6c52753a60>

In [42]:
# Vocabulary layer that turns raw movie-title strings into integer indices.
# Uses the stable `tf.keras.layers.StringLookup` path; the
# `experimental.preprocessing` alias is deprecated.
# `mask_token=None`: no vocabulary slot is reserved for a mask value.
movies_ids_vocab = tf.keras.layers.StringLookup(mask_token=None)
# Learn the vocabulary from the movie titles.
movies_ids_vocab.adapt(movies)

In [43]:
# Width of the embedding vectors, shared by both towers.
emb_dim = 64

# Each tower is: raw string -> integer index (vocabulary) -> dense vector.
user_model = tf.keras.Sequential([
    user_ids_vocab,
    tf.keras.layers.Embedding(
        input_dim=user_ids_vocab.vocabulary_size(),
        output_dim=emb_dim,
    ),
])
movies_model = tf.keras.Sequential([
    movies_ids_vocab,
    tf.keras.layers.Embedding(
        input_dim=movies_ids_vocab.vocabulary_size(),
        output_dim=emb_dim,
    ),
])

In [50]:
# Sanity check: push one movie title through the movie tower and print the
# resulting embedding vector. A descriptive loop variable is used instead of
# `_`, which would shadow IPython's last-output variable.
for movie_embedding in movies.take(1).map(movies_model):
    print(movie_embedding)

tf.Tensor(
[ 0.00488276 -0.02126308  0.0213975   0.02727533  0.01779744 -0.01035053
  0.03386256  0.01611245  0.02353081 -0.01910284 -0.01646175  0.01832548
  0.04249639 -0.02181196 -0.02966985 -0.00986928 -0.03669081  0.02527907
 -0.03758503 -0.03262364 -0.01505643 -0.00633571  0.04587204  0.02371586
 -0.04684955  0.00296987 -0.03228796  0.02646155  0.02738008  0.04429741
  0.01198268 -0.01738922  0.00433515 -0.0268191   0.0366516   0.04281712
 -0.00016258 -0.02289463  0.01001792  0.03003721  0.03990978  0.02464109
 -0.04178623 -0.0067494   0.01700825 -0.02018547 -0.02889882 -0.02941214
 -0.0332037  -0.04823159 -0.03958845 -0.02992287 -0.03452659  0.0463689
  0.00271047 -0.02308537  0.01107264  0.04291209  0.01981464  0.04710691
 -0.00619587  0.00649364  0.01641463  0.00102947], shape=(64,), dtype=float32)


In [55]:
# User id of the first rating record.
next(ratings.as_numpy_iterator())['user_id']

b'138'

In [63]:
%%time
# for _ in ratings.map(lambda x: x['user_id']).map(user_model):
for _ in ratings_map.map(user_model):
    print(_)
    break

tf.Tensor(
[-0.0366211   0.00094957 -0.04706112  0.04736788 -0.04103935 -0.03584119
 -0.01867415  0.0168316  -0.03883757 -0.01939794 -0.04876237  0.0245409
  0.03740073  0.04787475  0.0398914   0.04754927  0.03827341 -0.03269073
  0.0436799   0.01101138 -0.02682875 -0.04531032 -0.01816456  0.03769524
  0.04040089 -0.03394334 -0.04823234  0.02967893  0.0279496   0.04936758
  0.02757091  0.03281725 -0.00936754 -0.03276924  0.04045517 -0.04859971
  0.04373621 -0.01043211 -0.04799215  0.04669709 -0.00698922 -0.03767061
  0.02569452  0.02091347 -0.00738947  0.00604408  0.02939544  0.01218835
 -0.00622797  0.03101913  0.00196692  0.01752147  0.01200528 -0.04688732
 -0.02778424 -0.00863976  0.02190168  0.03014113 -0.01168313  0.02974044
  0.03915513  0.03425856 -0.03021585 -0.0245313 ], shape=(64,), dtype=float32)
CPU times: user 71.9 ms, sys: 12 ms, total: 83.9 ms
Wall time: 79.1 ms


Define the two models and the retrieval task.

In [None]:
# Width of the embedding vectors, shared by both towers.
emb_dim = 64

# Define user and movie models.
# NOTE: running this cell rebinds `user_model` and `movies_model` to new
# Sequential models with newly created Embedding layers; the already-adapted
# vocabulary layers are reused.
user_model = tf.keras.Sequential([
    user_ids_vocab,
    tf.keras.layers.Embedding(
        input_dim=user_ids_vocab.vocabulary_size(),
        output_dim=emb_dim,
    ),
])

movies_model = tf.keras.Sequential([
    movies_ids_vocab,
    tf.keras.layers.Embedding(
        input_dim=movies_ids_vocab.vocabulary_size(),
        output_dim=emb_dim,
    ),
])

### Metrics

### Very important and insightful
In our training data we have positive (user, movie) pairs. To figure out how good our model is, we need to compare the affinity score that the model calculates for this pair to the scores of all the other possible candidates: if the score for the positive pair is higher than for all other candidates, our model is highly accurate.

To do this, we can use the `tfrs.metrics.FactorizedTopK` metric. The metric has one required argument: the dataset of candidates that are used as implicit negatives for evaluation.

In our case, that's the `movies` dataset, converted into embeddings via our movie model:

In [68]:
# Retrieval task object: a convenience wrapper that bundles together the loss
# function and the metric computation.
#
# For a more complete application and a more elaborate explanation, see
# https://github.com/kewlcoder/recommenders/blob/main/docs/examples/basic_retrieval_mine.ipynb
#
# FactorizedTopK is defined here:
# https://github.com/tensorflow/recommenders/blob/v0.6.0/tensorflow_recommenders/metrics/factorized_top_k.py#L25-L108
#
# It computes metrics across the top-K candidates surfaced by a retrieval
# model. The default metric is top-K categorical accuracy: how often the true
# candidate is among the top K candidates for a given query, i.e. whether the
# positive data point (the true label) lands in the top K.
#
# For example: how frequently the affinity score between user_embeddings[i]
# and movies_embeddings[i] is among the K highest scores between
# user_embeddings[i] and all elements of `movies`.

# Define your objectives. The candidate set is every movie title, embedded in
# batches of 128 by the movie tower.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(
        movies.batch(128).map(movies_model)
    )
)

### Define a model
We can define a TFRS model by inheriting from tfrs.Model and implementing the compute_loss method:

In [69]:
class MovieLensModel(tfrs.Model):
    """Two-tower retrieval model: a user tower, a movie tower and a task.

    The towers turn raw features into embeddings; the task object turns a
    pair of embedding batches into the value returned by `compute_loss`.
    """

    def __init__(self, user_model: tf.keras.Model, movies_model: tf.keras.Model, task: tfrs.tasks.Retrieval):
        """Keep references to both towers and the retrieval task.

        Args:
            user_model: model applied to `features['user_id']`.
            movies_model: model applied to `features['movie_title']`.
            task: callable applied to the (user, movie) embedding pair.
        """
        super().__init__()

        self.user_model = user_model
        self.movies_model = movies_model

        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the loss for one batch of ratings.

        See the description in the `Model` base class for the contract:
        https://github.com/tensorflow/recommenders/blob/main/tensorflow_recommenders/models/base.py

        Args:
            features: dict of feature tensors; only the 'user_id' and
                'movie_title' entries are read.
            training: accepted as part of the signature; not used in this
                implementation.

        Returns:
            The result of calling `self.task` on the user and movie embeddings.
        """
        # NOTE(review): in the recorded training run this message appears only
        # twice across five epochs, presumably because it fires while the
        # training step is being traced rather than on every batch - confirm.
        print("computing loss")

        # Embed both sides of every rating in the batch.
        query_embeddings = self.user_model(features['user_id'])
        candidate_embeddings = self.movies_model(features['movie_title'])

        return self.task(query_embeddings, candidate_embeddings)
    

### Fit and evaluate it.
Create the model, train it, and generate predictions:

In [66]:
# Assemble the model from the two towers and the retrieval task.
model = MovieLensModel(user_model=user_model, movies_model=movies_model, task=task)

# Adagrad optimizer with a learning rate of 0.5.
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5))

# Train for 5 epochs on the ratings, 4096 examples per batch.
model.fit(ratings.batch(4096), epochs=5)

Epoch 1/5
computing loss
computing loss
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6c527a2e20>

In [75]:
%%time

# Use brute-force search to set up retrieval using the trained representations.
# default k = 10; pass the query model
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# First arg - candidate embeddings, second arg - identifiers/names of candidate emb.
# old code version
# index.index(movies.batch(100).map(movies_model), movies)


# Build the retrieval index.
index.index_from_dataset(
    movies.batch(100).map(lambda title: (title, model.movies_model(title))))


# calls the call() func.; passing quer(y|ies) / user_id
# Get some recommendations.
# Returns tuple of (top candidate scores, top candidate identifiers).
_, titles = index(np.array(['42']))

print("top rec. = ", titles[0, :3])

Cause: could not parse the source code of <function <lambda> at 0x7f6c5305eaf0>: no matching AST found


Cause: could not parse the source code of <function <lambda> at 0x7f6c5305eaf0>: no matching AST found


Cause: could not parse the source code of <function <lambda> at 0x7f6c5305eaf0>: no matching AST found
top rec. =  tf.Tensor([b'Rent-a-Kid (1995)' b'Just Cause (1995)' b'Shiloh (1997)'], shape=(3,), dtype=string)
CPU times: user 60 ms, sys: 14.1 ms, total: 74.1 ms
Wall time: 52.2 ms


In [76]:
# All recommended titles returned for the single query.
titles[0]

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'Rent-a-Kid (1995)', b'Just Cause (1995)', b'Shiloh (1997)',
       b'House Arrest (1996)',
       b'Winnie the Pooh and the Blustery Day (1968)',
       b'Clean Slate (1994)',
       b'Far From Home: The Adventures of Yellow Dog (1995)',
       b'Halloween: The Curse of Michael Myers (1995)', b'Jack (1996)',
       b'Unstrung Heroes (1995)'], dtype=object)>

In [77]:
# `_` here is the scores tensor bound by `_, titles = index(...)` in the
# retrieval cell, not IPython's automatic last-output variable.
_

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[4.7850003, 4.364386 , 4.2983904, 4.2590117, 3.5862286, 3.299794 ,
        3.2943478, 3.2723546, 3.140592 , 3.087105 ]], dtype=float32)>