# TensorFlow Recommenders tutorial

Following the tutorial: https://www.tensorflow.org/recommenders/examples/basic_retrieval

In [4]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs


## Read data

Original data source: https://grouplens.org/datasets/movielens/100k/

In [5]:
# One record per (user, movie) rating event.
ratings = tfds.load(name="movie_lens/100k-ratings", split="train")
# One record per movie in the catalogue.
movies = tfds.load(name="movie_lens/100k-movies", split="train")



In [6]:
# Pretty-print a single raw rating record to inspect the available features.
for rating_record in ratings.take(1).as_numpy_iterator():
    pprint.pprint(rating_record)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


In [7]:
# Pretty-print a single raw movie record to inspect the available features.
for movie_record in movies.take(1).as_numpy_iterator():
    pprint.pprint(movie_record)

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


## Feature selection

In [8]:
# Count elements by materializing each dataset once.
movie_count = len(list(movies))
rating_count = len(list(ratings))

print(f"Number of movies: {movie_count}")
print(f"Number of ratings: {rating_count}")

Number of movies: 1682
Number of ratings: 100000


In [9]:
# Keep only the two features used below: the user id and the movie title.
# NOTE(review): `movies` is rebound to a dataset of bare title tensors, so
# re-running this cell on the already-mapped dataset would presumably fail
# when indexing with "movie_title" - confirm and run it once per kernel.
ratings = ratings.map(
    lambda rating: {
        "movie_title": rating["movie_title"],
        "user_id": rating["user_id"],
    }
)
movies = movies.map(lambda movie: movie["movie_title"])

In [30]:
# NOTE: this materializes and displays every title in the dataset;
# `movies.take(n)` gives a shorter preview.
list(movies)

[<tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Love Is All There Is (1996)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Fly Away Home (1996)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'In the Line of Duty 2 (1987)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Niagara, Niagara (1997)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b"Young Poisoner's Handbook, The (1995)">,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Age of Innocence, The (1993)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Flirt (1995)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Frisk (1995)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'unknown'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Girls Town (1996)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Stonewall (1995)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'African Queen, The (1951)'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Bloody Child, The (1996)'>,
 <tf.Tensor: shape=

In [26]:
# Preview the first movie title as a plain NumPy value.
list(movies.take(1).as_numpy_iterator())

[b'You So Crazy (1994)']

In [27]:
# Preview the first rating record after feature selection.
list(ratings.take(1).as_numpy_iterator())

[{'movie_title': b"One Flew Over the Cuckoo's Nest (1975)", 'user_id': b'138'}]

## Train-test split

**TODO:** use a time-based split instead of the random split below.

In [10]:
# Fix the global seed and shuffle once with a fixed order so that the
# train and test slices below never overlap between iterations.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 80k shuffled ratings for training, the following 20k for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [11]:
# Group the scalar elements into batches so they can be pulled out as arrays.
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda batch: batch["user_id"])

# Join all batches into one array and deduplicate; these are the vocabularies.
unique_movie_titles = np.unique(
    np.concatenate(list(movie_titles.as_numpy_iterator()))
)
unique_user_ids = np.unique(
    np.concatenate(list(user_ids.as_numpy_iterator()))
)

unique_movie_titles[:10]

array([b"'Til There Was You (1997)", b'1-900 (1994)',
       b'101 Dalmatians (1996)', b'12 Angry Men (1957)', b'187 (1997)',
       b'2 Days in the Valley (1996)',
       b'20,000 Leagues Under the Sea (1954)',
       b'2001: A Space Odyssey (1968)',
       b'3 Ninjas: High Noon At Mega Mountain (1998)',
       b'39 Steps, The (1935)'], dtype=object)

In [13]:
# Check the Python type of a single user-id vocabulary entry.
type(unique_user_ids[0])

bytes

In [49]:
# Report the size of each vocabulary.
unique_movie_count = len(unique_movie_titles)
unique_user_count = len(unique_user_ids)

print(f"Number of unique movies: {unique_movie_count}")
print(f"Number of unique users: {unique_user_count}")

Number of unique movies: 1664
Number of unique users: 943


The first step is to decide on the dimensionality of the query and candidate representations. Higher values will correspond to models that may be more accurate, but will also be slower to fit and more prone to overfitting.

In [48]:
# Length of the embedding vectors produced by both the user and movie towers.
embedding_dimension = 32

In [51]:
# Query tower: raw user-id string -> vocabulary index -> embedding vector.
user_model = tf.keras.Sequential(
    [
        tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=unique_user_ids,
            mask_token=None,
        ),
        # One extra embedding row to account for unknown tokens.
        tf.keras.layers.Embedding(
            input_dim=len(unique_user_ids) + 1,
            output_dim=embedding_dimension,
        ),
    ]
)

In [52]:
# Candidate tower: raw movie-title string -> vocabulary index -> embedding vector.
movie_model = tf.keras.Sequential(
    [
        tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=unique_movie_titles,
            mask_token=None,
        ),
        # One extra embedding row to account for unknown tokens.
        tf.keras.layers.Embedding(
            input_dim=len(unique_movie_titles) + 1,
            output_dim=embedding_dimension,
        ),
    ]
)

In [53]:
# Top-K categorical accuracy: how often the true candidate is among the
# K highest-scoring candidates for a given query. The candidate set is
# every movie title, embedded by the candidate tower in batches of 128.
metrics = tfrs.metrics.FactorizedTopK(candidates=movies.batch(128).map(movie_model))

In [29]:
# Count the movie titles by materializing the dataset.
len(list(movies))

1682

This task defines models that facilitate efficient retrieval of candidates from large corpora by maintaining a two-tower, factorized structure: separate query and candidate representation towers, joined at the top via a lightweight scoring function. The default loss function is `tf.keras.losses.CategoricalCrossentropy`.

In [54]:
# Retrieval task that computes the loss and updates the top-K metrics.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [66]:
class MovielensModel(tfrs.Model):
    """Two-tower retrieval model pairing a user tower with a movie tower.

    The user tower embeds `features["user_id"]`, the movie tower embeds
    `features["movie_title"]`, and the task turns the two embeddings into
    a loss (and updates its metrics).
    """

    def __init__(self, user_model, movie_model, retrieval_task=None):
        """Store the two towers and the task used to compute the loss.

        Args:
            user_model: Model mapping raw user ids to query embeddings.
            movie_model: Model mapping raw movie titles to candidate embeddings.
            retrieval_task: Optional task layer called with the two embeddings.
                When omitted, the notebook-level `task` is used, which keeps
                existing two-argument calls working.
        """
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        # Prefer an explicitly injected task; otherwise fall back to the
        # notebook-level `task` so the class does not silently require it
        # when a caller supplies their own.
        self.task: tf.keras.layers.Layer = (
            task if retrieval_task is None else retrieval_task
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the task loss for one batch of features.

        Args:
            features: Mapping that provides the "user_id" and "movie_title"
                entries read below.
            training: Accepted for interface compatibility; not used here.

        Returns:
            Whatever `self.task` returns for the two embedding batches.
        """
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_id"])
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_movie_embeddings = self.movie_model(features["movie_title"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_movie_embeddings)

In [67]:
# Instantiate the two-tower model from the user and movie towers.
model = MovielensModel(user_model, movie_model)

## Fit and evaluate

In [69]:
# Build a fresh model instance and attach the Adagrad optimizer used for training.
model = MovielensModel(user_model, movie_model)
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

In [16]:
# Shuffle, batch and cache the training data; batch and cache the test data.
# NOTE(review): `.cache()` comes after `.shuffle()`, so the batches produced
# on the first pass are presumably what every later epoch replays - confirm
# that a fixed batch order across epochs is intended.
cached_train = (
    train
    .shuffle(100_000)
    .batch(8192)
    .cache()
)
cached_test = (
    test
    .batch(4096)
    .cache()
)

In [25]:
# Number of individual training examples.
len(list(train))

80000

In [26]:
# Number of elements in `cached_train` (each element is one batch).
len(list(cached_train))

10

In [71]:
# Train for three epochs over the cached training batches.
model.fit(cached_train, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x140749790>

As the model trains, the loss is falling and a set of top-k retrieval metrics is updated. These tell us whether the true positive is in the top-k retrieved items from the entire candidate set. For example, a top-5 categorical accuracy metric of 0.2 would tell us that, on average, the true positive is in the top 5 retrieved items 20% of the time.

In [72]:
# Evaluate on the held-out batches; return_dict=True returns the results keyed by name.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k': array([0.0011 , 0.0105 , 0.0221 , 0.12225, 0.2396 ], dtype=float32),
 'factorized_top_k/top_1_categorical_accuracy': 0.0010999999940395355,
 'factorized_top_k/top_5_categorical_accuracy': 0.010499999858438969,
 'factorized_top_k/top_10_categorical_accuracy': 0.022099999710917473,
 'factorized_top_k/top_50_categorical_accuracy': 0.12224999815225601,
 'factorized_top_k/top_100_categorical_accuracy': 0.23960000276565552,
 'loss': 28227.279296875,
 'regularization_loss': 0,
 'total_loss': 28227.279296875}

## Predictions

In [102]:
# Brute-force retrieval layer that takes raw query features (user ids)
# and embeds them with the trained user tower.
index = tfrs.layers.ann.BruteForce(query_model=model.user_model)

# Index every movie: embeddings come from the trained movie tower,
# identifiers are the movie titles themselves.
index.index(
    candidates=movies.batch(100).map(model.movie_model),
    identifiers=movies,
)

# Ask for recommendations for user "42" and show the top three titles.
_, titles = index(queries=tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")

Recommendations for user 42: [b'Bridges of Madison County, The (1995)' b'Rudy (1993)'
 b'Homeward Bound: The Incredible Journey (1993)']


In [109]:
# Same brute-force layer, but without a query model: queries are passed in
# as already-computed user embeddings instead of raw user ids.
index = tfrs.layers.ann.BruteForce()

# Index every movie: embeddings come from the trained movie tower,
# identifiers are the movie titles themselves.
index.index(
    candidates=movies.batch(100).map(model.movie_model),
    identifiers=movies,
)

# Embed user "42" explicitly with the user tower.
user_42_embedding = model.user_model(tf.constant("42"))

# Query the index with that embedding and show the top three titles.
_, titles = index(queries=[user_42_embedding])
print(f"Recommendations for user 42: {titles[0, :3]}")

Recommendations for user 42: [b'Bridges of Madison County, The (1995)' b'Rudy (1993)'
 b'Homeward Bound: The Incredible Journey (1993)']


## Save model

In [103]:
# Round-trip the query tower through a temporary directory: save it,
# load it back, and embed one user id with the reloaded copy.
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "query_model")

    model.user_model.save(path)
    loaded = tf.keras.models.load_model(path)

    query_embedding = loaded(tf.constant(["10"]))

    print(f"Query embedding: {query_embedding[0, :3]}")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: /var/folders/mm/gxmzkxqn7h1dj6fl_j0623480000gq/T/tmpjx_n64hf/query_model/assets


INFO:tensorflow:Assets written to: /var/folders/mm/gxmzkxqn7h1dj6fl_j0623480000gq/T/tmpjx_n64hf/query_model/assets






Query embedding: [0.43864694 0.03869034 0.02641623]


In [118]:
# Embed a single title with the trained candidate tower.
movie_embedding = model.movie_model(tf.constant(b'You So Crazy (1994)'))
# Bare last expression so the cell displays the embedding it just computed.
movie_embedding

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([ 0.11378269,  0.10946038, -0.09611955, -0.04803411,  0.00705649,
       -0.05529281, -0.10826959,  0.03463656,  0.00729864,  0.10602102,
        0.07266276, -0.02265502,  0.06871083, -0.05544332, -0.11621776,
       -0.03886645, -0.06722645, -0.11729315,  0.06618599, -0.09855856,
        0.0656407 , -0.1143479 , -0.12918611,  0.12472833, -0.00952297,
       -0.15671423,  0.15695775, -0.01657805, -0.09416191,  0.01256337,
       -0.00698943, -0.11238915], dtype=float32)>

In [122]:
# Pair every movie title with a running integer id and its candidate-tower embedding.
movie_embeddings = movies.enumerate().map(
    lambda movie_idx, movie_title: (
        movie_idx,
        movie_title,
        model.movie_model(movie_title),
    )
)





## Annoy for candidate model

In [125]:
from annoy import AnnoyIndex

# Approximate-nearest-neighbour index over the movie embeddings, scored by dot product.
index = AnnoyIndex(embedding_dimension, "dot")

# Single pass over the (id, title, embedding) triples: record the id -> title
# mapping and add the embedding to the index together, so the movie model
# is evaluated over the dataset only once.
movie_id_to_title = {}
for movie_id, movie_title, movie_embedding in movie_embeddings.as_numpy_iterator():
    movie_id_to_title[movie_id] = movie_title
    index.add_item(movie_id, movie_embedding)

# Build a 10-tree ANN index.
index.build(10)

True

In [137]:
# Query the Annoy index with the user embeddings of a few held-out ratings.
for row in test.batch(1).take(3):
    # Batches of one: take the single embedding out of the batch dimension.
    query_embedding = model.user_model(row["user_id"])[0]
    candidates = index.get_nns_by_vector(query_embedding, 3)
    # The value printed here is the user id, so label it as a user.
    print(f"User: {row['user_id']}")
    print(f"Candidates: {[movie_id_to_title[x] for x in candidates]}.\n")

Movie: [b'346']
Candidates: [b'Desperado (1995)', b'Quick and the Dead, The (1995)', b'Highlander (1986)'].

Movie: [b'602']
Candidates: [b"Dante's Peak (1997)", b'Jungle2Jungle (1997)', b'Saint, The (1997)'].

Movie: [b'393']
Candidates: [b'Tom and Huck (1995)', b'Little Big League (1994)', b'War, The (1994)'].

