In [2]:
!pip install scann

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
!pip install tensorflow-recommenders



You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
!pip install --upgrade tensorflow-datasets

Collecting tensorflow-datasets
  Downloading tensorflow_datasets-4.4.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 23.5 MB/s eta 0:00:01
[?25hCollecting tensorflow-metadata
  Downloading tensorflow_metadata-1.2.0-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 812 kB/s  eta 0:00:01
Collecting promise
  Downloading promise-2.3.tar.gz (19 kB)
Collecting importlib-resources
  Downloading importlib_resources-5.3.0-py3-none-any.whl (28 kB)
Collecting absl-py
  Downloading absl_py-0.12.0-py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 71.0 MB/s eta 0:00:01
[?25hCollecting googleapis-common-protos<2,>=1.52.0
  Downloading googleapis_common_protos-1.53.0-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 58.2 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: promise
  Building wheel for promise (setup.py) ... [?25ldone
[?25h  Created wheel for promise: filename=pr

In [6]:
!pip install tensorflow

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [7]:
from typing import Dict, Text

import os
import pprint
import tempfile

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [8]:
import tensorflow_recommenders as tfrs

In [9]:
# Private scratch directory for the on-disk tf.data caches below.
# tempfile.NamedTemporaryFile().name would create a file, drop the only
# reference to it (so the file is deleted again) and then reuse the now
# unreserved path; mkdtemp() instead hands back a directory that stays ours.
cache_dir = tempfile.mkdtemp(prefix="movielens_cache_")

# Load the MovieLens 100K data.
ratings = tfds.load(
    "movielens/100k-ratings",
    split="train"
)

# Get the ratings data.
ratings = (ratings
           # Retain only the fields we need.
           .map(lambda x: {"user_id": x["user_id"], "movie_title": x["movie_title"]})
           # Cache for efficiency.
           .cache(os.path.join(cache_dir, "ratings"))
)

# Get the movies data.
movies = tfds.load("movielens/100k-movies", split="train")
movies = (movies
          # Retain only the fields we need.
          .map(lambda x: x["movie_title"])
          # Cache for efficiency.
          .cache(os.path.join(cache_dir, "movies")))

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /home/ec2-user/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling movielens-train.tfrecord...:   0%|          | 0/100000 [00:00<?, ? examples/s]

[1mDataset movielens downloaded and prepared to /home/ec2-user/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m
[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /home/ec2-user/tensorflow_datasets/movielens/100k-movies/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling movielens-train.tfrecord...:   0%|          | 0/1682 [00:00<?, ? examples/s]

[1mDataset movielens downloaded and prepared to /home/ec2-user/tensorflow_datasets/movielens/100k-movies/0.1.0. Subsequent calls will reuse this data.[0m


In [10]:
# Collect the distinct movie titles and user ids as NumPy arrays; they are
# used as the lookup vocabularies when the model is built.
user_ids = ratings.map(lambda rating: rating["user_id"])

# Materialise each dataset in batches of 1000 before flattening it.
movie_title_batches = list(movies.batch(1000))
user_id_batches = list(user_ids.batch(1000))

unique_movie_titles = np.unique(np.concatenate(movie_title_batches))
unique_user_ids = np.unique(np.concatenate(user_id_batches))

In [11]:
tf.random.set_seed(42)

# A single seeded shuffle whose order is not redrawn on later iterations, so
# the two slices taken from it below stay disjoint.
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 80,000 interactions are used for training, the next 20,000 for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

### Model definition

In [13]:
class MovielensModel(tfrs.Model):
	"""Two-tower retrieval model over MovieLens user ids and movie titles.

	Each tower is a string lookup followed by an embedding layer, and a
	``tfrs.tasks.Retrieval`` task is used to obtain the loss and the
	factorized top-K metrics. Instantiation reads the notebook-level
	``unique_movie_titles``, ``unique_user_ids`` and ``movies`` objects.
	"""

	def __init__(self):
		"""Create the movie tower, the user tower and the retrieval task."""
		super().__init__()

		embedding_dim = 32

		# Movie tower: title string -> vocabulary index -> embedding vector.
		title_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
			vocabulary = unique_movie_titles, mask_token = None
		)
		# One extra embedding row so that unknown titles are covered as well.
		title_embedding = tf.keras.layers.Embedding(
			len(unique_movie_titles) + 1, embedding_dim
		)
		self.movie_model = tf.keras.Sequential([title_lookup, title_embedding])

		# User tower: same layout as the movie tower, keyed by user id.
		user_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
			vocabulary = unique_user_ids, mask_token = None
		)
		# One extra embedding row so that unknown user ids are covered as well.
		user_embedding = tf.keras.layers.Embedding(
			len(unique_user_ids) + 1, embedding_dim
		)
		self.user_model = tf.keras.Sequential([user_lookup, user_embedding])

		# Retrieval task; its top-K metrics use every movie, embedded by the
		# movie tower in batches of 128, as the candidate set.
		candidate_embeddings = movies.batch(128).cache().map(self.movie_model)
		top_k_metrics = tfrs.metrics.FactorizedTopK(candidates = candidate_embeddings)
		self.task = tfrs.tasks.Retrieval(metrics = top_k_metrics)

	def compute_loss(self, features: Dict[Text, tf.Tensor], training = False) -> tf.Tensor:
		"""Run both towers on a batch and hand the embeddings to the task.

		Args:
			features: Batch dictionary; its "user_id" and "movie_title"
				entries are read.
			training: Whether the call is made during training. Metrics are
				requested from the task only when this is False.

		Returns:
			The value returned by ``self.task`` for the two embedding batches.
		"""
		query_embeddings = self.user_model(features["user_id"])
		candidate_embeddings = self.movie_model(features["movie_title"])

		return self.task(query_embeddings, candidate_embeddings, compute_metrics = not training)


### Fitting and evaluation

In [14]:
# Instantiate the two-tower model and attach an Adagrad optimizer (learning rate 0.1).
model = MovielensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [15]:
# Train for three epochs on the training split, 8192 interactions per batch.
model.fit(train.batch(8192), epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f7c7cab5320>

In [16]:
# Evaluate on the held-out split; return_dict=True gives a {metric name: value} mapping.
model.evaluate(test.batch(8192), return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0012499999720603228,
 'factorized_top_k/top_5_categorical_accuracy': 0.009399999864399433,
 'factorized_top_k/top_10_categorical_accuracy': 0.02199999988079071,
 'factorized_top_k/top_50_categorical_accuracy': 0.1261499971151352,
 'factorized_top_k/top_100_categorical_accuracy': 0.2363000065088272,
 'loss': 28242.8359375,
 'regularization_loss': 0,
 'total_loss': 28242.8359375}

### Approximate prediction

In [17]:
# Retrieval layer that is given `model.user_model` as its query model, so it can be
# called with raw user ids. Its index is filled with (title, title embedding) pairs
# for the real movies, produced in batches of 128.
brute_force = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
brute_force.index_from_dataset(
    movies.batch(128).map(lambda title: (title, model.movie_model(title)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f7c7c094780>

In [18]:
# Get predictions for user 42.
# k=3 requests three results; the first return value is ignored and the second
# one holds the titles (row 0 belongs to the single query).
_, titles = brute_force(np.array(["42"]), k=3)

print(f"Top recommendations: {titles[0]}")

Top recommendations: [b'Homeward Bound: The Incredible Journey (1993)'
 b"Kid in King Arthur's Court, A (1995)" b'Rudy (1993)']


In [19]:
# Time a single-user, top-3 query against the index that holds only the real movies.
%timeit _, titles = brute_force(np.array(["42"]), k=3)

1.23 ms ± 1.33 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [20]:
# Construct a dataset of movies that's roughly 1,000 times larger: the real
# titles are followed by 1,000 repetitions of the movie list in which every
# title is replaced by tf.zeros_like(title), i.e. by dummy titles
# (presumably empty strings for a string tensor — TODO confirm).
lots_of_movies = tf.data.Dataset.concatenate(
    movies.batch(4096),
    movies.batch(4096).repeat(1_000).map(lambda x: tf.zeros_like(x))
)

# We also add lots of dummy embeddings by randomly perturbing
# the estimated embeddings for real movies: each repeated batch is embedded
# with the movie model and multiplied element-wise by uniform random numbers.
# The batch sizes and repeat counts mirror `lots_of_movies` so the two
# datasets can be zipped together element by element.
# NOTE(review): tf.random.uniform is evaluated inside the dataset pipeline, so
# the perturbation may differ every time this dataset is iterated — confirm
# whether the indexes built from it later all see identical embeddings.
lots_of_movies_embeddings = tf.data.Dataset.concatenate(
    movies.batch(4096).map(model.movie_model),
    movies.batch(4096).repeat(1_000)
      .map(lambda x: model.movie_model(x))
      .map(lambda x: x * tf.random.uniform(tf.shape(x)))
)

In [21]:
# Brute-force layer over the enlarged candidate set. No query model is passed
# here, so later cells call it with user embeddings (the output of
# model.user_model) instead of raw user ids.
brute_force_lots = tfrs.layers.factorized_top_k.BruteForce()
brute_force_lots.index_from_dataset(
    tf.data.Dataset.zip((lots_of_movies, lots_of_movies_embeddings))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f7c7e6f0898>

In [22]:
# Embed user 42 with the user tower, then ask the large index for its top 3 titles.
_, titles = brute_force_lots(model.user_model(np.array(["42"])), k=3)

print(f"Top recommendations: {titles[0]}")

Top recommendations: [b'Homeward Bound: The Incredible Journey (1993)'
 b"Kid in King Arthur's Court, A (1995)" b'Rudy (1993)']


In [23]:
# Time the same top-3 query (user embedding included) against the enlarged brute-force index.
%timeit _, titles = brute_force_lots(model.user_model(np.array(["42"])), k=3)

30.7 ms ± 207 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# ScaNN layer over the same enlarged candidate set. Only
# num_reordering_candidates is set explicitly (100); no query model is passed,
# so it is queried with user embeddings.
scann = tfrs.layers.factorized_top_k.ScaNN(num_reordering_candidates=100)
scann.index_from_dataset(
    tf.data.Dataset.zip((lots_of_movies, lots_of_movies_embeddings))
)

<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7f7c6fd49240>

In [25]:
# Top-3 titles for user 42 from the ScaNN index, for comparison with the brute-force result.
_, titles = scann(model.user_model(np.array(["42"])), k=3)

print(f"Top recommendations: {titles[0]}")

Top recommendations: [b'Homeward Bound: The Incredible Journey (1993)'
 b"Kid in King Arthur's Court, A (1995)" b'Rudy (1993)']


In [26]:
# Time the top-3 query (user embedding included) against the ScaNN index.
%timeit _, titles = scann(model.user_model(np.array(["42"])), k=3)

2.18 ms ± 12.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Evaluating the approximation

In [27]:
# Override the existing streaming candidate source so that the top-K metrics
# are computed against the enlarged set of candidate embeddings.
model.task.factorized_metrics = tfrs.metrics.FactorizedTopK(
    candidates=lots_of_movies_embeddings
)
# Need to recompile the model for the changes to take effect.
model.compile()

# %time reports the CPU and wall-clock time of this single evaluation run.
%time baseline_result = model.evaluate(test.batch(8192), return_dict=True, verbose=False)

CPU times: user 20min 57s, sys: 1min 16s, total: 22min 14s
Wall time: 5min 39s


In [29]:
# Replace the metric's candidate source with the ScaNN layer built earlier,
# recompile, and time the same evaluation for comparison with the baseline.
model.task.factorized_metrics = tfrs.metrics.FactorizedTopK(
    candidates=scann
)
model.compile()
%time scann_result = model.evaluate(test.batch(8192), return_dict=True, verbose=False)

CPU times: user 9.42 s, sys: 1.44 s, total: 10.9 s
Wall time: 3.65 s


In [30]:
# Show the top-100 accuracy of both evaluation runs side by side, rounded to two decimals.
print(f"Brute force top-100 accuracy: {baseline_result['factorized_top_k/top_100_categorical_accuracy']:.2f}")
print(f"ScaNN top-100 accuracy:       {scann_result['factorized_top_k/top_100_categorical_accuracy']:.2f}")

Brute force top-100 accuracy: 0.15
ScaNN top-100 accuracy:       0.27


### Deploying the approximate model

In [31]:
# We re-index the ScaNN layer to include the user model in the same layer.
# This way we can give the saved model raw features and get valid predictions
# back. Note that this rebinds the name `scann` to a new layer that uses
# num_reordering_candidates = 1000.
scann = tfrs.layers.factorized_top_k.ScaNN(model.user_model, num_reordering_candidates = 1000)
scann.index_from_dataset(
	tf.data.Dataset.zip((lots_of_movies, lots_of_movies_embeddings))
)

# Need to call it to set the shapes.
_ = scann(np.array(["42"]))

# Save to a temporary directory and load straight back. The directory is
# removed when the `with` block exits, yet `loaded` is used in a later cell, so
# the loaded object is assumed not to need the files on disk any more — TODO confirm.
with tempfile.TemporaryDirectory() as tmp:
	path = os.path.join(tmp, "model")
	tf.saved_model.save(
		scann,
		path,
		# presumably required so ops in the "Scann" namespace can be saved — confirm
		options = tf.saved_model.SaveOptions(namespace_whitelist = ["Scann"])
	)

	loaded = tf.saved_model.load(path)



INFO:tensorflow:Assets written to: /tmp/tmp1zziyly4/model/assets


INFO:tensorflow:Assets written to: /tmp/tmp1zziyly4/model/assets


In [32]:
# Query the reloaded model with a raw user id; only the first three returned titles are printed.
_, titles = loaded(tf.constant(["42"]))

print(f"Top recommendations: {titles[0][:3]}")

Top recommendations: [b'Homeward Bound: The Incredible Journey (1993)'
 b"Kid in King Arthur's Court, A (1995)" b'Rudy (1993)']


### Tuning ScaNN

In [33]:
# Process queries in groups of 1000; processing them all at once with brute force
# may lead to out-of-memory errors, because processing a batch of q queries against
# a size-n dataset takes O(nq) space with brute force.
# Each group of test user ids is embedded with the user model, the top 10
# titles are taken from the brute-force index ([1] selects the titles), and the
# per-group results are stacked along the query axis.
titles_ground_truth = tf.concat([
  brute_force_lots(queries, k=10)[1] for queries in
  test.batch(1000).map(lambda x: model.user_model(x["user_id"]))
], axis=0)

In [34]:
# Get all test user ids as a single 1-D array of strings.
test_flat = np.concatenate(list(test.map(lambda x: x["user_id"]).batch(1000).as_numpy_iterator()), axis=0)

# ScaNN is much more memory efficient and has no problem processing the whole
# batch of 20000 queries at once.
# `scann` is called with raw user ids here, which relies on it being the layer
# that was re-created with model.user_model as its query model.
_, titles = scann(test_flat, k=10)

In [35]:
def compute_recall(ground_truth, approx_results):
	"""Mean per-query recall of approximate results against exact results.

	Args:
		ground_truth: Iterable with one row of exact result ids per query.
		approx_results: Iterable with one row of approximate result ids per
			query, in the same query order as ``ground_truth``.

	Returns:
		The average, over queries, of the fraction of distinct ground-truth
		ids that also occur in the approximate result. ``nan`` if no query
		has any ground-truth id.
	"""
	per_query_recall = []
	for truth, approx in zip(ground_truth, approx_results):
		# np.intersect1d yields *distinct* common ids, so the denominator has
		# to count distinct ground-truth ids too. Otherwise repeated ids in
		# the ground truth cap the recall below 1.0 even for a perfect match.
		distinct_truth = np.unique(truth)
		if distinct_truth.size == 0:
			# Nothing to retrieve for this query: skip it instead of dividing by zero.
			continue
		hits = np.intersect1d(distinct_truth, approx)
		per_query_recall.append(len(hits) / len(distinct_truth))

	if not per_query_recall:
		return float("nan")
	return np.mean(per_query_recall)

In [36]:
# Recall of the ScaNN top-10 titles measured against the brute-force top-10 titles.
print(f"Recall: {compute_recall(titles_ground_truth, titles):.3f}")

Recall: 0.931


In [37]:
# Latency of a single top-10 query; -n 1000 fixes the number of loops per timing run.
%timeit -n 1000 scann(np.array(["42"]), k=10)

2.41 ms ± 5.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [38]:
# Second configuration: same candidates and the same num_reordering_candidates
# as `scann`, but num_leaves and num_leaves_to_search are now set explicitly
# instead of being left at the library defaults.
scann2 = tfrs.layers.factorized_top_k.ScaNN(
    model.user_model, 
    num_leaves=1000,
    num_leaves_to_search=100,
    num_reordering_candidates=1000)
scann2.index_from_dataset(
    tf.data.Dataset.zip((lots_of_movies, lots_of_movies_embeddings))
)

# Query all test users at once and score against the brute-force top-10.
_, titles2 = scann2(test_flat, k=10)

print(f"Recall: {compute_recall(titles_ground_truth, titles2):.3f}")

Recall: 0.966


In [39]:
# Latency of a single top-10 query with the second configuration.
%timeit -n 1000 scann2(np.array(["42"]), k=10)

2.57 ms ± 8.41 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [40]:
# Third configuration: same num_leaves as `scann2`, but fewer leaves are
# searched (70 instead of 100) and fewer candidates are reordered (400 instead
# of 1000).
scann3 = tfrs.layers.factorized_top_k.ScaNN(
    model.user_model,
    num_leaves=1000,
    num_leaves_to_search=70,
    num_reordering_candidates=400)
scann3.index_from_dataset(
    tf.data.Dataset.zip((lots_of_movies, lots_of_movies_embeddings))
)

# Query all test users at once and score against the brute-force top-10.
_, titles3 = scann3(test_flat, k=10)
print(f"Recall: {compute_recall(titles_ground_truth, titles3):.3f}")

Recall: 0.957


In [41]:
# Latency of a single top-10 query with the third configuration.
%timeit -n 1000 scann3(np.array(["42"]), k=10)

2.35 ms ± 11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
