In [56]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

A recommender system is built in 2 stages:
- Retrieval model - selects an initial set of candidates from all possible candidates
- Ranking model - ranks the selected candidates and selects the best ones

In [60]:
def show(dataset, n=2):
    """Preview the first ``n`` elements of ``dataset`` as a pandas DataFrame.

    ``dataset`` must provide ``take(n)`` whose result provides
    ``as_numpy_iterator()`` (as the datasets in this notebook do).
    """
    head = dataset.take(n)
    records = head.as_numpy_iterator()
    return pd.DataFrame(records)

### Retrieval

In [61]:
# MovieLens 100k: the ratings records and the movie catalogue ("train" split of each)
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

In [62]:
# user_rating -> explicit feedback
# the first stage of the recommender (retrieval model) will use implicit feedback
# (converted from the explicit user ratings):
# every movie a user has rated (watched) -> positive example
# every movie a user has not rated -> negative example
show(ratings)

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,raw_user_age,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,45.0,[7],b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",46.0,879024327,True,b'138',4,b'doctor',4.0,b'53211'
1,25.0,"[4, 14]",b'709',b'Strictly Ballroom (1992)',32.0,875654590,True,b'92',5,b'entertainment',2.0,b'80525'


In [63]:
# movie_genres -> list of genre ids the movie belongs to
show(movies)

Unnamed: 0,movie_genres,movie_id,movie_title
0,[4],b'1681',b'You So Crazy (1994)'
1,"[4, 7]",b'1457',b'Love Is All There Is (1996)'


In [64]:
# Keep only the (user, movie) interaction and drop the explicit rating:
# the retrieval stage is trained on implicit feedback.
def _interaction_only(rating):
    """Reduce a ratings record to the movie title and the user id."""
    return {'movie_title': rating['movie_title'], 'user_id': rating['user_id']}


seen_by_users = ratings.map(_interaction_only)

# Candidate corpus: just the title of every movie.
movie_titles = movies.map(lambda movie: movie['movie_title'])

In [65]:
# 80/20 train/test split of the interactions.
n_samples = 100_000
train_split = 0.8
n_train_samples = round(train_split * n_samples)
# the remainder goes to the test set (this must be a subtraction: a chained
# assignment here would silently overwrite n_samples as well)
n_test_samples = n_samples - n_train_samples

tf.random.set_seed(0)
# A fixed seed and reshuffle_each_iteration=False keep the order identical on
# every pass, so the take/skip splits below never overlap.
seen_by_users = seen_by_users.shuffle(
    n_samples, seed=0, reshuffle_each_iteration=False
)

train = seen_by_users.take(n_train_samples)
test = seen_by_users.skip(n_train_samples).take(n_test_samples)

In [66]:
# Vocabularies for the lookup layers: every distinct movie title and user id.
unique_movie_titles = np.unique(
    [title for title in movie_titles.as_numpy_iterator()]
)
unique_user_ids = np.unique(
    [
        user_id
        for user_id in seen_by_users.map(lambda x: x['user_id']).as_numpy_iterator()
    ]
)

In [67]:
# string lookup example
vocab = ['a', 'b', 'c']
# the vocabulary entries map to the indices 1..3 (see the output below);
# index 0 is presumably the slot for out-of-vocabulary tokens
test_data = np.array([['a', 'b', 'b'], ['a', 'b', 'c']])
layer = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=vocab)
layer(test_data).numpy()

array([[1, 2, 2],
       [1, 2, 3]])

In [68]:
# embedding layer example
# embedding dim=2: 2x3 input -> 2x3x2 output
# (every integer index is replaced by its 2-dimensional embedding vector,
# so equal indices yield equal vectors)
layer_2 = tf.keras.layers.Embedding(len(vocab), 2, input_length=3)
i = np.array([[0, 1, 2], [0, 0, 2]])
layer_2(i)

<tf.Tensor: shape=(2, 3, 2), dtype=float32, numpy=
array([[[-0.02080249, -0.02934336],
        [ 0.00353907,  0.00612575],
        [-0.00833255,  0.03078279]],

       [[-0.02080249, -0.02934336],
        [-0.02080249, -0.02934336],
        [-0.00833255,  0.03078279]]], dtype=float32)>

In [32]:
# embedding model factory (used for both the query/user and the candidate/movie model)
# https://stats.stackexchange.com/questions/270546/how-does-keras-embedding-layer-work
def embedding_model_factory(n_embedding_dimensions=32, *, vocabulary):
    """Build a model that maps raw string ids to dense embedding vectors.

    Args:
        n_embedding_dimensions: size of each embedding vector.
        vocabulary: sequence of the known string ids. Keyword-only, because a
            parameter without a default may not follow one with a default.

    Returns:
        A ``tf.keras.Sequential`` made of a string lookup layer followed by an
        embedding layer.
    """
    layers = [
        # Maps strings from a vocabulary to a range of integer indices
        tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=vocabulary, mask_token=None
        ),
        # + 1 to add one additional embedding entry for unknown tokens
        tf.keras.layers.Embedding(
            len(vocabulary) + 1, n_embedding_dimensions
        )
    ]

    return tf.keras.Sequential(layers)

In [33]:
# candidate model: maps a movie title to its 32-dimensional embedding
movie_model = embedding_model_factory(
    n_embedding_dimensions=32,
    vocabulary=unique_movie_titles
)
# embedding for the movie "'Til There Was You (1997)"
movie_model(np.array([b"'Til There Was You (1997)"]))

<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[-2.4332905e-02,  1.5845884e-02,  2.8408874e-02,  4.8857283e-02,
        -2.8385568e-02, -4.8242688e-02, -8.9645386e-05, -4.0224455e-02,
         4.6875920e-02,  3.5669688e-02,  3.5790887e-02, -2.7830116e-03,
         2.4314746e-03,  1.8764388e-02,  8.4960945e-03,  2.2447530e-02,
        -4.4737469e-02, -2.6634825e-02,  4.8829857e-02, -3.5618961e-02,
         3.8881149e-02,  2.6854027e-02, -2.4254238e-02, -4.8504997e-02,
         2.9529762e-02, -6.4371899e-04,  2.0598780e-02, -2.7926341e-03,
         2.4232868e-02, -1.7427541e-02, -6.1231032e-03, -2.7959382e-02]],
      dtype=float32)>

In [34]:
# query model: maps a user id to its 32-dimensional embedding
user_model = embedding_model_factory(
    n_embedding_dimensions=32,
    vocabulary=unique_user_ids
)
# embedding for the user "10"
user_model(np.array([b'10']))

<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[-0.036576  , -0.03089347, -0.04926305, -0.00108545, -0.02889257,
        -0.02835456, -0.02898588,  0.03876593,  0.01880166,  0.01347914,
         0.03804502, -0.04827794, -0.03280818,  0.00111272, -0.04606495,
        -0.03216724,  0.04409171,  0.01784435,  0.04915215, -0.0402068 ,
        -0.04015896,  0.02858837, -0.01993331,  0.02911016, -0.01273555,
        -0.02697011, -0.0156634 ,  0.00053151,  0.0050525 ,  0.04768539,
         0.02215948,  0.00570469]], dtype=float32)>

In [35]:
# affinity score := dot product of user and movie embedding vectors
# for an accurate model, the score for positive pairs (pairs of movies/users in the dataset)
# should be higher than for any other user/movie pairs

metrics = tfrs.metrics.FactorizedTopK(
  # candidates is an n x 32 embedding matrix containing all the movies;
  # presumably the non-matching movies act as the implicit negatives
  candidates=movie_titles.batch(128).map(movie_model)
)

# set up the task for the retrieval model with an appropriate loss
# (default is categorical cross-entropy)
task = tfrs.tasks.Retrieval(metrics=metrics)

# sanity check on a single (user, movie) pair; ids are passed as byte strings,
# like the other lookups in this notebook (the text 'b102' would not be user 102)
u = user_model(np.array([b'102']))
v = movie_model(np.array([b'1-900 (1994)']))
# task is a layer that takes query and candidate vectors and returns the loss
task(u, v)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [36]:
class FullModel(tfrs.Model):
    """Retrieval model combining a user model, a movie model and a task."""

    def __init__(self, user_model, movie_model, task):
        """Store the query model, the candidate model and the retrieval task."""
        super().__init__()
        self.user_model = user_model
        self.movie_model = movie_model
        self.task = task

    def compute_loss(self, features, training=False):
        """Return the task's result for one batch of interactions.

        ``features`` is indexed with the keys ``'user_id'`` and
        ``'movie_title'``; ``training`` is accepted but not used.
        """
        # query side: embeddings of the users in the batch
        query_vectors = self.user_model(features['user_id'])
        # candidate side: embeddings of the movies those users have seen
        # (the positive signal)
        candidate_vectors = self.movie_model(features['movie_title'])
        # the task turns the two sets of vectors into the loss (and metrics)
        loss = self.task(query_vectors, candidate_vectors)
        return loss

In [37]:
# assemble the retrieval model and train it with Adagrad
model = FullModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# shuffle, batch and cache the data; cache() comes last in both pipelines,
# so the prepared batches are computed once and then reused
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [38]:
# train on the cached training batches for 3 epochs
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f7e6442fe10>

In [39]:
# top-k accuracies and loss on the held-out test interactions, as a dict
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0013000000035390258,
 'factorized_top_k/top_5_categorical_accuracy': 0.009349999949336052,
 'factorized_top_k/top_10_categorical_accuracy': 0.020649999380111694,
 'factorized_top_k/top_50_categorical_accuracy': 0.12229999899864197,
 'factorized_top_k/top_100_categorical_accuracy': 0.2326499968767166,
 'loss': 28265.875,
 'regularization_loss': 0,
 'total_loss': 28265.875}

In [74]:
# making predictions

# create a brute-force index that takes in raw query features
# (user ids, embedded with the trained user model) and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset;
# the dataset pairs every batch of titles with the embeddings of that batch
dataset = tf.data.Dataset.zip(
      (movie_titles.batch(100), movie_titles.batch(100).map(model.movie_model))
)
index.index_from_dataset(dataset)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f7e5c6d4d50>

In [75]:
# get recommendations for the user "42"
# (the first return value is discarded; presumably it holds the affinity scores)
_, result = index(tf.constant(["42"]))
print(result)

tf.Tensor(
[[b'Angels in the Outfield (1994)' b'Rudy (1993)'
  b'Homeward Bound: The Incredible Journey (1993)'
  b'Affair to Remember, An (1957)' b'Circle of Friends (1995)'
  b'Client, The (1994)' b'Firm, The (1993)'
  b'Miracle on 34th Street (1994)'
  b'Winnie the Pooh and the Blustery Day (1968)' b'Michael (1996)']], shape=(1, 10), dtype=string)


In [76]:
# using ScaNN index
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.user_model)
scann_index.index_from_dataset(dataset)

# query the ScaNN index itself (not the brute-force `index` built earlier)
_, scann_result = scann_index(tf.constant(["101"]))
print(scann_result)

# to persist the model, just save the index object

tf.Tensor(
[[b'Juror, The (1996)' b'Down Periscope (1996)' b'Phantom, The (1996)'
  b'Eddie (1996)' b'Nutty Professor, The (1996)' b'Happy Gilmore (1996)'
  b'Bulletproof (1996)' b'Father of the Bride Part II (1995)'
  b'Craft, The (1996)' b'Space Jam (1996)']], shape=(1, 10), dtype=string)
