# Tensorflow features 

https://www.tensorflow.org/recommenders/examples/featurization

https://www.tensorflow.org/recommenders/examples/context_features

This caveat aside, real-world models may well benefit from other time-based features such as time of day or day of the week, especially if the data has strong seasonal patterns.

In [1]:
import os
import tempfile
import pprint

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [2]:
ratings = tfds.load("movie_lens/100k-ratings", split="train")
movies = tfds.load("movie_lens/100k-movies", split="train")


ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
    "timestamp": x["timestamp"],
})
movies = movies.map(lambda x: x["movie_title"])

for x in ratings.take(1).as_numpy_iterator():
    pprint.pprint(x)



{'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'timestamp': 879024327,
 'user_id': b'138'}


## User-Item embeddings

User and item embeddings: Initially, these embeddings will take on random values - but during training, we will adjust them so that embeddings of users and the movies they watch end up closer together.

Taking raw categorical features and turning them into embeddings is normally a two-step process:

- Firstly, we need to translate the raw values into a range of contiguous integers, normally by building a mapping (called a "vocabulary") that maps raw values ("Star Wars") to integers (say, 15).
- Secondly, we need to take these integers and turn them into embeddings.


### String lookup

In [3]:
import numpy as np
import tensorflow as tf

movie_title_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()

In [4]:
movie_title_lookup.adapt(ratings.map(lambda x: x["movie_title"]))

In [5]:
movie_title_lookup(["Star Wars (1977)", "One Flew Over the Cuckoo's Nest (1975)"])

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([ 2, 59])>

### Feature hashing

In [6]:
# We set up a large number of bins to reduce the chance of hash collisions.
num_hashing_bins = 200_000

movie_title_hashing = tf.keras.layers.experimental.preprocessing.Hashing(
    num_bins=num_hashing_bins
)

In [7]:
movie_title_hashing(["Star Wars (1977)", "One Flew Over the Cuckoo's Nest (1975)"])


<tf.Tensor: shape=(2,), dtype=int64, numpy=array([101016,  96565])>

### Embeddings

An embedding layer has two dimensions: the first dimension tells us how many distinct categories we can embed; the second tells us how large the vector representing each of them can be.

In [8]:
movie_title_embedding = tf.keras.layers.Embedding(
    # Let's use the hashing approach.
    input_dim=movie_title_lookup.vocab_size(),
    output_dim=32
)
movie_title_model = tf.keras.Sequential([movie_title_lookup, movie_title_embedding])
movie_title_model(["Star Wars (1977)"])

Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 0.00396277,  0.01172978, -0.02384911, -0.04837387, -0.04741423,
         0.01612918,  0.0479348 ,  0.02198057, -0.02069057, -0.02909831,
         0.03733841, -0.03273455, -0.01077179,  0.01432376,  0.02987571,
        -0.01175674, -0.01337911,  0.02950252,  0.03352079, -0.00063802,
         0.04012284, -0.02083405, -0.02146355,  0.03655815,  0.01027417,
         0.01376143,  0.02866649,  0.0131222 , -0.04579788, -0.02678558,
        -0.03854306, -0.0214591 ]], dtype=float32)>

In [9]:
user_id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()

user_id_lookup.adapt(ratings.map(lambda x: x["user_id"]))

user_id_embedding = tf.keras.layers.Embedding(
    # Let's use the hashing approach.
    input_dim=user_id_lookup.vocab_size(),
    output_dim=32
)
user_id_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])

In [10]:
user_id_lookup.get_vocabulary()[:3]

['', '[UNK]', '405']

In [11]:
user_id_model('405')

Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([ 0.03022064, -0.00098227, -0.0162935 ,  0.02301618,  0.01949287,
       -0.04085468, -0.01580148,  0.0021788 , -0.00351913, -0.02449911,
       -0.00816481, -0.0109116 , -0.016825  , -0.0100292 , -0.01753429,
       -0.0463519 ,  0.02666036,  0.00514909,  0.01995898,  0.02405507,
        0.0125909 ,  0.04897085, -0.02830227,  0.04055488, -0.02390121,
       -0.01923902,  0.04656236,  0.04607103, -0.02452377, -0.02794539,
        0.00302502,  0.0306075 ], dtype=float32)>

# Continuous features

In [12]:
for x in ratings.take(3).as_numpy_iterator():
    print(f"Timestamp: {x['timestamp']}.")

Timestamp: 879024327.
Timestamp: 875654590.
Timestamp: 882075110.


In [13]:
timestamp_normalization = tf.keras.layers.experimental.preprocessing.Normalization()
timestamp_normalization.adapt(ratings.map(lambda x: x["timestamp"]).batch(1024))

for x in ratings.take(3).as_numpy_iterator():
    print(f"Normalized timestamp: {timestamp_normalization(x['timestamp'])}.")

Normalized timestamp: [[-0.84293705]].
Normalized timestamp: [[-1.47352]].
Normalized timestamp: [[-0.27203262]].


In [14]:
timestamps = np.concatenate(list(ratings.map(lambda x: x["timestamp"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000)

print(f"Buckets: {timestamp_buckets[:3]}")

Buckets: [8.74724710e+08 8.74743291e+08 8.74761871e+08]


In [15]:
timestamp_embedding_model = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.Discretization(timestamp_buckets.tolist()),
  tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32)
])

for timestamp in ratings.take(1).map(lambda x: x["timestamp"]).batch(1).as_numpy_iterator():
  print(f"Timestamp embedding: {timestamp_embedding_model(timestamp)}.")                                       

Cause: could not parse the source code:

for timestamp in ratings.take(1).map(lambda x: x["timestamp"]).batch(1).as_numpy_iterator():

This error may be avoided by creating the lambda in a standalone statement.



Cause: could not parse the source code:

for timestamp in ratings.take(1).map(lambda x: x["timestamp"]).batch(1).as_numpy_iterator():

This error may be avoided by creating the lambda in a standalone statement.



Cause: could not parse the source code:

for timestamp in ratings.take(1).map(lambda x: x["timestamp"]).batch(1).as_numpy_iterator():

This error may be avoided by creating the lambda in a standalone statement.

Timestamp embedding: [[ 0.04888289  0.00913225 -0.00306313 -0.01845644 -0.00260611  0.03414357
   0.02560953 -0.00913762  0.01979966 -0.04191906  0.04516757 -0.00173339
  -0.03848343  0.04710786  0.04880596 -0.02773777 -0.03101053  0.03004799
   0.02313245  0.04730508 -0.01537329 -0.01770693  0.04392416  0.00200449
   0.01659519 -0.02663034  0.01308459  0.02205885 -0.03799649 -0.01079416
  -0.03766627  0.00561325]].


In [16]:
unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_user_ids = np.unique(np.concatenate(list(ratings.batch(1_000).map(lambda x: x["user_id"]))))

# Text pre-processing

In [17]:
title_text = tf.keras.layers.experimental.preprocessing.TextVectorization()
title_text.adapt(ratings.map(lambda x: x["movie_title"]))

In [18]:
for row in ratings.batch(1).map(lambda x: x["movie_title"]).take(1):
    print(row.numpy())
    print(title_text(row).numpy())

Cause: could not parse the source code:

for row in ratings.batch(1).map(lambda x: x["movie_title"]).take(1):

This error may be avoided by creating the lambda in a standalone statement.



Cause: could not parse the source code:

for row in ratings.batch(1).map(lambda x: x["movie_title"]).take(1):

This error may be avoided by creating the lambda in a standalone statement.



Cause: could not parse the source code:

for row in ratings.batch(1).map(lambda x: x["movie_title"]).take(1):

This error may be avoided by creating the lambda in a standalone statement.

[b"One Flew Over the Cuckoo's Nest (1975)"]
[[ 32 266 162   2 267 265  53]]


In [19]:
title_text.get_vocabulary()[40:45]

['first', '1998', '1977', '1971', 'monty']

# User model

In [20]:
class UserModel(tf.keras.Model):
  
    def __init__(self, use_timestamps):
        super().__init__()

        self._use_timestamps = use_timestamps

        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
        ])

        if use_timestamps:
            self.timestamp_embedding = tf.keras.Sequential([
                  tf.keras.layers.experimental.preprocessing.Discretization(timestamp_buckets.tolist()),
                  tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
              ])
            self.normalized_timestamp = tf.keras.layers.experimental.preprocessing.Normalization()

            self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        if not self._use_timestamps:
            return self.user_embedding(inputs["user_id"])

        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.timestamp_embedding(inputs["timestamp"]),
            self.normalized_timestamp(inputs["timestamp"]),
        ], axis=1)

In [22]:
user_model = UserModel(use_timestamps=True)

In [23]:
user_model.normalized_timestamp.adapt(
    ratings.map(lambda x: x["timestamp"]).batch(128))

In [24]:
for row in ratings.batch(1).take(1):
    print(row['user_id'].numpy()[0])
    print(row['timestamp'].numpy()[0])
    print(f"Computed representations: {user_model(row)[0, :3]}")

b'138'
879024327
Computed representations: [-0.02008412  0.04478449 -0.04165016]


# Movie model

In [25]:
class MovieModel(tf.keras.Model):
  
    def __init__(self):
        super().__init__()

        max_tokens = 10_000

        self.title_embedding = tf.keras.Sequential([
          movie_title_lookup,
          tf.keras.layers.Embedding(movie_title_lookup.vocab_size(), 32)
        ])
        
        self.title_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)

        self.title_text_embedding = tf.keras.Sequential([
          self.title_vectorizer,
          tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
          # We average the embedding of individual words to get one embedding vector
          # per title.
          tf.keras.layers.GlobalAveragePooling1D(),
        ])

        self.title_vectorizer.adapt(movies)


    def call(self, titles):
        return tf.concat([
            self.title_embedding(titles),
            self.title_text_embedding(titles),
        ], axis=1)

In [27]:
movie_model = MovieModel()

movie_model.title_text_embedding.layers[0].adapt(
    ratings.map(lambda x: x["movie_title"])
)

for row in ratings.batch(1).take(1):
    print(movie_model(row['movie_title']))

tf.Tensor(
[[-0.00243484 -0.00526751  0.02876684 -0.00157429  0.01605536  0.04530487
  -0.03256219 -0.0481589   0.02902817 -0.04365135  0.0059179  -0.02370564
  -0.03139018  0.02896781 -0.02369294  0.0463177  -0.04394087 -0.01041824
   0.01856836 -0.01508058 -0.03724738  0.01180527 -0.01620079 -0.0234993
   0.00927129 -0.04977113  0.00794138  0.01821036 -0.04593598 -0.03037766
   0.02262406 -0.00814469 -0.00873667  0.01587838 -0.01149853 -0.00071614
   0.01813574  0.00232467  0.00239427  0.00161277  0.00178416 -0.00589302
  -0.00654895  0.01150121  0.0048835  -0.0150386  -0.00294239 -0.00124559
  -0.01609775 -0.00117175  0.0070271  -0.01696349  0.02014591  0.00943684
  -0.01481183  0.01854273 -0.02757332  0.00213743 -0.00013924  0.00952758
   0.01832676 -0.00432802 -0.0031046  -0.01905141]], shape=(1, 64), dtype=float32)


# Combined model

Note that we also need to make sure that the user model and query model output embeddings of compatible size. Because we'll be varying their sizes by adding more features, the easiest way to accomplish this is to use a dense projection layer after each model:



In [28]:
class MovielensModel(tfrs.models.Model):

    def __init__(self, use_timestamps):
        super().__init__()
        self.query_model = tf.keras.Sequential([
          UserModel(use_timestamps),
          tf.keras.layers.Dense(32)
        ])
        self.candidate_model = tf.keras.Sequential([
          MovieModel(),
          tf.keras.layers.Dense(32)
        ])
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=movies.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
    # We only pass the user id and timestamp features into the query model. This
    # is to ensure that the training inputs would have the same keys as the
    # query inputs. Otherwise the discrepancy in input structure would cause an
    # error when loading the query model after saving it.
        query_embeddings = self.query_model({
            "user_id": features["user_id"],
            "timestamp": features["timestamp"],
        })
        movie_embeddings = self.candidate_model(features["movie_title"])

        return self.task(query_embeddings, movie_embeddings)

# Evaluate


In [29]:
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

## Without timestamp

In [30]:
model = MovielensModel(use_timestamps=False)

model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

model.fit(cached_train, epochs=3)

Epoch 1/3
Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x13ce53cd0>

In [31]:
train_accuracy = model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Top-100 accuracy (train): 0.29.
Top-100 accuracy (test): 0.21.


## With timestamp

In [32]:
model = MovielensModel(use_timestamps=True)

model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

model.fit(cached_train, epochs=3)

train_accuracy = model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Epoch 1/3
Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Epoch 2/3
Epoch 3/3
Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


Top-100 accuracy (train): 0.37.
Top-100 accuracy (test): 0.25.
