In [None]:
!pip install tensorflow-recommenders

In [None]:
!pip install tensorflow-recommenders-addons

In [11]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_recommenders_addons as tfra
import tensorflow_recommenders_addons.dynamic_embedding as de
import tensorflow_datasets as tfds

import functools
from typing import Dict
import dataclasses
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.15.0


### Download datasets

In [15]:
# MovieLens 1M, as published in the TFDS catalog:
# https://www.tensorflow.org/datasets/catalog/movielens

# Interactions dataset
raw_ratings_dataset = tfds.load(name="movielens/1m-ratings", split="train")

# Candidates dataset
raw_movies_dataset = tfds.load(name="movielens/1m-movies", split="train")

Downloading and preparing dataset 5.64 MiB (download: 5.64 MiB, generated: 351.12 KiB, total: 5.99 MiB) to /root/tensorflow_datasets/movielens/1m-movies/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/3883 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/1m-movies/0.1.1.incompleteRINAJS/movielens-train.tfrecord*...:  …

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/1m-movies/0.1.1. Subsequent calls will reuse this data.


In [16]:
# Convert the first 100 rating examples to a DataFrame and show the top rows
# to get a feel for the available features.
df = tfds.as_dataframe(raw_ratings_dataset.take(100))
df.head()

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,35.0,"[0, 7]",b'3107',b'Backdraft (1991)',977432193,True,b'130',18,b'technician/engineer',5.0,b'50021'
1,25.0,[7],b'2114',"b'Outsiders, The (1983)'",965932967,False,b'3829',0,b'academic/educator',4.0,b'22307'
2,18.0,"[4, 15]",b'256',b'Junior (1994)',1012103552,False,b'1265',21,b'writer',1.0,b'49321'
3,18.0,"[0, 10]",b'1389',b'Jaws 3-D (1983)',972004605,True,b'2896',14,b'sales/marketing',5.0,b'60073'
4,18.0,[0],b'3635',"b'Spy Who Loved Me, The (1977)'",961180111,True,b'5264',17,b'college/grad student',4.0,b'15217'


In [17]:
# Same preview for the candidate (movies) dataset.
# NOTE: this rebinds `df`, replacing the ratings preview from the previous cell.
df = tfds.as_dataframe(raw_movies_dataset.take(100))
df.head()

Unnamed: 0,movie_genres,movie_id,movie_title
0,"[5, 7]",b'1729',b'Jackie Brown (1997)'
1,[7],b'1486',"b'Quiet Room, The (1996)'"
2,[4],b'3086',b'March of the Wooden Soldiers (a.k.a. Laurel ...
3,[0],b'2965',"b'Omega Code, The (1999)'"
4,[10],b'2853',"b'Communion (a.k.a. Alice, Sweet Alice/Holy Te..."


In [18]:
# Print one raw rating example to inspect the feature names, dtypes and shapes.
for item in raw_ratings_dataset.take(1):
  print(item)

{'bucketized_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=35.0>, 'movie_genres': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([0, 7])>, 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'3107'>, 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Backdraft (1991)'>, 'timestamp': <tf.Tensor: shape=(), dtype=int64, numpy=977432193>, 'user_gender': <tf.Tensor: shape=(), dtype=bool, numpy=True>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'130'>, 'user_occupation_label': <tf.Tensor: shape=(), dtype=int64, numpy=18>, 'user_occupation_text': <tf.Tensor: shape=(), dtype=string, numpy=b'technician/engineer'>, 'user_rating': <tf.Tensor: shape=(), dtype=float32, numpy=5.0>, 'user_zip_code': <tf.Tensor: shape=(), dtype=string, numpy=b'50021'>}


### Processing datasets

In [22]:
# Titles are truncated to this many whitespace-separated tokens, and padded up
# to this length when batched.
max_token_length = 6
# Filler string used for padded positions when batching.
pad_token = "[PAD]"
# Character class matching ASCII punctuation plus tab/newline; every match is
# deleted from a title before it is tokenized.
# Written as a raw string so the backslashes reach the regex engine verbatim
# instead of relying on Python passing through invalid escape sequences such
# as "\!" (a SyntaxWarning on Python 3.12+). The pattern value is unchanged.
punctuation_regex = r'[\!"#\$%&\(\)\*\+,-\.\/\:;\<\=\>\?@\[\]\\^_`\{\|\}~\t\n]'

def process_text(x: Dict[str, tf.Tensor], max_token_length: int, punctuation_regex: str) -> tf.Tensor:
    """Turn the ``movie_title`` feature of an example into a truncated token vector.

    The title is lowercased, every match of ``punctuation_regex`` is deleted,
    the remainder is split on whitespace, and only the first
    ``max_token_length`` tokens are kept.

    Args:
        x: Feature dictionary of one example. It must contain a string
            ``"movie_title"`` entry; the body indexes ``x`` by that key, so it
            is a mapping of features rather than a bare tensor.
        max_token_length: Maximum number of tokens to keep.
        punctuation_regex: Regular expression whose matches are removed from
            the title before it is tokenized.

    Returns:
        The leading tokens of the cleaned title. For a scalar title this is a
        1-D string tensor with at most ``max_token_length`` entries; shorter
        titles are NOT padded here.
    """
    lowered_title = tf.strings.lower(x["movie_title"])
    cleaned_title = tf.strings.regex_replace(lowered_title, punctuation_regex, "")
    tokens = tf.strings.split(cleaned_title)
    return tokens[:max_token_length]


def process_ratings_dataset(ratings_dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Reduce raw rating examples to a ``user_id`` / tokenized ``movie_title`` pair.

    Args:
        ratings_dataset: Dataset of feature dictionaries containing at least
            the ``"user_id"`` and ``"movie_title"`` entries.

    Returns:
        A dataset of dictionaries with two keys: ``"user_id"`` (passed through
        unchanged) and ``"movie_title"`` (the title tokens produced by
        ``process_text`` using the module-level ``max_token_length`` and
        ``punctuation_regex``).
    """
    partial_process_text = functools.partial(
        process_text, max_token_length=max_token_length, punctuation_regex=punctuation_regex
    )

    # Derive both output features from the SAME element in a single map.
    # Zipping the source with a mapped copy of itself would read the source
    # twice and pair user ids with titles purely by position, which silently
    # mis-pairs them whenever the source order is not deterministic.
    return ratings_dataset.map(
        lambda x: {
            "user_id": x["user_id"],
            "movie_title": partial_process_text(x),
        }
    )


def process_movies_dataset(movies_dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Convert movie examples into tokenized title vectors.

    Args:
        movies_dataset: Dataset of feature dictionaries containing a
            ``"movie_title"`` entry.

    Returns:
        A dataset whose elements are the title tokens produced by
        ``process_text`` using the module-level ``max_token_length`` and
        ``punctuation_regex``.
    """
    partial_process_text = functools.partial(
        process_text, max_token_length=max_token_length, punctuation_regex=punctuation_regex
    )

    # Map over the dataset that was passed in. The previous version ignored
    # the argument and always read the notebook-global `raw_movies_dataset`.
    return movies_dataset.map(lambda x: partial_process_text(x))

# Process the raw ratings and print a few elements to sanity-check the result:
# each element should be a dict with a user id and a vector of title tokens.
processed_ratings_dataset = process_ratings_dataset(raw_ratings_dataset)
for item in processed_ratings_dataset.take(3):
    print(item)

{'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'130'>, 'movie_title': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'backdraft', b'1991'], dtype=object)>}
{'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'3829'>, 'movie_title': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'outsiders', b'the', b'1983'], dtype=object)>}
{'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1265'>, 'movie_title': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'junior', b'1994'], dtype=object)>}


In [23]:
# Batching / shuffling configuration.
batch_size = 4096
seed = 2023

# 90% of the processed ratings go to training; the remainder is held out
# for validation.
total_examples = len(processed_ratings_dataset)
train_size = int(total_examples * 0.9)
validation_size = total_examples - train_size

print(f"Train size: {train_size}")
print(f"Validation size: {validation_size}")

Train size: 900188
Validation size: 100021


In [25]:
@dataclasses.dataclass(frozen=True)
class TrainingDatasets:
    """Immutable pair holding the training and validation rating datasets."""
    # Dataset used for training.
    train_ds: tf.data.Dataset
    # Dataset used for validation.
    validation_ds: tf.data.Dataset

@dataclasses.dataclass(frozen=True)
class RetrievalDatasets:
    """Immutable bundle of the datasets used by the retrieval model."""
    # Train/validation split of the ratings (interactions).
    training_datasets: TrainingDatasets
    # Candidate (movies) dataset.
    candidate_dataset: tf.data.Dataset

def pad_and_batch_ratings_dataset(dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Batch rating examples, padding title tokens to a fixed length.

    Args:
        dataset: Dataset of dictionaries with a scalar ``"user_id"`` and a
            variable-length ``"movie_title"`` token vector.

    Returns:
        The dataset grouped into batches of ``batch_size`` elements, with
        every ``"movie_title"`` padded to ``max_token_length`` entries using
        ``pad_token``.
    """
    # Per-feature target shapes: user ids stay scalar, titles get a fixed width.
    feature_shapes = {
        "user_id": tf.TensorShape([]),
        "movie_title": tf.TensorShape([max_token_length]),
    }
    # Both features are strings, so both use the string pad token as filler.
    feature_padding = {
        "user_id": pad_token,
        "movie_title": pad_token,
    }
    return dataset.padded_batch(
        batch_size,
        padded_shapes=feature_shapes,
        padding_values=feature_padding,
    )

def pad_and_batch_candidate_dataset(movies_dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Batch candidate title-token vectors, padding each to a fixed length.

    Args:
        movies_dataset: Dataset whose elements are variable-length title
            token vectors.

    Returns:
        The dataset grouped into batches of ``batch_size`` elements, with
        every vector padded to ``max_token_length`` entries using
        ``pad_token``.
    """
    token_vector_shape = tf.TensorShape([max_token_length])
    return movies_dataset.padded_batch(
        batch_size,
        padded_shapes=token_vector_shape,
        padding_values=pad_token,
    )


def split_train_validation_datasets(ratings_dataset: tf.data.Dataset) -> TrainingDatasets:
    """Split the ratings into disjoint, padded-and-batched train/validation sets.

    The ratings are shuffled once with a fixed order; the first
    ``validation_size`` elements of that order become the validation set and
    the rest become the training set.

    Args:
        ratings_dataset: Unbatched dataset of processed rating examples.

    Returns:
        A ``TrainingDatasets`` holding the batched training and validation
        datasets.
    """
    # reshuffle_each_iteration=False is essential: take() and skip() below each
    # iterate `shuffled_dataset` independently. With the default (True) every
    # iteration would see a different order, so the two "halves" would overlap
    # and validation examples would leak into training.
    shuffled_dataset = ratings_dataset.shuffle(
        buffer_size=5 * batch_size, seed=seed, reshuffle_each_iteration=False
    )

    validation_ds = shuffled_dataset.take(validation_size).apply(pad_and_batch_ratings_dataset)

    # The training portion is shuffled again (reshuffled on every iteration)
    # so that batches differ between epochs without touching the split itself.
    train_ds = (
        shuffled_dataset.skip(validation_size)
        .shuffle(buffer_size=10 * batch_size)
        .apply(pad_and_batch_ratings_dataset)
    )

    return TrainingDatasets(train_ds=train_ds, validation_ds=validation_ds)


def create_datasets() -> RetrievalDatasets:
    """Load MovieLens 1M and build every dataset the retrieval model needs.

    Returns:
        A ``RetrievalDatasets`` with the train/validation split of the
        processed ratings and the padded, batched candidate (movies) dataset.
    """
    # Interactions: load, then reduce to user id + tokenized title.
    ratings = process_ratings_dataset(
        tfds.load("movielens/1m-ratings", split="train")
    )
    # Candidates: load, then tokenize titles.
    movies = process_movies_dataset(
        tfds.load("movielens/1m-movies", split="train")
    )

    return RetrievalDatasets(
        training_datasets=split_train_validation_datasets(ratings),
        candidate_dataset=pad_and_batch_candidate_dataset(movies),
    )

# Build all datasets and report how many batches each training split contains
# (len() of a batched dataset counts batches, not individual examples).
datasets = create_datasets()
print(f"Train dataset size (after batching): {len(datasets.training_datasets.train_ds)}")
print(f"Validation dataset size (after batching): {len(datasets.training_datasets.validation_ds)}")

Train dataset size (after batching): 220
Validation dataset size (after batching): 25


## Defining user and item towers