##### Copyright 2020 The TensorFlow Authors.

In [6]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TensorFlow Recommenders: Quickstart

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/recommenders/quickstart"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/recommenders/blob/main/docs/examples/quickstart.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/recommenders/blob/main/docs/examples/quickstart.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/recommenders/docs/examples/quickstart.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

In this tutorial, we build a simple matrix factorization model using the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/) with TFRS. We can use this model to recommend movies for a given user.

### Import TFRS

First, install and import TFRS:

In [7]:
!pip install -q tensorflow-recommenders --user
!pip install -q --upgrade tensorflow-datasets --user
!pip install -q tqdm --user

In [38]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tempfile
import os

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from google.cloud import storage
from io import BytesIO

from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations

from keras.callbacks import TensorBoard

In [9]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs
#rm -rf ./logs/

In [10]:
# Use tmqdm for cell execution progress bar:
# The general syntax is just like the sample bel. In your loop, just wrap the iterable inside the tqdm()

# Loop with a progres bar
for i in tqdm(range(100)):
    time.sleep(0.01)

  0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
# nested loops with progress bar
outer_level = list(range(2))
inner_level = list(range(100))
for _ in tqdm(outer_level, desc='Outer Level'):
    for number in tqdm(inner_level, desc='Inner Level'):
        time.sleep(0.01)

Outer Level:   0%|          | 0/2 [00:00<?, ?it/s]

Inner Level:   0%|          | 0/100 [00:00<?, ?it/s]

Inner Level:   0%|          | 0/100 [00:00<?, ?it/s]

### Read the data

In [12]:
#from google.colab import drive
#drive.mount('/content/drive')

In [16]:
# import Black Friday Sales datasets from GCS

# connect to gcs
client = storage.Client()
bucket_name = "black_friday_datasets"

train_sales_data = "train_black_friday.csv"
test_sales_data = "test_black_friday.csv"
concatenated_sales_datasets = "full_black_friday.csv"
bucket = client.get_bucket(bucket_name)

# get blobs from bucket
blob_train_sale = bucket.get_blob(train_sales_data)
blob_test_sale = bucket.get_blob(test_sales_data)
blob_concatenated_sales = bucket.get_blob(concatenated_sales_datasets)

# save as local files in /datasets
filename_train_sale = blob_train_sale.download_to_filename("datasets/" + train_sales_data)
filename_test_sale = blob_test_sale.download_to_filename("datasets/" + test_sales_data)
filename_content_concatenated_sales = blob_concatenated_sales.download_to_filename("datasets/" + concatenated_sales_datasets)

# open one blob to have a look at the data
content_train_sale = blob_train_sale.download_as_bytes()

black_friday_df = pd.read_csv(BytesIO(content_train_sale))
black_friday_df.head()


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [19]:
# mapping function
def sales_map(user_id, product_id):
    return {"user_id": user_id, "product_id": product_id}


# paths to datasets
train_csv_path = "datasets/train_black_friday.csv"
test_csv_path = "datasets/test_black_friday.csv"
full_black_friday_ds_path = "datasets/full_black_friday.csv"

Convert CSV to TensorFlow Datasets and extract features

In [20]:
# #TODO the model could use more features (currently uses only user_id and product_id)

sales_train_ds = tf.data.experimental.CsvDataset(
    filenames=train_csv_path,
    # record_defaults=[tf.constant([1000000], dtype=tf.int32), tf.string],
    record_defaults=[tf.string, tf.string],
    select_cols=[0, 1],
    field_delim=",",
    header=True)

sales_train_ds = sales_train_ds.map(map_func=sales_map)
# sales_train_ds = sales_train_ds.batch(1)

for data in sales_train_ds:
    tf.print("Train DS sample: ", data)  # {'product_id': ["P00069042"], 'user_id': [1000001]}
    break

sales_test_ds = tf.data.experimental.CsvDataset(
    filenames=test_csv_path,
    # record_defaults=[tf.constant([1000000], dtype=tf.int32), tf.string],
    record_defaults=[tf.string, tf.string],
    select_cols=[0, 1],
    field_delim=",",
    header=True)

sales_test_ds = sales_test_ds.map(map_func=sales_map)
# sales_test_ds = sales_test_ds.batch(1)

for data in sales_test_ds:
    tf.print("Test DS sample: ", data)  # {'product_id': ["P00069042"], 'user_id': [1000001]}
    break



Train DS sample:  {'product_id': "P00069042", 'user_id': "1000001"}
Test DS sample:  {'product_id': "P00128942", 'user_id': "1000004"}


ratings DS sample:  {'movie_title': "One Flew Over the Cuckoo\'s Nest (1975)", 'user_id': "138"}
movies DS sample:  "You So Crazy (1994)"

In [21]:
full_dataset = tf.data.experimental.CsvDataset(
    filenames=full_black_friday_ds_path,
    # record_defaults=[tf.constant([1000000], dtype=tf.int32), tf.string],
    record_defaults=[tf.string, tf.string],
    select_cols=[0, 1],
    field_delim=",",
    header=True,
	na_value="")

full_dataset = full_dataset.map(map_func=sales_map)
# sales_test_ds = sales_test_ds.batch(1)

for data in full_dataset:
    tf.print("Test DS sample: ", data)  # {'product_id': ["P00069042"], 'user_id': [1000001]}
    break


Test DS sample:  {'product_id': "P00128942", 'user_id': "1000004"}


Build vocabularies to convert user ids and movie titles into integer indices for embedding layers:

In [22]:
# TODO make sure that the product_id and user_id are unique - what we have now is immensly slowing down training

In [23]:

products = sales_train_ds.map(lambda x: x["product_id"])
products = products.unique()



users = sales_train_ds.map(lambda x: x["user_id"])
#users = users.unique()

for data in products:
    tf.print("products dataset sample row: ", data)  
    break
    
for data in users:
    tf.print("users dataset sample row: ", data)  
    break    
    
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(users)
#for i in tqdm(range(1)):
#    time.sleep(0.01)

product_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
product_ids_vocabulary.adapt(products)

products dataset sample row:  "P00069042"
users dataset sample row:  "1000001"


In [24]:
#create a small dataset to make prototyping faster and easier

#small_dataset = sales_train_ds.shuffle()
#small_dataset = small_dataset.take(50000)

### Define a model

We can define a TFRS model by inheriting from `tfrs.Model` and implementing the `compute_loss` method:

In [25]:
class BlackFridayModel(tfrs.Model):
  # We derive from a custom base class to help reduce boilerplate. Under the hood,
  # these are still plain Keras Models.

  def __init__(
      self,
      user_model: tf.keras.Model,
      product_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    super().__init__()

    # Set up user and product representations.
    self.user_model = user_model
    self.product_model = product_model

    # Set up a retrieval task.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # Define how the loss is computed.

    user_embeddings = self.user_model(features["user_id"])
    product_embeddings = self.product_model(features["product_id"])

    return self.task(user_embeddings, product_embeddings)

Define the two models and the retrieval task.

In [26]:
# Define user and movie models.
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64)
])
product_model = tf.keras.Sequential([
    product_ids_vocabulary,
    tf.keras.layers.Embedding(product_ids_vocabulary.vocabulary_size(), 64)
])

# Define your objectives.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    products.batch(128).map(product_model)
  )
)


### Fit and evaluate it.

Create the model, train it, and generate predictions:



In [30]:
# Create a retrieval model.
model = BlackFridayModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Setup logs and callback for tensorboard
logdir = os.path.join("logs_", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

# Train for 3 epochs.
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")
model.fit(sales_train_ds.batch(4096), epochs=3, callbacks=[tensorboard_callback])

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
    products.batch(100).map(lambda product_id: (product_id, model.product_model(product_id))))

# Get some recommendations.
_, product_ids = index(np.array(["42"]))
print(f"Top 3 recommendations for user 42: {product_ids[0, :3]}")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Top 3 recommendations for user 42: [b'P00067542' b'P00272642' b'P00136242']


In [44]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6007 (pid 3555), started 0:11:01 ago. (Use '!kill 3555' to kill it.)

In [33]:
# wante
model.evaluate(sales_test_ds, return_dict=True)



ValueError: in user code:

    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1366, in test_function  *
        return step_function(self, iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1356, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1349, in run_step  **
        outputs = model.test_step(data)
    File "/home/jupyter/.local/lib/python3.7/site-packages/tensorflow_recommenders/models/base.py", line 88, in test_step
        loss = self.compute_loss(inputs, training=False)
    File "/tmp/ipykernel_1/1728946185.py", line 25, in compute_loss
        return self.task(user_embeddings, product_embeddings)
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer "retrieval" (type Retrieval).
    
    in user code:
    
        File "/home/jupyter/.local/lib/python3.7/site-packages/tensorflow_recommenders/tasks/retrieval.py", line 132, in call  *
            scores = tf.linalg.matmul(
    
        ValueError: Shape must be rank 2 but is rank 1 for '{{node retrieval/MatMul}} = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=true](sequential/embedding/embedding_lookup/Identity_1, sequential_1/embedding_1/embedding_lookup/Identity_1)' with input shapes: [64], [64].
    
    
    Call arguments received:
      • query_embeddings=tf.Tensor(shape=(64,), dtype=float32)
      • candidate_embeddings=tf.Tensor(shape=(64,), dtype=float32)
      • sample_weight=None
      • candidate_sampling_probability=None
      • candidate_ids=None
      • compute_metrics=True


In [39]:
# Export the query model.
with tempfile.TemporaryDirectory() as tmp:
  path = os.path.join(tmp, "model")

  # Save the index.
  tf.saved_model.save(index, path)

  # Load it back; can also be done in TensorFlow Serving.
  loaded = tf.saved_model.load(path)

  # Pass a user id in, get top predicted movie titles back.
  scores, titles = loaded(["42"])

  print(f"Recommendations: {titles[0][:3]}")

2022-01-19 23:35:26.627037: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /tmp/tmp8or5h_a0/model/assets


INFO:tensorflow:Assets written to: /tmp/tmp8or5h_a0/model/assets


Recommendations: [b'P00067542' b'P00272642' b'P00136242']


In [41]:
EXPORT_PATH = "models/model1"
  # Save the index.
tf.saved_model.save(index, EXPORT_PATH)



INFO:tensorflow:Assets written to: models/model1/assets


INFO:tensorflow:Assets written to: models/model1/assets


In [None]:
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.user_model)
scann_index.index_from_dataset(
  tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
)

In [None]:
# Get recommendations.
_, titles = scann_index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")

In [None]:
# Export the query model.
with tempfile.TemporaryDirectory() as tmp:
  path = os.path.join(tmp, "model")

  # Save the index.
  tf.saved_model.save(
      index,
      path,
      options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])
  )

  # Load it back; can also be done in TensorFlow Serving.
  loaded = tf.saved_model.load(path)

  # Pass a user id in, get top predicted movie titles back.
  scores, titles = loaded(["42"])

  print(f"Recommendations: {titles[0][:3]}")