##### Copyright 2020 The TensorFlow Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TensorFlow Recommenders: Quickstart

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/recommenders/quickstart"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/recommenders/blob/main/docs/examples/quickstart.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/recommenders/blob/main/docs/examples/quickstart.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/recommenders/docs/examples/quickstart.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

In this tutorial, we build a simple matrix factorization model with TFRS. It adapts the original [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/) quickstart to the Black Friday sales dataset. We can use this model to recommend products for a given user.

### Import TFRS

First, install and import TFRS:

In [1]:
!pip install -q tensorflow-recommenders --user
!pip install -q --upgrade tensorflow-datasets --user
!pip install -q tqdm --user

In [2]:
import os
import time    # to be used in loop iterations
from io import BytesIO
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
from google.cloud import storage
from tqdm.notebook import tqdm, trange

In [None]:
# for tqdm on jupyter notebook (classic)

# !pip install ipywidgets
# jupyter nbextension enable --py widgetsnbextension

In [None]:
# for tqdm on jupyter lab
# One-time environment setup. It is a shell command, so it needs the leading "!"
# when run from a cell; it is left commented out so that "Run All" neither fails
# with a SyntaxError nor reinstalls the extension on every run.

# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [None]:
# Using tqdm to show a progress bar while a cell executes:
# wrap the iterable of your loop in tqdm() and iterate as usual.
# trange(n) is tqdm's shorthand for tqdm(range(n)).

# Loop with a progress bar
for i in trange(100):
    time.sleep(0.01)

In [None]:
# Nested loops: each level gets its own labelled progress bar.
outer_level = [*range(2)]
inner_level = [*range(100)]
for _ in tqdm(outer_level, desc='Outer Level'):
    # The inner bar is re-created (and restarts) on every outer iteration.
    for number in tqdm(inner_level, desc='Inner Level'):
        time.sleep(0.01)

### Read the data

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [18]:
# Import the Black Friday Sales datasets from GCS and keep local CSV copies.

# Connect to GCS.
client = storage.Client()
bucket_name = "black_friday_datasets"

train_sales_data = "train_black_friday.csv"
test_sales_data = "test_black_friday.csv"
concatenated_sales_datasets = "full_black_friday.csv"
bucket = client.get_bucket(bucket_name)

# Directory the CSVs are saved to. Create it up front so the downloads below
# do not fail on a fresh checkout where it does not exist yet.
local_dataset_dir = "ml-demo2/datasets/"
os.makedirs(local_dataset_dir, exist_ok=True)


def _require_blob(blob_name):
    """Return the named blob from `bucket`, failing loudly if it is missing.

    `Bucket.get_blob` returns None for a missing object, which would otherwise
    surface later as an opaque AttributeError on the download call.
    """
    blob = bucket.get_blob(blob_name)
    if blob is None:
        raise FileNotFoundError(
            f"gs://{bucket_name}/{blob_name} does not exist or is not accessible")
    return blob


# Get blobs from the bucket.
blob_train_sale = _require_blob(train_sales_data)
blob_test_sale = _require_blob(test_sales_data)
blob_concatenated_sales = _require_blob(concatenated_sales_datasets)

# Save as local files. `download_to_filename` returns None, so the filename_*
# variables are set to the destination paths themselves.
filename_train_sale = local_dataset_dir + train_sales_data
filename_test_sale = local_dataset_dir + test_sales_data
filename_content_concatenated_sales = local_dataset_dir + concatenated_sales_datasets

blob_train_sale.download_to_filename(filename_train_sale)
blob_test_sale.download_to_filename(filename_test_sale)
blob_concatenated_sales.download_to_filename(filename_content_concatenated_sales)

# Have a look at the training data. Read the bytes back from the local copy
# instead of downloading the same blob from GCS a second time.
with open(filename_train_sale, "rb") as train_file:
    content_train_sale = train_file.read()

black_friday_df = pd.read_csv(BytesIO(content_train_sale))
black_friday_df.head()


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [6]:
# Mapping function applied to every CSV record.
def sales_map(user_id, product_id):
    """Pack one (user_id, product_id) record into a feature dictionary.

    Args:
        user_id: Value of the record's user column.
        product_id: Value of the record's product column.

    Returns:
        A dict with the keys "user_id" and "product_id", in that order.
    """
    features = dict(user_id=user_id, product_id=product_id)
    return features


# Local CSV locations of the three Black Friday files (train, test, and the
# concatenation of both), all living in the same datasets directory.
train_csv_path, test_csv_path, full_black_friday_ds_path = (
    "ml-demo2/datasets/{}_black_friday.csv".format(split)
    for split in ("train", "test", "full")
)

Convert CSV to TensorFlow Datasets and extract features

In [None]:
# TODO: the model could use more features (currently uses only user_id and product_id)


def _load_sales_dataset(csv_path):
    """Build a dataset of {"user_id", "product_id"} feature dicts from one CSV.

    Both columns are read as strings.

    NOTE(review): select_cols=[0, 1] assumes the user and product ids are the
    first two columns of the file. The pandas preview earlier in the notebook
    shows a leading unnamed column, so confirm this against the CSV header.

    Args:
        csv_path: Path of a comma-separated file with a header row.

    Returns:
        A `tf.data.Dataset` yielding one `sales_map` dict per CSV row.
    """
    records = tf.data.experimental.CsvDataset(
        filenames=csv_path,
        record_defaults=[tf.string, tf.string],
        select_cols=[0, 1],
        field_delim=",",
        header=True)
    return records.map(map_func=sales_map)


sales_train_ds = _load_sales_dataset(train_csv_path)

# Print a single record as a sanity check.
for data in sales_train_ds.take(1):
    tf.print("Train DS sample: ", data)

sales_test_ds = _load_sales_dataset(test_csv_path)

for data in sales_test_ds.take(1):
    tf.print("Test DS sample: ", data)



2022-01-18 02:28:00.784626: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-18 02:28:01.463569: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-18 02:28:01.464519: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-18 02:28:01.534198: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Train DS sample:  {'product_id': "P00128942", 'user_id': "1000004"}
Test DS sample:  {'product_id': "P00069042", 'user_id': "1000001"}


In [19]:
# Train and test purchases combined; used below to build the vocabularies.
full_dataset = tf.data.experimental.CsvDataset(
    filenames=full_black_friday_ds_path,
    # Both selected columns are read as strings.
    record_defaults=[tf.string, tf.string],
    select_cols=[0, 1],
    field_delim=",",
    header=True,
    na_value="")

full_dataset = full_dataset.map(map_func=sales_map)

# Print a single record as a sanity check. The label names the full dataset;
# it previously said "Test DS sample", which made the output misleading.
for data in full_dataset.take(1):
    tf.print("Full DS sample: ", data)


Test DS sample:  {'product_id': "P00128942", 'user_id': "1000004"}


Build vocabularies to convert user ids and product ids into integer indices for embedding layers:

In [None]:
# TODO make sure that the product_id and user_id are unique - what we have now is immensely slowing down training

In [10]:

# Candidate set for retrieval: every distinct product id, exactly once.
# The purchase log has one row per purchase, so without `.unique()` each
# product is repeated once per sale. That inflates the candidate set scored by
# the top-K metric and the brute-force index, and lets one product fill
# several top-K slots.
products = full_dataset.map(lambda x: x["product_id"]).unique()

# Distinct user ids, used only to build the user vocabulary.
unique_user_ids = full_dataset.map(lambda x: x["user_id"]).unique()

# Print a single record as a sanity check.
for data in full_dataset.take(1):
    tf.print("full DS sample: ", data)

# String id -> integer index lookups feeding the embedding layers.
# Adapting on the deduplicated ids yields the same set of tokens while
# scanning far fewer elements.
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(unique_user_ids)

product_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
product_ids_vocabulary.adapt(products)

full DS sample:  {'product_id': "P00128942", 'user_id': "1000004"}


In [None]:
# Create a small dataset to make prototyping faster and easier.

# `Dataset.shuffle` requires a buffer size; calling it without one raises a
# TypeError. The buffer bounds how many records are held in memory while
# shuffling. A fixed seed with reshuffle_each_iteration=False keeps the
# 50,000-record subset identical across iterations and notebook re-runs.
small_dataset = sales_train_ds.shuffle(
    buffer_size=100_000, seed=42, reshuffle_each_iteration=False)
small_dataset = small_dataset.take(50000)

### Define a model

We can define a TFRS model by inheriting from `tfrs.Model` and implementing the `compute_loss` method:

In [11]:
class BlackFridayModel(tfrs.Model):
  """Retrieval model that pairs a user tower with a product tower.

  `tfrs.Model` is a convenience base class that cuts down on boilerplate;
  under the hood instances are still plain Keras models.
  """

  def __init__(
      self,
      user_model: tf.keras.Model,
      product_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    """Stores the two towers and the task used to score them.

    Args:
      user_model: Model applied to the "user_id" feature.
      product_model: Model applied to the "product_id" feature.
      task: Retrieval task called with both embeddings to produce the loss.
    """
    super().__init__()

    # User and product representations.
    self.user_model = user_model
    self.product_model = product_model

    # Retrieval task (loss and metrics).
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embeds the user and product ids of `features` and returns the task loss.

    Args:
      features: Mapping holding the "user_id" and "product_id" entries.
      training: Accepted for interface compatibility; not used by this method.

    Returns:
      The value returned by the retrieval task for the two embeddings.
    """
    user_ids = features["user_id"]
    product_ids = features["product_id"]

    query_vectors = self.user_model(user_ids)
    candidate_vectors = self.product_model(product_ids)

    return self.task(query_vectors, candidate_vectors)

Define the two models and the retrieval task.

In [13]:
# Define the user and product towers: each maps a string id to its vocabulary
# index and then to a 64-dimensional embedding.
user_embedding = tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64)
user_model = tf.keras.Sequential([user_ids_vocabulary, user_embedding])

product_embedding = tf.keras.layers.Embedding(product_ids_vocabulary.vocabulary_size(), 64)
product_model = tf.keras.Sequential([product_ids_vocabulary, product_embedding])

# Define the objective: a retrieval task whose top-K metric ranks the true
# product against the embeddings of all candidate products.
candidate_embeddings = products.batch(128).map(product_model)
top_k_metric = tfrs.metrics.FactorizedTopK(candidate_embeddings)
task = tfrs.tasks.Retrieval(metrics=top_k_metric)


### Fit and evaluate it.

Create the model, train it, and generate predictions:



In [None]:
# Create a retrieval model.
model = BlackFridayModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Train for 25 epochs.
model.fit(full_dataset.batch(1024), epochs=25)

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
    products.batch(100).map(lambda product_id: (product_id, model.product_model(product_id))))

# Get some recommendations. The query has to be a user id that exists in the
# data: an unknown id (such as the "42" left over from the MovieLens example)
# is mapped to the vocabulary's out-of-vocabulary slot and yields meaningless
# results. "1000001" appears in the dataset previews above.
query_user_id = "1000001"
_, product_ids = index(np.array([query_user_id]))
print(f"Top 3 recommendations for user {query_user_id}: {product_ids[0, :3]}")

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25

In [None]:
# 4. Save model
# Export directory, relative to the notebook's working directory
# (fixes the misspelled "modesl" directory name).
EXPORT_PATH = "models/model_1"
# NOTE(review): for serving recommendations it may be preferable to export the
# retrieval `index` rather than the training model - confirm against the
# TFRS documentation.
model.save(EXPORT_PATH)