In [10]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann

In [11]:
import pandas as pd
import requests
import os
import gzip
import json
import numpy as np

import pprint
import tempfile

from typing import Dict, Text

import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [12]:
file_name_type_mapping = {'goodreads_books_fantasy_paranormal.json.gz':'byGenre', 'goodreads_interactions_fantasy_paranormal.json.gz':'byGenre'
, 'goodreads_reviews_fantasy_paranormal.json.gz':'byGenre', 'book_id_map.csv':'complete', 'user_id_map.csv':'complete', 'goodreads_book_authors.json.gz':'complete'}
file_name_url_mapping = {}

for fname in file_name_type_mapping:
    ftype = file_name_type_mapping[fname]
    if ftype == "complete":
        # url = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/'+fname
        url = '/content/drive/MyDrive/KaggleX/'+fname
        file_name_url_mapping[fname] = url
    elif ftype == "byGenre":
        # url = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/byGenre/'+fname
        url = '/content/drive/MyDrive/KaggleX/'+fname
        file_name_url_mapping[fname] = url

In [13]:
# url = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/byGenre/goodreads_books_fantasy_paranormal.json.gz'
# OUT_DIR = './genre'

# if not os.path.exists(OUT_DIR):
#     os.makedirs(OUT_DIR)

# local_filename = os.path.join(OUT_DIR, 'goodreads_books_fantasy_paranormal.json.gz')

# with requests.get(url, stream=True) as r:
#   r.raise_for_status()
#   with open(local_filename, 'wb') as f:
#     for chunk in r.iter_content(chunk_size=8192):
#       f.write(chunk)
def download_by_name(fname, local_filename):
    if fname in file_name_url_mapping:
        url = file_name_url_mapping[fname]
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print('Dataset', fname, 'has been downloaded!')
    else:
        print('Dataset', fname, 'can not be found!')

In [14]:
def load_data(file_name, head = 500):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)

            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return data

In [15]:
GENRE_OUT_DIR = './goodreads/genre/'
COMPLETE_OUT_DIR = './goodreads/complete/'
if not os.path.exists(GENRE_OUT_DIR):
    os.makedirs(GENRE_OUT_DIR)

if not os.path.exists(COMPLETE_OUT_DIR):
    os.makedirs(COMPLETE_OUT_DIR)

# for
# output_path = os.path.join(GENRE_OUT_DIR, 'goodreads_books_poetry.json.gz')
# download_by_name('goodreads_books_poetry.json.gz', output_path)

In [16]:
def download_file_list():
  for fname in file_name_type_mapping:
    ftype = file_name_type_mapping[fname]
    if ftype == "complete":
        output_path = os.path.join(COMPLETE_OUT_DIR, fname)
        download_by_name(fname, output_path)
    elif ftype == "byGenre":
        output_path = os.path.join(GENRE_OUT_DIR, fname)
        download_by_name(fname, output_path)

In [17]:
# download_file_list()

# New Section

In [18]:
DIR = './'

books = load_data(os.path.join('/content/drive/MyDrive/KaggleX/', 'goodreads_books_fantasy_paranormal.json.gz'))
interactions = load_data(os.path.join('/content/drive/MyDrive/KaggleX/', 'goodreads_interactions_fantasy_paranormal.json.gz'))


In [19]:
# print(' == sample record (books) ==')
display(np.random.choice(interactions))

{'user_id': 'f8a89075dc6de14857561522e729f82c',
 'book_id': '19346451',
 'review_id': 'e39fba6af04079b2335b1128d6e219aa',
 'is_read': False,
 'rating': 0,
 'review_text_incomplete': '',
 'date_added': 'Fri May 09 18:14:08 -0700 2014',
 'date_updated': 'Mon Mar 09 15:39:41 -0700 2015',
 'read_at': '',
 'started_at': ''}

In [20]:
# print(books[0])
list(filter(lambda x: x['book_id'] == '24793295', books))

[{'isbn': '',
  'text_reviews_count': '2',
  'series': ['269219'],
  'country_code': 'US',
  'language_code': 'eng',
  'popular_shelves': [{'count': '1115', 'name': 'to-read'},
   {'count': '198', 'name': 'fantasy'},
   {'count': '59', 'name': 'mystery'},
   {'count': '24', 'name': 'm-m'},
   {'count': '20', 'name': 'lgbt'},
   {'count': '20', 'name': 'fiction'},
   {'count': '19', 'name': 'series'},
   {'count': '14', 'name': 'currently-reading'},
   {'count': '14', 'name': 'queer'},
   {'count': '14', 'name': 'romance'},
   {'count': '12', 'name': 'lgbtq'},
   {'count': '12', 'name': 'gay'},
   {'count': '11', 'name': 'historical'},
   {'count': '10', 'name': 'favorites'},
   {'count': '10', 'name': 'ebook'},
   {'count': '9', 'name': 'glbt'},
   {'count': '9', 'name': 'to-buy'},
   {'count': '8', 'name': 'owned'},
   {'count': '8', 'name': 'mm'},
   {'count': '7', 'name': 'science-fiction'},
   {'count': '6', 'name': 'maybe'},
   {'count': '6', 'name': 'crime'},
   {'count': '5', 'n

In [21]:
# csv_file = tf.keras.utils.get_file('goodreads_book_authors.json.gz', './goodreads/complete/goodreads_book_authors.json.gz')

In [22]:
ratings = map(lambda x: {"book_id":x["book_id"], "user_id":x["user_id"]}, filter(lambda x: x['is_read'], interactions))
# 199
book_ids = map(lambda x: {"book_id": x["book_id"]}, books)
# 501
# print(len(list(book_ids)))
ratings_df = pd.read_json(json.dumps(list(ratings)))
books_df = pd.read_json(json.dumps(list(book_ids)))
ratings_tf = tf.data.Dataset.from_tensor_slices((dict(ratings_df)))
books_tf = tf.data.Dataset.from_tensor_slices((dict(books_df)))




In [23]:
ratings_df[ratings_df['user_id'] == '8842281e1d1347389f2ab93d60773d4d']

Unnamed: 0,book_id,user_id
0,18245960,8842281e1d1347389f2ab93d60773d4d
1,29058155,8842281e1d1347389f2ab93d60773d4d
2,186074,8842281e1d1347389f2ab93d60773d4d
3,15839976,8842281e1d1347389f2ab93d60773d4d
4,5577844,8842281e1d1347389f2ab93d60773d4d
5,17315048,8842281e1d1347389f2ab93d60773d4d
6,13453029,8842281e1d1347389f2ab93d60773d4d
7,13239822,8842281e1d1347389f2ab93d60773d4d
8,10673579,8842281e1d1347389f2ab93d60773d4d
9,6987584,8842281e1d1347389f2ab93d60773d4d


In [24]:
all_book_ids = books_tf.batch(1_000).map(lambda x: x["book_id"])

user_ids = ratings_tf.batch(1_000_000).map(lambda x: x["user_id"])

unique_book_ids = np.unique(np.concatenate(list(all_book_ids)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))
all_book_ids
unique_book_ids[:10]
# unique_user_ids

array([ 13892,  38560,  38562,  38564,  38568,  39307,  64737,  89583,
        89588, 175248])

In [25]:
tf.random.set_seed(42)
# shuffled = ratings_tf.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# train = shuffled.take(80_000)
# test = shuffled.skip(80_000).take(20_000)
shuffled = ratings_tf.shuffle(199, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(160)
test = shuffled.skip(160).take(39)

In [26]:
book_map = books_tf.map(lambda x: x["book_id"])
book_map

<_MapDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [27]:
embedding_dimension = 32

In [28]:
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

In [29]:
book_model = tf.keras.Sequential([
  tf.keras.layers.IntegerLookup(
      vocabulary=unique_book_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_book_ids) + 1, embedding_dimension)
])

In [30]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=book_map.batch(128).map(book_model)
)

In [31]:
metrics

<tensorflow_recommenders.metrics.factorized_top_k.FactorizedTopK at 0x7f1065051c00>

In [32]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [33]:
class GoodreadsModel(tfrs.Model):

  def __init__(self, user_model, book_model):
    super().__init__()
    self.book_model: tf.keras.Model = book_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_book_embeddings = self.book_model(features["book_id"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_book_embeddings)

In [34]:
model = GoodreadsModel(user_model, book_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [35]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [36]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f10d3f54c40>

In [37]:
cached_train

<CacheDataset element_spec={'book_id': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'user_id': TensorSpec(shape=(None,), dtype=tf.string, name=None)}>

In [38]:
cached_test

<CacheDataset element_spec={'book_id': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'user_id': TensorSpec(shape=(None,), dtype=tf.string, name=None)}>

In [39]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.0,
 'factorized_top_k/top_10_categorical_accuracy': 0.0,
 'factorized_top_k/top_50_categorical_accuracy': 0.0,
 'factorized_top_k/top_100_categorical_accuracy': 0.12820513546466827,
 'loss': 142.87890625,
 'regularization_loss': 0,
 'total_loss': 142.87890625}

In [40]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((book_map.batch(100), book_map.batch(100).map(model.book_model)))
)

# Get recommendations.
_, titles = index(tf.constant(["8842281e1d1347389f2ab93d60773d4d"]))
print(f"Recommendations for user 8842281e1d1347389f2ab93d60773d4d: {titles[0, :3]}")

Recommendations for user 8842281e1d1347389f2ab93d60773d4d: [15701114  9834950  1076048]


In [41]:
# with tempfile.TemporaryDirectory() as tmp:
#   path = os.path.join(tmp, "model")

# Save the index.
tf.saved_model.save(index, '/content/drive/MyDrive/KaggleX/models/')

# Load it back; can also be done in TensorFlow Serving.
loaded = tf.saved_model.load('/content/drive/MyDrive/KaggleX/models')

