In [1]:
from typing import Dict,Text
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
import duckdb
con = duckdb.connect('database.db')
podcast_ratings_query = con.sql("select author_id as user_id, podcasts.title as podcast_title,rating from reviews "+ 
        "join categories using (podcast_id) " +
        "join podcasts using (podcast_id) where average_rating >= 0").to_df()

podcast_titles_query = con.sql("select podcasts.title as podcast_title from reviews "+ 
        "join categories using (podcast_id) " +
        "join podcasts using (podcast_id) where average_rating >= 0").to_df()

CatalogException: Catalog Error: Table with name reviews does not exist!
Did you mean "pg_views"?

In [None]:
  
podcast_ratings = podcast_ratings_query.to_dict(orient='records')
podcast_titles = podcast_titles_query.to_dict(orient='records')
podcast_ratings_tf = tf.data.Dataset.from_tensor_slices(pd.DataFrame.from_dict(podcast_ratings).to_dict(orient="list"))
podcast_titles_tf = tf.data.Dataset.from_tensor_slices(pd.DataFrame.from_dict(podcast_titles).to_dict(orient="list"))

ratings_tf = podcast_ratings_tf.map(lambda x: {
    "podcast_title": x["podcast_title"],
    "user_id": x["user_id"],
    "user_rating": x["rating"]
})
titles_tf = podcast_titles_tf.map(lambda x: x["podcast_title"])

In [None]:
tf.random.set_seed(42)
shuffled = ratings_tf.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

podcast_titles = titles_tf.batch(1_000)
user_ids = ratings_tf.batch(1_000_000).map(lambda x: x["user_id"])

unique_podcast_titles = np.unique(np.concatenate(list(podcast_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [None]:
class PodcastlensModel(tfrs.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32
    self.user_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_user_ids, mask_token=None),
      # We add an additional embedding to account for unknown tokens.
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    self.podcast_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_podcast_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_podcast_titles) + 1, embedding_dimension)
    ])
    metrics = tfrs.metrics.FactorizedTopK(
      candidates=titles_tf.batch(128).map(self.podcast_model)
    )

    self.task = tfrs.tasks.Retrieval(
      metrics=metrics
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the podcast features and pass them into the podcast model,
    # getting embeddings back.
    positive_podcast_embeddings = self.podcast_model(features["podcast_title"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_podcast_embeddings)

model = PodcastlensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.shuffle(100_000).batch(256).cache()

model.fit(cached_train, epochs=10)

model.evaluate(cached_train, return_dict=True)
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.Streaming(model.user_model)
# recommends podcasts out of the entire podcasts dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((titles_tf.batch(10), titles_tf.batch(10).map(model.podcast_model)))
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow_recommenders.layers.factorized_top_k.Streaming at 0x1a6e45763b0>

In [None]:
num = 0
threshold = 2
for user in unique_user_ids:
    user_id = user.decode()
    _, titles = index(tf.constant([user_id]))
    unique_preds = np.unique(titles.numpy())
    unique_preds = [el.decode('UTF-8') for el in unique_preds]
    if len(unique_preds) > threshold:
        print("Recommendations for user {}: {}".format(user_id,unique_preds))
        print("")
        num += 1
print("Number of users with recommendations over {}: {}".format(threshold,num))

Recommendations for user 0d14d171f46c19b: ['All of the Above radio', 'Rebel Chums', 'Things a Teacher Taught Me']

Recommendations for user 42ded2f5c6d7ac3: ['F*ck Like a Woman', 'I Crush Barbecue Show', 'Rebel Chums', 'Things a Teacher Taught Me']

Recommendations for user 91ce11be82ebaf7: ['Hear It Now (retired)', 'Inglestotal : Cursos y clases gratis de Ingles', 'Nahh B! Podcast MMA / UFC And Boxing Event Preview & Reviews', 'Twin Talk with the King Twins']

Recommendations for user b3dd55b0f6dea86: ['F*ck Like a Woman', 'Nahh B! Podcast MMA / UFC And Boxing Event Preview & Reviews', 'Talking Web Marketing']

Recommendations for user e49452d7f3403b3: ['Nahh B! Podcast MMA / UFC And Boxing Event Preview & Reviews', 'Talking Web Marketing', 'Twin Talk with the King Twins']

Number of users with recommendations over 2: 5
