In [1]:
from typing import Dict,Text
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
import duckdb
con = duckdb.connect('database.db')

#PREPARE DATA

podcast_ratings_query = con.sql("select author_id as user_id, podcasts.title as podcast_title,rating from reviews "+ 
        "join categories using (podcast_id) " +
        "join podcasts using (podcast_id) where average_rating >= 0").to_df()

podcast_titles_query = con.sql("select podcasts.title as podcast_title from reviews "+ 
        "join categories using (podcast_id) " +
        "join podcasts using (podcast_id) where average_rating >= 0").to_df()

podcast_ratings = podcast_ratings_query.to_dict(orient='records')
podcast_titles = podcast_titles_query.to_dict(orient='records')
podcast_ratings_tf = tf.data.Dataset.from_tensor_slices(pd.DataFrame.from_dict(podcast_ratings).to_dict(orient="list"))
podcast_titles_tf = tf.data.Dataset.from_tensor_slices(pd.DataFrame.from_dict(podcast_titles).to_dict(orient="list"))

ratings_tf = podcast_ratings_tf.map(lambda x: {
    "podcast_title": x["podcast_title"],
    "user_id": x["user_id"],
    "user_rating": x["rating"]
})
titles_tf = podcast_titles_tf.map(lambda x: x["podcast_title"])

tf.random.set_seed(42)
shuffled = ratings_tf.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train_len = int(len(shuffled) * (2/3))
test_len = int(len(shuffled) - train_len)

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)

podcast_titles = titles_tf.batch(1_000)
user_ids = ratings_tf.batch(1_000_000).map(lambda x: x["user_id"])

unique_podcast_titles = np.unique(np.concatenate(list(podcast_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [2]:
#TRAIN MODEL

class PodcastlensModel(tfrs.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32
    self.user_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_user_ids, mask_token=None),
      # We add an additional embedding to account for unknown tokens.
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    self.podcast_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_podcast_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_podcast_titles) + 1, embedding_dimension)
    ])
    metrics = tfrs.metrics.FactorizedTopK(
      candidates=titles_tf.batch(128).map(self.podcast_model)
    )

    self.task = tfrs.tasks.Retrieval(
      metrics=metrics
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the podcast features and pass them into the podcast model,
    # getting embeddings back.
    positive_podcast_embeddings = self.podcast_model(features["podcast_title"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_podcast_embeddings)

model = PodcastlensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.shuffle(50_000).batch(512).cache()
cached_test = test.batch(256).cache()
model.fit(cached_train, epochs=100)

# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.Streaming(model.user_model,k=20)
# recommends podcasts out of the entire podcasts dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((titles_tf.batch(100), titles_tf.batch(100).map(model.podcast_model)))
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow_recommenders.layers.factorized_top_k.Streaming at 0x2364efdd540>

In [3]:
#PULL METRICS FOR MODEL ON TEST DATA 
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.5680933594703674,
 'factorized_top_k/top_5_categorical_accuracy': 0.6303501725196838,
 'factorized_top_k/top_10_categorical_accuracy': 0.6342412233352661,
 'factorized_top_k/top_50_categorical_accuracy': 0.688715934753418,
 'factorized_top_k/top_100_categorical_accuracy': 0.731517493724823,
 'loss': 0.0,
 'regularization_loss': 0,
 'total_loss': 0.0}

In [4]:
#RUN PREDICTIONS ON TEST DATA
num = 0
threshold = 2
for element in test:
    user_id = element["user_id"].numpy().decode()
    _, titles = index(tf.constant([user_id]))
    unique_preds = np.unique(titles.numpy())
    unique_preds = [el.decode('UTF-8') for el in unique_preds]
    if len(unique_preds) > threshold:
        print("Recommendations for user {}: {}".format(user_id,unique_preds))
        print("")
        num += 1
print("Number of users with more than {} recommendation: {}".format(threshold,num))

Recommendations for user bf68b549892a57c: ['Hear It Now (retired)', 'Last Week at the Movies Podcast', 'Little Realms | A DnD Actual Play Podcast']

Recommendations for user 5eac2a0f1457a77: ['Hear It Now (retired)', 'In Real Life with Emily and Kimzilla | WFMU', 'The Oatley Academy ArtCast']

Recommendations for user 809f83c21a4e78d: ['F*ck Like a Woman', 'I Crush Barbecue Show', 'In Real Life with Emily and Kimzilla | WFMU', 'Inglestotal : Cursos y clases gratis de Ingles', 'Sketched Out']

Recommendations for user e49452d7f3403b3: ['Daily Solutions Podcast', 'I Crush Barbecue Show', 'Maximiza Tu Negocio en Redes de Mercadeo', 'Nahh B! Podcast MMA / UFC And Boxing Event Preview & Reviews']

Recommendations for user 3ed5db71aa70323: ['All of the Above radio', 'Daily Solutions Podcast', 'Talking Web Marketing']

Recommendations for user 75858959dbb5b15: ['Inglestotal : Cursos y clases gratis de Ingles', 'NFL no ProFootballcast com Antony Curti e Eduardo Miceli', 'Things a Teacher Taugh