In [49]:
from typing import Dict,Text
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
import duckdb
con = duckdb.connect('database.db')

#PREPARE DATA

podcast_ratings_query = con.sql("select author_id as user_id, podcasts.title as podcast_title,rating from reviews "+ 
        "join categories using (podcast_id) " +
        "join podcasts using (podcast_id) where average_rating >= 0").to_df()

podcast_titles_query = con.sql("select podcasts.title as podcast_title from reviews "+ 
        "join categories using (podcast_id) " +
        "join podcasts using (podcast_id) where average_rating >= 0").to_df()

podcast_ratings = podcast_ratings_query.to_dict(orient='records')
podcast_titles = podcast_titles_query.to_dict(orient='records')
podcast_ratings_tf = tf.data.Dataset.from_tensor_slices(pd.DataFrame.from_dict(podcast_ratings).to_dict(orient="list"))
podcast_titles_tf = tf.data.Dataset.from_tensor_slices(pd.DataFrame.from_dict(podcast_titles).to_dict(orient="list"))

ratings_tf = podcast_ratings_tf.map(lambda x: {
    "podcast_title": x["podcast_title"],
    "user_id": x["user_id"],
    "user_rating": x["rating"]
})
titles_tf = podcast_titles_tf.map(lambda x: x["podcast_title"])

tf.random.set_seed(42)
shuffled = ratings_tf.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train_len = int(len(shuffled) * (2/3))
test_len = int(len(shuffled) - train_len)

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)

podcast_titles = titles_tf.batch(1_000)
user_ids = ratings_tf.batch(1_000_000).map(lambda x: x["user_id"])

unique_podcast_titles = np.unique(np.concatenate(list(podcast_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [50]:
#TRAIN MODEL

class PodcastlensModel(tfrs.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32
    self.user_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_user_ids, mask_token=None),
      # We add an additional embedding to account for unknown tokens.
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    self.podcast_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_podcast_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_podcast_titles) + 1, embedding_dimension)
    ])
    metrics = tfrs.metrics.FactorizedTopK(
      candidates=titles_tf.batch(128).map(self.podcast_model)
    )

    self.task = tfrs.tasks.Retrieval(
      metrics=metrics
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the podcast features and pass them into the podcast model,
    # getting embeddings back.
    positive_podcast_embeddings = self.podcast_model(features["podcast_title"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_podcast_embeddings)

model = PodcastlensModel()
model.compile(optimizer=tf.keras.optimizers.Adamax(learning_rate=0.1))
cached_train = train.shuffle(100_000).batch(250).cache()
cached_test = test.batch(100).cache()
model.fit(cached_train, epochs=25)

# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.Streaming(model.user_model,k=25)
# recommends podcasts out of the entire podcasts dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((titles_tf.batch(100), titles_tf.batch(100).map(model.podcast_model)))
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow_recommenders.layers.factorized_top_k.Streaming at 0x2368047e680>

In [51]:
#PULL METRICS FOR MODEL ON TEST DATA 
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.5603112578392029,
 'factorized_top_k/top_5_categorical_accuracy': 0.5642023086547852,
 'factorized_top_k/top_10_categorical_accuracy': 0.5797665119171143,
 'factorized_top_k/top_50_categorical_accuracy': 0.6498054265975952,
 'factorized_top_k/top_100_categorical_accuracy': 0.7392995953559875,
 'loss': 132.70932006835938,
 'regularization_loss': 0,
 'total_loss': 132.70932006835938}

In [52]:
#PULL METRICS FOR MODEL ON TRAIN DATA 
model.evaluate(cached_train, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.6705653071403503,
 'factorized_top_k/top_5_categorical_accuracy': 0.6744639277458191,
 'factorized_top_k/top_10_categorical_accuracy': 0.6998050808906555,
 'factorized_top_k/top_50_categorical_accuracy': 0.7738791704177856,
 'factorized_top_k/top_100_categorical_accuracy': 0.859649121761322,
 'loss': 0.0,
 'regularization_loss': 0,
 'total_loss': 0.0}

In [53]:
#RUN PREDICTIONS ON TEST DATA
num = 0 
threshold = 1
for element in test:
    user_id = element["user_id"].numpy().decode()
    _, titles = index(tf.constant([user_id]))
    unique_preds = np.unique(titles.numpy())
    unique_preds = [el.decode('UTF-8') for el in unique_preds]
    if len(unique_preds) > threshold:
        print("Recommendations for user {}: {}".format(user_id,unique_preds))
        print("")
        num += 1
print("Number of users with more than {} recommendation: {}".format(threshold,num))

Recommendations for user a28b1fa544f1f98: ['Little Realms | A DnD Actual Play Podcast', 'Lost In The Shuffle']

Recommendations for user 56a78729efe1265: ['Hear It Now (retired)', 'Lost In The Shuffle', 'The Daily Boost | Daily Coaching and Motivation']

Recommendations for user 56a78729efe1265: ['Hear It Now (retired)', 'Lost In The Shuffle', 'The Daily Boost | Daily Coaching and Motivation']

Recommendations for user d366ab241d287f1: ['Inglestotal : Cursos y clases gratis de Ingles', 'NFL no ProFootballcast com Antony Curti e Eduardo Miceli', 'New Patient Group™ (Formally known as the Doctor Diamond Club Podcast)']

Recommendations for user 2aec6e2a52c1771: ['Inglestotal : Cursos y clases gratis de Ingles', 'Nahh B! Podcast MMA / UFC And Boxing Event Preview & Reviews', 'Sketched Out', 'The Fertility Warriors Podcast: Helping women survive infertility and trying to conceive', 'Twin Talk with the King Twins']

Recommendations for user 27d85c140f6e9ba: ['NFL no ProFootballcast com Anto