In [247]:
import random
import string
from random import randrange

def generate_user_id():
    return ''.join(random.choices(string.ascii_lowercase +
                             string.digits, k=15))
def get_random_podcast_title(podcasts):
    num = randrange(len(podcasts))
    return podcasts[num]

def get_random_entries(podcasts,num=4300,num_reviews_per_user=11):
    result = []
    count = 1
    user_id =  generate_user_id()
    for i in range(num):   
        if count == num_reviews_per_user:
            user_id =  generate_user_id()
            count = 1
        result.append((user_id,get_random_podcast_title(podcasts),random.randint(1, 5)))
        count += 1
    return result

In [248]:
from typing import Dict,Text
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
import duckdb
import json
con = duckdb.connect('database.db')

#PREPARE DATA

podcast_ratings_query = con.sql("select author_id as user_id, podcasts.title as podcast_title,rating from reviews "+ 
        "join categories using (podcast_id) " +
        "join podcasts using (podcast_id) where average_rating >= 0").to_df()

podcast_titles_query = con.sql("select podcasts.title as podcast_title from reviews "+ 
        "join categories using (podcast_id) " +
        "join podcasts using (podcast_id) where average_rating >= 0").to_df()

entries = get_random_entries(podcasts=podcast_ratings_query["podcast_title"].unique())
for entry in entries:
    row = pd.Series(entry, index=podcast_ratings_query.columns)
    podcast_ratings_query = podcast_ratings_query.append(row,ignore_index=True) 

  podcast_ratings_query = podcast_ratings_query.append(row,ignore_index=True)


In [249]:
podcast_ratings = podcast_ratings_query.to_dict(orient='records')
podcast_titles = podcast_titles_query.to_dict(orient='records')
podcast_ratings_tf = tf.data.Dataset.from_tensor_slices(pd.DataFrame.from_dict(podcast_ratings).to_dict(orient="list"))
podcast_titles_tf = tf.data.Dataset.from_tensor_slices(pd.DataFrame.from_dict(podcast_titles).to_dict(orient="list"))

ratings_tf = podcast_ratings_tf.map(lambda x: {
    "podcast_title": x["podcast_title"],
    "user_id": x["user_id"],
    "user_rating": x["rating"]
})
titles_tf = podcast_titles_tf.map(lambda x: x["podcast_title"])

tf.random.set_seed(42)
shuffled = ratings_tf.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train_len = int(len(shuffled) * (2/3))
test_len = int(len(shuffled) - train_len)

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)

podcast_titles = titles_tf.batch(1_000)
user_ids = ratings_tf.batch(1_000_000).map(lambda x: x["user_id"])

unique_podcast_titles = np.unique(np.concatenate(list(podcast_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [259]:
#TRAIN MODEL

class PodcastlensModel(tfrs.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32
    self.user_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    self.podcast_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_podcast_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_podcast_titles) + 1, embedding_dimension)
    ])
    metrics = tfrs.metrics.FactorizedTopK(
      candidates=titles_tf.batch(128).map(self.podcast_model)
    )

    self.task = tfrs.tasks.Retrieval(
      metrics=metrics
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    user_embeddings = self.user_model(features["user_id"])
    positive_podcast_embeddings = self.podcast_model(features["podcast_title"])

    return self.task(user_embeddings, positive_podcast_embeddings)

model = PodcastlensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.shuffle(100_000).batch(1500).cache()
cached_test = test.batch(1000).cache()
model.fit(cached_train, epochs=33)

index = tfrs.layers.factorized_top_k.Streaming(model.user_model,k=25)
index.index_from_dataset(
  tf.data.Dataset.zip((titles_tf.batch(1000), titles_tf.batch(100).map(model.podcast_model)))
)

Epoch 1/33
Epoch 2/33
Epoch 3/33
Epoch 4/33
Epoch 5/33
Epoch 6/33
Epoch 7/33
Epoch 8/33
Epoch 9/33
Epoch 10/33
Epoch 11/33
Epoch 12/33
Epoch 13/33
Epoch 14/33
Epoch 15/33
Epoch 16/33
Epoch 17/33
Epoch 18/33
Epoch 19/33
Epoch 20/33
Epoch 21/33
Epoch 22/33
Epoch 23/33
Epoch 24/33
Epoch 25/33
Epoch 26/33
Epoch 27/33
Epoch 28/33
Epoch 29/33
Epoch 30/33
Epoch 31/33
Epoch 32/33
Epoch 33/33


<tensorflow_recommenders.layers.factorized_top_k.Streaming at 0x2539be85060>

In [260]:
#PULL METRICS FOR MODEL ON TEST AND TRAIN DATA
train_metrics = model.evaluate(cached_train, return_dict=True)
test_metrics = model.evaluate(cached_test, return_dict=True)
print("Train Metrics")
print(json.dumps(train_metrics,indent=4))
print("") 
print("Test Metrics")
print(json.dumps(test_metrics,indent=4))

Train Metrics
{
    "factorized_top_k/top_1_categorical_accuracy": 0.23698224127292633,
    "factorized_top_k/top_5_categorical_accuracy": 0.30739644169807434,
    "factorized_top_k/top_10_categorical_accuracy": 0.36390531063079834,
    "factorized_top_k/top_50_categorical_accuracy": 0.7769230604171753,
    "factorized_top_k/top_100_categorical_accuracy": 0.9136094450950623,
    "loss": 1560.76953125,
    "regularization_loss": 0,
    "total_loss": 1560.76953125
}

Test Metrics
{
    "factorized_top_k/top_1_categorical_accuracy": 0.12011834233999252,
    "factorized_top_k/top_5_categorical_accuracy": 0.1337278038263321,
    "factorized_top_k/top_10_categorical_accuracy": 0.14792899787425995,
    "factorized_top_k/top_50_categorical_accuracy": 0.25917160511016846,
    "factorized_top_k/top_100_categorical_accuracy": 0.3159763216972351,
    "loss": 5823.7265625,
    "regularization_loss": 0,
    "total_loss": 5823.7265625
}


In [263]:
#RUN PREDICTIONS ON TEST DATA 
num = 0 
for element in test:
    user_id = element["user_id"].numpy().decode()
    _, preds = index(tf.constant([user_id]))
    unique_preds = np.unique(preds.numpy())
    unique_preds = [el.decode('UTF-8') for el in unique_preds]
    print("Top {} recommendations for user {}: {}".format(len(unique_preds),user_id,unique_preds))
    print("")
    num += 1
    if num == 5:
        break

Top 6 recommendations for user fejcke1h862u5by: ['All of the Above radio', 'Daily Solutions Podcast', 'Little Realms | A DnD Actual Play Podcast', 'Nahh B! Podcast MMA / UFC And Boxing Event Preview & Reviews', 'New Patient Group™ (Formally known as the Doctor Diamond Club Podcast)', 'Sketched Out']

Top 4 recommendations for user lw1ihjvaxzi5e39: ['Lost In The Shuffle', 'Tall Tale TV', 'The Oatley Academy ArtCast', 'Understanding Human Behavior - Video']

Top 3 recommendations for user vgmgnve48id083r: ['I Crush Barbecue Show', 'Sketched Out', 'The Oatley Academy ArtCast']

Top 5 recommendations for user jmv18lgli0r4bd0: ['All of the Above radio', 'Daily Solutions Podcast', 'Little Realms | A DnD Actual Play Podcast', 'Maximiza Tu Negocio en Redes de Mercadeo', 'Talking Web Marketing']

Top 7 recommendations for user 836fb074ac07b26: ['All of the Above radio', 'Lost In The Shuffle', 'Maximiza Tu Negocio en Redes de Mercadeo', 'Rebel Chums', 'Sketched Out', 'Tall Tale TV', 'Understandi