<a href="https://colab.research.google.com/github/kmkawa/podcast-book-recommender/blob/main/podcast_to_embed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data (descriptions) to Embeddings

In [None]:
!pip install transformers torch pandas numpy



In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load the tokenizer and the model from the same pretrained checkpoint.
# Keeping the checkpoint name in one place guarantees the two always match.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

# Takes in the texts and returns their embeddings
def bert_embed(texts, batch_size=16, max_length=256):
    """Generate BERT embeddings for a sequence of texts using mean pooling.

    Uses the module-level ``tokenizer`` and ``model``. The model is switched
    to eval mode and run without gradient tracking.

    Args:
        texts: Sequence of strings to embed (anything that supports ``len()``
            and slicing). A single ``str`` is treated as one text rather than
            being sliced into character chunks.
        batch_size: Number of texts sent through the model per forward pass.
            Must be at least 1.
        max_length: Maximum number of tokens per text; longer inputs are
            truncated by the tokenizer.

    Returns:
        A 2-D ``np.ndarray`` with one row per input text, in input order, and
        ``model.config.hidden_size`` columns. Empty input yields an array of
        shape ``(0, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Slicing a bare string would yield substrings, not texts.
    if isinstance(texts, str):
        texts = [texts]

    # np.vstack() rejects an empty list, so answer the empty case directly.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Run on whatever device the model currently lives on (CPU or GPU).
    device = next(model.parameters()).device

    model.eval()
    embeddings = []

    with torch.no_grad():
        # Walk through the texts one batch at a time
        for i in range(0, len(texts), batch_size):
            # list() so non-list sequences are accepted by the tokenizer too.
            batch = list(texts[i:i + batch_size])
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            ).to(device)
            outputs = model(**inputs)
            last_hidden_state = outputs.last_hidden_state

            # Broadcast the attention mask over the hidden dimension so that
            # padding positions contribute nothing to the sum below.
            mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float()

            # Mean pooling: masked sum over dim 1 divided by the number of
            # unmasked positions (clamped to avoid division by zero).
            pooled_embedding = torch.sum(last_hidden_state * mask, dim=1) / torch.clamp(mask.sum(dim=1), min=1e-9)
            embeddings.append(pooled_embedding.cpu().numpy())

    return np.vstack(embeddings)


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the data paths used in the cells
# below all live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json

# Originally in .json; the filtered data is written out as .csv further down.
# The file maps each podcast title to a dict of metadata for that podcast.
json_path = "/content/drive/My Drive/DIS Work/Core Course Work/podcast_data/spotify_podcast_data_cleaned.json"
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Loaded {len(data)} podcast descriptions.")

# Filter descriptions to ensure valid input:
# keep only entries whose metadata is a dict with a string description
# longer than 10 characters once surrounding whitespace is stripped.
valid_data = {
    title: meta
    for title, meta in data.items()
    if isinstance(meta, dict)
    and isinstance(meta.get("description"), str)
    and len(meta["description"].strip()) > 10
}

# Titles and descriptions are built from the same dict, so they stay aligned.
titles = list(valid_data.keys())
descriptions = [valid_data[title]["description"] for title in titles]
print(f"Filtered down to {len(descriptions)} valid descriptions.")

# Here is where we get the embeddings by just passing through
# the description texts of each podcast
embeddings = bert_embed(descriptions)

# Display the embedding matrix shape as the cell output (one row per description).
embeddings.shape

Loaded 2890 podcast descriptions.
Filtered down to 2842 valid descriptions.


(2842, 768)

In [None]:
import csv
import numpy as np

# Paths for saving
csv_output_path = "/content/drive/MyDrive/podcast_data_with_bert_embeddings.csv"
embeddings_output_path = "/content/drive/MyDrive/podcast_description_bert_embeddings.npy"

# zip() below stops at the shortest input, so mismatched lengths (e.g. stale
# variables left over from an earlier run) would silently produce a truncated
# CSV. Check before anything is written.
if not (len(titles) == len(descriptions) == len(embeddings)):
    raise ValueError(
        "titles, descriptions and embeddings are out of sync: "
        f"{len(titles)} titles, {len(descriptions)} descriptions, {len(embeddings)} embeddings"
    )

# Save embeddings matrix to .npy
np.save(embeddings_output_path, embeddings)
print(f"Saved embeddings array to: {embeddings_output_path}")

# Write data to CSV: one row per podcast, with its embedding in the last column
with open(csv_output_path, "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["title", "description", "total_episodes", "embedding"])

    for title, desc, emb in zip(titles, descriptions, embeddings):
        # Look the episode count up in the full metadata; blank if it is missing.
        meta = data.get(title, {})
        episode_count = meta.get("total_episodes", "")

        # csv.writer stringifies non-string values, so the embedding cell holds
        # the text form of a Python list of floats.
        writer.writerow([title, desc, episode_count, emb.tolist()])

print(f"CSV with embeddings and metadata created: {csv_output_path}")


Saved embeddings array to: /content/drive/MyDrive/podcast_description_bert_embeddings.npy
CSV with embeddings and metadata created: /content/drive/MyDrive/podcast_data_with_bert_embeddings.csv
