In [1]:
import pandas as pd
import numpy as np

In [34]:
# Load the dataset from the clip_topic_modeling management command
dataset = pd.read_json("../data/dataset.jsonl", lines=True)
dataset.head()
summaries = dataset["summary"][:5000]

In [35]:
# Embed the summary instead of using the transcript embeddings
from bertopic.backend import BaseEmbedder
from fastembed import TextEmbedding

class NomicEmbedder(BaseEmbedder):
    def __init__(self):
        super().__init__()
        self.embedding_model = TextEmbedding(model_name="nomic-ai/nomic-embed-text-v1.5-Q")

    def embed(self, documents, verbose=False):
        embeddings_generator = self.embedding_model.embed(documents)
        return np.array(list(embeddings_generator)) 
    
embedding_model = NomicEmbedder()
embeddings = embedding_model.embed(summaries)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 6964.97it/s]


In [36]:
from sklearn.feature_extraction import text

# We need custom stop words for the AI generated summaries 
summary_stop_words = [
  "delve", "clip", "hosts", "discuss", "tone", "conversational", "personal", "touches", "conversation", "including"
]
stop_words = list(text.ENGLISH_STOP_WORDS.union(summary_stop_words) )


In [67]:
import openai
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, OpenAI

# Reccomended umap for dimensionality reduction
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Reccomended HDBSCAN for clustering
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# CountVectorizer for improving topic representation
vectorizer_model = CountVectorizer(stop_words=stop_words, min_df=2, ngram_range=(1, 2))
# KeyBERTInspired for additional topic representation
keybert_model = KeyBERTInspired()
# GPT-3.5
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
client = openai.OpenAI(api_key="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")
openai_model = OpenAI(client, model="gpt-4o-mini", exponential_backoff=True, chat=True, prompt=prompt)


# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,
}

In [69]:
from bertopic import BERTopic

# Convert embeddings into np.ndarray
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

topics, probs = topic_model.fit_transform(summaries, embeddings)

2024-08-09 14:23:46,412 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-09 14:23:51,729 - BERTopic - Dimensionality - Completed ✓
2024-08-09 14:23:51,729 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-09 14:23:51,795 - BERTopic - Cluster - Completed ✓
2024-08-09 14:23:51,796 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 62/62 [00:55<00:00,  1.12it/s]
2024-08-09 14:25:24,743 - BERTopic - Representation - Completed ✓


In [70]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,Representative_Docs
0,-1,1313,-1_trump_political_discusses_discussion,"[trump, political, discusses, discussion, chal...","[topics, insights, media, podcast, discusses, ...",[Social Media's Evolving Impact],[Pollster Rich Barris discusses recent politic...
1,0,471,0_football_nfl_team_quarterback,"[football, nfl, team, quarterback, fantasy, pl...","[fantasy football, quarterbacks, draft, discus...",[Fantasy Football Strategy Insights],[This clip discusses recent trade rumors surro...
2,1,211,1_harris_kamala_kamala harris_walz,"[harris, kamala, kamala harris, walz, politica...","[harris campaign, presidential campaign, kamal...",[Harris vs. Trump Campaign Dynamics],[Vice President Kamala Harris challenges Donal...
3,2,209,2_importance_speaker_self_business,"[importance, speaker, self, business, growth, ...","[encouraging, practical insights, speaker emph...",[Personal Growth and Success Strategies],"[In this powerful conversation, a successful e..."
4,3,185,3_olympic_olympics_athletes_sports,"[olympic, olympics, athletes, sports, paris, g...","[olympics, olympic, olympic sports, olympic ga...",[Simone Biles and Olympic Excellence],[Sports reporter Ava Wallace delivers a recap ...
...,...,...,...,...,...,...,...
57,56,17,56_brain_consciousness_human_network,"[brain, consciousness, human, network, ai, res...","[consciousness, brain, mind, thought provoking...","[Neuroscience, Consciousness, and Technology]",[This clip explores the intersection of neuros...
58,57,16,57____,"[, , , , , , , , , ]","[, , , , , , , , , ]",[Missing Keywords and Documents],"[, , ]"
59,58,16,58_russia_prisoner_exchange_gershkovich,"[russia, prisoner, exchange, gershkovich, swap...","[prisoner exchange, russia prisoner, prisoner ...",[High-Profile Multinational Prisoner Swap],[This clip delves into the recent multinationa...
60,59,15,59_media_social media_political_musk,"[media, social media, political, musk, social,...","[political discourse, media particularly, medi...","[Media, Politics, and Free Speech]","[This clip covers a range of topics, centering..."


In [71]:
new_topics = topic_model.reduce_outliers(summaries, topics)
new_topics = topic_model.reduce_outliers(summaries, topics, strategy="embeddings", embeddings=embeddings)
topic_model.update_topics(summaries, topics=new_topics)

100%|██████████| 2/2 [00:00<00:00,  3.03it/s]


In [72]:
topic_model.get_topic(1, full=True)

{'Main': [('harris', 0.054052345003635566),
  ('kamala', 0.024914251678689516),
  ('her', 0.021771913604771546),
  ('walz', 0.01896104315106717),
  ('political', 0.018493916685635787),
  ('campaign', 0.017501987263279208),
  ('democratic', 0.015272902382806465),
  ('presidential', 0.014146856995733908),
  ('vice', 0.014067093785628737),
  ('trump', 0.013960943925954658)],
 'KeyBERT': [('harris campaign', 0.7150465),
  ('presidential campaign', 0.70166427),
  ('kamala harris', 0.6983432),
  ('campaign', 0.6796224),
  ('harris political', 0.6692285),
  ('president kamala', 0.6661615),
  ('candidates', 0.64990175),
  ('candidate', 0.64885986),
  ('kamala', 0.64526975),
  ('presidential race', 0.6309114)],
 'OpenAI': [('Harris vs. Trump Campaign Dynamics', 1)]}

In [73]:
chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,OpenAI,Representative_Docs
0,0,474,0_football_nfl_team_quarterback,Fantasy Football Strategy Insights,"[football, nfl, team, quarterback, the, fantas...","[fantasy football, quarterbacks, draft, discus...",[Fantasy Football Strategy Insights],[This clip discusses recent trade rumors surro...
1,1,257,1_harris_kamala_her_walz,Harris vs. Trump Campaign Dynamics,"[harris, kamala, her, walz, political, campaig...","[harris campaign, presidential campaign, kamal...",[Harris vs. Trump Campaign Dynamics],[Vice President Kamala Harris challenges Donal...
2,2,287,2_personal_importance_to_and,Personal Growth and Success Strategies,"[personal, importance, to, and, of, speaker, t...","[encouraging, practical insights, speaker emph...",[Personal Growth and Success Strategies],"[In this powerful conversation, a successful e..."
3,3,193,3_olympic_olympics_the_athletes,Simone Biles and Olympic Excellence,"[olympic, olympics, the, athletes, sports, in,...","[olympics, olympic, olympic sports, olympic ga...",[Simone Biles and Olympic Excellence],[Sports reporter Ava Wallace delivers a recap ...
4,4,169,4_biden_democratic_party_political,Biden's Cognitive Health Concerns,"[biden, democratic, party, political, joe, pre...","[concerns biden, biden candidacy, biden, biden...",[Biden's Cognitive Health Concerns],[Deep dive into Joe Biden's cognitive health a...
...,...,...,...,...,...,...,...,...
56,56,29,56_brain_consciousness_human_our,"Neuroscience, Consciousness, and Technology","[brain, consciousness, human, our, of, in, how...","[consciousness, brain, mind, thought provoking...","[Neuroscience, Consciousness, and Technology]",[This clip explores the intersection of neuros...
57,57,16,57____,Missing Keywords and Documents,"[, , , , , , , , , ]","[, , , , , , , , , ]",[Missing Keywords and Documents],"[, , ]"
58,58,16,58_russia_prisoner_exchange_gershkovich,High-Profile Multinational Prisoner Swap,"[russia, prisoner, exchange, gershkovich, swap...","[prisoner exchange, russia prisoner, prisoner ...",[High-Profile Multinational Prisoner Swap],[This clip delves into the recent multinationa...
59,59,39,59_media_political_social_of,"Media, Politics, and Free Speech","[media, political, social, of, the, and, on, a...","[political discourse, media particularly, medi...","[Media, Politics, and Free Speech]","[This clip covers a range of topics, centering..."


In [74]:
topic_model.visualize_hierarchy(custom_labels=True)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [80]:
topic_distr, _ = topic_model.approximate_distribution(summaries, window=8, stride=4)

100%|██████████| 5/5 [00:00<00:00,  6.25it/s]


In [88]:
import random
# Random summary
summary_index = random.randint(0, len(summaries) - 1)
print(dataset["id"][summary_index])    
print(dataset["name"][summary_index])    
print(dataset["summary"][summary_index])
topic_model.visualize_distribution(topic_distr[summary_index], custom_labels=True)

7424
US Soccer Coach Search: Vieira Emerges as Strong Contender
This clip discusses the ongoing search for a new US national soccer team coach. Fabrizio Romano, a football insider, provides insights into potential candidates. Patrick Vieira emerges as a strong contender, with positive conversations taking place. Other names mentioned include Thierry Henry, Mauricio Pochettino, and Thomas Tuchel, though they seem less likely. The clip also touches on the future prospects of outgoing manager Greg Berhalter, suggesting he might be suited for a newly promoted Premier League club. The conversation concludes with speculation about the England national team job post-Gareth Southgate, with Pep Guardiola mentioned as a dream candidate. The tone is informational and speculative, offering insider knowledge on high-profile coaching positions in international soccer.


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed