This notebook is an example of a failed embedding on our dataset. It is kept for documentation purposes only.

In [1]:
%%capture
! pip install -r requirements.txt

In [2]:
from pyarrow import feather
# Load the cleaned paragraph table and replace the stored index with a fresh
# 0..n-1 range index.
df = feather.read_feather("./data/clean/mdb_paragraphs.feather").reset_index(drop=True)

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly (display() is provided by IPython).
display(df.head())

# count distinct rede_ids
df["rede_id"].nunique()

38542

In [3]:
%%capture
import nltk
# Fetch the NLTK resources "stopwords" and "punkt", in this order.
for resource in ("stopwords", "punkt"):
    nltk.download(resource)

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Pick the best available accelerator instead of hard-coding Apple's "mps"
# backend, so the notebook also runs on machines with an NVIDIA GPU ("cuda")
# and on CPU-only hosts.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Initialize the SentenceTransformer model.
# trust_remote_code is left at its default (disabled) so that no code from the
# model repository is executed.
# NOTE(review): all-MiniLM-L6-v2 should not require custom remote code - confirm.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Compute embeddings for each paragraph; encode() is given a plain list of
# strings rather than a pandas Series.
embeddings = sentence_model.encode(df.paragraph.tolist(), show_progress_bar=True)

In [19]:
from umap import UMAP
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN

# German stop word list from the NLTK corpus downloaded earlier
german_stop_words = stopwords.words("german")

# Dimensionality reduction used before clustering.
# These are the default values and can be tuned for your dataset.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

# Token counts without German stop words, using unigrams and bigrams
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    stop_words=german_stop_words,
)

# Clustering settings. The last two entries are the ones to tune:
# - a lower min_cluster_size yields more clusters/topics
# - a lower min_samples reduces noise but affects the size of the clusters
hdbscan_settings = {
    "metric": "euclidean",
    "cluster_selection_method": "eom",
    "prediction_data": True,
    "min_cluster_size": 300,
    "min_samples": 300,
}
hdbscan_model = HDBSCAN(**hdbscan_settings)

In [20]:
# We reduce our embeddings to 2D as we will construct a 2D plot for visualization.
# UMAP is stochastic: without a fixed random_state every run produces a
# different layout, so the seed is pinned to the same value (42) that the
# UMAP model used for clustering is configured with.
reduced_embeddings = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)


In [21]:
from bertopic import BERTopic
import openai
from bertopic.representation import KeyBERTInspired, OpenAI
from dotenv import load_dotenv
import os

# Load settings from a .env file (if present) before touching the environment
load_dotenv()
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# KeyBERT-inspired keyword representation
keybert = KeyBERTInspired(top_n_words=20, nr_repr_docs=10)

# All representation models, keyed by aspect name
representation_model = dict(Main=keybert)

# Assemble the topic model from the components configured in the cells above
topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

# Fit on the paragraphs, reusing the precomputed embeddings
topics, probs = topic_model.fit_transform(df.paragraph, embeddings=embeddings)

2024-06-09 14:44:53,473 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-09 14:50:23,220 - BERTopic - Dimensionality - Completed ✓
2024-06-09 14:50:23,223 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-09 14:59:16,589 - BERTopic - Cluster - Completed ✓
2024-06-09 14:59:16,625 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-09 14:59:46,461 - BERTopic - Representation - Completed ✓


In [22]:
# Show the overview table of the fitted topics (topic id, size, name, keywords, representative documents)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,164462,-1_herr präsident_kollegen_kolleginnen kollege...,"[herr präsident, kollegen, kolleginnen kollege...",[Vielen Dank. – Sehr geehrte Frau Präsidentin!...
1,0,20172,0_kolleginnen kollegen_kollegen_liebe kollegin...,"[kolleginnen kollegen, kollegen, liebe kollegi...","[Liebe Kolleginnen und Kollegen, es ist in der..."
2,1,19345,1_milliarden euro_milliarde euro_millionen eur...,"[milliarden euro, milliarde euro, millionen eu...",[Ihr Antrag kostet inklusive Abschaffung des S...
3,2,13201,2_natürlich_wirklich_einfach_geht,"[natürlich, wirklich, einfach, geht, vielen da...","[Ich glaube, da sind wir uns alle einig., – Al..."
4,3,8389,3_europäischen union_europäische kommission_eu...,"[europäischen union, europäische kommission, e...",[Herr Präsident! Sehr geehrte Damen und Herren...
...,...,...,...,...,...
190,189,312,189_ausweitung minijobs_minijobs sozialversich...,"[ausweitung minijobs, minijobs sozialversicher...",[Erster Punkt. Minijobs sind ein Haupteinfalls...
191,190,311,190_geht diejenigen_diejenigen wirklich_kommen...,"[geht diejenigen, diejenigen wirklich, kommen ...","[Sie würden nämlich all diejenigen, die Sie kr..."
192,191,305,191_wasserstoffwirtschaft_wasserstoff wasserst...,"[wasserstoffwirtschaft, wasserstoff wasserstof...",[Wir brauchen es übrigens auch für blauen Wass...
193,192,304,192_beim kurzarbeitergeld_kollegen kurzarbeite...,"[beim kurzarbeitergeld, kollegen kurzarbeiterg...","[Dann haben Sie einen Antrag gestellt, dass de..."


In [23]:
# OpenAI client; the API key is read from the environment (see load_dotenv above)
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# custom prompt (German, sent to the model as-is; [DOCUMENTS] and [KEYWORDS]
# are placeholders filled in per topic)
prompt = """
Du agierst als Politikwissenschaflter und musst passende Topic-Labels für eine wissenschaftliche Arbeit erstellen. 
Finde als Oberbegriff den kleinsten gemeinsamen Nenner für die folgenden Bundestagsausschnitte und Schlüsselwörter:
[DOCUMENTS]
Schlüsselwörter: [KEYWORDS]

Erstelle auf der Grundlage der obigen Informationen ein kurzes Topic-Label. Bedingung ist, dass das Label allen Dokumenten gut beschreibt, die Schlüsselwörter dienen als Orientierung. 
Verwende weniger als fünf Wörter und verzichte auf Wörter wie "Deutschland", "Debatte" und "Bundestag".
Erstelle keine zu generellen Bezeichnungen wie "Politik", "Politische Maßnahmen", "Gesetzesentwürfe", "Antragsberatung" bzw. "Antragsdebatte" oder "Bundestagsrede".
Verwende folgendes Format:
topic: <topic label>
"""

representation_model = {
    "Main": keybert,
    # the number of documents is set to 10, can be reduced or increased based on your dataset
    # WARNING: this may have a huge impact on costs as OpenAI is a paid service calculated per token
    "OpenAI": OpenAI(client, model="gpt-4o", chat=True, prompt=prompt, nr_docs=10),
}

# Recompute the topic representations with the additional OpenAI aspect.
# - topics is passed by keyword: in newer BERTopic releases the second
#   positional parameter of update_topics is `images`, not `topics`.
# - vectorizer_model is passed again: update_topics otherwise falls back to a
#   default CountVectorizer, which would drop the German stop word list,
#   min_df and the bigram setting used when the model was fitted.
topic_model.update_topics(
    df.paragraph,
    topics=topics,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

100%|██████████| 195/195 [02:17<00:00,  1.41it/s]


In [24]:
# Fetch the topic overview again; the "OpenAI" column of this table is read in the next cell
topic_info = topic_model.get_topic_info()

In [25]:
# Map every topic id to its generated label. Each "OpenAI" cell holds a
# sequence whose first element is the label text.
# The mapping is built directly from the two columns instead of assigning into
# a column slice of topic_info, which triggers pandas' SettingWithCopyWarning.
custom_labels = {
    topic_id: label_candidates[0]
    for topic_id, label_candidates in zip(
        topic_info["Topic"].tolist(), topic_info["OpenAI"].tolist()
    )
}

topic_model.set_topic_labels(custom_labels)

In [28]:
# 2D document plot based on the precomputed 2D embeddings and the custom topic
# labels; sample=0.1 is passed so that only a fraction of the documents is drawn.
topic_model.visualize_documents(
    df.paragraph,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,
    sample=0.1,
    hide_annotations=True,
    hide_document_hover=True,
    title="2D-Darstellung des Topic Models deutscher Bundestagsreden mit all-MiniLM-L6-v2-Embeddings",
)
