In [1]:
# imports
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Read the cleaned dataset and discard every row that has a missing value.
data = pd.read_excel("data/cleaned_data.xlsx")
data.dropna(axis=0, how="any", inplace=True)

# German stop-word list, taken from
# https://github.com/stopwords-iso/stopwords-de/blob/master/raw/stopwords-filter-de.txt
# The sheet is read without a header row, so the words sit in the first column.
stopword_sheet = pd.read_excel("data/stopwords.xlsx", header=None)
stopwords = stopword_sheet.iloc[:, 0].tolist()

In [3]:
# All topic texts concatenated into one space-separated string.
# The column is cast to str first, mirroring the unicode cast applied before
# vectorizing below: str.join raises TypeError on any non-string entry.
# NOTE(review): god_string is not used by any other visible cell - confirm it is still needed.
god_string = " ".join(data["topic"].astype(str))

# vectorizer object: TF-IDF features. Terms found in more than 80% of the
# documents or in fewer than 2 documents are ignored, as are the stop words.
count_vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words=stopwords)
doc_term_matrix = count_vect.fit_transform(data["topic"].values.astype('U'))

# LDA object for clustering topics (8 components, fixed random_state so that
# repeated runs give the same result).
# NOTE(review): LDA is usually fit on raw term counts (CountVectorizer is
# imported but unused); TF-IDF input is kept here so existing results do not
# change - confirm this is intended.
LDA = LatentDirichletAllocation(n_components=8, random_state=42)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(n_components=8, random_state=42)

In [4]:
# Print the highest-weighted terms of every LDA topic.
N_TOP_WORDS = 5  # number of terms listed per topic; also used in the header text

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
# The vocabulary is fetched once here instead of once per printed word.
if hasattr(count_vect, "get_feature_names_out"):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

for topic_idx, topic in enumerate(LDA.components_):
    # argsort is ascending, so the last N indices belong to the largest
    # weights; the strongest term is therefore printed last.
    top_term_ids = topic.argsort()[-N_TOP_WORDS:]
    print(f'Top {N_TOP_WORDS} words for topic #{topic_idx}:')
    print([feature_names[term_id] for term_id in top_term_ids])
    print('\n')

Top 10 words for topic #0:
['usa', 'gipfel', 'brexit', 'eu', 'gestorben']


Top 10 words for topic #1:
['em', 'wm', 'ergebnisse', 'bundesliga', 'fußball']


Top 10 words for topic #2:
['türkei', 'corona', 'griechenland', 'is', 'syrien']


Top 10 words for topic #3:
['csu', 'union', 'cdu', 'merkel', 'spd']


Top 10 words for topic #4:
['kommentar', 'meldungen', 'überblick', 'weitere', 'wetter']


Top 10 words for topic #5:
['reaktionen', 'tote', 'eu', 'meinung', 'lottozahlen']


Top 10 words for topic #6:
['ägypten', 'ukraine', 'tag', 'machtkampf', 'sport']


Top 10 words for topic #7:
['eu', 'formel', 'trump', 'präsident', 'us']




In [5]:
# Label every document with its strongest topic and write the table to disk.
topic_values = LDA.transform(doc_term_matrix)  # one row of topic weights per document
# The cluster id is the column index of the largest weight in each row.
data['cluster'] = np.argmax(topic_values, axis=1)
# Same path the data was loaded from, so the input file is overwritten.
data.to_excel(excel_writer="data/cleaned_data.xlsx", index=False)