In [151]:
import pandas as pd
import requests
import json
import numpy as np

import regex as re
import string
import spacy
from collections import defaultdict, Counter

import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

#%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.table import Table
from matplotlib.ticker import FuncFormatter

import seaborn as sns

import pyLDAvis
import pyLDAvis.gensim_models

import warnings, logging
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Daten laden und vorverarbeiten

In [152]:
def load_data(file_path):
    df = pd.read_csv(file_path, encoding='utf-8')
    return df

def preprocess_text(text):
    # Remove punctuation and convert to lowercase
    text = re.sub(f"[{re.escape(string.punctuation)}]", ' ', text)
    text = re.sub(r"\d+", " ", text)
    text = text.strip().replace('\n', ' ')
    text = text.lower()
    return text

def tokenize_text(text):
    return word_tokenize(text, language='german')

def make_bigrams(data_words):
    return [bigram_mod[data] for data in data_words]

def make_trigrams(data_words):
    return [trigram_mod[bigram_mod[data]] for data in data_words]

def lemmatize(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for text in texts:
        doc = nlp(" ".join(text))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    texts_out = [[word for word in simple_preprocess(str(text)) if word not in stop_words] for text in texts_out]
    return texts_out

def build_dictionary_and_corpus(data_ready):
    id2word = Dictionary(data_ready) # fit dictionary
    
    print(f"Vokabular vor der Filterung: {len(id2word)} Wörter")
    # Filtere das Vokabular
    id2word.filter_extremes(no_below=5, no_above=0.40, keep_n=10000)
    print(f"Vokabular nach der Filterung: {len(id2word)} Wörter")
    
    bow_corpus = [id2word.doc2bow(text, allow_update=True) for text in data_ready]
    return id2word, bow_corpus

In [153]:
url = "https://ordnungsamt.berlin.de/frontend.webservice.opendata/api/meldungen"

#get the data from url
response = requests.get(url).json()
df = pd.json_normalize(response, record_path=['index'])
df.replace(to_replace=[None], value='', inplace=True)

#New column after customer complaints are processed with function to clean up and lemmatize.
df['sachverhalt_clean'] = df['sachverhalt'].map(preprocess_text)
data = df['sachverhalt_clean'].values.tolist()

#tokenize
words_tokenized = [tokenize_text(sentence) for sentence in data]

#remove stopwords
stop_words = stopwords.words('german')
custom_stopwords = ['geehrt', 'damen', 'und', 'herren', 'schon', 'herr', 'dame', 'grüßenk', 'geehrte_damen', 'vsehr', 'mal', 'seeehr', 'grüßenstefanie', 'lieb', 'berlintel', 'mit_freundlich', 'auf_dem', 'hiermit', 'soweit', 'zudem', 'siehe_foto', 'nicht_mehr', 'mehr_als']
stop_words.extend(custom_stopwords)

# make bigram and trigram
bigram = gensim.models.Phrases(words_tokenized, min_count=5, threshold=15)
trigram = gensim.models.Phrases(bigram[words_tokenized], threshold=15)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

words_bigram = make_bigrams(words_tokenized)
words_trigram = make_trigrams(words_tokenized)

#lemmatize and remove stop words
#!python -m spacy download de_core_news_sm
nlp = spacy.load("de_core_news_sm")
data_ready = lemmatize(words_trigram, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_ready = [x for x in data_ready if x] #remove empty lists from list

In [154]:
# create dictionary and corpus
id2word, bow_corpus = build_dictionary_and_corpus(data_ready)
bow_corpus = [x for x in bow_corpus if x] # remove empty lists from list

Vokabular vor der Filterung: 6274 Wörter
Vokabular nach der Filterung: 1039 Wörter


# Besten Model finden und implementieren

In [155]:
def build_tfidf_model(bow_corpus, id2word, smartire):
    tfidf_model = models.TfidfModel(bow_corpus, dictionary=id2word, smartirs=smartire)
    tfidf_corpus = tfidf_model[bow_corpus]
    return tfidf_model, tfidf_corpus

def calculate_lsa_perplexity(model, corpus):
    count = len(corpus) #number of documents in the corpus
    log_likelihood = 0.0
    nonzero_count = 0  # Zählt die nicht-negativen Wahrscheinlichkeiten

    for doc in corpus:
        topics = model[doc] #retrieves the topics and their probabilities for the current document from model
        for topic, prob in topics:
            if prob >= 0:
                # accumulates the log likelihood of the document, 
                # adding a small constant (1e-10) to avoid issues with taking the logarithm of very small probabilities
                log_likelihood += np.log(prob + 1e-10) # Add a small constant to avoid log(0)
                nonzero_count += 1

    # Überprüfen, ob es mindestens eine nicht-negative Wahrscheinlichkeit gab
    if nonzero_count > 0:
        perplexity = np.exp(-log_likelihood / nonzero_count)
    else:
        perplexity = np.inf  # If there are zero non-zero probabilities, the perplexity is set to infinity

    return perplexity

def calculate_coherence_perplexity(corpus, n, alpha, beta, id2word, data_ready, model_type):
    if model_type == "LDA TF-IDF" or model_type == "LDA BoW":
        model = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=n, random_state=42,
                                           update_every=0, decay=0.9, chunksize=100, passes=30, alpha=alpha,
                                           per_word_topics=True, eta=beta)
        perplexity = -model.log_perplexity(corpus)
    elif model_type == "LSA TF-IDF" or model_type == "LSA BoW":
        model = gensim.models.LsiModel(corpus=corpus, id2word=id2word, num_topics=n, random_seed=42, decay=0.9, chunksize=100, power_iters=30)
        perplexity = calculate_lsa_perplexity(model, corpus)
    
    coherence_model = CoherenceModel(model=model, texts=data_ready, dictionary=id2word, coherence='c_npmi') 
    coherence = coherence_model.get_coherence()
    
    return coherence, perplexity

def find_best_model_parameters(corpus, id2word, data_ready, no_topics, alpha_list, beta_list, model_type):
    best_coherence = -np.inf
    best_parameters = None

    if model_type == "LDA BoW":
        for n in no_topics:
            for alpha in alpha_list:
                for beta in beta_list:
                    coherence, perplexity = calculate_coherence_perplexity(corpus, n, alpha, beta, id2word, data_ready, model_type)
                    if coherence > best_coherence:
                        best_coherence = coherence
                        best_parameters = n, alpha, beta
    else: #LSA
        for n in no_topics:
            coherence, perplexity = calculate_coherence_perplexity(corpus, n, alpha_list[0], beta_list[0], id2word, data_ready, model_type)
            if coherence > best_coherence:
                best_coherence = coherence
                best_parameters = n, alpha_list[0], beta_list[0]

    return best_parameters

def find_best_model_parameters_tfidf(corpus, id2word, data_ready, no_topics, alpha_list, beta_list, model_type):
    best_coherence = -np.inf
    best_parameters = None
    
    smartirs_lda = ['dpn', 'bxn', 'bxu', 'bpb', 'bfb'] 
    smartirs_lsa = ['lpn', 'txn', 'tpn']

    if model_type == "LDA TF-IDF":
        for i in smartirs_lda:
            # Create the TF-IDF model
            tfidf_model = gensim.models.TfidfModel(corpus, dictionary = id2word, smartirs = i)
            tfidf_corpus = tfidf_model[corpus]

            for n in no_topics:
                for alpha in alpha_list:
                    for beta in beta_list:
                        coherence, perplexity = calculate_coherence_perplexity(tfidf_corpus, n, alpha, beta, id2word, data_ready, model_type)
                        if coherence > best_coherence:
                            best_coherence = coherence
                            best_parameters = n, alpha, beta, i
    else:
        for i in smartirs_lsa:
            # Create the TF-IDF model
            tfidf_model = gensim.models.TfidfModel(corpus, dictionary = id2word, smartirs = i)
            tfidf_corpus = tfidf_model[corpus]

            for n in no_topics:
                coherence, perplexity = calculate_coherence_perplexity(tfidf_corpus, n, alpha_list[0], beta_list[0], id2word, data_ready, model_type)
                if coherence > best_coherence:
                    best_coherence = coherence
                    best_parameters = n, alpha_list[0], beta_list[0], i
                
    return best_parameters
    
def plot_coherence(coherence_lda_bow, coherence_lda_tfidf, coherence_lsa_bow, coherence_lsa_tfidf, no_topics):
    fig, ax1 = plt.subplots(figsize=(12, 6))
    ax1.plot(no_topics, coherence_lda_bow, label="LDA BoW")
    ax1.plot(no_topics, coherence_lda_tfidf, label="LDA TF-IDF")
    ax1.plot(no_topics, coherence_lsa_bow, label="LSA BoW")
    ax1.plot(no_topics, coherence_lsa_tfidf, label="LSA TF-IDF")
    
    ax1.set_xlabel('Anzahl der Themen')
    ax1.set_ylabel("Koheränz")
    ax1.legend(loc='best')
    ax1.set_title('Koheränz in Abhängigkeit von der Anzahl der Themen')

    # Table data
    data = [coherence_lda_bow, coherence_lda_tfidf, coherence_lsa_bow, coherence_lsa_tfidf]
    data = [['%.2f' % j for j in i] for i in data] #round
    rows = ["LDA BoW", "LDA TF-IDF", "LSA BoW", "LSA TF-IDF"]
    columns = ['%d' % x for x in no_topics]
    
    # Add a table below the graph with the same size
    ax2 = plt.gca().twiny()
    ax2.axis("off")
    table = ax2.table(cellText=data, rowLabels=rows, colLabels=columns, rowLoc='center', loc='center', bbox=[0, -0.3, 1.0, 0.2])
    table.auto_set_font_size(False)
    table.set_fontsize(10)

    # Set the size of ax2 to match the size of ax1
    ax2.set_position(ax1.get_position())
    
    plt.savefig('coherence_score.png', bbox_inches='tight', dpi=150)
    
    plt.show()

def plot_perplexity(perplexity_lda_bow, perplexity_lda_tfidf, perplexity_lsa_bow, perplexity_lsa_tfidf, no_topics):
    fig, ax1 = plt.subplots(figsize=(12, 6))
    ax1.plot(no_topics, perplexity_lda_bow, label="LDA BoW")
    ax1.plot(no_topics, perplexity_lda_tfidf, label="LDA TF-IDF")
    ax1.plot(no_topics, perplexity_lsa_bow, label="LSA BoW")
    ax1.plot(no_topics, perplexity_lsa_tfidf, label="LSA TF-IDF")
    
    ax1.set_xlabel('Anzahl der Themen')
    ax1.set_ylabel("Perplexität")
    ax1.legend(loc='best')
    ax1.set_title('Perplexität in Abhängigkeit von der Anzahl der Themen')

    # Table data
    data = [perplexity_lda_bow, perplexity_lda_tfidf, perplexity_lsa_bow, perplexity_lsa_tfidf]
    data = [['%.2f' % j for j in i] for i in data]
    rows = ["LDA BoW", "LDA TF-IDF", "LSA BoW", "LSA TF-IDF"]
    columns = ['%d' % x for x in no_topics]
    
    # Add a table below the graph with the same size
    ax2 = plt.gca().twiny()
    ax2.axis("off")
    table = ax2.table(cellText=data, rowLabels=rows, colLabels=columns, rowLoc='center', loc='center', bbox=[0, -0.3, 1.0, 0.2]) #[xmin, ymin, width, height]
    table.auto_set_font_size(False)
    table.set_fontsize(10)

    # Set the size of ax2 to match the size of ax1
    ax2.set_position(ax1.get_position())
    
    plt.savefig('perplexity.png', bbox_inches='tight', dpi=150)
    
    plt.show()

def build_lda_model(corpus, id2word, n, alpha, beta):
    lda_model = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=n, 
                                       random_state=42, update_every=0, decay=0.9, 
                                       chunksize=100, passes=30, alpha=alpha,
                                       per_word_topics=True, eta=beta)
    return lda_model

def build_lsa_model(corpus, id2word, n):
    lsa_model = gensim.models.LsiModel(corpus=corpus, id2word=id2word, num_topics=n, random_seed=42, 
                                       decay=0.9, chunksize=100, power_iters=30)
    return lsa_model

In [156]:
def find_best_model(corpus, id2word, data_ready, no_topics, alpha_list, beta_list, model_type):
    corpus = corpus
    
    coherence = -np.inf
    perplexity = -np.inf
    
    best_model = None
    
    if  model_type == "LDA BoW" or model_type == "LSA BoW":
        n, alpha, beta = find_best_model_parameters(corpus, id2word, data_ready, no_topics, alpha_list, beta_list, model_type)
        
        if model_type == "LDA BoW":
            best_model = build_lda_model(corpus, id2word, n, alpha, beta)
        elif model_type == "LSA BoW":
            best_model = build_lsa_model(corpus, id2word, n)
        
        coherence, perplexity = calculate_coherence_perplexity(corpus, n, alpha, beta, id2word, data_ready, model_type)
        
    elif model_type == "LDA TF-IDF" or model_type == "LSA TF-IDF":
        n, alpha, beta, smartire = find_best_model_parameters_tfidf(corpus, id2word, data_ready, no_topics, alpha_list, beta_list, model_type)
        tfidf, tfidf_corpus = build_tfidf_model(corpus, id2word, smartire)
        #tfidf_id2word = Dictionary.from_corpus(tfidf_corpus)
            
        if model_type == "LDA TF-IDF":
            best_model = build_lda_model(tfidf_corpus, id2word, n, alpha, beta)
        elif model_type == "LSA TF-IDF":
            best_model = build_lsa_model(tfidf_corpus, id2word, n)

        coherence, perplexity = calculate_coherence_perplexity(tfidf_corpus, n, alpha, beta, id2word, data_ready, model_type)

    print(f"\nBestes {model_type} Model:")
    if model_type == "LDA TF-IDF" or model_type == "LDA BoW":
        print(f"Die besten Parameter: Anzahl der Themen: {n}, Alpha: {alpha}, Beta: {beta}")
    else:
        print(f'Der beste Parameter: Anzahl der Themen: {n}')
    
    if model_type == "LDA TF-IDF" or model_type == "LSA TF-IDF":
        print(f'Der beste Parameter für TF-IDF: Smartire: {smartire}')
        
    print(f"Koheränz: {coherence}")
    print(f"Perplexität: {perplexity}")

    return best_model, corpus, coherence, perplexity

In [157]:
no_topics = range(2, 21, 1)
alpha_list = [0.001, 0.01, 0.1, 0.5]  
beta_list = [0.001, 0.01, 0.02, 0.1, 0.5] 

In [None]:
# Initialize lists to store coherence scores for each method
coherence_scores_lda_bow = []
coherence_scores_lda_tfidf = []
coherence_scores_lsa_bow = []
coherence_scores_lsa_tfidf = []

perplexity_scores_lda_bow = []
perplexity_scores_lda_tfidf = []
perplexity_scores_lsa_bow = []
perplexity_scores_lsa_tfidf = []

best_coherence_lda_bow = -np.inf
best_coherence_lda_tfidf = -np.inf
best_coherence_lsa_bow = -np.inf
best_coherence_lsa_tfidf = -np.inf


for n in no_topics:
    # LDA BoW
    lda_bow_model, lda_bow_corpus, lda_bow_coherence, lda_bow_perplexity = find_best_model(bow_corpus, id2word, data_ready, [n], alpha_list, beta_list, "LDA BoW")
    coherence_scores_lda_bow.append(lda_bow_coherence)
    perplexity_scores_lda_bow.append(lda_bow_perplexity)
    if lda_bow_coherence > best_coherence_lda_bow:
        best_coherence_lda_bow = lda_bow_coherence
        perplexity_lda_bow = lda_bow_perplexity
        model_lda_bow = lda_bow_model
        corpus_lda_bow = lda_bow_corpus
    
    # LDA TF-IDF
    lda_tfidf_model, lda_tfidf_corpus, lda_tfidf_coherence, lda_tfidf_perplexity = find_best_model(bow_corpus, id2word, data_ready, [n], alpha_list, beta_list, "LDA TF-IDF")
    coherence_scores_lda_tfidf.append(lda_tfidf_coherence)
    perplexity_scores_lda_tfidf.append(lda_tfidf_perplexity)
    if lda_tfidf_coherence > best_coherence_lda_tfidf:
        best_coherence_lda_tfidf = lda_tfidf_coherence
        perplexity_lda_tfidf = lda_tfidf_perplexity
        model_lda_tfidf = lda_tfidf_model
        corpus_lda_tfidf = lda_tfidf_corpus
    
    # LSA BoW
    lsa_bow_model, lsa_bow_corpus, lsa_bow_coherence, lsa_bow_perplexity = find_best_model(bow_corpus, id2word, data_ready, [n], alpha_list, beta_list, "LSA BoW")
    coherence_scores_lsa_bow.append(lsa_bow_coherence)
    perplexity_scores_lsa_bow.append(lsa_bow_perplexity)
    if lsa_bow_coherence > best_coherence_lsa_bow:
        best_coherence_lsa_bow = lsa_bow_coherence
        perplexity_lsa_bow = lsa_bow_perplexity
        model_lsa_bow = lsa_bow_model
        corpus_lsa_bow = lsa_bow_corpus
    
    # LSA TF-IDF
    lsa_tfidf_model, lsa_tfidf_corpus, lsa_tfidf_coherence, lsa_tfidf_perplexity = find_best_model(bow_corpus, id2word, data_ready, [n], alpha_list, beta_list, "LSA TF-IDF")
    coherence_scores_lsa_tfidf.append(lsa_tfidf_coherence)
    perplexity_scores_lsa_tfidf.append(lsa_tfidf_perplexity)
    if lsa_tfidf_coherence > best_coherence_lsa_tfidf:
        best_coherence_lsa_tfidf = lsa_tfidf_coherence
        perplexity_lsa_tfidf = lsa_tfidf_perplexity
        model_lsa_tfidf = lsa_tfidf_model
        corpus_lsa_tfidf = lsa_tfidf_corpus


Bestes LDA BoW Model:
Die besten Parameter: Anzahl der Themen: 2, Alpha: 0.5, Beta: 0.5
Koheränz: -0.1384093377046739
Perplexität: 8.150993666052978

Bestes LDA TF-IDF Model:
Die besten Parameter: Anzahl der Themen: 2, Alpha: 0.5, Beta: 0.5
Der beste Parameter für TF-IDF: Smartire: bxn
Koheränz: -0.1537891893625272
Perplexität: 8.193209199371655

Bestes LSA BoW Model:
Der beste Parameter: Anzahl der Themen: 2
Koheränz: -0.10245136185975384
Perplexität: 232.12568919970792

Bestes LSA TF-IDF Model:
Der beste Parameter: Anzahl der Themen: 2
Der beste Parameter für TF-IDF: Smartire: lpn
Koheränz: 0.13857558515376384
Perplexität: 1118.967523693542

Bestes LDA BoW Model:
Die besten Parameter: Anzahl der Themen: 3, Alpha: 0.001, Beta: 0.001
Koheränz: -0.1646368766214199
Perplexität: 9.998538223566287

Bestes LDA TF-IDF Model:
Die besten Parameter: Anzahl der Themen: 3, Alpha: 0.5, Beta: 0.1
Der beste Parameter für TF-IDF: Smartire: bxu
Koheränz: -0.16257740286213865
Perplexität: 10.815747040

In [None]:
plot_coherence(coherence_scores_lda_bow, coherence_scores_lda_tfidf, coherence_scores_lsa_bow, coherence_scores_lsa_tfidf, no_topics)

In [None]:
plot_perplexity(perplexity_scores_lda_bow, perplexity_scores_lda_tfidf, perplexity_scores_lsa_bow, perplexity_scores_lsa_tfidf, no_topics)

# Visualisiere die Modelle

In [None]:
def coherence_per_topic(model, texts, dictionary, model_type):
    
    coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_npmi')
    coherence_per_topic = coherence_model.get_coherence_per_topic()

    # Wandele die Topic-Terme in Strings um
    topics_str = [', '.join([term for term, _ in model.show_topic(topic)]) for topic in range(model.num_topics)]

    # Erstelle ein DataFrame
    topic_score = pd.DataFrame(data=zip(topics_str, coherence_per_topic), columns=['Topic', 'Coherence'])
    topic_score = topic_score.set_index('Topic')

    # Erstelle die Heatmap
    fig, ax = plt.subplots(figsize=(2, 6)) #(10, 6)
    
    ax.set_title(f"Themenkohärenz für {str(model)}\n $C_v$")
    
    sns.heatmap(data=topic_score, annot=True, square=True,
                cmap='Reds', fmt='.2f',
                linecolor='black', ax=ax)
    
    plt.yticks(rotation=0)
    ax.set_xlabel('')
    ax.set_ylabel('Themen')
    
    plt.savefig(f'coherence_per_topic_{model_type}.png', bbox_inches='tight', dpi=150)
    
    plt.show()

In [None]:
coherence_per_topic(model_lsa_bow, data_ready, id2word, 'LSA BoW')

In [None]:
coherence_per_topic(model_lsa_tfidf, data_ready, id2word, 'LSA TF-IDF')

In [None]:
coherence_per_topic(model_lda_bow, data_ready, id2word, 'LDA BoW')

In [None]:
coherence_per_topic(model_lda_tfidf, data_ready, id2word, 'LDA TF-IDF')

In [None]:
def plot_topic_distribution(model, corpus, model_type):
    dominant_topics = []
    topic_percentages = []

    for idx, row in enumerate(corpus):
        result = model[row]
        
        # Unpack values from the result
        if model_type.startswith("LDA"):
            topic_tuples = result[0]
            topic_indices, topic_weights = zip(*topic_tuples)
        elif model_type.startswith("LSA"):
            topic_indices, topic_weights = zip(*result) if result else ([], [])

        # Filter out negative weights
        non_negative_weights = [(idx, weight) for idx, weight in enumerate(topic_weights) if weight >= 0]

        # Sort by weight and get the dominant topic
        if non_negative_weights:
            dominant_topic = sorted(non_negative_weights, key=lambda x: x[1], reverse=True)[0][0]
            dominant_topics.append((idx, dominant_topic))
            topic_percentages.append(non_negative_weights)

    # Distribution of Dominant Topics in Each Document
    df = pd.DataFrame(dominant_topics, columns=['Document_ID', 'Dominant_Topic'])
    
    dominant_topic = df.groupby('Dominant_Topic').size()
    df_dominant_topic = dominant_topic.to_frame(name='count').reset_index()

    # Total Topic Distribution by actual weight
    topic_weight = pd.DataFrame([dict(t) for t in topic_percentages])
    df_topic_weight = topic_weight.sum().to_frame(name='count').reset_index()

    # Top Keywords for each Topic
    topic_top_words = [(idx, topic) for idx, topics in model.show_topics(formatted=False) 
                                     for j, (topic, weight) in enumerate(topics) if j < 5]

    df_top_words = pd.DataFrame(topic_top_words, columns=['topic_id', 'words'])
    df_top_words = df_top_words.groupby('topic_id').agg(', \n'.join)
    df_top_words.reset_index(level=0, inplace=True)
    
    # Plot
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8), dpi=150, sharey=True)
    
    # Adjust the spacing between subplots
    plt.subplots_adjust(hspace=0.7)
    
    # Topic Distribution by Dominant Topics
    ax1.bar(x='Dominant_Topic', height='count', data=df_dominant_topic, width=.5, color='firebrick')
    ax1.set_xticks(range(df_dominant_topic.Dominant_Topic.unique().__len__()))
    
    # Add annotations above each bar
    for i, count in enumerate(df_dominant_topic['count']):
        ax1.text(i, count + 50, str(count), ha='center', va='bottom', fontsize=10, color='black')
    
    formatter = FuncFormatter(lambda x, pos: 'Thema ' + str(x) + '\n' + df_top_words.loc[df_top_words.topic_id == x, 'words'].values[0])
    ax1.xaxis.set_major_formatter(formatter)
    ax1.set_title(f'Anzahl der Dokumente nach vorherrschendem Thema für \n{model}', fontdict=dict(size=10))
    ax1.set_ylabel('Anzahl der Dokumente')
    
    # Set y-axis limit dynamically
    max_count = df_dominant_topic['count'].max()
    ax1.set_ylim(0, max_count + 500)

    # Topic Distribution by Topic Weights
    ax2.bar(x='index', height='count', data=df_topic_weight, width=.5, color='steelblue')
    ax2.set_xticks(range(df_topic_weight.index.unique().__len__()))
    
    # Add annotations above each bar
    for i, count in enumerate(df_topic_weight['count']):
        ax2.text(i, count + 50, str(round(count, 1)), ha='center', va='bottom', fontsize=10, color='black')
    
    ax2.xaxis.set_major_formatter(formatter)
    ax2.set_title(f'Anzahl der Dokumente nach Themengewichtung für \n{model}', fontdict=dict(size=10))
    
    plt.savefig(f'topic_dist_{model_type}.png', bbox_inches='tight', dpi=150)
    
    plt.show()

In [None]:
plot_topic_distribution(model_lsa_bow, corpus_lsa_bow, 'LSA BoW')

In [None]:
plot_topic_distribution(model_lsa_tfidf, corpus_lsa_tfidf, 'LSA TF-IDF')

In [None]:
plot_topic_distribution(model_lda_bow, corpus_lda_bow, 'LDA BoW')

In [None]:
plot_topic_distribution(model_lda_tfidf, corpus_lda_tfidf, 'LDA TF-IDF')

In [None]:
# for LDA
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(model_lda_tfidf, corpus_lda_tfidf, dictionary=model_lda_tfidf.id2word)
vis