# LDA Topic Modelling
Code from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#9createbigramandtrigrammodels

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): google.colab is presumably only importable inside Colab — confirm
# before running this cell in a local kernel.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Notwendige Bibliotheken installieren und laden

In [2]:
!pip install gensim pyldavis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyldavis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.15.0 (from spacy)
  Downloading numpy-1.25.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas>=2.0.0 (from pyldavis)
  Downloading pandas-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyldavis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, numpy, pandas, pyldavis
  Attempting uninstall: numpy
    Found 

In [2]:
!python -m spacy download de

2023-06-18 16:48:19.492645: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'de' are deprecated. Please use the
full pipeline package name 'de_core_news_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.5.0/de_core_news_sm-3.5.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pprint import pprint

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

## Rohdaten einlesen

In [4]:
# Read the full raw dataset from the mounted Drive folder.
raw_parquet_path = '/content/drive/MyDrive/Masterthesis/2023-06-16_gesamtdaten_bis_2022.parquet'
df = pd.read_parquet(raw_parquet_path)

  and should_run_async(code)


## Bearbeitung der Daten

In [13]:
# Keep only the rows of the four municipalities under study and extract their
# texts as a plain Python list.
selected_kommunen = ['Frankenthal', 'Lampertheim', 'Schwetzingen', 'Kreis Bergstraße']
kommune_mask = df['kommune'].isin(selected_kommunen)
df_filtered = df[kommune_mask]
data = df_filtered['text'].values.tolist()

  and should_run_async(code)


In [6]:
import nltk
from nltk.corpus import stopwords as nltk_stopwords

nltk.download('stopwords')
# Read the list through a dedicated alias for the corpus reader: rebinding
# `stopwords` from the reader itself would make this cell fail with an
# AttributeError on a second run. The list is still bound to `stopwords`,
# the name that `remove_stopwords` reads.
stopwords = nltk_stopwords.words('german')

  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
def sent_to_words(sentences):
    """Lazily tokenize every document with gensim's ``simple_preprocess``.

    Each item is converted with ``str`` first; one token list is yielded per
    item, in input order. ``deacc=False`` leaves accent marks on the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=False)
        for sentence in sentences
    )

# Materialize the token lists for all documents (sent_to_words is a generator).
data_words = list(sent_to_words(data))

# Show the tokens of the first document as a sanity check.
print(data_words[:1])

  and should_run_async(code)


[['wieder', 'bedankt', 'sich', 'bei', 'den', 'ausgeschiedenen', 'ratsmitgliedern', 'siehe', 'beiliegender', 'redeentwurf', 'und', 'überreicht', 'ihnen', 'als', 'zeichen', 'der', 'anerkennung', 'und', 'des', 'dankes', 'einen', 'bildband', 'kernland', 'rheinland', 'pfalz']]


In [15]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), batch_size=50):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each token list is joined with single spaces
        before it is handed to spaCy.
    allowed_postags : collection of str
        Coarse POS tags (``token.pos_``) whose tokens are kept.
    batch_size : int
        Number of documents spaCy processes per batch in ``nlp.pipe``.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    # Load the German pipeline without the parser and NER components.
    # Install it with: python -m spacy download de_core_news_sm
    nlp = spacy.load('de_core_news_sm', disable=['parser', 'ner'])

    # Feed the documents lazily through nlp.pipe so spaCy can batch them
    # instead of being invoked once per document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs, batch_size=batch_size)
    ]

  and should_run_async(code)


In [16]:
# Phrase detection: learn word pairs on the raw tokens first, then learn pairs
# again on the bigram-merged tokens so that longer expressions can form.
phrase_threshold = 100  # higher threshold, fewer phrases
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=phrase_threshold)
bigram_merged_words = bigram[data_words]
trigram = gensim.models.Phrases(bigram_merged_words, threshold=phrase_threshold)

# Wrap both models in Phraser objects; these are what gets applied to token lists.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrase_model) for phrase_model in (bigram, trigram)
)

# Example: the first document after bigram and trigram merging.
first_doc_bigrams = bigram_mod[data_words[0]]
print(trigram_mod[first_doc_bigrams])

  and should_run_async(code)


['wieder', 'bedankt', 'sich', 'bei', 'den', 'ausgeschiedenen', 'ratsmitgliedern', 'siehe', 'beiliegender', 'redeentwurf', 'und', 'überreicht', 'ihnen', 'als', 'zeichen', 'der', 'anerkennung', 'und', 'des', 'dankes', 'einen', 'bildband', 'kernland', 'rheinland_pfalz']


In [17]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return the documents with every stop word removed.

    Each document is converted with ``str`` and tokenized by
    ``simple_preprocess``; tokens contained in the global ``stopwords``
    collection are dropped. One token list is returned per input document.
    """
    # Build the lookup set once: testing membership against the list itself
    # would rescan the whole list for every single token.
    stop_set = set(stopwords)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the global bigram phraser to every document and collect the results."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to every document."""
    def _merge(tokens):
        return trigram_mod[bigram_mod[tokens]]

    return list(map(_merge, texts))

  and should_run_async(code)


In [18]:
# Preprocessing pipeline; every step consumes the output of the previous one.

# Remove stop words from the tokenized documents.
data_words_nostops = remove_stopwords(data_words)

# Merge detected phrases. make_trigrams applies the bigram phraser itself before
# the trigram phraser, so the bigram step runs a second time on the merged tokens.
data_words_bigrams = make_bigrams(data_words_nostops)
data_words_bigrams_trigrams = make_trigrams(data_words_bigrams)

# Lemmatize with the default POS filter of `lemmatization` (NOUN, ADJ, VERB, ADV).
data_lemmatized = lemmatization(data_words_bigrams_trigrams)

# Show the lemmas of the first document as a sanity check.
print(data_lemmatized[:1])

  and should_run_async(code)


[['bedanken', 'ausgeschieden', 'Ratsmitglieder', 'sehen', 'beiliegend', 'Redeentwurf', 'überreicht', 'Zeichen', 'Anerkennung', 'Bildband']]


Error:
* NER braucht sehr viel RAM, und nur ein Teil der Daten kann verarbeitet werden

In [19]:
# TF-IDF
from gensim.models import TfidfModel

# Map every lemma to an integer id and encode each document as a bag of words.
id2word = corpora.Dictionary(data_lemmatized)

corpus = [id2word.doc2bow(text) for text in data_lemmatized]
print(corpus[0][:20])

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03  # terms whose TF-IDF weight is below this value are dropped
words = []  # every dropped term, one entry per document it was dropped from
words_missing_in_tfidf = []

for i, bow in enumerate(corpus):
    # Transform the document once and reuse the result.
    weights = tfidf[bow]
    tfidf_ids = {term_id for term_id, _ in weights}

    low_value_words = [term_id for term_id, value in weights if value < low_value]
    # Terms with a TF-IDF score of 0 are missing from the transformed document.
    # They are determined before `drops` is built so that the log in `words`
    # refers to the current document rather than to the previous one.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Replace the document by its filtered bag of words.
    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]

  and should_run_async(code)


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


In [20]:
# Decode the first bag of words back into (term, count) pairs for readability.
[[(id2word[term_id], count) for term_id, count in doc_bow] for doc_bow in corpus[:1]]

  and should_run_async(code)


[[('Anerkennung', 1),
  ('Bildband', 1),
  ('Ratsmitglieder', 1),
  ('Redeentwurf', 1),
  ('Zeichen', 1),
  ('ausgeschieden', 1),
  ('bedanken', 1),
  ('beiliegend', 1),
  ('sehen', 1),
  ('überreicht', 1)]]

## LDA Modelle erstellen

### TF-IDF

#### 500 Themen

In [21]:
# Train the 500-topic LDA model on the current `corpus` / `id2word` pair.
lda_model_tfidf_500 = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=500,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=2,
    alpha='auto',
    per_word_topics=True,
)

  and should_run_async(code)


In [None]:
#saving model to disk.
from gensim.test.utils import datapath  # kept so any cell that still calls datapath() keeps working

# Save straight to the Drive path. datapath() is gensim's helper for locating its
# bundled test data; it only returned this path unchanged because it is absolute.
temp_file = "/content/drive/MyDrive/Masterthesis/lda_model_tfidf_500"

lda_model_tfidf_500.save(temp_file)

  and should_run_async(code)


#### 800 Themen

In [None]:
# Train the 800-topic LDA model on the current `corpus` / `id2word` pair.
lda_model_tfidf_800 = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=800,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=2,
    alpha='auto',
    per_word_topics=True,
)

  and should_run_async(code)


In [None]:
#saving model to disk.

# Save straight to the Drive path; gensim's datapath() test-data helper is not
# needed for an absolute output path, so this cell no longer depends on its import.
temp_file = "/content/drive/MyDrive/Masterthesis/lda_model_tfidf_800"

lda_model_tfidf_800.save(temp_file)

  and should_run_async(code)


## Modelle mit TDF 

In [None]:
# Rebuild the dictionary and the bag-of-words corpus from the lemmatized texts
# (plain term counts per document).
texts = data_lemmatized
id2word = corpora.Dictionary(texts)

# One list of (term id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document.
print(corpus[:1])

  and should_run_async(code)


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]]


#### 500 Themen

In [None]:
# Train the 500-topic LDA model on the current `corpus` / `id2word` pair.
lda_model_tdf_500 = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=500,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=2,
    alpha='auto',
    per_word_topics=True,
)

  and should_run_async(code)


In [None]:
#saving model to disk.

# Save straight to the Drive path; gensim's datapath() test-data helper is not
# needed for an absolute output path, so this cell no longer depends on its import.
temp_file = "/content/drive/MyDrive/Masterthesis/lda_model_tdf_500"

lda_model_tdf_500.save(temp_file)

  and should_run_async(code)


#### 800 Themen

In [None]:
# Train the 800-topic LDA model on the current `corpus` / `id2word` pair.
lda_model_tdf_800 = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=800,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=2,
    alpha='auto',
    per_word_topics=True,
)

  and should_run_async(code)


In [None]:
#saving model to disk.

# Save straight to the Drive path; gensim's datapath() test-data helper is not
# needed for an absolute output path, so this cell no longer depends on its import.
temp_file = "/content/drive/MyDrive/Masterthesis/lda_model_tdf_800"

lda_model_tdf_800.save(temp_file)

## Ergebnisse

In [2]:
# Reload the raw data and keep the texts of the four municipalities under study.
df = pd.read_parquet("data/raw/2023-06-23_gesamtdaten_rhein_neckar.parquet")
kommune_mask = df['kommune'].isin(['Frankenthal', 'Lampertheim', 'Schwetzingen', 'Kreis Bergstraße'])
df_filtered = df[kommune_mask]
data = df_filtered['text'].values

### TF-IDF

#### 500 Themen

In [3]:
# Restore the 500-topic TF-IDF model together with its dictionary and corpus.
from gensim import models

tfidf_500_model_path = "data/lda_models/lda_model_tfidf_500"
tfidf_500_dictionary_path = 'data/lda_models/lda_model_tfidf_500.id2word'
tfidf_corpus_path = "data/lda_models/tdidf_corpus"

lda_tfidf_500 = models.ldamodel.LdaModel.load(tfidf_500_model_path)
id2word_tfidf_500 = corpora.Dictionary.load(tfidf_500_dictionary_path)
corpus_tfidf_500 = corpora.MmCorpus(tfidf_corpus_path)

In [4]:
# Enable pyLDAvis notebook output, then prepare and display the visualisation
# of the 500-topic TF-IDF model.
pyLDAvis.enable_notebook()
vis_tfidf_500 = pyLDAvis.gensim.prepare(lda_tfidf_500, corpus_tfidf_500, id2word_tfidf_500, mds='mmds')
vis_tfidf_500



Notizen:
* 90 beinhaltet Radweg
* Verkehrsthemen sind schwer zu identifizieren
* 

#### 800 Themen

In [4]:
# Restore the 800-topic TF-IDF model from disk.
# Relies on `models` (from gensim) having been imported in an earlier cell.
lda_tfidf_800 = models.ldamodel.LdaModel.load("data/lda_models/lda_model_tfidf_800")

In [None]:
# Visualise the 800-topic model (lda_tfidf_800, not lda_tfidf_500). Dictionary and
# corpus are loaded from disk here so the cell does not rely on `corpus` / `id2word`
# variables left over from the training section.
id2word_tfidf_800 = corpora.Dictionary.load('data/lda_models/lda_model_tfidf_800.id2word')
corpus_tfidf_800 = corpora.MmCorpus("data/lda_models/tdidf_corpus")
vis_tfidf_800 = pyLDAvis.gensim.prepare(lda_tfidf_800, corpus_tfidf_800, id2word_tfidf_800, mds='mmds')
vis_tfidf_800

### TDF

#### 500 Themen

In [3]:
# Restore the 500-topic "tdf" model together with its dictionary and a corpus.
from gensim import  models
lda_tdf_500 = models.ldamodel.LdaModel.load("data/lda_models/lda_model_tdf_500")
id2word_tdf_500 = corpora.Dictionary.load('data/lda_models/lda_model_tdf_500.id2word')
# NOTE(review): this reads the same "tdidf_corpus" file that the TF-IDF section
# loads — confirm whether a separate corpus file was intended for this model.
corpus_tdf_500 = corpora.MmCorpus("data/lda_models/tdidf_corpus")

In [4]:
# Prepare and display the pyLDAvis view of the 500-topic "tdf" model.
vis_tdf_500 = pyLDAvis.gensim.prepare(lda_tdf_500, corpus_tdf_500, id2word_tdf_500, mds='mmds')
vis_tdf_500



PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
14     0.544707 -0.284241       1        1  5.590264
163    0.344105 -0.493156       2        1  5.338511
484    0.417559 -0.430189       3        1  3.570035
410    0.363770 -0.482489       4        1  3.527847
85     0.136864 -0.600781       5        1  2.826116
...         ...       ...     ...      ...       ...
320   -0.081629  0.070500     496        1  0.000735
93    -0.081629  0.070500     497        1  0.000735
324   -0.081629  0.070500     498        1  0.000735
92    -0.081629  0.070500     499        1  0.000735
499   -0.081629  0.070500     500        1  0.000735

[500 rows x 5 columns], topic_info=             Term           Freq          Total  Category  logprob  loglift
19          Stadt  114532.000000  114532.000000   Default  30.0000  30.0000
80          Kreis  122985.000000  122985.000000   Default  29.0000  29.0000
192         Seit

#### 800 Themen

In [6]:
# Load the 800-topic "tdf" model; the file name follows the naming of the other
# saved models (lda_model_<variant>_<topics>).
lda_tdf_800 = models.ldamodel.LdaModel.load("data/lda_models/lda_model_tdf_800")

FileNotFoundError: [Errno 2] No such file or directory: 'data/lda_models/lda_model_tdf_800'

In [None]:
# Pretty-print the keyword weights returned by print_topics(), then apply the
# model to the whole corpus.
# NOTE(review): `lda_model` is not assigned anywhere in the visible notebook —
# presumably it should be one of the loaded models; TODO confirm which one.
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(1,
  '0.000*"begutachen" + 0.000*"stammdaen" + 0.000*"örtlichen_erhebungen" + '
  '0.000*"örtlichen_trägers" + 0.000*"Gewährleisteung" + 0.000*"Kontr" + '
  '0.000*"Nachbearbeitung" + 0.000*"ßerung" + 0.000*"Beanstandete" + '
  '0.000*"Kraft_gesetzt"'),
 (24,
  '0.000*"begutachen" + 0.000*"stammdaen" + 0.000*"örtlichen_erhebungen" + '
  '0.000*"örtlichen_trägers" + 0.000*"Gewährleisteung" + 0.000*"Kontr" + '
  '0.000*"Nachbearbeitung" + 0.000*"ßerung" + 0.000*"Beanstandete" + '
  '0.000*"Kraft_gesetzt"'),
 (10,
  '0.000*"dr_brud" + 0.000*"straßendat" + 0.000*"Beanstandete" + '
  '0.000*"Gewährleisteung" + 0.000*"Kontr" + 0.000*"Nachbearbeitung" + '
  '0.000*"begutachen" + 0.000*"stammdaen" + 0.000*"örtlichen_erhebungen" + '
  '0.000*"Pflichtgemäß"'),
 (20,
  '0.007*"bernd_leidig" + 0.003*"rm_sonja_schönherr" + '
  '0.002*"rm_christian_baldauf" + 0.001*"dr_günther_serfas" + '
  '0.000*"rm_daniel_kühner" + 0.000*"rm_dr_rainer" + '
  '0.000*"mitglieder_stellvertretung" + 0.000*"Ingrid_h

## Compute Model Perplexity and Coherence Score

In [None]:
# log_perplexity() returns the per-word likelihood bound, not the perplexity itself.
# A bound closer to zero is better; the perplexity is 2 ** (-bound), for which
# lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))

# Compute the c_v coherence score of the model on the lemmatized texts.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

## Visualize Topic Model

In [None]:
# Enable pyLDAvis notebook output, then prepare and display the visualisation.
# NOTE(review): `lda_model` is not assigned anywhere in the visible notebook —
# presumably one of the loaded models is meant; TODO confirm.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
