# 2. Topic Modeling - Latent Dirichlet Allocation (LDA) + time dimension

In [1]:
# format code
# pip install nb_black
%load_ext nb_black

<IPython.core.display.Javascript object>

In [38]:
import pickle
import numpy as np
import pandas as pd
import re
import scipy.sparse
import gensim
from gensim import matutils
from gensim import models
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

<IPython.core.display.Javascript object>

### Condition: keep comments with at least 10 words AND at least 7 unique words

In [3]:
# Base German stop words shipped with NLTK.
stop_words = set(stopwords.words("german"))

# Additional corpus-specific filler words that are not in the NLTK list.
# Each word is listed exactly once (the earlier duplicates of "erste" and
# "gründen" were removed; the resulting `sw` set is unchanged).
stop_words_add = [
    "mal",
    "mehr",
    "ja",
    "schon",
    "gibt",
    "geht",
    "hast",
    "einfach",
    "ganz",
    "macht",
    "immer",
    "tun",
    "viele",
    "wer",
    "sagen",
    "wäre",
    "genau",
    "dafür",
    "natürlich",
    "seit",
    "wurde",
    "eigentlich",
    "kommt",
    "gesagt",
    "sagt",
    "nie",
    "sehen",
    "deren",
    "versuchen",
    "empfehlen",
    "müssen",
    "kurz",
    "wenig",
    "erste",
    "klare",
    "gar",
    "grad",
    "wohl",
    "oft",
    "ha",
    "schaffen",
    "daher",
    "schreibt",
    "ständig",
    "völlig",
    "verdient",
    "worden",
    "solange",
    "könnt",
    "mann",
    "zeigt",
    "später",
    "iwelche",
    "wen",
    "eigenem",
    "gründen",
    "ups",
    "irgendjemand",
    "wuerde",
]

# Combined stop-word set used by the vectorizer further down.
sw = stop_words | set(stop_words_add)

<IPython.core.display.Javascript object>

In [4]:
# Month labels "2019-05" ... "2021-08" (28 consecutive months), generated
# rather than typed out by hand.
corpora_names = [
    f"{year}-{month:02d}"
    for year in (2019, 2020, 2021)
    for month in range(1, 13)
    if (2019, 5) <= (year, month) <= (2021, 8)
]

# Number of documents per month, in the same order as `corpora_names`.
corpora_docs = [
    131917,  # 2019-05
    14823,  # 2019-06
    2644,  # 2019-07
    2395,  # 2019-08
    1875,  # 2019-09
    1167,  # 2019-10
    1290,  # 2019-11
    1370,  # 2019-12
    1064,  # 2020-01
    1136,  # 2020-02
    1039,  # 2020-03
    1567,  # 2020-04
    1392,  # 2020-05
    1859,  # 2020-06
    1705,  # 2020-07
    2102,  # 2020-08
    1843,  # 2020-09
    1976,  # 2020-10
    2142,  # 2020-11
    1510,  # 2020-12
    3090,  # 2021-01
    8059,  # 2021-02
    9025,  # 2021-03
    11319,  # 2021-04
    6833,  # 2021-05
    3285,  # 2021-06
    9394,  # 2021-07
    1727,  # 2021-08
]

# (month label, document count) pairs consumed by the cells below.
corpora_info = list(zip(corpora_names, corpora_docs))
print(corpora_info)

[('2019-05', 131917), ('2019-06', 14823), ('2019-07', 2644), ('2019-08', 2395), ('2019-09', 1875), ('2019-10', 1167), ('2019-11', 1290), ('2019-12', 1370), ('2020-01', 1064), ('2020-02', 1136), ('2020-03', 1039), ('2020-04', 1567), ('2020-05', 1392), ('2020-06', 1859), ('2020-07', 1705), ('2020-08', 2102), ('2020-09', 1843), ('2020-10', 1976), ('2020-11', 2142), ('2020-12', 1510), ('2021-01', 3090), ('2021-02', 8059), ('2021-03', 9025), ('2021-04', 11319), ('2021-05', 6833), ('2021-06', 3285), ('2021-07', 9394), ('2021-08', 1727)]


<IPython.core.display.Javascript object>

In [5]:
# Load the pickled comment corpus; the displayed frame has a single `comment`
# text column (presumably already cleaned, going by the file name).
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that come from a trusted source.
corpus = pd.read_pickle(filepath_or_buffer="data/pickle/corpus_clean.pkl")
corpus

Unnamed: 0,comment
0,feier sowas endlich jemand alten cdu stimme ni...
1,achtung faschisten irma lo beleidigt user ganz...
2,zeigt jahre jahre
3,doppelmoralist rezo eigenes video dezember pri...
4,richtig
...,...
229571,legalisierung mehr fahrverbote läuft
229572,j pepe leg fakten falsche informationen rausha...
229573,bundestag stehen abgeordnete mandat halten unt...
229574,liebe user lasst daran hindern frei meinung ek...


<IPython.core.display.Javascript object>

In [6]:
# Slice the flat corpus into one DataFrame per month, keyed by month label.
idx = 0  # positional offset in `corpus` of the current month's first row
corpus_d = {}

for month, n_docs in corpora_info:
    first_row, end_row = idx, idx + n_docs

    corpus_d[month] = {
        "len": n_docs,
        "corpora": pd.DataFrame(data=corpus.iloc[first_row:end_row]),
    }
    # A slice running past the end of `corpus` would silently come back short,
    # so verify that every month received exactly the expected number of rows.
    assert corpus_d[month]["len"] == corpus_d[month]["corpora"].shape[0]

    # NOTE(review): one extra row is skipped after every month. The displayed
    # corpus has 229575 rows while the counts sum to 229548, i.e. exactly one
    # surplus row per month boundary -- presumably a separator row; confirm
    # against the notebook that produced corpus_clean.pkl.
    idx += n_docs + 1

<IPython.core.display.Javascript object>

In [7]:
# Drop comments that are too short to carry a topic signal: a comment is kept
# only if it has at least MIN_WORDS whitespace-separated tokens AND at least
# MIN_UNIQUE_WORDS distinct tokens.
MIN_WORDS = 10
MIN_UNIQUE_WORDS = 7
total_docs = sum(corpora_docs)  # loop-invariant, computed once
progress = 0
print("")

for key, value in corpus_d.items():
    docs = value["corpora"]

    # Tokenise every comment once (plain str.split, i.e. on whitespace) and
    # derive both criteria from the same token lists, instead of doing two
    # row-wise `.loc` lookups and two splits per document.
    tokens = docs["comment"].map(str.split)
    has_enough_words = tokens.map(len) >= MIN_WORDS
    has_enough_unique = (
        tokens.map(lambda words: len(set(words))) >= MIN_UNIQUE_WORDS
    )

    # Keep the qualifying rows and renumber them 0..n-1.
    value["corpora"] = docs[has_enough_words & has_enough_unique].reset_index(
        drop=True
    )

    # Progress is reported once per month rather than every 100 documents.
    progress += len(docs)
    print(f"progress: {progress}/{total_docs}", end="\r")

print("")


progress: 229500/229548


<IPython.core.display.Javascript object>

In [8]:
# (month label, number of documents that survived filtering) per month.
corpora_filtered_info = [
    (month, bucket["corpora"].shape[0]) for month, bucket in corpus_d.items()
]
print(corpora_filtered_info)

[('2019-05', 70576), ('2019-06', 7700), ('2019-07', 1249), ('2019-08', 1113), ('2019-09', 861), ('2019-10', 552), ('2019-11', 629), ('2019-12', 653), ('2020-01', 509), ('2020-02', 530), ('2020-03', 519), ('2020-04', 748), ('2020-05', 656), ('2020-06', 869), ('2020-07', 799), ('2020-08', 949), ('2020-09', 860), ('2020-10', 937), ('2020-11', 991), ('2020-12', 727), ('2021-01', 1389), ('2021-02', 3773), ('2021-03', 4233), ('2021-04', 5405), ('2021-05', 3199), ('2021-06', 1531), ('2021-07', 4463), ('2021-08', 809)]


<IPython.core.display.Javascript object>

In [9]:
# Share of documents removed by the filter per month, as a fraction in [0, 1]
# rounded to two decimals (1 - kept / original).
corpora_filtered_percentage = [
    (month, round(1 - (kept / original), 2))
    for (month, kept), (_month, original) in zip(corpora_filtered_info, corpora_info)
]
corpora_filtered_percentage

[('2019-05', 0.46),
 ('2019-06', 0.48),
 ('2019-07', 0.53),
 ('2019-08', 0.54),
 ('2019-09', 0.54),
 ('2019-10', 0.53),
 ('2019-11', 0.51),
 ('2019-12', 0.52),
 ('2020-01', 0.52),
 ('2020-02', 0.53),
 ('2020-03', 0.5),
 ('2020-04', 0.52),
 ('2020-05', 0.53),
 ('2020-06', 0.53),
 ('2020-07', 0.53),
 ('2020-08', 0.55),
 ('2020-09', 0.53),
 ('2020-10', 0.53),
 ('2020-11', 0.54),
 ('2020-12', 0.52),
 ('2021-01', 0.55),
 ('2021-02', 0.53),
 ('2021-03', 0.53),
 ('2021-04', 0.52),
 ('2021-05', 0.53),
 ('2021-06', 0.53),
 ('2021-07', 0.52),
 ('2021-08', 0.53)]

<IPython.core.display.Javascript object>

In [13]:
# Summary statistics of the per-month document counts after filtering.
corpora_filtered_documents_f = [n_docs for _month, n_docs in corpora_filtered_info]

corpora_filtered_documents_min = min(corpora_filtered_documents_f)
print(f"min docs: {corpora_filtered_documents_min}")

corpora_filtered_documents_max = max(corpora_filtered_documents_f)
print(f"max docs: {corpora_filtered_documents_max}")

corpora_filtered_documents_mean = np.mean(corpora_filtered_documents_f)
print(f"mean docs: {corpora_filtered_documents_mean}")

corpora_filtered_documents_median = np.median(corpora_filtered_documents_f)
print(f"median docs: {corpora_filtered_documents_median}")

min docs: 509
max docs: 70576
mean docs: 4186.75
median docs: 903.0


<IPython.core.display.Javascript object>

In [17]:
# Fit one TF-IDF vectorizer per month and store, per month:
#   "tfidf_vectorizer": the fitted vectorizer
#   "tfidf_wm":         the sparse document x term weight matrix
#   "tfidf":            the same weights as a dense term x document DataFrame
for key, value in corpus_d.items():
    # Pass the stop words as a list: recent scikit-learn releases validate
    # this parameter and reject a set. Sorting only makes the order stable.
    vectorizer = TfidfVectorizer(stop_words=sorted(sw))
    value["tfidf_vectorizer"] = vectorizer
    value["tfidf_wm"] = vectorizer.fit_transform(value["corpora"].comment)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installations that lack it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Rows are documents (same index as the filtered corpus), columns are
    # terms; the frame is then transposed to term x document.
    value["tfidf"] = pd.DataFrame(
        value["tfidf_wm"].toarray(),
        index=value["corpora"].index,
        columns=feature_names,
    ).transpose()

<IPython.core.display.Javascript object>

In [19]:
# Convert each month's term x document matrix into the inputs gensim expects.
for key, value in corpus_d.items():
    # Note: despite the key name, "sparse_counts" holds the TF-IDF weights
    # built from value["tfidf"], not raw term counts.
    term_doc_sparse = scipy.sparse.csr_matrix(value["tfidf"])
    value["sparse_counts"] = term_doc_sparse
    value["corpus_sparse"] = matutils.Sparse2Corpus(term_doc_sparse)

    # Invert the vectorizer's term -> column-id vocabulary into id -> term.
    value["id2word"] = {
        term_id: term
        for term, term_id in value["tfidf_vectorizer"].vocabulary_.items()
    }

<IPython.core.display.Javascript object>

In [20]:
# Train one LDA model per month with identical hyper-parameters; the fixed
# random_state is the same for every model.
lda_params = dict(num_topics=10, random_state=42, passes=50, iterations=100)

for key, value in corpus_d.items():
    value["lda"] = models.LdaModel(
        corpus=value["corpus_sparse"], id2word=value["id2word"], **lda_params
    )

<IPython.core.display.Javascript object>

In [73]:
# Number of top words requested per topic from print_topics(); also used as
# the divisor when averaging the word weights of a topic below.
nr_words = 10


def get_topics_format(topics):
    """Parse formatted topic strings into lists of (word, weight) pairs.

    Parameters
    ----------
    topics : iterable
        Items whose element at index 1 is a string of the form
        '0.038*"zitat" + 0.033*"grünen" + ...' (the shape printed by the
        notebook's print_topics() call).

    Returns
    -------
    list of list of (str, float)
        One list per topic, in input order, holding (word, weight) pairs in
        the order they appear in the string. Words are returned without the
        surrounding quotes and without surrounding whitespace.

    Raises
    ------
    ValueError
        If a non-empty term has no '*' separator or a non-numeric weight.
    """
    topics_words = []
    for topic in topics:
        pairs = []
        for term in topic[1].split("+"):
            if not term.strip():
                # Tolerate an empty topic string or a stray separator.
                continue
            weight, word = term.split("*", 1)
            # Splitting on "+" leaves the blanks around the separator attached
            # to each term; strip them so words do not carry a trailing space.
            pairs.append((word.replace('"', "").strip(), float(weight)))
        topics_words.append(pairs)
    return topics_words


# Print, for every month, each topic's top words together with the mean of
# their (rounded) weights as reported by print_topics().
for key, value in corpus_d.items():
    print(f"##### {key} #####\n")
    formatted_topics = get_topics_format(
        value["lda"].print_topics(num_words=nr_words)
    )
    for i, topic_words in enumerate(formatted_topics):
        weight_total = sum(weight for _word, weight in topic_words)
        # The divisor is the requested word count, not the number of words
        # actually returned for the topic.
        mean_weights = round(weight_total / nr_words, 4)
        print(f"##### Topic: {i} MeanWeights: {mean_weights}")
        print(topic_words, "\n")
    print("\n\n")

##### 2019-05 #####

##### Topic: 0 MeanWeights: 0.0235
[('zitat ', 0.038), ('grünen ', 0.033), ('sebastian ', 0.021), ('striegel ', 0.021), ('zuwanderung ', 0.021), ('volkstod ', 0.021), ('terror ', 0.02), ('brandstifter ', 0.02), ('geistige ', 0.02), ('linker', 0.02)] 

##### Topic: 1 MeanWeights: 0.0099
[('wählt ', 0.025), ('afd ', 0.021), ('starkes ', 0.009), ('schlechter ', 0.007), ('desto ', 0.007), ('angeschaut ', 0.006), ('gauland ', 0.006), ('je ', 0.006), ('vorkommen ', 0.006), ('besprochen', 0.006)] 

##### Topic: 2 MeanWeights: 0.0111
[('reich ', 0.02), ('halt ', 0.011), ('gehört ', 0.01), ('möchtest ', 0.01), ('vadda ', 0.01), ('kommunismus ', 0.01), ('armer ', 0.01), ('staat ', 0.01), ('neidisch ', 0.01), ('mudda', 0.01)] 

##### Topic: 3 MeanWeights: 0.0073
[('video ', 0.015), ('cdu ', 0.01), ('danke ', 0.007), ('wählen ', 0.007), ('rezo ', 0.007), ('gut ', 0.006), ('politik ', 0.006), ('menschen ', 0.005), ('leute ', 0.005), ('bitte', 0.005)] 

##### Topic: 4 MeanWeight

<IPython.core.display.Javascript object>