# LDA HPO

Hyperparameter optimization for the LDA topic models.

Trains one model per candidate number of topics and calculates its coherence and perplexity.

In [None]:
from gensim import corpora
from os.path import join
import src.constants as const
import pandas as pd
import seaborn as sb
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

# Activate seaborn's plotting theme.
sb.set()

# Load the journals DataFrame from the pickle referenced by the project constants.
df = pd.read_pickle(const.JOURNALS_DF)

# Keep only the rows that have a tokenized description.
has_description = df["dc:description:tokenized"].notna()
df_other = df[has_description]

In [None]:
# Tokenized descriptions of the rows that have one; these are the training texts.
texts = df_other["dc:description:tokenized"]
# Number of training documents.
n_docs = len(texts)


In [None]:
# Split every description string on whitespace to get one token list per document.
documents = texts.str.split().values

print("Creating dict")
# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(documents)

print("Creating corpus")
# The dictionary was just built from these very documents, so it must not be
# updated again while converting them: allow_update=True would count every
# document a second time and inflate the dictionary statistics (num_docs,
# document frequencies, ...). The bag-of-words output itself is unaffected.
corpus = [dictionary.doc2bow(doc) for doc in documents]


In [None]:
# Settings shared by every model in the sweep; only the number of topics varies.
lda_params = dict(
    corpus=corpus,
    id2word=dictionary,
    random_state=100,
    passes=10,
    chunksize=1000,
    batch=False,
    alpha='asymmetric',
    decay=0.5,
    offset=64,
    eta=None,
    eval_every=0,
    iterations=100,
    gamma_threshold=0.001,
    per_word_topics=True,
)

# n_topics -> (coherence, perplexity) for each trained model.
scores = {}

for n_topics in range(5, 100, 5):
    lda_model = LdaMulticore(num_topics=n_topics, **lda_params)

    # Value returned by log_perplexity on the training corpus, stored unchanged.
    perplexity = lda_model.log_perplexity(corpus)

    coherence_model = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        texts=documents,
        coherence="u_mass",
        dictionary=dictionary,
    )
    coherence = coherence_model.get_coherence()

    scores[n_topics] = (coherence, perplexity)
    print(f"{n_topics} -> Coherence: {coherence} Perplexity: {perplexity}")

# NOTE: after the loop, lda_model refers to the model trained last
# (the largest n_topics), not to the best-scoring one.

In [None]:


# Disabled: would save the current lda_model as "lda-gensim" under const.MODELS_DIR.
# lda_model.save(join(const.MODELS_DIR, "lda-gensim"))

In [None]:
# Show the most significant words of the topics of the current model
# (a negative num_topics asks for all topics).
lda_model.print_topics(num_topics=-1)

In [None]:
# Inspect the model output for the first ten documents. The model was created
# with per_word_topics=True, so every result is indexed as three parts.
separator = "------------------------------------------------------\n"

for doc_result in lda_model[corpus[:10]]:
    doc_topics = doc_result[0]
    print("Document Topics      : ", doc_topics)      # [(Topics, Perc Contrib)]

    word_topics = doc_result[1]
    print("Word id, Topics      : ", word_topics[:3])  # [(Word id, [Topics])]

    phi_values = doc_result[2]
    print("Phi Values (word id) : ", phi_values[:2])  # [(Word id, [(Topic, Phi Value)])]

    # Same information as above, with word ids resolved to the actual words.
    named_word_topics = [(dictionary[word_id], topic_ids) for word_id, topic_ids in word_topics[:2]]
    print("Word, Topics         : ", named_word_topics)   # [(Word, [Topics])]

    named_phi_values = [(dictionary[word_id], topic_phis) for word_id, topic_phis in phi_values[:2]]
    print("Phi Values (word)    : ", named_phi_values)  # [(Word, [(Topic, Phi Value)])]

    print(separator)

In [None]:
# Rebuild the corpus for *all* journals so that the predictions line up
# one-to-one with df.index.
texts = df["dc:description:tokenized"]

# Rows without a description stay NaN after .str.split(), and doc2bow cannot
# iterate over NaN. Represent them as empty documents instead of crashing;
# this keeps one corpus entry per DataFrame row.
documents = [tokens if isinstance(tokens, list) else [] for tokens in texts.str.split()]

# The dictionary must stay frozen here: the model was trained on its current
# vocabulary, and allow_update=True would assign new ids to unseen tokens that
# the trained model has no parameters for. Unknown tokens are simply skipped.
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [None]:
# Apply the model to every document and collect the results in a list so
# they can be indexed and iterated more than once.
predictions = list(lda_model[corpus])

In [None]:
# Element 0 of each prediction holds the (topic id, proportion) pairs of one
# document; split them into two parallel lists of lists.
topics = [[pair[0] for pair in doc_result[0]] for doc_result in predictions]
props = [[pair[1] for pair in doc_result[0]] for doc_result in predictions]

In [None]:
# Determine, per document, the topic with the highest proportion.
#
# * The winner is chosen by proportion rather than by position, because the
#   first entry of a document's topic list is not guaranteed to be its most
#   probable topic.
# * Documents without any topic get None (NaN in the resulting column) instead
#   of being dropped, so every list keeps exactly one entry per DataFrame row
#   and can be aligned with df.index.
top_topics = []
top_props = []

for doc_topics, doc_props in zip(topics, props):
    if doc_props:
        best = max(range(len(doc_props)), key=doc_props.__getitem__)
        top_topics.append(doc_topics[best])
        top_props.append(doc_props[best])
    else:
        top_topics.append(None)
        top_props.append(None)

df["lda:topics"] = pd.Series(topics, index=df.index)
df["lda:topics:top"] = pd.Series(top_topics, index=df.index)
df["lda:topics:props"] = pd.Series(props, index=df.index)
df["lda:topics:props:top"] = pd.Series(top_props, index=df.index)

# Persist the journals together with their topic assignments.
df.to_pickle(join(const.ARTIFACTS_DIR, "journals-with-topics.pkl"))

In [None]:
# Collect the 20 highest-weighted terms of every topic of the current model.
#
# * The loop covers lda_model.num_topics topics instead of a fixed 20, so each
#   topic id the model can assign has a row in the exported table (and a model
#   with fewer than 20 topics no longer fails).
# * Term ids are resolved through dictionary[...] rather than
#   dictionary.id2token, because id2token is only filled lazily and may be
#   empty or out of date when accessed directly.
data = [
    {"terms": [dictionary[term_id] for term_id, _ in lda_model.get_topic_terms(topic_id, topn=20)]}
    for topic_id in range(lda_model.num_topics)
]

# One row per topic; the row position is the topic id.
topid_df = pd.DataFrame(data)
topid_df.to_pickle(join(const.ARTIFACTS_DIR, "topics.pkl"))

# Mallet

In [None]:
# Disabled experiment: train an LDA model through gensim's Mallet wrapper.
# NOTE(review): gensim.models.wrappers was removed in gensim 4.0 — confirm the
# installed gensim version before re-enabling this cell.
# NOTE(review): mallet_path below is a machine-specific absolute path.
# from gensim.models.wrappers import LdaMallet
# mallet_path = "/home/ki/crypt/git/Mallet/bin/mallet"
#
# lda_mallet = LdaMallet(mallet_path,
#                        corpus=corpus,
#                        num_topics=10,
#                        alpha=50,
#                        id2word=dictionary,
#                        workers=10,
#                        prefix=None,
#                        optimize_interval=0,
#                        iterations=1000,
#                        topic_threshold=0.0,
#                        random_seed=0)

In [None]:
# Disabled: print the Mallet topics, convert the Mallet model to a gensim LDA
# model and show its output for the first ten documents.
# NOTE(review): the save call below stores lda_model, not the converted
# lda_model2 — TODO confirm which model was meant to be saved.
# NOTE(review): the bare name `gensim` is not imported in the cells shown here.
# lda_mallet.print_topics(-1)
#
# lda_model2 =  gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)
# lda_model.save(join(const.MODELS_DIR, "lda-gensim-mallet"))
# for c in lda_model2[corpus[:10]]:
#     print(c)
#