## Topic Modelling Gensim

In [1]:
import gensim
import gensim.corpora as corpora
import pickle
import pandas as pd
from pprint import pprint
from gensim.models import CoherenceModel
import csv

In [None]:
# select tokens and subsample
tokenlist = pd.read_pickle("dataset_with_tokens.pkl")[["page", "tokens"]]
tokenlist = tokenlist["tokens"].tolist()

In [None]:
# make bigram and trigram mod

# is that necessary?
bigram = gensim.models.Phrases(tokenlist, min_count = 5, threshold = 100)
trigram = gensim.models.Phrases(bigram[tokenlist], threshold = 100)

trigram_mod = gensim.models.phrases.Phraser(trigram)  
bigram_mod = gensim.models.phrases.Phraser(bigram)

tokenlist = [trigram_mod[bigram_mod[text]] for text in tokenlist]

In [None]:
tokens_trigrams = []
for doc in tokenlist:
    tokenstring = [" ".join([token for token in doc])]
    tokens_trigrams.append(tokenstring)

In [None]:
with open("tokens/tokens_trigrams.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerows(tokens_trigrams)

In [None]:
# create dictionary (each word recorded once and connected to id)
id2word = corpora.Dictionary(tokenlist)
id2word.save("gensim_models/id2word")

# create corpus
corpus = [id2word.doc2bow(text) for text in tokenlist]
import pickle
with open("gensim_models/corpus", "wb") as f:
    pickle.dump(corpus, f)


In [2]:
# load dictionary
id2word = gensim.corpora.dictionary.Dictionary.load("gensim_models/id2word")

# load corpus
class MyCorpus:
    def __iter__(self):
        for line in open("tokens/tokens_trigrams.csv"):
            # assume there's one document per line, tokens separated by whitespace
            yield id2word.doc2bow(line.lower().split())

corpus = MyCorpus()


In [3]:
# build LDA model

lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           eval_every = 1,
                                           chunksize = 100,
                                           passes = 10,
                                           alpha = "symmetric",
                                           per_word_topics = True,
                                           workers = 4)

In [7]:
lda_model.save("gensim_models/lda")

In [9]:
modell =  gensim.models.ldamodel.LdaModel.load("gensim_models/lda")

In [5]:
from gensim.test.utils import datapath

# Save model to disk.
temp_file = datapath("gensim_models/lda")
lda_model.save(temp_file)

FileNotFoundError: [Errno 2] No such file or directory: '/home/hennes/.local/lib/python3.8/site-packages/gensim/test/test_data/gensim_models/lda.state'

In [4]:
# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

<gensim.models.ldamulticore.LdaMulticore at 0x7fa370e9e8e0>

In [None]:
# print word loadings on each topic
pprint(lda_model.print_topics())

In [None]:
# not sure what this saves
doc_lda = lda_model[corpus]

### Model Performance

In [15]:
corrpus = (row for row in open("tokens/tokens_trigrams.csv"))

In [18]:
# Compute Perplexity
print('\nPerplexity: ', modell.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

TypeError: object of type 'MyCorpus' has no len()

In [None]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenlist, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Visualising Model

In [21]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

with open("gensim_models/corpus", "rb") as f:
    corpus = pickle.load(f)

In [22]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(modell, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


## Documents and Topics

In [None]:
# find dominant model in each doc

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=tokenlist):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=tokenlist)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# find most representative document for each topic

# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

In [None]:
# topic distribution across documents
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics