## Topic Modelling Gensim

In [40]:
import gensim
import gensim.corpora as corpora
import pickle
import pandas as pd
from pprint import pprint
from gensim.models import CoherenceModel

In [4]:
# select tokens and subsample
tokenlist = pd.read_pickle("dataset_with_tokens.pkl")[["page", "tokens"]]
tokenlist = tokenlist.loc[tokenlist.page == "100% Noticias"]["tokens"].tolist()

In [23]:
# make bigram and trigram mod

# is that necessary?
bigram = gensim.models.Phrases(tokenlist, min_count = 5, threshold = 100)
trigram = gensim.models.Phrases(bigram[tokenlist], threshold = 100)

trigram_mod = gensim.models.phrases.Phraser(trigram)  
bigram_mod = gensim.models.phrases.Phraser(bigram)

tokenlist = [trigram_mod[bigram_mod[text]] for text in tokenlist]

In [31]:
# create dictionary (each word recorded once and connected to id)
id2word = corpora.Dictionary(tokenlist)

# this is not necessary, just rename texts variable in later cells
texts = tokenlist

# create corpus
corpus = [id2word.doc2bow(text) for text in texts]

In [49]:
# build LDA model

lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           eval_every = 1,
                                           chunksize = 100,
                                           passes = 10,
                                           alpha = "symmetric",
                                           per_word_topics = True,
                                           workers = 4)

In [50]:
# print word loadings on each topic
pprint(lda_model.print_topics())

[(0,
  '0.016*"nicaragua" + 0.016*"año" + 0.012*"millones_dólares" + 0.011*"país" + '
  '0.011*"millones" + 0.010*"economía" + 0.009*"ciento" + 0.007*"crisis" + '
  '0.006*"empresas" + 0.005*"crecimiento"'),
 (1,
  '0.017*"ortega" + 0.012*"policía" + 0.010*"daniel" + 0.010*"nicaragua" + '
  '0.010*"sandinista" + 0.008*"régimen" + 0.006*"libertad" + 0.006*"noticias" '
  '+ 0.005*"dictadura" + 0.005*"gobierno"'),
 (2,
  '0.023*"nicaragua" + 0.010*"país" + 0.007*"ortega" + 0.007*"nacional" + '
  '0.006*"presidente" + 0.006*"o" + 0.006*"unidos" + 0.004*"política" + '
  '0.004*"países" + 0.004*"gobierno"'),
 (3,
  '0.012*"precio" + 0.010*"córdobas" + 0.009*"precios" + 0.009*"mercado" + '
  '0.008*"producción" + 0.007*"dólares" + 0.007*"petróleo" + 0.007*"semana" + '
  '0.006*"productores" + 0.006*"productos"'),
 (4,
  '0.029*"ley" + 0.021*"o" + 0.013*"nacional" + 0.008*"sanciones" + '
  '0.007*"asamblea" + 0.007*"ortega" + 0.006*"empresa" + 0.006*"empresas" + '
  '0.005*"régimen" + 0.005*"u

In [51]:
# not sure what this saves
doc_lda = lda_model[corpus]

### Model Performance

In [52]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -10.050280717959899


In [53]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenlist, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4828455733542813


### Visualising Model

In [47]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [54]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


## Documents and Topics

In [55]:
# find dominant model in each doc

In [58]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=tokenlist):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=tokenlist)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

TypeError: '<' not supported between instances of 'int' and 'tuple'

In [None]:
# find most representative document for each topic

# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

In [None]:
# topic distribution across documents
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics