In [8]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases, Phraser

In [9]:
# Load the pre-processed abstracts (file name suggests lemmatised text -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
abstracts = pd.read_pickle('data/EPMC/eye_in_new_keywords_ABSTRACTS_LEMMA.pkl')

# Work on a 2000-document random subset. The sample is seeded so that
# Restart & Run All selects the same documents (and hence the same corpus) every time.
abstracts = abstracts.sample(2000, random_state=42)

# Split each abstract string on whitespace into a list of tokens.
abstracts = abstracts.str.split()

# Keep the index of the sampled documents.
pmids = abstracts.index
display(abstracts.head())

pmid
26266437    [study, vivo, pattern, ocular, surface, epithe...
29249004    [leber, hereditary, optic, neuropathy, inherit...
26109843    [background, acute, hemorrhagic, conjunctiviti...
18591848    [year, old, woman, diagnosed, vogt, koyanagi, ...
24319681    [purpose, evaluate, ability, frequency, doubli...
Name: abstractText, dtype: object

In [10]:
# Fit a phrase model on the tokenised abstracts (min_count=50) and wrap it in a
# Phraser, then apply it to every document.
bigram = Phraser(
    Phrases(abstracts, min_count=50)
)
texts = [bigram[tokens] for tokens in abstracts]

In [11]:
# Build the token <-> id mapping from the phrase-merged documents.
dictionary = Dictionary(documents=texts)

# Prune the vocabulary using the document-frequency thresholds below.
dictionary.filter_extremes(no_above=0.2, no_below=50)

print('Number of unique tokens: %d' % (len(dictionary),))

Number of unique tokens: 686


In [12]:
# Convert every document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

print('Number of documents: %d' % (len(corpus),))

Number of documents: 2000


In [13]:
# Set parameters.
num_topics = 55

# Make an index-to-word dictionary.
# A single lookup is performed first, only to "load" dictionary.id2token
# before it is read on the next line.
temp = dictionary[0]
id2word = dictionary.id2token

# LDA training is stochastic; a fixed seed makes the fitted topics
# reproducible across kernel restarts.
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=num_topics,
                     random_state=42)

# Show the highest-weighted keywords per topic. print_topics() is called with
# its default arguments, so it may list fewer than the num_topics fitted topics.
lda_model.print_topics()

[(22,
  '0.026*"protein" + 0.023*"expression" + 0.014*"corneal" + 0.012*"increased" + 0.011*"level" + 0.008*"induced" + 0.006*"showed" + 0.006*"clinical" + 0.006*"activation" + 0.006*"beta"'),
 (30,
  '0.023*"protein" + 0.021*"visual" + 0.014*"change" + 0.011*"expression" + 0.011*"response" + 0.011*"level" + 0.009*"increased" + 0.008*"decreased" + 0.008*"retinal" + 0.007*"age"'),
 (2,
  '0.016*"subject" + 0.016*"retinal" + 0.014*"retina" + 0.012*"high" + 0.010*"mg" + 0.009*"injection" + 0.009*"day" + 0.007*"area" + 0.007*"response" + 0.007*"different"'),
 (34,
  '0.016*"retinal" + 0.011*"visual" + 0.010*"mouse" + 0.007*"size" + 0.007*"ocular" + 0.007*"retina" + 0.007*"contrast" + 0.007*"longer" + 0.006*"association" + 0.006*"following"'),
 (21,
  '0.025*"retinal" + 0.009*"gene" + 0.008*"different" + 0.008*"type" + 0.008*"increased" + 0.008*"expression" + 0.008*"laser" + 0.008*"function" + 0.008*"found" + 0.007*"retina"'),
 (15,
  '0.030*"group" + 0.013*"outcome" + 0.010*"rat" + 0.010*"

In [16]:
# Compute the coherence score of the fitted model using the u_mass measure
# (evaluated on the bag-of-words corpus).
coherence_model_lda = CoherenceModel(
    model=lda_model,
    dictionary=dictionary,
    corpus=corpus,
    coherence='u_mass',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  -2.403628552337365


In [None]:
def compute_coherence_values(dictionary, corpus, texts, n_topics_to_try, random_state=None):
    """
    Train one LDA model per candidate topic count and score each with c_v coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts (passed to the c_v coherence model)
    n_topics_to_try : Iterable of topic counts; one model is trained per value
    random_state : Optional seed forwarded to LdaModel. The default (None)
        leaves training unseeded, as before.

    Returns:
    -------
    model_list : List of LDA topic models, in the order of n_topics_to_try
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    # Both result lists must exist before the loop so that an empty
    # n_topics_to_try returns ([], []) instead of failing.
    model_list = []
    coherence_values = []
    for num_topics in n_topics_to_try:
        model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         random_state=random_state)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        # Keep the fitted model so the caller can retrieve the best one later.
        model_list.append(model)
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Candidate topic counts: 5, 10, ..., 50.
n_topics_to_try = np.arange(5, 55, 5)

# Fit and score one model per candidate topic count.
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus,
    texts=texts,
    n_topics_to_try=n_topics_to_try,
)

# Plot coherence against the number of topics and save the figure to disk.
plt.plot(n_topics_to_try, coherence_values)
plt.ylabel('Coherence score')
plt.xlabel('Num Topics')
plt.savefig('data/LDA_Gensim_Coherence.png')