In [3]:
%%bash

# Install the hierarchical_lda_ncrp package from the TestPyPI index.
# NOTE(review): --index-url replaces pip's default index, so dependencies are
# presumably looked up on TestPyPI only -- confirm that is intended.
pip install --index-url https://test.pypi.org/simple/ hierarchical_lda_ncrp



You are using pip version 9.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [1]:
import hierarchical_lda_ncrp
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from hierarchical_lda_ncrp import hLDA

In [3]:
# English stop words to drop from every document.
stop_words = set(stopwords.words('english'))

# Read the whole file; documents are separated by blank lines.
with open("abstract.txt") as f:
    data = f.read().strip('\n')
data_cleaned = data.split('\n\n')

# Single characters stripped from each document before tokenising.
cs = [",", ".", "’", "”", "“", "?", "!", ":", "\n", ";", "\\", "-", "—", "$", "/", "(", ")", "–", "[", "]"]

# Newlines become a space so that words on adjacent lines are not glued
# together; every other character in `cs` is deleted outright.
strip_table = str.maketrans({ch: (' ' if ch == '\n' else None) for ch in cs})

# corpus: one list of lower-cased, stop-word-free tokens per document.
corpus = []
for raw_doc in data_cleaned:
    # split() without an argument collapses runs of whitespace, so removed
    # punctuation or double spaces no longer yield empty-string tokens.
    tokens = [
        tok
        for tok in raw_doc.translate(strip_table).lower().split()
        if tok not in stop_words
    ]
    # A document with no remaining tokens carries no information; skip it.
    if tokens:
        corpus.append(tokens)

In [4]:
# Comparative Analysis
import numpy as np
from gensim.models import LdaModel, HdpModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel


def coherence_input(hlda_object):
    """
    Sample one root-to-leaf path through the hLDA tree.

    input: output of the hlda function; element 0 is the tree (one list of
        nodes per level) and element 1 is the index structure. The code
        treats struc[level][k] as the number of children that node k of the
        previous level has on `level`, with the children of consecutive
        parents stored contiguously in tree[level].

    output: a list with one node per visited level, starting at the root and
        choosing a child uniformly at random at every step; the walk stops
        early when the current node has no children.
    """
    tree, struc = hlda_object[0], hlda_object[1]

    # The path always starts at the root node.
    path = [tree[0][0]]
    node = 0
    level = 1
    while level < len(struc):
        n_children = struc[level][node]
        if n_children == 0:
            break
        # Children of earlier siblings come first on this level.
        offset = sum(struc[level][:node])
        node = offset + np.random.choice(n_children, 1)[0]
        path.append(tree[level][node])
        level += 1
    return path


def hlda_coherence(hlda_object, dictionary, corpus, iterations):
    """
    Estimate the u_mass coherence of an hLDA model by averaging over sampled paths.

    input:
        hlda_object, output of the hlda function, which includes the tree and the index

        dictionary, Dictionary for the input corpus

        corpus, the corpus handed to CoherenceModel (comp_analysis passes the
            bag-of-words form)

        iterations, number of times for sampling a path

    output: float, mean coherence over the sampled paths whose coherence is
        not NaN; NaN if no path produced a usable value (for example when
        iterations is 0).
    """
    total = 0.0
    n_valid = 0
    for _ in range(iterations):
        # Each sampled path is scored as one set of topics.
        path_topics = coherence_input(hlda_object)
        cm_hlda = CoherenceModel(topics=path_topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        score = cm_hlda.get_coherence()
        # Paths whose coherence is NaN are left out of the average.
        if not np.isnan(score):
            n_valid += 1
            total += score

    # Avoid ZeroDivisionError when nothing could be averaged.
    if n_valid == 0:
        return float('nan')
    return total / n_valid


def comp_analysis(corpus, hlda_object, n_topics = 10, iterations = 10000):
    """
    print u_mass coherence for HLDA, LDA, HDP model

    input:
        corpus, a list of lists (one list of tokens per document)

        hlda_object, output of the hlda function, which includes the tree and the index

        n_topics, number of topics prespecified for the LDA model

        iterations, number of times for sampling a path when scoring the hLDA model

    output: None; the three coherence values are printed.
    """
    # Build the vocabulary and the bag-of-words corpus shared by all models.
    dictionary2 = Dictionary(corpus)
    corpus2 = [dictionary2.doc2bow(text) for text in corpus]

    hdpmodel = HdpModel(corpus=corpus2, id2word=dictionary2)
    ldamodel = LdaModel(corpus=corpus2, num_topics=n_topics, id2word=dictionary2)
    cm_lda = CoherenceModel(model=ldamodel, corpus=corpus2, coherence='u_mass')
    cm_hdp = CoherenceModel(model=hdpmodel, corpus=corpus2, coherence='u_mass')

    lda_cohe = cm_lda.get_coherence()
    hdp_cohe = cm_hdp.get_coherence()
    # Forward the caller's iteration count instead of a hard-coded 10000.
    hlda_cohe = hlda_coherence(hlda_object, dictionary2, corpus2, iterations=iterations)

    print("Coherence for HLDA is %.2f" %hlda_cohe)
    print("Coherence for LDA is %.2f" %lda_cohe)
    print("Coherence for HDP is %.2f" %hdp_cohe)

In [None]:
# Run hLDA on the tokenised corpus; comp_analysis reads the result as
# hlda_object[0] (tree) and hlda_object[1] (index structure).
# NOTE(review): the meaning of alpha/beta/gamma/eta/ite/level is defined by
# the hierarchical_lda_ncrp package -- confirm against its documentation.
hlda_object = hLDA(corpus, alpha=0.1, beta=0.01, gamma=0.6, eta=0.01, ite=20, level=10)

In [None]:
# Print the coherence of the hLDA result next to LDA and HDP models built
# from the same corpus.
comp_analysis(corpus, hlda_object, n_topics = 10, iterations = 1000)