In [3]:
from coherence import eval_coherence
import pandas as pd
import numpy as np
import ast

In [4]:
def get_hdp_topics(hdp, top_n=10):
    '''Collect the top words of every live topic in a trained tomotopy HDP model.

    ** Inputs **
    hdp: obj -> trained HDPModel
    top_n: int -> number of top words to request for each topic

    ** Returns **
    topics: dict -> topic id mapped to a list of (word, prob) tuples as
        yielded by hdp.get_topic_words; keys are inserted from the most to
        the least frequently assigned topic
    '''

    # Rank topic ids by how many times each topic was assigned, largest first.
    # The sort is stable, so topics with equal counts keep ascending id order.
    counts = list(hdp.get_count_by_topics())
    ranked_ids = sorted(range(len(counts)), key=lambda idx: counts[idx], reverse=True)

    # Skip topics the model no longer reports as live, then gather the
    # word/prob pairs of each remaining topic.
    return {
        topic_id: [(word, prob) for word, prob in hdp.get_topic_words(topic_id, top_n=top_n)]
        for topic_id in ranked_ids
        if hdp.is_live_topic(topic_id)
    }

In [5]:
# sums.txt holds a Python literal (presumably a list of strings, given that
# each entry is later split into words); literal_eval parses it without
# executing arbitrary code.
with open('sums.txt') as f:
    raw_literal = f.read()
sums = ast.literal_eval(raw_literal)

In [6]:
# Whitespace-tokenize every entry of `sums` into a list of words, then show
# how many documents there are.
sums_list = [text.split() for text in sums]
len(sums_list)

2305

### Modeling

In [7]:
import tomotopy as tp

# Weight terms by inverse document frequency.
term_weight = tp.TermWeight.IDF

# Hyperparameter meanings below follow the tomotopy HDPModel documentation.
HDP = tp.HDPModel(
    tw=term_weight,
    min_cf=5,       # exclude words whose collection frequency is below 5
    rm_top=6,       # remove the 6 most frequent words
    initial_k=5,    # starting number of topics; HDP adjusts it while training
    alpha=0.1,      # concentration of the document-table Dirichlet process
    eta=0.2,        # Dirichlet hyperparameter for the topic-word distributions
    gamma=0.1,      # concentration of the table-topic Dirichlet process
    seed=42,        # fixed random seed
)

In [8]:
# Register every tokenized document (a list of words) with the model.
# Training itself is triggered by the HDP.train(...) calls in later cells.
for vec in sums_list:
    HDP.add_doc(vec)

In [9]:
# Set the sampling burn-in (i.e. discard the first 100 iterations).
HDP.burn_in = 100

# NOTE(review): train(0) requests zero iterations; it is presumably called so
# that the corpus statistics printed below are populated before the real
# training starts - confirm against the tomotopy documentation.
HDP.train(0)

doc_count = len(HDP.docs)
vocab_size = HDP.num_vocabs
word_count = HDP.num_words
print('Num docs:', doc_count, ', Vocab size:', vocab_size, ', Num words:', word_count)

# To inspect the words dropped by rm_top, run:
#   print('Removed top words:', HDP.removed_top_words)

Num docs: 2305 , Vocab size: 3255 , Num words: 214561


In [10]:
# Train the model in 10 rounds of 100 iterations each. The "Iteration" value
# printed below is the round index, not the number of sampling iterations.
for i in range(10):
    HDP.train(100)
    ll_per_word = HDP.ll_per_word
    live_topics = HDP.live_k
    print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, ll_per_word, live_topics))

Iteration: 0	Log-likelihood: -21.708408297021368	Num. of topics: 5
Iteration: 1	Log-likelihood: -21.693367732265916	Num. of topics: 5
Iteration: 2	Log-likelihood: -21.692122286538122	Num. of topics: 5
Iteration: 3	Log-likelihood: -21.689517352333738	Num. of topics: 5
Iteration: 4	Log-likelihood: -21.689856206368574	Num. of topics: 5
Iteration: 5	Log-likelihood: -21.691685329358712	Num. of topics: 5
Iteration: 6	Log-likelihood: -21.687985446776775	Num. of topics: 5
Iteration: 7	Log-likelihood: -21.687567375596345	Num. of topics: 5
Iteration: 8	Log-likelihood: -21.68451050185152	Num. of topics: 5
Iteration: 9	Log-likelihood: -21.685256553010362	Num. of topics: 5


In [11]:
# Extract the top 50 (word, prob) pairs of every live topic from the trained model.
topics = get_hdp_topics(HDP, top_n=50) # changing top_n changes no. of words displayed

In [12]:
# Score the extracted topics against the tokenized documents. The metric is
# implemented in the imported `coherence` module and is not shown in this notebook.
eval_coherence(topics,sums_list)

Evaluating topic coherence...
Done



0.5682074918558703

### Store results

In [30]:
# Empty results table with one "words" and one "significance" column per
# extracted topic, ordered by topic id. The column names are derived from the
# ids actually present in `topics` instead of a hard-coded count, because the
# HDP model decides how many topics stay live.
results = pd.DataFrame(columns=[
    name.format(topic_id)
    for topic_id in sorted(topics)
    for name in ('Topic_{}_words', 'Topic_{}_significance')
])

In [31]:
# Display the frame (columns only at this point) to check the column layout.
results

Unnamed: 0,Topic_0_words,Topic_0_significance,Topic_1_words,Topic_1_significance,Topic_2_words,Topic_2_significance,Topic_3_words,Topic_3_significance,Topic_4_words,Topic_4_significance


In [49]:
# Fill one pair of columns per extracted topic. Iterating over the ids stored
# in `topics` (rather than range(5)) avoids a KeyError when the live topic ids
# are not exactly 0-4 and keeps every topic if the model finds more than five.
for i in sorted(topics):
    word_weight_pairs = topics[i]
    results['Topic_{}_words'.format(i)] = [word for word, weight in word_weight_pairs]
    # Read the weights straight from the pairs: packing (str, float) tuples
    # into a NumPy array would stringify each float before converting it back.
    results['Topic_{}_significance'.format(i)] = [float(weight) for word, weight in word_weight_pairs]

In [50]:
# Preview the first rows; each row lines up the words found at the same
# position in every topic's top-word list.
results.head()

Unnamed: 0,Topic_0_words,Topic_0_significance,Topic_1_words,Topic_1_significance,Topic_2_words,Topic_2_significance,Topic_3_words,Topic_3_significance,Topic_4_words,Topic_4_significance
0,prostitution,0.005627,ms,0.009118,vessel,0.021557,applicant,0.010408,appellant,0.010006
1,prostitute,0.004545,girl,0.006133,smuggle,0.014508,appellant,0.009013,france,0.007201
2,anonymous,0.004491,complainant,0.00555,siev,0.011109,court,0.007132,irregular,0.007006
3,sexual,0.004296,tell,0.00555,crew,0.011002,section,0.006069,smuggle,0.005423
4,force,0.003928,mr,0.005286,venture,0.008033,article,0.005321,italy,0.005421


In [52]:
# Persist the table to an Excel workbook, leaving out the row index.
results.to_excel('HDP_results.xlsx', index=False)