In [4]:
import os
import pathlib
from data_processor import TopicModelDataPreprocessor
from expert_field_project.utils import topic_model_config

all_pdf_folder_path = topic_model_config.ALL_PDF_FOLDER_PATH
path_all_pdf_folder_path = pathlib.Path(all_pdf_folder_path)

for author_folder in path_all_pdf_folder_path.iterdir():
    if author_folder.is_dir():
        topic_model_processor = TopicModelDataPreprocessor(all_pdf_folder_path=path_all_pdf_folder_path)
        docs = topic_model_processor.get_and_process_pdf_files(author_folder=author_folder)
        

ModuleNotFoundError: No module named 'expert_field_project'

In [5]:
import tomotopy as tp

hdp = tp.HDPModel(tw=tp.TermWeight.IDF, min_cf=5, rm_top=7,
                 gamma=1, alpha=0.1, initial_k=10, seed=99999)

# Add docs to train
for vec in docs:
    hdp.add_doc(vec)

# Initiate MCMC burn-in 
hdp.burn_in = 100
hdp.train(0)
print('Num docs:', len(hdp.docs), ', Vocab size:', hdp.num_vocabs, ', Num words:', hdp.num_words)
print('Removed top words:', hdp.removed_top_words)

In [4]:
mcmc_iter=1000
for i in range(0, mcmc_iter, 100):
    hdp.train(100, workers=3)
    print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, hdp.ll_per_word, hdp.live_k))

  hdp.train(100, workers=3)


Iteration: 0	Log-likelihood: -6.726657755728436	Num. of topics: 33
Iteration: 100	Log-likelihood: -6.702145664466894	Num. of topics: 36
Iteration: 200	Log-likelihood: -6.662633374210541	Num. of topics: 40
Iteration: 300	Log-likelihood: -6.632178103411953	Num. of topics: 42
Iteration: 400	Log-likelihood: -6.706047074875143	Num. of topics: 42
Iteration: 500	Log-likelihood: -6.698826180078019	Num. of topics: 45
Iteration: 600	Log-likelihood: -6.617993206529829	Num. of topics: 47
Iteration: 700	Log-likelihood: -6.597068623761541	Num. of topics: 44
Iteration: 800	Log-likelihood: -6.595335463967501	Num. of topics: 44
Iteration: 900	Log-likelihood: -6.572204671079737	Num. of topics: 43


In [5]:
def get_hdp_topics(hdp, top_n=10):
    '''Wrapper function to extract topics from trained tomotopy HDP model 
    
    ** Inputs **
    hdp:obj -> HDPModel trained model
    top_n: int -> top n words in topic based on frequencies
    
    ** Returns **
    topics: dict -> per topic, an arrays with top words and associated frequencies 
    '''
    
    # Get most important topics by # of times they were assigned (i.e. counts)
    sorted_topics = [k for k, v in sorted(enumerate(hdp.get_count_by_topics()), key=lambda x:x[1], reverse=True)]

    topics=dict()
    
    # For topics found, extract only those that are still assigned
    for k in sorted_topics:
        if not hdp.is_live_topic(k): continue # remove un-assigned topics at the end (i.e. not alive)
        topic_wp =[]
        for word, prob in hdp.get_topic_words(k, top_n=top_n):
            topic_wp.append((word, prob))

        topics[k] = topic_wp # store topic word/frequency array
        
    return topics

In [6]:
hdp_topics = get_hdp_topics(hdp)

In [9]:
for i in range(len(hdp_topics)):
    print(hdp_topics[i])


[('section_waiver', 0.03754639998078346), ('congressional_district', 0.03337470442056656), ('vote', 0.02746480144560337), ('campaign_contribution', 0.02642187848687172), ('section_recipient', 0.025031305849552155), ('candidate', 0.0229750107973814), ('kolstad_page', 0.018546288833022118), ('payment', 0.01798292249441147), ('available_pmc', 0.016004910692572594), ('treated', 0.015934139490127563)]
[('compliers', 0.06427974253892899), ('never_taker', 0.053927332162857056), ('oregon', 0.04942040145397186), ('always_taker', 0.037864748388528824), ('utilization', 0.03537672385573387), ('mte', 0.030407056212425232), ('unobserved_net', 0.0300722885876894), ('effect', 0.026873253285884857), ('visit', 0.026282748207449913), ('late', 0.022581961005926132)]
[('eshi', 0.06380578875541687), ('hour', 0.02511393092572689), ('author_manuscript', 0.024928109720349312), ('per_week', 0.01680145226418972), ('job', 0.01598692312836647), ('deadweight_loss', 0.014211013913154602), ('wage', 0.0134965414181351

KeyError: 35