In [1]:
from gensim import models, corpora
import pandas as pd
import pyLDAvis.gensim

In [2]:
def prep_corpus_csv(fname, text_col=2, stop_tokens=('<NUMBER>', 'patient', 'cell')):
    """
    Read a CSV file and turn one of its text columns into a gensim corpus.

    Every row of the file becomes one document. The text is tokenised by
    splitting on whitespace, and any token listed in ``stop_tokens`` is
    dropped (exact, case-sensitive match).

    Parameters
    ----------
    fname : str or path-like
        CSV file, read with ``pandas.read_csv``.
    text_col : int, optional
        Zero-based position of the column holding the raw text.
        Defaults to 2, i.e. the third column.
    stop_tokens : str or iterable of str, optional
        Tokens removed from every document. Defaults to the placeholder and
        corpus-specific words ``'<NUMBER>'``, ``'patient'`` and ``'cell'``.

    Returns
    -------
    corpus : list
        ``dictionary.doc2bow(doc)`` for each document, in row order.
    dictionary : gensim.corpora.Dictionary
        Dictionary built from the filtered documents.
    docs : list of list of str
        The filtered token lists, one per CSV row.
    """
    frame = pd.read_csv(fname)

    # A lone string would otherwise be exploded into single characters.
    if isinstance(stop_tokens, str):
        stop_tokens = (stop_tokens,)
    blocked = frozenset(stop_tokens)

    docs = []
    for text in frame.iloc[:, text_col]:
        # pandas reads a blank cell as NaN (a float), which has no .split().
        # Keep such rows as empty documents so docs stay aligned with the
        # rows of the CSV file.
        tokens = [] if pd.isna(text) else str(text).split()
        # Single pass instead of repeated list.remove() calls.
        docs.append([tok for tok in tokens if tok not in blocked])

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return corpus, dictionary, docs


In [3]:
"""
this dataset comes from scraped abstracts to articles published in PubMed, an open source journal for biology and medical articles
james then did a keyword search for a few key terms (HIV, TBI, microbiome, etc) to ensure there were at least some common topics
"""

corp, dictionary, docs = prep_corpus_csv('data/pubmed_sample_data.csv')

In [6]:
# Fit a 6-topic LDA model on the bag-of-words corpus `corp`.
# Every keyword is spelled out so the run is fully specified in the notebook.
lda_model = models.LdaModel(
    # --- inputs and model size ---
    corpus=corp,
    id2word=dictionary,
    num_topics=6,
    # --- training schedule ---
    passes=5,
    iterations=50,
    chunksize=2000,
    update_every=1,
    eval_every=10,
    decay=0.5,
    offset=1.0,
    gamma_threshold=0.001,
    # --- priors ---
    alpha='auto',
    eta=None,
    # --- thresholds applied to what the model reports ---
    minimum_probability=0.01,
    minimum_phi_value=0.01,
    per_word_topics=False,
    # --- fixed seed so reruns give the same topics ---
    random_state=2,
    # --- no distributed execution, no callbacks ---
    distributed=False,
    ns_conf=None,
    callbacks=None,
)

In [7]:
# Build the pyLDAvis data for the fitted model. sort_topics=False asks
# pyLDAvis not to reorder the topics, so the numbering in the plot should
# follow the model's own topic order.
lda_visualization = pyLDAvis.gensim.prepare(
    lda_model,
    corp,
    dictionary,
    sort_topics=False,
)

# Left as the final expression so the notebook renders the interactive view.
pyLDAvis.display(lda_visualization)