# Topic Modeling with Gensim

This notebook used gensim topic modeling on the text of the first 8 chapers of the course book. 

In [196]:
import os 
num_docs = 4
 
docs = [] 
path = '../school_texts/ebooks/'

for i in os.listdir(path): 
    if i.endswith('.txt'): 
        with open(path+i, encoding='utf-16', errors='ignore') as f: 
            doc = f.read()
            docs.append(doc)

In [197]:
# gensim and nltk imports
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

In [198]:
NUM_TOPICS = 4

In [199]:
# preprocess docs
def preprocess(docs, stopwords):
    """
    Tokenize, remove stopwords and non-alpha tokens.
    param: docs - a list of raw text documents
    return: a list of processed tokens
    """
    
    processed_docs = []
    for doc in docs:
        tokens = [t for t in word_tokenize(doc.lower()) if t not in stopwords
                 and t.isalpha()]
        processed_docs.append(tokens)
        
    return processed_docs


In [216]:
stopword_list = stopwords.words('english')
stopword_list += ['http', 'url', 'saylor', 'chapter', 'figure', 'section', 'one', 'two', 'many', 'may', 'would', 'must', 'creative', 'commons',
                  'foundation', 'rice', 'openstax', 'college', 'text', 'adapted', 'license', 'without', 'attribution', 'main', 'street', 
                  'houston', 'texas', 'university', 'licensee', 'requested', 'work', 'original', 'creator']
preprocessed_docs = preprocess(docs, stopword_list)

In [217]:
for i in range(num_docs):
    print(preprocessed_docs[i][:5])

['preface', 'welcome', 'new', 'introduction', 'sociology']
['anatomy', 'physiology', 'learn', 'visit', 'individual']
['preface', 'world', 'regional', 'geography', 'takes']
['introduction', 'law', 'legal', 'systems', 'learning']


In [218]:
# the dictionary maps words to id numbers
dictionary = corpora.Dictionary(preprocessed_docs)

In [219]:
# represent the doc tokens in numeric form
corpus = [dictionary.doc2bow(tokens) for tokens in preprocessed_docs]

In [220]:
# build an LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [221]:
for i in range(NUM_TOPICS):
    top_words = [t[0] for t in lda_model.show_topic(i, 9)]
    print("\nTopic", str(i), ':', top_words)


Topic 0 : ['blood', 'court', 'system', 'states', 'cell', 'contract', 'cells', 'body', 'also']

Topic 1 : ['blood', 'also', 'cells', 'people', 'system', 'states', 'contract', 'social', 'body']

Topic 2 : ['people', 'social', 'also', 'system', 'cells', 'new', 'blood', 'contract', 'court']

Topic 3 : ['also', 'law', 'states', 'blood', 'court', 'body', 'system', 'cells', 'muscle']


In [222]:
# look at weights for top 10 words in topic 0
lda_model.show_topic(0, 10)

[('blood', 0.0037950315),
 ('court', 0.003178862),
 ('system', 0.002894712),
 ('states', 0.0027146898),
 ('cell', 0.0025093951),
 ('contract', 0.0023916587),
 ('cells', 0.0022504742),
 ('body', 0.002213133),
 ('also', 0.0021367136),
 ('bone', 0.002074401)]

In [223]:
print("LDA Model 1 Perplexity:", lda_model.log_perplexity(corpus))

from gensim.models.coherencemodel import CoherenceModel

coherence1 = CoherenceModel(model=lda_model,
                           texts=preprocessed_docs, dictionary=dictionary, coherence='c_v')
print('Coherence score:', coherence1.get_coherence())

LDA Model 1 Perplexity: -8.877115757278968
Coherence score: 0.13740788068464832


## Visualization

The pyLDAvis package enables visualization of topics and documents. The package can be installed with pip or pip3.

In [208]:
import pyLDAvis
from pyLDAvis import gensim
pyLDAvis.enable_notebook()

In [224]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

vis

## LSI

In [225]:
# build an LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [226]:
print("LSI Model Results")
for i in range(NUM_TOPICS):
    print("\nTopic #%s:" % i, lsi_model.print_topic(i, 10))

LSI Model Results

Topic #0: 0.213*"blood" + 0.177*"cells" + 0.171*"system" + 0.162*"also" + 0.144*"court" + 0.140*"contract" + 0.139*"body" + 0.129*"law" + 0.114*"muscle" + 0.113*"cell"

Topic #1: -0.267*"blood" + -0.225*"cells" + 0.193*"court" + -0.170*"system" + 0.166*"contract" + 0.166*"law" + -0.165*"body" + -0.144*"muscle" + -0.141*"cell" + -0.126*"bone"

Topic #2: -0.312*"social" + -0.291*"people" + -0.157*"health" + -0.154*"population" + -0.147*"united" + -0.140*"world" + -0.130*"society" + -0.125*"women" + -0.122*"states" + 0.117*"court"

Topic #3: 0.287*"social" + -0.246*"country" + -0.235*"region" + -0.173*"countries" + -0.160*"south" + -0.151*"population" + 0.150*"health" + -0.135*"north" + -0.134*"economic" + -0.119*"world"


In [227]:
coherence3 = CoherenceModel(model=lsi_model,
                           texts=preprocessed_docs, dictionary=dictionary, coherence='c_v')
print('Coherence score:', coherence3.get_coherence())

Coherence score: 0.501282918495215
