In [18]:
from gensim import corpora, models 
from scripts.normalization import normalize_corpus
import numpy as np
from docx import Document
import sys
import os
import gensim
import pickle
# Put the local scripts directory at the front of the module search path.
# NOTE(review): this runs after `from scripts.normalization import ...` in the
# import list above, so it has no effect on that import statement itself;
# presumably it is meant for modules resolved later — confirm.
python_root = './scripts'
sys.path.insert(0, python_root)

#### Load data

In [27]:
# Load the pickled document collection. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files from a trusted source.
with open('./data/xml_docs.p', "rb") as pickle_file:
    doc_dict = pickle.load(pickle_file)

# Keys of the mapping serve as document ids; show a handful as a sanity check.
ids = list(doc_dict.keys())
print('sample document ids: \n',ids[:5],'\n')

# Peek at the first paragraph of the first document.
test_docs = doc_dict[ids[0]]
print('sample paragraphs: \n',test_docs.paras[0])

sample document ids: 
 ['9781451823295', '9781462328451', '9781451806069', '9781451815733', '9781451814002'] 

sample paragraphs: 
 1. As a small, open, tourism-based economy, St. Lucia is highly vulnerable to exogenous shocks. Tourism accounts for over three-quarters of exports, and the import content of both consumption and foreign direct investment (FDI) is very high (Figure 1). The economy has been buffeted by the global economic downturn, which has hobbled the tourism and construction sectors, with potential spillovers to the financial sector.


In [39]:
## flatten all paragraphs: one list covering every document, in id order
paras = [doc_dict[doc_id].paras for doc_id in ids]
corpus = [paragraph for doc_paras in paras for paragraph in doc_paras]

print('Total number of paragraphs in the corpus: {}'.format(len(corpus)))

Total number of paragraphs in the corpus: 255915


In [40]:
# Run the project's normalize_corpus helper over every paragraph.
# With tokenize=True each entry appears to come back as a list of tokens
# (see the sample displayed in the next cell) — exact normalization steps
# live in scripts/normalization and are not visible here.
norm_tokenized_corpus = normalize_corpus(corpus,tokenize=True)

In [41]:
# Inspect the normalized tokens of the first paragraph.
norm_tokenized_corpus[0]

['1',
 'small',
 'open',
 'tourism',
 'based',
 'economy',
 'st',
 'lucia',
 'highly',
 'vulnerable',
 'exogenous',
 'shock',
 'tourism',
 'account',
 'three',
 'quarters',
 'export',
 'import',
 'content',
 'consumption',
 'foreign',
 'direct',
 'investment',
 'fdi',
 'high',
 'figure',
 '1',
 'economy',
 'buffet',
 'global',
 'economic',
 'downturn',
 'hobble',
 'tourism',
 'construction',
 'sector',
 'potential',
 'spillover',
 'financial',
 'sector']

In [None]:
def read_txt(file_path, min_length=10, encoding=None):
    """Read a text file and return its lines, dropping very short ones.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.
    min_length : int, optional
        A line is kept only if it is strictly longer than this many
        characters (after newline removal). Defaults to 10, the cutoff
        that was previously hard-coded.
    encoding : str or None, optional
        Encoding passed to ``open``. ``None`` (the default) keeps the
        platform default; pass e.g. ``'utf-8'`` for portable results.

    Returns
    -------
    list of str
        The retained lines, in file order, without newline characters
        at either end.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        # Stream the file once rather than materialising readlines() and
        # two intermediate lists.
        stripped_lines = (line.strip('\n') for line in f)
        return [line for line in stripped_lines if len(line) > min_length]

In [None]:
## Read a single text file into a list of lines.
## NOTE(review): this rebinds `corpus` and `norm_tokenized_corpus`, which the
## cells above already populated from the pickled paragraphs; everything below
## therefore models this one file, not the full collection — confirm intent.
file_path = 'input/doc_1.txt'
corpus = read_txt(file_path)
## Preprocess the text into token lists with the project helper.
norm_tokenized_corpus = normalize_corpus(corpus,tokenize=True)

In [None]:
# Build the token dictionary from the normalized documents.
dictionary = corpora.Dictionary(norm_tokenized_corpus)
# Convert every tokenized document into its bag-of-words representation.
corpus_bow = list(map(dictionary.doc2bow, norm_tokenized_corpus))
# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

#### Latent Semantic Indexing

In [None]:
## Topic modeling: fit an LSI model with `total_topics` topics on the TF-IDF
## corpus, using `dictionary` to map term ids back to words.
total_topics = 7
lsi = models.LsiModel(corpus_tfidf,
                      id2word=dictionary,
                      num_topics= total_topics)

In [None]:
## Topics can now be looked up by index; print what show_topic returns for
## the first topic (index 0).
print(lsi.show_topic(0))

In [None]:
# Print each LSI topic under a 1-based heading, followed by a blank line.
for index, topic in lsi.print_topics(total_topics):
    heading = 'Topic #{}'.format(index+1)
    print(heading, topic, '', sep='\n')

In [None]:
## a better way to print 
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    """Print the leading terms of the first ``total_topics`` topics.

    Parameters
    ----------
    topic_model : object
        Any model exposing ``show_topic(index, topn=...)`` that returns
        ``(term, weight)`` pairs (e.g. the LSI / LDA models used in this
        notebook).
    total_topics : int, optional
        Number of topics to print, starting at topic index 0.
    weight_threshold : float, optional
        Terms whose absolute weight is below this value are dropped.
    display_weights : bool, optional
        If True print ``(term, weight)`` pairs with weights rounded to two
        decimals; otherwise print only the terms.
    num_terms : int or None, optional
        Maximum number of terms to print per topic. When falsy, whatever
        ``show_topic`` returns by default is printed.

    Returns
    -------
    None
        Output is written to stdout; topics are separated by a blank line.
    """
    for index in range(total_topics):
        # Ask the model for as many terms as will be shown; otherwise the
        # model's default term count silently caps larger num_terms values.
        if num_terms and num_terms > 0:
            topic = topic_model.show_topic(index, topn=num_terms)
        else:
            topic = topic_model.show_topic(index)

        topic = [(word, round(wt, 2))
                 for word, wt in topic
                 if abs(wt) >= weight_threshold]

        if display_weights:
            print('Topic #'+str(index+1)+' with weights')
            print(topic[:num_terms] if num_terms else topic)
        else:
            print('Topic #'+str(index+1)+' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)
        # Blank separator line; a bare `print` without parentheses is a
        # no-op expression in Python 3.
        print()
    


In [None]:
# Show up to 5 terms, with their weights, for each of the LSI topics.
print_topics_gensim(topic_model=lsi,
                    total_topics=total_topics,
                    num_terms=5,
                    display_weights=True)

####  Latent Dirichlet Allocation

In [None]:
# Number of topics for the LDA model below. This rebinds the notebook-level
# `total_topics`, which was 7 for the LSI section.
total_topics = 4

In [None]:
# Fit a multicore LDA model. A fixed random_state seeds the stochastic
# initialisation so repeated runs are comparable (scheduling of the worker
# processes may still introduce some run-to-run variation).
# NOTE(review): the model is trained on TF-IDF weights rather than raw
# bag-of-words counts (`corpus_bow`); confirm this is intentional.
# NOTE(review): workers is hard-coded to 10 — confirm this suits the machine.
lda = models.LdaMulticore(corpus_tfidf,
                          id2word = dictionary,
                          iterations = 1000,
                          num_topics = total_topics,
                          workers = 10,
                          random_state = 42)

In [None]:
# Show up to 10 terms, with their weights, for each of the LDA topics.
print_topics_gensim(topic_model=lda,
                   total_topics = total_topics,
                   num_terms=10,
                   display_weights=True)