In [30]:
from gensim import corpora, models 
from scripts.normalization import normalize_corpus
import numpy as np
from docx import Document
import sys
import gensim

#### Load data

In [2]:
def read_txt(file_path):
    with open(file_path,'r') as f:
        corpus = f.readlines()
    
    corpus = [c.strip('\n') for c in corpus]
    corpus = [c for c in corpus if len(c)> 10]
    return corpus

In [3]:
##read data
file_path = 'input/doc_1.txt'
corpus = read_txt(file_path)
## preprocess text
norm_tokenized_corpus = normalize_corpus(corpus,tokenize=True)

In [4]:
# build dictionary
dictionary = corpora.Dictionary(norm_tokenized_corpus)
# convert document into bow
corpus_bow = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
## comput tfidf feature vectors
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

#### Latent Semantic Indexing

In [5]:
## topic modeling 
total_topics = 7
lsi = models.LsiModel(corpus_tfidf,
                      id2word=dictionary,
                      num_topics= total_topics)

In [15]:
## now you can see topic by index, words and weights 
print(lsi.show_topic(0))

[('support', 0.18616100782667738), ('imf', 0.18590254483454932), ('debt', 0.17392126931141452), ('’', 0.1566605545149682), ('country', 0.12965053970238727), ('policy', 0.12111440720898438), ('strengthen', 0.11931488669225022), ('global', 0.11814602109861143), ('growth', 0.11765602963699684), ('include', 0.1125068635406627)]


In [16]:
for index, topic in lsi.print_topics(total_topics):
    print('Topic #{}'.format(index+1))
    print(topic)
    print()

Topic #1
0.186*"support" + 0.186*"imf" + 0.174*"debt" + 0.157*"’" + 0.130*"country" + 0.121*"policy" + 0.119*"strengthen" + 0.118*"global" + 0.118*"growth" + 0.113*"include"

Topic #2
-0.221*"rate" + -0.221*"exchange" + -0.211*"growth" + -0.169*"global" + -0.135*"policy" + -0.125*"priority" + -0.125*"outlook" + -0.112*"stability" + -0.110*"competitive" + 0.109*"debt"

Topic #3
-0.368*"resource" + -0.316*"governance" + -0.266*"operation" + -0.263*"imf" + 0.202*"debt" + -0.152*"’" + 0.124*"sustainability" + 0.111*"program" + 0.106*"vulnerability" + 0.103*"practice"

Topic #4
-0.298*"rate" + -0.298*"exchange" + 0.194*"outlook" + 0.194*"priority" + 0.145*"global" + -0.138*"competitive" + 0.136*"growth" + 0.126*"policy" + -0.113*"candid" + -0.113*"capital"

Topic #5
-0.249*"operation" + -0.224*"outlook" + -0.224*"priority" + 0.185*"trade" + 0.165*"tax" + -0.149*"policy" + -0.127*"surveillance" + -0.122*"others" + -0.122*"managing" + -0.122*"director"

Topic #6
0.379*"committee" + 0.258*"quo

In [18]:
## a better way to print 
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    
    for index in range(total_topics):
        topic = topic_model.show_topic(index)
        topic = [(word, round(wt,2)) 
                 for word, wt in topic 
                 if abs(wt) >= weight_threshold]
        if display_weights:
            print('Topic #'+str(index+1)+' with weights')
            print (topic[:num_terms] if num_terms else topic)
        else:
            print ('Topic #'+str(index+1)+' without weights')
            tw = [term for term, wt in topic]
            print (tw[:num_terms] if num_terms else tw)
        print
    


In [19]:
print_topics_gensim(topic_model=lsi,
                    total_topics=total_topics,
                    num_terms=5,
                    display_weights=True)

Topic #1 with weights
[('support', 0.19), ('imf', 0.19), ('debt', 0.17), ('’', 0.16), ('country', 0.13)]
Topic #2 with weights
[('rate', -0.22), ('exchange', -0.22), ('growth', -0.21), ('global', -0.17), ('policy', -0.13)]
Topic #3 with weights
[('resource', -0.37), ('governance', -0.32), ('operation', -0.27), ('imf', -0.26), ('debt', 0.2)]
Topic #4 with weights
[('rate', -0.3), ('exchange', -0.3), ('outlook', 0.19), ('priority', 0.19), ('global', 0.14)]
Topic #5 with weights
[('operation', -0.25), ('outlook', -0.22), ('priority', -0.22), ('trade', 0.19), ('tax', 0.17)]
Topic #6 with weights
[('committee', 0.38), ('quota', 0.26), ('operation', -0.22), ('general', 0.18), ('progress', 0.15)]
Topic #7 with weights
[('operation', -0.29), ('priority', 0.26), ('outlook', 0.26), ('resource', -0.2), ('governance', -0.19)]


####  Latent Dirichlet Allocation

In [48]:
total_topics = 4

In [49]:
lda = models.LdaMulticore(corpus_tfidf,
                          id2word = dictionary,
                          iterations = 1000,
                          num_topics = total_topics,
                          workers = 10)

In [50]:
print_topics_gensim(topic_model=lda,
                   total_topics = total_topics,
                   num_terms=10,
                   display_weights=True)

Topic #1 with weights
[('debt', 0.01), ('lic', 0.01), ('sustainability', 0.0), ('strengthen', 0.0), ('data', 0.0), ('gap', 0.0), ('program', 0.0), ('capacity', 0.0), ('work', 0.0), ('vulnerability', 0.0)]
Topic #2 with weights
[('growth', 0.01), ('committee', 0.0), ('diversity', 0.0), ('quota', 0.0), ('high', 0.0), ('exchange', 0.0), ('rate', 0.0), ('trade', 0.0), ('policy', 0.0), ('strong', 0.0)]
Topic #3 with weights
[('outlook', 0.01), ('priority', 0.01), ('global', 0.0), ('policy', 0.0), ('risk', 0.0), ('sustainable', 0.0), ('toward', 0.0), ('sector', 0.0), ('reform', 0.0), ('scarcity', 0.0)]
Topic #4 with weights
[('operation', 0.01), ('resource', 0.01), ('imf', 0.01), ('governance', 0.01), ('’', 0.01), ('support', 0.0), ('director', 0.0), ('managing', 0.0), ('collaborate', 0.0), ('others', 0.0)]
