In [1]:
from gensim import corpora, models 
from scripts.normalization import normalize_corpus
import numpy as np
from docx import Document
import sys
import os
import gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import pickle
# Put ./scripts at the front of the module search path so the helper modules
# imported just below can be resolved by bare name.
# presumably normalization_spacy and contractions live in ./scripts — verify.
python_root = './scripts'
sys.path.insert(0, python_root)

import normalization_spacy as util
from contractions import CONTRACTION_MAP



#### Load data

In [2]:
# Load the pickled mapping of document id -> parsed document object.
# The file is opened in a `with` block so the handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('./data/xml_docs.p', "rb") as doc_file:
    doc_dict = pickle.load(doc_file)
ids = list(doc_dict.keys())
print('sample document ids: \n',ids[:5],'\n')
# Peek at the first document to show what a paragraph looks like.
test_docs = doc_dict[ids[0]]
print('sample paragraphs: \n',test_docs.paras[0])

sample document ids: 
 ['9781451823295', '9781462328451', '9781451806069', '9781451815733', '9781451814002'] 

sample paragraphs: 
 1. As a small, open, tourism-based economy, St. Lucia is highly vulnerable to exogenous shocks. Tourism accounts for over three-quarters of exports, and the import content of both consumption and foreign direct investment (FDI) is very high (Figure 1). The economy has been buffeted by the global economic downturn, which has hobbled the tourism and construction sectors, with potential spillovers to the financial sector.


In [3]:
## flatten all paragraphs into a single corpus-wide list
paras = [doc_dict[doc_id].paras for doc_id in ids]
corpus = [para for doc_paras in paras for para in doc_paras]

print('Total number of paragraphs in the corpus: {}'.format(len(corpus)))

Total number of paragraphs in the corpus: 255915


### Tokenize and lemmatize corpus

In [4]:
# Load the en_core_web_md spaCy model; `nlp` is the pipeline that the
# tokenize/lemmatize cell below applies to every paragraph.
import en_core_web_md
nlp = en_core_web_md.load()

In [None]:
## single / multi threaded
n_core = 30
# Cache file holding one lemmatized paragraph per line. It is defined before the
# branch so that later cells can reload it no matter which path ran.
trigram_reviews_filepath = 'data/lemma_docs.txt'

if n_core == 1:
    # Parse everything in-process, then persist the lemmas so the cache on disk
    # always reflects the current corpus.
    docs = [nlp(d) for d in corpus]
    with open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        for doc in docs:
            lemmas = [token.lemma_ for token in doc if not util.punct_space(token)]
            f.write(' '.join(lemmas) + '\n')
else:
    with open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        # Stream documents through the pipeline and write each one out immediately
        # so the parsed docs never have to be held in memory all at once.
        # NOTE(review): `n_threads` is deprecated in newer spaCy releases
        # (`n_process` replaces it) — confirm against the installed version.
        for doc in nlp.pipe(corpus,batch_size=10000,n_threads=n_core):
            lemmas = [token.lemma_ for token in doc if not util.punct_space(token)]
            f.write(' '.join(lemmas) + '\n')

# Read the cache back so docs_lemma has the same shape (list of token lists)
# whichever branch produced it.
with open(trigram_reviews_filepath, 'r', encoding='utf_8') as f:
    docs_lemma = [line.strip('\n').split() for line in f]

In [43]:
# Reload the cached lemmas: one paragraph per line, tokens separated by whitespace.
with open(trigram_reviews_filepath, mode='r', encoding='utf_8') as lemma_file:
    docs_lemma = [line.strip('\n').split() for line in lemma_file]

In [44]:
# Show one raw paragraph next to its lemmatized token list.
print(corpus[3], docs_lemma[3], sep='\n')

4. Real GDP growth slowed in 2007-08. Spurred by preparations for the Cricket World Cup, St. Lucia’s economy grew by about 5 percent in 2006. However, slowing construction and tourism activity, together with a hurricane-induced contraction in banana exports, reduced growth to an estimated 1.7 percent and 0.7 percent in 2007 and 2008, respectively. The unemployment rate increased by three percentage points to 16.8 percent during the same period. Despite being underpinned by the regional currency board arrangement, annual inflation reached 7.2 percent in 2008, reflecting high international prices of energy and food. With the decline in these prices, inflation has fallen to 3.2 percent by end-March 2009.
['4', 'real', 'gdp', 'growth', 'slow', 'in', '2007', '08', 'spur', 'by', 'preparation', 'for', 'the', 'cricket', 'world', 'cup', 'st.', 'lucia', '’s', 'economy', 'grow', 'by', 'about', '5', 'percent', 'in', '2006', 'however', 'slow', 'construction', 'and', 'tourism', 'activity', 'together

### Bigram and Trigram transform

In [47]:
# Toggle: True trains fresh phrase models, False loads previously saved ones.
train_phrase_model = True
bigram_transformer_path = os.path.join('data','bigram_transformer')
trigram_transformer_path = os.path.join('data','trigram_transformer')

if train_phrase_model:
    # Two passes: the first runs on the lemmatized tokens, the second runs on the
    # output of the first so that longer phrases can be formed.
    paras = util.phrase_detect_train(docs_lemma,min_count=5,threshold=20,phrase_model_save_path='./data/bigram')
    paras = util.phrase_detect_train(paras,min_count=5,threshold=10,phrase_model_save_path='./data/trigram')
else:
    # NOTE(review): the training branch saves to ./data/bigram and ./data/trigram,
    # while these paths end in *_transformer — confirm which files actually exist.
    bigram_transformer = Phraser.load(bigram_transformer_path)
    trigram_transformer = Phraser.load(trigram_transformer_path)
    # Apply the loaded models to the same lemmatized paragraphs used for training
    # (the previous argument `sentances` was never defined in this notebook).
    paras = util.phrase_detect(bigram_transformer,trigram_transformer,docs_lemma)
    

Transform sentances to trigrams .........

Phrase model training done.
Transform sentances to trigrams .........

Phrase model training done.


- example phrases

In [54]:
# Load the saved trigram model and peek at the first 20 vocabulary entries.
trigram = Phraser.load('data/trigram')
pharses_list = [*trigram.vocab.keys()]
pharses_list[:20]

[b'1',
 b'as',
 b'1_as',
 b'a',
 b'as_a',
 b'small_open',
 b'a_small_open',
 b'tourism',
 b'small_open_tourism',
 b'base',
 b'tourism_base',
 b'economy',
 b'base_economy',
 b'st._lucia',
 b'economy_st._lucia',
 b'be',
 b'st._lucia_be',
 b'highly_vulnerable',
 b'be_highly_vulnerable',
 b'to']

In [51]:
# Number of entries in the trigram model's vocabulary.
len(trigram.vocab)

254756

In [None]:
def read_txt(file_path, encoding='utf_8', min_length=10):
    """Read a text file and return its non-trivial lines.

    Args:
        file_path: path of the text file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8 to
            match the other file reads in this notebook instead of relying on
            the platform default.
        min_length: lines are kept only if they are strictly longer than this
            many characters (after removing the newline).

    Returns:
        A list of lines with newline characters stripped, in file order.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        # Iterate the file lazily rather than materialising every line first.
        stripped_lines = (line.strip('\n') for line in f)
        return [line for line in stripped_lines if len(line) > min_length]

In [None]:
## read the raw lines of a single document
file_path = 'input/doc_1.txt'
corpus = read_txt(file_path=file_path)
## normalize the text and tokenize it
norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)

In [None]:
# Build the token -> id dictionary from the normalized documents.
dictionary = corpora.Dictionary(norm_tokenized_corpus)
# Encode every document as a bag-of-words vector.
corpus_bow = list(map(dictionary.doc2bow, norm_tokenized_corpus))
# Fit a TF-IDF model on the bag-of-words corpus and re-weight the corpus with it.
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

#### Latent Semantic Indexing

In [None]:
## topic modeling: fit an LSI model on the TF-IDF weighted corpus
total_topics = 7
lsi = models.LsiModel(corpus_tfidf, num_topics=total_topics, id2word=dictionary)

In [None]:
## inspect a single topic by index: print what show_topic returns for topic 0
print(lsi.show_topic(0))

In [None]:
# Print every LSI topic with a 1-based heading, followed by a blank line.
for index, topic in lsi.print_topics(total_topics):
    heading = 'Topic #{}'.format(index+1)
    print(heading, topic, '', sep='\n')

In [None]:
## a better way to print 
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    """Print the leading terms of the first ``total_topics`` topics.

    Args:
        topic_model: object exposing ``show_topic(index, topn=...)`` that
            returns ``(term, weight)`` pairs; the notebook passes its LSI and
            LDA models.
        total_topics: number of topics to print, starting at topic 0.
        weight_threshold: terms whose absolute weight is below this value are
            dropped before printing.
        display_weights: when True print ``(term, weight)`` pairs with weights
            rounded to two decimals; otherwise print the bare terms.
        num_terms: maximum number of terms per topic. ``None`` prints whatever
            ``show_topic`` returns by default.
    """
    for index in range(total_topics):
        # Ask the model for exactly num_terms entries; relying on show_topic's
        # default cap would silently truncate larger requests.
        if num_terms:
            topic = topic_model.show_topic(index, topn=num_terms)
        else:
            topic = topic_model.show_topic(index)
        # Filter on the unrounded weight, then round for display.
        topic = [(word, round(wt, 2))
                 for word, wt in topic
                 if abs(wt) >= weight_threshold]
        if display_weights:
            print('Topic #' + str(index + 1) + ' with weights')
            shown = topic
        else:
            print('Topic #' + str(index + 1) + ' without weights')
            shown = [term for term, _ in topic]
        print(shown[:num_terms] if num_terms else shown)
        # Blank separator between topics (a bare `print` is a no-op in Python 3).
        print()
    


In [None]:
# Top five weighted terms for each LSI topic.
print_topics_gensim(lsi, total_topics, display_weights=True, num_terms=5)

####  Latent Dirichlet Allocation

In [None]:
# Number of topics for the LDA model fitted below (replaces the 7 used for LSI).
total_topics = 4

In [None]:
# Fit LDA with a fixed seed so the discovered topics are repeatable across runs.
# NOTE(review): with several workers, results may still differ slightly between
# runs because of process scheduling — confirm if exact reproducibility matters.
# NOTE(review): LDA is usually fitted on raw bag-of-words counts (corpus_bow)
# rather than TF-IDF weights — confirm that corpus_tfidf is intended here.
lda = models.LdaMulticore(corpus_tfidf,
                          id2word = dictionary,
                          iterations = 1000,
                          num_topics = total_topics,
                          workers = 10,
                          random_state = 42)

In [None]:
# Top ten weighted terms for each LDA topic.
print_topics_gensim(lda, total_topics, display_weights=True, num_terms=10)