## Import relevant libraries

In [None]:
# !pip install wordcloud

In [22]:
import sys
import re, numpy as np, pandas as pd
from pprint import pprint
import json
import os

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
# Module-level stop-word list shared by the preprocessing and word-cloud code below.
from nltk.corpus import stopwords
# Start from NLTK's English stop-word list ...
stop_words = stopwords.words('english')
# ... and append extra filler words in place.
# NOTE(review): 'even' is listed twice below; the duplicate is harmless for membership tests.
stop_words.extend(['from', 'subject', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come', 'report', 'page'])

# wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

#%matplotlib inline
# Suppress every DeprecationWarning for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Configure root logging: "<time> : <level> : <message>" format, ERROR level and above only.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

## Topic Modelling (LDA)

In [23]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``. One result is yielded per
    input item, in input order; nothing is processed until the generator
    is iterated.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

In [24]:
def process_words(texts, bigram_mod, trigram_mod, stop_words=stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Remove stop words, form bigrams/trigrams and lemmatize.

    Args:
        texts: Iterable of documents. Each document is passed through
            ``str`` and ``simple_preprocess`` again, so token lists are
            accepted as well as plain strings.
        bigram_mod: Object indexed with a token list (``bigram_mod[doc]``)
            to merge detected bigrams.
        trigram_mod: Object indexed with the bigram output
            (``trigram_mod[...]``) to merge detected trigrams.
        stop_words: Collection of words to drop, both before phrase
            detection and again after lemmatization. Defaults to the
            module-level ``stop_words`` list.
        allowed_postags: spaCy ``token.pos_`` values to keep.

    Returns:
        A list with one list of lemmas per input document. Individual
        lists may be empty when every token was filtered out.
    """
    # Build the lookup set once: membership is tested for every token in
    # two separate passes, and a list would be scanned linearly each time.
    stop_set = set(stop_words)

    def drop_stop_words(docs):
        # Re-tokenize each document and keep only non-stop-words.
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_set]
                for doc in docs]

    texts = drop_stop_words(texts)
    texts = [bigram_mod[doc] for doc in texts]
    # bigram_mod is applied a second time inside the trigram step; this
    # mirrors the original pipeline and is kept so results do not change.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # Parser and NER are disabled; only lemma_ and pos_ are read below.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

    # Remove stop words once more: lemmatization can produce words that
    # are themselves in the stop list.
    return drop_stop_words(texts_out)

In [43]:
def generate_wordcloud(lda_model, path):
    """Draw up to four topics of ``lda_model`` as word clouds and save them.

    The topics come from ``lda_model.show_topics(formatted=False)``; for
    each one, the second element of the entry is converted to a dict and
    used as the word -> frequency mapping of the cloud. The clouds are
    laid out on a 2x2 grid, each in a single colour taken from
    matplotlib's Tableau palette, and titled ``Topic <index>``.

    Args:
        lda_model: Object exposing ``show_topics(formatted=False)``.
        path: Destination passed to ``Figure.savefig``.

    Returns:
        None. The figure is always closed, even when rendering or saving
        raises, so repeated calls do not accumulate open figures.
    """
    palette = list(mcolors.TABLEAU_COLORS.values())

    def fixed_color(color):
        # Bind the colour now instead of relying on a loop variable being
        # looked up later, when WordCloud finally calls the function.
        return lambda *args, **kwargs: color

    topics = lda_model.show_topics(formatted=False)

    fig, axes = plt.subplots(2, 2, figsize=(8, 8), sharex=True, sharey=True)
    try:
        grid = axes.flatten()
        for i, ax in enumerate(grid):
            # Cells without a matching topic are left blank instead of
            # failing with an IndexError.
            if i < len(topics):
                cloud = WordCloud(stopwords=stop_words,
                                  background_color='white',
                                  width=2500,
                                  height=1800,
                                  max_words=10,
                                  colormap='tab10',
                                  color_func=fixed_color(palette[i]),
                                  prefer_horizontal=1.0)
                topic_words = dict(topics[i][1])
                cloud.generate_from_frequencies(topic_words, max_font_size=300)
                ax.imshow(cloud)
                ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
            ax.axis('off')

        fig.subplots_adjust(wspace=0, hspace=0)
        # The original applied margins to the current (i.e. last) axes only.
        grid[-1].margins(x=0, y=0)
        fig.tight_layout()
        fig.savefig(path)
    finally:
        plt.close(fig)

In [46]:
# Main function
def output_json_wordcloud_img(json_path):
    """Build one word-cloud image per report and return their metadata.

    The JSON file at ``json_path`` must contain a list of objects with the
    keys ``company``, ``year``, ``url``,
    ``filtered_report_sentences_direct`` and
    ``filtered_report_sentences_indirect``. The last two are mappings whose
    values are lists of sentence strings; each value is joined into one
    text, and those texts are the documents fed to the topic model.

    For every report an LDA model with four topics is trained and rendered
    to ``wordcloud_images/<company>_<year>.png``. The ``wordcloud_images``
    directory is created in the current working directory if missing.

    Args:
        json_path: Path of the preprocessed JSON input file.

    Returns:
        A list with one dict per report holding ``company``, ``year``,
        ``pdf_url`` and ``wordcloud_img_path``. When topic modelling or
        rendering fails for a report, the error is printed and
        ``wordcloud_img_path`` is the string ``'nan'``.
    """
    # The context manager closes the input file even if parsing fails.
    with open(json_path) as f:
        data = json.load(f)

    # exist_ok avoids the separate exists()/mkdir() check-then-create steps.
    os.makedirs('wordcloud_images', exist_ok=True)

    json_lst = []

    for report in data:
        company = report['company']
        year = report['year']

        json_obj = {
            'company': company,
            'year': year,
            'pdf_url': report['url'],
        }

        # One joined text per entry: direct sentences first, then indirect.
        page_texts = []
        for section in ('filtered_report_sentences_direct',
                        'filtered_report_sentences_indirect'):
            for sentences in report[section].values():
                page_texts.append(' '.join(sentences))

        # Per-report boundary: any failure is reported and recorded as
        # 'nan' so the remaining reports are still processed.
        try:
            data_words = list(sent_to_words(page_texts))

            # Build the bigram and trigram models
            # (a higher threshold yields fewer phrases).
            bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
            trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
            bigram_mod = gensim.models.phrases.Phraser(bigram)
            trigram_mod = gensim.models.phrases.Phraser(trigram)

            data_processed = process_words(data_words, bigram_mod, trigram_mod)

            # Drop documents that became empty, then drop words found in
            # the company name.
            # NOTE(review): this is a substring test against the lowercased
            # company name, so any word contained in the name (e.g. "bank"
            # in "citibank") is removed too - confirm this is intended.
            company_lower = company.lower()
            data_ready = [
                [word for word in doc if word not in company_lower]
                for doc in data_processed
                if doc
            ]

            # Create Dictionary
            id2word = corpora.Dictionary(data_ready)

            # Create Corpus: Term Document Frequency
            corpus = [id2word.doc2bow(text) for text in data_ready]

            # Build LDA model
            lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                       id2word=id2word,
                                                       num_topics=4,
                                                       random_state=100,
                                                       update_every=1,
                                                       chunksize=10,
                                                       passes=10,
                                                       alpha='symmetric',
                                                       iterations=100,
                                                       per_word_topics=True)

            # Generate word cloud. format() also accepts a non-string year,
            # which made the previous string concatenation raise.
            path = 'wordcloud_images/{}_{}.png'.format(company, year)
            generate_wordcloud(lda_model, path)

            json_obj['wordcloud_img_path'] = path

        except Exception as e:
            print(e)
            print("Error occurred in topic modelling")
            json_obj['wordcloud_img_path'] = 'nan'

        json_lst.append(json_obj)

    return json_lst

In [47]:
# Input: XM's preprocessed JSON file.
# Output: a list with one record per report, for example:
'''[
    {
        "company": "BNI",
        "year": "2020",
        "pdf_url": "https://www.bni.co.id/Portals/1/BNI/Perusahaan/HubunganInvestor/Docs/SR-BNI-2020-US.pdf",
        "wordcloud_img_path": "wordcloud_images/BNI_2020.png"
    },
    {
        "company": "Citibank",
        "year": "nan",
        "pdf_url": "https://www.citigroup.com/citi/about/countries-and-jurisdictions/data/a-time-for-action-july-2021.pdf?ieNocache=413",
        "wordcloud_img_path": "wordcloud_images/Citibank_nan.png"
    },
    {
        "company": "ICBC",
        "year": "2020",
        "pdf_url": "http://v.icbc.com.cn/userfiles/Resources/ICBCLTD/download/2021/2020shzrEN202103.pdf",
        "wordcloud_img_path": "wordcloud_images/ICBC_2020.png"
    }
]'''

# Run the pipeline on the preprocessed reports; images are written under
# 'wordcloud_images/' and the per-report records are kept for the JSON dump.
json_wordcloud_img = output_json_wordcloud_img('all_asian_banks_preprocessed_vfinal.json')

In [48]:
# Serialize the per-report records and write them out as one JSON document.
with open('asian_banks_wordcloud_output.json', 'w') as f:
    f.write(json.dumps(json_wordcloud_img))