In [132]:
import sys
import re, numpy as np, pandas as pd
from pprint import pprint
import json
import os

# Gensim
import gensim, spacy, logging, warnings
from gensim.utils import simple_preprocess
import matplotlib.pyplot as plt

# NLTK Stop words
# Start from NLTK's English stop-word list and extend it with additional words
# that this notebook treats as noise. The resulting list is the default
# `stop_words` argument of process_words() and is also handed to WordCloud in
# generate_wordcloud().
# NOTE(review): 'even' and 'report' each appear twice in the extension list;
# this does not affect the `word not in stop_words` membership tests.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'bank', 'project', 'company', 'employee', 'head', 'report', 'subject', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come', 'report', 'page'])

# wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

#%matplotlib inline
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Configure logging with a "time : level : message" format and a threshold of
# ERROR, so lower-severity log records are not shown.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [133]:
def sent_to_words(sentences):
    """Lazily tokenise every item of ``sentences``.

    Each item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list is
    yielded per input item, in input order; nothing is processed until the
    generator is iterated.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        for raw_sentence in sentences
    )

In [134]:
# Cache of loaded spaCy pipelines, keyed by model name, so the model is loaded
# at most once per kernel session instead of on every process_words() call.
_SPACY_PIPELINES = {}


def _load_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name`` (parser and NER disabled), loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name, disable=['parser', 'ner'])
    return _SPACY_PIPELINES[model_name]


def process_words(texts, bigram_mod, trigram_mod, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB']):
    """Remove stop words, merge bigram/trigram phrases and lemmatize.

    Parameters
    ----------
    texts : iterable
        Documents to process. Every item is converted with ``str()`` and
        tokenised with ``simple_preprocess`` (prepare_data() passes lists of
        tokens, whose string form is re-tokenised here).
    bigram_mod, trigram_mod
        Phrase models applied by indexing with a token list
        (``trigram_mod[bigram_mod[tokens]]``); prepare_data() passes gensim
        ``Phraser`` objects.
    stop_words : collection of str
        Words to drop, both before phrase detection and again after
        lemmatization. Defaults to the module-level ``stop_words`` list.
    allowed_postags : collection of str
        Only tokens whose spaCy ``pos_`` is in this collection are kept.
        The default list is never mutated by this function.

    Returns
    -------
    list of list of str
        One list of processed tokens per input document.
    """
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    # Apply the bigram model exactly once and feed its output to the trigram
    # model, mirroring how prepare_data() trains the trigram model on
    # ``bigram[data_words]``.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    # Reuse the cached pipeline; this function is called repeatedly by the
    # driver loop (once per class per report).
    nlp = _load_spacy_pipeline()
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]
    return texts_out

In [139]:
def prepare_data(data):
    """Tokenise ``data`` and run the phrase / lemmatization pipeline on it.

    Parameters
    ----------
    data : iterable
        Raw sentences; each item is tokenised by sent_to_words().

    Returns
    -------
    list of list of str
        The output of process_words() for the tokenised sentences, or an
        empty list when ``data`` yields no sentences.
    """
    data_words = list(sent_to_words(data))

    # Nothing to process (e.g. a class with no sentences): skip training the
    # phrase models and the spaCy stage altogether.
    if not data_words:
        return []

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=2, threshold=50)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    data_processed = process_words(data_words, bigram_mod, trigram_mod)  # processed Text Data

    return data_processed

In [140]:
def generate_wordcloud(data, path, carbon_class):
    """Render a single-colour word cloud of word frequencies and save it to ``path``.

    Parameters
    ----------
    data : iterable of iterables of str
        Tokenised documents. All tokens are pooled and counted; the counts
        drive the word cloud (configured with ``max_words=10``).
    path : str
        Destination passed to ``savefig``.
    carbon_class : str
        Used as the figure title and to choose the colour: 'Carbon Emissions',
        'Energy' and 'Waste' use the first three Tableau colours, any other
        value uses the fourth.

    Returns
    -------
    None

    Raises
    ------
    Any exception raised by ``WordCloud.generate_from_frequencies`` (the
    notebook's recorded output shows "We need at least 1 word to plot a word
    cloud, got 0." when there are no words) or by saving the figure. The
    figure is always closed, even when saving fails.
    """
    palette = list(mcolors.TABLEAU_COLORS.values())
    colour_index = {'Carbon Emissions': 0, 'Energy': 1, 'Waste': 2}.get(carbon_class, 3)
    colour = palette[colour_index]

    cloud = WordCloud(stopwords=stop_words,
                      background_color='white',
                      width=1800,
                      height=1800,
                      max_words=10,
                      color_func=lambda *args, **kwargs: colour,
                      prefer_horizontal=1.0)

    flat_list = [item for sublist in data for item in sublist]
    word_frequency_list = pd.Series(flat_list).value_counts()
    # Generate before creating any figure so that a failure here (e.g. no
    # words at all) leaves no figure behind.
    cloud.generate_from_frequencies(dict(word_frequency_list), max_font_size=300)

    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        ax.imshow(cloud)
        ax.set_title(carbon_class, fontdict=dict(size=24))
        ax.axis('off')
        fig.tight_layout()
        fig.savefig(path)
    finally:
        # The caller swallows exceptions and keeps looping, so an unclosed
        # figure would otherwise accumulate across iterations.
        plt.close(fig)

In [141]:
# Load the extraction results: a list with one dict per company report.
# The context manager closes the file as soon as it has been parsed.
with open('all_text_chart_output.json') as f:
    data = json.load(f)

json_lst = []

# Make sure the output directory exists (no error if it is already there)
os.makedirs('wordcloud_images', exist_ok=True)

# 4 fixed classes
classes = ['Carbon Emissions', 'Energy', 'Waste', 'Sustainable Investing']

for record in data:
    company = record['company']
    year = record['year']
    pdf_url = record['url']

    json_obj = {}
    json_obj['company'] = company
    json_obj['year'] = year
    json_obj['url'] = pdf_url

    # create a list to store 4 wordcloud image paths for 4 classes
    wordcloud_img_path = []

    df = pd.DataFrame.from_dict(record['text_output'])

    # Words that make up the company name, tokenised the same way as the
    # sentences. Comparing against this set is a whole-word test; testing
    # `word in company.lower()` would be a substring test and would also drop
    # unrelated words that merely occur inside the name.
    company_tokens = set(simple_preprocess(str(company), deacc=True))

    # generating 1 word cloud for each class
    for c_class in classes:
        try:
            filtered_df = df[df['carbon_class'] == c_class]
            sentences = filtered_df['sentence'].to_list()

            data_processed = prepare_data(sentences)

            # remove words that are company names as they frequently occur but not meaningful
            data_ready = [
                [word for word in doc_words if word not in company_tokens]
                for doc_words in data_processed
            ]

            # str.format() also copes with a non-string year
            path = 'wordcloud_images/{}_{}_{}.png'.format(company, year, c_class)
            # generate and save wordcloud image
            generate_wordcloud(data_ready, path, c_class)
            wordcloud_img_path.append(path)

        except Exception as e:  # when there is no record in the carbon class
            # Best effort: report the problem and point this class at the
            # NO_DATA placeholder image instead of aborting the whole run.
            print(e)
            path = 'wordcloud_images/NO_DATA_' + c_class + '.png'
            wordcloud_img_path.append(path)

    json_obj['wordcloud_img_path'] = wordcloud_img_path

    json_lst.append(json_obj)

We need at least 1 word to plot a word cloud, got 0.
We need at least 1 word to plot a word cloud, got 0.


In [142]:
# Persist the per-report word-cloud index built above as a JSON array.
with open('ALL_wordcloud_output_V2.json', 'w') as out_file:
    json.dump(json_lst, out_file)

In [None]:
# Sample entry of ALL_wordcloud_output_V2.json for one company (illustrative only; this cell is not executed)
'''
{
    "company": "Hang Seng Investment Management",
    "year": "2019",
    "url": "https://www.hangseng.com/cms/ccd/csr/corporate-sustainability-report-2019/en/full-report.pdf",
    "wordcloud_img_path": [
        "wordcloud_images/Hang Seng Investment Management_2019_Carbon Emissions.png",
        "wordcloud_images/Hang Seng Investment Management_2019_Energy.png",
        "wordcloud_images/Hang Seng Investment Management_2019_Waste.png",
        "wordcloud_images/Hang Seng Investment Management_2019_Sustainable Investing.png"
    ]
}
'''