This notebook serves to extract information from parsed PDF text. The steps are as follows.

1. Filter for sentences that contain numbers or new lines (when the aim is to extract a number together with its key metric)
2. Remove stop words, punctuation, years, etc.
3. Apply part-of-speech tagging and derive rules that allow the number and its metric to be extracted. Remember that negative words mean a minus sign needs to be added in front of the number
4. Output for each source -> the metrics and their values

# Import Packages

In [1]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
import re
import en_core_web_sm
import string
from matplotlib import pyplot as plt

Retrieve stopwords

In [2]:
#nltk.download('stopwords')
# These words are deliberately kept out of the stop-word list
# (presumably because they describe the direction of a metric -- TODO confirm).
stopwords_to_keep = {'above', 'below', 'up', 'down', 'over', 'under'}
nltk_stopwords = set(stopwords.words('english'))
final_stopwords = nltk_stopwords.difference(stopwords_to_keep)

Retrieve punctuation

In [3]:
# Punctuation characters to strip: all of string.punctuation except '%', '$' and '&'.
string_punc = string.punctuation
final_punc = string_punc.translate(str.maketrans('', '', '%$&'))

Decarbonisation Terms

In [4]:
# Vocabulary used to decide whether a sentence is about decarbonisation.
# Entries are case-sensitive as written; the groupings are only for readability.
decarb_terms = {
    # carbon / greenhouse gases
    "carbon", "co2", "GHG", "GHG emissions", "Greenhouse Gas", "gas",
    "carbon footprint", "carbon emissions", "carbon pricing",
    "Carbon Intensity", "carbon intensity", "WACI",
    "emission", "emissions", "Scope 1", "Scope 2", "Scope 3",
    # targets and frameworks
    "net-zero", "metrics and targets", "TCFD", "cdp", "cvar",
    "climate value-at-risk", "sustainability goals",
    "decarbon", "decarbonisation", "decarbonization",
    # climate / environment
    "climate", "climate solutions", "global warming",
    "environment", "environmental", "green", "sustainable", "sustainability",
    # energy and resource use
    "energy", "green energy", "renewable", "fuel",
    "power consumption", "energy consumption", "electricity consumption",
    "paper consumption", "water consumption", "waste management",
    "business travel",
}

# Helper Functions

In [38]:
# Flattens the parsed JSON and records, per company URL, the sentences that pass
# both filters together with their position, so the original text can be
# retrieved later.
def get_indices_filter_nondigits(data, filter_corpus):
    """Collect the metric-bearing decarbonisation sentences of every company.

    Args:
        data: Iterable of company records. Each record is indexed with the keys
            'url' and 'report_sentences_preprocessed' (pages, each a sequence
            of sentences).
        filter_corpus: Terms handed to ``line_has_decarbonisation``.

    Returns:
        Dict mapping each company's URL to a list of
        ``(sentence, preprocess(sentence), (page_ind, sentence_ind))`` tuples.
        A URL is present even when none of its sentences pass the filters.
    """
    matches_by_url = {}
    for company in data:
        kept = []
        matches_by_url[company['url']] = kept
        for page_ind, page in enumerate(company['report_sentences_preprocessed']):
            for sentence_ind, sentence in enumerate(page):
                if not line_has_digits(sentence):
                    continue
                if not line_has_decarbonisation(sentence, filter_corpus):
                    continue
                kept.append((sentence, preprocess(sentence), (page_ind, sentence_ind)))
    return matches_by_url

# this filters out those lines with possible metrics
def line_has_digits(sentence):
    """Tell whether ``sentence`` still contains a digit after year/co2 removal.

    The sentence is first passed through ``remove_year_co2`` so that digits
    belonging to years, dates or the token 'co2' do not count as a metric.

    Args:
        sentence: Text to inspect.

    Returns:
        True if any remaining character is a digit, otherwise False. The
        original fell off the end and returned None in the negative case;
        callers only use the result for its truthiness.
    """
    line = remove_year_co2(sentence)
    # Original author's note: this step filters 25% of the data.
    return any(ch.isdigit() for ch in line)

def line_has_decarbonisation(sentence, filter_corpus): #replace with word embeddings later on
    """Tell whether ``sentence`` mentions any term of ``filter_corpus``.

    Matching is case-insensitive and ignores punctuation attached to the
    start or end of a word. Multi-word terms (e.g. "GHG emissions",
    "Scope 1") are matched as whole-word phrases. The previous
    implementation compared raw space-separated tokens against the corpus,
    so multi-word terms, mixed-case terms and words followed by punctuation
    could never match.

    Args:
        sentence: Text to inspect.
        filter_corpus: Iterable of string terms (single words or phrases).

    Returns:
        True if at least one term occurs in the sentence, otherwise False.
    """
    # Normalise case and internal whitespace of the corpus terms.
    terms = {' '.join(term.lower().split()) for term in filter_corpus}
    terms.discard('')

    raw_tokens = sentence.lower().split()
    # Drop punctuation glued to the edges of a token ("emissions," -> "emissions").
    clean_tokens = [tok.strip(string.punctuation) for tok in raw_tokens]
    clean_tokens = [tok for tok in clean_tokens if tok]

    # Single-word terms: compare against both the raw and the cleaned tokens so
    # every sentence accepted by the old exact-token check is still accepted.
    if any(tok in terms for tok in raw_tokens):
        return True
    if any(tok in terms for tok in clean_tokens):
        return True

    # Multi-word terms: look for the phrase on word boundaries.
    padded = ' ' + ' '.join(clean_tokens) + ' '
    return any(' ' in term and (' ' + term + ' ') in padded for term in terms)

# preproc for pos-tagging
_DATE_PATTERN = re.compile(r'(\d{2})/(\d{2})/(\d{4})')
_YEAR_RANGE_PATTERN = re.compile(r'(\d{4})/(\d{4})')
# The lookarounds keep the match from firing inside a longer run of digits:
# without them "12000" was rewritten to "year_dummy0".
_YEAR_PATTERN = re.compile(r'(?<!\d)[1-2][0-9]{3}(?!\d)')


def preprocess(text):
    """Replace dates and years with placeholder tokens before POS tagging.

    Substitutions are applied in this order:
      1. ``dd/dd/dddd`` dates        -> ``date_dummy``
      2. ``dddd/dddd`` year ranges   -> ``year_dummy``
      3. standalone four-digit numbers starting with 1 or 2 -> ``year_dummy``

    Args:
        text: Sentence to clean.

    Returns:
        The substituted text with leading and trailing whitespace stripped.
    """
    removed = _DATE_PATTERN.sub('date_dummy', text)
    removed = _YEAR_RANGE_PATTERN.sub('year_dummy', removed)
    removed = _YEAR_PATTERN.sub('year_dummy', removed)
    return removed.strip()

# Masks years/dates and drops 'co2' so their digits do not interfere with POS extraction
def remove_year_co2(text):
    """Return ``preprocess(text)`` with every lower-case 'co2' occurrence removed."""
    return preprocess(text).replace("co2", "")

# this remove stop words specific to use case
# def remove_stopwords(text, stopwords):
#     words = word_tokenize(text)
#     keep = []
#     for i in words:
#         if i not in stopwords:
#             keep.append(i)
#     return ' '.join(keep)
    
# # this removes punctuations 
# def remove_punctuations(text, punc):
#     subbed = text.translate(str.maketrans('', '', punc))
#     #remove extra white space
#     formatted_string = re.sub('\s{2,}', ' ', subbed)
#     return formatted_string

# The en_core_web_sm model is loaded once here and reused by every pos_extraction call.
nlp = en_core_web_sm.load()
def pos_extraction(text):
    """Run ``nlp`` over ``text`` and return parallel token / POS / tag lists.

    Args:
        text: Sentence to analyse.

    Returns:
        A 3-tuple ``(tokens, pos, tag)`` of equal-length lists: the token
        objects themselves, each token's ``pos_`` value and each token's
        ``tag_`` value.
    """
    doc = nlp(text)
    tokens = list(doc)
    pos = [token.pos_ for token in doc]
    tag = [token.tag_ for token in doc]
    return tokens, pos, tag

# Tags the original and the preprocessed version of a sentence and bundles the result
def tag_both_text(org_text, processed_text):
    """Combine tokens of the original text with tags of the processed text.

    Args:
        org_text: Sentence as it appears in the source.
        processed_text: The same sentence after preprocessing.

    Returns:
        ``[org_text, processed_text, tokens, pos, tag]`` where ``tokens`` comes
        from ``org_text`` while ``pos`` and ``tag`` come from ``processed_text``.
    """
    # NOTE(review): the token list and the POS/tag lists originate from two
    # different strings; nothing here checks that they have the same length --
    # confirm callers can rely on index alignment.
    original_tokens, _, _ = pos_extraction(org_text)
    _, processed_pos, processed_tags = pos_extraction(processed_text)
    return [org_text, processed_text, original_tokens, processed_pos, processed_tags]

# Debug helper: prints one "token -> POS,TAG" line per token
def pos_printer(token, pos, tag):
    """Print each token next to its POS label and fine-grained tag.

    Args:
        token: Sequence of tokens (each is converted with ``str``).
        pos: Sequence of POS strings, indexed in step with ``token``.
        tag: Sequence of tag strings, indexed in step with ``token``.
    """
    for idx, tok in enumerate(token):
        print(''.join([str(tok), ' -> ', pos[idx], ',', tag[idx]]))

# Keeps only the sentences that are short enough for reliable info extraction
def filter_length(processed, limit):
    """Return the sentences whose space-separated piece count is at most ``limit``.

    Args:
        processed: Iterable of sentence strings.
        limit: Maximum number of pieces produced by ``sentence.split(' ')``.

    Returns:
        A new list with the qualifying sentences in their original order.
    """
    return [sentence for sentence in processed if len(sentence.split(' ')) <= limit]

In [None]:
def plot_histogram(processed):
    """Display a histogram of sentence lengths.

    Length is the number of pieces returned by ``sentence.split(' ')``; the
    bins are 5 wide and run from 0 to 100.

    Args:
        processed: Iterable of sentence strings.
    """
    lengths = [len(sentence.split(' ')) for sentence in processed]
    bin_edges = list(range(0, 101, 5))

    _, axis = plt.subplots(figsize=(10, 7))
    axis.hist(lengths, bins=bin_edges)

    plt.show()

-------

# Rule mining

test.txt is built from the sentences compiled from word embeddings

In [11]:
# Read test.txt and pair every non-empty line (lower-cased) with its
# preprocessed form. preprocess() takes a single argument, so the former
# extra stop-word / punctuation arguments (a TypeError) are gone, and the
# context manager closes the file handle.
with open('test.txt', 'r') as file1:
    lines = file1.readlines()
test_set_preproc = []
for line in lines:
    a = line.replace("\n", "")
    if a != '':
        lowered = a.lower()
        test_set_preproc.append([lowered, preprocess(lowered)])

In [12]:
# One entry per test sentence: [original, preprocessed, tokens of the original,
# POS of the preprocessed text, fine-grained tags of the preprocessed text].
tags = [tag_both_text(original, processed) for original, processed in test_set_preproc]

# Method

1. Store a list, keeping the number and the extracted portion
> An alternative is to store a set and disregard the number, in case there are multiple numbers in one sentence and we deduplicate the extracted sentences
2. Check left and right until we hit a verb whose finer POS tag is not 'VBP' or 'VB'. If there are no NOUNs in between during this check, continue checking past this VERB (exclude the noun directly before and after).
3. Get the indices of the tags we need to slice in order to retrieve the text from the tokens_list (this method must be used since pos now has punctuation & stopwords)

In [9]:
def extract_text(tags, verb_exclude): 
    """Extract the text span surrounding each digit-bearing number token.

    Args:
        tags: Sequence in the layout returned by ``tag_both_text``: index 2
            holds the tokens (objects exposing ``.text``), index 3 the POS
            labels and index 4 the fine-grained tags.
        verb_exclude: Collection of fine-grained tags; a VERB carrying one of
            these tags does not stop the boundary scan.

    Returns:
        A list of ``[number_text, extracted_text]`` pairs, one for every token
        whose POS is 'NUM' and whose text passes ``line_has_digits``.
    """
    # NOTE(review): tokens, pos_list and tag_list are all indexed with the same
    # positions below; this assumes the three lists are aligned -- TODO confirm,
    # since tag_both_text takes tokens and POS/tags from different strings.
    tokens, pos_list, tag_list = tags[2], tags[3], tags[4]
    results = []
    for i in range(len(pos_list)):
        pos = pos_list[i]
        # Words such as "million" are tagged NUM as well, so an actual digit is required.
        if pos == 'NUM' and line_has_digits(tokens[i].text): #million recognised as a NUM
            # Start one position to the left/right of the number, clamped to the list bounds.
            j = max(i-1,0)
            k = min(i+1, len(pos_list)-1)
            noun_flag_left, noun_flag_right = False, False
            # Left scan: walk towards the start until a VERB is reached that is not
            # in verb_exclude AND a NOUN has already been passed on the way.
            # Index 0 is never tested against that condition: the scan is skipped
            # when it starts at 0 and breaks as soon as it arrives there.
            if j != 0:
                while pos_list[j] != 'VERB' or noun_flag_left == False or tag_list[j] in verb_exclude:
                    if pos_list[j] == 'NOUN':
                        noun_flag_left = True
                    j -= 1
                    if j == 0:
                        break
            # Right scan: mirror image of the left scan, bounded by the last index.
            if k != len(pos_list)-1:
                while pos_list[k] != 'VERB' or noun_flag_right == False or tag_list[k] in verb_exclude:
                    if pos_list[k] == 'NOUN':
                        noun_flag_right = True
                    k += 1
                    if k == len(pos_list)-1:
                        break
            # NOTE(review): generate_extracted_text iterates range(j, k), so the token
            # at index k is left out; when the number is the last token (k == i) the
            # number itself is missing from the extracted text -- confirm this is intended.
            results.append([tokens[i].text, generate_extracted_text(tokens, pos_list, j,k)])
    return results    

def generate_extracted_text(tokens, pos_list, j, k): # a plain ' '.join would put spaces before punctuation
    """Join ``tokens[j:k]`` into text without a space before PUNCT/PART tokens.

    Args:
        tokens: Sequence of objects exposing ``.text``.
        pos_list: POS labels indexed in step with ``tokens``.
        j: Index of the first token to include.
        k: Index one past the last token to include.

    Returns:
        The joined text, stripped of leading and trailing whitespace. Tokens
        whose POS is 'PUNCT' or 'PART' are attached directly to the preceding
        text; every other token is preceded by a single space.
    """
    attached_pos = ('PUNCT', 'PART')
    pieces = []
    for idx in range(j, k):
        if pos_list[idx] not in attached_pos:
            pieces.append(' ')
        pieces.append(tokens[idx].text)
    return ''.join(pieces).strip()

In [13]:
# For every tagged sentence, run extract_text under four verb-exclusion settings
# so the outputs can be compared: none, ['VBP'] (marked "best" by the original
# author), ['VB'] and ['VBP', 'VB'].
exclusion_variants = ([], ['VBP'], ['VB'], ['VBP', 'VB'])
l = [
    [info[0]] + [extract_text(info, excluded) for excluded in exclusion_variants]
    for info in tags
]

In [14]:
# Write the per-sentence comparison of the exclusion settings to test3.csv
comparison_frame = pd.DataFrame(l)
comparison_frame.to_csv('test3.csv')

-------

# Import Json

In [5]:
# Load the preprocessed reports. The context manager closes the file handle
# once parsing is done (it was previously left open).
with open('../data/all_pension_funds_preprocessed.json') as f: #can change to os directory method later to process all text
    data = json.load(f)

# Process Data

In [None]:
# Step 1: per source URL, keep the sentences that pass the digit and decarbonisation filters
relevant_text = get_indices_filter_nondigits(data=data, filter_corpus=decarb_terms)

In [None]:
# d: dictionary keyed by source, storing for each relevant sentence
#    [original text, (page_ind, sentence_ind), [['num being extracted', 'full_text'], ....]]
d = {}
for k, v in relevant_text.items():
#     print('processing' + k)
    for sentence_info in v:
        # Only tagging and extraction are expected to fail, so only they sit in the try.
        try:
            tagged = tag_both_text(sentence_info[0], sentence_info[1]) # [org_text, processed_text, tk_org, pos_proc, tag_proc]
            extracted = extract_text(tagged, ['VBP']) #token, extracted portion
        except Exception as exc:
            # Best effort: report the failing sentence and carry on. Catching
            # Exception rather than using a bare except keeps Ctrl-C
            # (KeyboardInterrupt) and SystemExit working, and the error itself
            # is now shown instead of being swallowed.
            print('error' + ' at ' + sentence_info[0] + ' ' + sentence_info[1] + ' -> ' + repr(exc))
        else:
            final_info = [sentence_info[0], sentence_info[2], extracted]
            d.setdefault(k, []).append(final_info)