This notebook serves to extract information from parsed PDF text. The steps are as follows.

1. Filter for sentences that contain numbers or new lines (when the aim is to extract numbers + key metrics)
2. Remove stop words, punctuation, years, etc.
3. Apply part-of-speech tagging and derive rules that allow numbers and metrics to be extracted. Remember that a negative word means a minus sign needs to be added in front of the number
4. Output for each source -> the metrics and their values

# Import Packages

In [None]:
import json
import os
import re
import string

import en_core_web_sm
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Retrieve stopwords

In [None]:
# Run the download below once if the NLTK stop-word corpus is not installed yet.
# nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))
# These words are excluded from stop-word removal (presumably because they
# describe the direction of a change in a metric -- confirm with the author).
stopwords_to_keep = {'above', 'below', 'up', 'down', 'over', 'under'}
final_stopwords = nltk_stopwords.difference(stopwords_to_keep)

Retrieve punctuation

In [None]:
# All ASCII punctuation characters, minus '%', '$' and '&', which are left in
# the text (presumably because they qualify numeric values -- confirm).
# Requires the standard-library `string` module to be imported.
string_punc = string.punctuation
final_punc = ''.join(ch for ch in string_punc if ch not in {'%', '$', '&'})

# Import Json

In [None]:
# Load one parsed report. The context manager closes the file handle as soon
# as parsing finishes; JSON is read as UTF-8 regardless of the platform default.
# can change to os directory method later to process all text
with open('../data/citi.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Echo the parsed JSON so its structure can be inspected in the cell output.
data

# Helper Functions

In [None]:
# this filters out those lines with possible metrics
def lines_with_digits(data):
    """Return the entries of ``data['content']`` that may contain a metric.

    An entry is kept when it mentions at least one of the tracked keywords
    ('carbon', 'co2', 'renewable', 'green', matched case-insensitively) and
    still contains a digit character once ``remove_year`` has been applied.
    The original, unmodified entry is what gets returned.

    Author notes carried over from the original: the keyword filter could be
    replaced by a labelled binary classifier, but manual filtering was judged
    more feasible; a length-based pre-filter was also suggested as a TODO.
    """
    keywords = ('carbon', 'co2', 'renewable', 'green')
    kept = []
    for sentence in data['content']:
        lowered = sentence.lower()
        if not any(word in lowered for word in keywords):
            continue
        # Years are stripped first so that a bare year does not count as a
        # metric (author's note: this step filters 25% of the data).
        if any(ch.isdigit() for ch in remove_year(sentence)):
            kept.append(sentence)
    return kept

# preproc for pos-tagging
def preprocess(text, stopwords, punc):
    """Clean ``text`` ahead of part-of-speech tagging.

    The steps run in a fixed order: years are removed, then the words in
    ``stopwords`` are dropped, then every character in ``punc`` is deleted.
    Returns the cleaned string.
    """
    without_years = remove_year(text)
    without_stopwords = remove_stopwords(without_years, stopwords)
    return remove_punctuations(without_stopwords, punc)

# this removes years from the string (prevents interference with pos extraction)
# The lookarounds require the four digits to stand alone: without them a slice
# of a longer number (e.g. the "1200" in "120000") would be deleted as well.
_YEAR_PATTERN = re.compile(r'(?<![0-9])[12][0-9]{3}(?![0-9])')
def remove_year(text):
    """Return ``text`` with standalone four-digit years (1000-2999) removed.

    Only runs of exactly four digits starting with 1 or 2 are stripped, so
    longer numbers are left intact. Surrounding whitespace is not touched.

    Known limitation: a genuine four-digit quantity in the same range
    (e.g. "1500 tonnes") is indistinguishable from a year and is removed too.
    """
    return _YEAR_PATTERN.sub('', text)

# this removes stop words specific to the use case
def remove_stopwords(text, stopwords):
    """Tokenize ``text`` and drop every token found in ``stopwords``.

    The surviving tokens are joined back together with single spaces.

    NOTE(review): the membership test is an exact, case-sensitive match, so a
    capitalised token such as "The" is kept when ``stopwords`` only holds
    lowercase entries -- confirm whether that is intended.
    """
    kept_tokens = [token for token in word_tokenize(text) if token not in stopwords]
    return ' '.join(kept_tokens)
    
# this removes punctuation
def remove_punctuations(text, punc):
    """Delete every character listed in ``punc`` from ``text``.

    After the deletion, any run of two or more whitespace characters is
    collapsed into a single space. Leading and trailing whitespace is not
    stripped, and single whitespace characters are left as they are.
    """
    subbed = text.translate(str.maketrans('', '', punc))
    # Raw string: '\s' in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return re.sub(r'\s{2,}', ' ', subbed)

# this extracts the pos tag for each token in the cleaned words
nlp = en_core_web_sm.load()
def pos_extraction(text):
    """Run the loaded ``nlp`` pipeline on ``text`` and split out its tags.

    Returns a tuple of three parallel lists: the token objects yielded by the
    parsed document (objects, not plain strings), each token's ``pos_`` value,
    and each token's ``tag_`` value.
    """
    doc = nlp(text)
    tokens = list(doc)
    pos = [token.pos_ for token in doc]
    tag = [token.tag_ for token in doc]
    return tokens, pos, tag

In [None]:
# Keep only the content lines that mention a tracked keyword and still contain
# a digit after years are removed.
relevant_text = lines_with_digits(data)
# Clean each retained line (years, stop words, punctuation) for POS tagging.
processed = [
    preprocess(line, final_stopwords, final_punc)
    for line in relevant_text
]