In [79]:
######################################### IMPORTING PACAKGES #############################
from scipy import spatial
import pandas as pd
import os
import json
import numpy as np
import string

import warnings
warnings.filterwarnings("ignore")


import sys  
import os
from dateutil.parser import parse


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


# PDF text extraction
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Others
import requests
import string
import re
from pprint import pprint
from tqdm.notebook import tqdm
import io

import nltk
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')

import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm", disable=['ner'])


# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)

DATA_FOLDER = "dataset/"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sieji\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sieji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [80]:
def extract_pdf(file_path):
    """
    Process raw PDF text to structured and processed PDF text to be worked on in Python.
    Parameters
    ----------
    file_path : Relative Location of File
    Return
    ------
    text : str
        processed PDF text if no error is throw
    """   

    try:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()

        converter = TextConverter(resource_manager, fake_file_handle, codec=codec, laparams=laparams)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        content = []

        with open(file_path, 'rb') as file:
            for page in PDFPage.get_pages(file,
                                        pagenos, 
                                        maxpages=maxpages,
                                        password=password,
                                        caching=True,
                                        check_extractable=False):

                page_interpreter.process_page(page)

                content.append(fake_file_handle.getvalue())

                fake_file_handle.truncate(0)
                fake_file_handle.seek(0)        

        text = '##PAGE_BREAK##'.join(content)

        # close open handles
        converter.close()
        fake_file_handle.close()
        
        return text

    except Exception as e:
        print(e)

        # close open handles
        converter.close()
        fake_file_handle.close()

        return ""

In [103]:
# nlp preprocessing
def preprocess_lines(line_input):
    """
    Helper Function to preprocess and clean sentences from raw PDF text 
    Parameters
    ----------
    line_input : str
        String that contains a sentence to be cleaned
    Return
    ------
    line : str
        Cleaned sentence
    ----------
    Sub: Substitute regular expression
    Split: Remove blank space from front and rear 
    """  
    # removing header number
    line = re.sub(r'^\s?\d+(.*)$', r'\1', line_input)
    # removing trailing spaces
    line = line.strip()
    # words may be split between lines, ensure we link them back together
    line = re.sub(r'\s?-\s?', '-', line)
    # remove space prior to punctuation
    line = re.sub(r'\s?([,:;\.])', r'\1', line)
    # ESG contains a lot of figures that are not relevant to grammatical structure
    line = re.sub(r'\d{5,}', r' ', line)
    # remove emails
    line = re.sub(r'\S*@\S*\s?', '', line)
    # remove mentions of URLs
    line = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', r' ', line)
    # remove multiple spaces
    line = re.sub(r'\s+', ' ', line)

    # # remove punctuation
    # line = re.sub(r'[,\.!?]', '', line)
    # # convert to lowercase
    # line = line.lower()

    
    # join next line with space
    line = re.sub(r' \n', ' ', line)
    line = re.sub(r'.\n', '. ', line)
    line = re.sub(r'\x0c', ' ', line)
    return line

In [91]:
def remove_non_ascii(text):
    """
    Helper Function to remove non ascii characters from text
    Printable will 
    """
    printable = set(string.printable) #Convert iterable to set
    return ''.join(filter(lambda x: x in printable, text))

def not_header(line):
    """
    Helper Function to remove headers
    Check if all the characters are in upper case
    """
    return not line.isupper()

In [93]:
def extract_pages_sentences(nlp, text):    
    """
    Extracting text from raw PDF text and store them by pages and senteces. Raw text is also cleand by removing junk, URLs, etc.
    Consecutive lines are also grouped into paragraphs and spacy is used to parse sentences.
    Parameters
    ----------
    nlp: spacy nlp model
        NLP model to parse sentences
    text : str
        Raw PDF text
    Return
    ------
    pages_content : list of str
        A list containing text from each page of the PDF report. Page number is the index of list + 1
    
    pages_sentences : list of list
        A list containing lists. Page number is the index of outer list + 1. Inner list contains sentences from each page
 
    """  
    MIN_WORDS_PER_PAGE = 500
    
    pages = text.split('##PAGE_BREAK##')
    #print('Number of Pages: {}'.format(len(pages)))

    lines = []
    for i in range(len(pages)):
        page_number = i + 1
        page = pages[i]
        
        # remove non ASCII characters
        text = remove_non_ascii(page)
        
        # if len(text.split(' ')) < MIN_WORDS_PER_PAGE:
        #     print(f'Skipped Page: {page_number}')
        #     continue
        
        prev = ""
        for line in text.split('\n\n'):
            # aggregate consecutive lines where text may be broken down
            # only if next line starts with a space or previous does not end with dot.
            if(line.startswith(' ') or not prev.endswith('.')):
                prev = prev + ' ' + line
            else:
                # new paragraph
                lines.append(prev)
                prev = line

        # don't forget left-over paragraph
        lines.append(prev)
        lines.append('##SAME_PAGE##')
        
    lines = '  '.join(lines).split('##SAME_PAGE##')
    
    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner
    
    pages_content = []
    pages_sentences = []

    for line in lines[:-1]: # looping through each page
        
        line = preprocess_lines(line)       
        pages_content.append(str(line).strip())

        sentences = []
        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append(str(part).strip())

        #sentences += nltk.sent_tokenize(line)
            
        # Only interested in full sentences and sentences with 10 to 100 words. --> filter out first page/content page
        sentences = [s for s in sentences if re.match('^[A-Z][^?!.]*[?.!]$', s) is not None]
        sentences = [s.replace('\n', ' ') for s in sentences]
        
        pages_sentences.append(sentences)
        
    return pages_content, pages_sentences #list, list of list where page is index of outer list

In [119]:
pages_content, pages_sentences = extract_pages_sentences(spacy.load("en_core_web_sm"),extract_pdf("dataset_jx/NYSE_RDS_2021.pdf"))

In [120]:
pages_sentences

[[],
 [],
 ['ABOUT THIS REPORT The Shell plc Annual Report (this Report) serves as the Annual Report and Accounts in accordance with UK requirements for the year ended December 31, 2021, for Shell plc (the Company) and its subsidiaries (collectively referred to as Shell).',
  'This Report presents the Consolidated Financial Statements of Shell (page 228), the Parent Company Financial Statements of Shell (pages 306-313) and the Financial Statements ofthe Royal Dutch Shell Dividend Access Trust (pages 317-319).',
  'Except forthese Financial Statements, the numbers presented throughout this Report may not sum precisely to the totals provided and percentages may notprecisely reflect the absolute figures due to rounding.',
  'The Financial Statements contained in this Report have been prepared in accordance with International Financial Reporting Standards (IFRS) as issued by the International Accounting Standards Board (IASB).',
  'IFRSas defined above includes interpretations issued by th

In [158]:
def lemmatization(text_list, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    # lemmatize text in sentence
    """https://spacy.io/api/annotation"""
    texts_out = []
    for texts in text_list:
        texts = texts.lower()
        texts_out.append(" ".join([token.lemma_ for token in nlp(texts)]))
    return texts_out

In [159]:
def filter_report_highLevel(report):
    """
    Page filter to filter report for only relevant pages with decarbonisation related words.
    Two types of word filters: direct and indirect. Direct contains words that are directly related to decarbonisation while indirect contains other relevant decarbonisation information.
    
    Parameters
    ----------
    report: list of str
        A list containing text from each page of the PDF report. Page number is the index of list + 1
    Return
    ------
    filtered_report_direct : dict of {int : str}
        A dictionary that contains relevant pages obtained using direct filter. The key is the page number and value is the text on the page. 
    
    filtered_report_indirect : dict of {int : str}
        A dictionary that contains relevant pages obtained using indirect filter. The key is the page number and value is the text on the page.     
    """  
    
    # list of words used to filter
    relevant_terms_directFilter = set(["carbon","co2","environment","GHG emissions","Greenhouse Gas","carbon footprint","carbon emissions","Scope 1","Scope 2",
                               "Scope 3", "WACI","Carbon Intensity","carbon pricing","net-zero","metrics and targets","TCFD",
                                "sustainability goals","decarbonisation","climate",'energy', 'emission', 'emissions', 'renewable', 'carbon', 'fuel', 'power', 
                               'green', 'gas', 'green energy', 'sustainable', 'climate', 'sustainability', 'environmental', 'environment', 'GHG', 
                               'decarbon', 'energy consumption', 'paper consumption','water consumption', 'carbon intensity', 'waste management', 'electricity consumption', 
                                'cdp', 'global warming', 'business travel','climate solutions', 'decarbonization', 'cvar', 'climate value-at-risk','waste output'])
    relevant_terms_combinationA = ["emissions","exposure","carbon related","esg","sustainable","green","climate sensitive","impact investing", "investment framework", 'msci', 'ftse', 'responsible investing', 'responsible investment','transition']
    relevant_terms_combinationB = ["portfolio","assets","AUM","investment","financing","ratings","revenue","bond","goal","insurance", "equity", "swap", "option", "portfolio holdings", "risk management",'financial products']
    relevant_terms_combinationC = ["net zero","carbon footprint","CO2","carbon","oil","coal", "gas", "fossil fuel","green"]
    relevant_terms_combination_directFilter_lem = lemmatization(relevant_terms_directFilter)
    relevant_terms_combinationA_lem = lemmatization(relevant_terms_combinationA)
    relevant_terms_combinationB_lem = lemmatization(relevant_terms_combinationB)
    relevant_terms_combinationC_lem = lemmatization(relevant_terms_combinationC)
    
    
    filtered_report_direct = {}
    filtered_report_indirect = {}
    for i in range(len(report)):
        page = report[i]
        page_number = i + 1
        no_words = len(page.split(" "))
        
        # filter for pages that contain at least 3 words from the relevant_terms_combination_directFilter_lem list
        if sum(map(page.__contains__, relevant_terms_combination_directFilter_lem)) > 2:
            filtered_report_direct[page_number] = page
        
        # filter for pages that contain at least 1 word (relevant_terms_combinationC_lem AND relevant_terms_combinationA_lem) OR (relevant_terms_combinationC_lem AND  relevant_terms_combinationB_lem)
        elif (any(map(page.__contains__, relevant_terms_combinationA_lem)) and any(map(page.__contains__, relevant_terms_combinationC_lem))) or (any(map(page.__contains__, relevant_terms_combinationB_lem)) and any(map(page.__contains__, relevant_terms_combinationC_lem))):
            filtered_report_indirect[page_number] = page
    
    return filtered_report_direct,filtered_report_indirect

In [160]:
filtered_report_direct, filtered_report_indirect = filter_report_highLevel(pages_content)


In [None]:
filtered_report_direct

In [161]:
list1 = list(filtered_report_direct.values())
list1 = [d.split() for d in list1]

In [162]:
#Creating dictionary
id2word = corpora.Dictionary(list1)

In [163]:
#Creating corpus
texts = list1
corpus = [id2word.doc2bow(text) for text in texts]

In [164]:
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 3), (9, 1), (10, 1), (11, 1), (12, 3), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]


In [165]:
##LDA model training
from pprint import pprint

#number of topics
num_topics = 10

#building LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

#Print keyword in the 10 topics
pprint(lda_model.print_topic())



[(0,
  '0.035*"the" + 0.033*"and" + 0.031*"of" + 0.017*"to" + 0.016*"in" + '
  '0.010*"a" + 0.008*"for" + 0.007*"with" + 0.007*"on" + 0.006*"is"'),
 (1,
  '0.034*"and" + 0.019*"of" + 0.017*"the" + 0.015*"to" + 0.015*"in" + '
  '0.012*"a" + 0.008*"are" + 0.006*"our" + 0.006*"from" + 0.005*"on"'),
 (2,
  '0.030*"and" + 0.023*"to" + 0.023*"the" + 0.020*"of" + 0.012*"in" + '
  '0.009*"our" + 0.007*"a" + 0.007*"on" + 0.006*"with" + 0.006*"for"'),
 (3,
  '0.050*"the" + 0.034*"and" + 0.034*"of" + 0.022*"to" + 0.017*"in" + '
  '0.011*"a" + 0.011*"our" + 0.009*"on" + 0.006*"for" + 0.006*"from"'),
 (4,
  '0.026*"and" + 0.023*"the" + 0.015*"of" + 0.015*"in" + 0.014*"to" + '
  '0.007*"a" + 0.007*"on" + 0.007*"for" + 0.005*"are" + 0.005*"from"'),
 (5,
  '0.031*"the" + 0.028*"and" + 0.023*"of" + 0.021*"to" + 0.019*"in" + '
  '0.012*"a" + 0.011*"our" + 0.008*"on" + 0.008*"for" + 0.007*"are"'),
 (6,
  '0.052*"the" + 0.050*"and" + 0.031*"of" + 0.026*"to" + 0.020*"in" + '
  '0.010*"a" + 0.010*"for" + 0.