In [1]:
from scipy import spatial
import seaborn as sns
import pandas as pd
import os
import json

In [2]:
# Directory holding the preprocessed sustainability-report JSON files.
data = "../data/preprocessed_sustainability_reports"
# os.listdir returns entries in arbitrary order; sort them so everything derived
# from all_json (sentence order, seeded sampling) is reproducible across machines.
all_files = sorted(os.listdir(data))
# Maps file name -> parsed JSON content, for the "vfinal" files only.
all_json = {}
for file in all_files:
    if "vfinal" in file:
        # Be explicit about the encoding so non-ASCII report text does not depend
        # on the platform default (assumes the files are UTF-8 JSON — TODO confirm).
        with open(os.path.join(data, file), encoding="utf-8") as inputfile:
            all_json[file] = json.load(inputfile)

## Preprocess Data for BERT Embeddings
1. remove numbers --> SCRAPPED, BECAUSE THE WHOLE POINT IS TO BE ABLE TO CAPTURE NUMBERS
2. remove punctuation except %, $, &
- as numbers & punctuation are encoded differently
- upper/lower casing doesn't affect embeddings

JSON fields used:
1. use the *filtered_report_pages_direct* & *filtered_report_pages_indirect* keys to get the page numbers of the relevant pages, then get the unprocessed sentences for those pages from *report_sentences*

In [None]:
def remove_numbers(string):
    """Return ``string`` with every character for which ``str.isdigit()`` is true removed."""
    kept_chars = [char for char in string if not char.isdigit()]
    return ''.join(kept_chars)

def remove_punc(s):
    """Strip ASCII punctuation from ``s`` while keeping ``%``, ``$`` and ``&``.

    Only characters listed in ``string.punctuation`` are candidates for removal;
    every other character (letters, digits, whitespace, non-ASCII symbols) is kept.
    """
    import string

    kept_symbols = ('%', '$', '&')
    to_strip = [mark for mark in string.punctuation if mark not in kept_symbols]
    cleaned = []
    for ch in s:
        if ch in to_strip:
            continue
        cleaned.append(ch)
    return ''.join(cleaned)


In [None]:
def _clean_pages(fi, pages_key):
    """Return {page number (int): punctuation-stripped sentences} for the pages listed under ``pages_key``.

    The keys of ``fi[pages_key]`` are converted to int and used as 1-based page
    numbers to index ``fi["report_sentences"]``. Numbers are deliberately kept
    in the sentences; only ``remove_punc`` is applied.
    """
    cleaned = {}
    for page in fi[pages_key].keys():
        page = int(page)
        cleaned[page] = [remove_punc(sentence) for sentence in fi["report_sentences"][page - 1]]
    return cleaned


# Maps file name -> list of per-report dicts holding the identifiers and the
# cleaned sentences of the direct / indirect filtered pages.
bert_json = {}
# The loop variable is named `reports` (not `json`) so the imported `json`
# module is not shadowed and the loading cell stays re-runnable.
for json_name, reports in all_json.items():
    fi_list = []
    for fi in reports:
        fi_list.append({
            "company": fi["company"],  # identifier
            "year": fi["year"],  # identifier
            "filtered_report_pages_direct_bert": _clean_pages(fi, "filtered_report_pages_direct"),
            "filtered_report_pages_indirect_bert": _clean_pages(fi, "filtered_report_pages_indirect"),
        })
    bert_json[json_name] = fi_list
    
    


### BERT embeddings

Comparing text similarity:
- BERT embeddings + cosine similarity performs best


Reference : https://medium.com/@adriensieg/text-similarities-da019229c894


In [None]:
# Colab-only magic: select TensorFlow 1.x (presumably because bert-serving-server needs TF 1.x — confirm).
%tensorflow_version 1.x

# Install the bert-serving client and the server (with its "http" extra).
!pip install bert-serving-client
!pip install -U bert-serving-server[http]

# Download and unpack the uncased_L-12_H-768_A-12 BERT model archive.
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip
# Start the server in the background; its stdout and stderr are redirected to out.file.
!nohup bert-serving-start -model_dir=./uncased_L-12_H-768_A-12 > out.file 2>&1 &


!ls  # you should see uncased_something_.zip



In [None]:
from bert_serving.client import BertClient
# Client for the bert-serving server started in the setup cell.
bc = BertClient(check_length=False)

# Hand-picked reference sentences; report sentences are later compared against
# the embeddings of these. Every entry must end with a comma: adjacent string
# literals are silently concatenated, which previously fused two pairs of
# sentences into single entries.
relevant_sentences = ['In 2019, Citi financed $74 million of subordinate lien bonds that were certified green, given the projects environmental aspects.',
             'In addition, our cogeneration plant, fueled by natural gas, will produce heat and electricity on-site, reducing the building\'s carbon footprint by 34 percent.',
             'These efforts reduced energy consumption by more than 2,100 metric tons (mt) of carbon dioxide equivalents (CO2e) during the one-year challenge.',
             'The companies in our equity portfolio emitted around 133 tonnes of CO2 -equivalents for every million US dollars of revenue.',
             'The equity portfolio’s carbon intensity was 9 percent below that of the benchmark index.',
             'A total of 106 companies that produce certain types of weapon, tobacco or coal, or use coal for power production, are currently excluded from the fund',
             'For public and private assets, excluding cash and non-equity derivatives as they were not reported in 2019, our year-overyear portfolio weighted average carbon intensity was reduced by approximately 23%.',
             'Having met these targets, we have set new, more ambitious ones: to reduce the Fund’s emissions intensity by 40% and fossil fuel reserves by 80% by 2025.',
             'The carbon footprint of the non-listed companies was 0.6 tCO₂e per million SEK invested',
             'Energy consumption and carbon emissions per unit area were 149 kWh/ m² and 0.037 tCO₂e/m², which means a reduction of 9 per cent and 12 per cent, respectively.',
             'The carbon intensity (CO2 equivalent tons per million yen of sales) of GPIF’s equity and corporate bond portfolio decreased by 15.3%, from 2.29 tons to 1.94 tons, in the space of a year.',
             'Based on our percentage holdings in each company, the total emissions of the equity portfolio were 108 million tonnes of CO2 - equivalents in 2019.',
             'The carbon footprint of the companies in our equity portfolio',
             'The companies in our equity portfolio emitted around 156 tonnes of CO2 -equivalents for every million US dollars (USD) of revenue.',
             'The carbon intensity of the companies in the equity portfolio and the benchmark index decreased by 16 and 17 percent respectively from 2018 to 2019.',
             'We are focused on supporting the goal of net zero greenhouse gas emissions by 2050, in line with global efforts to limit warming to 1.5°C. ',
             'quantitative target for ESG-themed investments and finance of ¥700 billion ',
             'Commit to reduce investment carbon footprint by',
             'esg investing', 'green bonds', 'Green Investment target', 'Achieve 100% renewable electricity by 2025'
             ]
# One embedding per reference sentence, in the same order as the list above.
relevant_sentences_embeddings = bc.encode(relevant_sentences)

print(relevant_sentences_embeddings)

In [None]:
def cosine_distance(s1,s2):
    """Return the cosine *similarity* of vectors ``s1`` and ``s2``.

    Despite the name, the result is ``1 - scipy cosine distance``, so identical
    directions give 1 and orthogonal vectors give 0.
    """
    distance = spatial.distance.cosine(s1, s2)
    similarity = 1 - distance
    return similarity

In [None]:
# Similarity cut-offs (tentative).
# NOTE(review): direct and indirect pages use different values — confirm this is intended.
DIRECT_SIMILARITY_THRESHOLD = 0.8
INDIRECT_SIMILARITY_THRESHOLD = 0.7357
# Only the first N reports of each file are processed; set to None to process all of them.
MAX_REPORTS_PER_FILE = 10


def _select_relevant_sentences(cleaned_pages, original_pages, threshold):
    """Keep the sentences whose embedding is close to at least one reference embedding.

    Parameters
    ----------
    cleaned_pages : dict
        {page number: list of cleaned sentences}; page numbers are 1-based ints.
    original_pages : list
        The report's ``report_sentences`` (one list of raw sentences per page),
        aligned index-for-index with the cleaned sentences.
    threshold : float
        Minimum cosine similarity for a sentence to be kept.

    Returns
    -------
    (dict, dict)
        ``{page number: [[cleaned sentence, similarity], ...]}`` and
        ``{page number: [original sentence, ...]}``; pages without any match
        are left out of both.

    Side effects: appends to the module-level ``all_filtered_sentences``,
    ``all_cosine_similarities`` and ``all_relevant_sentences`` lists.
    """
    kept = {}
    kept_original = {}
    for page_number, page in cleaned_pages.items():
        matches = []
        matches_original = []
        for sentence_index, sentence in enumerate(page):
            original_sentence = original_pages[page_number - 1][sentence_index]
            all_filtered_sentences.append(original_sentence)
            sentence_encoding = bc.encode([sentence])[0]
            for reference_embedding in relevant_sentences_embeddings:
                cosine_similarity = cosine_distance(sentence_encoding, reference_embedding)
                all_cosine_similarities.append(cosine_similarity)
                if cosine_similarity >= threshold:
                    matches.append([sentence, cosine_similarity])
                    matches_original.append(original_sentence)
                    all_relevant_sentences.append(original_sentence)
                    # One sufficiently similar reference is enough; stop comparing.
                    break
        # Only record pages that produced at least one match.
        if len(matches) != 0:
            kept[page_number] = matches
            kept_original[page_number] = matches_original
    return kept, kept_original


# Accumulators read by the threshold-analysis and labelling cells below. They are
# initialised here so the cell works on a fresh kernel and is safe to re-run.
all_filtered_sentences = []
all_cosine_similarities = []
all_relevant_sentences = []
# The loop variable is named `reports` (not `json`) so the imported `json` module is not shadowed.
for json_name, reports in bert_json.items():
    for fi_index, fi in enumerate(reports[:MAX_REPORTS_PER_FILE]):
        original_pages = all_json[json_name][fi_index]["report_sentences"]

        direct, direct_original = _select_relevant_sentences(
            fi["filtered_report_pages_direct_bert"], original_pages, DIRECT_SIMILARITY_THRESHOLD
        )
        # `fi` is the same dict object as bert_json[json_name][fi_index], so this updates bert_json in place.
        fi["bert_relevant_sentences_direct"] = direct
        fi["bert_relevant_sentences_direct_original"] = direct_original

        indirect, indirect_original = _select_relevant_sentences(
            fi["filtered_report_pages_indirect_bert"], original_pages, INDIRECT_SIMILARITY_THRESHOLD
        )
        fi["bert_relevant_sentences_indirect"] = indirect
        fi["bert_relevant_sentences_indirect_original"] = indirect_original

                

In [None]:
# Inspect the distribution of all computed similarities to help choose a threshold.
cos_similarity_df = pd.DataFrame(all_cosine_similarities, columns=["cos_similarity"])
sns.histplot(all_cosine_similarities)
cos_similarity_df.describe()

In [None]:
# Number of sentences scanned, then number of sentences kept as relevant.
for collected_sentences in (all_filtered_sentences, all_relevant_sentences):
    print(len(collected_sentences))

### Splitting Data for labelling

In [None]:
# Output directory for the labelling CSVs. It has its own name so the input
# directory stored in `data` by the loading cell is not overwritten.
TO_LABEL_DIR = "../data/to_label"
N_SPLITS = 5
SPLIT_SEED = 200  # random_state passed to every sample() call

os.makedirs(TO_LABEL_DIR, exist_ok=True)

all_relevant_sentences_df = pd.DataFrame(all_relevant_sentences,columns=["relevant_sentences"])
# Size of each of the first N_SPLITS - 1 splits; the last split takes the remainder.
n = all_relevant_sentences_df.shape[0]//N_SPLITS

splits = []
remaining = all_relevant_sentences_df
for _ in range(N_SPLITS - 1):
    # Draw n rows without replacement, then drop them so splits never overlap.
    split = remaining.sample(n=n,random_state=SPLIT_SEED)
    splits.append(split)
    remaining = remaining.drop(split.index)
    print(remaining.shape)
splits.append(remaining)

# Every relevant sentence must land in exactly one split.
assert sum(len(split) for split in splits) == len(all_relevant_sentences_df)

for split_number, split in enumerate(splits, start=1):
    split.to_csv("{}/split_{}.csv".format(TO_LABEL_DIR, split_number))

# Keep the original per-split names available for any later cells.
split_1, split_2, split_3, split_4, split_5 = splits
