In [1]:
import openai
import os

# Take the OpenAI key from the environment so it is never hard-coded in the notebook.
api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
openai.api_key = api_key

In [2]:
def call_gpt(prompt: str, model: str = "gpt-4-turbo", temperature: float = 0.7) -> tuple:
    """Send one annotation prompt to the chat-completions endpoint.

    Args:
        prompt: Full user prompt text (e.g. the output of ``create_prompt``).
        model: Chat model name; defaults to the value this notebook has always used.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        A ``(content, usage)`` tuple: ``content`` is the message content of the
        first choice and ``usage`` is the ``usage`` object of the API response
        (callers read its ``total_tokens`` / ``prompt_tokens`` /
        ``completion_tokens`` attributes).

    Raises:
        Whatever ``openai.chat.completions.create`` raises; errors are not
        caught or retried here.
    """
    messages = [
        {"role": "system", "content": "You are a professional who can annotate corporate esg report."},
        {"role": "user", "content": prompt},
    ]

    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )

    # Only the first choice is used; the request does not ask for more than one.
    return response.choices[0].message.content, response.usage

def create_prompt(statement_list: list) -> str:
    '''
    Build the annotation prompt for one batch of ESG-report statements.

    The prompt asks the model to label every statement with whether it pertains
    to Scope 3 emissions and how vague it is, and to reply with a JSON array
    holding one {"scope3": ..., "vague": ...} object per statement.

    Args:
        statement_list: statements to annotate; the list is embedded in the
            prompt using its Python ``str()`` representation.

    Returns:
        The complete prompt text as a single string.
    '''

    # Doubled braces ({{ }}) are literal braces for str.format; only
    # {statement_list} is substituted. The reply-format example uses matching
    # double quotes around every key so that it is itself valid JSON.
    prompt = (
        '''Below are statements from a corporate esg report.'''
        ''' Classify whether the below statements pertains to scope3 emission and how vague the statement is.'''
        ''' For Scope 3, label it Yes or No. For vagueness, score it if its specific, ambiguous or generic relating to ESG and notESG if it doesnt relate to ESG at all.'''
        ''' Reply only as a valid json array like this [{{"scope3":<answer>, "vague":<answer>}},..]'''
        ''' Here is an example Input:["These Scope 3 emissions, encompassing a range of activities from the procurement of goods and services to the use of our sold products, represent a significant portion of our overall carbon footprint.", "We have successfully reduced our water usage by 20% in the past year across all operational facilities."]'''
        ''' Response : [{{"scope3":"yes", "vague":"ambiguous"}}, {{"scope3":"no", "vague":"specific"}}] '''
        ''' Now do it for below array {statement_list} Response:'''
    )

    # Substituted values are not re-parsed by str.format, so braces inside the
    # statements themselves are safe.
    return prompt.format(statement_list=statement_list)

In [15]:
import pandas as pd
import os
from tqdm import tqdm
import json
import time


folder_path = '../parsed_docs/'
output_folder = '../annotated_docs/'
text_batch_size = 5
# Pause after each file; presumably to stay under API rate limits — TODO confirm.
pause_between_files_s = 60


def _parse_gpt_json(raw_reply):
    """Decode a model reply as JSON, tolerating a Markdown ```json code fence.

    Raises ValueError (json.JSONDecodeError is a subclass) when the reply is
    not a string or is still not valid JSON after the fence markers are removed.
    """
    if not isinstance(raw_reply, str):
        raise ValueError(f'GPT reply is not text: {raw_reply!r}')
    try:
        return json.loads(raw_reply)
    except ValueError:
        # Replies are sometimes wrapped in ```json ... ```; strip the fence and retry.
        return json.loads(raw_reply.replace('```json', '').replace('```', ''))


# Result files from earlier runs in the working directory mark a report as done.
gpt_calculated_files = [file_name for file_name in os.listdir() if 'results.csv' in file_name]

# Create the destination up front so a missing folder fails before any API spend.
os.makedirs(output_folder, exist_ok=True)

for file_name in sorted(os.listdir(folder_path)):
    print('Running file', file_name)
    if 'parsed.txt' not in file_name:
        # Output names are derived by replacing 'parsed.txt'; anything else is not an input.
        print('Skipping (not a parsed.txt file)', file_name)
        continue

    output_file_path = os.path.join(output_folder, file_name.replace('parsed.txt', 'baseline.csv'))
    # Skip when either an earlier results file or this cell's own output already exists,
    # so an interrupted run can be resumed without paying for finished reports again.
    if (file_name.replace('parsed.txt', 'results.csv') in gpt_calculated_files
            or os.path.exists(output_file_path)):
        print('Already calculated', file_name)
        continue

    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        # Drop empty lines; every remaining line is treated as one statement.
        data = [line for line in file if line != '\n']
    print('No of lines', len(data))

    text_batches = [data[i:i+text_batch_size] for i in range(0, len(data), text_batch_size)]
    # Each entry is the (content, usage) tuple returned by call_gpt.
    responses = [call_gpt(create_prompt(batch)) for batch in tqdm(text_batches)]

    # Token accounting across all calls made for this file.
    total = sum(usage.total_tokens for _, usage in responses)
    completion = sum(usage.completion_tokens for _, usage in responses)
    prompt = sum(usage.prompt_tokens for _, usage in responses)
    print(total, prompt, completion)

    all_responses = []
    for batch_no, ((reply_text, _), batch) in enumerate(zip(responses, text_batches)):
        try:
            parsed_json = _parse_gpt_json(reply_text)
        except ValueError as err:
            # Fail loudly: carrying on would reuse the previous batch's labels
            # (or hit an undefined name) and silently mislabel these statements.
            print('Error in json', reply_text)
            raise ValueError(
                f'Could not parse GPT reply for batch {batch_no} of {file_name}'
            ) from err

        if not isinstance(parsed_json, list):
            raise ValueError('GPT reply is not a json array', type(parsed_json).__name__)
        if len(parsed_json) != len(batch):
            raise ValueError('Length not same as batch_size', len(parsed_json), len(batch))
        all_responses.extend(parsed_json)

    # One row per statement, paired positionally with its annotation.
    final_df = pd.DataFrame(list(zip(data, all_responses)), columns=['text', 'gpt_responses'])
    final_df['scope3'] = final_df['gpt_responses'].apply(lambda x: x['scope3'])
    final_df['vague'] = final_df['gpt_responses'].apply(lambda x: x['vague'])

    final_df.to_csv(output_file_path, index=False)

    time.sleep(pause_between_files_s)

Running file <_io.TextIOWrapper name='reports/NASDAQ_BKNG_2022_parsed.txt' mode='r' encoding='UTF-8'>
Already calculated NASDAQ_BKNG_2022_parsed.txt
Running file <_io.TextIOWrapper name='reports/NASDAQ_BKNG_2022_parsed.txt' mode='r' encoding='UTF-8'>
No of lines 1076


100%|██████████| 216/216 [12:03<00:00,  3.35s/it]


98168 83390 14778
Running file <_io.TextIOWrapper name='reports/NYSE_DIS_2022_parsed.txt' mode='r' encoding='UTF-8'>
No of lines 948


100%|██████████| 190/190 [10:45<00:00,  3.40s/it]


86991 74091 12900
Running file <_io.TextIOWrapper name='reports/NYSE_UBER_2022_parsed.txt' mode='r' encoding='UTF-8'>
No of lines 1273


100%|██████████| 255/255 [14:54<00:00,  3.51s/it]


116396 99844 16552
Running file <_io.TextIOWrapper name='reports/NYSE_TTE_2022_parsed.txt' mode='r' encoding='UTF-8'>
No of lines 1157


100%|██████████| 232/232 [13:36<00:00,  3.52s/it]


103846 88559 15287
Running file <_io.TextIOWrapper name='reports/NYSE_MCD_2022_parsed.txt' mode='r' encoding='UTF-8'>
No of lines 1054


100%|██████████| 211/211 [12:09<00:00,  3.46s/it]


92034 78161 13873
Running file <_io.TextIOWrapper name='reports/NYSE_KO_2022_parsed.txt' mode='r' encoding='UTF-8'>
No of lines 2125


100%|██████████| 425/425 [24:29<00:00,  3.46s/it]


189360 160958 28402
Running file <_io.TextIOWrapper name='reports/NYSE_NKE_2022_parsed.txt' mode='r' encoding='UTF-8'>
No of lines 810


100%|██████████| 162/162 [09:26<00:00,  3.49s/it]


70740 59952 10788
Running file <_io.TextIOWrapper name='reports/NYSE_XOM_2022_parsed.txt' mode='r' encoding='UTF-8'>
No of lines 1068


100%|██████████| 214/214 [11:53<00:00,  3.33s/it]


98772 84368 14404


In [17]:
# Combine every per-report '*results.csv' in the working directory into a single
# CSV, tagging each row with the file it came from.
result_frames = []
# os.listdir() order is arbitrary; sort so the combined file is reproducible.
for results_file in sorted(os.listdir()):
    if 'results.csv' not in results_file:
        continue
    print(results_file)

    result_frames.append(pd.read_csv(results_file).assign(file_name=results_file))

if not result_frames:
    # pd.concat([]) would fail with an opaque message; say what is actually missing.
    raise ValueError("No '*results.csv' files found in " + os.getcwd())

dfs = pd.concat(result_frames)
dfs.to_csv('final_annotated_data.csv', index=False)

NYSE_DIS_2022_results.csv
NYSE_TTE_2022_results.csv
NYSE_MCD_2022_results.csv
NYSE_XOM_2022_results.csv
NASDAQ_BKNG_2022_results.csv
NYSE_NKE_2022_results.csv
NYSE_UBER_2022_results.csv
NYSE_PFE_2022_results.csv
NYSE_KO_2022_results.csv
NYSE_DE_2022_results.csv
