In [2]:
import os
from together import Together

# Together API client shared by the request helper(s) in this notebook.
# The key comes from the TOGETHER_API_KEY environment variable; os.environ.get
# yields None when that variable is not set.
client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))

In [3]:
def call_gpt(prompt: str, model: str = "mistralai/Mistral-7B-Instruct-v0.2", temperature: float = 0.7) -> tuple:
    '''
    Send one prompt to the chat-completions endpoint of the module-level `client`.

    Args:
        prompt: text sent as the user message, after a fixed system message
            that casts the model as an ESG-report annotator.
        model: model identifier forwarded to the API.
        temperature: sampling temperature forwarded to the API.

    Returns:
        A 2-tuple (content, usage): the message text of the first choice and
        the `usage` object of the response.
    '''
    messages = [{"role": "system", "content": "You are a professional who can annotate corporate esg report."}, 
                {"role": "user", "content":prompt}]

    response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature
            )

    # Two values are returned on purpose: callers index [0] for the text and [1] for usage.
    return response.choices[0].message.content, response.usage

def create_prompt(statement_list) -> str:
    '''
    Build the few-shot classification prompt for a statement from an ESG report.

    The prompt asks the model for two labels, returned as JSON:
    "scope3" (yes / no) and "vague" (specific, ambiguous, generic or notESG).

    Args:
        statement_list: the statement to classify. Whatever is passed is
            interpolated into the prompt with str.format, so a single string
            is inserted verbatim.

    Returns:
        The complete prompt text, ending with "Response:".
    '''

    # Literal braces of the JSON examples are doubled so str.format leaves them intact.
    prompt = (
        '''Below is a statement from a corporate esg report.'''
        ''' Classify whether the below statement pertains to scope3 emission and how vague the statement is.'''
        ''' For Scope 3, label it Yes or No. For vagueness, score it if its specific, ambiguous or generic relating to ESG and notESG if it doesnt relate to ESG at all.'''
        # The format example previously read 'vague" (mismatched quotes), i.e. invalid JSON.
        ''' Reply only as a valid json array like this {{"scope3":<answer>, "vague":<answer>}}. Reply only with json array and nothing else.'''
        ''' Here is an example '''
        ''' Input:"These Scope 3 emissions, encompassing a range of activities from the procurement of goods and services to the use of our sold products, represent a significant portion of our overall carbon footprint." '''
        ''' Response: {{"scope3":"yes", "vague":"ambiguous"}}'''
        ''' Input:"We have successfully reduced our water usage by 20% in the past year across all operational facilities."'''
        ''' Response : {{"scope3":"no", "vague":"specific"}} '''
        ''' Now do it for below statement Input:{statement_list}. Response:'''
    )

    return prompt.format(statement_list=statement_list)

In [11]:
import os
from tqdm import tqdm
import json
import time
import pandas as pd


folder_path = '../parsed_docs/'
output_folder = '../baseline_docs/'
text_batch_size = 5  # NOTE(review): not read anywhere in this cell; statements are sent one per request


def _parse_llm_json(raw_text):
    '''
    Best-effort conversion of a model reply into a dict.

    Tries the whole reply first, then the reply cut off right after its first
    closing brace (to drop trailing chatter). Returns None when neither
    attempt yields a JSON object.
    '''
    if not isinstance(raw_text, str):
        return None

    candidates = [raw_text]
    if '}' in raw_text:
        candidates.append(raw_text.split('}')[0] + '}')

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # Only objects are usable downstream (.get is called on every response).
        if isinstance(parsed, dict):
            return parsed
    return None


# Output files already present are treated as done, so an interrupted run can resume.
gpt_calculated_files = [file_name for file_name in os.listdir(output_folder) if 'baseline.csv' in file_name]

for file_name in os.listdir(folder_path):
    print('Running file', file_name)
    output_name = file_name.replace('parsed.txt', 'baseline.csv')
    if output_name in gpt_calculated_files:
        print('Already calculated', file_name)
        continue

    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r') as file:
        # Lines that are just a newline are dropped; every kept line is one statement.
        data = [line for line in file if line != '\n']
    print('No of lines', len(data))

    # One API call per statement; each entry is the (content, usage) tuple from call_gpt.
    responses = []
    for batch in tqdm(data):
        mod_prompt = create_prompt(batch)
        responses.append(call_gpt(mod_prompt))

    # Token accounting across all calls made for this file.
    total = sum(usage.total_tokens for _, usage in responses)
    completion = sum(usage.completion_tokens for _, usage in responses)
    prompt = sum(usage.prompt_tokens for _, usage in responses)
    print(total, prompt, completion)

    all_responses = []
    for response, statement in zip(responses, data):
        parsed_json = _parse_llm_json(response[0])
        if parsed_json is None:
            # Every unparseable reply gets empty labels. Previously a reply without
            # any '}' left parsed_json unassigned, so the row either raised NameError
            # or silently inherited the previous statement's labels.
            print('Error in json', response)
            parsed_json = {'scope3': '', 'vague': ''}

        if len(parsed_json) != 2:
            raise ValueError('LLM response does not have exactly 2 fields', len(parsed_json), statement)
        all_responses.append(parsed_json)

    final_df = pd.DataFrame(list(zip(data, all_responses)), columns=['text', 'llm_responses'])
    final_df['scope3'] = [labels.get('scope3', '') for labels in all_responses]
    final_df['vague'] = [labels.get('vague', '') for labels in all_responses]

    output_file_path = os.path.join(output_folder, output_name)

    final_df.to_csv(output_file_path, index=False)

Running file NASDAQ_BKNG_2022_parsed.txt
Already calculated NASDAQ_BKNG_2022_parsed.txt
Running file NYSE_DIS_2022_parsed.txt
Already calculated NYSE_DIS_2022_parsed.txt
Running file NYSE_UBER_2022_parsed.txt
Already calculated NYSE_UBER_2022_parsed.txt
Running file NYSE_TTE_2022_parsed.txt
Already calculated NYSE_TTE_2022_parsed.txt
Running file NYSE_MCD_2022_parsed.txt
Already calculated NYSE_MCD_2022_parsed.txt
Running file NYSE_KO_2022_parsed.txt
Already calculated NYSE_KO_2022_parsed.txt
Running file NYSE_NKE_2022_parsed.txt
Already calculated NYSE_NKE_2022_parsed.txt
Running file NYSE_XOM_2022_parsed.txt
Already calculated NYSE_XOM_2022_parsed.txt
Running file NYSE_PFE_2022_parsed.txt
No of lines 1068


100%|██████████| 1068/1068 [25:46<00:00,  1.45s/it]

341401 317102 24299





In [12]:
# Gather every per-report baseline CSV found in the current working directory,
# tag each frame with its source file and the model name, and write one combined CSV.
baseline_files = [name for name in os.listdir() if 'baseline.csv' in name]

dfs = []
for file in baseline_files:
    print(file)

    df = pd.read_csv(file).assign(
        file_name=file,
        model_name='Mistral-7B-Instruct-v0.2',
    )
    dfs.append(df)

# `dfs` is rebound from the list of frames to the concatenated frame.
dfs = pd.concat(dfs)
dfs.to_csv('baseline_data_mistral7b.csv', index=False)

NYSE_DE_baseline.csv
NYSE_XOM_2022_baseline.csv
NYSE_DIS_2022_baseline.csv
NASDAQ_BKNG_2022_baseline.csv
NYSE_KO_2022_baseline.csv
NYSE_TTE_2022_baseline.csv
NYSE_PFE_2022_baseline.csv
NYSE_NKE_2022_baseline.csv
NYSE_UBER_2022_baseline.csv
NYSE_MCD_2022_baseline.csv


In [14]:
# NOTE(review): an earlier cell of this notebook also defines `call_gpt`; this
# definition replaces it once the cell runs. Consider keeping a single definition.
def call_gpt(prompt: str, model: str = "mistralai/Mistral-7B-Instruct-v0.2", temperature: float = 0.7) -> tuple:
    '''
    Send one prompt to the chat-completions endpoint of the module-level `client`.

    Args:
        prompt: text sent as the user message, after a fixed system message
            that casts the model as an ESG-report annotator.
        model: model identifier forwarded to the API.
        temperature: sampling temperature forwarded to the API.

    Returns:
        A 2-tuple (content, usage): the message text of the first choice and
        the `usage` object of the response.
    '''
    messages = [{"role": "system", "content": "You are a professional who can annotate corporate esg report."}, 
                {"role": "user", "content":prompt}]

    response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature
            )

    # Two values are returned on purpose: callers index [0] for the text and [1] for usage.
    return response.choices[0].message.content, response.usage

def get_yes_no(statement: str) -> str:
    '''
    Compose the prompt that asks a model to normalise a noisy reply to yes / no / null.

    Despite the name, nothing is sent to a model here: the function only
    returns the prompt text with `statement` embedded in it.

    Args:
        statement: the raw text that should be mapped to yes, no or null.

    Returns:
        The prompt string, ending with "Response:".
    '''
    instructions = (
        'Below is a response from a decoder model. It is supposed to be only yes or no but returned answers were not correct sometimes.'
        ' Map the text to only two of these value and if its not possible return null. Return only one keyword and only yes or no or null'
    )
    return f'{instructions} Text {statement} Response:'

def get_vague(statement: str) -> str:
    '''
    Compose the prompt that asks a model to normalise a noisy vagueness label.

    The target labels are specific, ambiguous, generic and notESG (or null
    when no mapping is possible). Nothing is sent to a model here: the
    function only returns the prompt text with `statement` embedded in it.

    Args:
        statement: the raw text that should be mapped to one of the labels.

    Returns:
        The prompt string, ending with "Response:".
    '''

    # "ambiguous" must be spelled exactly like the label used by the labelling
    # prompt; the earlier spelling "ambigious" steered the model to a value that
    # would not match the canonical label.
    prompt = '''Below is a response from a decoder model. It is supposed to be only specific, ambiguous, generic or notESG but returned answers were not correct sometimes.''' \
    ''' Map the text to only 4 of these value and if its not possible return null. Return only one keyword.''' \
    ''' Text {statement} Response:'''

    return prompt.format(statement=statement)

In [15]:
# Load the combined labels from 'baseline_data_mistral7b.csv' (path is relative to the working directory).
data = pd.read_csv('baseline_data_mistral7b.csv')

In [16]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,text,llm_responses,scope3,vague,file_name,model_name
0,Business Impact Report Databook This Databook ...,"{'scope3': 'no', 'vague': 'specific'}",no,specific,NYSE_DE_baseline.csv,Mistral-7B-Instruct-v0.2
1,Supplemental Content This Supplemental Content...,"{'scope3': 'no', 'vague': 'generic'}",no,generic,NYSE_DE_baseline.csv,Mistral-7B-Instruct-v0.2
2,"Operating Return on Sales (OROS), and Sharehol...","{'scope3': 'no', 'vague': 'generic'}",no,generic,NYSE_DE_baseline.csv,Mistral-7B-Instruct-v0.2
3,For reconciliations of OROS to the most direct...,"{'scope3': 'no', 'vague': 'generic'}",no,generic,NYSE_DE_baseline.csv,Mistral-7B-Instruct-v0.2
4,"John Deere had an outstanding year in 2023, de...","{'scope3': 'no', 'vague': 'generic'}",no,generic,NYSE_DE_baseline.csv,Mistral-7B-Instruct-v0.2


In [17]:
# Count how often each distinct value occurs in the 'scope3' column.
data['scope3'].value_counts()

scope3
no                                                                                                                                                            7066
yes                                                                                                                                                            587
maybe                                                                                                                                                          158
generic                                                                                                                                                         44
ambiguous                                                                                                                                                       31
possibly                                                                                                                                                        13
not specific   

In [18]:
# Count how often each distinct value occurs in the 'vague' column.
data['vague'].value_counts()

vague
generic                                                                                                                             5490
specific                                                                                                                            1420
ambiguous                                                                                                                            645
notESG                                                                                                                                94
not relevant to ESG                                                                                                                   72
                                                                                                                                    ... 
specific (relates to global CO2 emissions, not directly to Scope 3)                                                                    1
specific (relating to TotalEnergies