In [1]:
import os
import pandas as pd
import nest_asyncio
import sys
import glob
from llama_parse import LlamaParse
from uniflow.flow.client import TransformClient
from uniflow.flow.config import TransformOpenAIConfig
from uniflow.flow.config import OpenAIModelConfig
from uniflow.op.prompt import PromptTemplate, Context
from dotenv import load_dotenv


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set APIs
# Credentials are read from the environment (optionally populated from a local,
# git-ignored .env file) rather than being embedded in the notebook.
# NOTE: any key that was ever stored in this cell must be treated as leaked and
# revoked/rotated with the provider.
load_dotenv()

openAI_API = os.environ.get("OPENAI_API_KEY")
llama_API = os.environ.get("LLAMA_CLOUD_API_KEY")

# Fail here with a clear message instead of later inside an API client.
if not openAI_API or not llama_API:
    raise RuntimeError(
        "Set OPENAI_API_KEY and LLAMA_CLOUD_API_KEY in the environment or in a .env file."
    )

In [3]:
# Locate the input PDF and prepare the output folder.
# Both lookups are relative to the directory the notebook is run from.
sys.path.extend([".", ".."])

dir_cur = os.getcwd()
pdf_file = "test.pdf"
input_file = os.path.join(f"{dir_cur}/data/", pdf_file)

# File name including its extension; later cells reuse it to name the outputs.
base_name = os.path.basename(input_file)

# Create the outputs directory if it is not there yet.
os.makedirs("outputs", exist_ok=True)


In [4]:
# Transform PDF
# nest_asyncio is applied before the parser is used inside the notebook.
nest_asyncio.apply()

# Hand the LlamaCloud key to the parser through its environment variable.
os.environ["LLAMA_CLOUD_API_KEY"] = llama_API

# Parse the input file, requesting markdown results.
pdf_parser = LlamaParse(result_type="markdown")
documents = pdf_parser.load_data(input_file)

Started parsing the file under job_id 28c07949-9819-45aa-b299-dfab31270024


In [5]:
# Merge the text of every parsed document into one string,
# separating documents with a blank line.
all_text = [doc.text for doc in documents]
merged_doc = '\n\n'.join(all_text)

# Save the merged text as a UTF-8 .txt file under outputs/
txt_output_path = os.path.join(dir_cur, 'outputs', base_name + '.txt')
with open(txt_output_path, mode='w', encoding='utf-8') as txt_file:
    txt_file.write(merged_doc)

In [6]:
# Prompts and few-shot examples for the two LLM passes.

# Pass 1: pick out ESG-related text and tables.
identify_example = Context(
    context="!!!PROMOTE!!!The company reported a total of 10,000 tons of Scope 1 emissions and aims to reduce this by 20% over the next five years.."
)
identify_prompt = PromptTemplate(
    instruction="""Extract and directly copy any text-based content or tables specifically containing ESG information that could be used for a data analysis. Focus on capturing content that is comprehensive.
    """,
    few_shot_prompt=[identify_example],
)

# Pass 2: turn the extracted content into label/context/year/metrics/value entries.
# NOTE(review): the instruction asks for a 'context' field, but the example
# below does not set one -- confirm whether that is intended.
standardize_example = Context(
    label="Greenhouse Gas Emissions",
    year="2020",
    metrics="Scope 1 (tCO2e)",
    value=10000,
)
standardize_prompt = PromptTemplate(
    instruction="""Standardize the ESG contents or tables into a structured data frame. Ensure each entry includes: 'label' (ESG metric name), 'context' (brief excerpt from report), 'year' (year of data), 'metrics' (unit of measurement), and 'value' (numerical value). Return the standardized data frame for analysis.
    """,
    few_shot_prompt=[standardize_example],
)

In [7]:
# Set AI config
def _json_mode_config(prompt_template, model_name):
    """Build a transform config for `model_name` with response_format set to json_object."""
    return TransformOpenAIConfig(
        prompt_template=prompt_template,
        model_config=OpenAIModelConfig(
            model_name=model_name,
            response_format={"type": "json_object"},
        ),
    )


# Each pass gets its own prompt and model.
identify_config = _json_mode_config(identify_prompt, 'gpt-4o-mini')
standardize_config = _json_mode_config(standardize_prompt, 'gpt-4o-2024-08-06')

# Export the OpenAI key before the clients are created.
load_dotenv()
os.environ["OPENAI_API_KEY"] = openAI_API

identify_client = TransformClient(identify_config)
standardize_client = TransformClient(standardize_config)

In [8]:
# First pass: run the identify client on each parsed document and keep the
# raw result in a dictionary keyed by the document's position.
ESG_contents = {}

for doc_no, parsed_doc in enumerate(documents):
    page_input = [Context(context=parsed_doc.text)]
    ESG_contents[doc_no] = identify_client.run(page_input)

100%|██████████| 1/1 [00:08<00:00,  8.96s/it]


In [9]:
# Restructure the extracted ESG contents as a flat list
def extract_esg_contents(esg_contents):
    """Flatten the per-key client results into one list of response items.

    Args:
        esg_contents: dict mapping a key to a list of result dicts. Each result
            dict may carry an 'output' list whose dict entries may carry a
            'response' list.

    Returns:
        list: every element of every 'response' list, in iteration order.
        Extraction is best-effort: a malformed entry is reported with print()
        and skipped, so it does not discard the entries that follow it. A
        'response' that is not a list is kept as a single item.
    """
    extracted_responses = []

    def _report(key, problem):
        # Same message prefix as before so existing log scanning keeps working.
        print(f"Error extracting response content: key {key!r}: {problem}")

    if not isinstance(esg_contents, dict):
        print(
            "Error extracting response content: "
            f"expected a dict, got {type(esg_contents).__name__}"
        )
        return extracted_responses

    for key, items in esg_contents.items():
        if not isinstance(items, (list, tuple)):
            _report(key, f"expected a list of results, got {type(items).__name__}")
            continue

        for item in items:
            if not isinstance(item, dict):
                _report(key, f"skipping non-dict result {type(item).__name__}")
                continue

            # `or []` also covers an explicit None stored under the key.
            output_list = item.get('output') or []
            if not isinstance(output_list, (list, tuple)):
                _report(key, f"'output' is {type(output_list).__name__}, not a list")
                continue

            for output_item in output_list:
                if not isinstance(output_item, dict):
                    _report(key, f"skipping non-dict output {type(output_item).__name__}")
                    continue

                response_list = output_item.get('response') or []
                if isinstance(response_list, (list, tuple)):
                    extracted_responses.extend(response_list)
                else:
                    # A bare dict/str response: keep it whole instead of
                    # iterating over its keys or characters.
                    extracted_responses.append(response_list)

    return extracted_responses

# Flatten the first-pass results into a plain list of response items.
extracted_contents = extract_esg_contents(ESG_contents)

In [10]:
# Check the input list used in the next AI step.
# Prints the complete list as plain text.
print(extracted_contents)

[{'ESG_Data': {'Energy_Consumption': {'Total_energy_consumption': {'2023': 418221, '2022': 401076, '2021': 419941, '%_change': 4.3}, 'Diesel_consumption': {'2023': 219, '2022': 586, '2021': 643, '%_change': -62.6}, 'Company_fleet_mileage': {'2023': 1842, '2022': 1650, '2021': 2659, '%_change': 11.6}, 'Total_electricity_consumption': {'2023': 115600, '2022': 110789, '2021': 115733, '%_change': 4.3}, 'Electricity_intensity': {'2023': 0.021, '2022': 0.02, '2021': 0.022, '%_change': 5.0}}, 'Emissions': {'Total_emissions': {'2023': 27497, '2022': 65488, '2021': 68151, '%_change': -58.0}, 'Scope_1': {'2023': 147, '2022': 160, '2021': 237, '%_change': -8.1}, 'Scope_2': {'2023': 68334, '2022': 63811, '2021': 67636, '%_change': 7.1}, 'Scope_3': {'2023': 3849, '2022': 1517, '2021': 278, '%_change': 153.7}, 'Emission_intensity_ratios': {'Scope_2': {'2023': 0.0123, '2022': 0.0115, '2021': None, '%_change': 7.0}}}, 'Water': {'Total_water_consumption': {'2023': 433969, '2022': 400322, '2021': 407051

In [11]:
# Second pass: run the standardize client on each extracted item and store the
# JSON output in a dictionary keyed by the item's position.
output = {}

for item_no, esg_item in enumerate(extracted_contents):
    standardize_input = [Context(context=esg_item)]
    output[item_no] = standardize_client.run(standardize_input)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:29<00:00, 29.73s/it]


In [12]:
# Inspect the raw second-pass results (the `output` dict, keyed by item position).
print(output)

{0: [{'output': [{'response': [{'data': [{'label': 'Total Energy Consumption', 'context': 'Total energy consumption data showing measurements across multiple years.', 'year': '2023', 'metrics': 'kWh', 'value': 418221}, {'label': 'Total Energy Consumption', 'context': 'Total energy consumption data showing measurements across multiple years.', 'year': '2022', 'metrics': 'kWh', 'value': 401076}, {'label': 'Total Energy Consumption', 'context': 'Total energy consumption data showing measurements across multiple years.', 'year': '2021', 'metrics': 'kWh', 'value': 419941}, {'label': 'Diesel Consumption', 'context': 'Diesel consumption data over several years presented.', 'year': '2023', 'metrics': 'liters', 'value': 219}, {'label': 'Diesel Consumption', 'context': 'Diesel consumption data over several years presented.', 'year': '2022', 'metrics': 'liters', 'value': 586}, {'label': 'Diesel Consumption', 'context': 'Diesel consumption data over several years presented.', 'year': '2021', 'metr

In [13]:
# Transform the JSON output into a DataFrame

def _as_list(candidate):
    """Return `candidate` if it is a list or tuple, otherwise an empty list."""
    return candidate if isinstance(candidate, (list, tuple)) else []


def _standardized_records(raw_output, fields=('label', 'year', 'metrics', 'value')):
    """Collect one record per complete entry found in the second-pass output.

    Args:
        raw_output: dict whose values are lists of result dicts; each result
            holds an 'output' list of dicts, each with a 'response' list of
            dicts whose list-valued keys contain the entries.
        fields: keys an entry must have to be kept; only these are copied.

    Returns:
        list of dicts, one per entry that is a dict containing every field.
        Anything with an unexpected shape (None, strings, non-dict entries)
        is skipped instead of raising.
    """
    records = []
    for results in raw_output.values():
        for result in _as_list(results):
            if not isinstance(result, dict):
                continue
            for out_item in _as_list(result.get('output')):
                if not isinstance(out_item, dict):
                    continue
                for response in _as_list(out_item.get('response')):
                    if not isinstance(response, dict):
                        continue
                    # The entries sit under a model-chosen key, so scan every
                    # list-valued key of the response.
                    for entries in response.values():
                        for entry in _as_list(entries):
                            if isinstance(entry, dict) and all(f in entry for f in fields):
                                records.append({f: entry[f] for f in fields})
    return records


# Build the frame once from the collected records; explicit columns keep the
# expected schema even when nothing was extracted.
df = pd.DataFrame(
    _standardized_records(output),
    columns=['label', 'year', 'metrics', 'value'],
)

In [14]:
# Display options: no column-width limit, display width of 1000
pd.options.display.max_colwidth = None
pd.options.display.width = 1000

# Show the dataframe
df

Unnamed: 0,label,year,metrics,value
0,Total Energy Consumption,2023,kWh,418221.0
1,Total Energy Consumption,2022,kWh,401076.0
2,Total Energy Consumption,2021,kWh,419941.0
3,Diesel Consumption,2023,liters,219.0
4,Diesel Consumption,2022,liters,586.0
5,Diesel Consumption,2021,liters,643.0
6,Company Fleet Mileage,2023,miles,1842.0
7,Company Fleet Mileage,2022,miles,1650.0
8,Company Fleet Mileage,2021,miles,2659.0
9,Total Electricity Consumption,2023,kWh,115600.0


In [15]:
# Save the DataFrame as an Excel file under outputs/, named after the input file
excel_output_path = os.path.join(dir_cur, 'outputs', base_name + '.xlsx')
df.to_excel(excel_output_path)