In [2]:
import os
import pandas as pd
import nest_asyncio
import sys
from llama_parse import LlamaParse
from uniflow.flow.client import TransformClient
from uniflow.flow.config import TransformOpenAIConfig
from uniflow.flow.config import OpenAIModelConfig
from uniflow.op.prompt import PromptTemplate, Context
from dotenv import load_dotenv


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the .env file.
# The location can be overridden with the DOTENV_PATH environment variable so
# the notebook is not tied to one machine; the original path stays the default.
dotenv_path = os.getenv('DOTENV_PATH', 'D:/apikeys/.env')
load_dotenv(dotenv_path=dotenv_path)

# Retrieve the API keys from the environment
openAI_API = os.getenv('OPENAI_API_KEY')
llama_API = os.getenv('LLAMA_API_KEY')

# Fail here with a clear message instead of later, when a None key would be
# assigned into os.environ. Only the variable NAMES are reported -- never
# print or display the key values, since cell outputs are saved with the notebook.
missing_keys = [
    key_name
    for key_name, key_value in (
        ('OPENAI_API_KEY', openAI_API),
        ('LLAMA_API_KEY', llama_API),
    )
    if not key_value
]
if missing_keys:
    raise EnvironmentError(
        f"Missing API key(s) {missing_keys}; checked the environment and {dotenv_path!r}"
    )

[REDACTED: an OpenAI API key was printed in this output -- revoke and rotate it]
[REDACTED: a LlamaCloud API key was printed in this output -- revoke and rotate it]


In [4]:
# Make the working directory and its parent importable.
# Guarded so that re-running the cell does not keep appending duplicates.
for extra_path in (".", ".."):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

# Locate the input PDF under <cwd>/data/Reports
dir_cur = os.getcwd()
pdf_file = "test.pdf"
input_file = os.path.join(dir_cur, "data", "Reports", pdf_file)

# Stop early if the report is missing, before any parsing call is made
if not os.path.isfile(input_file):
    raise FileNotFoundError(f"Input PDF not found: {input_file}")

# Base name of the file (keeps the '.pdf' extension); reused for output file names
base_name = os.path.basename(input_file)

# Make sure the outputs directory exists next to the data directory
os.makedirs(os.path.join(dir_cur, "outputs"), exist_ok=True)


In [5]:
# Transform the PDF into markdown documents with LlamaParse.
# nest_asyncio patches asyncio to allow nested event loops; presumably needed
# because the notebook kernel already runs one -- TODO confirm.
nest_asyncio.apply()

# Assigning None into os.environ raises an opaque TypeError, so check first.
if not llama_API:
    raise ValueError("LLAMA_API_KEY is not set; cannot call LlamaParse")
os.environ["LLAMA_CLOUD_API_KEY"] = llama_API

documents = LlamaParse(result_type="markdown").load_data(input_file)

# An empty result would silently produce empty outputs in every later cell.
if not documents:
    raise RuntimeError(f"LlamaParse returned no documents for {input_file}")

Started parsing the file under job_id 66e2d331-6301-42dd-9425-38ceee69d8ea


In [6]:
# Gather the text of every parsed document, then join into one string
# with a blank line between documents
all_text = [parsed.text for parsed in documents]
merged_doc = '\n\n'.join(all_text)

# Write the merged text to outputs/<base_name>.txt (UTF-8)
txt_output_path = os.path.join(dir_cur, 'outputs', f'{base_name}.txt')
with open(txt_output_path, 'w', encoding='utf-8') as txt_file:
    txt_file.write(merged_doc)

In [7]:
# Prompt and example for step 1: pull ESG-related passages/tables out of a page.
# The few-shot example previously read "10,001 promtCO2e" (garbled unit) and was
# followed by a stray empty-string literal; both are cleaned up here.
identify_prompt = PromptTemplate(
    instruction="""Extract and directly copy any text-based content or tables specifically containing ESG information that could be used for a data analysis. Focus on capturing content that is comprehensive.
    """,
    few_shot_prompt=[
        Context(
            context="The company reported a total of 10,001 tCO2e of Scope 1 emissions in 2020."
        )
    ],
)



# Prompt and example for step 2: map extracted ESG content onto fixed
# 'label' / 'metric' / 'unit' / 'year' / 'value' records.
# The few-shot example now uses the key `metric` (it was `metrics`), matching the
# field name requested in the instruction and required when the results are
# parsed into a DataFrame. The stray empty-string literals after each example
# value were removed; the instruction text itself is unchanged.
# The example value 10001 is what the later "remove the example data" filter keys on.
standardize_prompt = PromptTemplate(
    instruction="""Standardize the ESG contents or tables into a structured data frame that includes: 'label' , 'metric', 'unit', 'year' and 'value' (numerical value). 
    Here is the reference for 'label', 'metric' and 'unit': 
    {
  "Label": {
    "Greenhouse Gas Emissions": [
      {
        "metric": "Total","Scope 1","Scope 2","Scope 3"
        "unit": "tCO2e"
      },
      {
        "metric": "Emission intensities of total","Emission intensities of Scope 1","Emission intensities of Scope 2","Emission intensities of Scope 3"
        "unit": "tCO2e"
      }
    ],
    "Energy Consumption": [
      {
        "metric": "Total energy consumption",
        "unit": "MWhs", "GJ"
      },
      {
        "metric": "Energy consumption intensity",
        "unit": "MWhs", "GJ"
      }
    ],
    "Water Consumption": [
      {
        "metric": "Total water consumption",
        "unit": "ML", "m³"
      },
      {
        "metric": "Water consumption intensity",
        "unit": "ML", "m³"
      }
    ],
    "Waste Generation": {
      "metric": "Total waste generated",
      "unit": "t"
    },
    "Gender Diversity": [
      {
        "metric": "Current employees by gender",
        "unit": "Male Percentage (%)","Female Percentage (%)","Others Percentage (%)"
      },
      {
        "metric": "New hires and turnover by gender",
        "unit": "Male Percentage (%)","Female Percentage (%)","Others Percentage (%)"
      }
    ],
    "Age-Based Diversity": [
      {
        "metric": "Current employees by age groups",
        "unit": "Baby Boomers (%)","Gen Xers (%)","Millennials (%)","Gen Z (%)"
      },
      {
        "metric": "New hires and turnover by age groups",
        "unit": "Baby Boomers (%)","Gen Xers (%)","Millennials (%)","Gen Z (%)"
      }
    ],
    "Employment": [
      {
        "metric": "Total employee turnover",
        "unit": "Number", "Percentage (%)"
      },
      {
        "metric": "Total number of employees",
        "unit": "Number"
      }
    ],
    "Development & Training": [
      {
        "metric": "Average training hours per employee",
        "unit": "Hours/No. of employees"
      },
      {
        "metric": "Average training hours per employee by gender",
        "unit": "Male Hours/No. of employees", "Female Hours/No. of employees"
      }
    ],
    "Occupational Health & Safety": [
      {
        "metric": "Fatalities",
        "unit": "Number of cases"
      },
      {
        "metric": "High-consequence injuries",
        "unit": "Number of cases"
      },
      {
        "metric": "Recordable injuries",
        "unit": "Number of cases"
      }
    ],
    "Recordable work-related illnesses": {
      "metric": "Number of recordable work-related illnesses or health conditions",
      "unit": "Number of cases"
    },
    "Board Composition": [
      {
        "metric": "Board independence",
        "unit": "Percentage (%)"
      },
      {
        "metric": "Women on the board",
        "unit": "Percentage (%)"
      }
    ],
    "Management Diversity": {
      "metric": "Women in the management team",
      "unit": "Percentage (%)"
    },
    "Ethical Behaviour": [
      {
        "metric": "Anti-corruption disclosures",
        "unit": "Discussion and number"
      },
      {
        "metric": "Anti-corruption training for employees",
        "unit": "Number and Percentage (%)"
      }
    ],
    "Certifications": {
      "metric": "List of relevant certifications",
      "unit": "List"
    },
    "Alignment with Frameworks": {
      "metric": "Alignment with frameworks and disclosure practices"
    },
    "Assurance": {
      "metric": "Assurance of sustainability report",
      "unit": "Internal","External","None"
    }
  }
}
    Return the standardized data frame for analysis.
    """,
    few_shot_prompt=[
        Context(
            label="Greenhouse Gas Emissions",
            metric="Scope 1",
            unit="tCO2e",
            year="2020",
            value=10001
        )
    ],
)

In [8]:
# Set AI config: one OpenAI-backed transform per pipeline step.
# Both request JSON-object responses; they differ in prompt and model.
identify_config = TransformOpenAIConfig(
    prompt_template=identify_prompt,
    model_config=OpenAIModelConfig(
        model_name='gpt-4o-mini',
        response_format={"type": "json_object"},
    ),
)

standardize_config = TransformOpenAIConfig(
    prompt_template=standardize_prompt,
    model_config=OpenAIModelConfig(
        model_name='gpt-4o-2024-08-06',
        response_format={"type": "json_object"},
    ),
)

# Kept from the original cell: load_dotenv() with no path looks for a default
# .env file rather than the explicit path used when the keys were first read.
load_dotenv()

# Assigning None into os.environ raises an opaque TypeError, so check first.
if not openAI_API:
    raise ValueError("OPENAI_API_KEY is not set; cannot create the OpenAI clients")
os.environ["OPENAI_API_KEY"] = openAI_API

identify_client = TransformClient(identify_config)
standardize_client = TransformClient(standardize_config)

In [9]:
# Step 1: run the identify client on each parsed document and keep the raw
# result in a dictionary keyed by the document's position
ESG_contents = {}

for page_no, parsed_doc in enumerate(documents):
    page_input = [Context(context=parsed_doc.text)]
    ESG_contents[page_no] = identify_client.run(page_input)

100%|██████████| 1/1 [00:08<00:00,  8.50s/it]


In [10]:
#restructure the extracted esg contents as a list
def extract_esg_contents(esg_contents):
    """Flatten nested client results into a single list of response items.

    The structure walked here is: a mapping whose values are lists of dicts;
    each dict may carry an 'output' list of dicts, and each of those may carry
    a 'response' entry holding the items to collect.

    Args:
        esg_contents: Mapping of key -> list of result dicts.

    Returns:
        A flat list with every response item found, in traversal order.
        Malformed entries are reported with ``print`` and skipped; they do not
        prevent later entries from being processed.
    """
    extracted_responses = []

    try:
        keys = list(esg_contents)
    except TypeError as e:
        # Not iterable at all (e.g. None): report and return an empty result.
        print(f"Error extracting response content: {e}")
        return extracted_responses

    for key in keys:
        # Errors are isolated per entry so one bad result cannot discard the rest.
        try:
            items = esg_contents[key]

            for item in items or []:
                for output_item in item.get('output') or []:
                    response = output_item.get('response') or []

                    # A bare string/dict is ONE response; iterating it would
                    # yield single characters / dict keys instead.
                    if isinstance(response, (str, bytes, dict)):
                        response = [response]

                    extracted_responses.extend(response)

        except Exception as e:
            print(f"Error extracting response content (entry {key!r}): {e}")

    return extracted_responses

# Flatten the identification results held in ESG_contents into one list of response items
extracted_contents = extract_esg_contents(ESG_contents)

In [11]:
# Step 2: send each extracted item through the standardize client and keep the
# JSON result in a dictionary keyed by the item's position
output = {}

for position, extracted in enumerate(extracted_contents):
    standardize_input = [Context(context=extracted)]
    output[position] = standardize_client.run(standardize_input)

100%|██████████| 1/1 [00:10<00:00, 10.05s/it]


In [12]:
# transform the json output into a DataFrame
REQUIRED_FIELDS = ['label', 'metric', 'unit', 'year', 'value']


def collect_standardized_rows(results):
    """Collect complete ESG records from the nested standardize results.

    Walks: results values -> items -> item['output'] -> entry['response'] ->
    every list-valued key of a response dict -> records. A record is kept only
    if it is a dict containing all of REQUIRED_FIELDS; only those fields are
    copied. Anything with an unexpected type is skipped instead of raising.

    Args:
        results: Mapping of key -> list of result dicts.

    Returns:
        A list of dicts, one per complete record, in traversal order.
    """
    rows = []
    for result in results.values():
        for item in result or []:
            if not isinstance(item, dict):
                continue
            for out_entry in item.get('output') or []:
                if not isinstance(out_entry, dict):
                    continue
                for response in out_entry.get('response') or []:
                    if not isinstance(response, dict):
                        continue
                    for candidates in response.values():
                        # Records are expected inside a list under some key
                        if not isinstance(candidates, list):
                            continue
                        for record in candidates:
                            if isinstance(record, dict) and all(
                                field in record for field in REQUIRED_FIELDS
                            ):
                                rows.append({field: record[field] for field in REQUIRED_FIELDS})
    return rows


# Build the frame once from the list of records; explicit columns keep the
# column order (and the columns themselves) even when no record was found.
df = pd.DataFrame(collect_standardized_rows(output), columns=REQUIRED_FIELDS)

In [13]:
# Remove the example data: drop rows whose value equals the few-shot example's value.
# NOTE(review): this also removes any genuine row whose value happens to be 10001.
example_value = 10001
df_filtered = df.loc[df['value'].ne(example_value)]

In [14]:
# Set display options: no column-width truncation, 1000-character display width
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Show the filtered dataframe as the cell output
df_filtered

Unnamed: 0,label,metric,unit,year,value
0,Energy Consumption,Total energy consumption,GJ,2023,418221
1,Energy Consumption,Total energy consumption,GJ,2022,401076
2,Energy Consumption,Total energy consumption,GJ,2021,419941
3,Greenhouse Gas Emissions,Scope 1,tCO2e,2023,147
4,Greenhouse Gas Emissions,Scope 1,tCO2e,2022,160
5,Greenhouse Gas Emissions,Scope 1,tCO2e,2021,237
6,Greenhouse Gas Emissions,Scope 2,tCO2e,2023,23501
7,Greenhouse Gas Emissions,Scope 2,tCO2e,2022,63811
8,Greenhouse Gas Emissions,Scope 2,tCO2e,2021,67636
9,Greenhouse Gas Emissions,Scope 3,tCO2e,2023,3849


In [15]:
# Save the filtered DataFrame to outputs/<base_name>.xlsx
# NOTE(review): the DataFrame index is written as the first column -- confirm this is wanted.
excel_filename = base_name + '.xlsx'
excel_output_path = os.path.join(dir_cur, 'outputs', excel_filename)
df_filtered.to_excel(excel_output_path)