In [0]:
# Notebook-scoped dependencies, installed once at the top of the notebook.
# %pip is used instead of !pip so the packages land in the environment of this
# notebook's Python process; restartPython() below then makes them importable.
# pdfservices-sdk is listed once, pinned: the previous second, unpinned install
# of the same package was redundant and contradicted the pin.
%pip install pdfservices-sdk==2.3.1
%pip install openpyxl
%pip install langchain==0.0.278
%pip install openai==0.27.8
%pip install jsonpickle
# NOTE(review): `tiktoken` and `langchain_core` are imported in the next cell
# but are not installed here -- confirm the cluster image provides them.
dbutils.library.restartPython()

In [0]:
import json
import os.path
import re
import sys
import glob
import zipfile
from datetime import datetime

import openai
import pandas as pd
import tiktoken
from adobe.pdfservices.operation.auth.credentials import Credentials

from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import \
ExtractPDFOperation
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import (
    ExtractElementType,
)
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import (
    ExtractPDFOptions,
)
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import (
    ExtractRenditionsElementType,
)
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import AzureChatOpenAI
from langchain.docstore.document import Document
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.text_splitter import TokenTextSplitter
from langchain_core.prompts.prompt import PromptTemplate

# sys.path.append("..")
from functions.image_processing import *
from functions.adobe import *
from functions.text_prompt import *
from functions.text_processing import *

### Few-shot prompt template

In [0]:
# Build the few-shot prompt used for the emissions question.
# `examples_emissions`, `example_prompt`, `prefix_emissions` and `suffix` are
# not defined in this notebook; presumably they come from the star imports of
# the `functions.*` modules above -- verify there when changing the prompt.
prompt_template = FewShotPromptTemplate(
    prefix=prefix_emissions,
    examples=examples_emissions,
    example_prompt=example_prompt,
    example_separator="\n\n",
    suffix=suffix,
    input_variables=["context", "question"],
)

### Call the model

In [0]:
# Azure OpenAI connection settings. The endpoint and key are read from the
# environment so that no credential is stored in the notebook.
model = "GPT4"
openai.api_version = "2024-02-15-preview"
openai.api_base = os.getenv("OPENAI_BASE_GPT4")
openai.api_key = os.getenv("OPENAI_KEY_GPT4")

# Chat model handle. Deployment name, engine and API version reuse the values
# set just above so the two configurations cannot drift apart.
llm = AzureChatOpenAI(
    temperature=0,
    deployment_name=model,
    model_kwargs={"engine": model},
    openai_api_version=openai.api_version,
    openai_api_base=openai.api_base,
    openai_api_key=openai.api_key,
    # seed = 1
)

### Now run the QA pipeline

In [0]:
# Adobe extraction for the OCI 2022 annual report only (see the glob below).
# After the loop `documents` holds the extraction of the last matching file.
df_isa_emissions = pd.DataFrame()
responses = []

output_base_zip_path = "/tmp/sdk_result/"

# ADOBE Pipeline
# makedirs(exist_ok=True) also creates missing parent directories and is safe
# to re-run, unlike the exists()/mkdir() pair it replaces.
os.makedirs(output_base_zip_path, exist_ok=True)

output_base_extract_folder = (
    "/dbfs/FileStore/projects/WSsustainability/data/validation/tmp/pdf"
)
# The glob pattern is absolute, so the matches are already absolute paths.
# (Previously they were passed through os.path.join(output_base_extract_folder, ...),
# which returns an absolute second argument unchanged, and an os.listdir()
# result was computed and immediately overwritten.) Sorting makes the
# processing order reproducible.
all_files = sorted(
    glob.glob('/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/ISA_reports/OCI_Annual_Report_2022_vf*')
)
to_run_file = list(all_files)

# Processing each PDF
for pdf_path in to_run_file:
    pdf_name = os.path.basename(pdf_path).replace(".pdf", "")
    output_zip_path = os.path.join(output_base_zip_path, f"{pdf_name}.zip")
    output_zipextract_folder = os.path.join(output_base_extract_folder, pdf_name)

    # Run the Adobe API only if the extraction folder doesn't exist yet
    if not os.path.exists(output_zipextract_folder):
        adobeLoader(pdf_path, output_zip_path)

    documents = extract_text_from_file_adobe(output_zip_path, output_zipextract_folder)

In [0]:
# Question passed as `query` to qa_unstructured() in the pipeline loop below.
# The text is runtime input to the model, so any rewording changes what is
# asked; edit deliberately.
query_emissions = """
Given the context provided, summarize the company's GHG emissions data for the
latest reporting year:  including Scope 1, Scope 2, (include scope 2 market
or location-based or both if reported), and Scope 3. Always includes GHG emissions
if provided in the context. CO2 emissions are a type of GHG emissions. 
If the context includes only CO2 emissions data, use CO2 emissions
numbers as the GHG emissions values. Do not overlook things. Pay extra attention
to the data provided in json format in the context where GHG/CO2 emissions are
usually reported. Do not confuse the Scope 1+2 with Scope 1 or Scope 2 
emissions solely. Carefully read the context and extract the data exactly as
it is presented. If the data is not stated in the context, leave it
as null. DO NOT fabricate any numbers. It is very important to accurately
reporting the data as per the guidelines provided. If you cannot extract the
specific Scope 1, Scope 2, and Scope 3 GHG/CO2 emissions because the context
does not explicitly state these figures, provide response with null values for
the emissions data. Provide the answer in a structured JSON format.
"""

# Keywords passed as `keywords` to qa_unstructured() in the pipeline loop below.
# NOTE(review): "scope N" appears in both lower and capitalised form, and
# "emission" is a substring of several other entries -- presumably the matching
# in qa_unstructured is case-sensitive; confirm before pruning the list.
emissions_keywords = [
    "scope 1",
    "scope 2",
    "scope 3",
    "greenhouse gas emissions",
    "ghg",
    "direct emissions",
    "direct emission",
    "indirect emissions",
    "indirect emission",
    "Scope 1",
    "Scope 2",
    "Scope 3",
    "emission",
    "emissions",
]

# Emissions QA over every ISA report that already has an Adobe extraction.
# The Adobe API is not called in this cell: a report without an extraction
# folder is reported and skipped.
df_isa_emissions = pd.DataFrame()
responses = []

output_base_zip_path = "/tmp/sdk_result/"

# ADOBE Pipeline
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs(output_base_zip_path, exist_ok=True)

output_update_extract_folder = (
    "/dbfs/FileStore/projects/WSsustainability/data/validation/tmp/pdf"
)
output_base_extract_folder = (
    "/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/adobe_raw"
)
# The glob pattern is absolute, so the matches are absolute paths already; the
# earlier os.path.join(output_base_extract_folder, ...) returned them unchanged.
all_files = glob.glob('/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/ISA_reports/*')
# Reports split into -part1/-part2/-part3 collapse to a single entry.
# sorted() (instead of list(set(...))) keeps the processing order, and hence
# the row order of the saved CSV, reproducible between runs.
to_run_file = sorted(
    {
        report_path.replace("-part1", "").replace("-part2", "").replace("-part3", "")
        for report_path in all_files
    }
)

# Processing each PDF
for pdf_path in to_run_file:
    pdf_name = os.path.basename(pdf_path).replace(".pdf", "")
    output_zip_path = os.path.join(output_base_zip_path, f"{pdf_name}.zip")
    output_zipextract_folder = os.path.join(output_base_extract_folder, pdf_name)
    output_update_zipextract_folder = os.path.join(output_update_extract_folder, pdf_name)

    # The folder under tmp/pdf takes precedence over the one under adobe_raw.
    if os.path.exists(output_update_zipextract_folder):
        extract_folder = output_update_zipextract_folder
    elif os.path.exists(output_zipextract_folder):
        extract_folder = output_zipextract_folder
    else:
        # Make skipped reports visible instead of dropping them silently.
        print(f'skip {pdf_path}: no extraction folder found')
        continue

    print(f'process {extract_folder}')
    documents = extract_text_from_file_adobe(output_zip_path, extract_folder)

    # QA GenAI pipeline
    output, result_df = qa_unstructured(
        documents,
        query=query_emissions,
        prompt_template=prompt_template,
        keywords=emissions_keywords,
        input_type="text",
    )
    result_df["filename"] = pdf_name

    # Append results
    df_isa_emissions = pd.concat([df_isa_emissions, result_df], ignore_index=True)
    responses.append(output)

    # Rewrite the CSV after every report so results gathered so far are on
    # disk if a later report fails.
    df_isa_emissions.to_csv(
        "/dbfs/FileStore/projects/WSsustainability/data/validation/tmp/isa_extracted_emissions_0726.csv",
        index=False,
    )


In [0]:
# Widen text columns to 200 characters so long cell values stay readable, then
# show the extraction results as the cell output.
pd.set_option('display.max_colwidth', 200)
df_isa_emissions

In [0]:

# df = pd.read_csv("/dbfs/FileStore/projects/WSsustainability/data/validation/tmp/isa_extracted_emissions_0724.csv")

In [0]:
# Sanity check: load a saved extraction run and keep the rows that carry a
# model explanation, with missing values normalised to the string 'None'.
import ast  # unused in this cell; kept in case other cells rely on it
pd.set_option('max_colwidth', 100)
# NOTE(review): this reads ..._0724.csv while the pipeline cell above writes
# ..._0726.csv -- confirm the older run is the one meant to be checked.
df = pd.read_csv("/dbfs/FileStore/projects/WSsustainability/data/validation/tmp/isa_extracted_emissions_0724.csv")
df = df.drop(columns=['error', 'net_scope_1', 'net_scope_2'])
# Drop rows whose filename contains 'part'; na=False keeps rows with a missing
# filename instead of breaking the boolean negation.
df = df[~df['filename'].str.contains('part', na=False)]
# .copy() gives df_sub its own data, so the column assignment below does not
# write to a slice of df (SettingWithCopyWarning).
df_sub = df[df['model_explanation'].notna()].copy()
# Int64 -> str renders a missing year as the literal string '<NA>', which the
# fillna() below would not touch; map missing years to 'None' explicitly so the
# missing-value marker is the same in every column.
reporting_year = df_sub['reporting_year'].astype('Int64')
df_sub['reporting_year'] = reporting_year.astype('str').where(reporting_year.notna(), 'None')
df_sub = df_sub.fillna('None')

In [0]:
# Compare the extraction (df_sub) with the reviewer feedback sheet and flag, per
# scope, the rows where the feedback says 'Correct' and the extracted value
# equals the value recorded in the sheet.
truth_value_cols = ['emissions_scope_1', 'emissions_scope_2', 'emissions_scope_2_market',
                    'emissions_scope_2_location', 'emissions_scope_3']
feedback_cols = ['Feedback GHG Scope1', 'Feedback GHG Scope2', 'Feedback GHG Scope3']

df_truth = pd.read_excel('/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/feedback/isa_climate_energy_genAI_output.xlsx')
# .copy(): columns of the filtered frame are reassigned below; without it the
# assignment targets a slice of the full sheet (SettingWithCopyWarning).
df_truth = df_truth[df_truth['emissions_pipeline']=='text_table_pipeline'].copy()
for col in truth_value_cols:
    # Non-numeric entries become NaN.
    df_truth[col] = pd.to_numeric(df_truth[col], errors='coerce')

# Keep the sheet rows where at least one scope was marked 'Correct'.
any_scope_correct = df_truth[feedback_cols].eq('Correct').any(axis=1)
df_correct = df_truth.loc[any_scope_correct, ['emissions_company', 'filename', *truth_value_cols, *feedback_cols]]
# Missing values become 'None' on both sides (df_sub is filled the same way),
# so two missing values compare equal in the checks below.
df_correct = df_correct.fillna('None')
df_correct = df_correct.drop_duplicates()

df_merge = pd.merge(df_sub, df_correct, on=['filename'], how='left')

scope1_ok = (
    (df_merge['Feedback GHG Scope1']=='Correct')
    & (df_merge['scope_1']==df_merge['emissions_scope_1'])
)
# Scope 2 is validated only when the total, market-based and location-based
# figures all match.
scope2_ok = (
    (df_merge['Feedback GHG Scope2']=='Correct')
    & (df_merge['scope_2']==df_merge['emissions_scope_2'])
    & (df_merge['scope_2_market']==df_merge['emissions_scope_2_market'])
    & (df_merge['scope_2_location']==df_merge['emissions_scope_2_location'])
)
scope3_ok = (
    (df_merge['Feedback GHG Scope3']=='Correct')
    & (df_merge['scope_3']==df_merge['emissions_scope_3'])
)
# Rows that do not pass keep NaN in the *_validate columns.
df_merge.loc[scope1_ok, 'scope1_validate'] = 'Correct'
df_merge.loc[scope2_ok, 'scope2_validate'] = 'Correct'
df_merge.loc[scope3_ok, 'scope3_validate'] = 'Correct'

df_merge = df_merge.drop_duplicates().sort_values('company_name').reset_index(drop=True)

In [0]:
# Show the merged extraction/feedback table, including the *_validate columns.
display(df_merge)

In [0]:
# ### Check consistency by running many times
# results_validation = []
# run_ids = []
# total_runs = 5  # Total number of runs
# df_isa_emissions_benchmark = pd.DataFrame()
# responses_benchmark = []
# output_base_zip_path = "/tmp/sdk_result/"
# if not os.path.exists(output_base_zip_path):
#     os.mkdir(output_base_zip_path)

# output_base_extract_folder = (
#     "/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/adobe_raw"
# )
# all_files = os.listdir(
#     "/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/adobe_raw"
# )
# to_run_file = [os.path.join(output_base_extract_folder, file) for file in all_files]

# # Processing each PDF
# for pdf_path in to_run_file[:1]:
#     pdf_name = os.path.basename(pdf_path).replace(".pdf", "")
#     output_zip_path = os.path.join(output_base_zip_path, f"{pdf_name}.zip")
#     output_zipextract_folder = os.path.join(output_base_extract_folder, pdf_name)

#     # Check if the extraction folder already exists
#     if not os.path.exists(output_zipextract_folder):
#         # Run adobe API only if the extraction folder doesn't exist
#         adobeLoader(pdf_path, output_zip_path)

#     documents = extract_text_from_file_adobe(output_zip_path, output_zipextract_folder)

#     output_base_zip_path = "/tmp/sdk_result/"
#     # Process the extracted documents
#     for run_id in range(1, total_runs + 1):
#         seed = run_id
#         output, result_df = qa_unstructured(
#             documents,
#             query=query_emissions,
#             prompt_template=prompt_template,
#             keywords=emissions_keywords,
#             seed=seed,
#             input_type="text",
#         )
#         result_df["filename"] = pdf_name
#         result_df["run_id"] = run_id
#         # Append results
#         df_isa_emissions_benchmark = pd.concat(
#             [df_isa_emissions_benchmark, result_df], ignore_index=True
#         )
#         responses_benchmark.append(output)

# df_isa_emissions_benchmark

In [0]:
# input_txt = responses[0]["input_documents"]
# with open(
#     "/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/input/input_txt.txt",
#     "w",
# ) as file:
#     for item in input_txt:
#         file.write(str(item) + "\n")

In [0]:
# # Save the final df
# df_isa_emissions.to_csv(
#     "/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/result/isa_extracted_emissions_1005.csv",
#     index=False,
# )
# df = pd.read_csv(
#     "/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/result/isa_extracted_emissions_1005.csv",
# )
# df

In [0]:
# # save and load json responses
# # serialize with jsonpickle
# serialized_documents = jsonpickle.encode(responses)
# with open(
#     "/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/result/output_emissions_text_1005.json",
#     "w",
# ) as file:
#     file.write(serialized_documents)

# file_path = "/dbfs/FileStore/projects/WSsustainability/data/validation/ISA/result/output_emissions_text_1005.json"
# with open(file_path, "r") as file:
#     data = file.read()

# # Deserialize the content back into Python objects
# data = jsonpickle.decode(data)
# data