In [1]:
# ------------------------ Standard Library ------------------------ #
import getpass
import os
from textwrap import fill

# ----------------------- General Libraries ----------------------- #
import numpy as np
import pandas as pd

# --------------------------- RAG System -------------------------- #
# Document Preparation
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Embeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Vector Store
from langchain_chroma import Chroma

# Chat Completion
from langchain.chat_models import init_chat_model
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

# Orchestration
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

In [None]:
# Path of the PDF report that is loaded and indexed further down.
FILE_PATH = "Data/2023_Climate-Report-ING-Groep-NV.pdf"

# Never hardcode the Groq credential in the notebook. Reuse the key already
# exported in the environment; otherwise prompt for it without echoing it to
# the cell output.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your GROQ_API_KEY: ")

# Kept so that any code referring to API_KEY still finds the name defined.
API_KEY = os.environ["GROQ_API_KEY"]

In [3]:
# ----------------------------- Component 1: LLM ----------------------------- #
# Chat model used for answer generation, served through the "groq" provider.
llm = init_chat_model(model="llama3-70b-8192", model_provider="groq")

# ----------------------- Component 2: Embedding model ----------------------- #
# Alternative embedding model: "sentence-transformers/all-mpnet-base-v2"
model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# ------------------------ Component 3: Vector store ------------------------- #
# persist_directory is where the data is saved locally; drop it if persistence
# is not needed.
vector_store = Chroma(
    persist_directory="./chroma_langchain_db",
    collection_name="first_draft_collection",
    embedding_function=embeddings,
)

# --------------------- Component 4: Document retriever ---------------------- #
# Retriever view over the vector store, configured with k=3.
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read the PDF at FILE_PATH into a list of documents.
loader = PyPDFLoader(FILE_PATH)
doc = loader.load()

In [5]:
# Cut the loaded documents into chunks of up to 1000 characters, with 200
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(documents=doc)

In [6]:
# Index chunks.
# The collection is persisted on disk (persist_directory), so adding the chunks
# without explicit ids would store a fresh copy of every chunk on each
# "Restart & Run All", and the duplicates would crowd the retrieval results.
# Deterministic ids make this cell idempotent: re-running it overwrites the
# existing entries instead of appending new ones.
# NOTE(review): copies stored by earlier runs (with auto-generated ids) are not
# removed by this change; delete ./chroma_langchain_db once to start clean.
chunk_ids = [f"{FILE_PATH}::chunk-{i}" for i in range(len(all_splits))]
_ = vector_store.add_documents(documents=all_splits, ids=chunk_ids)

In [7]:
# --------------------------- Retrieval & Generation ------------------------- #

# Prompt for the generation step: the retrieved chunks are substituted for
# {context} and the user's question for {question}.
template = """
        You are a sustainability expert.
        Your task is to help users answer questions related to ESG matters.
        Users will ask you for information about Scope 1, Scope 2, or Scope 3 emissions.
        You should use the context provided to answer their questions.
        Always write the units that the emissions are reported in.

        {context}

        Question: {question}

        Helpful Answer:
    """

prompt = PromptTemplate.from_template(template)


# Define state for application
class State(TypedDict):
    """State dictionary shared by the graph steps.

    ``question`` is supplied when the graph is invoked, ``retrieve`` returns
    the ``context`` entry and ``generate`` returns the ``answer`` entry.
    """

    # The user's question.
    question: str
    # Documents returned by the retrieval step.
    context: List[Document]
    # Text content of the LLM response.
    answer: str


# Define application steps
def retrieve(state: State):
    """Look up the chunks most relevant to the question.

    Goes through the notebook's configured ``retriever`` so that its search
    settings (``k``) are actually applied, instead of querying the vector
    store directly with the library's default settings.

    Args:
        state: Current graph state; only ``state["question"]`` is read.

    Returns:
        A partial state update ``{"context": <list of retrieved documents>}``.
    """
    retrieved_docs = retriever.invoke(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    """Answer the question with the LLM, using the retrieved context.

    Args:
        state: Current graph state; ``state["context"]`` and
            ``state["question"]`` are read.

    Returns:
        A partial state update ``{"answer": <content of the LLM response>}``.
    """
    # Concatenate the text of all retrieved chunks, separated by blank lines.
    context_text = "\n\n".join([chunk.page_content for chunk in state["context"]])
    prompt_value = prompt.invoke(
        {"context": context_text, "question": state["question"]}
    )
    completion = llm.invoke(prompt_value)
    return {"answer": completion.content}


# Wire the steps into a linear graph (START -> retrieve -> generate) and compile it
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [10]:
# Run the graph on a sample question and print the answer wrapped by textwrap.fill
response = graph.invoke(
    {
        "question": "What were the Scope 3 emissions, and what was the source of the emissions?"
    }
)
answer_text = response["answer"]
print(fill(answer_text))

According to the provided data, the Scope 3 emissions were 11
kilotonnes CO2e in 2021, 7 kilotonnes CO2e in 2020, and 9 kilotonnes
CO2e in 2014. The source of these Scope 3 emissions is business travel
by air and car, which includes travel for business purposes only and
excludes emissions from employee commuting.


In [22]:
# Read the emissions spreadsheet and display it as the cell output
df = pd.read_excel(io="Data/uva_data_challenge_emission.xlsx")
df

Unnamed: 0,Company Name,Report Year,Report Name,Report Link,Scope 1,Scope 2 Market-based,Scope 2 Location-based,Scope3 Total,Scope3 Purchased Goods And Services,Scope3 Capital Goods,...,Scope3 Leased Assets Upstream,Scope3 Transportation And Distribution Downstream,Scope3 Processing Of Sold Products,Scope3 Use Of Sold Products,Scope3 End Of Life Treatment Of Sold Products,Scope3 Leased Assets Downstream,Scope3 Franchises,Scope3 Investments,Scope3 Other,Page No
0,Ventient Energy,2022,ventient_sustainability_report_2022.pdf,https://nadara.com/our-impact/#flipbook-df_162...,354,809.34,2567.46,83516,,81724.62,...,,264.44,,,,,,,,"14, 15"
1,ING,2023,2023_Climate-Report-ING-Groep-NV.pdf,https://www.ing.com/web/file?uuid=a65e8288-2a1...,8,9.0,68.0,11,,,...,,,,,,,,,,43
2,DuPont,2023,DuPont_2024SustainabilityReport,https://www.dupont.com/content/dam/dupont/amer...,739337,564194.0,850290.0,8108616,4034495.0,63401.0,...,797.0,22975.0,508520.0,4644.0,2292650.0,,,,,"115, 117"
3,Novartis,2024,novartis-integrated-report-2024.pdf,https://www.novartis.com/sites/novartis_com/fi...,207000,30000.0,200400.0,4350300,3372500.0,195700.0,...,,111300.0,8500.0,,75500.0,100.0,,,,"28, 61"


In [None]:
# Outline of the pipeline steps (notes only):
# 1. Load the PDF
# 2. Split the PDF into chunks
# 3. Embed the chunks
# 4. Store the embeddings
# 5. Embed the query
# 6. Generate the chat completion
# NOTE(review): the bare empty string below has no effect beyond being the
# cell's displayed value; it looks like a leftover placeholder.
""