# KPI Extraction Chain

### Handle imports

In [51]:
# Move to the project root (the directory that contains `notebooks/`)
import os

notebooks_dir = 'notebooks'


def find_project_root(start, marker):
    """Return the parent of the closest ancestor of `start` named `marker`.

    The walk goes upwards from `start` (inclusive) one path component at a
    time and stops at the filesystem root, so it always terminates.

    Args:
        start: Absolute path to start searching from.
        marker: Exact directory name to look for (e.g. ``'notebooks'``).

    Returns:
        The path of the directory containing the matched `marker` directory,
        or ``None`` when no component of `start` is named exactly `marker`.
    """
    candidate = start
    while True:
        parent, name = os.path.split(candidate)
        if name == marker:
            return parent
        if parent == candidate:
            # Reached the filesystem root without finding the marker.
            return None
        candidate = parent


# Only change directory when we are actually inside a `notebooks` directory;
# a path that merely contains the substring (e.g. `notebooks-old`) is ignored.
project_root = find_project_root(os.path.abspath(os.curdir), notebooks_dir)
if project_root is not None:
    os.chdir(project_root)

print(os.path.abspath(os.curdir))

/Users/jb_vandeneynde/code/jb-vde/Belfius/llm_esg_kpi/eyalytics


In [52]:
import pickle
import pinecone
import tiktoken

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [53]:
# Load API keys and environment settings from a local .env file.
from dotenv import load_dotenv

load_dotenv()

# An unset or empty OPENAI_API_KEY is treated as missing.
openai_key_is_set = bool(os.environ.get("OPENAI_API_KEY"))
if not openai_key_is_set:
    raise KeyError(
        "You will need an OPENAI_API_KEY to use the LLM models in this notebook."
    )

## Text Splitting, Embedding Models, and Vector DB

We'll be using OpenAI's `text-embedding-ada-002` embedding model and Pinecone's free-tier vector DB.

In [54]:
# PDF report to analyse.
# NOTE(review): this relative path is resolved against the current working
# directory — confirm it still points at the data folder after any chdir.
report_path = "../data/colruyt_annualreport.pdf"
loader = PyPDFLoader(file_path=report_path)

In [77]:
# model = 'gpt-3.5-turbo'  # OpenAI LLM model we will be using later.
model = 'text-davinci-003'  # OpenAI LLM model we will be using for now.

# Resolve the name of the tiktoken encoding that belongs to the chosen model,
# then fetch the tokenizer for that encoding by name.
model_encoding = tiktoken.encoding_for_model(model)
enc_code = model_encoding.name
tokenizer = tiktoken.get_encoding(enc_code)

def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module-level `tokenizer`.

    `disallowed_special=()` is passed to `encode`, so no special-token check
    is requested for the input text.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Splitter whose chunk length is measured with `tiktoken_len` rather than in characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],  # order in which splits are prioritized
    length_function=tiktoken_len,
    chunk_size=500,  # in order to be able to fit 3 chunks in context window
    chunk_overlap=0,
)

In [78]:
# Load the PDF and cut it into chunks with the splitter configured above.
chunks = loader.load_and_split(text_splitter=text_splitter)

In [79]:
# Sanity check: how many chunks the splitter produced.
len(chunks)

572

In [80]:
# Inspect a single chunk to eyeball the split quality.
chunks[4]

Document(page_content='adjust in a positive direction our outlook for the full-year \nresults. Ultimately, our revenue for the financial year increased by \n7,7%, to over 10,8 billion euros. The net result decreased by one \npercentage point to 1,9% of revenue. \nIn the coming financial year, we will maintain our increased focus \non efficiency and continue to implement our strategic plan. \nWe are convinced that we will emerge stronger from the current \nperiod. We therefore continue to expand our smart ecosystem, \nwith products and services in four areas, where a significant \nportion of family budgets is spent. In the fields of nutrition \nand health, in particular, we have taken major steps through \ninvestments, participations and acquisitions. Our Jims fitness \nchain boosted its muscle by acquiring six high-quality clubs from \nthe Oxygen chain. We also took a shareholding in digital health \nplatform yoboo, strong in personal counselling towards healthier \nlifestyles. Finally

In [81]:
# Name of the Pinecone index the notebook works with.
index_name = 'esg-kpi'

# Initialise the Pinecone client from environment variables; a variable that
# is not set is passed on as an empty string.
pinecone.init(
    environment=os.environ.get('PINECONE_ENV', ''),
    api_key=os.environ.get('PINECONE_API_KEY', ''),
)

In [82]:
# Init embedding model:
embed = OpenAIEmbeddings(
    model='text-embedding-ada-002'
)

# Re-running `from_documents` embeds and upserts every chunk again, which
# duplicates vectors in the index (and costs embedding calls). Reuse the index
# when it already holds vectors; only embed and upload when it is empty or absent.
# NOTE(review): this assumes the index only ever holds this report's chunks —
# clear the index first when switching to a different document.
index_exists = index_name in pinecone.list_indexes()
stored_vector_count = (
    pinecone.Index(index_name).describe_index_stats().total_vector_count
    if index_exists
    else 0
)

if stored_vector_count > 0:
    vector_store = Pinecone.from_existing_index(index_name, embed)
else:
    vector_store = Pinecone.from_documents(chunks, embedding=embed, index_name=index_name)

## Set up QA Chain

In [83]:
# Completion model at temperature 0, wrapped in a "stuff"-type QA chain.
llm = OpenAI(model=model, temperature=0)
chain = load_qa_chain(llm=llm, chain_type="stuff")

In [96]:
# Question to ask, and the stored chunks most similar to it.
query = "Approximately how much green house gas emissions does Colruyt produce ?"
context_docs = vector_store.similarity_search(query=query)

In [97]:
# Show the retrieved chunks that will be passed to the QA chain as context.
context_docs

[Document(page_content='Green electricity production by Virya Energy on behalf of Colruyt Group (in MWh) Calendar year 1.135.562 949.336 925.315\nCO2 emissions avoided through our investments in green energy via our shareholding in Virya Energy (in tonnes) Calendar year 187.368 (4)156.640 (4)152.677\nOffering renewable energy\n# DATS 24 electric charging stations Financial year 147 232 373\n# Colruyt Group sites with charging stations Financial year 135 159 230\n# public DATS 24 hydrogen filling stations Financial year 1 1 3\n(1)  These indicators relate to the entire Colruyt Group, unless stated otherwise.\n(2)  Compared to previous years, the scope of this indicator has been expanded to include the consolidated scope of the entire Colruyt Group,  \nin line with our climate reporting. This year we also report separately for Colruyt Group in Belgium and Luxembourg.\n(3)  To avoid double counting, the own produced and consumed electricity and heat from the co-generation installations ar

In [98]:
# Run the QA chain on the retrieved chunks and print the whitespace-trimmed answer.
answer = chain.run(input_documents=context_docs, question=query)
print(answer.strip())

It is not possible to answer this question with the given information.
