In [13]:
import html

from IPython.display import display, HTML
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from openai import OpenAI

In [14]:
def visualize(answer):
    """Render text as HTML paragraphs in the notebook output.

    The text is HTML-escaped before being wrapped in ``<p>`` tags, so characters
    such as ``<``, ``>`` and ``&`` coming from retrieved document text or model
    output are shown literally instead of being interpreted as markup.

    Args:
        answer: text to show (non-string values are converted with ``str``);
            every newline starts a new paragraph
    """
    # escape first, then add our own markup, so only the tags we generate are live
    escaped = html.escape(str(answer))
    answer_html = f"<p>{escaped}</p>".replace("\n", "</p><p>")
    in_html = f"""
            <h3>Output:</h3>
            {answer_html}
            """
    display(HTML(in_html))

In [19]:
# Embedding model used when the vector index is built.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
# Local Ollama model; the keyword must be spelled `request_timeout` for the
# 360.0 timeout value to be picked up.
Settings.llm = Ollama(model="llama3.1:8b", request_timeout=360.0)


def get_model_response(
    prompt,
    model="llama3.1:8b",
    temperature=0,
    base_url="http://localhost:11434/v1",
):
    """Send a single user message to an OpenAI-compatible endpoint and return the reply.

    Args:
        prompt (string): text sent as the only user message
        model (string): name of the model to query
        temperature (float): sampling temperature passed to the completion call
        base_url (string): base URL of the OpenAI-compatible API
    Returns:
        content of the first choice's message in the completion response
    """
    client = OpenAI(
        base_url=base_url,
        api_key="ollama",  # required, but unused
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    return response.choices[0].message.content

In [8]:
# builds the retriever for one report: load -> split -> index -> retriever
def createRetriever(REPORT, CHUNK_SIZE, CHUNK_OVERLAP, TOP_K):
    """Build a vector retriever over the chunks of a single report
    Args:
        REPORT (string): location of the report file to load
        CHUNK_SIZE (int): chunk size handed to the sentence splitter
        CHUNK_OVERLAP (int): overlap between consecutive chunks
        TOP_K (int): number of chunks the retriever returns per search
    Returns:
        retriever (object): VectorIndexRetriever over the report's chunks
    """
    # read the report from disk
    report_docs = SimpleDirectoryReader(input_files=[REPORT]).load_data()

    # cut it into chunks; the splitter tries to keep sentences together
    splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    chunks = splitter.get_nodes_from_documents(report_docs)

    # embed the chunks with the globally configured embedding model
    chunk_index = VectorStoreIndex(chunks, embed_model=Settings.embed_model)

    # retriever that returns the TOP_K most similar chunks
    return VectorIndexRetriever(index=chunk_index, similarity_top_k=TOP_K)


def createSources(retriever, query):
    """Use the retriever to obtain sources
    Args:
        retriever (object): retriever to retrieve paragraphs
        query (string): search string
    Returns:
        sources_block (string): one "PAGE <label>: <text>" entry per retrieved
            paragraph, entries separated by two blank lines; empty string when
            nothing is retrieved
    """
    sources = []
    for node in retriever.retrieve(query):
        # not every node carries a page label, so fall back instead of raising KeyError
        page_num = node.metadata.get("page_label", "N/A")
        # turn line breaks into spaces so words split across lines are not glued together
        source = node.get_content().replace("\n", " ")
        sources.append(f"PAGE {page_num}: {source}")
    return "\n\n\n".join(sources)

In [9]:
# TODO: point REPORT at the document you want to analyse
REPORT = "./other_docs/ai policies 2022 oecd.pdf"

# Chunking and retrieval settings handed to createRetriever
CHUNK_SIZE = 512  # desired size of the paragraphs
CHUNK_OVERLAP = 50  # overlap of the paragraphs
TOP_K = 5  # number of retrieved paragraphs for a search

# Building the retriever may take around 30 seconds
retriever = createRetriever(
    REPORT=REPORT,
    CHUNK_SIZE=CHUNK_SIZE,
    CHUNK_OVERLAP=CHUNK_OVERLAP,
    TOP_K=TOP_K,
)

In [10]:
# Question put to the report; the retrieved passages are displayed for inspection
question = (
    "Does the company report its GHG emission reduction interim targets "
    "for achieving the overall goal?"
)
sources = createSources(retriever=retriever, query=question)
visualize(sources)

In [17]:
# Prompt sent to the model: role, company background, retrieved sources, the
# question, and answer guidelines. (An earlier bare "{sources} / {question}"
# template was assigned here and immediately overwritten; it has been removed.)
# NOTE(review): the company description below is hard-coded for Volkswagen Group
# — confirm it matches the document REPORT points at.
prompt_template = f"""You are a senior sustainability analyst with expertise in climate science evaluating a company’s climate-related transition plan and
strategy.
This is basic information to the company: Volkswagen Group is a large German-based international automobile manufacturer known for making high-quality luxury vehicles.
You are presented with the following sources from the company’s report:
--------------------- [BEGIN OF SOURCES]\n {sources}\n
--------------------- [END OF SOURCES]\n
Given the source information and no prior knowledge, your main task is to respond to the posed question encapsulated in "||".
Question: ||{question}||
Please enforce the following guidelines in your answer:
1. Your response must be precise , thorough , and grounded on specific extracts from the report
to verify its authenticity.
2. If you are unsure, simply acknowledge the lack
of knowledge , rather than fabricating an
answer.
3. Keep your ANSWER within 200 words.
4. Be skeptical to the information disclosed in
the report as there might be greenwashing ( exaggerating the firm’s environmental responsibility). Always answer in a critical tone.
5. Cheap talks are statements that are costless to make and may not necessarily reflect the
true intentions or future actions of the company. Be critical for all cheap talks you discovered in the report.
6. Always acknowledge that the information provided is representing the company’s view based on its report.
7. Scrutinize whether the report is grounded in quantifiable , concrete data or vague , unverifiable statements , and communicate your
findings.
8. Start your answer with a "[[YES]]" or "[[NO]]" depending on whether you would answer the question with a yes or no. Always compliment your judgment on yes or no with a short explanation that summarizes the sources in an
informative way, i.e. provide details.
Format your answer in JSON format with the two keys: ANSWER (this should contain your answer
string without sources), and SOURCES (this should be a list of the SOURCE numbers that were referenced in your answer).
Your FINAL_ANSWER in JSON (ensure there’s no format error):
"""

In [20]:
# Ask the model with the assembled prompt and render its reply
answer = get_model_response(prompt=prompt_template)
visualize(answer)