In [46]:
from llama_index import VectorStoreIndex, download_loader, ServiceContext
from llama_index.evaluation import DatasetGenerator, RelevancyEvaluator
from llama_index.callbacks import CallbackManager, WandbCallbackHandler
from llama_index.llms import OpenAI

import copy
from dotenv import load_dotenv
import os
from pathlib import Path
import random
import textwrap as tr

# This is a hack to get some things to work in Jupyter Notebooks
import nest_asyncio
nest_asyncio.apply()

def pwrap(text):
    print(tr.fill(str(text), width=80))

In [3]:
load_dotenv()

# Data Loader
loader = PDFReader()
documents = loader.load_data(file=Path('./data/llama2.pdf'))

# Chunking and Embedding of the chunks.
index = VectorStoreIndex.from_documents(documents)

# Retrieval, node poseprocessing, response synthesis. 
query_engine = index.as_query_engine()

# Run the query engine on a user question.
response = query_engine.query("Who wrote this paper?")

In [6]:
print(response)

The authors of this paper are listed in the A.1 Contributions section. They are sorted alphabetically by last name and include various individuals such as Guillem Cucurull, Naman Goyal, Louis Martin, Thomas Scialom, Ruan Silva, Kevin Stone, Hugo Touvron, Sergey Edunov, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Peter Albert, Nikolay Bashlykov, Prajjwal Bhargava, Moya Chen, David Esiobu, Jeremy Fu, Vedanuj Goswami, Anthony Hartshorn, Rui Hou, Marcin Kardas, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Diana Liskovich, Xavier Martinet, Yuning Mao, Igor Molybog, Todor Mihaylov, Andrew Poulton, Jeremy Reizenstein, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Jacob Xu, Yuchen Zhang, and Iliyan Zarov, among others.


## Weights and Biases version

In [4]:
load_dotenv()

# Data Loader
PDFReader = download_loader("PDFReader")
loader = PDFReader()
documents = loader.load_data(file=Path('./data/llama2.pdf'))

os.environ["WANDB_NOTEBOOK_NAME"] = "01_basic-qa.ipynb"

# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

# Chunking and Embedding of the chunks.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

# Retrieval, node poseprocessing, response synthesis.
query_engine = index.as_query_engine()

# Run the query engine on a user question.
response = query_engine.query("Who wrote this paper?")

[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.


In [5]:
response

Response(response='The authors of this paper are listed in the context information.', source_nodes=[NodeWithScore(node=TextNode(id_='cab2f54c-0f3d-472a-9c35-ad099dc80507', embedding=None, metadata={'page_label': '46', 'file_name': 'llama2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6c968678-0b64-4637-a962-ba04eba1da3a', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '46', 'file_name': 'llama2.pdf'}, hash='8be476e049ea94cc3d1ffb715d60e31a863db999359908014a66c5aaf4b7bdd6'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='9af473d1-ede8-4b60-99e9-39d24c3dd1ff', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '46', 'file_name': 'llama2.pdf'}, hash='f148bb4cd0ff97e9f311871c4a9f88ee999ce2e8df73b4e09358cc2c2e0024d2')}, hash='6b8f817c9e56d9360b2cdccbb119cd42f8682e4aea7d76f6c386f8c3cce49be6', text='A Appendix\nA.1 Contributions\nAll authors sorted alphabetically by l

In [11]:
pwrap(query_engine.query("Who wrote this paper?").response)

[34m[1mwandb[0m: Logged trace tree to W&B.


The authors of this paper are listed in the A.1 Contributions section. They are
sorted alphabetically by last name and include various individuals such as
Guillem Cucurull, Naman Goyal, Louis Martin, Thomas Scialom, Ruan Silva, Kevin
Stone, Hugo Touvron, Sergey Edunov, Angela Fan, Melanie Kambadur, Sharan Narang,
Aurelien Rodriguez, Robert Stojnic, Peter Albert, Nikolay Bashlykov, Prajjwal
Bhargava, Moya Chen, David Esiobu, Jeremy Fu, Vedanuj Goswami, Anthony
Hartshorn, Rui Hou, Marcin Kardas, Punit Singh Koura, Marie-Anne Lachaux,
Thibaut Lavril, Diana Liskovich, Xavier Martinet, Yuning Mao, Igor Molybog,
Todor Mihaylov, Andrew Poulton, Jeremy Reizenstein, Eric Michael Smith, Ranjan
Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Jacob Xu, Yuchen Zhang,
and Iliyan Zarov.


In [47]:
# Let's just use a meaningful subset of the shuffled documents.
random_documents = [doc.copy(deep=True) for doc in documents]
random.shuffle(random_documents)
random_documents = random_documents[:10]

llm = OpenAI(temperature=0, model="gpt-3.5-turbo")

service_context = ServiceContext.from_defaults(callback_manager=callback_manager, llm=llm)

# Let's reduce the number of questions per chunk.
data_generator = DatasetGenerator.from_documents(
    random_documents, service_context=service_context, num_questions_per_chunk=2
)

eval_questions = data_generator.generate_questions_from_nodes()

[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.


Failed to log trace tree to W&B: list index out of range
Failed to log trace tree to W&B: list index out of range
Failed to log trace tree to W&B: list index out of range


[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.


wandb: ERROR Metric data exceeds maximum size of 10.4MB (10.4MB)
wandb: ERROR Summary data exceeds maximum size of 10.4MB. Dropping it.


In [48]:
eval_questions

["How did the tuning process reveal interesting results about Llama 2-Chat's abilities?",
 'Can you explain the phenomenon of in-context temperature rescaling and its implications for RLHF?',
 'How does GAtt address the issue of multi-turn consistency in dialogue systems?',
 'What is the purpose of Ghost Attention (GAtt) in the Llama 2-Chat model?',
 'How does the responsible release strategy of Llama 2 promote collaboration and democratization of AI expertise?',
 'What are the potential risks associated with the use of AI models like Llama 2, and how has the AI community worked towards mitigating them?',
 'What is the significance of Llama in the field of computational efficiency during inference?',
 'Compare and contrast the dynamics of open-source models like BLOOM, OPT, and Falcon with their closed-source counterparts like GPT-3 and Chinchilla.',
 'What is the main evaluation metric used to measure safety violations in Llama 2-Chat? How is inter-rater reliability measured in this c

In [49]:
from llama_index.evaluation import ResponseEvaluator

# Let's use GPT 3.5 for evaluation.
llm = OpenAI(temperature=0, model="gpt-4")
service_context = ServiceContext.from_defaults(llm=llm, callback_manager=callback_manager)

# define evaluator
evaluator_gpt4 = ResponseEvaluator(service_context=service_context)

# query index
query_engine = index.as_query_engine()

# Get evaluation result
response_vector = query_engine.query(eval_questions[1])
eval_result = evaluator_gpt4.evaluate_response(
    query=eval_questions[1], response=response_vector
)

[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.


In [59]:
pwrap(response_vector)

In-context temperature rescaling is a phenomenon observed in RLHF (Reinforcement
Learning from Human Feedback) models. It refers to the dynamic adjustment of the
temperature parameter in the model based on the context of the prompt. The
temperature parameter controls the level of randomness in the model's output
generation.   The implications of in-context temperature rescaling are twofold.
First, it affects the diversity of responses generated by the model. For prompts
associated with creativity, such as "Write a poem," increasing the temperature
leads to more diverse outputs across different iterations of RLHF. On the other
hand, for prompts based on factual information, such as "What is the capital
of?", the temperature rescaling diminishes over time, indicating that the model
learns to consistently provide the same response to factual prompts.  Second,
in-context temperature rescaling highlights the model's ability to adapt its
generation behavior based on the prompt context. It su

In [57]:
eval_questions[1]

'Can you explain the phenomenon of in-context temperature rescaling and its implications for RLHF?'

In [55]:
pwrap(eval_result)

query=None contexts=['5 Discussion\nHere, we discuss the interesting properties
we have observed with RLHF (Section 5.1). We then discuss the\nlimitations of
Llama 2-Chat (Section 5.2). Lastly, we present our strategy for responsibly
releasing these\nmodels (Section 5.3).\n5.1 Learnings and Observations\nOur
tuning process revealed several interesting results, such as Llama 2-Chat ’s
abilities to temporally\norganize its knowledge, or to call APIs for external
tools.\nSFT (Mix)\nSFT (Annotation)\nRLHF (V1)\n0.0 0.2 0.4 0.6 0.8 1.0\nReward
Model ScoreRLHF (V2)\nFigure 20: Distribution shift for progressive versions of
Llama 2-Chat , from SFT models towards RLHF.\nBeyond Human Supervision. At the
outset of the project, many among us expressed a preference for\nsupervised
annotation, attracted by its denser signal. Meanwhile reinforcement learning,
known for its insta-\nbility, seemed a somewhat shadowy field for those in the
NLP research community. However, reinforcement\nlearning proved

In [60]:
eval_result.__dict__

{'query': None,
 'contexts': ['5 Discussion\nHere, we discuss the interesting properties we have observed with RLHF (Section 5.1). We then discuss the\nlimitations of Llama 2-Chat (Section 5.2). Lastly, we present our strategy for responsibly releasing these\nmodels (Section 5.3).\n5.1 Learnings and Observations\nOur tuning process revealed several interesting results, such as Llama 2-Chat ’s abilities to temporally\norganize its knowledge, or to call APIs for external tools.\nSFT (Mix)\nSFT (Annotation)\nRLHF (V1)\n0.0 0.2 0.4 0.6 0.8 1.0\nReward Model ScoreRLHF (V2)\nFigure 20: Distribution shift for progressive versions of Llama 2-Chat , from SFT models towards RLHF.\nBeyond Human Supervision. At the outset of the project, many among us expressed a preference for\nsupervised annotation, attracted by its denser signal. Meanwhile reinforcement learning, known for its insta-\nbility, seemed a somewhat shadowy field for those in the NLP research community. However, reinforcement\nlearni

In [62]:
from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
    LLMPredictor,
    Response,
)
import pandas as pd

# define jupyter display function
def display_eval_df(query: str, response: Response, eval_result: str) -> None:
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": (
                response.source_nodes[0].node.get_content()[:1000] + "..."
            ),
            "Evaluation Result": eval_result,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)

display_eval_df(eval_questions[1], response_vector, eval_result.feedback)

Unnamed: 0,Query,Response,Source,Evaluation Result
0,Can you explain the phenomenon of in-context temperature rescaling and its implications for RLHF?,"In-context temperature rescaling is a phenomenon observed in RLHF (Reinforcement Learning from Human Feedback) models. It refers to the dynamic adjustment of the temperature parameter in the model based on the context of the prompt. The temperature parameter controls the level of randomness in the model's output generation. The implications of in-context temperature rescaling are twofold. First, it affects the diversity of responses generated by the model. For prompts associated with creativity, such as ""Write a poem,"" increasing the temperature leads to more diverse outputs across different iterations of RLHF. On the other hand, for prompts based on factual information, such as ""What is the capital of?"", the temperature rescaling diminishes over time, indicating that the model learns to consistently provide the same response to factual prompts. Second, in-context temperature rescaling highlights the model's ability to adapt its generation behavior based on the prompt context. It suggests that RLHF models can learn to differentiate between different types of prompts and adjust their output generation accordingly. This adaptability is crucial for generating more accurate and contextually appropriate responses. Overall, in-context temperature rescaling in RLHF models allows for a fine-grained control over the level of randomness in the generated outputs, depending on the specific prompt context. This phenomenon enhances the model's ability to generate diverse and contextually appropriate responses, improving its overall performance.","5 Discussion Here, we discuss the interesting properties we have observed with RLHF (Section 5.1). We then discuss the limitations of Llama 2-Chat (Section 5.2). Lastly, we present our strategy for responsibly releasing these models (Section 5.3). 5.1 Learnings and Observations Our tuning process revealed several interesting results, such as Llama 2-Chat ’s abilities to temporally organize its knowledge, or to call APIs for external tools. SFT (Mix) SFT (Annotation) RLHF (V1) 0.0 0.2 0.4 0.6 0.8 1.0 Reward Model ScoreRLHF (V2) Figure 20: Distribution shift for progressive versions of Llama 2-Chat , from SFT models towards RLHF. Beyond Human Supervision. At the outset of the project, many among us expressed a preference for supervised annotation, attracted by its denser signal. Meanwhile reinforcement learning, known for its insta- bility, seemed a somewhat shadowy field for those in the NLP research community. However, reinforcement learning proved highly effective, particularly given ...",YES


In [71]:
# Evaluate all the question/answer pairs
from llama_index.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
)
gpt4 = OpenAI(temperature=0, model="gpt-4")

service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4, callback_manager=callback_manager)

faithfulness_gpt4 = FaithfulnessEvaluator(service_context=service_context_gpt4)
relevancy_gpt4 = RelevancyEvaluator(service_context=service_context_gpt4)
correctness_gpt4 = CorrectnessEvaluator(service_context=service_context_gpt4)


from llama_index.evaluation import BatchEvalRunner

runner = BatchEvalRunner(
    {"faithfulness": faithfulness_gpt4, "relevancy": relevancy_gpt4},
    workers=1,
)

eval_results = await runner.aevaluate_queries(
    index.as_query_engine(), queries=eval_questions
)

[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace t

In [72]:
def get_eval_results(key, eval_results):
    results = eval_results[key]
    correct = 0
    for result in results:
        if result.passing:
            correct += 1
    score = correct / len(results)
    print(f"{key} Score: {score}")
    return score

fscore = get_eval_results("faithfulness", eval_results)
rscore = get_eval_results("relevancy", eval_results)

faithfulness Score: 0.9642857142857143
relevancy Score: 0.8571428571428571


In [75]:
from llama_index.node_parser import SimpleNodeParser

# parse nodes
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents)

nodes

[TextNode(id_='1bee14ea-1a18-498d-8312-f6de95268e8b', embedding=None, metadata={'page_label': '1', 'file_name': 'llama2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9ea96a08-ac35-49b5-b6d6-f922e5fa4b80', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'llama2.pdf'}, hash='018b304d342200869f05b9e3bcb41cf91a368cd743da48c666fb67110a4a2c1f')}, hash='018b304d342200869f05b9e3bcb41cf91a368cd743da48c666fb67110a4a2c1f', text='Llama 2 : Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗Louis Martin†Kevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou\nHakan Inan Marcin Kardas V

In [129]:
from llama_index.evaluation import RetrieverEvaluator

# define retriever somewhere (e.g. from index)
retriever = index.as_retriever(similarity_top_k=2)
# retriever = ...

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

llm = OpenAI(temperature=0, model="gpt-3.5-turbo")

from llama_index.evaluation import generate_question_context_pairs

qa_dataset = generate_question_context_pairs(
    nodes[40:70], llm=llm, num_questions_per_chunk=3
)

100%|██████████| 30/30 [02:01<00:00,  4.05s/it]


In [130]:
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=VectorStoreIndex(nodes).as_retriever(similarity_top_k=5)
)

eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [131]:
def extract_info(eval_result):
    return {
        'query': eval_result.query,
        'mrr': float(str(eval_result.metric_dict['mrr']).split('\n')[0][-3:]),
        'hit_rate': float(str(eval_result.metric_dict['mrr']).split('\n')[0][-3:]),
    }

eval_results_dict = [extract_info(eval_result) for eval_result in eval_results]
df = pd.DataFrame(eval_results_dict)
df

Unnamed: 0,query,mrr,hit_rate
0,"In the evaluation prompts, what are the possib...",1.0,1.0
1,"According to the Likert scale, what does a rat...",1.0,1.0
2,Can you explain what the rating of 2 on the Li...,1.0,1.0
3,What is the main evaluation metric used to mea...,1.0,1.0
4,How does the safety rating of Falcon compare t...,1.0,1.0
...,...,...,...
85,What role did the red team play in improving t...,0.0,0.0
86,Which teams or individuals were involved in gu...,1.0,1.0
87,Who are the individuals mentioned in the conte...,0.5,0.5
88,How many people in total provided product and ...,1.0,1.0


In [133]:
df.describe()

Unnamed: 0,mrr,hit_rate
count,90.0,90.0
mean,22.943889,22.943889
std,83.33086,83.33086
min,0.0,0.0
25%,0.5,0.5
50%,1.0,1.0
75%,1.0,1.0
max,333.0,333.0
