In [4]:
import dspy as ds
import pandas as pd
import numpy as np
from rich import print
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
from tqdm import tqdm
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot
from dspy.datasets import HotPotQA
from dsp.utils import deduplicate

tqdm.pandas()

%load_ext rich


## GSM8K


In [5]:
turbo = ds.OpenAI(model="gpt-3.5-turbo-instruct", max_tokens=250)
ds.settings.configure(lm=turbo, log_openai_usage=True)

In [6]:
gsm8k = GSM8K()

train_gsm8k, dev_gsm8k = gsm8k.train[:10], gsm8k.dev[:10]

print(train_gsm8k)


100%|██████████| 7473/7473 [00:00<00:00, 17836.72it/s]
100%|██████████| 1319/1319 [00:00<00:00, 21621.31it/s]


In [7]:
print(train_gsm8k[0])

### Define the modules


In [8]:
class CoT(ds.Module):
    def __init__(self):
        super().__init__()
        self.prog = ds.ChainOfThought("question -> answer")

    def forward(self, question):
        return self.prog(question=question)

In [9]:
teleprompter = BootstrapFewShot(
    metric=gsm8k_metric, max_bootstrapped_demos=4, max_labeled_demos=4
)

optimized_cot = teleprompter.compile(CoT(), trainset=train_gsm8k, valset=dev_gsm8k)


 50%|█████     | 5/10 [00:00<00:00, 37.59it/s]


Bootstrapped 4 full traces after 6 examples in round 0.


In [10]:
optimized_cot(question="What is the capital of France?")


[1;35mPrediction[0m[1m([0m
    [33mrationale[0m=[32m'find the answer. First, we know that the country is France. Then, we know that the capital is the most important city in a country. Therefore, the capital of France is Paris.'[0m,
    [33manswer[0m=[32m'Paris'[0m
[1m)[0m

In [11]:
evalute = Evaluate(
    devset=dev_gsm8k,
    metric=gsm8k_metric,
    num_threads=4,
    display_progress=True,
    display_table=1,
)

In [12]:
evalute(optimized_cot)

Average Metric: 9 / 10  (90.0): 100%|██████████| 10/10 [00:00<00:00, 104.17it/s]


Average Metric: 9 / 10  (90.0%)


  df.loc[:, metric_name] = df[metric_name].apply(


Unnamed: 0,question,gold_reasoning,example_answer,rationale,pred_answer,gsm8k_metric
0,"20 birds migrate on a seasonal basis from one lake to another, searching for food. If they fly from lake Jim to lake Disney in...",The birds' flying distance between Lake Jim through lake Disney to lake London is 50+60 = <<50+60=110>>110 miles. Since each bird flies the 110 miles...,2200,"calculate the combined distance traveled by the birds. First, we know that the birds flew 50 miles from lake Jim to lake Disney and then...",2200 miles,✔️ [True]


[1;36m90.0[0m

In [13]:
turbo.inspect_history()




Given the fields `question`, produce the fields `answer`.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: ${answer}

---

Question: The result from the 40-item Statistics exam Marion and Ella took already came out. Ella got 4 incorrect answers while Marion got 6 more than half the score of Ella. What is Marion's score?
Reasoning: Let's think step by step in order to find Marion's score. We know that Ella got 4 incorrect answers, which means she got 36 correct answers out of 40. We also know that Marion got 6 more than half of Ella's score, which is 6 more than 36/2 = 18. Therefore, Marion's score is 18 + 6 = 24.
Answer: 24

---

Question: Bridget counted 14 shooting stars in the night sky. Reginald counted two fewer shooting stars than did Bridget, but Sam counted four more shooting stars than did Reginald. How many more shooting stars did Sam count in the night sky than was the average n

[32m"\n\n\nGiven the fields `question`, produce the fields `answer`.\n\n---\n\nFollow the following format.\n\nQuestion: $[0m[32m{[0m[32mquestion[0m[32m}[0m[32m\nReasoning: Let's think step by step in order to $[0m[32m{[0m[32mproduce the answer[0m[32m}[0m[32m. We ...\nAnswer: $[0m[32m{[0m[32manswer[0m[32m}[0m[32m\n\n---\n\nQuestion: The result from the 40-item Statistics exam Marion and Ella took already came out. Ella got 4 incorrect answers while Marion got 6 more than half the score of Ella. What is Marion's score?\nReasoning: Let's think step by step in order to find Marion's score. We know that Ella got 4 incorrect answers, which means she got 36 correct answers out of 40. We also know that Marion got 6 more than half of Ella's score, which is 6 more than 36/2 = 18. Therefore, Marion's score is 18 + 6 = 24.\nAnswer: 24\n\n---\n\nQuestion: Bridget counted 14 shooting stars in the night sky. Reginald counted two fewer shooting stars than did Bridget, but Sam

## With retrieval module


In [14]:
turbo = ds.OpenAI(model="gpt-3.5-turbo")
colbertv2_wiki17_abstracts = ds.ColBERTv2(
    url="http://20.102.90.50:2017/wiki17_abstracts"
)

ds.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts, log_openai_usage=True)


In [15]:
dataset_hpqa = HotPotQA(
    train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0
)

  table = cls._concat_blocks(blocks, axis=0)


In [16]:
trainset_hpqa = [x.with_inputs("question") for x in dataset_hpqa.train]
devset_hpqa = [x.with_inputs("question") for x in dataset_hpqa.dev]

len(trainset_hpqa), len(devset_hpqa)


[1m([0m[1;36m20[0m, [1;36m50[0m[1m)[0m

In [17]:
print(trainset_hpqa[2])

In [18]:
print(devset_hpqa[0])

### Signature

In [19]:
class BasicQA(ds.Signature):
    """Answer questions with short factoid layers"""

    question = ds.InputField()
    answer = ds.OutputField(desc='often between 1 and 5 words')

In [20]:
BasicQA


[1;35mBasicQA[0m[1m([0mquestion -> answer
    [33minstructions[0m=[32m'Answer questions with short factoid layers'[0m
    question = [1;35mField[0m[1m([0m[33mannotation[0m=[35mstr[0m [33mrequired[0m=[3;92mTrue[0m [33mjson_schema_extra[0m=[1m{[0m[32m'__dspy_field_type'[0m: [32m'input'[0m, [32m'prefix'[0m: [32m'Question:'[0m, [32m'desc'[0m: [32m'$[0m[32m{[0m[32mquestion[0m[32m}[0m[32m'[0m[1m}[0m[1m)[0m
    answer = [1;35mField[0m[1m([0m[33mannotation[0m=[35mstr[0m [33mrequired[0m=[3;92mTrue[0m [33mjson_schema_extra[0m=[1m{[0m[32m'desc'[0m: [32m'often between 1 and 5 words'[0m, [32m'__dspy_field_type'[0m: [32m'output'[0m, [32m'prefix'[0m: [32m'Answer:'[0m[1m}[0m[1m)[0m
[1m)[0m

In [21]:
generate_answer = ds.Predict(BasicQA)

In [22]:
pred = generate_answer(question=devset_hpqa[3].question)

In [23]:
print(f'Question: {devset_hpqa[3].question}')
print(f'Answer: {pred.answer}')

In [24]:
turbo.inspect_history(n=3)




Answer questions with short factoid layers

---

Follow the following format.

Question: ${question}
Answer: often between 1 and 5 words

---

Question: What river is near the Crichton Collegiate Church?
Answer: River Nith





[32m'\n\n\nAnswer questions with short factoid layers\n\n---\n\nFollow the following format.\n\nQuestion: $[0m[32m{[0m[32mquestion[0m[32m}[0m[32m\nAnswer: often between 1 and 5 words\n\n---\n\nQuestion: What river is near the Crichton Collegiate Church?\nAnswer:\x1b[0m[32m[[0m[32m32m River Nith\x1b[0m[32m[[0m[32m0m\n\n\n'[0m

In [25]:
generate_cot = ds.ChainOfThought(BasicQA)

pred = generate_cot(question=devset_hpqa[14].question)

print(f'Question: {devset_hpqa[14].question}')
print(f'Thought: {pred.rationale.split(".", 1)[1].strip()}')
print(f'Answer: {pred.answer}')


In [26]:
retrieve = ds.Retrieve(k=3)

top_k_passages = retrieve(devset_hpqa[14].question).passages

print(
    f"Top {retrieve.k} passages for question: {devset_hpqa[14].question} \n",
    "-" * 30,
    "\n",
)

for idx, passage in enumerate(top_k_passages):
    print(f"{idx+1}]", passage, "\n")


In [27]:
retrieve("Who is the president of the United States?").passages


[1m[[0m
    [32m'President of the United States [0m[32m([0m[32mdisambiguation[0m[32m)[0m[32m | The President of the United States [0m[32m([0m[32mof America[0m[32m)[0m[32m has been chief of the United States executive branch since 1789. The current U.S. President is Donald Trump.'[0m,
    [32m'President of the United States | The President of the United States [0m[32m([0m[32minformally referred to as "POTUS"[0m[32m)[0m[32m is the head of state and head of government of the United States. The president directs the executive branch of the federal government and is the commander-in-chief of the United States Armed Forces.'[0m,
    [32m'List of Presidents of the United States | The President of the United States is the elected head of state and head of government of the United States. The president leads the executive branch of the federal government and is the commander-in-chief of the United States Armed Forces. The president is indirectly elected to a four

## Program 1: Basic RAG

In [28]:
class GenerateAnswer(ds.Signature):
    """Answer question with short factoid layers"""

    question = ds.InputField()
    context = ds.InputField(desc="may contain relevant facts")
    answer = ds.OutputField(desc="often between 1 and 5 words")

In [29]:
class RAG(ds.Module):
    def __init__(self, num_passages=3):
        super().__init__()
        
        self.retrieve = ds.Retrieve(k=num_passages)
        self.generate_answer = ds.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)

        return ds.Prediction(context=context, answer=prediction.answer)

In [30]:
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = ds.evaluate.answer_exact_match(example, pred)
    answer_PM = ds.evaluate.answer_passage_match(example, pred)

    return answer_EM and answer_PM

In [31]:
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)
compiled_rag = teleprompter.compile(RAG(), trainset=trainset_hpqa)

 55%|█████▌    | 11/20 [00:00<00:00, 139.23it/s]


Bootstrapped 4 full traces after 12 examples in round 0.


In [32]:
compiled_rag


generate_answer = [1;35mChainOfThought[0m[1m([0m[1;35mGenerateAnswer[0m[1m([0mquestion, context -> answer
    [33minstructions[0m=[32m'Answer question with short factoid layers'[0m
    question = [1;35mField[0m[1m([0m[33mannotation[0m=[35mstr[0m [33mrequired[0m=[3;92mTrue[0m [33mjson_schema_extra[0m=[1m{[0m[32m'__dspy_field_type'[0m: [32m'input'[0m, [32m'prefix'[0m: [32m'Question:'[0m, [32m'desc'[0m: [32m'$[0m[32m{[0m[32mquestion[0m[32m}[0m[32m'[0m[1m}[0m[1m)[0m
    context = [1;35mField[0m[1m([0m[33mannotation[0m=[35mstr[0m [33mrequired[0m=[3;92mTrue[0m [33mjson_schema_extra[0m=[1m{[0m[32m'desc'[0m: [32m'may contain relevant facts'[0m, [32m'__dspy_field_type'[0m: [32m'input'[0m, [32m'prefix'[0m: [32m'Context:'[0m[1m}[0m[1m)[0m
    answer = [1;35mField[0m[1m([0m[33mannotation[0m=[35mstr[0m [33mrequired[0m=[3;92mTrue[0m [33mjson_schema_extra[0m=[1m{[0m[32m'desc'[0m: [32m'often between

In [33]:
question = "Who is the current president of the United States?"

pred = compiled_rag(question)

print(f"Question: {question}")
print(f'Context: {[c[:200] + "..." for c in pred.context]}')
print(f"Answer: {pred.answer}")

In [34]:
turbo.inspect_history(n=1)




Answer question with short factoid layers

---

Question: At My Window was released by which American singer-songwriter?
Answer: John Townes Van Zandt

Question: "Everything Has Changed" is a song from an album released under which record label ?
Answer: Big Machine Records

Question: The Victorians - Their Story In Pictures is a documentary series written by an author born in what year?
Answer: 1950

Question: Which Pakistani cricket umpire who won 3 consecutive ICC umpire of the year awards in 2009, 2010, and 2011 will be in the ICC World Twenty20?
Answer: Aleem Sarwar Dar

Question: Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers?
Answer: "Outfield of Dreams"

Question: Who is older, Aleksandr Danilovich Aleksandrov or Anatoly Fomenko?
Answer: Aleksandr Danilovich Aleksandrov

Question: The Organisation that allows a community to influence their operation or use and to enjoy the benefits aris

[32m'\n\n\nAnswer question with short factoid layers\n\n---\n\nQuestion: At My Window was released by which American singer-songwriter?\nAnswer: John Townes Van Zandt\n\nQuestion: "Everything Has Changed" is a song from an album released under which record label ?\nAnswer: Big Machine Records\n\nQuestion: The Victorians - Their Story In Pictures is a documentary series written by an author born in what year?\nAnswer: 1950\n\nQuestion: Which Pakistani cricket umpire who won 3 consecutive ICC umpire of the year awards in 2009, 2010, and 2011 will be in the ICC World Twenty20?\nAnswer: Aleem Sarwar Dar\n\nQuestion: Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers?\nAnswer: "Outfield of Dreams"\n\nQuestion: Who is older, Aleksandr Danilovich Aleksandrov or Anatoly Fomenko?\nAnswer: Aleksandr Danilovich Aleksandrov\n\nQuestion: The Organisation that allows a community to influence their operation or use

In [35]:
for name, parameter in compiled_rag.named_predictors():
    print(name)
    print(parameter.demos[0])
    print()

In [36]:
evaluate_on_hotpotqa = Evaluate(
    devset=devset_hpqa,
    num_threads=4,
    display_progress=True,
    display_table=5,
    metric=ds.evaluate.answer_exact_match,
)

evaluate_on_hotpotqa(compiled_rag)

Average Metric: 27 / 43  (62.8):  84%|████████▍ | 42/50 [00:00<00:00, 61.58it/s]

Average Metric: 28 / 50  (56.0): 100%|██████████| 50/50 [00:00<00:00, 59.74it/s]


Average Metric: 28 / 50  (56.0%)


  df.loc[:, metric_name] = df[metric_name].apply(


Unnamed: 0,question,example_answer,gold_titles,context,pred_answer,answer_exact_match
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{'Cangzhou', 'Qionghai'}","['Cangzhou | Cangzhou () is a prefecture-level city in eastern Hebei province, People\'s Republic of China. At the 2010 census, Cangzhou\'s built-up (""or metro"") area...",No,✔️ [True]
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League,"{'2017–18 Pittsburgh Penguins season', '2017 NHL Expansion Draft'}",['2017–18 Pittsburgh Penguins season | The 2017–18 Pittsburgh Penguins season will be the 51st season for the National Hockey League ice hockey team that was...,National Hockey League,✔️ [True]
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay...",Steve Yzerman,"{'2006–07 Detroit Red Wings season', 'Steve Yzerman'}","['Steve Yzerman | Stephen Gregory ""Steve"" Yzerman ( ; born May 9, 1965) is a Canadian retired professional ice hockey player and current general manager...",Steve Yzerman,✔️ [True]
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{'Crichton Collegiate Church', 'Crichton Castle'}","[""Crichton Collegiate Church | Crichton Collegiate Church is situated about 0.6 mi south west of the hamlet of Crichton in Midlothian, Scotland. Crichton itself is...",River Tyne,✔️ [True]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king?,King Alfred the Great,"{'Ealhswith', 'Æthelweard (son of Alfred)'}","[""Æthelweard of East Anglia | Æthelweard (died 854) was a 9th-century king of East Anglia, the long-lived Anglo-Saxon kingdom which today includes the English counties...",King Alfred the Great,✔️ [True]


[1;36m56.0[0m

In [37]:
def gold_passages_retrieved(example, pred, trace=None):
    gold_titles = set(map(ds.evaluate.normalize_text, example['gold_titles']))
    found_titles = set(map(ds.evaluate.normalize_text, [c.split("|")[0] for c in pred.context]))

    return gold_titles.issubset(found_titles)

compiled_rag_retrieval_score = evaluate_on_hotpotqa(compiled_rag, metric=gold_passages_retrieved)

Average Metric: 13 / 50  (26.0): 100%|██████████| 50/50 [00:00<00:00, 276.56it/s]


Average Metric: 13 / 50  (26.0%)


  df.loc[:, metric_name] = df[metric_name].apply(


Unnamed: 0,question,example_answer,gold_titles,context,pred_answer,gold_passages_retrieved
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{'Cangzhou', 'Qionghai'}","['Cangzhou | Cangzhou () is a prefecture-level city in eastern Hebei province, People\'s Republic of China. At the 2010 census, Cangzhou\'s built-up (""or metro"") area...",No,False
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League,"{'2017–18 Pittsburgh Penguins season', '2017 NHL Expansion Draft'}",['2017–18 Pittsburgh Penguins season | The 2017–18 Pittsburgh Penguins season will be the 51st season for the National Hockey League ice hockey team that was...,National Hockey League,✔️ [True]
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay...",Steve Yzerman,"{'2006–07 Detroit Red Wings season', 'Steve Yzerman'}","['Steve Yzerman | Stephen Gregory ""Steve"" Yzerman ( ; born May 9, 1965) is a Canadian retired professional ice hockey player and current general manager...",Steve Yzerman,✔️ [True]
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{'Crichton Collegiate Church', 'Crichton Castle'}","[""Crichton Collegiate Church | Crichton Collegiate Church is situated about 0.6 mi south west of the hamlet of Crichton in Midlothian, Scotland. Crichton itself is...",River Tyne,✔️ [True]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king?,King Alfred the Great,"{'Ealhswith', 'Æthelweard (son of Alfred)'}","[""Æthelweard of East Anglia | Æthelweard (died 854) was a 9th-century king of East Anglia, the long-lived Anglo-Saxon kingdom which today includes the English counties...",King Alfred the Great,False


## Program 2: Multi-hop

In [38]:
turbo = ds.OpenAI(model="gpt-3.5-turbo")
colbertv2_wiki17_abstracts = ds.ColBERTv2(
    url="http://20.102.90.50:2017/wiki17_abstracts"
)

ds.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts, log_openai_usage=True)

In [47]:
class GenerateSearchQuery(ds.Signature):
    """Write a simple search query that will help answer a complex question."""

    context = ds.InputField(desc="may contain relevant facts")
    question = ds.InputField()
    search_query = ds.OutputField(desc="search query to retrieve relevant facts")


In [48]:
class SimplifiedMultiHop(ds.Module):
    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()

        self.generate_query = [ds.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
        self.retrieve = ds.Retrieve(k=passages_per_hop)

        self.generate_answer = ds.ChainOfThought(GenerateAnswer)
        self.max_hops = max_hops

    def forward(self, question):
        context = []

        for hop in range(self.max_hops):
            query = self.generate_query[hop](context=context, question=question).search_query
            passages = self.retrieve(query).passages
            context = deduplicate(context + passages)

        prediction = self.generate_answer(context=context, question=question)

        return ds.Prediction(context=context, answer=prediction.answer)

In [49]:
simple_multi_hop = SimplifiedMultiHop()
question = "How many storeys are in the castle that David Gregory inherited?"

prediction = simple_multi_hop(question=question)

print(f"Question: {question}")
print(f"Answer: {prediction.answer}")
print(f"Context: {[c[:200] + '...' for c in prediction.context]}")


In [50]:
turbo.inspect_history(n=3)




Answer question with short factoid layers

---

Follow the following format.

Question: ${question}

Context: may contain relevant facts

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: often between 1 and 5 words

---

Question: How many storeys are in the castle that David Gregory inherited?

Context:
[1] «David Gregory (physician) | David Gregory (20 December 1625 – 1720) was a Scottish physician and inventor. His surname is sometimes spelt as Gregorie, the original Scottish spelling. He inherited Kinnairdy Castle in 1664. Three of his twenty-nine children became mathematics professors. He is credited with inventing a military cannon that Isaac Newton described as "being destructive to the human species". Copies and details of the model no longer exist. Gregory's use of a barometer to predict farming-related weather conditions led him to be accused of witchcraft by Presbyterian ministers from Aberdeen, although he was never convicted.»
[2] «

[32m'\n\n\nAnswer question with short factoid layers\n\n---\n\nFollow the following format.\n\nQuestion: $[0m[32m{[0m[32mquestion[0m[32m}[0m[32m\n\nContext: may contain relevant facts\n\nReasoning: Let\'s think step by step in order to $[0m[32m{[0m[32mproduce the answer[0m[32m}[0m[32m. We ...\n\nAnswer: often between 1 and 5 words\n\n---\n\nQuestion: How many storeys are in the castle that David Gregory inherited?\n\nContext:\n[0m[32m[[0m[32m1[0m[32m][0m[32m «David Gregory [0m[32m([0m[32mphysician[0m[32m)[0m[32m | David Gregory [0m[32m([0m[32m20 December 1625 – 1720[0m[32m)[0m[32m was a Scottish physician and inventor. His surname is sometimes spelt as Gregorie, the original Scottish spelling. He inherited Kinnairdy Castle in 1664. Three of his twenty-nine children became mathematics professors. He is credited with inventing a military cannon that Isaac Newton described as "being destructive to the human species". Copies and details of the model

In [51]:
def validate_context_and_answer_and_hops(example, pred, trace=None):
    if not ds.evaluate.answer_exact_match(example, pred):
        return False
    if not ds.evaluate.answer_passage_match(example, pred):
        return False

    hops = [example.question] + [
        outputs.query for *_, outputs in trace if "query" in outputs
    ]

    if max([len(h) for h in hops]) > 100:
        return False
    if any(
        ds.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8)
        for idx in range(2, len(hops))
    ):
        return False

    return True


In [52]:
teleprompter = BootstrapFewShot(metric=validate_context_and_answer_and_hops)
compiled_baleen = teleprompter.compile(
    SimplifiedMultiHop(),
    teacher=SimplifiedMultiHop(passages_per_hop=2),
    trainset=trainset_hpqa,
)


 35%|███▌      | 7/20 [00:28<00:52,  4.02s/it]


Bootstrapped 4 full traces after 8 examples in round 0.


In [53]:
uncompiled_evaluate = evaluate_on_hotpotqa(simple_multi_hop, metric=gold_passages_retrieved)
compiled_evaluate = evaluate_on_hotpotqa(compiled_baleen, metric=gold_passages_retrieved)

Average Metric: 27 / 50  (54.0): 100%|██████████| 50/50 [00:50<00:00,  1.02s/it]


Average Metric: 27 / 50  (54.0%)


  df.loc[:, metric_name] = df[metric_name].apply(


Unnamed: 0,question,example_answer,gold_titles,context,pred_answer,gold_passages_retrieved
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{'Cangzhou', 'Qionghai'}","['Cangzhou | Cangzhou () is a prefecture-level city in eastern Hebei province, People\'s Republic of China. At the 2010 census, Cangzhou\'s built-up (""or metro"") area...",No.,✔️ [True]
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League,"{'2017–18 Pittsburgh Penguins season', '2017 NHL Expansion Draft'}","['2017 NHL Entry Draft | The 2017 NHL Entry Draft was the 55th NHL Entry Draft. The draft was held from June 23–24, 2017, at...",NHL Expansion Draft,False
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay...",Steve Yzerman,"{'2006–07 Detroit Red Wings season', 'Steve Yzerman'}","['Steve Yzerman | Stephen Gregory ""Steve"" Yzerman ( ; born May 9, 1965) is a Canadian retired professional ice hockey player and current general manager...",Steve Yzerman,False
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{'Crichton Collegiate Church', 'Crichton Castle'}","[""Crichton Collegiate Church | Crichton Collegiate Church is situated about 0.6 mi south west of the hamlet of Crichton in Midlothian, Scotland. Crichton itself is...",River Tyne,✔️ [True]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king?,King Alfred the Great,"{'Ealhswith', 'Æthelweard (son of Alfred)'}","['Æthelweard (son of Alfred) | Æthelweard (d. 920 or 922) was the younger son of King Alfred the Great and Ealhswith.', 'Æthelweard, king of the...",Alfred the Great,False


Average Metric: 27 / 50  (54.0): 100%|██████████| 50/50 [00:59<00:00,  1.20s/it]


Average Metric: 27 / 50  (54.0%)


  df.loc[:, metric_name] = df[metric_name].apply(


Unnamed: 0,question,example_answer,gold_titles,context,pred_answer,gold_passages_retrieved
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{'Cangzhou', 'Qionghai'}","['Cangzhou | Cangzhou () is a prefecture-level city in eastern Hebei province, People\'s Republic of China. At the 2010 census, Cangzhou\'s built-up (""or metro"") area...",No,✔️ [True]
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League,"{'2017–18 Pittsburgh Penguins season', '2017 NHL Expansion Draft'}","[""Marc-André Fleury | Marc-André Fleury (born November 28, 1984) is a French-Canadian professional ice hockey goaltender playing for the Vegas Golden Knights of the National...",National Hockey League,✔️ [True]
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay...",Steve Yzerman,"{'2006–07 Detroit Red Wings season', 'Steve Yzerman'}","['Steve Yzerman | Stephen Gregory ""Steve"" Yzerman ( ; born May 9, 1965) is a Canadian retired professional ice hockey player and current general manager...",Steve Yzerman,False
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{'Crichton Collegiate Church', 'Crichton Castle'}","[""Crichton Collegiate Church | Crichton Collegiate Church is situated about 0.6 mi south west of the hamlet of Crichton in Midlothian, Scotland. Crichton itself is...",River Tyne,✔️ [True]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king?,King Alfred the Great,"{'Ealhswith', 'Æthelweard (son of Alfred)'}","['Æthelweard (son of Alfred) | Æthelweard (d. 920 or 922) was the younger son of King Alfred the Great and Ealhswith.', 'Æthelred the Unready |...",Alfred the Great,False


In [54]:
print(f'Uncompiled model retrieval score: {uncompiled_evaluate}')
print(f'Compiled model retrieval score: {compiled_evaluate}')