In [1]:
import os

# Cache settings for DSP/DSPy. They are assigned ahead of the `import dspy`
# further down this cell -- presumably the library reads them at import time
# (TODO confirm), so keep this ordering.
os.environ.update({
    'DSP_CACHEBOOL': 'false',
    "DSP_NOTEBOOK_CACHEDIR": os.path.join(os.getcwd(), 'cache'),
})

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.hotpotqa import HotPotQA
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from compiler.utils import load_api_key

# Presumably loads the API credentials from the secrets file (helper is
# project-local -- verify). Runs before any LM client is constructed.
# NOTE(review): absolute, machine-specific path; adjust per environment.
load_api_key('/mnt/ssd4/lm_compiler/secrets.toml')

# Retrieval model: a ColBERTv2 search endpoint.
# NOTE(review): hard-coded LAN address, only reachable on the original network.
colbert = dspy.ColBERTv2(url='http://192.168.1.18:8893/api/search')

# Language model client, capped at 1000 generated tokens per call.
gpt4o_mini = dspy.OpenAI(model='gpt-4o-mini', max_tokens=1000)

# Register both as the defaults used by every DSPy module in this notebook.
dspy.configure(rm=colbert, lm=gpt4o_mini)

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed
  from .autonotebook import tqdm as notebook_tqdm


/mnt/ssd4/lm_compiler/examples/HotPotQA/cache/compiler


In [2]:
# HotPotQA sample: 150 training examples (train_seed=1) and 200 dev examples
# (eval_seed=2023); no test split is requested.
dataset = HotPotQA(train_seed=1, train_size=150, eval_seed=2023, dev_size=200, test_size=0)

# Mark `question` as the only input field of every example. The training
# portion is split by position: items 0-99 for training, items 100-149 for
# validation.
trainset = [example.with_inputs('question') for example in dataset.train[:100]]
valset = [example.with_inputs('question') for example in dataset.train[100:150]]
devset = [example.with_inputs('question') for example in dataset.dev]

# Display one datapoint as the cell output.
devset[0]

Example({'question': 'Are both Cangzhou and Qionghai in the Hebei province of China?', 'answer': 'no', 'gold_titles': {'Cangzhou', 'Qionghai'}}) (input_keys={'question'})

In [2]:
from dsp.utils.utils import deduplicate

class BasicMH(dspy.Module):
    """Multi-hop question answering: query, retrieve, repeat, then answer.

    On every hop a chain-of-thought predictor writes a search query from the
    question and the passages gathered so far; the retriever is called with
    that query and the returned passages are merged into the running context.
    A final chain-of-thought predictor answers from the accumulated context.

    Args:
        passages_per_hop: Number of passages requested from the retriever on
            each hop.
        max_hops: Number of query/retrieve rounds. One query predictor is
            created per hop. Defaults to 2, the value that was previously
            hard-coded in both ``__init__`` and ``forward``.
    """

    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        # One predictor per hop, so each hop is a separately tunable predictor.
        self.generate_query = [
            dspy.ChainOfThought("context, question -> search_query") for _ in range(max_hops)
        ]
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question):
        """Answer ``question`` after one retrieval round per query predictor.

        Args:
            question: The question text.

        Returns:
            The answer predictor's prediction, copied with the final
            ``context`` (list of retrieved passages) attached.
        """
        context = []

        # Iterating over the predictor list (instead of a second literal hop
        # count) keeps the number of hops in sync with what __init__ built.
        for query_predictor in self.generate_query:
            search_query = query_predictor(context=context, question=question).search_query
            passages = self.retrieve(search_query).passages
            context = deduplicate(context + passages)

        return self.generate_answer(context=context, question=question).copy(context=context)

In [3]:
# Build the program with 2 passages per hop (the constructor default is 3).
agent = BasicMH(passages_per_hop=2)

In [4]:
import dsp

def answer_f1(example, pred, trace=None):
    """Metric: F1 of the predicted answer against the gold answer(s), via ``dsp.F1``.

    Args:
        example: Gold example; ``example.answer`` is either a single string or
            a list of acceptable answers.
        pred: Prediction; ``pred.answer`` is the predicted answer.
        trace: Unused; kept so the function matches the signature the
            evaluator/optimizer calls metrics with.

    Returns:
        Whatever ``dsp.F1`` returns for the prediction and the gold answers.

    Raises:
        AssertionError: If ``example.answer`` is neither a ``str`` nor a ``list``.
    """
    gold = example.answer
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str / list.
    assert isinstance(gold, (str, list)), (
        f"example.answer must be a str or a list, got {type(gold).__name__}"
    )

    # dsp.F1 is given a list of gold answers; wrap a lone string in one.
    references = [gold] if isinstance(gold, str) else gold
    return dsp.F1(pred.answer, references)

In [6]:
# Evaluator over the whole devset (200 examples, per dev_size above), scored
# with answer_f1. Runs on 8 threads, shows a progress bar and a 5-row table.
config = {"num_threads": 8, "display_progress": True, "display_table": 5}
evaluate = Evaluate(devset=devset, metric=answer_f1, **config)

In [7]:
# Baseline: score the agent with the evaluator defined above (answer_f1 on devset).
evaluate(agent)

 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb
 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb
 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to

Unnamed: 0,question,example_answer,gold_titles,rationale,pred_answer,context,answer_f1
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{'Cangzhou', 'Qionghai'}","determine the locations of Cangzhou and Qionghai. First, we know from the context that Cangzhou is a prefecture-level city in eastern Hebei province, People's Republic...","No, Cangzhou is in Hebei province, while Qionghai is in Hainan province.","['Cangzhou | ""Cangzhou Cangzhou () is a prefecture-level city in eastern Hebei province, People\'s Republic of China. At the 2010 census, Cangzhou\'s built-up (""""or metro"""")...",✔️ [0.15384615384615385]
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League,"{'2017 NHL Expansion Draft', '2017–18 Pittsburgh Penguins season'}",identify the organization responsible for conducting the draft. The context mentions that the 2017 NHL Expansion Draft was conducted by the National Hockey League (NHL)...,National Hockey League (NHL),"['""Marc-André Fleury"" | ""route to a Stanley Cup championship victory, defeating the Nashville Predators in six games. The win was the third Stanley Cup of...",✔️ [0.8571428571428571]
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay...",Steve Yzerman,"{'Steve Yzerman', '2006–07 Detroit Red Wings season'}",identify the retired professional ice hockey player who was a significant figure for the Detroit Red Wings and later became the general manager of the...,Steve Yzerman,"['""Julien BriseBois"" | ""Julien BriseBois Julien BriseBois (born January 24, 1977) is a Canadian ice hockey executive. He is the general manager for the Tampa...",✔️ [1.0]
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{'Crichton Collegiate Church', 'Crichton Castle'}","determine the river that is near the Crichton Collegiate Church. We know that Crichton Collegiate Church is located in Midlothian, Scotland, and is situated near...",River Tyne,"['""Crichton Collegiate Church"" | ""Crichton Collegiate Church Crichton Collegiate Church is situated about south west of the hamlet of Crichton in Midlothian, Scotland. Crichton itself...",✔️ [1.0]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king?,King Alfred the Great,"{'Ealhswith', 'Æthelweard (son of Alfred)'}",determine the answer. We know from the context that Æthelweard was the younger son of King Alfred the Great and Ealhswith. Since Ealhswith is mentioned...,King Alfred the Great,"['""Æthelheard, king of the Hwicce"" | ""Æthelheard, king of the Hwicce Æthelheard, King of Hwicce (an Anglo Saxon kingdom in the English midlands) jointly with...",✔️ [1.0]


46.51

In [7]:
import dspy.evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, MIPROv2
# LM client used both to propose prompts and, after the configure call below,
# as the globally configured LM.
# NOTE(review): `prmopt_model` looks like a typo for `prompt_model`; the name
# is left unchanged because other cells may reference it.
prmopt_model = dspy.OpenAI(model='gpt-4o-mini', max_tokens=1000)
dspy.settings.configure(lm=prmopt_model)
# NOTE(review): despite the variable name, this is a MIPROv2 optimizer, not
# BootstrapFewShotWithRandomSearch.
# NOTE(review): optimization targets answer_exact_match, while the evaluation
# cells score with answer_f1 -- confirm the mismatch is intentional.
bootstrap_optimizer = MIPROv2(
    prompt_model=prmopt_model,
    metric=dspy.evaluate.answer_exact_match,
    num_candidates=10,
    num_threads=8,
)
# Compile `agent` using trainset for bootstrapping and valset for scoring the
# trials. The result is the program that later cells evaluate and save.
# NOTE(review): `optimized_matplot` is a misleading name (nothing here is a
# plot); it is kept because later cells use it.
optimized_matplot = bootstrap_optimizer.compile(
    agent,
    trainset=trainset,
    max_bootstrapped_demos=20,
    max_labeled_demos=4,
    num_trials=25,
    valset=valset,
    requires_permission_to_run=False,
)

Beginning MIPROv2 optimization process...

==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
These will be used for as few-shot examples candidates for our program and for creating instructions.

Bootstrapping N=20 sets of demonstrations...
Bootstrapping set 1/20
Bootstrapping set 2/20
Bootstrapping set 3/20


 43%|████▎     | 43/100 [03:32<04:42,  4.95s/it]


Bootstrapped 20 full traces after 44 examples in round 0.
Bootstrapping set 4/20


 31%|███       | 31/100 [02:33<05:42,  4.96s/it]


Bootstrapped 13 full traces after 32 examples in round 0.
Bootstrapping set 5/20


  6%|▌         | 6/100 [00:27<07:04,  4.52s/it]


Bootstrapped 5 full traces after 7 examples in round 0.
Bootstrapping set 6/20


  3%|▎         | 3/100 [00:18<09:49,  6.08s/it]


Bootstrapped 2 full traces after 4 examples in round 0.
Bootstrapping set 7/20


 15%|█▌        | 15/100 [01:06<06:16,  4.42s/it]


Bootstrapped 8 full traces after 16 examples in round 0.
Bootstrapping set 8/20


 14%|█▍        | 14/100 [01:14<07:37,  5.32s/it]


Bootstrapped 8 full traces after 15 examples in round 0.
Bootstrapping set 9/20


 46%|████▌     | 46/100 [03:43<04:22,  4.86s/it]


Bootstrapped 20 full traces after 47 examples in round 0.
Bootstrapping set 10/20


 44%|████▍     | 44/100 [03:13<04:06,  4.39s/it]


Bootstrapped 19 full traces after 45 examples in round 0.
Bootstrapping set 11/20


 20%|██        | 20/100 [01:39<06:39,  5.00s/it]


Bootstrapped 11 full traces after 21 examples in round 0.
Bootstrapping set 12/20


 19%|█▉        | 19/100 [01:28<06:16,  4.65s/it]


Bootstrapped 8 full traces after 20 examples in round 0.
Bootstrapping set 13/20


 26%|██▌       | 26/100 [02:01<05:45,  4.67s/it]


Bootstrapped 15 full traces after 27 examples in round 0.
Bootstrapping set 14/20


 40%|████      | 40/100 [03:07<04:40,  4.68s/it]


Bootstrapped 19 full traces after 41 examples in round 0.
Bootstrapping set 15/20


 41%|████      | 41/100 [03:20<04:48,  4.90s/it]


Bootstrapped 15 full traces after 42 examples in round 0.
Bootstrapping set 16/20


 31%|███       | 31/100 [02:46<06:11,  5.39s/it]


Bootstrapped 16 full traces after 32 examples in round 0.
Bootstrapping set 17/20


 23%|██▎       | 23/100 [01:55<06:27,  5.03s/it]


Bootstrapped 9 full traces after 24 examples in round 0.
Bootstrapping set 18/20


  7%|▋         | 7/100 [00:33<07:25,  4.79s/it]


Bootstrapped 4 full traces after 8 examples in round 0.
Bootstrapping set 19/20


 20%|██        | 20/100 [01:38<06:34,  4.93s/it]


Bootstrapped 7 full traces after 21 examples in round 0.
Bootstrapping set 20/20


 25%|██▌       | 25/100 [02:01<06:04,  4.86s/it]


Bootstrapped 12 full traces after 26 examples in round 0.

==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
In this step, by default we will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.

Proposing instructions...

Proposed Instructions for Predictor 0:

0: Given the fields `context`, `question`, produce the fields `search_query`.

1: Propose a detailed instruction that guides the Language Model to generate a search query based on the provided context and question, ensuring clarity and relevance in the output. The instruction should encourage the model to think critically about the relationship between the context and the question before producing the search query.

PROPOSED INSTRUCTION: 

"Using the provided `context` and `question`, carefully analyze the relationship between the two. Your goal is to produce a `search_query` that accurately reflects the essentia

Average Metric: 12 / 50  (24.0): 100%|██████████| 50/50 [00:32<00:00,  1.55it/s]


Default program score: 24.0

==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
In this step, we will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination. Bayesian Optimization will be used for this search process.

== Minibatch Trial 1 / 25 ==


Average Metric: 7 / 25  (28.0): 100%|██████████| 25/25 [00:23<00:00,  1.09it/s]


Score: 28.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 12', 'Predictor 1: Few-Shot Set 6', 'Predictor 2: Instruction 8', 'Predictor 2: Few-Shot Set 4', 'Predictor 3: Instruction 3', 'Predictor 3: Few-Shot Set 13'].


== Minibatch Trial 2 / 25 ==


Average Metric: 10 / 25  (40.0): 100%|██████████| 25/25 [00:18<00:00,  1.32it/s]


Score: 40.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 9', 'Predictor 1: Few-Shot Set 7', 'Predictor 2: Instruction 0', 'Predictor 2: Few-Shot Set 9', 'Predictor 3: Instruction 10', 'Predictor 3: Few-Shot Set 15'].


== Minibatch Trial 3 / 25 ==


Average Metric: 3 / 25  (12.0): 100%|██████████| 25/25 [00:17<00:00,  1.45it/s]


Score: 12.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 6', 'Predictor 1: Few-Shot Set 17', 'Predictor 2: Instruction 18', 'Predictor 2: Few-Shot Set 9', 'Predictor 3: Instruction 2', 'Predictor 3: Few-Shot Set 16'].


== Minibatch Trial 4 / 25 ==


Average Metric: 5 / 25  (20.0): 100%|██████████| 25/25 [00:21<00:00,  1.18it/s]


Score: 20.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 8', 'Predictor 2: Instruction 1', 'Predictor 2: Few-Shot Set 13', 'Predictor 3: Instruction 17', 'Predictor 3: Few-Shot Set 3'].


== Minibatch Trial 5 / 25 ==


Average Metric: 6 / 25  (24.0): 100%|██████████| 25/25 [00:17<00:00,  1.45it/s]


Score: 24.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 10', 'Predictor 1: Few-Shot Set 3', 'Predictor 2: Instruction 2', 'Predictor 2: Few-Shot Set 12', 'Predictor 3: Instruction 13', 'Predictor 3: Few-Shot Set 5'].


== Minibatch Trial 6 / 25 ==


Average Metric: 3 / 25  (12.0): 100%|██████████| 25/25 [00:21<00:00,  1.14it/s]


Score: 12.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 9', 'Predictor 2: Instruction 18', 'Predictor 2: Few-Shot Set 12', 'Predictor 3: Instruction 6', 'Predictor 3: Few-Shot Set 0'].


== Minibatch Trial 7 / 25 ==


Average Metric: 8 / 25  (32.0): 100%|██████████| 25/25 [00:23<00:00,  1.04it/s]


Score: 32.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 6', 'Predictor 2: Instruction 16', 'Predictor 2: Few-Shot Set 10', 'Predictor 3: Instruction 3', 'Predictor 3: Few-Shot Set 12'].


== Minibatch Trial 8 / 25 ==


Average Metric: 6 / 25  (24.0): 100%|██████████| 25/25 [00:20<00:00,  1.23it/s]


Score: 24.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 12', 'Predictor 1: Few-Shot Set 5', 'Predictor 2: Instruction 11', 'Predictor 2: Few-Shot Set 13', 'Predictor 3: Instruction 12', 'Predictor 3: Few-Shot Set 11'].


== Minibatch Trial 9 / 25 ==


Average Metric: 3 / 25  (12.0): 100%|██████████| 25/25 [00:26<00:00,  1.07s/it]


Score: 12.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 19', 'Predictor 2: Instruction 12', 'Predictor 2: Few-Shot Set 9', 'Predictor 3: Instruction 1', 'Predictor 3: Few-Shot Set 0'].


== Minibatch Trial 10 / 25 ==


Average Metric: 6 / 25  (24.0): 100%|██████████| 25/25 [00:20<00:00,  1.24it/s]


Score: 24.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 8', 'Predictor 2: Instruction 18', 'Predictor 2: Few-Shot Set 15', 'Predictor 3: Instruction 7', 'Predictor 3: Few-Shot Set 6'].


===== Full Eval 1 =====
Doing full eval on next top averaging program (Avg Score: 40.0) so far from mini-batch trials...


Average Metric: 17 / 50  (34.0): 100%|██████████| 50/50 [00:35<00:00,  1.43it/s]


[92mBest full eval score so far![0m Score: 34.0


== Minibatch Trial 11 / 25 ==


Average Metric: 8 / 25  (32.0): 100%|██████████| 25/25 [00:21<00:00,  1.14it/s]


Score: 32.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 13', 'Predictor 1: Few-Shot Set 14', 'Predictor 2: Instruction 10', 'Predictor 2: Few-Shot Set 3', 'Predictor 3: Instruction 10', 'Predictor 3: Few-Shot Set 15'].


== Minibatch Trial 12 / 25 ==


Average Metric: 9 / 25  (36.0): 100%|██████████| 25/25 [00:18<00:00,  1.34it/s]


Score: 36.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 15', 'Predictor 1: Few-Shot Set 6', 'Predictor 2: Instruction 15', 'Predictor 2: Few-Shot Set 2', 'Predictor 3: Instruction 3', 'Predictor 3: Few-Shot Set 12'].


== Minibatch Trial 13 / 25 ==


Average Metric: 9 / 25  (36.0): 100%|██████████| 25/25 [00:18<00:00,  1.36it/s]


Score: 36.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 9', 'Predictor 1: Few-Shot Set 4', 'Predictor 2: Instruction 14', 'Predictor 2: Few-Shot Set 8', 'Predictor 3: Instruction 10', 'Predictor 3: Few-Shot Set 15'].


== Minibatch Trial 14 / 25 ==


Average Metric: 3 / 25  (12.0): 100%|██████████| 25/25 [00:19<00:00,  1.26it/s]


Score: 12.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 9', 'Predictor 1: Few-Shot Set 7', 'Predictor 2: Instruction 0', 'Predictor 2: Few-Shot Set 6', 'Predictor 3: Instruction 19', 'Predictor 3: Few-Shot Set 12'].


== Minibatch Trial 15 / 25 ==


Average Metric: 7 / 25  (28.0): 100%|██████████| 25/25 [00:27<00:00,  1.10s/it]


Score: 28.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 15', 'Predictor 1: Few-Shot Set 7', 'Predictor 2: Instruction 13', 'Predictor 2: Few-Shot Set 11', 'Predictor 3: Instruction 15', 'Predictor 3: Few-Shot Set 7'].


== Minibatch Trial 16 / 25 ==


Average Metric: 7 / 25  (28.0): 100%|██████████| 25/25 [00:23<00:00,  1.05it/s]


Score: 28.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 7', 'Predictor 1: Few-Shot Set 4', 'Predictor 2: Instruction 15', 'Predictor 2: Few-Shot Set 2', 'Predictor 3: Instruction 16', 'Predictor 3: Few-Shot Set 1'].


== Minibatch Trial 17 / 25 ==


Average Metric: 9 / 25  (36.0): 100%|██████████| 25/25 [00:22<00:00,  1.09it/s]


Score: 36.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 15', 'Predictor 1: Few-Shot Set 6', 'Predictor 2: Instruction 10', 'Predictor 2: Few-Shot Set 2', 'Predictor 3: Instruction 18', 'Predictor 3: Few-Shot Set 9'].


== Minibatch Trial 18 / 25 ==


Average Metric: 7 / 25  (28.0): 100%|██████████| 25/25 [00:20<00:00,  1.25it/s]


Score: 28.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 9', 'Predictor 1: Few-Shot Set 0', 'Predictor 2: Instruction 0', 'Predictor 2: Few-Shot Set 9', 'Predictor 3: Instruction 5', 'Predictor 3: Few-Shot Set 4'].


== Minibatch Trial 19 / 25 ==


Average Metric: 6 / 25  (24.0): 100%|██████████| 25/25 [00:22<00:00,  1.13it/s]


Score: 24.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 13', 'Predictor 2: Instruction 0', 'Predictor 2: Few-Shot Set 9', 'Predictor 3: Instruction 10', 'Predictor 3: Few-Shot Set 10'].


== Minibatch Trial 20 / 25 ==


Average Metric: 9 / 25  (36.0): 100%|██████████| 25/25 [00:22<00:00,  1.09it/s]


Score: 36.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 10', 'Predictor 2: Instruction 9', 'Predictor 2: Few-Shot Set 1', 'Predictor 3: Instruction 9', 'Predictor 3: Few-Shot Set 15'].


===== Full Eval 2 =====
Doing full eval on next top averaging program (Avg Score: 36.0) so far from mini-batch trials...


Average Metric: 15 / 50  (30.0): 100%|██████████| 50/50 [00:34<00:00,  1.44it/s]


Full eval score: 30.0
Best full eval score so far: 34.0


== Minibatch Trial 21 / 25 ==


Average Metric: 3 / 25  (12.0): 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]


Score: 12.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 15', 'Predictor 1: Few-Shot Set 15', 'Predictor 2: Instruction 0', 'Predictor 2: Few-Shot Set 19', 'Predictor 3: Instruction 17', 'Predictor 3: Few-Shot Set 14'].


== Minibatch Trial 22 / 25 ==


Average Metric: 5 / 25  (20.0): 100%|██████████| 25/25 [00:18<00:00,  1.32it/s]


Score: 20.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 9', 'Predictor 1: Few-Shot Set 2', 'Predictor 2: Instruction 4', 'Predictor 2: Few-Shot Set 8', 'Predictor 3: Instruction 10', 'Predictor 3: Few-Shot Set 14'].


== Minibatch Trial 23 / 25 ==


Average Metric: 7 / 25  (28.0): 100%|██████████| 25/25 [00:17<00:00,  1.41it/s]


Score: 28.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 11', 'Predictor 1: Few-Shot Set 6', 'Predictor 2: Instruction 15', 'Predictor 2: Few-Shot Set 9', 'Predictor 3: Instruction 3', 'Predictor 3: Few-Shot Set 18'].


== Minibatch Trial 24 / 25 ==


Average Metric: 8 / 25  (32.0): 100%|██████████| 25/25 [00:24<00:00,  1.03it/s]


Score: 32.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 5', 'Predictor 1: Few-Shot Set 8', 'Predictor 2: Instruction 14', 'Predictor 2: Few-Shot Set 8', 'Predictor 3: Instruction 15', 'Predictor 3: Few-Shot Set 15'].


== Minibatch Trial 25 / 25 ==


Average Metric: 7 / 25  (28.0): 100%|██████████| 25/25 [00:19<00:00,  1.26it/s]


Score: 28.0 on minibatch of size 25 with parameters ['Predictor 1: Instruction 11', 'Predictor 1: Few-Shot Set 7', 'Predictor 2: Instruction 14', 'Predictor 2: Few-Shot Set 15', 'Predictor 3: Instruction 10', 'Predictor 3: Few-Shot Set 15'].


===== Full Eval 3 =====
Doing full eval on next top averaging program (Avg Score: 36.0) so far from mini-batch trials...


Average Metric: 16 / 50  (32.0): 100%|██████████| 50/50 [00:38<00:00,  1.30it/s]

Full eval score: 32.0
Best full eval score so far: 34.0







In [9]:
# Score the MIPROv2-compiled program with the same evaluator as the baseline.
evaluate(optimized_matplot)

  0%|          | 0/50 [00:00<?, ?it/s]

Average Metric: 29.734819420345733 / 50  (59.5): 100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


Unnamed: 0,question,example_answer,gold_titles,rationale,pred_answer,context,answer_f1
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{'Cangzhou', 'Qionghai'}","determine whether both Cangzhou and Qionghai are located in the Hebei province of China. First, we analyze the context provided for Cangzhou. The text states...",no,"['Cangzhou | ""Cangzhou Cangzhou () is a prefecture-level city in eastern Hebei province, People\'s Republic of China. At the 2010 census, Cangzhou\'s built-up (""""or metro"""")...",✔️ [1.0]
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League,"{'2017–18 Pittsburgh Penguins season', '2017 NHL Expansion Draft'}","determine who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights. The context mentions the ""2017 NHL Expansion Draft,"" which...",National Hockey League (NHL),"['""Marc-André Fleury"" | ""route to a Stanley Cup championship victory, defeating the Nashville Predators in six games. The win was the third Stanley Cup of...",✔️ [0.8571428571428571]
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay...",Steve Yzerman,"{'2006–07 Detroit Red Wings season', 'Steve Yzerman'}","determine the answer to the question. The context mentions Steve Yzerman, who is a Canadian retired professional ice hockey player and the current general manager...",Steve Yzerman,"['""2010–11 Tampa Bay Lightning season"" | ""2010–11 Tampa Bay Lightning season The 2010–11 Tampa Bay Lightning season was the 19th season for the Lightning franchise...",✔️ [1.0]
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{'Crichton Castle', 'Crichton Collegiate Church'}",determine the answer to the question about the river near Crichton Collegiate Church. The context provided does not mention any specific river associated with the...,The context does not provide information about a river near Crichton Collegiate Church.,"['""Crichton Collegiate Church"" | ""Crichton Collegiate Church Crichton Collegiate Church is situated about south west of the hamlet of Crichton in Midlothian, Scotland. Crichton itself...",✔️ [0.15384615384615385]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king?,King Alfred the Great,"{'Æthelweard (son of Alfred)', 'Ealhswith'}",determine the answer to the question regarding Ealhswith's son Æthelweard. The context provided mentions that Æthelweard was the younger son of King Alfred the Great...,King Alfred the Great,"['""Æthelheard, king of the Hwicce"" | ""Æthelheard, king of the Hwicce Æthelheard, King of Hwicce (an Anglo Saxon kingdom in the English midlands) jointly with...",✔️ [1.0]


59.47

In [10]:
# Save the compiled program to 'optimized_qa.dspy'; a later cell reads this
# file back with `agent.load`.
optimized_matplot.save('optimized_qa.dspy')

[('retrieve', <dspy.retrieve.retrieve.Retrieve object at 0x7fd1fdbc2030>), ('generate_query[0]', Predict(StringSignature(context, question -> rationale, search_query
    instructions='Given the fields `context`, `question`, produce the fields `search_query`.'
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the search_query}. We ...', '__dspy_field_type': 'output'})
    search_query = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Search Query:', 'desc': '${search_query}'})
))), ('generate_query[1]', Predict(StringSignature(context, question -> rationale,

In [5]:
# Load the saved file into the existing `agent` object (the result is not
# reassigned, so `agent` itself is what later cells evaluate).
# NOTE(review): this cell's execution count (In [5]) is out of sequence with
# its neighbours -- re-run the notebook top to bottom to confirm it still
# reproduces.
agent.load('optimized_qa.dspy')

In [10]:
# Re-evaluate `agent` and keep the per-example outputs as well; a later cell
# unpacks `outputs` as (example, prediction, score) triples.
score, outputs = evaluate(agent, return_outputs=True)

  0%|          | 0/200 [00:00<?, ?it/s]

Average Metric: 112.32022976163844 / 200  (56.2): 100%|██████████| 200/200 [02:33<00:00,  1.31it/s]


Unnamed: 0,question,example_answer,gold_titles,rationale,pred_answer,context,answer_f1
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,"{'Cangzhou', 'Qionghai'}","determine whether both Cangzhou and Qionghai are located in the Hebei province of China. First, we analyze the context provided for Cangzhou. The text states...",no,"['Cangzhou | ""Cangzhou Cangzhou () is a prefecture-level city in eastern Hebei province, People\'s Republic of China. At the 2010 census, Cangzhou\'s built-up (""""or metro"""")...",✔️ [1.0]
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League,"{'2017 NHL Expansion Draft', '2017–18 Pittsburgh Penguins season'}","determine who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights. The context mentions the ""2017 NHL Expansion Draft,"" which...",NHL,"['""Marc-André Fleury"" | ""route to a Stanley Cup championship victory, defeating the Nashville Predators in six games. The win was the third Stanley Cup of...",
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay...",Steve Yzerman,"{'Steve Yzerman', '2006–07 Detroit Red Wings season'}","determine the answer to the question. The context mentions Steve Yzerman, who is a Canadian retired professional ice hockey player and the current general manager...",Steve Yzerman,"['""2010–11 Tampa Bay Lightning season"" | ""2010–11 Tampa Bay Lightning season The 2010–11 Tampa Bay Lightning season was the 19th season for the Lightning franchise...",✔️ [1.0]
3,What river is near the Crichton Collegiate Church?,the River Tyne,"{'Crichton Collegiate Church', 'Crichton Castle'}",determine the answer to the question regarding the river near Crichton Collegiate Church. The context provided does not mention any specific river associated with the...,The context does not provide information about a river near Crichton Collegiate Church.,"['""Crichton Collegiate Church"" | ""Crichton Collegiate Church Crichton Collegiate Church is situated about south west of the hamlet of Crichton in Midlothian, Scotland. Crichton itself...",✔️ [0.15384615384615385]
4,In the 10th Century A.D. Ealhswith had a son called Æthelweard by which English king?,King Alfred the Great,"{'Ealhswith', 'Æthelweard (son of Alfred)'}","determine the answer to the question regarding Ealhswith's son Æthelweard. The context provides information about Æthelweard, stating that he was the younger son of King...",King Alfred the Great,"['""Æthelheard, king of the Hwicce"" | ""Æthelheard, king of the Hwicce Æthelheard, King of Hwicce (an Anglo Saxon kingdom in the English midlands) jointly with...",✔️ [1.0]


In [16]:
import json
import pandas as pd

In [15]:
def merge_dicts(d1, d2) -> dict:
    """Flatten two mappings into one dict, disambiguating the keys they share.

    A key found in both inputs appears twice in the result: as
    ``example_<key>`` carrying the value from ``d1`` and as ``pred_<key>``
    carrying the value from ``d2``. Keys found in only one input are copied
    under their own name. Entries from ``d1`` come first, then those from ``d2``.

    Args:
        d1: First mapping (the gold example, in this notebook).
        d2: Second mapping (the prediction, in this notebook).

    Returns:
        A new dict; neither input is modified.
    """
    from_d1 = {
        (f"example_{key}" if key in d2 else key): value
        for key, value in d1.items()
    }
    from_d2 = {
        (f"pred_{key}" if key in d1 else key): value
        for key, value in d2.items()
    }
    # Unpacking order matches the original two loops: d1's entries first, and
    # d2's entries win if the final key names ever coincide.
    return {**from_d1, **from_d2}

In [17]:
# One flat record per evaluated example: the merged example/prediction fields
# plus the per-example metric value under "correct".
data = [
    {**merge_dicts(example, prediction), "correct": score}
    for example, prediction, score in outputs
]
result_df = pd.DataFrame(data)


In [18]:
# Write the per-example results to qa_results.csv, omitting the DataFrame index.
result_df.to_csv('qa_results.csv', index=False)

In [19]:
# Ad-hoc check: run the agent on a single question and display the prediction.
# NOTE(review): "documentry" is misspelled in the question; left as-is because
# the string is sent to the model.
agent(question="Are Walt Disney and Sacro GRA both documentry films?")

Prediction(
    rationale='determine if both "Walt Disney" and "Sacro GRA" are documentary films. First, we analyze the context for "Sacro GRA," which is described as a documentary film directed by Gianfranco Rosi that won the Golden Lion at the Venice Film Festival. This confirms that "Sacro GRA" is indeed a documentary. Next, we look at the context for "Walt Disney," which is also identified as a documentary film created by PBS for the "American Experience" program, focusing on the life and legacy of Walt Disney. Since both films are explicitly categorized as documentaries in their respective contexts, we can conclude that the answer to the question is yes.',
    answer='yes',
    context=['"Sacro GRA" | "Sacro GRA Sacro GRA (, Italian for ""Holy GRA"") is a 2013 Italian documentary film directed by Gianfranco Rosi. It won the Golden Lion at the 70th Venice International Film Festival. It was the first documentary film to win the award at the Venice Festival. The film depicts life al