In [None]:
%load_ext autoreload
%autoreload 2

# %set_env CUDA_VISIBLE_DEVICES=7
# import sys; sys.path.append('/future/u/okhattab/repos/public/stanfordnlp/dspy')

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.hotpotqa import HotPotQA
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, BootstrapFinetune

### 1) Configure the default LM and retriever

In [None]:
# Ports handed to the HFClientTGI client below.
# NOTE(review): presumably one Llama-2 server is listening on each port — confirm.
ports = list(range(7140, 7146))

# Language model client: Llama-2-13b-chat with max_tokens=150.
llamaChat = dspy.HFClientTGI(
    model="meta-llama/Llama-2-13b-chat-hf",
    port=ports,
    max_tokens=150,
)

# Retrieval client pointed at the ColBERTv2 endpoint URL.
colbertv2 = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

# Register both clients as the DSPy-wide defaults (lm and rm).
dspy.settings.configure(lm=llamaChat, rm=colbertv2)

### 2) Load a small sample of HotPotQA data

In [None]:
# 200 training and 1000 dev examples; no test split is requested (test_size=0).
dataset = HotPotQA(train_seed=1, train_size=200, eval_seed=2023, dev_size=1000, test_size=0)

# Mark `question` as the only input field on every example of each split.
trainset, devset, testset = (
    [example.with_inputs('question') for example in split]
    for split in (dataset.train, dataset.dev, dataset.test)
)

tuple(map(len, (trainset, devset, testset)))

(200, 1000, 0)

In [None]:
# Display the first training example to show its fields and its input keys.
trainset[0]

Example({'question': 'At My Window was released by which American singer-songwriter?', 'answer': 'John Townes Van Zandt'}) (input_keys={'question'})

### 3) Define a simple multi-hop program

In [None]:
from dsp.utils.utils import deduplicate

class BasicMH(dspy.Module):
    """Basic multi-hop question-answering program.

    Each hop generates a search query from the question and the context
    gathered so far, retrieves passages for that query, and merges them
    (de-duplicated) into the context. After the last hop, an answer is
    generated from the accumulated context.

    Args:
        passages_per_hop: Passed as ``k`` to ``dspy.Retrieve``.
        max_hops: Number of hops. One query-generation predictor is created
            per hop. Defaults to 2, the value that was previously hard-coded.
    """

    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        # One separate query generator per hop.
        self.generate_query = [dspy.ChainOfThought("context, question -> search_query") for _ in range(max_hops)]
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question):
        """Answer ``question`` and return the prediction with the gathered ``context`` attached."""
        context = []

        # Iterate over the predictors themselves so the hop count is defined
        # in exactly one place (the length of `self.generate_query`).
        for generate_query in self.generate_query:
            search_query = generate_query(context=context, question=question).search_query
            passages = self.retrieve(search_query).passages
            context = deduplicate(context + passages)

        return self.generate_answer(context=context, question=question).copy(context=context)

### 4) Compile the program with `Llama2-13b-chat`

In [None]:
# When True, the following cell re-runs BootstrapFewShotWithRandomSearch; when False,
# the cell after it loads four programs from `multihop_llama213b_{idx}.json` instead.
RECOMPILE_INTO_LLAMA_FROM_SCRATCH = False
# Thread count passed to the teleprompter and to the evaluator.
NUM_THREADS = 24

# Metric used for both compilation and evaluation.
metric_EM = dspy.evaluate.answer_exact_match

In [None]:
if RECOMPILE_INTO_LLAMA_FROM_SCRATCH:
    # Bootstrap on the first 50 training examples; use the remaining 150 as the validation set.
    tp = BootstrapFewShotWithRandomSearch(metric=metric_EM, max_bootstrapped_demos=2, num_threads=NUM_THREADS)
    basicmh_bs = tp.compile(BasicMH(), trainset=trainset[:50], valset=trainset[50:200])

    # Keep the program (last element of each entry) from the first four candidates.
    # NOTE(review): presumably `candidate_programs` is sorted best-first — confirm.
    ensemble = [prog for *_, prog in basicmh_bs.candidate_programs[:4]]

    # Saving is disabled, so this loop is a no-op. Uncomment the save call to write the
    # JSON files that the load cell reads when RECOMPILE_INTO_LLAMA_FROM_SCRATCH is False.
    for idx, prog in enumerate(ensemble):
        # prog.save(f'multihop_llama213b_{idx}.json')
        pass

In [None]:
if not RECOMPILE_INTO_LLAMA_FROM_SCRATCH:
    # Rebuild the four-program ensemble from previously saved state files.
    ensemble = []

    for idx in range(4):
        # Fresh program instance, then load its saved state from the JSON file for this index.
        prog = BasicMH()
        prog.load(f'multihop_llama213b_{idx}.json')
        ensemble.append(prog)

In [None]:
# Use the first ensemble member as the Llama program.
llama_program = ensemble[0]

# Reusable evaluator over (up to) the first 1000 dev examples with the exact-match metric.
# Later cells call it again with per-call overrides (devset, metric, num_threads).
evaluate_hotpot = Evaluate(
    devset=devset[:1000],
    metric=metric_EM,
    num_threads=NUM_THREADS,
    display_progress=True,
    display_table=0,
)
evaluate_hotpot(llama_program)

Average Metric: 424 / 1000  (42.4): 100%|██████████| 1000/1000 [00:14<00:00, 70.51it/s]


Average Metric: 424 / 1000  (42.4%)


42.4

In [None]:
# Run a single question through the Llama program. The result is not the cell's last
# expression, so it is not displayed; the call is made so the LM history can be inspected.
llama_program(question="How many storeys are in the castle that David Gregory inherited?")

# Inspect the LM history with n=3 (presumably the three most recent LM calls — confirm).
llamaChat.inspect_history(n=3)

### 6) Compile into `T5-Large` (770M parameters)

In [None]:
# Draw 3000 HotPotQA training examples and keep only their `question` field,
# so the resulting examples carry no answer.
unlabeled_train = [
    dspy.Example(question=labeled.question).with_inputs('question')
    for labeled in HotPotQA(train_seed=1, train_size=3000, eval_seed=2023, dev_size=0, test_size=0).train
]
len(unlabeled_train)

3000

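**Optional step:** pre-compute the ensemble's outputs on the unlabeled training set. The metric below always returns `True`, so this pass only runs each program over the questions; it does not score them.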

In [None]:
def always_true(g, p, trace=None):
    """Metric stub that returns True for every call, ignoring all of its arguments."""
    return True


# Run each ensemble member over the unlabeled questions. With the always-true
# metric nothing is actually scored; the programs are simply executed.
for prog_ in ensemble:
    evaluate_hotpot(prog_, devset=unlabeled_train[:3000], metric=always_true)

Now compile into T5!

In [None]:
# When True, fine-tune T5 in this cell; when False, the next cell loads a checkpoint instead.
RECOMPILE_INTO_T5_FROM_SCRATCH = False

if RECOMPILE_INTO_T5_FROM_SCRATCH:
    # Keyword arguments forwarded to BootstrapFinetune.compile.
    config = {
        'target': 't5-large',
        'epochs': 2,
        'bf16': True,
        'bsize': 6,
        'accumsteps': 2,
        'lr': 5e-5,
    }

    # No metric is given; the Llama ensemble acts as the teacher on the unlabeled questions.
    tp = BootstrapFinetune(metric=None)
    t5_program = tp.compile(
        BasicMH(),
        teacher=ensemble,
        trainset=unlabeled_train[:3000],
        **config,
    )

    # Deactivate chain of thought prompting. Let's use T5 to directly predict outputs. (Faster and similar quality.)
    for p in t5_program.predictors():
        p.activated = False

In [None]:
if not RECOMPILE_INTO_T5_FROM_SCRATCH:
    # Start from a fresh (uncompiled) program and attach the fine-tuned model to it.
    t5_program = BasicMH()

    # ckpt_path = '../finetuning_ckpts/LMWEP0WZ5IKWM.all/checkpoint-5400'
    # NOTE(review): this looks like a Hugging Face Hub model id rather than a local path — confirm.
    ckpt_path = "colbert-ir/dspy-Oct11-T5-Large-MH-3k-v1"
    LM = dspy.HFModel(checkpoint=ckpt_path, model='t5-large')

    # Point every predictor at the T5 model and turn off `activated`,
    # mirroring what the from-scratch cell does after compiling.
    for p in t5_program.predictors():
        p.lm = LM
        p.activated = False

### 7) Evaluate the T5-Large `multihop` program

In [None]:
# Evaluate the T5 program with the evaluator defined earlier, overriding num_threads to 1.
# NOTE(review): presumably single-threaded because the HFModel runs locally — confirm.
score = evaluate_hotpot(t5_program, num_threads=1)

In [None]:
# Inspect the history (n=3) of the LM attached to the program's first predictor.
t5_program.predictors()[0].lm.inspect_history(n=3)