# Preparing the LLM

In [1]:
import dspy

# Chat-style LLM client used by every DSPy module in this notebook.
# Requests go to the `api_base` below, which is a proxy on localhost; the
# model id carries a "<provider>/<model>" prefix.
# NOTE(review): presumably the proxy routes on that prefix -- confirm.
llm = dspy.OpenAI(
    model_type="chat",
    # Alternative model ids kept for quick switching; only the uncommented
    # `model=` line below is in effect.
    # model="groq/llama3-70b-8192",
    # model="azure/gpt-35-turbo-1106",
    # model="openai/gpt-3.5-turbo",
    # model="gpt-3.5-turbo",
    model="anthropic/claude-3-haiku-20240307",
    api_base="http://localhost:8080/proxy/v1/",
    max_tokens=2048,
    temperature=0,
)

# Smoke test: call the client once and print what comes back. The recorded
# output shows a list containing a single completion string.
print("LLM test response:", llm("hello there"))

# Retrieval model: a ColBERTv2 server reached at a hard-coded address; the URL
# path names the `wiki17_abstracts` collection.
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
# Register the LLM (`lm`) and the retriever (`rm`) in the global DSPy settings.
dspy.settings.configure(lm=llm, rm=colbertv2_wiki17_abstracts)

LLM test response: ['Hello! How can I assist you today?']


# Preparing the Dataset

In [2]:
from dspy.datasets import HotPotQA

# Load the dataset.
# Requests 32 training and 50 dev examples and no test examples, with separate
# seeds for the train and eval splits. The sizes match the `(32, 50)` printed
# by this cell.
dataset = HotPotQA(train_seed=1, train_size=32, eval_seed=2023, dev_size=50, test_size=0)

# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

# Last expression of the cell: displays the size of both splits as a tuple.
len(trainset), len(devset)

  table = cls._concat_blocks(blocks, axis=0)


(32, 50)

# Defining the Model

In [3]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    # NOTE(review): for a dspy.Signature the class docstring above and the
    # `desc` strings below are, as far as I know, used as prompt text for the
    # LLM rather than as plain documentation -- confirm before rewording them.

    # Inputs: passages supplied by the caller, plus the question itself.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # Output: the answer produced for the question.
    answer = dspy.OutputField(desc="often between 1 and 5 words")


class RAG(dspy.Module):
    """Retrieve-then-answer program built from two DSPy components.

    A question is first passed to a retriever, and the passages it returns are
    handed, together with the question, to a chain-of-thought predictor built
    on the ``GenerateAnswer`` signature.

    Args:
        num_passages: forwarded as ``k`` to ``dspy.Retrieve``.
    """

    def __init__(self, num_passages=3):
        super().__init__()

        # Retrieval step first, generation step second.
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer ``question`` and return the answer with its supporting context.

        Returns:
            A ``dspy.Prediction`` with ``context`` (the retrieved passages) and
            ``answer`` (taken from the generator's output).
        """
        retrieved = self.retrieve(question)
        passages = retrieved.passages
        generated = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(context=passages, answer=generated.answer)


# Pick one dev example (index 18) and show its question, gold answer and the
# gold Wikipedia titles attached to it.
dev_example = devset[18]
print(f"[Devset] Question: {dev_example.question}")
print(f"[Devset] Answer: {dev_example.answer}")
print(f"[Devset] Relevant Wikipedia Titles: {dev_example.gold_titles}")

# Uncompiled RAG program with the default num_passages=3.
# NOTE: despite its name, this global is a whole RAG instance, not the
# `generate_answer` predictor stored on it.
generate_answer = RAG()

# Run the program on the example's question only; the gold answer is not passed in.
pred = generate_answer(question=dev_example.question)

# Print the input and the prediction.
# In the recorded run the prediction ("British") differs from the gold answer
# ("English").
print(f"[Prediction] Question: {dev_example.question}")
print(f"[Prediction] Predicted Answer: {pred.answer}")

[Devset] Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?
[Devset] Answer: English
[Devset] Relevant Wikipedia Titles: {'Restaurant: Impossible', 'Robert Irvine'}
[Prediction] Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?
[Prediction] Predicted Answer: British


# Log in to LangWatch

In [4]:
import langwatch

# Point the LangWatch client at an instance running on localhost.
langwatch.endpoint = "http://localhost:3000"
# Per the recorded output, login() only reports that a key is already set when
# one exists; `langwatch.login(relogin=True)` forces a new login.
langwatch.login()

LangWatch API key is already set, if you want to login again, please call as langwatch.login(relogin=True)


In [8]:
# Development helper: forget the cached langwatch modules so the imports at the
# bottom of this cell load them again instead of reusing the cached objects.
# NOTE(review): the re-imported `langwatch` is a new module object, so values
# assigned on the old one (e.g. `langwatch.endpoint`) may need to be set again
# -- confirm.

import sys

# pop() with a default does nothing when the entry is absent.
sys.modules.pop("langwatch", None)
sys.modules.pop("langwatch.dspy", None)

import langwatch
from langwatch.dspy import SerializableAndPydanticEncoder

# Start Training Session!

In [10]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, LabeledFewShot, BootstrapFewShot
import dspy.evaluate

# make logger appear on jupyter notebook
# Replaces `dspy.logger.info` with the builtin `print`, so info messages are
# written straight to the cell output.
# NOTE(review): `print` does not accept logging-style keyword arguments (e.g.
# `exc_info`), so this only works while callers pass plain messages -- confirm.
dspy.logger.info = print

def validate_context_and_answer(example, pred, trace=None):
    """Metric: the prediction must pass both the exact-match and passage-match checks.

    Args:
        example: reference example passed through to both checks.
        pred: prediction passed through to both checks.
        trace: accepted for the metric call signature; not used here.

    Returns:
        The exact-match result when it is falsy, otherwise the passage-match
        result (the same value ``exact and passage`` would produce).
    """
    # Both checks are always evaluated before either result is inspected.
    exact_match = dspy.evaluate.answer_exact_match(example, pred)
    passage_match = dspy.evaluate.answer_passage_match(example, pred)
    if not exact_match:
        return exact_match
    return passage_match

# Set up a basic optimizer, which will compile our RAG program.
# Candidates are scored with `validate_context_and_answer`; the recorded log
# reports that 16 candidate sets will be attempted.
optimizer = BootstrapFewShotWithRandomSearch(metric=validate_context_and_answer, max_rounds=1)
# Alternative optimizer configuration, kept for reference (not in effect):
# optimizer = BootstrapFewShot(metric=validate_context_and_answer, max_bootstrapped_demos=10, max_labeled_demos=10, max_rounds=3)
# optimizer.num_candidate_sets = 0

# Attach LangWatch to the optimizer under the given experiment name. Per the
# recorded output, this prints a run_id and a URL for following the run live.
langwatch.dspy.init(experiment="dspy-visualizer-example", optimizer=optimizer)

# Compile!
# Optimizes a fresh RAG() instance on the training split.
# NOTE: the recorded run ends in KeyboardInterrupt, so `compiled_rag` was not
# assigned in that session.
compiled_rag = optimizer.compile(RAG(), trainset=trainset)

Going to sample between 1 and 4 traces per predictor.
Will attempt to train 16 candidate sets.
Experiment initialized, run_id: astute-husky-from-shambhala
Open http://localhost:3000/inbox-narrator/experiments/dspy-visualizer-example?runIds=astute-husky-from-shambhala to track your DSPy training session live



Average Metric: 9 / 32  (28.1): 100%|██████████| 32/32 [00:00<00:00, 668.02it/s]

Average Metric: 9 / 32 (28.1%)





Score: 28.12 for set: [0]
New best sscore: 28.12 for seed -3
Scores so far: [28.12]
Best score: 28.12


Average Metric: 12 / 32  (37.5): 100%|██████████| 32/32 [00:00<00:00, 309.14it/s]

Average Metric: 12 / 32 (37.5%)





Score: 37.5 for set: [16]
New best sscore: 37.5 for seed -2
Scores so far: [28.12, 37.5]
Best score: 37.5


 34%|███▍      | 11/32 [00:00<00:00, 1248.58it/s]


Bootstrapped 4 full traces after 12 examples in round 0.


Average Metric: 12 / 32  (37.5): 100%|██████████| 32/32 [00:00<00:00, 620.38it/s]

Average Metric: 12 / 32 (37.5%)





Score: 37.5 for set: [16]
Scores so far: [28.12, 37.5, 37.5]
Best score: 37.5
Average of max per entry across top 1 scores: 0.375
Average of max per entry across top 2 scores: 0.5
Average of max per entry across top 3 scores: 0.5
Average of max per entry across top 5 scores: 0.5
Average of max per entry across top 8 scores: 0.5
Average of max per entry across top 9999 scores: 0.5


 41%|████      | 13/32 [00:00<00:00, 1003.94it/s]


Bootstrapped 4 full traces after 14 examples in round 0.


Average Metric: 13 / 32  (40.6): 100%|██████████| 32/32 [00:00<00:00, 482.43it/s]

Average Metric: 13 / 32 (40.6%)





Score: 40.62 for set: [16]
New best sscore: 40.62 for seed 0
Scores so far: [28.12, 37.5, 37.5, 40.62]
Best score: 40.62
Average of max per entry across top 1 scores: 0.40625
Average of max per entry across top 2 scores: 0.46875
Average of max per entry across top 3 scores: 0.5
Average of max per entry across top 5 scores: 0.5
Average of max per entry across top 8 scores: 0.5
Average of max per entry across top 9999 scores: 0.5


 12%|█▎        | 4/32 [00:00<00:00, 1075.88it/s]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 14 / 32  (43.8): 100%|██████████| 32/32 [00:00<00:00, 619.07it/s]

Average Metric: 14 / 32 (43.8%)





KeyboardInterrupt: 