# Preparing the LLM

In [1]:
import dspy

# Language-model client used by every DSPy module in this notebook.
# Requests go to the local proxy given in `api_base`; the commented-out `model=`
# lines are alternative provider/model identifiers kept for quick switching.
llm = dspy.OpenAI(
    model_type="chat",
    # model="groq/llama3-70b-8192",
    # model="azure/gpt-35-turbo-1106",
    model="openai/gpt-3.5-turbo",
    # model="gpt-3.5-turbo",
    # model="anthropic/claude-3-haiku-20240307",
    # NOTE(review): assumes a proxy is listening on localhost:8080 — confirm before running.
    api_base="http://localhost:8080/proxy/v1/",
    max_tokens=2048,
    temperature=0,
)

# Smoke test: send one prompt so a misconfigured endpoint fails here rather than mid-training.
print("LLM test response:", llm("hello there"))

# ColBERTv2 client pointed at a remote `wiki17_abstracts` endpoint.
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
# Register the client above as `lm` and the ColBERTv2 client as `rm` in DSPy's global settings.
dspy.settings.configure(lm=llm, rm=colbertv2_wiki17_abstracts)

LLM test response: ['Hello! How can I assist you today?']


# Preparing the Dataset

In [2]:
from dspy.datasets import HotPotQA

# Load the dataset.
# Requests 32 training and 50 dev examples and no test examples; the explicit seeds
# presumably make the sampled subsets repeatable across runs — TODO confirm.
dataset = HotPotQA(train_seed=1, train_size=32, eval_seed=2023, dev_size=50, test_size=0)

# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

# Last expression of the cell: displays the two split sizes as a tuple.
len(trainset), len(devset)

  table = cls._concat_blocks(blocks, axis=0)


(32, 50)

# Defining the Model

In [3]:
# Signature describing the question-answering step: two inputs (context, question)
# and one output (answer).
# NOTE: the class docstring and the `desc` strings below are runtime text, not just
# documentation — the optimizer output later in this notebook echoes them back as
# `instructions=` and `desc` — so edit them only when a prompt change is intended.
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    # Retrieved passages supplied by the caller.
    context = dspy.InputField(desc="may contain relevant facts")
    # The question to answer.
    question = dspy.InputField()
    # The generated answer; the description nudges toward short outputs.
    answer = dspy.OutputField(desc="often between 1 and 5 words")


class RAG(dspy.Module):
    """Two-stage pipeline: retrieve passages for a question, then answer from them."""

    def __init__(self, num_passages=3):
        """Create the retrieval and answer-generation stages.

        Args:
            num_passages: forwarded as ``k`` to ``dspy.Retrieve``.
        """
        super().__init__()

        # Stage 1: passage retrieval.
        self.retrieve = dspy.Retrieve(k=num_passages)
        # Stage 2: chain-of-thought answer generation over the GenerateAnswer signature.
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer ``question`` and return the passages used alongside the answer.

        Args:
            question: the question to retrieve for and answer.

        Returns:
            A ``dspy.Prediction`` carrying ``context`` (the retrieved passages)
            and ``answer`` (taken from the generation stage's output).
        """
        retrieved = self.retrieve(question)
        passages = retrieved.passages
        generated = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(context=passages, answer=generated.answer)


# Pick a single dev example and show its question and gold labels.
dev_example = devset[18]
print(f"[Devset] Question: {dev_example.question}")
print(f"[Devset] Answer: {dev_example.answer}")
print(f"[Devset] Relevant Wikipedia Titles: {dev_example.gold_titles}")

# Uncompiled (baseline) RAG program.
# NOTE(review): this global shares its name with the `generate_answer` attribute
# inside RAG; they are different objects.
generate_answer = RAG()

# Run the baseline program on the example's question only (labels are not passed in).
pred = generate_answer(question=dev_example.question)

# Print the input and the prediction.
print(f"[Prediction] Question: {dev_example.question}")
print(f"[Prediction] Predicted Answer: {pred.answer}")

[Devset] Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?
[Devset] Answer: English
[Devset] Relevant Wikipedia Titles: {'Robert Irvine', 'Restaurant: Impossible'}
[Prediction] Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?
[Prediction] Predicted Answer: British


# Log in to LangWatch

In [4]:
import langwatch

# Point the SDK at a LangWatch instance on localhost:3000 before logging in.
# NOTE(review): assumes a local LangWatch server is running — confirm before running.
langwatch.endpoint = "http://localhost:3000"
# Per the message printed below, this is a no-op when an API key is already set;
# pass relogin=True to force a new login.
langwatch.login()

LangWatch API key is already set, if you want to login again, please call as langwatch.login(relogin=True)


In [93]:
# Refetching langwatch modules for development

import sys


def purge_modules(package):
    """Evict ``package`` and every ``package.*`` submodule from ``sys.modules``.

    Removing only the top-level package is not enough for a development reload:
    when the package is imported again, its ``__init__`` finds any submodule that
    is still cached in ``sys.modules`` and reuses the stale copy instead of
    re-reading the edited source. Evicting the whole tree forces a full re-import.

    Args:
        package: dotted name of the top-level package to evict (e.g. "langwatch").

    Returns:
        Sorted list of the module names that were removed (empty if none were loaded).
    """
    prefix = package + "."
    # Snapshot the keys first: deleting while iterating the live dict would raise.
    stale = sorted(
        name
        for name in list(sys.modules)
        if name == package or name.startswith(prefix)
    )
    for name in stale:
        del sys.modules[name]
    return stale


# NOTE(review): the re-import below produces a fresh module object, so attributes
# assigned on the old one (e.g. `langwatch.endpoint`) may need to be set again — confirm.
purge_modules("langwatch")

import langwatch
from langwatch.dspy import SerializableAndPydanticEncoder

# Start Training Session!

In [96]:
from dspy.teleprompt import (
    BootstrapFewShotWithRandomSearch,
    LabeledFewShot,
    BootstrapFewShot,
    COPRO,
    MIPRO,
)
import dspy.teleprompt
import dspy.evaluate

# make logger appear on jupyter notebook
# NOTE(review): this rebinds the logger's `info` attribute to the built-in `print`
# for the rest of the kernel session, so every later `dspy.logger.info(...)` call
# writes straight to stdout — confirm this is only wanted while developing.
dspy.logger.info = print


def validate_context_and_answer(example, pred, trace=None):
    """Metric that passes only when both answer checks pass.

    Args:
        example: the labelled example to score against.
        pred: the program's prediction for that example.
        trace: accepted for the metric call signature; not used here.

    Returns:
        The exact-match result when it is falsy, otherwise the passage-match result.
    """
    # Both checks are always evaluated, exact match first.
    exact_match = dspy.evaluate.answer_exact_match(example, pred)
    passage_match = dspy.evaluate.answer_passage_match(example, pred)
    if not exact_match:
        return exact_match
    return passage_match


# Set up a basic optimizer, which will compile our RAG program.
# The commented-out lines are alternative optimizers kept for quick switching;
# only the MIPRO configuration below is active.
# optimizer = BootstrapFewShotWithRandomSearch(metric=validate_context_and_answer, max_rounds=1)
# optimizer = BootstrapFewShot(metric=validate_context_and_answer, max_bootstrapped_demos=10, max_labeled_demos=10, max_rounds=3)
# optimizer = COPRO(metric=validate_context_and_answer, breadth=2, depth=3, track_stats=True)
optimizer = MIPRO(
    metric=validate_context_and_answer,
    num_candidates=2,
    init_temperature=0.7
)
# optimizer.num_candidate_sets = 0

# Register the optimizer with LangWatch under this experiment name; this must run
# before compile() so the session can be followed at the URL printed in the output.
langwatch.dspy.init(experiment="dspy-visualizer-example", optimizer=optimizer)

# Compile
# compiled_rag = optimizer.compile(RAG(), trainset=trainset, eval_kwargs=dict(num_threads=64, display_progress=True, display_table=0))
# Optimise a fresh RAG program on the 32-example trainset for 10 trials; the
# evaluation settings (16 threads, progress bar on) are passed via eval_kwargs.
compiled_rag = optimizer.compile(
    RAG(),
    trainset=trainset,
    num_trials=10,
    max_bootstrapped_demos=3,
    max_labeled_demos=5,
    eval_kwargs=dict(num_threads=16, display_progress=True, display_table=0),
)

Experiment initialized, run_id: convivial-enchanted-catfish
Open http://localhost:3000/inbox-narrator/experiments/dspy-visualizer-example?runIds=convivial-enchanted-catfish to track your DSPy training session live


Please be advised that based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Task Model: [94m[1m32[0m[93m examples in dev set * [94m[1m10[0m[93m trials * [94m[1m# of LM calls in your program[0m[93m = ([94m[1m320 * # of LM calls in your program[0m[93m) task model calls[0m
[93m- Prompt Model: # data summarizer calls (max [94m[1m10[0m[93m) + [94m[1m2[0m[93m * [94m[1m1[0m[93m lm calls in program = [94m[1m12[0m[93m prompt model calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) 
            + (Number of calls

 12%|█▎        | 4/32 [00:00<00:00, 907.61it/s]
[I 2024-05-29 20:10:22,198] A new study created in memory with name: no-name-15828445-319a-471b-a5be-9986b42b0db0


Bootstrapped 3 full traces after 5 examples in round 0.
Starting trial #0


Average Metric: 8 / 32  (25.0): 100%|██████████| 32/32 [00:00<00:00, 590.46it/s]


Average Metric: 8 / 32 (25.0%)


[I 2024-05-29 20:10:22,422] Trial 0 finished with value: 25.0 and parameters: {'6056878432_predictor_instruction': 1, '6056878432_predictor_demos': 0}. Best is trial 0 with value: 25.0.


Starting trial #1


Average Metric: 10 / 32  (31.2): 100%|██████████| 32/32 [00:00<00:00, 677.06it/s]


Average Metric: 10 / 32 (31.2%)


[I 2024-05-29 20:10:22,635] Trial 1 finished with value: 31.25 and parameters: {'6056878432_predictor_instruction': 0, '6056878432_predictor_demos': 1}. Best is trial 1 with value: 31.25.


Starting trial #2


Average Metric: 13 / 32  (40.6): 100%|██████████| 32/32 [00:00<00:00, 626.42it/s]


Average Metric: 13 / 32 (40.6%)


[I 2024-05-29 20:10:22,843] Trial 2 finished with value: 40.62 and parameters: {'6056878432_predictor_instruction': 1, '6056878432_predictor_demos': 1}. Best is trial 2 with value: 40.62.


Starting trial #3


Average Metric: 10 / 32  (31.2): 100%|██████████| 32/32 [00:00<00:00, 666.47it/s]
[I 2024-05-29 20:10:23,047] Trial 3 finished with value: 31.25 and parameters: {'6056878432_predictor_instruction': 0, '6056878432_predictor_demos': 1}. Best is trial 2 with value: 40.62.


Average Metric: 10 / 32 (31.2%)
Starting trial #4


Average Metric: 8 / 32  (25.0): 100%|██████████| 32/32 [00:00<00:00, 596.53it/s]
[I 2024-05-29 20:10:23,252] Trial 4 finished with value: 25.0 and parameters: {'6056878432_predictor_instruction': 1, '6056878432_predictor_demos': 0}. Best is trial 2 with value: 40.62.


Average Metric: 8 / 32 (25.0%)
Starting trial #5


Average Metric: 10 / 32  (31.2): 100%|██████████| 32/32 [00:00<00:00, 597.35it/s]
[I 2024-05-29 20:10:23,467] Trial 5 finished with value: 31.25 and parameters: {'6056878432_predictor_instruction': 0, '6056878432_predictor_demos': 1}. Best is trial 2 with value: 40.62.


Average Metric: 10 / 32 (31.2%)
Starting trial #6


Average Metric: 13 / 32  (40.6): 100%|██████████| 32/32 [00:00<00:00, 648.99it/s]
[I 2024-05-29 20:10:23,668] Trial 6 finished with value: 40.62 and parameters: {'6056878432_predictor_instruction': 1, '6056878432_predictor_demos': 1}. Best is trial 2 with value: 40.62.


Average Metric: 13 / 32 (40.6%)
Starting trial #7


Average Metric: 11 / 32  (34.4): 100%|██████████| 32/32 [00:00<00:00, 689.76it/s]


Average Metric: 11 / 32 (34.4%)


[I 2024-05-29 20:10:23,871] Trial 7 finished with value: 34.38 and parameters: {'6056878432_predictor_instruction': 0, '6056878432_predictor_demos': 0}. Best is trial 2 with value: 40.62.


Starting trial #8


Average Metric: 8 / 32  (25.0): 100%|██████████| 32/32 [00:00<00:00, 764.84it/s]


Average Metric: 8 / 32 (25.0%)


[I 2024-05-29 20:10:24,088] Trial 8 pruned. 


Trial pruned.
Starting trial #9


Average Metric: 11 / 32  (34.4): 100%|██████████| 32/32 [00:00<00:00, 634.58it/s]

Average Metric: 11 / 32 (34.4%)



[I 2024-05-29 20:10:24,313] Trial 9 finished with value: 34.38 and parameters: {'6056878432_predictor_instruction': 0, '6056878432_predictor_demos': 0}. Best is trial 2 with value: 40.62.


Returning generate_answer = ChainOfThought(GenerateAnswer(context, question -> answer
    instructions='Answer questions with short factoid answers.'
    context = Field(annotation=str required=True json_schema_extra={'desc': 'may contain relevant facts', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'often between 1 and 5 words', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)) from continue_program


In [47]:
# Save the compiled program to "test.json" (a relative path, so it resolves against
# the kernel's current working directory).
# NOTE(review): this cell's execution count (47) is lower than the training cell's (96),
# so the file on disk may come from an earlier compile — re-run this cell after training.
compiled_rag.save("test.json")