In [1]:
import os
from getpass import getpass

# Keep an OPENAI_API_KEY that is already exported; prompt (without echo) only
# when it is missing or empty, so the secret never has to live in the notebook.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OPENAI_API_KEY: ")

In [2]:
import aiohttp
import datasets
from dspy.datasets import HotPotQA

# Fetch HotpotQA (fullwiki) with a one-hour HTTP timeout. The result is
# deliberately not bound to a name: the only `dataset` the notebook uses is the
# DSPy wrapper created below, which used to overwrite this value immediately.
# NOTE(review): presumably this call only exists to warm the local Hugging Face
# cache so the DSPy loader below does not hit the default timeout — confirm.
datasets.load_dataset(
    "hotpot_qa",
    "fullwiki",
    storage_options={'client_kwargs': {'timeout': aiohttp.ClientTimeout(total=3600)}}
)

# Small seeded sample: 20 training and 10 dev examples, no test split.
dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2025, dev_size=10, test_size=0)

# Mark BOTH 'question' and 'answer' as inputs. DSPy's evaluator and optimizers
# call the program with `example.inputs()` only, and WorkflowModule.forward
# reads `kwargs.get("answer")` to feed its exact-match evaluator. With only
# 'question' marked, the evaluator would always receive expected_output=None.
# The LLM nodes are still given just the question (and retrieved contexts), so
# the gold answer is not leaked into any prompt by the workflow itself.
trainset = [x.with_inputs('question', 'answer') for x in dataset.train]
devset = [x.with_inputs('question', 'answer') for x in dataset.dev]

len(trainset), len(devset)

  from .autonotebook import tqdm as notebook_tqdm


(20, 10)

## Defining the model

Generated DSL

In [1]:
import asyncio
import time
from typing import Any, Dict, List, Literal, Optional

import dspy
import langwatch

from langwatch_nlp.studio.dspy import (EvaluationResultWithMetadata,
                                       LangWatchWorkflowModule, LLMConfig,
                                       LLMNode,
                                       PredictionWithEvaluationAndMetadata,
                                       PredictionWithMetadata, TemplateAdapter)
from langwatch_nlp.studio.dspy.retrieve import ColBERTv2RM, ContextsRetriever
from langwatch_nlp.studio.modules.evaluators.exact_match import \
    ExactMatchEvaluator

# Notebook-wide default language model. The LLM nodes defined below build
# their own dspy.LM instances and pass them to LLMNode explicitly.
lm = dspy.LM(
    'openai/gpt-4o-mini',
    model_type='chat',
    temperature=0,
    max_tokens=8192,
    drop_params=True,
)

# Register the default LM first, then the LangWatch template adapter.
dspy.configure(lm=lm)
dspy.configure(adapter=TemplateAdapter())

class WorkflowModule(LangWatchWorkflowModule):
    """Question-answering workflow: query generation, retrieval, answer, evaluation.

    The workflow chains four wrapped nodes:

    1. ``generate_query``  - turns the question into a search query.
    2. ``col_bert_v2``     - retrieves ``k=3`` contexts for that query.
    3. ``generate_answer`` - answers the question from the contexts.
    4. ``exact_match``     - compares the answer with the expected ``answer``.

    Args:
        run_evaluations: Forwarded as ``run`` to the exact-match evaluator
            wrapper. NOTE(review): presumably toggles whether the evaluator
            actually executes — confirm against ``LangWatchWorkflowModule``.
    """

    def __init__(self, run_evaluations: bool = False):
        super().__init__()

        # NOTE(review): keep this assignment order. The predictor list logged
        # by the optimizer shows generate_answer before generate_query, which
        # presumably follows attribute order — confirm before reordering.
        self.exact_match = self.wrapped(
            ExactMatchEvaluator, node_id="exact_match", run=run_evaluations
        )()
        self.generate_answer = self.wrapped(Generateanswer, node_id="generate_answer")()
        self.generate_query = self.wrapped(Generatequery, node_id="generate_query")()
        self.col_bert_v2 = self.wrapped(ContextsRetriever, node_id="col_bert_v2")(
            rm=ColBERTv2RM,
            k=3,
            url='http://20.102.90.50:2017/wiki17_abstracts',
        )

    def forward(self, **kwargs) -> dspy.Prediction:
        """Run the workflow once.

        Args:
            **kwargs: Read via ``.get``, so missing keys become ``None``:
                ``question`` is the text to answer, ``answer`` is the expected
                output handed to the exact-match evaluator.

        Returns:
            A ``PredictionWithEvaluationAndMetadata`` carrying each node's
            prediction, the final ``end["output"]``, the evaluation result,
            and the ``cost`` / ``duration`` counters.
        """
        # Reset the per-call counters.
        # NOTE(review): presumably the wrapped nodes add to these — confirm.
        self.cost = 0
        self.duration = 0

        question = kwargs.get("question")

        query_step = self.generate_query(question=question)
        retrieval_step = self.col_bert_v2(query=query_step.query)
        answer_step = self.generate_answer(
            contexts=retrieval_step.contexts,
            question=question,
        )
        evaluation_step = self.exact_match(
            expected_output=kwargs.get("answer"),
            output=answer_step.answer,
        )

        return PredictionWithEvaluationAndMetadata(
            generate_answer=answer_step,
            generate_query=query_step,
            col_bert_v2=retrieval_step,
            end={"output": answer_step.answer},
            evaluations={"exact_match": evaluation_step},
            cost=self.cost,
            duration=self.duration,
        )







# Signature of the answer-generation node: (question, contexts) -> answer.
# NOTE: DSPy uses a Signature's docstring as its instructions, so the docstring
# below is prompt text, not documentation — editing it changes model behaviour.
class GenerateanswerSignature(dspy.Signature):
    """generate an answer to the question based on the contexts"""

    # Single user-message template with {{question}} / {{contexts}} placeholders.
    # NOTE(review): presumably rendered by TemplateAdapter — confirm.
    _messages = [
        {"role": "user", "content": """Question:
{{question}}

Contexts:
{{contexts}}"""},
    ]

    # Inputs: the question and the retrieved context passages.
    question: str = dspy.InputField()
    contexts: list[str] = dspy.InputField()
    # Output: the generated answer text.
    answer: str = dspy.OutputField()


class Generateanswer(LLMNode):
    """LLM node that answers a question from retrieved contexts.

    Wraps a ``dspy.Predict`` over ``GenerateanswerSignature`` together with a
    node-specific gpt-4o-mini ``dspy.LM``.
    """

    def __init__(self):
        # NOTE(review): unlike Generatequery, this LM does not pass
        # cache=False — confirm whether that difference is intentional.
        super().__init__(
            node_id="generate_answer",
            name="Generateanswer",
            predict=dspy.Predict(GenerateanswerSignature),
            lm=dspy.LM(
                'openai/gpt-4o-mini',
                model_type='chat',
                temperature=0,
                max_tokens=2048,
                drop_params=True,
            ),
        )

    def forward(self, question: str = None, contexts: list[str] = None):
        """Delegate to ``LLMNode.forward`` with the question and its contexts."""
        node_inputs = {"question": question, "contexts": contexts}
        return super().forward(**node_inputs)






# Signature of the query-generation node: question -> query.
# NOTE: DSPy uses a Signature's docstring as its instructions, so the docstring
# below is prompt text, not documentation — editing it changes model behaviour.
class GeneratequerySignature(dspy.Signature):
    """generate a short wikipedia search query to find info about it"""

    # Single user-message template with a {{question}} placeholder.
    # NOTE(review): presumably rendered by TemplateAdapter — confirm.
    _messages = [
        {"role": "user", "content": """Question: {{question}}"""},
    ]

    # Input: the question to search for.
    question: str = dspy.InputField()
    # Output: the search query produced by the model.
    query: str = dspy.OutputField()


class Generatequery(LLMNode):
    """LLM node that turns a question into a short search query.

    Wraps a ``dspy.Predict`` over ``GeneratequerySignature`` together with a
    node-specific gpt-4o-mini ``dspy.LM`` created with ``cache=False``.
    """

    def __init__(self):
        super().__init__(
            node_id="generate_query",
            name="Generatequery",
            predict=dspy.Predict(GeneratequerySignature),
            lm=dspy.LM(
                'openai/gpt-4o-mini',
                model_type='chat',
                temperature=0,
                max_tokens=2048,
                cache=False,
                drop_params=True,
            ),
        )

    def forward(self, question: str = None):
        """Delegate to ``LLMNode.forward`` with the question."""
        node_inputs = {"question": question}
        return super().forward(**node_inputs)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Smoke test: run the workflow once on the first training example. Unpacking
# the example passes every field it holds (label included) as a keyword.
response = (
    WorkflowModule(run_evaluations=True)
    .forward(**trainset[0])
)

response

2025-07-07 21:50:54,866 - langwatch.utils.initialization - INFO - Setting up LangWatch client...
2025-07-07 21:50:54,874 - langwatch.client - INFO - Configuring OTLP exporter with endpoint: http://localhost:5560/api/otel/v1/traces
2025-07-07 21:50:54,874 - langwatch.client - INFO - Registering atexit handler to flush tracer provider on exit
2025-07-07 21:50:54,875 - langwatch.client - INFO - Successfully configured tracer provider with OTLP exporter
2025-07-07 21:50:54,875 - langwatch.utils.initialization - INFO - LangWatch client setup complete




Prediction(
    generate_answer=Prediction(
    answer='At My Window was released by American singer-songwriter Townes Van Zandt in 1987.'
),
    generate_query=Prediction(
    query='Search query: "At My Window American singer-songwriter"'
),
    col_bert_v2=Prediction(
    contexts=['The Search for Everything | The Search for Everything is the seventh studio album by American singer-songwriter John Mayer, released by Columbia and Sony Music on April 14, 2017.', 'At My Window (album) | At My Window is an album released by Folk/country singer-songwriter Townes Van Zandt in 1987. This was Van Zandt\'s first studio album in the nine years that followed 1978\'s "Flyin\' Shoes", and his only studio album recorded in the 1980s. Although the songwriter had become less prolific, this release showed that the quality of his material remained high.', 'Yes I Am (Melissa Etheridge album) | Yes I Am is the fourth studio album by American singer-songwriter Melissa Etheridge, released by Island Recor

In [2]:
from langwatch.dspy import LangWatchDSPy, DSPyOptimizer, DSPyPredictor
from dspy.teleprompt import MIPROv2

# Stand-alone LangWatch DSPy tracker, used below to log a step by hand.
langwatch_dspy = LangWatchDSPy()

# gpt-4o with caching disabled; serves as both prompt_model and task_model
# for the optimizer configured in this cell.
llm = dspy.LM(
    "openai/gpt-4o",
    model_type="chat",
    temperature=0,
    max_tokens=2048,
    cache=False,
    drop_params=True,
)


def metric(
    example: dspy.Example,
    pred: PredictionWithEvaluationAndMetadata,
    trace=None,
):
    """Score a workflow prediction for the optimizer.

    Args:
        example: The dataset example being scored (not read here).
        pred: The workflow output; its evaluation results supply the score.
        trace: Optional trace passed by DSPy optimizers (not read here).

    Returns:
        Whatever ``pred.total_score(weighting="mean")`` returns.
    """
    return pred.total_score(weighting="mean")


# MIPROv2 optimizer for the RAG workflow; the same gpt-4o model both proposes
# prompts and runs the task.
optimizer = MIPROv2(
    metric=metric,
    prompt_model=llm,
    task_model=llm,
    num_candidates=2,
    init_temperature=0.7,
    auto=None,
)

# Open a LangWatch experiment run bound to this optimizer.
langwatch_dspy.init(experiment="my-awesome-experiment", optimizer=optimizer)

program = WorkflowModule(run_evaluations=True)

# Log one hand-made step (placeholder score 0.5) so that serialisation of the
# workflow's predictors can be checked without running a full optimisation.
langwatch_dspy.log_step(
    optimizer=DSPyOptimizer(name=MIPROv2.__name__, parameters={}),
    index="0",
    score=0.5,
    label="score",
    predictors=[
        DSPyPredictor(name=predictor_name, predictor=predictor_obj)
        for predictor_name, predictor_obj in program.named_predictors()
    ],
)
# NOTE(review): the cell output shows a second, empty `data []` payload, so
# log_step may already flush the buffer — confirm whether this call is needed.
langwatch_dspy.send_steps()


[LangWatch] Experiment initialized, run_id: elegant-cordial-dingo
[LangWatch] Open http://localhost:5560/inbox-narrator/experiments/my-awesome-experiment?runIds=elegant-cordial-dingo to track your DSPy training session live



data [{'run_id': 'elegant-cordial-dingo', 'experiment_slug': 'my-awesome-experiment', 'workflow_version_id': None, 'index': '0', 'score': 0.5, 'label': 'score', 'optimizer': {'name': 'MIPROv2', 'parameters': {}}, 'predictors': [{'name': 'generate_answer.predict', 'predictor': {'__class__': 'langwatch_nlp.studio.dspy.predict_with_metadata.PredictWithMetadata', '_compiled': False, 'callbacks': [], 'history': [], 'stage': '3dd0793884742e89', 'signature': {'__class__': 'dspy.signatures.signature.SignatureMeta', 'signature': 'question, contexts -> answer', 'instructions': 'generate an answer to the question based on the contexts', 'fields': {'question': {'__class__': 'pydantic.fields.FieldInfo', 'field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'}, 'c





data [] 




In [None]:
import json
from langwatch.dspy import SerializableDSPyAndPydanticEncoder
from langwatch_nlp.studio.dspy.predict_with_metadata import PredictWithMetadata

# Fresh workflow instance. NOTE(review): nothing else in this cell reads
# `module`; it is only referenced by the commented-out experiment below.
module = WorkflowModule(run_evaluations=True)
# predictor = module.wrapped(Generatequery, node_id="generate_query", )()

class Generatequery2(dspy.Module):
    """Minimal variant of ``Generatequery`` built directly on ``dspy.Module``.

    It uses the same signature and LM settings as ``Generatequery`` but skips
    ``LLMNode``, which makes it handy for isolating predictor-serialisation
    problems. The predictor is a ``PredictWithMetadata`` over
    ``GeneratequerySignature``.
    """

    def __init__(self):
        super().__init__()

        self.predict = PredictWithMetadata(GeneratequerySignature)

        # Attach the node-specific LM to the predictor. It used to be built and
        # then dropped, so calls silently fell back to the globally configured
        # LM. Must run after `self.predict` exists: set_lm walks the module's
        # predictors.
        self.set_lm(
            dspy.LM(
                max_tokens=2048,
                temperature=0,
                model='openai/gpt-4o-mini',
                drop_params=True,
                model_type='chat',
                cache=False
            )
        )

    def forward(self, question: str = None):
        """Generate a search query for ``question`` with the wrapped predictor.

        ``dspy.Module`` has no ``forward`` of its own to delegate to, so the
        predictor is called directly.
        """
        return self.predict(question=question)

# Pull the first named predictor out of a fresh Generatequery node.
# (Swap in Generatequery2() to inspect the plain dspy.Module variant.)
predictor = Generatequery().named_predictors()[0][1]
print(predictor)

# Reproduce the serialisation problem: encoding this predictor with the
# LangWatch encoder raised "ValueError: Circular reference detected" when this
# cell was last run.
json.dumps(predictor, cls=SerializableDSPyAndPydanticEncoder)

PredictWithMetadata(GeneratequerySignature(question -> query
    instructions='generate a short wikipedia search query to find info about it'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    query = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Query:', 'desc': '${query}'})
))


ValueError: Circular reference detected

In [11]:
import langwatch

# Authenticate with LangWatch. When an API key is already configured this only
# prints a notice; pass relogin=True to force a new login.
langwatch.login()

LangWatch API key is already set, if you want to login again, please call as langwatch.login(relogin=True)


In [12]:
from dspy.teleprompt import MIPROv2
import dspy.evaluate

def metric(
    example: dspy.Example,
    pred: PredictionWithEvaluationAndMetadata,
    trace=None,
):
    """Score a workflow prediction for the optimizer.

    Args:
        example: The dataset example being scored (not read here).
        pred: The workflow output; its evaluation results supply the score.
        trace: Optional trace passed by DSPy optimizers (not read here).

    Returns:
        Whatever ``pred.total_score(weighting="mean")`` returns.
    """
    return pred.total_score(weighting="mean")

# gpt-4o with caching disabled; used as both prompt and task model below.
llm = dspy.LM(
    'openai/gpt-4o',
    model_type='chat',
    temperature=0,
    max_tokens=2048,
    cache=False,
    drop_params=True,
)

# MIPROv2 optimizer that will compile the RAG workflow.
optimizer = MIPROv2(
    metric=metric,
    prompt_model=llm,
    task_model=llm,
    num_candidates=2,
    init_temperature=0.7,
    auto=None,
)

# Register the run with LangWatch so the optimizer's progress is tracked.
langwatch.dspy.init(experiment="my-awesome-experiment", optimizer=optimizer)

module = WorkflowModule(run_evaluations=True)
# NOTE(review): presumably makes node failures non-fatal during optimisation —
# confirm against LangWatchWorkflowModule.
module.prevent_crashes()

# Compile: 10 trials over the 20-example training set.
compiled_rag = optimizer.compile(
    module,
    trainset=trainset,
    num_trials=10,
    max_bootstrapped_demos=3,
    max_labeled_demos=5,
    minibatch_size=16,
)


[LangWatch] Experiment initialized, run_id: fractal-godlike-bullfrog
[LangWatch] Open http://localhost:5560/inbox-narrator/experiments/my-awesome-experiment?runIds=fractal-godlike-bullfrog to track your DSPy training session live

[93m[1mProjected Language Model (LM) Calls[0m

Based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Prompt Generation: [94m[1m10[0m[93m data summarizer calls + [94m[1m2[0m[93m * [94m[1m2[0m[93m lm calls in program + ([94m[1m3[0m[93m) lm calls in program-aware proposer = [94m[1m17[0m[93m prompt model calls[0m
[93m- Program Evaluation: [94m[1m16[0m[93m examples in minibatch * [94m[1m10[0m[93m batches + [94m[1m16[0m[93m examples in val set * [94m[1m3[0m[93m full evals = [94m[1m208[0m[93m LM Program calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token +

2025/07/07 21:34:58 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/07/07 21:34:58 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/07/07 21:34:58 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=2 sets of demonstrations...
2025/07/07 21:34:58 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/07/07 21:34:58 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.



No input received within 20 seconds. Proceeding with execution...
Bootstrapping set 1/2
Bootstrapping set 2/2
Error getting source code: maximum recursion depth exceeded.

Running without program aware proposer.


2025/07/07 21:35:13 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=2 instructions...

2025/07/07 21:35:40 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/07/07 21:35:40 INFO dspy.teleprompt.mipro_optimizer_v2: 0: generate an answer to the question based on the contexts

2025/07/07 21:35:40 INFO dspy.teleprompt.mipro_optimizer_v2: 1: ### Task Description

The task involves generating responses to user queries about climate change and environmental issues. The responses should be informative, factually accurate, and reflective of the most recent scientific consensus. The model should be able to address a broad range of topics within this domain, including but not limited to: the causes and effects of climate change, mitigation and adaptation strategies, environmental policies, and sustainable practices. The responses should be tailored to different audiences, ranging from laypersons to experts, and should encourage informed and constructive di

  0%|          | 0/16 [00:00<?, ?it/s]



Average Metric: 0.00 / 2 (0.0%):   6%|▋         | 1/16 [00:01<00:24,  1.61s/it]



Average Metric: 0.00 / 4 (0.0%):  19%|█▉        | 3/16 [00:02<00:07,  1.77it/s]



Average Metric: 0.00 / 7 (0.0%):  38%|███▊      | 6/16 [00:02<00:02,  3.71it/s]



Average Metric: 0.00 / 8 (0.0%):  50%|█████     | 8/16 [00:02<00:02,  3.36it/s]



Average Metric: 0.00 / 10 (0.0%):  62%|██████▎   | 10/16 [00:03<00:01,  3.85it/s]

  for obj in list(mapping.values()):
  for obj in list(mapping.values()):
  for obj in list(mapping.values()):


Average Metric: 0.00 / 12 (0.0%):  69%|██████▉   | 11/16 [00:03<00:01,  4.26it/s]



Average Metric: 0.00 / 13 (0.0%):  81%|████████▏ | 13/16 [00:04<00:00,  3.37it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:04<00:00,  3.38it/s]

2025/07/07 21:35:45 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:35:45 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 6 (0.0%):  31%|███▏      | 5/16 [00:00<00:01,  6.74it/s]



Average Metric: 0.00 / 11 (0.0%):  69%|██████▉   | 11/16 [00:01<00:00, 12.25it/s]



Average Metric: 0.00 / 15 (0.0%):  94%|█████████▍| 15/16 [00:01<00:00,  9.54it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:02<00:00,  6.26it/s]

2025/07/07 21:35:48 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:35:48 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 3 (0.0%):  12%|█▎        | 2/16 [00:00<00:03,  3.57it/s]



Average Metric: 0.00 / 8 (0.0%):  44%|████▍     | 7/16 [00:01<00:01,  7.76it/s]



Average Metric: 0.00 / 9 (0.0%):  56%|█████▋    | 9/16 [00:01<00:00, 10.16it/s]



Average Metric: 0.00 / 13 (0.0%):  75%|███████▌  | 12/16 [00:01<00:00,  8.63it/s]



Average Metric: 0.00 / 14 (0.0%):  88%|████████▊ | 14/16 [00:02<00:00,  6.44it/s]



Average Metric: 0.00 / 15 (0.0%):  94%|█████████▍| 15/16 [00:02<00:00,  6.13it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:03<00:00,  5.30it/s]

2025/07/07 21:35:51 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:35:51 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 8 (0.0%):  50%|█████     | 8/16 [00:00<00:00, 13.19it/s]



Average Metric: 0.00 / 13 (0.0%):  75%|███████▌  | 12/16 [00:01<00:00, 12.30it/s]



Average Metric: 0.00 / 15 (0.0%):  88%|████████▊ | 14/16 [00:01<00:00, 13.76it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:01<00:00,  8.28it/s]

2025/07/07 21:35:53 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:35:53 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 7 (0.0%):  44%|████▍     | 7/16 [00:00<00:00, 12.59it/s]



Average Metric: 0.00 / 11 (0.0%):  69%|██████▉   | 11/16 [00:01<00:00, 11.65it/s]



Average Metric: 0.00 / 12 (0.0%):  69%|██████▉   | 11/16 [00:01<00:00, 11.65it/s]



Average Metric: 0.00 / 14 (0.0%):  81%|████████▏ | 13/16 [00:01<00:00,  7.69it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:01<00:00,  8.56it/s]

2025/07/07 21:35:55 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:35:55 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 5 (0.0%):  25%|██▌       | 4/16 [00:00<00:01,  7.09it/s]



Average Metric: 0.00 / 8 (0.0%):  44%|████▍     | 7/16 [00:00<00:00,  9.32it/s]



Average Metric: 0.00 / 12 (0.0%):  69%|██████▉   | 11/16 [00:01<00:00,  9.36it/s]



Average Metric: 0.00 / 14 (0.0%):  81%|████████▏ | 13/16 [00:01<00:00, 12.37it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:01<00:00,  8.08it/s]

2025/07/07 21:35:57 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:35:57 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 4 (0.0%):  19%|█▉        | 3/16 [00:00<00:02,  5.85it/s]



Average Metric: 0.00 / 7 (0.0%):  44%|████▍     | 7/16 [00:00<00:00, 13.39it/s]



Average Metric: 0.00 / 12 (0.0%):  69%|██████▉   | 11/16 [00:01<00:00, 10.05it/s]



Average Metric: 0.00 / 15 (0.0%):  94%|█████████▍| 15/16 [00:01<00:00, 10.46it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:02<00:00,  7.18it/s]

2025/07/07 21:35:59 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:35:59 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 8 (0.0%):  44%|████▍     | 7/16 [00:00<00:01,  6.53it/s]



Average Metric: 0.00 / 12 (0.0%):  75%|███████▌  | 12/16 [00:01<00:00, 11.52it/s]



Average Metric: 0.00 / 14 (0.0%):  88%|████████▊ | 14/16 [00:01<00:00,  9.93it/s]



Average Metric: 0.00 / 15 (0.0%):  88%|████████▊ | 14/16 [00:01<00:00,  9.93it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:02<00:00,  7.82it/s]

2025/07/07 21:36:01 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:36:01 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 7 (0.0%):  38%|███▊      | 6/16 [00:00<00:01,  6.82it/s]



Average Metric: 0.00 / 11 (0.0%):  62%|██████▎   | 10/16 [00:01<00:00, 11.06it/s]



Average Metric: 0.00 / 14 (0.0%):  88%|████████▊ | 14/16 [00:01<00:00, 13.20it/s]



Average Metric: 0.00 / 15 (0.0%):  88%|████████▊ | 14/16 [00:01<00:00, 13.20it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:01<00:00,  8.66it/s]

2025/07/07 21:36:03 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:36:03 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 6 (0.0%):  31%|███▏      | 5/16 [00:00<00:01,  5.85it/s]



Average Metric: 0.00 / 9 (0.0%):  56%|█████▋    | 9/16 [00:01<00:00,  9.73it/s]



Average Metric: 0.00 / 11 (0.0%):  69%|██████▉   | 11/16 [00:01<00:00, 11.26it/s]



Average Metric: 0.00 / 13 (0.0%):  81%|████████▏ | 13/16 [00:01<00:00, 10.29it/s]



Average Metric: 0.00 / 15 (0.0%):  94%|█████████▍| 15/16 [00:01<00:00,  8.93it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:02<00:00,  7.67it/s]

2025/07/07 21:36:05 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:36:05 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 4 (0.0%):  25%|██▌       | 4/16 [00:00<00:02,  5.68it/s]



Average Metric: 0.00 / 8 (0.0%):  44%|████▍     | 7/16 [00:01<00:01,  8.28it/s]



Average Metric: 0.00 / 12 (0.0%):  69%|██████▉   | 11/16 [00:01<00:00,  8.96it/s]



Average Metric: 0.00 / 14 (0.0%):  81%|████████▏ | 13/16 [00:01<00:00, 12.26it/s]



Average Metric: 0.00 / 15 (0.0%):  94%|█████████▍| 15/16 [00:01<00:00,  8.13it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:02<00:00,  7.04it/s]

2025/07/07 21:36:07 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:36:07 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 12 (0.0%):  69%|██████▉   | 11/16 [00:01<00:00,  8.37it/s]



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:01<00:00, 10.35it/s]

2025/07/07 21:36:09 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:36:09 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque



Average Metric: 0.00 / 5 (0.0%):  25%|██▌       | 4/16 [00:00<00:01,  6.82it/s]



Average Metric: 0.00 / 7 (0.0%):  38%|███▊      | 6/16 [00:00<00:01,  7.95it/s]



Average Metric: 0.00 / 14 (0.0%):  81%|████████▏ | 13/16 [00:01<00:00,  4.94it/s]

  start_time if start_time is not None else time_ns()
  start_time if start_time is not None else time_ns()
  start_time if start_time is not None else time_ns()
  start_time if start_time is not None else time_ns()


Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:02<00:00,  7.81it/s]

2025/07/07 21:36:11 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2025/07/07 21:36:11 ERROR dspy.teleprompt.utils: An exception occurred during evaluation
Traceback (most recent call last):
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/langwatch_nlp/.venv/lib/python3.12/site-packages/dspy/teleprompt/utils.py", line 52, in eval_candidate_program
    return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 653, in patched_evaluate_call
    langwatch_dspy.log_step(
  File "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/src/langwatch/dspy/__init__.py", line 346, in log_step
    self.send_steps()
  File "/Users/rchaves/Projects/langw




self.steps_buffer run_id='fractal-godlike-bullfrog' experiment_slug='my-awesome-experiment' workflow_version_id=None index='1' score=0.0 label='score' optimizer=DSPyOptimizer(name='MIPROv2', parameters={'num_candidates': 2, 'init_temperature': 0.7}) predictors=[DSPyPredictor(name='generate_answer.predict', predictor=PredictWithMetadata(GenerateanswerSignature(question, contexts -> answer
    instructions='generate an answer to the question based on the contexts'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    contexts = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Contexts:', 'desc': '${contexts}'})
    answer = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))), DSPyPredictor(name='generate_query.predict', predictor=PredictWithMetadata(Generateque