<a href="https://colab.research.google.com/github/hanhanwu/Hanhan_COLAB_Experiemnts/blob/master/GenAI_Practice/Langwatch/try_dspy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Try DSPy for RAG Prompt Optimization


In [1]:
%%capture --no-stderr
!pip install --upgrade nbformat
%pip install -U --quiet dspy

## Prepare LLM

* The ColBERTv2 endpoint `http://20.102.90.50:2017/wiki17_abstracts` serves the passages used for retrieval in this notebook.

In [None]:
import os
import pandas as pd
from getpass import getpass
import dspy
from pprint import pprint
from google.colab import userdata
from dspy.datasets import HotPotQA
from dspy.teleprompt import MIPROv2


# Alternative backend (disabled): OpenAI through the same dspy.LM interface.
# OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
# llm = dspy.LM("openai/gpt-4.1-nano", api_key=OPENAI_API_KEY)

# Read the API key from the Colab secret store (never hard-code it) and build the LM client.
GOOGLE_AI_API_KEY = userdata.get('GOOGLE_AI_API_KEY')
llm = dspy.LM("gemini/gemini-2.0-flash", api_key=GOOGLE_AI_API_KEY)
# Smoke test: calling the LM object directly with a prompt string sends one request to the
# provider and prints the returned list of completions.
print("LLM test response:", llm("Where's Silicon Valley?"))

# the retrieval model
colbertv2_wiki17_abstracts = dspy.ColBERTv2(
    url="http://20.102.90.50:2017/wiki17_abstracts"
)
# Register the LM (lm=) and the retriever (rm=) in dspy.settings.
dspy.settings.configure(lm=llm, rm=colbertv2_wiki17_abstracts)

LLM test response: ['Silicon Valley is located in the southern part of the San Francisco Bay Area in **Northern California, United States**.\n']


## Preparing Dataset

In [None]:
# Load HotPotQA, requesting 32 training examples, 50 dev examples and no test examples;
# the seeds are fixed for the train and eval sampling.
dataset = HotPotQA(train_seed=1, train_size=32, eval_seed=2025, dev_size=50, test_size=0)
# Mark 'question' as the input field of every example.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

print()
# Sanity check: split sizes, then the first example of each split.
print(len(trainset), len(devset))
print(trainset[0])
print(devset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

hotpot_qa.py:   0%|          | 0.00/6.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/566M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7405 [00:00<?, ? examples/s]


32 50
Example({'question': 'At My Window was released by which American singer-songwriter?', 'answer': 'John Townes Van Zandt'}) (input_keys={'question'})
Example({'question': 'Pehchaan: The Face of Truth stars Vinod Khanna, Rati Agnihotri and which Indian actress, producer, and former model who also produced the film?', 'answer': 'Raveena Tandon', 'gold_titles': {'Raveena Tandon', 'Pehchaan: The Face of Truth'}}) (input_keys={'question'})


## Defining the DSPy RAG Program

In [4]:
# DSPy signature for the answer-generation step: (context, question) -> answer.
# NOTE: the class docstring below is runtime text, not just documentation -- the repr of the
# compiled program shows it as the signature's `instructions` -- so change it only when you
# intend to change the prompt.
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""
    # Input: RAG.forward passes the retrieved passages here.
    context = dspy.InputField(desc="may contain relevant facts")
    # Input: the question to answer.
    question = dspy.InputField()
    # Output: the short answer; `desc` is part of the field metadata shown in the signature repr.
    answer = dspy.OutputField(desc="often between 1 and 5 words")


class RAG(dspy.Module):
    """Retrieve-then-answer program: fetch passages, then answer with chain-of-thought."""

    def __init__(self, num_passages=3):
        """Create the two sub-modules.

        Args:
            num_passages: value passed as ``k`` to ``dspy.Retrieve``.
        """
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer ``question`` using retrieved passages as context.

        Returns:
            dspy.Prediction with ``context`` (the retrieved passages), ``answer`` and
            ``reasoning`` (both taken from the chain-of-thought output).
        """
        passages = self.retrieve(question).passages
        generated = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(
            context=passages,
            answer=generated.answer,
            reasoning=generated.reasoning,
        )


# Baseline check: run the un-optimized RAG program on one dev example and compare by eye.
dev_example = devset[12]
print(f"[Devset] Question: {dev_example.question}")
print(f"[Devset] Answer: {dev_example.answer}")
print(f"[Devset] Relevant Wikipedia Titles: {dev_example.gold_titles}")
print()

# A fresh RAG() instance with the default num_passages; no optimization has been applied yet.
generate_answer = RAG()
pred = generate_answer(question=dev_example.question)
print(f"[Prediction] Question: {dev_example.question}")
print(f"[Prediction] Predicted Answer: {pred.answer}")
print(f"[Prediction] Reasoning: {pred.reasoning}")

[Devset] Question: Twelve Inches is a compilation album by which 1980s British band?
[Devset] Answer: Frankie Goes to Hollywood
[Devset] Relevant Wikipedia Titles: {'Twelve Inches', 'Frankie Goes to Hollywood'}

[Prediction] Question: Twelve Inches is a compilation album by which 1980s British band?
[Prediction] Predicted Answer: Soft Cell, Bananarama, Spandau Ballet
[Prediction] Reasoning: The question asks which 1980s British band released a compilation album called "Twelve Inches". I need to find a band that matches both criteria.
The context provides three albums with "Twelve Inches" in the title: "The Twelve Inch Singles" by Soft Cell, "The Twelve Inches of Bananarama" by Bananarama, and "The Twelve Inch Mixes" by Spandau Ballet.
Soft Cell, Bananarama, and Spandau Ballet are all British bands from the 1980s.


In [None]:
# One entry is appended here for every call of validate_context_and_answer.
trial_logs = []


def validate_context_and_answer(example, prediction, trace=None):
    """Exact-match metric that also records every evaluation in ``trial_logs``.

    Args:
        example: gold example; its ``question`` and ``answer`` attributes are read
            (and ``context`` if present).
        prediction: program output; its ``answer`` attribute is read. A missing or
            ``None`` answer is treated as an empty string, i.e. scored 0 against a
            non-empty gold answer, instead of raising.
        trace: optional third positional argument. It is accepted so the metric also
            works when the caller uses the ``(example, prediction, trace)`` call form;
            it does not influence the score.

    Returns:
        int: 1 when gold and predicted answers are equal after ``strip().lower()``,
        otherwise 0.

    Side effects:
        Appends a log dict to the module-level ``trial_logs`` and prints one line.
    """
    gold = example.answer.strip().lower()
    pred = (getattr(prediction, "answer", None) or "").strip().lower()
    score = int(gold == pred)

    # Resolve the optimizer at call time: the metric is defined before `optimizer`
    # is created, and may also be called when no optimizer exists at all.
    active_optimizer = globals().get("optimizer")

    # Format similar to Langwatch's internal logging
    log_entry = {
        "input": {
            "question": example.question,
            # NOTE(review): this reads the context from the gold example, not from the
            # prediction; the printed logs show it as '' -- confirm that is intended.
            "context": getattr(example, "context", "")
        },
        "output": {
            "answer": pred
        },
        # Include prediction trace (as dict if possible); unrelated to the `trace` argument.
        "trace": prediction.__dict__ if hasattr(prediction, "__dict__") else str(prediction),
        "score": score,
        "optimizer_name": type(active_optimizer).__name__ if active_optimizer is not None else None,
    }
    trial_logs.append(log_entry)

    print(f"[Trial] Q: {example.question} | Pred: {pred} | GT: {gold} | Score: {score}")
    return score


# Build the MIPROv2 optimizer. The same `llm` object is passed as both prompt_model and
# task_model. validate_context_and_answer refers to this module-level `optimizer` name when
# it fills in "optimizer_name", so keep the variable name in sync with the metric.
optimizer = MIPROv2(
    metric=validate_context_and_answer,
    prompt_model=llm,
    task_model=llm,
    num_candidates=2,  # number of proposed instructions
    init_temperature=0.7,
    auto=None,  # presumably disables the automatic budget presets so the explicit values apply -- TODO confirm
)


# Optimize a fresh RAG() program on the training set. This cell issues LM calls and each
# metric call appends to `trial_logs`; the small trial/demo/minibatch values keep the run short.
compiled_rag = optimizer.compile(
    RAG(),
    trainset=trainset,
    num_trials=5,
    max_bootstrapped_demos=2,
    max_labeled_demos=3,
    minibatch_size=4,
    requires_permission_to_run=False
)

In [6]:
# optimized results: display the compiled program's repr (its predictor and signature fields)
compiled_rag

generate_answer.predict = Predict(StringSignature(context, question -> reasoning, answer
    instructions='Answer questions with short factoid answers.'
    context = Field(annotation=str required=True json_schema_extra={'desc': 'may contain relevant facts', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'often between 1 and 5 words', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
))

In [7]:
# example output with optimized results
# Runs the compiled program on the first dev example and prints prediction vs. ground truth.
# Note: this rebinds `dev_example` and `pred`, which an earlier cell set for devset[12].
dev_example = devset[0]
pred = compiled_rag(question=dev_example.question)
print("\n--- Test on dev example ---")
print(f"Question: {dev_example.question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Ground Truth: {dev_example.answer}")


--- Test on dev example ---
Question: Pehchaan: The Face of Truth stars Vinod Khanna, Rati Agnihotri and which Indian actress, producer, and former model who also produced the film?
Predicted Answer: Raveena Tandon
Ground Truth: Raveena Tandon


In [None]:
# Number of metric calls recorded during optimization, then the first two log entries.
print(len(trial_logs))
pprint(trial_logs[:2])

41
[{'input': {'context': '',
            'question': 'Which American actress who made their film debut in '
                        'the 1995 teen drama "Kids" was the co-founder of Voto '
                        'Latino?'},
  'optimizer_name': 'MIPROv2',
  'output': {'answer': 'rosario dawson'},
  'score': 1,
  'trace': {'_completions': None,
            '_lm_usage': None,
            '_store': {'answer': 'Rosario Dawson',
                       'context': ['Rosario Dawson | Rosario Isabel Dawson '
                                   '(born May 9, 1979) is an American actress, '
                                   'producer, singer, comic book writer, and '
                                   'political activist. She made her film '
                                   'debut in the 1995 teen drama "Kids". Her '
                                   'subsequent film roles include "He Got '
                                   'Game", "Men in Black II", "25th Hour", '
                          

In [None]:
output_filename = 'history_pprint.txt'

# Dump every record of llm.history as pretty-printed text, one record per block, with
# blank lines between blocks. The file is written as UTF-8 explicitly because the loader
# (load_simple_dicts_from_txt) opens it with encoding='utf-8'; relying on the platform
# default could corrupt non-ASCII model output.
with open(output_filename, 'w', encoding='utf-8') as f:
    for record in llm.history:
        # pprint writes the formatted record plus a trailing newline straight to the file.
        pprint(record, stream=f)
        f.write("\n\n")

### Processing History Records into a Pandas DataFrame

In [5]:
import re
from pprint import pprint


def load_simple_dicts_from_txt(filepath):
    """Parse a text file of pprint-formatted records back into Python objects.

    The file is split into blocks on blank lines. Within each block, every line that
    contains a call-like token (``Something(...)``, e.g. the repr of a custom object)
    is dropped, and the remaining text is parsed as a Python literal.

    Parsing uses ``ast.literal_eval`` rather than ``eval``: even with ``__builtins__``
    disabled, ``eval`` evaluates arbitrary expressions (attribute access such as
    ``().__class__`` still works), and the file contains model-generated text.
    ``literal_eval`` accepts only literals (dicts, lists, tuples, sets, strings,
    numbers, booleans, ``None``), which is all a pprint dump of plain data contains.

    Args:
        filepath: path of the UTF-8 text file to read.

    Returns:
        list: the successfully parsed objects, in file order. Blocks that fail to
        parse are reported with ``print`` and skipped.
    """
    import ast  # local import: only this function needs it

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Matches lines that contain Something(...), i.e. reprs of custom objects.
    object_call = re.compile(r'\w+\(.*\)')

    dicts = []
    for raw_block in content.split('\n\n'):
        block = raw_block.strip()
        if not block:
            continue
        cleaned_str = '\n'.join(
            line for line in block.splitlines() if not object_call.search(line)
        )
        try:
            # strip(): a dropped first line can leave the next line's indentation at
            # the start of the text, which the parser would otherwise reject.
            dicts.append(ast.literal_eval(cleaned_str.strip()))
        except Exception as e:
            print(f"Failed to parse block:\n{cleaned_str}\nError: {e}\n")
    return dicts


def extract_sections(text):
    """Split ``[[ ## name ## ]]``-delimited text into a ``{name: body}`` dict.

    Args:
        text: a string, or a list of strings that is concatenated first.

    Returns:
        dict mapping each stripped section name to its stripped body, or ``None``
        when the text contains no section header followed by a newline.
    """
    joined = ''.join(text) if isinstance(text, list) else text
    sections = {
        found.group(1).strip(): found.group(2).strip()
        for found in re.finditer(
            r'\[\[ ## (.*?) ## \]\]\n(.*?)(?=\n\[\[|$)', joined, re.DOTALL
        )
    }
    # At least one match always yields a non-empty dict, so an empty dict means "no match".
    return sections or None

In [14]:
dict_list = load_simple_dicts_from_txt('history_pprint.txt')
print(f"Loaded {len(dict_list)} dictionaries.")

# Collect, in file order, the questions sent to the LM and the structured outputs
# (answer + reasoning, or a proposed instruction) it returned.
selective_lst = []
for record in dict_list:
    if 'messages' in record:
        messages = record['messages']
        if messages is None:
            # A record whose messages are None is skipped entirely, outputs included.
            continue
        for content_dct in messages:
            # Treat missing/None content as empty instead of failing on the `in` test.
            content = content_dct.get('content') or ''
            if "[[ ## question ## ]]" not in content:
                continue
            content_elems = extract_sections(content) or {}
            if 'question' not in content_elems:
                # Marker present but no parsable question section: nothing to record.
                continue
            question = content_elems['question'].split('\n\n')[0]
            # '{question}' is the unfilled template placeholder, not a real question.
            if question != '{question}':
                selective_lst.append({'question': question})

    # Records without outputs (missing key, None, or empty) have nothing to extract.
    output = record.get('outputs')
    if not output:
        continue
    output_elems = extract_sections(output)
    if output_elems is not None and (
        ('answer' in output_elems and 'reasoning' in output_elems)
        or 'proposed_instruction' in output_elems
    ):
        selective_lst.append(output_elems)

Loaded 52 dictionaries.


In [16]:
# Pretty-print every collected question / output record in order.
for item in selective_lst:
    pprint(item)

{'question': 'Twelve Inches is a compilation album by which 1980s British '
             'band?'}
{'answer': 'Soft Cell, Bananarama, Spandau Ballet',
 'reasoning': 'The question asks which 1980s British band released a '
              'compilation album called "Twelve Inches". I need to find a band '
              'that matches both criteria.\n'
              'The context provides three albums with "Twelve Inches" in the '
              'title: "The Twelve Inch Singles" by Soft Cell, "The Twelve '
              'Inches of Bananarama" by Bananarama, and "The Twelve Inch '
              'Mixes" by Spandau Ballet.\n'
              'Soft Cell, Bananarama, and Spandau Ballet are all British bands '
              'from the 1980s.'}
{'proposed_instruction': 'You are an expert question answering system. Your '
                         'task is to answer questions accurately and concisely '
                         'using a chain-of-thought approach. Given a context '
                         '