<a href="https://colab.research.google.com/github/AdoHaha/dspy_fun/blob/main/programmatic_LLM_and_VLM_use_through_DSPy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Programmatic LLM and VLM use through DSPy
Code to follow along with the presentation ["Programmatic LLM and VLM use through DSPy"](https://raw.githubusercontent.com/AdoHaha/dspy_fun/753b46cb3528eb374b943009e1eb851d7b69c4bb/programmatic%20LLM%20%26%20VLM%20use%20through%20DSPy.pdf)

Igor Zubrycki
igorzubrycki@gmail.com

In [None]:
!pip install dspy opik

In [None]:
# The notebook is intended to run in Google Colab, whose "Secrets" tab stores API keys.
# Outside Colab (where google.colab cannot be imported) fall back to the
# GEMINI_API_KEY environment variable so the rest of the notebook still runs.
try:
    from google.colab import userdata
except ImportError:
    import os

    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
else:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')

### Basic DSPy use -- using predict

In [None]:
import dspy

# Handle for the small Gemini model; the API key comes from GEMINI_API_KEY.
small_model = dspy.LM("gemini/gemini-2.5-flash-lite", api_key=GEMINI_API_KEY)
# Register small_model as the LM used by the DSPy calls that follow.
dspy.configure(lm=small_model)
sum_of_numbers = dspy.Predict('numbers -> sum_of_numbers') # we want the input to be numbers and the output to be their sum
result = sum_of_numbers(numbers = (12,13,15))

print(result)



In [None]:
# Numbers do not necessarily need to be a list or a string; any representation of numbers will do
image_url = "https://raw.githubusercontent.com/AdoHaha/dspy_fun/refs/heads/main/example_files/image_numbers.png" # image of numbers generated with "nano banana"
# Wrap the URL in a dspy.Image so it can be passed as a predictor input.
numbers_image = dspy.Image.from_url(image_url)
from IPython.display import Image
# Show the picture inline in the notebook.
display(Image(image_url, width=300))

In [None]:
# The same predictor is called with the image as its `numbers` input.
result = sum_of_numbers(numbers = numbers_image)

print(result) # we gave the model freedom over the type of the answer

# You can ensure that the result is a float by simply adding a type to the output field
sum_of_numbers = dspy.Predict('numbers -> sum_of_numbers:float')
result = sum_of_numbers(numbers = numbers_image)

print(result)


In [None]:
# Inspect the most recent history entry of the predictor.
sum_of_numbers.history[-1] # history shows how an adapter converted the signature and the function call into a prompt

# Being more precise with Signatures

In [None]:
from typing import Optional
# Signature (instruction text plus typed fields) for adding numbers given in free-form input.
# NOTE(review): the class docstring is presumably sent to the model as the task
# instruction, so its wording is left untouched — confirm before rewording it.
class NumberAdd(dspy.Signature):
 """Please add numbers provided in a various ways together. Numbers can also be symbolic or require computation.
 Single number is also ok
 Only if there are no numbers in input, write a sad haiku using the contents of input. """
 # Input field; no type annotation is given.
 numbers = dspy.InputField(description="numbers to add")
 # Numeric result, declared as float.
 sum_of_numbers: float = dspy.OutputField(description="resulting sum")
 # Optional text output; the instruction asks for it only when the input holds no numbers.
 haiku: Optional[str] = dspy.OutputField(description="sad haiku")

 # Note that a field's type can also be a previously specified signature


In [None]:
# Predictor built from the class-based NumberAdd signature.
sum_of_numbers_haiku = dspy.Predict(NumberAdd)


In [None]:
# Input made of number words.
sum_of_numbers_haiku(numbers = "one, two")

In [None]:
# Input without any numbers: the signature's instruction asks for a sad haiku in this case.
sum_of_numbers_haiku(numbers = "dog, bowl")

In [None]:
# we can also create signatures for visual tasks


from typing import List, Dict

# Signature for detecting multi-digit numbers in an image together with their boxes.
# The class docstring and the field descriptions are the text of the signature, so
# they must agree with each other and with the code that consumes `boxes`: all
# coordinates are on a normalized 0-1000 scale (the drawing helper in this notebook
# rescales them by image size / 1000), not raw pixels.
class NumberDetections(dspy.Signature):
    """Detect all numbers (not single digits) in the image and return their bounding boxes.

    Boxes use normalized coordinates (0-1000) in xyxy format: x_min, y_min, x_max, y_max.
    Return an empty list if no numbers are found.
    """
    # Image to search for numbers.
    image: dspy.Image = dspy.InputField(desc="Image to analyze.")
    # One dict per detected number; the expected keys are listed in the description.
    boxes: List[Dict] = dspy.OutputField(
        desc="One dict per number in normalized coordinates (0-1000): {'x_min': int, 'y_min': int,'x_max': int, 'y_max': int, 'number':float}")


# For more ready-made input types, use the Attachments library: https://github.com/maximerivest/Attachments (full texts, multiple files, etc.)
#!pip install attachments



In [None]:
# importing modules

import urllib.request
from PIL import Image
from io import BytesIO




In [None]:
# Let's use a specialized vision model

visionmodel =  dspy.LM("gemini/gemini-2.5-flash",api_key=GEMINI_API_KEY)

In [None]:
# Predictor for the bounding-box signature.
new_detector = dspy.Predict(NumberDetections)
# Run this one call with visionmodel inside a temporary dspy.context block.
with dspy.context(lm = visionmodel):
  detections = new_detector(image = numbers_image)
print(detections)
def draw_detectcions(image_url, detections):
  """Download an image and draw the detected bounding boxes on it.

  Args:
    image_url: URL of the image to download.
    detections: iterable of dicts with the keys "x_min", "y_min", "x_max" and
      "y_max" (coordinates on a 0-1000 scale) and "number" (used as the label).

  Returns:
    The PIL image with one red rectangle and one red text label per detection.
  """
  from PIL import ImageDraw

  # Read the image bytes from the URL and open them in PIL.
  with urllib.request.urlopen(image_url) as response:
    image = Image.open(BytesIO(response.read()))

  draw = ImageDraw.Draw(image)
  for detection in detections:
    # Rescale from the 0-1000 grid to pixel coordinates. The two values per axis are
    # sorted because the boxes come from a model and may arrive with min/max swapped,
    # which some Pillow versions reject when drawing a rectangle.
    left, right = sorted(
        int(detection[key] * image.width/1000) for key in ("x_min", "x_max"))
    top, bottom = sorted(
        int(detection[key] * image.height/1000) for key in ("y_min", "y_max"))
    draw.rectangle([(left, top), (right, bottom)], outline="red", width=2)
    # Label the box with the detected number at its top-left corner.
    draw.text((left, top), str(detection["number"]), fill="red")
  return image
# Draw the boxes returned by the detector on the image fetched from image_url.
draw_detectcions(image_url, detections.boxes)

# changing models

There are specialized, or simply more powerful, models that you can use for the task at hand.

In [None]:
# Same signature, now wrapped in ChainOfThought.
sum_of_numbers_haiku = dspy.ChainOfThought(NumberAdd) # a model can simply be not smart enough
sum_of_numbers_haiku(numbers = "dragon,siete, enterprise") # frequently the answer either does not recognize that "siete" is a number, or outputs both 7 and a haiku

In [None]:
# Handle for a larger Gemini model, used as a drop-in replacement below.
larger_lm = dspy.LM("gemini/gemini-2.5-flash",api_key=GEMINI_API_KEY)

In [None]:

with dspy.context(lm = larger_lm): # models can be swapped for the duration of a context block
    # Same module and input as before, now run with larger_lm.
    sum_of_numbers_haiku = dspy.ChainOfThought(NumberAdd)
    print(sum_of_numbers_haiku(numbers = "dragon,siete, enterprise"))

In [None]:
# try a specialized (and open) vision language model for image understanding
# open router is a nice way to access them
#visionmodel =  dspy.LM(model = "openrouter/z-ai/glm-4.5v",
#                       api_key = OPENROUTER_API_KEY)


# Cost

DSPy keeps logs at the global level, at the language-model level and at the module level.

They are provided as a history in which each call is described.

In [None]:
# Cost recorded in the most recent history entry of each model.
big_cost = larger_lm.history[-1]["cost"]
small_cost = small_model.history[-1]["cost"]

print(f"Big cost: {big_cost}")
print(f"Small cost: {small_cost}")
# Only compute the ratio when both costs are usable numbers: a missing (None) or
# zero cost would otherwise raise TypeError / ZeroDivisionError here.
if big_cost and small_cost:
    print(f"Smaller model is {big_cost/small_cost} times cheaper")
else:
    print("Cost ratio unavailable: at least one of the recorded costs is missing or zero")

In [None]:
# Trying a chain of modules
try_n = dspy.ChainOfThought(NumberAdd,n=5) # we ask the model 5 times
best_of_n = dspy.MultiChainComparison(NumberAdd, M=5) # we compare the outputs and choose one
tries = try_n(numbers = "dragon,siete, enterprise")
tries.completions

# The comparison module needs the original input as well as the candidate completions;
# without `numbers` it would have to pick an answer without seeing the question.
best_of_n(tries.completions, numbers = "dragon,siete, enterprise")

# Writing own modules

We can create our own modules, which allow us to combine strategies to fit our idea.

In [None]:
from typing import Optional

class BestNumber(dspy.Module):
  """Sum numbers by sampling several candidate answers and refining a final one.

  The module first generates ``n`` chain-of-thought answers for the NumberAdd
  signature, then asks a second chain-of-thought predictor (NumberAdd extended
  with a ``possible_answers`` input) for the final answer. That second step is
  wrapped in ``dspy.Refine`` with ``check_result`` as its reward function.
  """

  def __init__(self, n):
    """Build the sub-modules.

    Args:
      n: number of candidate answers generated per call.
    """
    # Initialise the dspy.Module base class explicitly before adding sub-modules.
    super().__init__()
    self.n = n
    # will generate n answers
    self.chain = dspy.ChainOfThought(NumberAdd, n=n)
    # NumberAdd plus one extra input carrying the candidate answers.
    signature_possible = NumberAdd.append("possible_answers",
                dspy.InputField(
                    desc="choice of possible answers, with reasoning",
                ))
    best_answer = dspy.ChainOfThought(signature_possible)

    # Refine wraps the final predictor and scores its output with check_result.
    self.check_rule = dspy.Refine(best_answer, N=3, reward_fn=self.check_result, threshold=1.0)

  def check_result(self, args, result):
    """Reward function: when the number is not zero, a haiku should not be generated.

    Args:
      args: inputs of the checked call (unused).
      result: prediction exposing ``sum_of_numbers`` and ``haiku``.

    Returns:
      1.0 when exactly one of "sum is non-zero" and "haiku is present" holds,
      otherwise 0.0. A float is returned so that it compares cleanly with the
      ``threshold=1.0`` passed to ``dspy.Refine``.
    """
    has_sum = result.sum_of_numbers != 0
    has_haiku = result.haiku is not None
    return float(has_sum != has_haiku)

  def forward(self, numbers): #forward is the key to module behaviour, will be used during runtime, for logging and optimization
    """Generate candidate answers for ``numbers`` and return the refined final answer."""
    tries = self.chain(numbers=numbers)

    final_answer = self.check_rule(possible_answers = tries.completions, numbers=numbers)

    return final_answer

In [None]:
# Instantiate the custom module with 4 candidate answers per call.
bestnumber = BestNumber(n=4)

print(bestnumber(numbers = "dragon,siete,enterprise"))

# Tool use

In [None]:
import sympy

def symbolic_expression_sympy(expression: str, *Args) -> float:
  """
  Takes a symbolic math expression (written as a string) and returns a result as a float, evaluated to 5 significant numbers, using sympy.
  For example symbolic_expression_sympy("2*log(E)") would result in 2.0000
  """
  # NOTE(review): this function is handed to dspy.ReAct as a tool, so the docstring
  # above is presumably shown to the model as the tool description; it is kept as is.
  # Extra positional arguments (*Args) are accepted and ignored.
  #
  # SECURITY: sympy.sympify evaluates the string with eval-based parsing and must not
  # be fed untrusted input. Here the expression is produced by the language model.
  value = sympy.sympify(expression).evalf(5)
  try:
    # Convert through the 5-significant-digit text form so that the caller gets a
    # plain Python float (as annotated) instead of a sympy object.
    return float(str(value))
  except ValueError as err:
    # Raised for results that are not a finite real number (free symbols, complex
    # values, infinities).
    raise ValueError(
        f"expression {expression!r} did not evaluate to a real number: {value}") from err


In [None]:
sum_of_numbers_smarter = dspy.ReAct('numbers -> sum_of_numbers', tools=[symbolic_expression_sympy]) # ReAct uses the tools provided together with a chain of thought
# Inputs that need actual computation.
prediction = sum_of_numbers_smarter(numbers=["2*sin(10)","pi"])
prediction


# RAG
Retrieval-augmented generation is a common strategy to deliver context.

Retrieval can use tools such as reverse index search (akin to old-school search engines with keywords) or embedding-based search.

DSPy can be easily connected to external vector databases (like chromadb),
and it also plays nicely with helper tools such as the ones in langchain (for reading standard file formats, connecting to databases).

The built-in dspy.Embeddings tool uses FAISS internally.

In [None]:
!git clone http://github.com/python/peps.git #we will use all PEPs up to point as a knowledge base


In [None]:
import os

# Load every .rst file of the cloned PEPs repository as one document.
documents = []
# os.listdir returns entries in arbitrary order; sorting keeps the corpus order
# (and therefore any index built on top of it) the same on every run.
for filename in sorted(os.listdir('./peps/peps')):
    if filename.endswith('.rst'):
        filepath = os.path.join('./peps/peps', filename)
        # errors='ignore' drops bytes that are not valid UTF-8 instead of failing.
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            documents.append(f.read())

print(f"Loaded {len(documents)} documents.")

In [None]:
!pip install inverted-index #very simple inverted-index tool

In [None]:
from inverted_index.inverted_index import InvertedIndex

# Build the keyword index over the loaded documents.
ii = InvertedIndex()

ii.index(documents)

In [None]:
# Quick manual check of the index with a keyword query.
ii.search("PEP 355")

## Using logging tools

DSPy traces can be explored in MLOps tools such as Opik (Comet) or MLflow. This makes it easy to see how information flowed and what the particular responses were, which is also essential for optimization. Personally I feel that MLflow is now better integrated; try both.

In [None]:
import opik
from opik.integrations.dspy.callback import OpikCallback

opik.configure(use_local=False)

# Importing OpikCallback alone does not trace anything: the callback has to be
# registered with DSPy. Only `callbacks` is passed here; no other setting is given.
opik_callback = OpikCallback()
dspy.configure(callbacks=[opik_callback])


In [None]:
class PEPSearch(dspy.Module):
  """Answer Python questions using PEP texts found through the keyword index ``ii``.

  The module first asks the model for a query for the reverse index, searches the
  notebook-level index ``ii`` with it, and then answers the question from the hits.
  """

  def __init__(self):
    # Initialise the dspy.Module base class explicitly before adding sub-modules.
    super().__init__()
    self.respond = dspy.ChainOfThought('context_based_on_search, python_question -> easy_to_understand_response_based_on_PEP')
    self.reverseindexquery = dspy.ChainOfThought('python_question -> query_to_reverse_index')

  def forward(self, question):
    """Return the model's answer to ``question`` based on the search results.

    Args:
      question: the Python question to answer.

    Returns:
      The ``easy_to_understand_response_based_on_PEP`` output of the responder.
    """
    query = self.reverseindexquery(python_question = question)
    #print(query)
    # At most the first 10 hits are joined into one context string.
    search_responses ="\n\n".join(ii.search(query.query_to_reverse_index)[0:10])
    response = self.respond(context_based_on_search = search_responses, python_question = question)
    return response.easy_to_understand_response_based_on_PEP

In [None]:
# Keyword-search based question answering over the PEPs.
pep_trivia = PEPSearch()

pep_trivia(question = "what is PEP 761 about?")

In [None]:
# Embedding model used to embed the corpus.
embedder = dspy.Embedder("gemini/embedding-001", dimensions = 768,  api_key = GEMINI_API_KEY, batch_size = 20)
# Retriever over the loaded PEP documents, configured with k = 5.
embeddings_peps = dspy.Embeddings(embedder = embedder, corpus = documents, k = 5)
class PEPEmbeddingRetreival(dspy.Module):
  """Answer Python questions using PEP texts found through embedding-based search.

  The module first asks the model for a search query, runs it through the
  notebook-level retriever ``embeddings_peps``, and then answers the question
  from the retrieved context.
  """

  def __init__(self):
    # Initialise the dspy.Module base class explicitly before adding sub-modules.
    super().__init__()
    self.respond = dspy.ChainOfThought('context_based_on_search, python_question -> easy_to_understand_response_based_on_PEP')
    # The attribute keeps its historical name although it now produces the query
    # for the embedding-based search.
    self.reverseindexquery = dspy.ChainOfThought('python_question -> query_to_embbeding_based_search')

  def forward(self, question):
    """Return the model's answer to ``question`` based on the retrieved context.

    Args:
      question: the Python question to answer.

    Returns:
      The ``easy_to_understand_response_based_on_PEP`` output of the responder.
    """
    query = self.reverseindexquery(python_question = question)
    #print(query)
    # The retriever result is passed on unchanged as the context.
    search_responses =embeddings_peps(query.query_to_embbeding_based_search)
    response = self.respond(context_based_on_search = search_responses, python_question = question)
    return response.easy_to_understand_response_based_on_PEP

In [None]:
# Embedding-search based question answering over the PEPs.
pep_trivia_embeddings = PEPEmbeddingRetreival()
pep_trivia_embeddings(question = "any news about python 3.14?")

# Optimization

DSPy's key idea is that we first build the information-flow system from components and focus on context engineering, and only later align the models' behaviour through compilation.

For that we need to have in place:

 - some datasets with expected outcomes
 - a metric (or metrics) that we will use to optimize. This metric can be a judge model with set of instructions (we can optimize even the judge)
 - a choice of optimizers (teleprompters). These vary in their requirements (number of examples, type of metrics, helper functions) and in the scope of optimization: they can provide important examples (demos), optimize prompts, or collaborate in fine-tuning the model itself

In [None]:
sum_of_numbers_smarter = dspy.ReAct( # similar to the previous ones, but we ensure that the output is a float; very basic otherwise.
                                    # a one-line instruction is added, based on what we need from the task
    dspy.Signature('numbers -> sum_of_numbers:float',"find all the numbers in any format and add them"),
                                    tools=[symbolic_expression_sympy])


## Datasets

As with other machine-learning based systems, it is important to have datasets to verify against.

In the case of AI (LLM/VLM) based systems and optimization strategies such as those below, these datasets:

 - Do not need to be very large. Tens to hundreds of examples will do
 - Need to be very well checked as all the errors (outliers) will very probably end up as important examples or modify the prompt

In [None]:
#preparing dataset
# (input, expected sum) pairs. Inputs are deliberately messy: number words in several
# languages, lists, symbolic expressions and inputs without any number (expected 0).
# Non-integer labels are rounded to 4 decimal places.
examples_pairs = [
    ("siete banana dos", 9),
    ("one seven", 8),
    ("1,2,3", 6),
    (1, 1),
    (["exp(1)", 7], 9.7183),
    ("SIN(1) AND 1", 1.8415),
    ("4,5,6", 15),
    (["7", "8", "9"], 24),
    ("10 + 11", 21),
    ("3.1416 and 2.7183", 5.8599),
    ("-1, -2, 5", 2),
    (["100", "200", "300"], 600),
    ("0.3333 + 0.6667", 1.0),
    ("SQRT(4) + 6", 8.0),
    ("2^3 and 1", 9),
    (["12.5", "-2.5"], 10.0),
    ("0.1 + 0.2 + 0.3", 0.6),
    ("exp(1) + 3", 5.7183),
    ("cos(0)+1", 2.0),
    ("[‘foo’,42]", 42),
    ("99 bottles + 1", 100),
    ("1e3 + 2e3 + 3e3", 6000),
    ("π + 1", 4.1416),
    ("ln(10) + 2", 4.3026),
    ("tan(1) + 0", 1.5574),
    ("[‘3*3’, 4]", 13),
    ("2.5, 7.5", 10.0),
    ("sin(0.5) + cos(0.5)", 1.3570),
    ("log10(1000) + 4", 7.0),
    ("sqrt(2) + sqrt(3)", 3.1463),
    ("phi + 1", 2.6180),              # golden ratio 1.6180 + 1
    ("abs(-7) + 3", 10.0),
    ("round(2.718,2)+1", 3.72),
    ("sin(pi/2) + cos(0)", 2.0),
    ("arctan(1) + 1", 1.7854),
    ("sinh(1) + cosh(1)", 2.7183),    # = e^1
    ("exp(2) + 1", 8.3891),
    ("log(100) + 1", 5.6052),         # natural log
    ("7.77 + 8.88 + 9.99", 26.64),
    ("0.12345 + 0.54321", 0.66666),
    ("2π + e", 9.0015),
    ("gamma(5)", 24.0),               # 4! = 24
    ("erf(1) + 1", 1.8427),
    ("ceil(2.3)+floor(2.3)", 5),
    ("10^-2 + 10^-3", 0.011),
    ("sqrt(5) + sqrt(7)", 4.8818),
    ("sin(2) + cos(3)", -0.0807),
    (["42", "banana"], 42),           # keep numeric, ignore nonsense
    ("pi^2 + e^2", 17.2587),
    ("'hello' + 'world'", 0),         # no numeric
    ("sqrt(11) + sqrt(13)", 6.9222),
    ("exp(0) + log(1)", 1.0),
    ("foo(99)", 99),                  # keep numeric 99, ignore nonsense
    ("tan(pi/4) + 10", 11.0),
    ("gamma(6)", 120.0),
    ("raz dwa trzy", 6),
    (["dragon", "unicorn"], 0),       # no numeric
    ("1/3 + 2/3", 1.0),
    ("arcsin(1) + arccos(0)", 3.1416),
    ("NaN test", 0),                  # no numeric
    ("ln(50) + 0.5", 4.4120),
    ("e^3 + 2", 22.0855),             # e^3 = 20.0855
    ("cosh(2) - sinh(2)", 0.1353),    # = e^-2
    ("weird#string", 0),
    ("10% of 200", 20.0),
    ("floor(7.9) + ceil(7.1)", 15.0),
    (["None", "None"], 0),            # no numeric
]
# Convert the pairs to a list of dspy.Example objects with `numbers` marked as the input.
# A comprehension keeps its loop variables local, so notebook-level names such as
# `sum_of_numbers` (a predictor defined in an earlier cell) are not overwritten.
examples = [
  dspy.Example(numbers = raw_numbers, sum_of_numbers = expected_sum).with_inputs("numbers")
  for raw_numbers, expected_sum in examples_pairs
]

## Metrics

Metrics ideally output a single value, but for the task at hand it is important to understand what the key properties of the answer are.

Particularly with an approach such as GEPA, you can provide more feedback to the optimizer, for example if the answer is too long, mathematically incorrect or silly.

Other language models can be judges quite effectively, particularly for typical language tasks (for vision it may be more tricky, as there can be systematic errors in the way the image is encoded in both the VLMs used for inference and those used for judging).

In [None]:
def metric(example, pred, trace=None):
    """Basic metric: the predicted sum should be approximately equal to the gold sum.

    Args:
        example: object whose ``sum_of_numbers`` attribute holds the expected value.
        pred: object whose ``sum_of_numbers`` attribute holds the predicted value.
        trace: accepted for compatibility with callers that pass it; unused.

    Returns:
        True when both values are numeric and differ by less than 0.0001,
        otherwise False. A missing or non-numeric prediction counts as wrong
        instead of raising.
    """
    try:
        gold = float(example.sum_of_numbers)
        predicted = float(pred.sum_of_numbers)
    except (TypeError, ValueError):
        return False
    return abs(gold - predicted) < 0.0001  # let's give some margin of error

In [None]:
# Split by position: first 40 examples for training, the next 15 for development, the rest for testing.
trainset, devset, testset = examples[:40], examples[40:55], examples[55:]


In [None]:
# Reusable evaluator: scores a program on devset with `metric`, using 4 threads.
evaluate = dspy.Evaluate(devset=devset, metric=metric, num_threads=4, display_progress=True,
                         display_table=0, max_errors=999)

In [None]:
# Baseline: evaluate the unoptimized ReAct program.
resu = evaluate(sum_of_numbers_smarter)
resu

In [None]:
labeledfew = dspy.LabeledFewShot(k=3) # the simplest optimization: it fills the demos part of the model call

# Compile the program against the training set.
optimizedlabelw = labeledfew.compile(sum_of_numbers_smarter, trainset=trainset)

In [None]:
# Try the compiled program on a simple input.
optimizedlabelw(numbers = "one, two")

In [None]:
# Inspect the demos stored on the compiled program's `extract.predict` predictor.
optimizedlabelw.extract.predict.demos

In [None]:
# Save the compiled program to few_shot.json.
optimizedlabelw.save("few_shot.json")

In [None]:
# SIMBA optimizer driven by the basic metric; the seed is fixed.
simba = dspy.SIMBA(metric=metric, max_steps=3, max_demos=5)
optimized_agent_simba = simba.compile(sum_of_numbers_smarter, trainset=trainset, seed=6793115)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim (same optimizer settings,
# program, trainset and seed) — confirm whether the second optimization run is intended.
simba = dspy.SIMBA(metric=metric, max_steps=3, max_demos=5)
optimized_agent_simba = simba.compile(sum_of_numbers_smarter, trainset=trainset, seed=6793115)

In [None]:
# Score the SIMBA-optimized program on the development set.
evaluate(optimized_agent_simba)


In [None]:
# Inspect the demos stored on the optimized program's `extract.predict` predictor.
optimized_agent_simba.extract.predict.demos

In [None]:
# Here is opportunity to pass more knowledge about quality of the answer or some tips to the optimizer
def metric_with_feedback(example,prediction,trace=None, pred_name=None, pred_trace = None):
  """Score a prediction and explain the score in text for the optimizer.

  Args:
    example: object whose ``sum_of_numbers`` attribute holds the expected value.
    prediction: object whose ``sum_of_numbers`` attribute holds the model's answer.
    trace, pred_name, pred_trace: accepted for compatibility with the caller; unused.

  Returns:
    A ``dspy.Prediction`` with ``score`` (1.0 or 0.0) and ``feedback`` (text
    stating whether the answer was correct and what it should have been).
    Wrong answers are also printed together with the example.
  """
  correct_answer = float(example.sum_of_numbers)
  try:
    llm_answer = float(prediction.sum_of_numbers)
  except (TypeError, ValueError):
    llm_answer = None

  if llm_answer is None:
    # A non-numeric answer can never be correct, so score it 0 without calling `metric`.
    llm_answer = "it was not a number"
    score = 0.0
  else:
    score = float(metric(example,prediction))

  if score==1:
    feedback_text = f"Your answer is correct {correct_answer}"
  else:
    print(example)
    feedback_text = f"Your answer: {llm_answer} is not correct, it should be {correct_answer}"
    print(feedback_text)
  return dspy.Prediction(score = score, feedback = feedback_text)

In [None]:
# GEPA optimizer driven by the feedback-returning metric; both names refer to the same object.
gepa_optimizer = optimizer = dspy.GEPA(
    metric=metric_with_feedback,
    #auto="light",
    max_metric_calls=200,
    num_threads=32,
    track_stats=True,
    reflection_minibatch_size=3,
    reflection_lm=dspy.LM(model="gemini/gemini-2.5-flash", temperature=1.0, max_tokens=32000,  api_key=GEMINI_API_KEY) # note that these models receive a large part of the history: hundreds of thousands of tokens
)
# Optimize on the training set, with the development set as the validation set.
gepa_optimized_program = optimizer.compile(
    sum_of_numbers_smarter,
    trainset=trainset,
    valset=devset,
)

In [None]:
# Save the GEPA-optimized program to gepa.json.
gepa_optimized_program.save("gepa.json")

In [None]:
# Score the GEPA-optimized program on the development set.
evaluate(gepa_optimized_program)