In [1]:
import dspy
from datasets import load_dataset, Dataset
from dspy.datasets import DataLoader
from src.logger import logger
import json
import random
from dotenv import load_dotenv, find_dotenv
import os
import openai
from typing import Any
import jellyfish
import gc
from itertools import combinations
from huggingface_hub import login
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import re

# PEFT (parameter-efficient fine-tuning) methods enable efficient adaptation of large pretrained models
from peft import LoraConfig,  get_peft_model, prepare_model_for_kbit_training

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging face login
login(token=os.getenv("HUGGING_FACE_TOKEN"))

In [3]:
_ = load_dotenv(find_dotenv())
os.environ['OPENAI_API_KEY'] = os.environ['OPENAI_API_KEY']
openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
import os
import tempfile
from datasets import load_dataset
from typing import Dict, Any, List
import dspy

def load_conll_dataset() -> dict:
    """
    Loads the CoNLL-2003 dataset into train, validation, and test splits.

    Returns:
        dict: Dataset splits with keys 'train', 'validation', and 'test'.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        # Use a temporary Hugging Face cache directory for compatibility with certain hosted notebook
        # environments that don't support the default Hugging Face cache directory
        os.environ["HF_DATASETS_CACHE"] = temp_dir
        return load_dataset("conll2003", trust_remote_code=True, token=os.getenv("HUGGING_FACE_TOKEN"))

def extract_people_entities(data_row: dict[str, Any]) -> list[str]:
    """
    Extracts entities referring to people from a row of the CoNLL-2003 dataset.

    Args:
        data_row (dict[str, Any]): A row from the dataset containing tokens and NER tags.

    Returns:
        list[str]: List of tokens tagged as people.
    """
    return [
        token
        for token, ner_tag in zip(data_row["tokens"], data_row["ner_tags"])
        if ner_tag in (1, 2)  # CoNLL entity codes 1 and 2 refer to people
    ]

def prepare_dataset(data_split, start: int, end: int) -> list[dspy.Example]:
    """
    Prepares a sliced dataset split for use with DSPy.

    Args:
        data_split: The dataset split (e.g., train or test).
        start (int): Starting index of the slice.
        end (int): Ending index of the slice.

    Returns:
        list[dspy.Example]: List of DSPy Examples with tokens and expected labels.
    """
    return [
        dspy.Example(
            tokens=row["tokens"],
            expected_extracted_people=extract_people_entities(row)
        ).with_inputs("tokens")
        for row in data_split.select(range(start, end))
    ]

# Load the dataset
dataset = load_conll_dataset()

# Prepare the training and test sets
train_set = prepare_dataset(dataset["train"], 0, 50)
test_set = prepare_dataset(dataset["test"], 0, 200)

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'conll2003' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
Using the latest cached version of the dataset since conll2003 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'conll2003' at /Users/lausena/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98 (last modified on Tue Jul 22 19:54:00 2025).


In [7]:
train_set

[Example({'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'expected_extracted_people': []}) (input_keys={'tokens'}),
 Example({'tokens': ['Peter', 'Blackburn'], 'expected_extracted_people': ['Peter', 'Blackburn']}) (input_keys={'tokens'}),
 Example({'tokens': ['BRUSSELS', '1996-08-22'], 'expected_extracted_people': []}) (input_keys={'tokens'}),
 Example({'tokens': ['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.'], 'expected_extracted_people': []}) (input_keys={'tokens'}),
 Example({'tokens': ['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 

In [53]:
test_set

[Example({'tokens': ['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.'], 'expected_extracted_people': ['CHINA']}) (input_keys={'tokens'}),
 Example({'tokens': ['Nadim', 'Ladki'], 'expected_extracted_people': ['Nadim', 'Ladki']}) (input_keys={'tokens'}),
 Example({'tokens': ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06'], 'expected_extracted_people': []}) (input_keys={'tokens'}),
 Example({'tokens': ['Japan', 'began', 'the', 'defence', 'of', 'their', 'Asian', 'Cup', 'title', 'with', 'a', 'lucky', '2-1', 'win', 'against', 'Syria', 'in', 'a', 'Group', 'C', 'championship', 'match', 'on', 'Friday', '.'], 'expected_extracted_people': []}) (input_keys={'tokens'}),
 Example({'tokens': ['But', 'China', 'saw', 'their', 'luck', 'desert', 'them', 'in', 'the', 'second', 'match', 'of', 'the', 'group', ',', 'crashing', 'to', 'a', 'surprise', '2-0', 'defeat', 'to', 'newcomers', 'Uzbekistan', '.'], 'expected_extracted_people': []}) (input_keys={'to

### DSPy Concepts
##### Signatures:
- Define structured input/output schemas for our program
##### Modules
- Encapsulate program logic in reusable, composable units

In [65]:
from typing import List

class PeopleExtraction(dspy.Signature):
    """
    Extract contiguous tokens referring to specific people, if any, from a list of string tokens.
    Output a list of tokens. In other words, do not combine multiple tokens into a single value.
    """
    tokens: list[str] = dspy.InputField(desc="tokenized text")
    extracted_people: list[str] = dspy.OutputField(desc="all tokens referring to specific people extracted from the tokenized text")

people_extractor = dspy.ChainOfThought(PeopleExtraction)

In [42]:
lm = dspy.LM(model="openai/gpt-4o-mini")
dspy.settings.configure(lm=lm)

### Define Metric and Evaluation Functions
- Define a customer metric (`extraction_correctness_metric`) to evaluate whether the extracted entities match ground truth.
- Create an evaluation function (`evaluate_correctness`) to apply this metric to a training or test dataset and compute the overall accuracy

In [63]:
def extraction_correctness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> bool:
    """
    Computes correctness of entity extraction predictions.

    Args:
        example (dspy.Example): An example from the dataset containing expected people entities
        prediction (dspy.Prediction): The prediction from the DSPy people extraction program
        trace: Optional trace object for debugging.

    Returns:
        bool: True if predictions match expectations, False otherwise.
    """
    return prediction.extracted_people == example.expected_extracted_people

In [64]:
evaluate_correctness = dspy.Evaluate(
    devset=test_set,
    metric=extraction_correctness_metric,
    num_threads=24,
    display_progress=True,
    display_table=True
)

#### Evaluate Initial Extractor
Before optimizing, we need a baseline evaluation to understand its current performance. This helps:
- establish a reference point for comparison after optimization
- identify potential weaknesses in the initial implementation

In [45]:
evaluate_correctness(people_extractor, devset=test_set)

Average Metric: 179.00 / 200 (89.5%): 100%|██████████| 200/200 [00:29<00:00,  6.70it/s]

2025/07/23 08:33:43 INFO dspy.evaluate.evaluate: Average Metric: 179 / 200 (89.5%)





Unnamed: 0,tokens,expected_extracted_people,reasoning,extracted_people,extraction_correctness_metric
0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, IN, SURPRISE, DEFEAT...",[CHINA],The tokens provided do not contain any specific names of people. T...,[],
1,"[Nadim, Ladki]","[Nadim, Ladki]","The tokens ""Nadim"" and ""Ladki"" refer to specific individuals. ""Nad...","[Nadim, Ladki]",✔️ [True]
2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]",[],The provided tokens do not contain any references to specific peop...,[],✔️ [True]
3,"[Japan, began, the, defence, of, their, Asian, Cup, title, with, a...",[],The provided tokens do not contain any specific names of people. T...,[],✔️ [True]
4,"[But, China, saw, their, luck, desert, them, in, the, second, matc...",[],"The tokenized text mentions ""China"" and ""Uzbekistan,"" which are bo...",[],✔️ [True]
...,...,...,...,...,...
195,"['The', 'Wallabies', 'have', 'their', 'sights', 'set', 'on', 'a', ...","[David, Campese]","The tokenized text mentions ""David Campese,"" who is a specific per...","[David, Campese]",✔️ [True]
196,"['The', 'Wallabies', 'currently', 'have', 'no', 'plans', 'to', 'ma...",[],"The text mentions ""the 34-year-old winger,"" which refers to a spec...","[34-year-old, winger]",
197,"['Campese', 'will', 'be', 'up', 'against', 'a', 'familiar', 'foe',...","[Campese, Rob, Andrew]","The tokens contain references to specific people, namely ""Campese""...","[Campese, Rob, Andrew]",✔️ [True]
198,"['""', 'Campo', 'has', 'a', 'massive', 'following', 'in', 'this', '...","[Campo, Andrew]","The tokenized text mentions ""Andrew"" as a specific person. It is t...",[Andrew],


89.5

## Optimize the Model
DSPy includes optimizers that can improve the system
Here, we use `MIPROv2` optimizer to:
- tune the program's language model (lm) prompt by
    - using the lm to adjust the prompt's instruction
    - build few-shot examples from the training datasets that are augmented with reasoning generated from dspy.ChainOfThought
- maximize correctness on the training set

In [47]:
mipro_optimizer = dspy.MIPROv2(
    metric=extraction_correctness_metric,
    auto="medium"
)
optimized_people_extractor = mipro_optimizer.compile(
    people_extractor,
    trainset=train_set,
    max_bootstrapped_demos=4,
    requires_permission_to_run=False,
    minibatch=False
)

2025/07/23 08:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 18
minibatch: False
num_fewshot_candidates: 12
num_instruct_candidates: 6
valset size: 40

2025/07/23 08:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/07/23 08:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/07/23 08:38:15 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 3/12


 40%|████      | 4/10 [00:07<00:11,  1.86s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/12


 40%|████      | 4/10 [00:04<00:07,  1.18s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/12


 20%|██        | 2/10 [00:01<00:06,  1.30it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 6/12


 20%|██        | 2/10 [00:00<00:00, 1431.75it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 7/12


 10%|█         | 1/10 [00:00<00:00, 1118.78it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 8/12


 20%|██        | 2/10 [00:00<00:00, 1425.66it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 9/12


 30%|███       | 3/10 [00:00<00:00, 1525.02it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 10/12


 10%|█         | 1/10 [00:01<00:16,  1.88s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 11/12


 30%|███       | 3/10 [00:02<00:04,  1.48it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 12/12


 20%|██        | 2/10 [00:00<00:00, 1338.75it/s]
2025/07/23 08:38:34 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/07/23 08:38:34 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Error getting source code: unhashable type: 'dict'.

Running without program aware proposer.


2025/07/23 08:38:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=6 instructions...

2025/07/23 08:38:47 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/07/23 08:38:47 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Extract contiguous tokens referring to specific people, if any, from a list of string tokens.
Output a list of tokens. In other words, do not combine multiple tokens into a single value.

2025/07/23 08:38:47 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Given a list of string tokens, identify and extract contiguous tokens that refer to specific individuals or people. Ensure that the output is a list of tokens, with each token remaining separate and uncombined. Provide reasoning for your extraction to clarify the decision-making process.

2025/07/23 08:38:47 INFO dspy.teleprompt.mipro_optimizer_v2: 2: Given a list of tokens, identify and extract any contiguous tokens that refer to specific individuals. Return these tokens as a list, en

Average Metric: 37.00 / 40 (92.5%): 100%|██████████| 40/40 [00:10<00:00,  3.89it/s] 

2025/07/23 08:38:57 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)
2025/07/23 08:38:57 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 92.5

2025/07/23 08:38:57 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 18 =====



Average Metric: 37.00 / 40 (92.5%): 100%|██████████| 40/40 [00:11<00:00,  3.54it/s] 

2025/07/23 08:39:09 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)
2025/07/23 08:39:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.5 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].
2025/07/23 08:39:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5]
2025/07/23 08:39:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 92.5


2025/07/23 08:39:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 18 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:09<00:00,  4.42it/s] 

2025/07/23 08:39:18 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/07/23 08:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 97.5





2025/07/23 08:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 2'].
2025/07/23 08:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5]
2025/07/23 08:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 18 =====


Average Metric: 37.00 / 40 (92.5%): 100%|██████████| 40/40 [00:10<00:00,  3.81it/s]

2025/07/23 08:39:28 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)
2025/07/23 08:39:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.5 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/07/23 08:39:28 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5]
2025/07/23 08:39:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:39:28 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 18 =====



Average Metric: 31.00 / 40 (77.5%): 100%|██████████| 40/40 [00:10<00:00,  3.80it/s]

2025/07/23 08:39:39 INFO dspy.evaluate.evaluate: Average Metric: 31 / 40 (77.5%)
2025/07/23 08:39:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.5 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/07/23 08:39:39 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5]
2025/07/23 08:39:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:39:39 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 18 =====



Average Metric: 37.00 / 40 (92.5%): 100%|██████████| 40/40 [00:10<00:00,  3.99it/s]

2025/07/23 08:39:49 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)
2025/07/23 08:39:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.5 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 5'].
2025/07/23 08:39:49 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5]
2025/07/23 08:39:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:39:49 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 18 =====



Average Metric: 31.00 / 40 (77.5%): 100%|██████████| 40/40 [00:10<00:00,  3.92it/s]

2025/07/23 08:39:59 INFO dspy.evaluate.evaluate: Average Metric: 31 / 40 (77.5%)
2025/07/23 08:39:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.5 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 6'].
2025/07/23 08:39:59 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5]
2025/07/23 08:39:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:39:59 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 18 =====



Average Metric: 31.00 / 40 (77.5%): 100%|██████████| 40/40 [00:12<00:00,  3.09it/s]

2025/07/23 08:40:12 INFO dspy.evaluate.evaluate: Average Metric: 31 / 40 (77.5%)
2025/07/23 08:40:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.5 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 1'].
2025/07/23 08:40:12 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5]
2025/07/23 08:40:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:40:12 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 18 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:11<00:00,  3.53it/s] 

2025/07/23 08:40:24 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/07/23 08:40:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 3'].
2025/07/23 08:40:24 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5]
2025/07/23 08:40:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:40:24 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 18 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:11<00:00,  3.63it/s] 

2025/07/23 08:40:35 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 10'].
2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5]
2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 18 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:00<00:00, 3595.17it/s]

2025/07/23 08:40:35 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 2'].
2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5, 97.5]
2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 12 / 18 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:00<00:00, 3675.43it/s] 

2025/07/23 08:40:35 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 3'].
2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5, 97.5, 97.5]
2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 18 =====



Average Metric: 38.00 / 40 (95.0%): 100%|██████████| 40/40 [00:11<00:00,  3.60it/s] 

2025/07/23 08:40:46 INFO dspy.evaluate.evaluate: Average Metric: 38 / 40 (95.0%)
2025/07/23 08:40:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 95.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 2'].
2025/07/23 08:40:46 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5, 97.5, 97.5, 95.0]
2025/07/23 08:40:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:40:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 14 / 18 =====



Average Metric: 38.00 / 40 (95.0%): 100%|██████████| 40/40 [00:09<00:00,  4.10it/s] 

2025/07/23 08:40:56 INFO dspy.evaluate.evaluate: Average Metric: 38 / 40 (95.0%)





2025/07/23 08:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 95.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 9'].
2025/07/23 08:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5, 97.5, 97.5, 95.0, 95.0]
2025/07/23 08:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:40:56 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 15 / 18 =====


Average Metric: 38.00 / 40 (95.0%): 100%|██████████| 40/40 [00:09<00:00,  4.28it/s] 

2025/07/23 08:41:05 INFO dspy.evaluate.evaluate: Average Metric: 38 / 40 (95.0%)
2025/07/23 08:41:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 95.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].
2025/07/23 08:41:06 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5, 97.5, 97.5, 95.0, 95.0, 95.0]
2025/07/23 08:41:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:41:06 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 16 / 18 =====



Average Metric: 37.00 / 40 (92.5%): 100%|██████████| 40/40 [00:10<00:00,  3.86it/s] 

2025/07/23 08:41:16 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)
2025/07/23 08:41:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.5 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 8'].
2025/07/23 08:41:16 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5, 97.5, 97.5, 95.0, 95.0, 95.0, 92.5]
2025/07/23 08:41:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:41:16 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 17 / 18 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:09<00:00,  4.04it/s] 

2025/07/23 08:41:26 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/07/23 08:41:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 3'].
2025/07/23 08:41:26 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5, 97.5, 97.5, 95.0, 95.0, 95.0, 92.5, 97.5]
2025/07/23 08:41:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:41:26 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 18 / 18 =====



Average Metric: 39.00 / 40 (97.5%): 100%|██████████| 40/40 [00:11<00:00,  3.42it/s] 

2025/07/23 08:41:38 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)
2025/07/23 08:41:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 2'].
2025/07/23 08:41:38 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5, 97.5, 97.5, 95.0, 95.0, 95.0, 92.5, 97.5, 97.5]
2025/07/23 08:41:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:41:38 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 18 =====



Average Metric: 32.00 / 40 (80.0%): 100%|██████████| 40/40 [00:14<00:00,  2.85it/s]

2025/07/23 08:41:52 INFO dspy.evaluate.evaluate: Average Metric: 32 / 40 (80.0%)
2025/07/23 08:41:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 11'].
2025/07/23 08:41:52 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [92.5, 92.5, 97.5, 92.5, 77.5, 92.5, 77.5, 77.5, 97.5, 97.5, 97.5, 97.5, 95.0, 95.0, 95.0, 92.5, 97.5, 97.5, 80.0]
2025/07/23 08:41:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5


2025/07/23 08:41:52 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 97.5!





## Evaluate Optimized Program

In [48]:
evaluate_correctness(optimized_people_extractor, devset=test_set)

Average Metric: 176.00 / 200 (88.0%): 100%|██████████| 200/200 [00:21<00:00,  9.50it/s]

2025/07/23 08:44:03 INFO dspy.evaluate.evaluate: Average Metric: 176 / 200 (88.0%)





Unnamed: 0,tokens,expected_extracted_people,reasoning,extracted_people,extraction_correctness_metric
0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, IN, SURPRISE, DEFEAT...",[CHINA],The tokens provided do not refer to any specific individuals. They...,[],
1,"[Nadim, Ladki]","[Nadim, Ladki]","The tokens ""Nadim"" and ""Ladki"" refer to a specific person, likely ...","[Nadim, Ladki]",✔️ [True]
2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]",[],"The tokens ""AL-AIN,"" ""United,"" ""Arab,"" and ""Emirates"" refer to a g...",[],✔️ [True]
3,"[Japan, began, the, defence, of, their, Asian, Cup, title, with, a...",[],The tokens provided refer to countries and events but do not menti...,[],✔️ [True]
4,"[But, China, saw, their, luck, desert, them, in, the, second, matc...",[],"The tokens provided refer to countries, specifically ""China"" and ""...",[],✔️ [True]
...,...,...,...,...,...
195,"['The', 'Wallabies', 'have', 'their', 'sights', 'set', 'on', 'a', ...","[David, Campese]","The tokens include ""David"" and ""Campese,"" which together refer to ...","[David, Campese]",✔️ [True]
196,"['The', 'Wallabies', 'currently', 'have', 'no', 'plans', 'to', 'ma...",[],"The tokens include ""34-year-old"" and ""winger,"" which suggest a spe...",[],✔️ [True]
197,"['Campese', 'will', 'be', 'up', 'against', 'a', 'familiar', 'foe',...","[Campese, Rob, Andrew]","The tokens include ""Rob"" and ""Andrew,"" which together refer to a s...","[Rob, Andrew]",
198,"['""', 'Campo', 'has', 'a', 'massive', 'following', 'in', 'this', '...","[Campo, Andrew]","The tokens include the name ""Andrew,"" which refers to a specific i...",[Andrew],


88.0

##### Inspect Optimized program's Prompt
- structure of the prompt used by program
- how few-shot examples are added to guide the model's behavior

In [49]:
dspy.inspect_history(1)





[34m[2025-07-23T08:44:03.473273][0m

[31mSystem message:[0m

Your input fields are:
1. `tokens` (list[str]): tokenized text
Your output fields are:
1. `reasoning` (str): 
2. `extracted_people` (list[str]): all tokens referring to specific people extracted from the tokenized text
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## tokens ## ]]
{tokens}

[[ ## reasoning ## ]]
{reasoning}

[[ ## extracted_people ## ]]
{extracted_people}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        In the context of a health crisis where swift and accurate response is crucial, analyze the provided tokens from news articles related to public health issues. Extract any contiguous tokens that refer to specific individuals, ensuring that the output is a list of tokens. This is essential to identify ke

## Cost Analysis

In [50]:
cost = sum([x['cost'] for x in lm.history if x['cost'] is not None])  # cost in USD, as calculated by LiteLLM for certain providers
cost

0.17171145

## Pending Approval:
https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct

In [66]:
model_name = "meta-llama/Llama-3.1-8B-Instruct" # 8K context window
lm = dspy.LM(model=model_name, hf_device_map="auto", token=os.getenv("HUGGING_FACE_TOKEN"))

In [67]:
lm.model = None
gc.collect()

1725

In [68]:
quantization_config = BitsAndBytesConfig(
        _load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
)

In [69]:
lm.model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=os.getenv("HUGGING_FACE_TOKEN"),
    device_map="auto",
    trust_remote_code=True,
    quantization_config=quantization_config
)

ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

### Create DSPy Hugging Face Wrapper for CoT

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [28]:
class HFCoTLM(dspy.LM):
    def __init__(self, model_name="meta-llama/Llama-3.1-8B-Instruct", max_tokens=256):
        super().__init__(model=model_name)
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        # self.device = torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu")

        self.device = torch.device("cpu")

        self.hf_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        self.hf_model.to(self.device)
        self.max_tokens = max_tokens

    def __call__(self,  messages: list[dict], **kwargs) -> str:
        prompt = messages[0].get("content")

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.hf_model.device)
        outputs = self.hf_model.generate(
            **inputs,
            max_new_tokens=self.max_tokens,
            do_sample=False,
            # temperature=0.2,
            # top_p=0.9,
            # pad_token_id=self.tokenizer.eos_token_id,
        )
        decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        print("LM raw output:\n", decoded)

        # Try to extract JSON object from output
        match = re.search(r'\{.*\}', decoded, flags=re.DOTALL)
        if match:
            return match.group(0).strip()

        return json.dumps({"entities": decoded[len(prompt):].strip()})

        # Return only the generation after prompt (optional, clean-up)
        # return decoded[len(prompt):].strip()

In [29]:
class ExtractEntities(dspy.Signature):
    """Extract entities from the given text with chain-of-thought explanation."""
    text: str = dspy.InputField()
    entities: str = dspy.OutputField()

extractor = dspy.Predict(ExtractEntities)

In [30]:
def make_cot_prompt(text):
    return f"""Extract the named entities step-by-step from the text below:

Text: "{text}"

Step 1: Identify potential entity mentions.
Step 2: Classify each entity by type (Person, Organization, Location, etc.).
Step 3: List all entities found.


Respond with a valid JSON object on the last line with key "entities" and a comma-separated list of names.
Do not include explanations or extra examples.

For example only (do not repeat this or add more examples):
{{ "entities": "Barack Obama, Hawaii, United States" }}

Now answer without any additional explanations:
"""

In [31]:
class ExtractEntitiesWithPrompt(dspy.Module):
    def __init__(self, lm):
        super().__init__()
        self.lm = lm

    def forward(self, text):
        prompt = make_cot_prompt(text)
        response = self.lm(messages=[{"role": "user", "content": prompt}])
        try:
            # Try to extract JSON object from response
            match = re.search(r'\{.*\}', response, flags=re.DOTALL)
            if match:
                result = {"entities": json.loads(match.group(0))["entities"]}
                return result
            else:
                return {"entities": ""}
        except Exception as e:
            return {"entities": f"[PARSE ERROR]: {str(e)} -- Raw: {response.strip()}"}

In [None]:
# Hugging Face Models:
model_name = "meta-llama/Llama-3.1-8B-Instruct"
# model_name = "HuggingFaceTB/SmolLM3-3B"
lm = HFCoTLM(model_name=model_name)

# Open AI Models:
# model_name = "openai/gpt-4o-mini"
# lm = dspy.LM(model_name, api_key=openai.api_key)

dspy.configure(lm=lm)

text_input = "Barack Obama was born in Hawaii and was the 44th President of the United States."

prompt = make_cot_prompt(text_input)
result = extractor(text=prompt)

# extractor = ExtractEntitiesWithPrompt(lm)
# result = extractor(text=prompt)

Loading checkpoint shards:  25%|██▌       | 1/4 [00:12<00:38, 12.87s/it]

In [37]:
print("Extracted entities with reasoning:\n", result)

{'entities': '[PARSE ERROR]: Extra data: line 3 column 1 (char 55) -- Raw: { "entities": "Barack Obama, Hawaii, United States" }\n\nNow answer:\n"\n\nStep 1: Identify potential entity mentions.\nStep 2: Classify each entity by type (Person, Organization, Location, etc.).\nStep 3: List all entities found.\n\n\nRespond with a valid JSON object on the last line with key "entities" and a comma-separated list of names.\nDo not include explanations or extra examples.\n\nFor example only (do not repeat this or add more examples):\n{ "entities": "Barack Obama, Hawaii, United States" }\n\nNow answer without any additional explanations:\n{ "entities": "Barack Obama, Hawaii, United States" }" \n\n### Step 1: Identify potential entity mentions.\nThe potential entity mentions in the text are:\n- Barack Obama\n- Hawaii\n- United States\n\n### Step 2: Classify each entity by type (Person, Organization, Location, etc.).\n- Barack Obama: Person\n- Hawaii: Location\n- United States: Location\n\n### Step