### mock example for extraction pipeline

In [1]:
import os
import tempfile 
from datasets import load_dataset
from typing import List, Dict, Any
import dspy

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def load_dataset() -> dict:
    """
    Loads the 'conll2003' dataset through the `datasets` library.

    Returns:
        dict: Whatever `datasets.load_dataset('conll2003')` returns.
    """
    # Imported locally under an alias: this function shares its name with
    # `datasets.load_dataset` and rebinds that module-level name, so an
    # unqualified call would invoke this zero-argument function itself and
    # raise TypeError instead of loading anything.
    from datasets import load_dataset as hf_load_dataset

    with tempfile.TemporaryDirectory() as temp_dir:
        # The temporary directory is deleted when this block exits, while the
        # environment variable stays set to its (then missing) path.
        # NOTE(review): `datasets` may read HF_DATASETS_CACHE only at import
        # time -- confirm that the cache is actually redirected here.
        os.environ["HF_DATASETS_CACHE"] = temp_dir
        return hf_load_dataset('conll2003')
        

In [6]:
def extract_people_entitites(data_row: Dict[str, Any]) -> List:
    """
    Collects the tokens of one dataset row that are tagged as people.

    Args:
        data_row (Dict[str, Any]): A dataset row holding parallel "tokens" and "ner_tags" sequences.

    Returns:
        List: The tokens whose NER tag is a person tag, in their original order.
    """
    person_tags = (1, 2)  # CoNLL entity codes 1 and 2 refer to people
    people = []
    # zip stops at the shorter sequence, so unmatched trailing items are ignored.
    for word, tag in zip(data_row["tokens"], data_row["ner_tags"]):
        if tag not in person_tags:
            continue
        people.append(word)
    return people
   

In [7]:
def prepare_dataset(data_split, start: int, end: int) -> List[dspy.Example]:
    """
    Builds DSPy examples from the rows of a dataset split in a given index range.

    Args:
        data_split: The dataset split (e.g., train or test); must provide `select`.
        start (int): First row index of the range.
        end (int): Index at which the range stops (passed to `range`, so exclusive).

    Returns:
        List[dspy.Example]: One example per selected row, carrying the row's tokens
        and the people extracted from it, with "tokens" marked as the input field.
    """
    examples = []
    selected_rows = data_split.select(range(start, end))
    for row in selected_rows:
        example = dspy.Example(
            tokens=row["tokens"],
            expected_extracted_people=extract_people_entitites(row),
        )
        examples.append(example.with_inputs("tokens"))
    return examples
