In [2]:
from dspy.teleprompt import BootstrapFewShot
from datasets import load_dataset
import json

## Classical training setup: dataset with holes


In this classical setup we start from a structured raw dataset and generate a training dataset by removing parts of the structured data and training the model to output the missing parts.
Because the data is structured, we can aim for a single part of the data to be missing at a time.

In [8]:
structured_data_file = "../data/structured_data.json"
# Open the file in a context manager so the handle is closed as soon as parsing
# finishes, and read it as UTF-8 (the standard JSON encoding) rather than the
# platform default encoding.
with open(structured_data_file, encoding="utf-8") as guidelines_file:
    structured_data = json.load(guidelines_file)["Guidelines"]
structured_data

[{'Disease Name': 'African Tick Bite Fever',
  'Symptoms': [{'Symptom Name': 'Eschar',
    'Symptom Description': 'A red sore that develops a dark crust at the site of tick bite. Multiple eschars may be present if there are multiple tick bites.',
    'Disease stage': 'Early stage'},
   {'Symptom Name': 'Fever',
    'Symptom Description': 'Elevated body temperature, often accompanied by chills and sweating.',
    'Disease stage': 'Later stage'},
   {'Symptom Name': 'Headache',
    'Symptom Description': 'A continuous or throbbing pain in the head.',
    'Disease stage': 'Later stage'},
   {'Symptom Name': 'Muscle Soreness',
    'Symptom Description': 'Aching or discomfort in muscles, often due to inflammation or injury.',
    'Disease stage': 'Later stage'},
   {'Symptom Name': 'Swollen Lymph Nodes',
    'Symptom Description': 'Enlarged and tender lymph nodes, usually indicating an infection or immune response.',
    'Disease stage': 'Later stage'},
   {'Symptom Name': 'Rash',
    'Symp

We need to choose a random part of the data to be missing at a time. We can do this by choosing a random index and then removing the corresponding part of the data.

In [9]:
# We need to choose a random part of the data to be missing at a time. We can do
# this by choosing a random index and then removing the corresponding part of the data.
import copy
import random
random.seed(42)  # seed the global `random` generator so the random choices below are repeatable on a fresh run

def blank_out_value(json_obj, target_key, mask_token="[MASK]"):
    """
    Blank out the value stored under one randomly chosen occurrence of target_key.

    The input is deep-copied once and searched at every nesting level (dicts
    and lists) for dictionaries that contain target_key. One of those
    dictionaries is picked with the module-level ``random`` generator and the
    value it holds under target_key is replaced by mask_token. The original
    json_obj is never modified.

    :param json_obj: The JSON object (nested dicts / lists / scalars) to modify
    :param target_key: The dictionary key whose value should be blanked out
    :param mask_token: The token that replaces the value stored under target_key
    :return: a modified copy of the JSON object with exactly one value masked
    :raises KeyError: if target_key does not occur anywhere in json_obj
        (this includes empty containers and non-container inputs)
    """
    # Copy once up front; every edit below happens on the copy only.
    modified_obj = copy.deepcopy(json_obj)

    # Walk the whole structure and remember every dict that owns target_key.
    # The traversal order depends only on the input, so a seeded `random`
    # produces the same choice on every run.
    candidates = []
    pending = [modified_obj]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            if target_key in node:
                candidates.append(node)
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
        # Scalars (str, int, None, ...) cannot hold keys and are skipped.

    if not candidates:
        raise KeyError(f"The key '{target_key}' does not exist in the JSON object.")

    # Mask a single occurrence so only one part of the data is missing at a time.
    random.choice(candidates)[target_key] = mask_token
    return modified_obj