This script adds more fields to the HotPotQA dataset and generate more data,
for statement & verdict training:
  - choose a random direction: True or False
  - generate statement based on the direction and the QA
  - generate some new statements that has no coresponding context in the retriever as "irrelevant"
  - save to file

TODO:
  - quality control automatically
  - more diverse irrelevant statements

In [1]:
import os

select_dataset_total = 20000  # how many to select from each portion of the dataset
concurrency = 32  # LLM calling
irrelevant_percent = 0.3  # how many irrelevant statements to created based on the number of total True & False

OPENAI_BASE_URL = "http://lb:8010/v1/"
os.environ["OPENAI_API_KEY"] = "aaaa"

os.environ['DSP_CACHEBOOL'] = "False"
os.environ["DSP_NOTEBOOK_CACHEDIR"] = '/cache'

In [2]:
import dspy

llm = dspy.OpenAI(model="mistralai/Mistral-Nemo-Instruct-2407", api_base=OPENAI_BASE_URL, max_tokens=200, model_type="chat") # temperature=1
dspy.settings.configure(lm=llm)

In [3]:
llm("who are you")

["I am a text-based AI model. I'm here to provide information, answer questions to the best of my ability, and engage in conversation on a wide range of topics. I don't have personal experiences, feelings, or a physical presence, but I'm designed to simulate a human-like interaction. How can I assist you today?"]

In [4]:
import random
from datasets import load_dataset
from dspy.datasets.dataset import Dataset

class HotPotSV():
    """
    HotSpotQA to statements + verdicts
    """
    def __init__(
        self,
        train_size = 50,
        validation_size = 50,
        keep_details = False,
    ):

        hf_official_train = load_dataset("hotpot_qa", 'fullwiki', split='train', trust_remote_code=True)
        hf_official_validation = load_dataset("hotpot_qa", 'fullwiki', split='validation', trust_remote_code=True)
        # `test` split has no answer

        self.train = self.process_dataset(hf_official_train, train_size)
        self.validation = self.process_dataset(hf_official_validation, validation_size)

    def process_dataset(self, dataset, size, keep_details = False):
        rep = []
        for raw_example in dataset:
            if keep_details is True:
                keys = ['id', 'question', 'answer', 'type', 'supporting_facts', 'context', 'level']
            elif keep_details == 'validation_titles':
                keys = ['question', 'answer', 'supporting_facts', 'level']
            else:
                keys = ['question', 'answer', 'level']

            example = {k: raw_example[k] for k in keys}
            
            if 'supporting_facts' in example:
                example['gold_titles'] = set(example['supporting_facts']['title'])
                del example['supporting_facts']

            rep.append(example)

        rng = random.Random(0)
        rng.shuffle(rep)
        rep = rep[:size]
        return rep

In [5]:
dataset_HotPotSV = HotPotSV(train_size=select_dataset_total, validation_size=select_dataset_total)

In [6]:
class ChangeAnswerSignature(dspy.Signature):
    """Generate a new answer based on the question and correct answer"""
    question = dspy.InputField()
    answer = dspy.InputField(desc="correct answer")
    output = dspy.OutputField(desc="incorrect answer")
    
class CombineQA(dspy.Signature):
    """Combine the question and answer into one statement of direct expression with context"""
    question = dspy.InputField()
    answer = dspy.InputField()
    statement = dspy.OutputField(desc="generated from the question and answer only, with all the original info")
    
class GenerateStatement(dspy.Module):
    def __init__(self):
        super().__init__()
        self.change_answer = dspy.Predict(ChangeAnswerSignature)
        self.combine_qa = dspy.Predict(CombineQA)

    def forward(self, question, answer, level=None):
        direction = random.choice(["True", "False"])
        # direction = 'False'
        answer_fake = None
        if direction == "False":
            new_answer = self.change_answer(question=question, answer=answer).output
            answer_fake = new_answer
            # print(f"new_answer: {new_answer}")
        else:
            new_answer = answer
        statement = self.combine_qa(question=question, answer=new_answer).statement
        # statement = self.generate_statement(question=question, answer=new_answer).statement

        rep = {
            'question': question,
            'answer': answer,
            'verdict': direction,
            'statement': statement,
            'level': level,
        }
        
        if answer_fake:
            rep['answer_fake'] = answer_fake
        # print(f"with statement: {rep}\n")
        return rep

In [7]:
import concurrent.futures

def generate_datasets():
    def generate_statement(d):
        return GenerateStatement()(**d)
        
    train = []
    validation = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        train_futures = [executor.submit(generate_statement, d) for d in dataset_HotPotSV.train]
        validation_futures = [executor.submit(generate_statement, d) for d in dataset_HotPotSV.validation]

        # Collecting results as they complete
        train = [future.result() for future in concurrent.futures.as_completed(train_futures)]
        validation = [future.result() for future in concurrent.futures.as_completed(validation_futures)]

    return train, validation

dataset_train_HotPotQA_generated, dataset_validation_HotPotQA_generated = generate_datasets()

print(dataset_train_HotPotQA_generated[:10])
print("\n")
print(dataset_validation_HotPotQA_generated[:10])

[{'question': 'Which cartridge did John Browning design that has a rim that is the same in diameter as the .50 GI?', 'answer': '.45 ACP', 'verdict': 'True', 'statement': 'John Browning designed the .45 ACP cartridge, which has a rim that is the same in diameter as the .50 GI.', 'level': 'hard'}, {'question': 'Who besides Marceau founded Ballroom Theater’s Dancing Classrooms program that was featured in a 2005 American documentary?', 'answer': 'Dulaine', 'verdict': 'False', 'statement': 'Dulaine co-founded Ballroom Theater’s Dancing Classrooms program, which was featured in a 2005 American documentary.', 'level': 'easy', 'answer_fake': 'Dulaine'}, {'question': 'Invasion Attack included title matches by the Japanese sumo wrestler from what Hawaiian city?', 'answer': 'Waimānalo', 'verdict': 'False', 'statement': 'The Honolulu-born Japanese sumo wrestler, Invasion Attack, won title matches.', 'level': 'medium', 'answer_fake': 'Honolulu'}, {'question': 'What is the premier date of this Sout

In [8]:
"""Generate `irrelevant` datasets."""

from dsp.utils import deduplicate
import re

class GenerateLatestStatementSignature(dspy.Signature):
    """Generate a simple fiction with content similar to the input and happends after year 2040"""
    input = dspy.InputField()
    output = dspy.OutputField(desc="similar to the input, happends after year 2040, in one sentense")

import concurrent.futures

def generate_latest_statements():
    """Generate statements that don't have related context in retriever."""

    def contains_number_in_range(s):
        """
        The retriever has data up to year 2017.
        Make sure the statement contains year number higher than 2020.
        """
        numbers = re.findall(r'\d+', s)
        
        # Check if any of these numbers are in the range 2021 to 2029 (inclusive)
        for number in numbers:
            num = int(number)
            if 2040 < num:
                return True
        
        return False
        
    def to_json(statements):
        return [{'statement': s, 'verdict': 'Irrelevant'} for s in statements]

    def process_data(data, count):
        processed = []
        
        def process_single(d):
            return dspy.Predict(GenerateLatestStatementSignature)(input=d['statement']).output

        with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
            futures = [executor.submit(process_single, d) for d in data]
            for future in concurrent.futures.as_completed(futures):
                s_new = future.result()
                if contains_number_in_range(s_new):
                    processed = deduplicate(processed + [s_new])
                if len(processed) >= count:
                    break
        
        return to_json(processed)

    count_train = len(dataset_train_HotPotQA_generated) * irrelevant_percent
    train = process_data(dataset_train_HotPotQA_generated, count_train)

    count_validation = len(dataset_validation_HotPotQA_generated) * irrelevant_percent
    validation = process_data(dataset_validation_HotPotQA_generated, count_validation)

    return train, validation

_irrelevant_train, _irrelevant_validation = generate_latest_statements()

from pprint import pprint
pprint(_irrelevant_train[:30])
print("\n")
pprint(_irrelevant_validation[:30])

[{'statement': 'In 2045, Robert Tree Cody, now a renowned environmental '
               'activist, launched a global campaign, "Keep Earth Green," '
               "inspired by his adopted father's iconic PSA.",
  'verdict': 'Irrelevant'},
 {'statement': 'In 2045, the Sternbergia and Echinopsis genuses of plants, now '
               'genetically modified, coexist harmoniously in the same '
               'bioluminescent gardens.',
  'verdict': 'Irrelevant'},
 {'statement': "In 2045, Walter W. Arndt's great-granddaughter, Lily, "
               'translated many poems by the French-German poet and novelist, '
               'using an advanced AI-assisted tool.',
  'verdict': 'Irrelevant'},
 {'statement': "In 2045, AI-generated works, modeled after Charles Dickens' "
               'style, continue to captivate readers, with "A Neural Carol" '
               'being a standout.',
  'verdict': 'Irrelevant'},
 {'statement': '"In 2050, \'The Stumble 2.0\' was composed by a guitarist born '


In [9]:
import random

dataset_train_HotPotQA_generated += _irrelevant_train
dataset_validation_HotPotQA_generated += _irrelevant_validation

rng = random.Random(1)
rng.shuffle(dataset_train_HotPotQA_generated)
rng.shuffle(dataset_validation_HotPotQA_generated)

print(len(dataset_train_HotPotQA_generated))
print(len(dataset_validation_HotPotQA_generated))

26000
9627


In [10]:
"""Save datasets to file"""

import json
import os

base = "./datasets/HotPotQA"
data_files = {
    "train.json": dataset_train_HotPotQA_generated,
    "validation.json": dataset_validation_HotPotQA_generated,
}

os.makedirs(base, exist_ok=True)

for filename, data in data_files.items():
    file_path = os.path.join(base, filename)
    with open(file_path, 'w') as file:
        json.dump(data, file)