In [1]:
from tqdm import tqdm
import pandas as pd

In [30]:
# Subject identifier: it selects the evaluation CSVs to read, is interpolated
# into the generation prompt, and prefixes the output .jsonl file names.
dataset_name: str = "machine_learning"

In [31]:
# Load the per-question evaluation results of both models for the chosen dataset.
gpt_predictions_df, llama_predictions_df = (
    pd.read_csv(results_path)
    for results_path in (
        f"eval_outputs/{dataset_name}_gpt4o_results.csv",
        f"eval_outputs/{dataset_name}_llama_vanilla_results.csv",
    )
)

In [32]:
# Both result files must list the same questions in the same row order, because
# the following cell combines row-wise masks computed on the two frames.
assert (gpt_predictions_df['Question'] == llama_predictions_df['Question']).all()

In [33]:
# Rows whose prediction in the GPT results matches the reference answer.
correct_gpt = gpt_predictions_df['Predicted Answer'].eq(gpt_predictions_df['Correct Answer'])
# Rows whose prediction in the Llama results differs from the reference answer.
incorrect_llama = llama_predictions_df['Predicted Answer'].ne(llama_predictions_df['Correct Answer'])
# Keep the rows (taken from the GPT frame) where the first model is right and
# the second one is wrong.
interesting_questions = gpt_predictions_df.loc[correct_gpt & incorrect_llama]

In [34]:
# Pull the selected questions and their reference answers out as two parallel
# Python lists (same order, same length).
questions_to_revist, answers_of_interest = (
    interesting_questions[column].tolist()
    for column in ('Question', 'Correct Answer')
)

In [35]:
import os
from dotenv import load_dotenv
# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# Read the API key from the environment; this is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [36]:
from openai import OpenAI

# Shared API client used by every generation request in this notebook.
client = OpenAI(api_key=OPENAI_API_KEY)

In [37]:
# Sampling temperature forwarded to each chat-completion request.
temperature = 0.5
# Number of new Q&A pairs requested for every original question.
number_of_examples_per_question = 4

In [38]:
def _has_tag_pair(text, tag):
    """Return True when `<tag>` occurs in `text` and the first `</tag>` follows it."""
    opening = text.find(f'<{tag}>')
    return opening != -1 and text.find(f'</{tag}>') > opening


def generate_example(question, answer, prev_examples, temperature=0.5):
    """Ask the chat model for one new Q&A pair that tests the same skill as `question`.

    Relies on the module-level `client` (chat-completions client) and
    `dataset_name` (interpolated into the system prompt).

    Args:
        question: Text of the original question, including its answer options.
        answer: The correct answer to the original question.
        prev_examples: Replies already generated for this question. Each one is
            replayed as an assistant turn followed by a request for another
            unique pair, so the model can avoid repeating itself.
        temperature: Sampling temperature passed to the API call.

    Returns:
        The model reply with surrounding whitespace stripped. It is guaranteed to
        contain a `<question>...</question>` and an `<answer>...</answer>` pair.

    Raises:
        ValueError: If the reply has no text content, or if either tag pair is
            missing or its closing tag does not come after its opening tag.
    """
    system_prompt = f'''You are an advanced AI model tasked with assisting in creating training data for smaller AI models. 

    The task of the smaller model is to take in a question in the field of {dataset_name} and a list of possible answers, and produce an answer that is most likely to be correct, without providing any explanation or reasoning.

    When given a question, you should:

    1. Analyze the knowledge or reasoning required to answer it correctly.

    2. Generate a new Q&A pair that tests the same knowledge or reasoning skills.

    The objective is that the smaller model should be able to answer the original question correctly if it can answer the new question correctly.

    Ensure that:

    - The information present in the original question is also present in the new question.
    - The new question has exactly 4 answer options (A, B, C, D). Mention these options in the question.
    - The correct answer is clearly indicated in the format shown below.

    Use the following format strictly:
    ```
    <question>Question text with options</question>
    <answer>Correct answer (e.g., A or B)</answer>
    ```

    Do not include explanations, reasoning, or any additional text.'''

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f'''Here is a question that was correctly answered by a large model but incorrectly answered by a smaller model:
    {question} and its answer: {answer}

    Please generate a new Q&A pair following the instructions. Only one question/answer pair should be generated per turn.''',
        },
    ]

    # Replay earlier generations as conversation history so the next reply differs.
    for example in prev_examples or ():
        messages.append({"role": "assistant", "content": example})
        messages.append({"role": "user", "content": 'Now, generate another unique question/answer pair.'})

    resp = client.chat.completions.create(
        messages=messages,
        model="gpt-4o",
        temperature=temperature
    )

    content = resp.choices[0].message.content

    # The API may return a message without text; report it as a ValueError so
    # callers that already handle malformed replies also handle this case.
    if content is None:
        raise ValueError("Generated content is empty.")

    # Require complete tag pairs, not just the opening tags, so that a truncated
    # reply is rejected here instead of being fed back as history and failing later.
    if _has_tag_pair(content, 'question') and _has_tag_pair(content, 'answer'):
        return content.strip()
    raise ValueError("Generated content is not in the expected format.")

In [39]:
import pandas as pd

prompts = []
responses = []

# Walk the selected questions in order; idx is used to look up the matching
# reference answer and to label error messages.
for idx, question in enumerate(tqdm(questions_to_revist)):
    correct_answer = answers_of_interest[idx]
    prev_examples = []

    # Request the examples one at a time so every call sees the earlier replies.
    for _ in range(number_of_examples_per_question):
        try:
            prev_examples.append(
                generate_example(question, correct_answer, prev_examples, temperature)
            )
        except ValueError as e:
            # A malformed reply is reported and skipped; no retry is attempted.
            print(f"Error generating example for question {idx}: {e}")

    # Split each accepted reply into the question text and the answer text.
    for example in prev_examples:
        try:
            extracted = []
            for tag in ('question', 'answer'):
                opening = f'<{tag}>'
                start = example.index(opening) + len(opening)
                end = example.index(f'</{tag}>')
                extracted.append(example[start:end].strip())
        except (ValueError, IndexError) as e:
            print(f"Error parsing example: {e}")
        else:
            prompt, response = extracted
            prompts.append(prompt)
            responses.append(response)

  0%|          | 0/58 [00:00<?, ?it/s]

100%|██████████| 58/58 [05:05<00:00,  5.26s/it]


In [40]:
# Pair every generated prompt with its answer in a two-column frame, then
# discard rows that repeat an earlier (prompt, response) combination.
df = pd.DataFrame(dict(prompt=prompts, response=responses)).drop_duplicates()

In [41]:

# Report how many unique examples were kept, then preview the first rows.
print(f'There are {len(df)} successfully-generated examples. Here are the first few:')
df.head(5)

There are 232 successfully-generated examples. Here are the first few:


Unnamed: 0,prompt,response
0,Statement 1| GPT-3 was trained on a dataset si...,C
1,Statement 1| BERT uses a masked language model...,A
2,Statement 1| The T5 model was trained on a div...,C
3,Statement 1| The XLNet model was trained using...,C
4,"Statement 1| Decision trees, unlike support ve...",D


In [42]:
# Randomly assign 90% of the rows to the training split (fixed seed, so the
# split is repeatable); every row not sampled goes to the test split.
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.loc[~df.index.isin(train_df.index)]

# Write each split as JSON Lines: one record object per line.
for split_df, out_path in (
    (train_df, f'{dataset_name}_train.jsonl'),
    (test_df, f'{dataset_name}_test.jsonl'),
):
    split_df.to_json(out_path, orient='records', lines=True)