In [None]:
import os
import random

from openai import OpenAI
from dotenv import load_dotenv
import pandas as pd

In [None]:
# Load settings from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the API key that OpenAI() needs — confirm.
load_dotenv()

In [None]:
# Module-level OpenAI client; gpt_35_perturb_question() sends its chat requests through it.
# No credentials are passed explicitly — presumably they come from the environment; confirm.
client = OpenAI()

In [None]:
# Directory holding the MMLU test-set questions: one CSV per subject, read
# without a header row.
data_dir = "mmlu/test"

# Fixed seed so the 2% sample (and every file derived from it) is reproducible.
sample_seed = 0

df_list = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined frame (and the question_id assigned from it) stable across runs.
for filename in sorted(os.listdir(data_dir)):

    # Skip anything that is not a subject CSV (e.g. .DS_Store or
    # .ipynb_checkpoints), which would otherwise break pd.read_csv.
    if not filename.endswith(".csv"):
        continue

    # The subject name is the file name without its ".csv" extension.
    subject = os.path.splitext(filename)[0]
    df = pd.read_csv(os.path.join(data_dir, filename), header=None)

    # Draw 2% of this subject's rows; .copy() detaches the sample from df so
    # the column edits below act on an independent frame.
    df_sample = df.sample(frac=0.02, random_state=sample_seed).copy()

    # Name the six positional columns of the headerless file.
    df_sample.columns = ["question", "A", "B", "C", "D", "answer"]

    df_sample["subject"] = subject
    df_list.append(df_sample)

In [None]:
# Stack the per-subject samples row-wise and tag each row with a running
# integer id (0 .. n-1) in a new "question_id" column.
mmlu_df = pd.concat(df_list).assign(
    question_id=lambda frame: range(len(frame))
)

# Persist the combined sample; the (non-unique) row index is not written.
mmlu_df.to_csv("mmlu_test_2pct_sample.csv", index=False)

In [None]:
def gpt_35_perturb_question(question):
    """Ask gpt-3.5-turbo to reword a question and return the model's reply.

    Sends a single user message ("Reword the following question: ...") through
    the module-level ``client`` and returns the message content of the first
    choice in the response.

    Args:
        question: The question text to reword; it is interpolated into the
            prompt as-is.

    Returns:
        The ``content`` of the first returned choice's message.
    """
    user_message = {
        "role": "user",
        "content": f"Reword the following question: {question}",
    }
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[user_message],
    )

    first_choice = response.choices[0]
    return first_choice.message.content

In [None]:
def add_random_char(s):
    """Return ``s`` with one randomly chosen special character inserted.

    The character is drawn from ``@ # * { } ` \\`` and placed at a uniformly
    random position between 0 and ``len(s)`` inclusive, so it may land before
    the first character or after the last one.

    Args:
        s: The string to perturb.

    Returns:
        A new string exactly one character longer than ``s``.
    """
    # Draw the character first and the position second, so the global RNG is
    # consumed in a fixed order.
    noise_char = random.choice(("@", "#", "*", "{", "}", "`", "\\"))
    insert_at = random.randint(0, len(s))

    head, tail = s[:insert_at], s[insert_at:]
    return "".join((head, noise_char, tail))

In [None]:
# Variant of the sample in which every question has one random special
# character inserted by add_random_char; all other columns are unchanged.

# Seed the global RNG used by add_random_char so the perturbed file is
# reproducible from one run to the next.
random.seed(0)

simple_perturbed_df = mmlu_df.copy()
simple_perturbed_df["question"] = simple_perturbed_df["question"].apply(add_random_char)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("final_data", exist_ok=True)
simple_perturbed_df.to_csv("final_data/mmlu_200_singlechar_perturbed.csv", index=False)

In [None]:
# Variant of the sample in which every question is reworded by
# gpt_35_perturb_question; all other columns are unchanged.

# Create the output folder up front: DataFrame.to_csv does not create missing
# parent directories, and failing at the write step would throw away the
# results of every API call made below.
os.makedirs("final_data", exist_ok=True)

gpt_35_perturbed_df = mmlu_df.copy()
# .apply calls the rewording function once per row, i.e. one API request per question.
gpt_35_perturbed_df["question"] = gpt_35_perturbed_df["question"].apply(gpt_35_perturb_question)
gpt_35_perturbed_df.to_csv("final_data/mmlu_200_gpt_perturbed.csv", index=False)