#### Run this notebook from the main project directory (the input and output paths below are relative to it)

In [1]:
import csv
import json
import os
import random

In [4]:
# Function to parse the CSV input
def parse_csv(csv_file_path):
    """Parse a multiple-choice question CSV into a list of question dicts.

    The first row is treated as a header and skipped. Every following row is
    expected to hold, in order: the question text, the four answer options
    (A, B, C, D) and the correct answer. Surrounding whitespace is stripped
    from every field that is read.

    Args:
        csv_file_path: Path to the comma-separated, UTF-8 encoded input file.

    Returns:
        A list of dicts with the keys "question", "answers" (a dict keyed
        "A".."D"), "correct_answer" and "subject". "subject" is the file's
        base name without its extension. An empty or header-only file yields
        an empty list.

    Raises:
        IndexError: If a non-blank row has fewer than six columns.
    """
    # Derive the subject once; splitext is correct for any extension length,
    # unlike slicing off a fixed four characters.
    subject = os.path.splitext(os.path.basename(csv_file_path))[0]
    questions = []

    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None)  # Skip the header row (tolerates an empty file)

        for row in reader:
            # csv.reader yields [] for blank lines; skip rows with no content
            # instead of failing on them.
            if not any(cell.strip() for cell in row):
                continue
            if len(row) < 6:
                raise IndexError(
                    f"{csv_file_path}: row ending at line {reader.line_num} "
                    f"has {len(row)} columns, expected at least 6"
                )
            questions.append({
                "question": row[0].strip(),  # The question text
                "answers": {
                    "A": row[1].strip(),
                    "B": row[2].strip(),
                    "C": row[3].strip(),
                    "D": row[4].strip()
                },
                "correct_answer": row[5].strip(),  # The correct answer
                "subject": subject  # The subject of the question
            })

    return questions

# Function to save parsed data to a JSON file
def save_to_json(parsed_questions, output_file):
    """Write the given data to ``output_file`` as indented UTF-8 JSON.

    The parent directory of ``output_file`` is created if it does not exist,
    so writing into a fresh output folder does not fail. Non-ASCII characters
    are written as-is rather than escaped. A confirmation line is printed once
    the file has been written.

    Args:
        parsed_questions: JSON-serialisable data to write.
        output_file: Destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_file)
    if parent_dir:  # Empty when output_file has no directory component
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, mode='w', encoding='utf-8') as file:
        json.dump(parsed_questions, file, ensure_ascii=False, indent=4)
    print(f"Output saved to {output_file}")

def sample_and_save(csv_file, output_file, num_samples, seed=None):
    """Randomly sample questions from a CSV file and save them as JSON.

    Questions are parsed from ``csv_file``. If it holds fewer than
    ``num_samples`` questions, the pool is extended with the questions from
    the path obtained by replacing every "test" in ``csv_file`` with "val".
    ``num_samples`` questions are then drawn without replacement and written
    to ``output_file``.

    Args:
        csv_file: Path to the question CSV.
        output_file: Path of the JSON file to write.
        num_samples: Number of questions to draw.
        seed: Optional seed for a reproducible sample. When None (the
            default) the module-level ``random`` generator is used.

    Raises:
        ValueError: If fewer than ``num_samples`` questions are available,
            even after the "val" fallback.
    """
    questions = parse_csv(csv_file)
    if num_samples > len(questions):
        val_file = csv_file.replace("test", "val")
        # When the path contains no "test", replace() returns it unchanged;
        # parsing the same file again would only add duplicate questions.
        if val_file != csv_file:
            questions.extend(parse_csv(val_file))

    if num_samples > len(questions):
        raise ValueError(
            f"Requested {num_samples} samples but only {len(questions)} "
            f"questions are available for {csv_file}"
        )

    # A dedicated generator keeps a seeded draw independent of global state.
    sampler = random if seed is None else random.Random(seed)
    sampled_questions = sampler.sample(questions, num_samples)
    save_to_json(sampled_questions, output_file)


In [5]:
# General test set: 500 questions sampled from the miscellaneous test CSV.
sample_and_save(
    csv_file="mmmlu/data/test/miscellaneous_test.csv",
    output_file="test/general_test.json",
    num_samples=500,
)

Output saved to test/general_test.json


In [6]:
# Subject comparison: 100 questions sampled from the high-school computer science test CSV.
sample_and_save(
    csv_file="mmmlu/data/test/high_school_computer_science_test.csv",
    output_file="test/subjects_compare/cs_test.json",
    num_samples=100,
)

Output saved to test/subjects_compare/cs_test.json


In [7]:
# Subject comparison: 100 questions sampled from the high-school world history test CSV.
sample_and_save(
    csv_file="mmmlu/data/test/high_school_world_history_test.csv",
    output_file="test/subjects_compare/history_test.json",
    num_samples=100,
)

Output saved to test/subjects_compare/history_test.json


In [8]:
# Subject comparison: 100 questions sampled from the high-school biology test CSV.
sample_and_save(
    csv_file="mmmlu/data/test/high_school_biology_test.csv",
    output_file="test/subjects_compare/biology_test.json",
    num_samples=100,
)

Output saved to test/subjects_compare/biology_test.json


In [9]:
# Subject comparison: 100 questions sampled from the high-school mathematics test CSV.
sample_and_save(
    csv_file="mmmlu/data/test/high_school_mathematics_test.csv",
    output_file="test/subjects_compare/math_test.json",
    num_samples=100,
)

Output saved to test/subjects_compare/math_test.json


In [10]:
# Difficulty comparison: 100 questions sampled from the elementary mathematics test CSV.
sample_and_save(
    csv_file="mmmlu/data/test/elementary_mathematics_test.csv",
    output_file="test/difficulty_test/elementary_math_test.json",
    num_samples=100,
)

Output saved to test/difficulty_test/elementary_math_test.json


In [11]:
# Difficulty comparison: 100 questions sampled from the high-school mathematics test CSV.
# This is a separate random draw from the one written to test/subjects_compare/math_test.json.
sample_and_save(
    csv_file="mmmlu/data/test/high_school_mathematics_test.csv",
    output_file="test/difficulty_test/hs_math_test.json",
    num_samples=100,
)

Output saved to test/difficulty_test/hs_math_test.json


In [12]:
# Difficulty comparison: 100 questions sampled from the college mathematics test CSV.
sample_and_save(
    csv_file="mmmlu/data/test/college_mathematics_test.csv",
    output_file="test/difficulty_test/college_math_test.json",
    num_samples=100,
)

Output saved to test/difficulty_test/college_math_test.json
