## Import Packages

In [2]:
import json
import os
from itertools import islice

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Math (GSM8K)

In [None]:
# Load the GSM8K "main" configuration and export only the question text of
# each split as JSON Lines under experiments/math/.
ds_dict = load_dataset("openai/gsm8k", "main")
ds_train, ds_test = ds_dict["train"], ds_dict["test"]

# Keep just the "question" column of each split before writing.
ds_train_q, ds_test_q = (
    split.select_columns(["question"]) for split in (ds_train, ds_test)
)

ds_train_q.to_json("experiments/math/train_raw.jsonl", lines=True)
ds_test_q.to_json("experiments/math/test_raw.jsonl", lines=True)

Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 185190.42 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 522999.34 examples/s]
Creating json from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 568.92ba/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 1033.21ba/s]


338094

## Trivia (TriviaQA)

In [None]:
# Open the TriviaQA "rc" train and test splits in streaming mode
# (train is opened first, then test).
train_stream, test_stream = (
    load_dataset("mandarjoshi/trivia_qa", "rc", split=split_name, streaming=True)
    for split_name in ("train", "test")
)

# Helper to write first N questions from a stream to JSONL
def write_first_n_questions(stream, path, n=10000):
    """Write the ``question`` field of the first ``n`` examples to a JSONL file.

    Each output line is a JSON object of the form ``{"question": ...}``;
    non-ASCII characters are written as-is (UTF-8), not escaped.

    Args:
        stream: Iterable of examples; each example is indexed with
            ``ex["question"]``.
        path: Destination file. It is overwritten if it already exists, and
            any missing parent directories are created first.
        n: Maximum number of examples to write. A value <= 0 produces an
            empty file.

    Raises:
        KeyError: If an example has no ``"question"`` key.
    """
    # A bare filename has an empty dirname, which os.makedirs would reject.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        # islice stops after exactly n items, so the (possibly remote) stream
        # is never asked for an example that would only be thrown away.
        for ex in islice(stream, max(n, 0)):
            json.dump({"question": ex["question"]}, f, ensure_ascii=False)
            f.write("\n")

# Export up to 10,000 questions from each TriviaQA stream.
write_first_n_questions(stream=train_stream, path="experiments/trivia/train_raw.jsonl", n=10_000)
write_first_n_questions(stream=test_stream, path="experiments/trivia/test_raw.jsonl", n=10_000)

## General (NQ-Open)

In [3]:
import os
import requests

def download_and_process_nq(url, output_path, limit, timeout=60):
    """Stream a JSONL file over HTTP and save the first ``limit`` questions.

    Every non-empty line of the response is parsed as JSON and only its
    ``"question"`` field is written, one ``{"question": ...}`` object per
    line, so the output has the same shape as the other datasets.

    Args:
        url: Location of the source JSONL file.
        output_path: Destination file. It is overwritten if it already exists,
            and any missing parent directories are created.
        limit: Maximum number of questions to write.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot hang the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            output file is not touched in that case.
        json.JSONDecodeError: If a non-empty line is not valid JSON.
        KeyError: If a record has no ``"question"`` key.
    """
    # The context manager guarantees the streamed connection is released even
    # when we stop reading early or an exception is raised.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail fast on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()

        # A bare filename has an empty dirname, which os.makedirs would reject.
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        count = 0
        with open(output_path, 'w', encoding='utf-8') as f_out:
            for line in response.iter_lines():
                if count >= limit:
                    break
                if not line:
                    # Empty lines carry no record; skip them.
                    continue
                data = json.loads(line)
                # Only keep the question to match other datasets
                json.dump({"question": data["question"]}, f_out, ensure_ascii=False)
                f_out.write("\n")
                count += 1

# NQ-Open train file: keep the first 5000 questions.
download_and_process_nq(
    url="https://raw.githubusercontent.com/efficientqa/nq-open/master/NQ-open.train.jsonl",
    output_path="experiments/general/train_raw.jsonl",
    limit=5000,
)

# NQ-Open dev file, used here as the test set: keep the first 1000 questions.
download_and_process_nq(
    url="https://raw.githubusercontent.com/efficientqa/nq-open/master/NQ-open.dev.jsonl",
    output_path="experiments/general/test_raw.jsonl",
    limit=1000,
)