This code draws substantially on Farquhar et al. (2024), "Detecting hallucinations in large language models using semantic entropy".

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [None]:
import random
import pandas as pd
from google.colab import drive
from tqdm import tqdm
import datasets
import os
import json
import hashlib


In [None]:
def _bioasq_to_columns(questions):
    """Turn raw BioASQ question records into SQuAD-style column lists.

    Questions without an ``exact_answer`` field are skipped. The returned dict
    has the keys ``question``, ``answers``, ``id`` and ``context`` (in that
    order); ``context`` is ``None`` for every kept question.
    """
    columns = {"question": [], "answers": [], "id": []}
    for question in questions:
        if "exact_answer" not in question:
            continue
        raw_answer = question["exact_answer"]
        if isinstance(raw_answer, list):
            # Entries may themselves be lists; keep only the first element of each.
            exact_answers = [
                ans[0] if isinstance(ans, list) else ans
                for ans in raw_answer
            ]
        else:
            exact_answers = [raw_answer]
        columns["question"].append(question["body"])
        columns["answers"].append({
            "text": exact_answers,
            # One placeholder offset per answer text. Sizing this by the raw
            # field would count characters when the answer is a plain string.
            "answer_start": [0] * len(exact_answers),
        })
        columns["id"].append(question["id"])
    # Built once after the loop rather than re-created on every iteration.
    columns["context"] = [None] * len(columns["id"])
    return columns


def load_ds(dataset_name, save_path, seed, add_options=None):
    """Load the train and validation splits of a supported QA dataset.

    Args:
        dataset_name: One of ``"squad"``, ``"svamp"``, ``"nq"``,
            ``"trivia_qa"`` or ``"bioasq"``.
        save_path: Directory that must contain ``training11b.json`` when
            ``dataset_name == "bioasq"``; unused for the other datasets.
        seed: Seed for the train/test split of ``trivia_qa`` and ``bioasq``.
        add_options: Accepted for interface compatibility; not used here.

    Returns:
        ``(train_dataset, validation_dataset)``. For ``svamp`` and ``nq`` these
        are lists of dicts; for the other datasets they are the split objects
        returned by the ``datasets`` library.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    if dataset_name == "squad":
        dataset = datasets.load_dataset("squad_v2")
        train_dataset = dataset["train"]
        validation_dataset = dataset["validation"]
    elif dataset_name == 'svamp':
        dataset = datasets.load_dataset('ChilleD/SVAMP')

        def reformat(x):
            # Rename SVAMP's capitalised fields to the SQuAD-style schema.
            return {
                'question': x['Question'], 'context': x['Body'], 'type': x['Type'],
                'equation': x['Equation'], 'id': x['ID'],
                'answers': {'text': [str(x['Answer'])]}}

        # The "test" split serves as the validation set.
        train_dataset = [reformat(d) for d in dataset["train"]]
        validation_dataset = [reformat(d) for d in dataset["test"]]
    elif dataset_name == 'nq':
        dataset = datasets.load_dataset("nq_open")

        def md5hash(s):
            # Decimal string of the MD5 digest, used as a stable example id.
            return str(int(hashlib.md5(s.encode('utf-8')).hexdigest(), 16))

        def reformat(x):
            return {
                'question': x['question']+'?',
                'answers': {'text': x['answer']},
                'context': '',
                # The id is derived from the question text without the added '?'.
                'id': md5hash(str(x['question'])),
            }

        train_dataset = [reformat(d) for d in dataset["train"]]
        validation_dataset = [reformat(d) for d in dataset["validation"]]
    elif dataset_name == "trivia_qa":
        dataset = datasets.load_dataset('TimoImhof/TriviaQA-in-SQuAD-format')['unmodified']
        dataset = dataset.train_test_split(test_size=0.2, seed=seed)
        train_dataset = dataset['train']
        validation_dataset = dataset['test']
    elif dataset_name == "bioasq":
        # training11b.json must be downloaded manually and placed in save_path.
        path = os.path.join(save_path, 'training11b.json')
        with open(path, "rb") as file:
            data = json.load(file)

        dataset = datasets.Dataset.from_dict(_bioasq_to_columns(data["questions"]))
        dataset = dataset.train_test_split(test_size=0.5, seed=seed)
        print(f">>>> LEN DATASET DICT: {dataset.shape}")
        train_dataset = dataset['train']
        validation_dataset = dataset['test']
    else:
        raise ValueError(f"Unknown dataset_name: {dataset_name!r}")
    return train_dataset, validation_dataset

In [None]:
def build_datasets(dataset_name, save_path, num_samples=3000, random_seed=42, answerable_only=True):
    """Load a QA dataset, subsample its train/test splits and save them as CSV.

    Args:
        dataset_name: Dataset name forwarded to ``load_ds``.
        save_path: Directory the CSV files are written to (created if it does
            not exist); also forwarded to ``load_ds``.
        num_samples: Maximum number of examples kept per split, or the string
            ``'all'`` to keep every example. A split smaller than
            ``num_samples`` is kept in full.
        random_seed: Seeds Python's ``random`` module and is forwarded to
            ``load_ds`` as ``seed``.
        answerable_only: If true, drop examples whose ``answers['text']`` is
            empty.

    Returns:
        ``(train_df, test_df)``: the sampled splits as pandas DataFrames. The
        same frames are written to
        ``{dataset_name}_train_set_{num_samples}.csv`` and
        ``{dataset_name}_test_set_{num_samples}.csv`` inside ``save_path``.
    """
    random.seed(random_seed)

    train_dataset, test_dataset = load_ds(dataset_name, save_path, seed=random_seed, add_options=False)

    # Always materialise the splits as plain lists: random.shuffle needs item
    # assignment, which the split objects returned for some datasets lack.
    if answerable_only:
        train_dataset = [item for item in train_dataset if item['answers']['text']]
        test_dataset = [item for item in test_dataset if item['answers']['text']]
    else:
        train_dataset = list(train_dataset)
        test_dataset = list(test_dataset)

    # Shuffle-then-sample is kept so a given seed yields the same subsets as before.
    random.shuffle(train_dataset)
    random.shuffle(test_dataset)

    if num_samples == 'all':
        sampled_train = train_dataset
        sampled_test = test_dataset
    else:
        sampled_train = random.sample(train_dataset, min(num_samples, len(train_dataset)))
        sampled_test = random.sample(test_dataset, min(num_samples, len(test_dataset)))

    train_df = pd.DataFrame(sampled_train)
    test_df = pd.DataFrame(sampled_test)

    os.makedirs(save_path, exist_ok=True)
    # The suffix is the sample count, or the literal 'all'.
    train_df.to_csv(os.path.join(save_path, f"{dataset_name}_train_set_{num_samples}.csv"), index=False)
    test_df.to_csv(os.path.join(save_path, f"{dataset_name}_test_set_{num_samples}.csv"), index=False)

    print("Datasets have been saved to Google Drive.")

    return train_df, test_df

In [None]:
# Make Google Drive available inside the Colab runtime.
drive.mount('/content/drive')

# Placeholder locations: point base_dir at the folder that should hold the CSVs
# (presumably somewhere under the mounted Drive; adjust before running).
base_dir = '/bsae_dir_to_save'
new_folder = 'nature_datasets_FINAL'
new_folder_path = os.path.join(base_dir, new_folder)

# Create the output folder on first use and report which case applied.
if os.path.exists(new_folder_path):
    print(f"Folder already exists: {new_folder_path}")
else:
    os.makedirs(new_folder_path)
    print(f"Created new folder: {new_folder_path}")

# Any name accepted by load_ds: 'squad', 'svamp', 'nq', 'trivia_qa' or 'bioasq'.
dataset_name = 'nq'
train_df, test_df = build_datasets(
    dataset_name,
    new_folder_path,
    num_samples=10000,
    answerable_only=True,
)

print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

Mounted at /content/drive
Folder already exists: /content/drive/MyDrive/arb/pizza/pizza-dataset-files/nature_datasets_FINAL


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.77k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87925 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3610 [00:00<?, ? examples/s]

Datasets have been saved to Google Drive.
Train set shape: (10000, 4)
Test set shape: (3610, 4)


In [None]:
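# Optional sanity check: uncomment to print the shape of every saved CSV.
# The folder that is listed must be the same folder the file names are joined onto.
# for file in os.listdir(new_folder_path):
#     if file.endswith('.csv'):
#         file_path = os.path.join(new_folder_path, file)
#         df = pd.read_csv(file_path)
#         print(f"File: {file}; Shape: {df.shape}")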

File: squad_train_set_all.csv; Shape: (86821, 5)
File: squad_test_set_all.csv; Shape: (5928, 5)
File: svamp_train_set_all.csv; Shape: (700, 6)
File: svamp_test_set_all.csv; Shape: (300, 6)
File: nq_train_set_all.csv; Shape: (87925, 4)
File: nq_test_set_all.csv; Shape: (3610, 4)
File: trivia_qa_train_set_all.csv; Shape: (12294, 4)
File: trivia_qa_test_set_all.csv; Shape: (3074, 4)
File: bioasq_train_set_all.csv; Shape: (1794, 4)
File: bioasq_test_set_all.csv; Shape: (1795, 4)
File: nq_train_set_3000.csv; Shape: (3000, 4)
File: nq_test_set_3000.csv; Shape: (3000, 4)
File: squad_train_set_3000.csv; Shape: (3000, 5)
File: squad_test_set_3000.csv; Shape: (3000, 5)
