In [2]:
import json
import re
from embed_llm import DATA_PATH
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# SQuAD: export the validation split to raw/squad_validation.jsonl.
# Every record carries the question, one answer string and the context paragraph.
data = load_dataset("squad", split="validation")
new_data = [
    {
        'question': sample['question'],
        # Only the first entry of the answer texts is kept.
        'answer': sample['answers']['text'][0],
        'passages': sample['context'],
    }
    for sample in data
]
with open(DATA_PATH + "raw/squad_validation.jsonl", "w") as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + "\n" for record in new_data)

In [None]:
# CNN/DailyMail: turn each article into a summarisation-style QA record.
# NOTE: the "test" split is loaded here but written to a file named cnn_validation.jsonl.
data = load_dataset("abisee/cnn_dailymail", "1.0.0", split="test")
new_data = [
    {
        # The same fixed prompt is used as the question for every article.
        'question': "What is a very short summary of the above text?",
        'answer': sample['highlights'],
        'passages': sample['article'],
    }
    for sample in data
]
with open(DATA_PATH + "raw/cnn_validation.jsonl", "w") as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + "\n" for record in new_data)

In [None]:
# NQ-open: export question/answer pairs of the validation split (no passages).
data = load_dataset("nq_open", split='validation')
new_data = [
    {
        'question': sample['question'],
        # Only the first listed answer is kept.
        'answer': sample['answer'][0],
    }
    for sample in data
]
with open(DATA_PATH + "raw/nq_validation.jsonl", "w") as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + "\n" for record in new_data)
        
    

In [None]:
# TriviaQA (unfiltered.nocontext config): export the validation split.
# Unlike the other QA exports, 'answer' holds the whole 'aliases' value rather than one entry.
data = load_dataset("mandarjoshi/trivia_qa", 'unfiltered.nocontext', split='validation')
new_data = [
    {
        'question': sample['question'],
        'answer': sample['answer']['aliases'],
    }
    for sample in data
]
with open(DATA_PATH + "raw/triviaqa_validation.jsonl", "w") as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + "\n" for record in new_data)
        
    

In [None]:
# HotpotQA (distractor config): export the validation split.
# 'passages' is a list of strings: each entry of context['sentences'] is joined
# with single spaces into one passage.
data = load_dataset("hotpotqa/hotpot_qa", 'distractor', split='validation')
new_data = [
    {
        'question': sample['question'],
        'answer': sample['answer'],
        'passages': [
            ' '.join(sentences)
            for sentences in sample['context']['sentences']
        ],
    }
    for sample in data
]
with open(DATA_PATH + "raw/hotpotqa_validation.jsonl", "w") as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + "\n" for record in new_data)
        
    

In [None]:
# Atlas chunks for evaluation
# Follow the download instructions at https://github.com/facebookresearch/atlas?tab=readme-ov-file#available-data-and-Models-for-download
# and save the resulting file to DATA_PATH + 'raw/Atlas_passages_validation.jsonl'



### Fine-tuning dataset

In [None]:
# FreebaseQA: export question/answer pairs of the train split.
data = load_dataset("freebase_qa", split="train")
new_data = [
    {
        'question': sample['RawQuestion'],
        # Takes the first element at each nesting level of Parses -> Answers -> AnswersName.
        'answer': sample['Parses']['Answers'][0]['AnswersName'][0][0],
    }
    for sample in data
]
with open(DATA_PATH + "raw/freebase_qa_train.jsonl", "w") as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + "\n" for record in new_data)

In [None]:
# MS MARCO v2.1: export query/answer pairs of the train split.
# Samples without a usable answer are dropped: either the answers list is empty
# or its first entry contains the "No Answer" marker.
new_data = []
data = load_dataset("ms_marco", "v2.1", split="train")
for sample in data:
    answers = sample['answers']
    # Guard against an empty list before indexing, so an answer-less sample is
    # skipped instead of raising IndexError.
    if not answers or 'No Answer' in answers[0]:
        continue
    new_data.append({
        'question': sample['query'],
        'answer': answers[0],  # only the first listed answer is kept
    })
with open(DATA_PATH + "raw/msmarco_train.jsonl", "w") as f:
    # One JSON object per line (JSONL).
    for item in new_data:
        f.write(json.dumps(item) + "\n")

In [None]:
# DROP: export the train split with question, first answer span and passage.
data = load_dataset("drop", split="train")
new_data = [
    {
        'question': sample['question'],
        # Only the first answer span is kept.
        'answer': sample['answers_spans']['spans'][0],
        'passages': sample['passage'],
    }
    for sample in data
]
with open(DATA_PATH + "raw/drop_train.jsonl", "w") as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + "\n" for record in new_data)

In [None]:
# WikiSum: build summarisation records from the train split.

ds = load_dataset("d0rj/wikisum")
data = ds['train']
# Keep only samples whose article has len() below 8000.
new_data = [
    {
        'passage': sample['article'],
        'question': 'What is a summary of the previous text?',
        'answer': sample['summary'],
    }
    for sample in data
    if len(sample['article']) < 8000
]
with open(DATA_PATH + 'WikiSum_train.jsonl', 'w') as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + '\n' for record in new_data)

In [None]:
# DialogSum: build dialogue-summarisation records from the train split.

ds = load_dataset("knkarthick/dialogsum")
data = ds['train']
# Keep only samples whose dialogue has len() below 8000.
new_data = [
    {
        'passage': sample['dialogue'],
        'question': 'Write a short summary of the previous dialogue.',
        'answer': sample['summary'],
    }
    for sample in data
    if len(sample['dialogue']) < 8000
]
with open(DATA_PATH + 'DialogSum_train.jsonl', 'w') as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + '\n' for record in new_data)
        

In [None]:
# SAMSum: build dialogue-summarisation records from the train split.

ds = load_dataset("knkarthick/samsum")
data = ds['train']
# Drop samples whose dialogue is None, then keep those with len() below 8000.
# The None check comes first so len() is never called on None.
new_data = [
    {
        'passage': sample['dialogue'],
        'question': 'Write a very short summary of the previous dialogue.',
        'answer': sample['summary'],
    }
    for sample in data
    if sample['dialogue'] is not None and len(sample['dialogue']) < 8000
]
with open(DATA_PATH + 'SamSum_train.jsonl', 'w') as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + '\n' for record in new_data)

In [None]:
# ParaSCI: build paraphrase records (sentence1 -> sentence2) from the train split.
ds = load_dataset("HHousen/ParaSCI")

data = ds['train']
# Both sentences must have len() above 100 for the pair to be kept.
new_data = [
    {
        'passage': sample['sentence1'],
        'question': 'Paraphrase the previous text.',
        'answer': sample['sentence2'],
    }
    for sample in data
    if len(sample['sentence1']) > 100 and len(sample['sentence2']) > 100
]
with open(DATA_PATH + 'ParaSCI_train.jsonl', 'w') as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + '\n' for record in new_data)

In [None]:
# Training QA
# Split the train split of dmrau/multi_qa by source dataset and write one
# raw/<source>_train.jsonl file per kept source.
# NOTE: this writes raw/freebase_qa_train.jsonl and raw/msmarco_train.jsonl, the
# same paths used by the Freebase QA and MSMARCO cells of this notebook, so the
# cell that runs last determines the content of those two files.
train_qa_data = {}
ds = load_dataset("dmrau/multi_qa")

for sample in ds['train']:
    # Removing every run of digits from the sample id yields the source name
    # (presumably ids look like "<source><number>" -- verify against the dataset).
    data_id = re.sub(r"\d+", "", sample["id"])
    if data_id not in ['adversarial_qa', 'freebase_qa', 'sciq', 'msmarco', 'asqa']:
        continue

    # Drop msmarco samples whose label carries the "No Answer" marker.
    if data_id == "msmarco" and "No Answer" in sample['label']:
        continue

    # Group records by source; the bucket is created on first use.
    train_qa_data.setdefault(data_id, []).append({
        'question': sample['content'],
        'answer': sample['label']
    })

for k, v in train_qa_data.items():
    # Report the number of records kept per source; this is the
    # "<source> <count>" output recorded below the cell.
    print(k, len(v))
    with open(DATA_PATH + f"raw/{k}_train.jsonl", "w") as f:
        # One JSON object per line (JSONL).
        for item in v:
            f.write(json.dumps(item) + "\n")

Downloading readme: 100%|██████████| 777/777 [00:00<00:00, 3.57kB/s]


adversarial_qa 29966
msmarco 59699
freebase_qa 20356
sciq 11679
asqa 4353


In [None]:
# KILT passages for training retrieval: keep only the text of every train sample.
ds = load_dataset("dmrau/kilt-128")
new_data = [{'content': sample['text']} for sample in ds['train']]
with open(DATA_PATH + 'KILT_passages_train.jsonl', 'w') as out_file:
    # One JSON object per line (JSONL).
    out_file.writelines(json.dumps(record) + '\n' for record in new_data)