In [57]:
from datasets import load_dataset
import pandas as pd
import json
from datetime import datetime
from tqdm import tqdm
import orjson
import matplotlib.pyplot as plt

MAX_QUESTION_LENGTH = 2000
MAX_ANSWER_LENGTH = 500

AVAILABLE_TAGS = [
    "multiple_choice",
    "context_based",
    "knowledge_based",
    "human_eval",
    "code"
]

In [47]:
def print_example(example):
    example = example.copy()
    limit_fields = ['input', 'context', 'answers', 'final_answer']
    for field in limit_fields:
        if field in example:
            if isinstance(example[field], str):
                if len(example[field]) > 200:
                    example[field] = example[field][:200] + '...' + example[field][-50:]
            elif isinstance(example[field], list):
                # Process each answer in the list
                limited_answers = []
                for answer in example[field]:
                    if isinstance(answer, str) and len(answer) > 200:
                        limited_answers.append(answer[:200] + '...' + answer[-50:])
                    else:
                        limited_answers.append(answer)
                example[field] = limited_answers
    
    # Prepare the output string
    output = "{\n"
    for key, value in example.items():
        # Format the value for display
        if isinstance(value, str):
            formatted_value = f'"{value}"'
        elif isinstance(value, list):
            formatted_value = '[' + ', '.join([f'"{v}"' if isinstance(v, str) else str(v) for v in value]) + ']'
        else:
            formatted_value = str(value)
        
        output += f"\t{key}: {formatted_value},\n"
    
    output = output.rstrip(",\n") + "\n}"
    
    print(output)

### Potential sources
### Candidates

> HellaSwag (?)
> PIQA ++
> TruthfulQA ++
> Mt-bench ++
> MMLU (?)

---

Language Modeling
- CHID
- Winogrande

Understanding
- RACE
- LAMBADA

Reasoning
- SIQA
- PIQA +! 

In-Context Learning
- MMLU
- CEval

Multi-Step Reasoning
- GSM8K
- StrategyQA

Instruction-Following
- Hellaswag
- ARC

Self-Calibration
- MMLU

Hallucination
- TruthfulQA +! 

Robustness
- AdvGLUE

Dialogue
- Mt-bench +! 

Long-Context
- Longeval
- Multi-Doc QA

---

### Qa Multi Hop qa [Knowledge-based] [Difficult]
https://huggingface.co/datasets/pszemraj/qmsum-cleaned/viewer/default/

This dataset has questions that may require multistep thinking or testing various knowledge.
The answer can be either small or difficult. When it requires multistep summary + answer will be supported.
Obviously I'll keep only the answer

In [15]:
dataset = "khaimaitien/qa-expert-multi-hop-qa-V1.0"
data = load_dataset(dataset, split='validation', streaming=True)
sample = list(data.take(3))

print(F"--------======== DATASET: {dataset} ========--------")
for i, example in enumerate(sample):
    print(f"\n=== Example {i+1} ===")
    print_example(example)


=== Example 1 ===
{
	sub_questions: [{'answer': 'Egypt', 'extra_info': None, 'long_answer': 'The knowledge mentions Gamal Abdel Nasser as one of the leaders that Josip Broz Tito met during his rule.\nAnswer: Based on the provided knowledge, Nasser was a leader of Egypt.', 'paragraph': "Josip Broz Tito. Yugoslavia had a liberal travel policy permitting foreigners to freely travel through the country and its citizens to travel worldwide, whereas it was limited by most Communist countries. A number[quantify] of Yugoslav citizens worked throughout Western Europe. Tito met many world leaders during his rule, such as Soviet rulers Joseph Stalin, Nikita Khrushchev and Leonid Brezhnev; Egypt's Gamal Abdel Nasser, Indian politicians Jawaharlal Nehru and Indira Gandhi; British Prime Ministers Winston Churchill, James Callaghan and Margaret Thatcher; U.S. Presidents Dwight D. Eisenhower, John F. Kennedy, Richard Nixon, Gerald Ford and Jimmy Carter; other political leaders, dignitaries and heads 

In [49]:
def load_qa_expert_multi_hopdataset(dataset_split, output_file):
    dataset_name = "khaimaitien/qa-expert-multi-hop-qa-V1.0"
    data = load_dataset(dataset_name)
    dataset = data[dataset_split]

    def cutout_summary(final_answer):
        if "Answer:" in final_answer:
            final_answer = final_answer[final_answer.find("Answer: ") + len("Answer: "):]
        return final_answer.strip()

    processed_data = []
    for item in tqdm(dataset, desc = f"Processing {dataset_name}, split: {dataset_split}..."):
        if len(item["question"]) < MAX_QUESTION_LENGTH and len(item["final_answer"]) < MAX_ANSWER_LENGTH*2 and \
            (("I'm sorry" not in item["final_answer"]) and ("I can't" not in item["final_answer"]) and ("I cannot" not in item["final_answer"])):
            conversation = [
                {"content": item["question"], "role": "user"},
                {"content": cutout_summary(item["final_answer"]), "role": "assistant"}
            ]

            if len(conversation[1]["content"]) > MAX_ANSWER_LENGTH:
                continue

            processed_row = {
                "question_id": str(hash(item["question"])),
                "tstamp": str(datetime.now().timestamp()),
                "conversation_a": conversation,
                "source": dataset_name,
                "meta_info": {"tag": item["tag"], "src": item["meta_info"].get("src")},
                "tag": "knowledge_based",
            }
            processed_data.append(processed_row)

    df = pd.DataFrame(processed_data)
    with open(output_file, 'wb') as f:
        f.write(orjson.dumps(json.loads(df.to_json(orient='records'))))
    print(f"Finished! {len(df)} records saved")

load_qa_expert_multi_hopdataset("train", "./qa-expert-multi-hop-qa-train.json")
load_qa_expert_multi_hopdataset("validation", "./qa-expert-multi-hop-qa-valid.json")

Processing khaimaitien/qa-expert-multi-hop-qa-V1.0, split: train...: 100%|██████████| 25547/25547 [00:02<00:00, 9236.94it/s]


Finished! 11322 records saved


Processing khaimaitien/qa-expert-multi-hop-qa-V1.0, split: validation...: 100%|██████████| 3186/3186 [00:00<00:00, 8594.34it/s]

Finished! 1552 records saved





### Qasper [Context based] [Papers]

https://huggingface.co/datasets/allenai/qasper/viewer

It's a dataset about questions about papers. Instead of taking full text I only have taken abstract and futher will filter out using special flag that the context is enough to answer the question.

In [34]:
dataset = "allenai/qasper"
data = load_dataset(dataset, split='validation', streaming=True)
sample = list(data.take(3))

print(F"--------======== DATASET: {dataset} ========--------")
for i, example in enumerate(sample):
    print(f"\n=== Example {i+1} ===")
    print_example(example)


=== Example 1 ===
{
	id: "1912.01214",
	title: "Cross-lingual Pre-training Based Transfer for Zero-shot Neural Machine Translation",
	abstract: "Transfer learning between different language pairs has shown its effectiveness for Neural Machine Translation (NMT) in low-resource scenario. However, existing transfer methods involving a common target language are far from success in the extreme scenario of zero-shot translation, due to the language space mismatch problem between transferor (the parent model) and transferee (the child model) on the source side. To address this challenge, we propose an effective transfer learning approach based on cross-lingual pre-training. Our key idea is to make all source languages share the same feature space and thus enable a smooth transition for zero-shot translation. To this end, we introduce one monolingual pre-training method and two bilingual pre-training methods to obtain a universal encoder for different languages. Once the universal encoder is

In [None]:
def load_qasper_dataset(dataset_split, output_file):
    dataset_name = "allenai/qasper"
    data = load_dataset(dataset_name)
    dataset = data[dataset_split]

    processed_data = []
    for item in tqdm(dataset, desc=f"Processing {dataset_name}, split: {dataset_split}..."):
        title = item["title"]
        abstract = item["abstract"]

        for q_idx, question in enumerate(item["qas"]["question"][:3]):
            formatted_question = (
                f"You are given a paper:\n"
                f"Title: {title}\n"
                f"Abstract: {abstract}\n\n"
                f"{question}"
            )
            if len(formatted_question) > MAX_QUESTION_LENGTH:
                continue

            conversation = [
                {"content": formatted_question, "role": "user"},
            ]
            
            processed_row = {
                "question_id": str(hash(formatted_question)),
                "tstamp": str(datetime.now().timestamp()),
                "conversation_a": conversation,
                "source": dataset_name,
                "tag": "context_based",
                "meta_info": {"id_in_dataset": item["id"]},
            }
            processed_data.append(processed_row)

    df = pd.DataFrame(processed_data)
    with open(output_file, 'wb') as f:
        f.write(orjson.dumps(json.loads(df.to_json(orient='records'))))
    print(f"Finished! {len(df)} records saved")

# Example usage:
load_qasper_dataset("train", "./qasper-train.json")
load_qasper_dataset("validation", "./qasper-valid.json")
load_qasper_dataset("test", "./qasper-test.json")

Processing allenai/qasper, split: train...: 100%|██████████| 888/888 [00:00<00:00, 2511.96it/s]


Finished! 2050 records saved


Processing allenai/qasper, split: validation...: 100%|██████████| 281/281 [00:00<00:00, 2142.25it/s]


Finished! 725 records saved


Processing allenai/qasper, split: test...: 100%|██████████| 416/416 [00:00<00:00, 2055.56it/s]

Finished! 1104 records saved





### PIQA [Common sense] [Reasoning]

https://huggingface.co/datasets/ybisk/piqa

More or less common sense question answering. I will drop the options

In [35]:
data = load_dataset("ybisk/piqa", trust_remote_code=True)
data

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/815k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16113 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3084 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1838 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['goal', 'sol1', 'sol2', 'label'],
        num_rows: 16113
    })
    test: Dataset({
        features: ['goal', 'sol1', 'sol2', 'label'],
        num_rows: 3084
    })
    validation: Dataset({
        features: ['goal', 'sol1', 'sol2', 'label'],
        num_rows: 1838
    })
})

In [41]:
def load_piqa_dataset(dataset_split, output_file):
    dataset_name = "ybisk/piqa"
    data = load_dataset(dataset_name)
    dataset = data[dataset_split]

    processed_data = []
    for item in tqdm(dataset, desc=f"Processing {dataset_name}, split: {dataset_split}..."):
        answer = item["sol2"] if item["label"] else item["sol1"]
        conversation = [
            {"content": item['goal'], "role": "user"},
            {"content": answer, "role": "assistant"}
        ]

        if len(conversation[0]["content"]) > 200 or len(conversation[1]["content"]) > MAX_ANSWER_LENGTH:
            continue
        
        processed_row = {
            "question_id": str(hash(answer)),
            "tstamp": str(datetime.now().timestamp()),
            "conversation_a": conversation,
            "source": dataset_name,
            "tag": "physical_reasoning"
        }
        processed_data.append(processed_row)

    df = pd.DataFrame(processed_data)
    with open(output_file, 'wb') as f:
        f.write(orjson.dumps(json.loads(df.to_json(orient='records'))))
    print(f"Finished! {len(df)} records saved")

load_piqa_dataset("train", "./piqa-train.json")
load_piqa_dataset("validation", "./piqa-valid.json")
load_piqa_dataset("test", "./piqa-test.json")

Processing ybisk/piqa, split: train...: 100%|██████████| 16113/16113 [00:00<00:00, 35307.33it/s]


Finished! 15825 records saved


Processing ybisk/piqa, split: validation...: 100%|██████████| 1838/1838 [00:00<00:00, 36479.98it/s]


Finished! 1803 records saved


Processing ybisk/piqa, split: test...: 100%|██████████| 3084/3084 [00:00<00:00, 36357.19it/s]

Finished! 3030 records saved





### AI2 ARC [Reasoning] [School grade]
https://huggingface.co/datasets/allenai/ai2_arc

Simple school grade reasoning questions

In [16]:
from datasets import load_dataset

dataset = "allenai/ai2_arc"
data = load_dataset(dataset, "ARC-Easy", split='validation', streaming=True)
sample = list(data.take(3))

print(F"--------======== DATASET: {dataset} ========--------")
for i, example in enumerate(sample):
    print(f"\n=== Example {i+1} ===")
    print_example(example)


=== Example 1 ===
{
	id: "MCAS_2000_4_6",
	question: "Which technology was developed most recently?",
	choices: {'text': ['cellular telephone', 'television', 'refrigerator', 'airplane'], 'label': ['A', 'B', 'C', 'D']},
	answerKey: "A"
}

=== Example 2 ===
{
	id: "Mercury_7057260",
	question: "A student hypothesizes that algae are producers. Which question will best help the student determine if this is correct?",
	choices: {'text': ['Do algae consume other organisms?', 'Which organisms consume algae?', 'Do algae use sunlight to make food?', 'Could an ecosystem survive without algae?'], 'label': ['A', 'B', 'C', 'D']},
	answerKey: "C"
}

=== Example 3 ===
{
	id: "ACTAAP_2014_7_6",
	question: "Soccer players use their muscle systems to kick a ball into a goal. What organ system coordinates the muscles?",
	choices: {'text': ['The nervous system', 'The endocrine system', 'The respiratory system', 'The circulatory system'], 'label': ['A', 'B', 'C', 'D']},
	answerKey: "A"
}


In [40]:
def load_ai2_arc_dataset(dataset_split, output_file):
    dataset_name = "allenai/ai2_arc"
    data = load_dataset(dataset_name, "ARC-Easy")
    dataset = data[dataset_split]

    processed_data = []
    for id_, item in tqdm(enumerate(dataset), desc=f"Processing {dataset_name}, split: {dataset_split}..."):
        formatted_question = item['question']

        correct_label = item["answerKey"]
        correct_index = item["choices"]["label"].index(correct_label)
        correct_answer = item["choices"]["text"][correct_index]

        conversation = [
            {"content": formatted_question, "role": "user"},
            {"content": correct_answer, "role": "assistant"}
        ]

        if len(conversation[0]["content"]) > 300 or len(conversation[1]["content"]) > MAX_ANSWER_LENGTH:
            continue

        processed_row = {
            "question_id": id_,
            "tstamp": str(datetime.now().timestamp()),
            "conversation_a": conversation,
            "tag": "multiple_choice",
            "source": dataset_name,
            "meta_info": {"id_in_dataset": item["id"]}
        }
        processed_data.append(processed_row)

    df = pd.DataFrame(processed_data)
    with open(output_file, 'wb') as f:
        f.write(orjson.dumps(json.loads(df.to_json(orient='records'))))
    print(f"Finished! {len(df)} records saved")

load_ai2_arc_dataset("train", "./ai2_arc-train.json")
load_ai2_arc_dataset("validation", "./ai2_arc-valid.json")
load_ai2_arc_dataset("test", "./ai2_arc-test.json")

Processing allenai/ai2_arc, split: train...: 2251it [00:00, 8568.09it/s]


Finished! 2209 records saved


Processing allenai/ai2_arc, split: validation...: 570it [00:00, 19151.92it/s]


Finished! 557 records saved


Processing allenai/ai2_arc, split: test...: 2376it [00:00, 26124.58it/s]

Finished! 2322 records saved





### Infinity Gen

In [1]:
from datasets import load_dataset

# Load the dataset (streaming mode to avoid downloading the full dataset)
dataset_7M = load_dataset('BAAI/Infinity-Instruct', '7M', split='train', streaming=True)

# Take 10 samples and convert to a list for viewing
sample_10 = list(dataset_7M.take(10))

# Print the samples
for i, example in enumerate(sample_10):
    print(f"\n=== Example {i+1} ===")
    print(example)
    print("="*30)

README.md:   0%|          | 0.00/21.7k [00:00<?, ?B/s]

DatasetNotFoundError: Dataset 'BAAI/Infinity-Instruct' is a gated dataset on the Hub. You must be authenticated to access it.

### Human-Like-DPO-Dataset
[dataset](https://huggingface.co/datasets/HumanLLMs/Human-Like-DPO-Dataset)
Basically casual conversations which are supposed to be evaluated based on the more realistic reply

I decided... that it does not fit our dataset.

In [None]:
from datasets import load_dataset

dataset = load_dataset('HumanLLMs/Human-Like-DPO-Dataset', split='train', streaming=True)
sample_10 = list(dataset.take(5))

for i, example in enumerate(sample_10):
    print(f"\n=== Example {i+1} ===")
    print(example['chosen'])
    print("="*30)


=== Example 1 ===
😂 Ah, no I haven't! I'm dying to know, what's the meme about? Is it a funny cat or a ridiculous situation? Spill the beans! 🤣

=== Example 2 ===
Oh, totally! 😄 I'm a sucker for a good ol' rock ballad. Give me some Bon Jovi any day of the week! "Livin' on a Prayer" is my go-to karaoke jam. There's just something about belting out "Oh, we're halfway there!" at the top of my lungs that gets me pumped up! 🎤 What about you, do you have a favorite karaoke song? 🎶

=== Example 3 ===
😊 I'm actually a big fan of DIY projects! I'm not super skilled, but I love trying out new things and getting creative. I've tried my hand at painting, making jewelry, and even some woodworking (with lots of supervision, of course 😉). One time, I even attempted to make my own candles, and let's just say it was... an experience. The result was more "rustic" than "refined," but it was fun!

What about you? Are you crafty? Do you have a favorite DIY project or craft that you enjoy working on? 🎨💡

=

In [48]:
dataset = load_dataset('allenai/tulu-3-sft-mixture', split='train', streaming=True)
sample_10 = list(dataset.take(3))

for i, example in enumerate(sample_10):
    print(f"\n=== Example {i+1} ===")
    print_example(example)
    print("="*30)


=== Example 1 ===
{
	id: "oasst1_5921",
	messages: [{'content': 'Create a snippet of Terraform HCL code that create an AWS autoscaling group, and an ALB in front to expose an application to internet.', 'role': 'user'}, {'content': 'Sure, here\'s an example Terraform HCL code that creates an AWS Autoscaling Group and an Application Load Balancer to expose an application to the internet:\n``` \n# Configure the AWS provider\nprovider "aws" {\n  region = "us-east-1"\n}\n\n# Create a security group to allow traffic to the ALB\nresource "aws_security_group" "alb_sg" {\n  name_prefix = "alb_sg"\n  ingress {\n    from_port = 80\n    to_port = 80\n    protocol = "tcp"\n    cidr_blocks = ["0.0.0.0/0"]\n  }\n}\n\n# Create an ALB and target group\nresource "aws_lb" "alb" {\n  name               = "example-alb"\n  internal           = false\n  load_balancer_type = "application"\n\n  subnets = ["subnet-12345678", "subnet-87654321"]\n\n  security_groups = [aws_security_group.alb_sg.id]\n\n  tags = {

### tulu-3-sft-mixture [Common sense] [Multi-language]
[dataset](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture)
The Tulu 3 SFT mixture was used to train the Tulu 3 series of models and included itself 18 datasets

In [58]:
def load_tulu3_sft_dataset(dataset_split, output_file):
    dataset_name = "allenai/tulu-3-sft-mixture"
    dataset = load_dataset(dataset_name, split=dataset_split, streaming=True)
    processed_data = []
    max_items = 10000
    for item in tqdm(dataset, desc = f"Processing {dataset_name}, split: {dataset_split}..."):
        max_items -= 1
        if max_items == 0:
            break
        if len(item["messages"][0]['content']) < MAX_QUESTION_LENGTH and len(item["messages"][1]['content']) < MAX_ANSWER_LENGTH:
            conversation = item["messages"]

            processed_row = {
                "question_id": str(hash(item["messages"][0]['content'])),
                "tstamp": str(datetime.now().timestamp()),
                "conversation_a": conversation,
                "source": dataset_name,
                "meta_info": {
                    "id_in_dataset": item["id"],
                    "original_dataset": item["source"],
                    "question_length": len(item["messages"][0]['content']),
                },
                "tag": "knowledge_based",
            }
            processed_data.append(processed_row)

    df = pd.DataFrame(processed_data)
    with open(output_file, 'wb') as f:
        f.write(orjson.dumps(json.loads(df.to_json(orient='records'))))
    print(f"Finished! {len(df)} records saved")

load_tulu3_sft_dataset("train", "./tulu-3-sft-mixture-train.json")

Processing allenai/tulu-3-sft-mixture, split: train...: 9999it [00:08, 1221.25it/s]

Finished! 4979 records saved





### Long bench

In [None]:
datasets = ["qasper", "multifieldqa_en", "hotpotqa", "2wikimqa", "gov_report", "multi_news", "trec", \
            "triviaqa", "samsum", "passage_count", "passage_retrieval_en", "lcc", "repobench-p"]

for dataset in datasets:
    data = load_dataset('THUDM/LongBench', f"{dataset}_e", split='test', streaming=True)
    sample_10 = list(data.take(3))

    print(F"--------======== DATASET: {dataset} ========--------")
    for i, example in enumerate(sample_10):
        print(f"\n=== Example {i+1} ===")
        print_example(example)
        print("="*30)


=== Example 1 ===
{
	input: "Which Facebook pages did they look at?",
	context: "Introduction
This work is licenced under a Creative Commons Attribution 4.0 International Licence. Licence details: http://creativecommons.org/licenses/by/4.0/
In the spirit of the brevity of social m...roviding comments on draft versions of this paper.",
	answers: ["FoxNews, CNN, ESPN, New York Times, Time magazine, Huffington Post Weird News, The Guardian, Cartoon Network, Cooking Light, Home Cooking Adventure, Justin Bieber, Nickelodeon, Spongebob, Disney", "FoxNews, CNN, ESPN, New York Times, Time magazine, Huffington Post Weird News, The Guardian, Cartoon Network, Cooking Light, Home Cooking Adventure, Justin Bieber, Nickelodeon, Spongebob, Disney."],
	length: 3411,
	dataset: "qasper_e",
	language: "en",
	all_classes: None,
	_id: "b26174d8ec97e8d01421548d3a640e8382db6235eb506661"
}

=== Example 2 ===
{
	input: "What type of latent context is used to predict instructor intervention?",
	context: "Intro

In [None]:
datasets = ["qasper"]

example = None

for dataset in datasets:
    data = load_dataset('THUDM/LongBench', f"{dataset}_e", split='test', streaming=True)
    sample_10 = list(data.take(3))

    print(F"--------======== DATASET: {dataset} ========--------")
    for i, example in enumerate(sample_10):
        print(f"\n=== Example {i+1} ===")
        print_example(example)
        print("="*30)


=== Example 1 ===
{
	input: "Which Facebook pages did they look at?",
	context: "Introduction
This work is licenced under a Creative Commons Attribution 4.0 International Licence. Licence details: http://creativecommons.org/licenses/by/4.0/
In the spirit of the brevity of social m...roviding comments on draft versions of this paper.",
	answers: ["FoxNews, CNN, ESPN, New York Times, Time magazine, Huffington Post Weird News, The Guardian, Cartoon Network, Cooking Light, Home Cooking Adventure, Justin Bieber, Nickelodeon, Spongebob, Disney", "FoxNews, CNN, ESPN, New York Times, Time magazine, Huffington Post Weird News, The Guardian, Cartoon Network, Cooking Light, Home Cooking Adventure, Justin Bieber, Nickelodeon, Spongebob, Disney."],
	length: 3411,
	dataset: "qasper_e",
	language: "en",
	all_classes: None,
	_id: "b26174d8ec97e8d01421548d3a640e8382db6235eb506661"
}

=== Example 2 ===
{
	input: "What type of latent context is used to predict instructor intervention?",
	context: "Intro

In [None]:
BATCH_SIZE = 1000
MAX_QUESTION_LENGTH = 4000
MAX_ANSWER_LENGTH = 400

def process_batch(batch, dataset_name, all_lengths):
    processed_data = []
    for item in batch:
        input = item["input"]
        context = item["context"]
        dataset_source = item["dataset"]

        formatted_question = f"Context: {context}\nQuestion: {input}"
        all_lengths.append(len(formatted_question))
        if len(formatted_question) > MAX_QUESTION_LENGTH:
            if len(context) < MAX_QUESTION_LENGTH*4:
                context = context[:(MAX_QUESTION_LENGTH - (len(formatted_question) - len(context)))]
                formatted_question = f"Context: {context}\nQuestion: {input}"
            else:
                continue
        if not item["answers"] or len(item["answers"][0]) > MAX_ANSWER_LENGTH:
            continue

        conversation = [
            {"content": formatted_question, "role": "user"},
            {"content": item["answers"][0], "role": "assistant"}
        ]

        processed_row = {
            "question_id": str(hash(formatted_question)),
            "tstamp": str(datetime.now().timestamp()),
            "conversation_a": conversation,
            "source": f"LongBench/{dataset_name}_e",
            "tag": "context_based",
            "meta_info": {
                "id_in_dataset": item["_id"],
                "original_dataset": dataset_source,
                "question_length": len(formatted_question),
            },
        }
        processed_data.append(processed_row)
    
    return processed_data

def load_longbench_dataset(dataset_name, dataset_split, output_file):
    all_lengths = []
    try:
        dataset = load_dataset(
            'THUDM/LongBench', 
            f"{dataset_name}_e", 
            split=dataset_split, 
            streaming=True
        )

        with open(output_file, 'wb') as f:
            f.write(b'[\n')

        total_processed = 0
        batch = []

        for item in tqdm(dataset, desc=f"Processing {dataset_name}/{dataset_split}"):
            batch.append(item)
            
            if len(batch) >= BATCH_SIZE:
                processed_batch = process_batch(batch, dataset_name, all_lengths)
                if processed_batch:
                    save_batch(processed_batch, output_file, total_processed > 0)
                    total_processed += len(processed_batch)
                batch = []

        if batch:
            processed_batch = process_batch(batch, dataset_name, all_lengths)
            if processed_batch:
                save_batch(processed_batch, output_file, total_processed > 0)
                total_processed += len(processed_batch)
        #plt.hist(all_lengths)
        #plt.show()

        with open(output_file, 'ab') as f:
            f.write(b'\n]')

        print(f"Finished! {total_processed} records saved to {output_file}")

    except Exception as e:
        print(f"Error processing {dataset_name}: {str(e)}")

def save_batch(batch, output_file, needs_comma):
    """Save a batch of records to the output file"""
    json_batch = orjson.dumps(batch, option=orjson.OPT_INDENT_2)

    with open(output_file, 'ab') as f:
        if needs_comma:
            f.write(b',\n')
        f.write(json_batch[1:-1])

datasets = ["qasper", "multifieldqa_en", "hotpotqa", "2wikimqa"]
for dataset_name in datasets:
    load_longbench_dataset(dataset_name, "test", f"./long-{dataset_name}-test.json")

Processing qasper/test: 224it [00:01, 118.88it/s]


Finished! 39 records saved to ./long-qasper-test.json


Processing multifieldqa_en/test: 150it [00:01, 109.97it/s]


Finished! 42 records saved to ./long-multifieldqa_en-test.json


Processing hotpotqa/test: 300it [00:02, 109.84it/s]


Finished! 33 records saved to ./long-hotpotqa-test.json


Processing 2wikimqa/test: 300it [00:01, 150.43it/s]

Finished! 28 records saved to ./long-2wikimqa-test.json





### Samples conclusion:

This project combines multiple high-quality QA datasets to create a diverse training mixture covering different question types and difficulty levels.

### Dataset Summary

| Dataset | Type | Characteristics | Samples | Source |
|---------|------|-----------------|---------|--------|
| [QA Expert Multi-Hop QA](https://huggingface.co/datasets/khaimaitien/qa-expert-multi-hop-qa-V1.0) | Knowledge-based | Difficult multi-hop questions | 1,200 (23.1%) | Original train: 11,322 |
| [Qasper](https://huggingface.co/datasets/allenai/qasper) | Context-based | Research paper QA | 1,197 (23.0%) | Original train: 2,033 |
| [PIQA](https://huggingface.co/datasets/ybisk/piqa) | Commonsense | Physical reasoning | 1,200 (23.1%) | Original train: 15,825 |
| [AI2 ARC](https://huggingface.co/datasets/allenai/ai2_arc) | Reasoning | Science questions | 700 (13.5%) | Original train: 2,209 |
| [Tulu 3 SFT Mixture](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture) | Multi-domain | 18 combined datasets, multi-language | 800 (15.4%) | Sampled: 4,979 |
| [LongBench](https://huggingface.co/datasets/THUDM/LongBench) | Long-context | Extended context QA | 142 (2.7%) | Sampled 142 |

**Total samples:** ~36368 (train), ~4,633 (val), ~6,595 (test)

### Sampling Strategy

For balanced training, we used the following proportions:

1. **Primary Distribution:**
   - 23.0% Context-based (Qasper)
   - 23.1% Knowledge-based (Multi-Hop QA)
   - 23.1% Commonsense (PIQA)
   - 13.5% Reasoning (AI2 ARC)
   - 15.4% General SFT (Tulu-3)
   - 2.7% Long-context eval (LongBench)

2. **Quality Filtering:**
   - Applied length constraints:
     - Max question length: 8,000 chars
     - Max answer length: 1,000 chars
   - Removed malformed/empty answers

3. **Special Considerations:**
   - **LongBench:** Use all available test samples (142) for evaluation
   - **Multi-Hop QA:** Valuable for hard questions - keep full training set
   - **Qasper:** Important for academic writing understanding

### Final dataset

For a 5k sample training set that will be reduced down to 300-500:
```python
{
    "qasper": 1197,           # 23.0% (context)
    "multi_hop_qa": 1200,      # 23.1% (knowledge)
    "piqa": 1200,              # 23.1% (commonsense)
    "ai2_arc": 700,            # 13.5% (reasoning)
    "tulu_3_sft": 800,         # 15.4% (general SFT)
    "longbench": 142           # 2.7% (long-context eval)
}
```

In [59]:
import pandas as pd
import orjson
import json
import random
from typing import List, Dict, Union

def load_and_sample_datasets(
    file_paths: Union[str, List[str]], 
    n_samples: int, 
    random_state: int = 42,
    longbench_mode: bool = False
) -> List[Dict]:
    random.seed(random_state)
    
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    
    all_records = []
    for file_path in file_paths:
        with open(file_path, 'rb') as f:
            data = orjson.loads(f.read())
            all_records.extend(data)
    
    # Special case for LongBench - take all test samples
    if longbench_mode:
        return all_records

    if len(all_records) <= n_samples:
        return all_records
    return random.sample(all_records, n_samples)

def create_final_dataset(output_file: str = "final_dataset.json"):
    sampling_config = {
        "qasper": {
            "files": ["./qasper-train.json", "./qasper-valid.json"],
            "n_samples": 800 + (500 - 142),  # 1158 total
            "longbench": False
        },
        "multi-hop-qa": {
            "files": ["./qa-expert-multi-hop-qa-train.json", "./qa-expert-multi-hop-qa-valid.json"],
            "n_samples": 1200,
            "longbench": False
        },
        "piqa": {
            "files": ["./piqa-train.json", "./piqa-valid.json"],
            "n_samples": 1200,
            "longbench": False
        },
        "allenai/ai2_arc": {
            "files": ["./ai2_arc-train.json", "./ai2_arc-valid.json"],
            "n_samples": 700,
            "longbench": False
        },
        "tulu-3-sft-mixture": {
            "files": ["./tulu-3-sft-mixture-train.json"],
            "n_samples": 800,
            "longbench": False
        },
        "LongBench": {
            "files": [
                "./long-qasper-test.json",
                "./long-multifieldqa_en-test.json",
                "./long-hotpotqa-test.json",
                "./long-2wikimqa-test.json"
            ],
            "n_samples": 142,
            "longbench": True
        },
        
    }

    final_data = []
    for dataset_name, config in sampling_config.items():
        print(f"Processing {dataset_name}...")
        samples = load_and_sample_datasets(
            file_paths=config["files"],
            n_samples=config["n_samples"],
            longbench_mode=config["longbench"]
        )
        final_data.extend(samples)
        print(f"Added {len(samples)} samples from {dataset_name}")

    random.shuffle(final_data)

    df = pd.DataFrame(final_data)
    with open(output_file, 'wb') as f:
        f.write(orjson.dumps(json.loads(df.to_json(orient='records'))))
    
    print(f"\nFinished! Created dataset with {len(df)} records")
    print("Dataset composition:")
    for dataset_name in sampling_config:
        count = sum(1 for item in final_data if dataset_name in item["source"])
        print(f"- {dataset_name}: {count} samples ({count/len(final_data)*100:.1f}%)")

create_final_dataset()

Processing qasper...
Added 1158 samples from qasper
Processing multi-hop-qa...
Added 1200 samples from multi-hop-qa
Processing piqa...
Added 1200 samples from piqa
Processing allenai/ai2_arc...
Added 700 samples from allenai/ai2_arc
Processing tulu-3-sft-mixture...
Added 800 samples from tulu-3-sft-mixture
Processing LongBench...
Added 142 samples from LongBench

Finished! Created dataset with 5200 records
Dataset composition:
- qasper: 1197 samples (23.0%)
- multi-hop-qa: 1200 samples (23.1%)
- piqa: 1200 samples (23.1%)
- allenai/ai2_arc: 700 samples (13.5%)
- tulu-3-sft-mixture: 800 samples (15.4%)
- LongBench: 142 samples (2.7%)
