In [1]:
from datasets import load_dataset

# Load the "additional" configuration of OpenBookQA: later cells read the
# fact1, humanScore and clarity columns from every row.
dataset_path = "openbookqa"
dataset_config = "additional"
openbookqa = load_dataset(dataset_path, dataset_config)

Downloading builder script:   0%|          | 0.00/6.49k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.69k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.45M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4957 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [2]:
def _is_clear_question(row):
    """Return True for rows worth keeping in the subset.

    A row is kept when its ``humanScore`` is at least 1.0, its ``clarity``
    is at least 1.5, and its stripped ``question_stem`` ends with ``?``.
    The checks run in that order and stop at the first failure.
    """
    if not row["humanScore"] >= 1.0:
        return False
    if not row["clarity"] >= 1.5:
        return False
    return row["question_stem"].strip().endswith('?')


openbookqa_subset = openbookqa.filter(_is_clear_question)

Filter:   0%|          | 0/4957 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [3]:
def make_chatml(name, role, content):
    """Build one ChatML message as a dict with ``name``, ``role`` and ``content`` keys."""
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a message with role ``"system"`` and the given ``name``."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build a system message named ``"situation"``."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named ``"thought"``."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named ``"information"``."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message with role ``"assistant"``; ``name`` defaults to None."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a message with role ``"user"``; ``name`` defaults to None."""
    return make_chatml(role="user", content=content, name=name)


In [9]:
def get_answer_text(choices, answer_key):
    """Return the choice text whose label equals ``answer_key``.

    Args:
        choices: Mapping with parallel ``"text"`` and ``"label"`` sequences.
        answer_key: Label of the choice to look up.

    Returns:
        The text of the first choice whose label matches ``answer_key``.

    Raises:
        ValueError: If no label matches ``answer_key``. Raising here stops a
            silent ``None`` from being formatted into the answer string.
    """
    for choice, label in zip(choices["text"], choices["label"]):
        if label == answer_key:
            return choice

    raise ValueError(
        f"answer key {answer_key!r} not found among choice labels "
        f"{list(choices['label'])!r}"
    )

def to_chatml(row):
    """Convert one OpenBookQA row into a four-message ChatML conversation.

    Reads ``question_stem``, ``choices``, ``fact1`` and ``answerKey`` from
    ``row`` and produces, in order: a situation message describing the quiz
    setup, the person's multiple-choice question, a thought message carrying
    the supporting fact, and the assistant's answer.

    Args:
        row: Mapping for a single example; ``row["choices"]`` holds parallel
            ``"label"`` and ``"text"`` sequences.

    Returns:
        A dict with a single ``"chatml"`` key holding the list of messages.
    """
    question = row["question_stem"]
    choices_data = row["choices"]
    fact = row["fact1"]
    answer_key = row["answerKey"]

    # One "<label>) <text>" line per choice, in the order given by the row.
    choices = "\n".join(
        f"{label}) {text}"
        for label, text in zip(choices_data["label"], choices_data["text"])
    )

    situation_text = (
        "A person is talking to an intelligent AI assistant."
        " The person is quizzing the AI assistant on miscellaneous questions"
        " in a multiple choice format."
        " The assistant first thinks carefully about the question and lays down its reasoning."
        " Then answers the question with the correct choice key and the right answer."
    )

    thought_text = f"Reasoning:\n\n{fact}"
    answer_text = get_answer_text(choices_data, answer_key)
    person_text = f"Question: {question}\nChoices:\n{choices}"
    me_text = f"The correct answer is: {answer_key}. {answer_text}"

    chatml = [
        situation(situation_text),
        person(person_text),
        thought(thought_text),
        me(me_text),
    ]

    return dict(chatml=chatml)


In [10]:

# Columns dropped after mapping so that only the generated 'chatml' column remains.
source_columns = [
    'id',
    'question_stem',
    'choices',
    'answerKey',
    'fact1',
    'humanScore',
    'clarity',
    'turkIdAnonymized',
]

# Add the 'chatml' column to every split, then strip the source columns.
openbookqa_mapped = openbookqa_subset.map(to_chatml)
openbookqa_chatml = openbookqa_mapped.remove_columns(source_columns)
print(openbookqa_chatml)

Map:   0%|          | 0/1021 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['chatml'],
        num_rows: 1021
    })
    validation: Dataset({
        features: ['chatml'],
        num_rows: 82
    })
    test: Dataset({
        features: ['chatml'],
        num_rows: 106
    })
})


In [11]:
# Upload the converted dataset to the Hub, requesting a private repository.
hub_repo_id = "diwank/openbookqa_chatml"
openbookqa_chatml.push_to_hub(hub_repo_id, private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/759 [00:00<?, ?B/s]