In [1]:
from datasets import load_dataset, Dataset

# Pin the "full" configuration explicitly. Without a config name `load_dataset`
# falls back to ambig_qa/full anyway, but only after printing a
# "No config specified" notice, and the choice is invisible to the reader.
dataset = load_dataset("ambig_qa", "full")

No config specified, defaulting to: ambig_qa/full
Found cached dataset ambig_qa (/home/diwank/.cache/huggingface/datasets/ambig_qa/full/1.0.0/6425acf3572d4caf508123e5443753ab7ff415564753ae326ae801a0a1aa155e)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def make_chatml(name, role, content):
    """Build one ChatML message as a dict with ``name``, ``role`` and ``content`` keys."""
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a message with role ``"system"`` and the given section ``name``."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build a system message named ``"situation"``."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named ``"thought"``."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named ``"information"``."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message with role ``"assistant"``; ``name`` is optional."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a message with role ``"user"``; ``name`` is optional."""
    return make_chatml(role="user", content=content, name=name)

def entry_to_chatml(row):
    """Convert one AmbigQA row into a ChatML conversation that uses a `Lookup` tool.

    Args:
        row: Mapping with the keys ``viewed_doc_titles``, ``nq_doc_title``,
            ``annotations``, ``question`` and ``nq_answer``. ``annotations`` is
            read column-wise (``type``, ``qaPairs``, ``answer``) and only its
            first entry is used.

    Returns:
        ``{"chatml": messages}`` where ``messages`` holds five entries: the
        situation, the user's question, a thought containing the tool call,
        the lookup result, and the assistant's answer built from ``nq_answer``.

    Raises:
        ValueError: If the first annotation type is neither ``"multipleQAs"``
            nor ``"singleAnswer"``.
    """
    viewed_doc_titles = row["viewed_doc_titles"]
    nq_doc_title = row["nq_doc_title"]
    annotations = row["annotations"]
    question = row["question"]
    nq_answer = row["nq_answer"]

    # Only the first annotation of the row is used.
    annot_type, *_ = annotations["type"]

    if annot_type == "multipleQAs":
        qa_pairs, *_ = annotations["qaPairs"]
        # Each disambiguated question may have several accepted answers.
        lookup_result = "\n\n".join(
            f"Q: {q}\nA: {', '.join(answers)}"
            for q, answers in zip(qa_pairs["question"], qa_pairs["answer"])
        )

    elif annot_type == "singleAnswer":
        lookup_answer, *_ = annotations["answer"]
        lookup_answer = ", ".join(lookup_answer)
        lookup_result = f"Source: {nq_doc_title}\nQ: {question}\nA: {lookup_answer}"

    else:
        raise ValueError(f"Not supported annot_type: {annot_type}")

    # Start preparing chatml
    situation_content = (
        "User is talking to an AI Large Language Model and asking it general knowledge question."
        " The AI must not use prior knowledge or any piece of information not explicitly retrieved from reliable sources."
        " In order to ensure that the answer is based on factual, up-to-date information, the AI must make a query to `Lookup` tool for fetching relevant information before answering."
    )

    # Quote every argument individually so that an empty `viewed_doc_titles`
    # yields `Lookup("<question>")` rather than a spurious `""` first argument.
    lookup_args = [*viewed_doc_titles, question]
    lookup_query = ", ".join(f'"{arg}"' for arg in lookup_args)

    thought_content = (
        f'In order to answer this question, I need more information about "{nq_doc_title}".'
        f"\nUse Tool: Lookup({lookup_query})"
    )

    information_content = f"Lookup Result:\n\n{lookup_result}"

    chatml = [
        situation(situation_content),
        person(question),
        thought(thought_content),
        information(information_content),
        me(name="AI", content=f"The correct answer is: {', '.join(nq_answer)}"),
    ]

    return dict(chatml=chatml)

In [3]:
# Build the `chatml` column, then drop the raw AmbigQA columns it was derived from.
dataset = (
    dataset
    .map(entry_to_chatml)
    .remove_columns([
        "id",
        "question",
        "annotations",
        "viewed_doc_titles",
        "used_queries",
        "nq_answer",
        "nq_doc_title",
    ])
)

Map:   0%|          | 0/10036 [00:00<?, ? examples/s]

Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

In [4]:
# Inspect the generated conversation for the first training example.
dataset['train'][0]['chatml']

[{'content': 'User is talking to an AI Large Language Model and asking it general knowledge question. The AI must not use prior knowledge or any piece of information not explicitly retrieved from reliable sources. In order to ensure that the answer is based on factual, up-to-date information, the AI must make a query to `Lookup` tool for fetching relevant information before answering.',
  'name': 'situation',
  'role': 'system'},
 {'content': 'When did the simpsons first air on television?',
  'name': None,
  'role': 'user'},
 {'content': 'In order to answer this question, I need more information about "The Simpsons".\nUse Tool: Lookup("The Simpsons", "When did the simpsons first air on television?")',
  'name': 'thought',
  'role': 'system'},
 {'content': 'Lookup Result:\n\nQ: When did the Simpsons first air on television as an animated short on the Tracey Ullman Show?\nA: April 19, 1987\n\nQ: When did the Simpsons first air as a half-hour prime time show?\nA: December 17, 1989',
  'n

In [5]:
# Speaker labels written into the rendered text for the non-system ChatML roles.
assistant_me_map = dict(user="person", assistant="me")

def make_sections(messages: list[dict]) -> str:
    """Render ChatML messages as ``<|im_start|>`` / ``<|im_end|>`` delimited sections.

    Each section is a header line followed by the stripped message content.
    For ``system`` messages the header is the message ``name``; for any other
    role it is the label from ``assistant_me_map``, followed by ``" (name)"``
    when the message has a truthy ``name``. Sections are joined by a newline.
    """
    start_tag, end_tag = "<|im_start|>", "<|im_end|>"

    sections = []
    for msg in messages:
        if msg['role'] == 'system':
            header = f"{msg['name']}"
        else:
            header = f"{assistant_me_map[msg['role']]}"
            if msg['name']:
                header += ' (' + msg['name'] + ')'
        sections.append(header + f"\n{msg['content'].strip()}")

    separator = end_tag + '\n' + start_tag
    return start_tag + separator.join(sections) + end_tag
    

def transform_to_samantha_dialog(sample):
    """Attach the rendered dialog text to a dataset row.

    Reads ``sample["chatml"]``, stores the result of ``make_sections`` under
    ``sample["text"]`` (mutating ``sample`` in place) and returns that same object.
    """
    sample["text"] = make_sections(sample["chatml"])
    return sample

# Add the flattened `text` column (rendered from `chatml`) to every example.
dataset = dataset.map(transform_to_samantha_dialog)

Map:   0%|          | 0/10036 [00:00<?, ? examples/s]

Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

In [6]:
# Upload the processed dataset to the Hugging Face Hub as a private repository.
# NOTE(review): presumably requires an authenticated Hub session — confirm before re-running.
dataset.push_to_hub("diwank/ambig_qa-chatml", private=True)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]