In [1]:
from datasets import load_dataset, Dataset

# Identifier of the source dataset handed to `load_dataset`.
DATASET_NAME = "ambig_qa"

dataset = load_dataset(DATASET_NAME)

In [2]:
def make_chatml(name, role, content):
    """Build a single ChatML message.

    Returns a plain dict with the keys `name`, `role` and `content`
    (inserted in that order) holding the given values unchanged.
    """
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a message with role "system" and the given `name`."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build a system message named "situation"."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named "thought"."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named "information"."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message with role "assistant"; `name` defaults to None."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a message with role "user"; `name` defaults to None."""
    return make_chatml(role="user", content=content, name=name)


In [3]:
import json
# Static system prompt describing the conversation setup; identical for every row.
_SITUATION_CONTENT = (
    "User is talking to an AI Large Language Model and asking it general knowledge question."
    " The AI must not use prior knowledge or any piece of information not explicitly retrieved from reliable sources."
    "\n\nIn order to call a function, AI just needs to specify the name of the function to call and its arguments as a valid JSON string."
)

# Signature of the single tool advertised to the model, in JSON-schema form.
_LOOKUP_FUNCTION_SCHEMA = {
    "name": "lookup_info",
    "description": "In order to ensure that the answer is based on factual, up-to-date information, use this tool for fetching relevant information before answering.",
    "parameters": {
        "type": "object",
        "properties": {
            "queries": {
                "type": "array",
                "items": {
                    "type": "string",
                },
            },
        },
    },
}

# Rendered once at definition time: the schema does not depend on the row.
_FUNCTIONS_CONTENT = (
    "Available functions and their signatures as JSON schema:\n\n"
    f"```\n{json.dumps(_LOOKUP_FUNCTION_SCHEMA, indent=4)}\n```"
)


def _format_lookup_result(annotations, question, nq_doc_title):
    """Render the text returned by the simulated `lookup_info` call.

    Only the first annotation of the row is used; any further ones are ignored.

    Args:
        annotations: Mapping with parallel sequences under "type", "answer"
            and "qaPairs". Each "qaPairs" entry is itself a mapping with
            "question" (sequence of str) and "answer" (sequence of sequences
            of str).
        question: The row's original question (used for "singleAnswer").
        nq_doc_title: Source document title (used for "singleAnswer").

    Raises:
        ValueError: If there are no annotations or the first annotation type
            is neither "multipleQAs" nor "singleAnswer".
    """
    annotation_types = annotations["type"]
    if len(annotation_types) == 0:
        # Previously surfaced as an opaque "not enough values to unpack" error.
        raise ValueError("Row has no annotations; cannot build a lookup result")
    annot_type = annotation_types[0]

    if annot_type == "multipleQAs":
        [first_qa_pairs, *_] = annotations["qaPairs"]
        # One "Q/A" paragraph per disambiguated question; alternative answers
        # to the same question are comma-joined.
        return "\n\n".join(
            f"Q: {q}\nA: {', '.join(answers)}"
            for q, answers in zip(first_qa_pairs["question"], first_qa_pairs["answer"])
        )

    if annot_type == "singleAnswer":
        [first_answers, *_] = annotations["answer"]
        lookup_answer = ", ".join(first_answers)
        return f"Source: {nq_doc_title}\nQ: {question}\nA: {lookup_answer}"

    raise ValueError(f"Not supported annot_type: {annot_type}")


def entry_to_chatml(row):
    """Convert one dataset row into a ChatML conversation demonstrating a tool call.

    The conversation is, in order: the situation prompt, the function
    signature listing, the user's question, a "thought" about needing more
    information, a `lookup_info` function call, the lookup result, and the
    assistant's final answer.

    Args:
        row: Mapping providing "viewed_doc_titles", "nq_doc_title",
            "annotations", "question" and "nq_answer".

    Returns:
        A dict with a single key "chatml" holding the list of message dicts.

    Raises:
        KeyError: If `row` lacks one of the keys listed above.
        ValueError: If the row has no annotations or its first annotation has
            an unsupported type.
    """
    viewed_doc_titles = row["viewed_doc_titles"]
    nq_doc_title = row["nq_doc_title"]
    annotations = row["annotations"]
    question = row["question"]
    nq_answer = row["nq_answer"]

    lookup_result = _format_lookup_result(annotations, question, nq_doc_title)

    # A fresh dict per row so callers never share (and mutate) one message object.
    functions = dict(
        role="system",
        name="functions",
        content=_FUNCTIONS_CONTENT,
    )

    thought_content = (
        f'Thought:\n\nIn order to answer this question, I need more information about "{nq_doc_title}".'
    )

    # The simulated call queries every viewed document title plus the question itself.
    lookup_items = [*viewed_doc_titles, question]
    payload = json.dumps({
        "name": "lookup_info",
        "arguments": {
            "queries": lookup_items,
        },
    })
    function_call = dict(role="function_call", content=payload)

    information_content = (
        f"`lookup_info` Result:\n\n{lookup_result}"
    )

    chatml = [
        situation(_SITUATION_CONTENT),
        functions,
        person(question),
        thought(thought_content),
        function_call,
        information(information_content),
        me(name="AI", content=f"The correct answer is: {', '.join(nq_answer)}"),
    ]

    return dict(chatml=chatml)

In [4]:
# Add the `chatml` column, then drop the listed source columns.
# This rebinds `dataset`, and the dropped columns are the ones `entry_to_chatml`
# reads, so re-running this cell requires re-running the loading cell first.
raw_source_columns = [
    'id',
    'question',
    'annotations',
    'viewed_doc_titles',
    'used_queries',
    'nq_answer',
    'nq_doc_title',
]
dataset = (
    dataset
    .map(entry_to_chatml)
    .remove_columns(raw_source_columns)
)

Map:   0%|          | 0/10036 [00:00<?, ? examples/s]

Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

In [5]:
# Inspect the conversation generated for the first training example.
first_train_example = dataset['train'][0]
first_train_example['chatml']

[{'content': 'User is talking to an AI Large Language Model and asking it general knowledge question. The AI must not use prior knowledge or any piece of information not explicitly retrieved from reliable sources.\n\nIn order to call a function, AI just needs to specify the name of the function to call and its arguments as a valid JSON string.',
  'name': 'situation',
  'role': 'system'},
 {'content': 'Available functions and their signatures as JSON schema:\n\n```\n{\n    "name": "lookup_info",\n    "description": "In order to ensure that the answer is based on factual, up-to-date information, use this tool for fetching relevant information before answering.",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "queries": {\n                "type": "array",\n                "items": {\n                    "type": "string"\n                }\n            }\n        }\n    }\n}\n```',
  'name': 'functions',
  'role': 'system'},
 {'content': 'When did th

In [6]:
# Upload the converted dataset to the Hub under this repo id, marked private.
hub_repo_id = "diwank/ambig_qa-chatml"
dataset.push_to_hub(hub_repo_id, private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/568 [00:00<?, ?B/s]