In [1]:
from datasets import load_dataset, Dataset

# Load CoQA through the Hugging Face `datasets` loader. The first call downloads
# and prepares the data; later calls reuse the local cache (see the log output below).
dataset = load_dataset("coqa")

Downloading builder script:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.03k [00:00<?, ?B/s]

Downloading and preparing dataset coqa/default to /home/diwank/.cache/huggingface/datasets/coqa/default/1.0.0/1b03a32914e882ed315577005c472665e542419f910bab445815ad1929a7958f...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/49.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7199 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset coqa downloaded and prepared to /home/diwank/.cache/huggingface/datasets/coqa/default/1.0.0/1b03a32914e882ed315577005c472665e542419f910bab445815ad1929a7958f. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
def make_chatml(name, role, content):
    """Build a single ChatML-style message.

    Args:
        name: Speaker/section name, or None when the message is unnamed.
        role: Role string, e.g. "system", "user" or "assistant".
        content: Message text.

    Returns:
        A dict with the keys "name", "role" and "content" (in that order).
    """
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a message with role "system" and the given section name."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build a system message named "situation"."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named "thought"."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named "information"."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message with role "assistant", optionally tagged with a name."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a message with role "user", optionally tagged with a name."""
    return make_chatml(role="user", content=content, name=name)

def entry_to_chatml(row):
    """Convert one CoQA row into a ChatML-style conversation.

    Args:
        row: Mapping with the keys "source", "story", "questions" (a list of
            question strings) and "answers" (a dict of parallel lists
            "input_text", "answer_start" and "answer_end").

    Returns:
        A dict with a single key "chatml": a list of message dicts that starts
        with a situation message and the passage, followed, per question, by
        the user question, an optional "thought" quoting the supporting span,
        and the assistant answer.
    """
    source = row["source"]
    story = row["story"]
    questions = row["questions"]

    # {
    #   "input_text": [...],
    #   "answer_start": [...],
    #   "answer_end": [...],
    # }
    answers_with_annotations = row["answers"]
    answers = answers_with_annotations["input_text"]
    story_spans = list(zip(
        answers_with_annotations["answer_start"],
        answers_with_annotations["answer_end"],
    ))

    # Start preparing chatml
    situation_content = (
        "User is talking to an AI Large Language Model and asking it comprehension-style questions about a given passage."
        f" The passage is a story/article from '{source}' data."
    )

    chatml = [
        situation(situation_content),
        information(f"Passage:\n\n{story}")
    ]

    # Add the turns one by one
    for i, question in enumerate(questions):
        chatml.append(person(question))

        start, end = story_spans[i]
        # Python slicing treats negative offsets as "from the end", so a span
        # such as (-1, -1) would silently yield "" (or unrelated text) rather
        # than evidence. Only quote the passage when the span is a real,
        # non-empty forward range.
        # NOTE(review): CoQA is believed to mark unanswerable questions with
        # -1 offsets -- confirm against the dataset card.
        snippet = story[start:end] if 0 <= start < end else ""
        if snippet.strip():
            chatml.append(thought(
                f'I think that the answer to this question can be inferred from the following line from the passage:\n"{snippet}"'
            ))

        chatml.append(me(name="AI", content=answers[i]))

    return dict(chatml=chatml)

In [17]:
# Add a `chatml` column to every row, then drop the raw CoQA columns.
# NOTE: this rebinds `dataset`, and entry_to_chatml reads the very columns removed
# here, so re-running this cell without first reloading the dataset will fail.
dataset = dataset.map(entry_to_chatml).remove_columns(['source', 'story', 'questions', 'answers'])

Map:   0%|          | 0/7199 [00:00<?, ? examples/s]

In [19]:
# Labels written into the rendered transcript for the non-system roles.
assistant_me_map = dict(user="person", assistant="me")

def make_sections(messages: list[dict]) -> str:
    """Render a list of message dicts as one delimited transcript string.

    Each message becomes a header line followed by its stripped content,
    wrapped in "<|im_start|>" / "<|im_end|>" markers; consecutive messages are
    separated by a newline. System messages use their name as the header;
    other roles use the label from `assistant_me_map`, followed by
    " (<name>)" when the message has a truthy name.
    """
    open_tag = "<|im_start|>"
    close_tag = "<|im_end|>"

    rendered = []
    for msg in messages:
        if msg['role'] == 'system':
            header = f"{msg['name']}"
        else:
            header = f"{assistant_me_map[msg['role']]}"
            if msg['name']:
                header += ' (' + msg['name'] + ')'

        body = msg['content'].strip()
        rendered.append(f"{header}\n{body}")

    # Close the previous section and open the next one between every pair.
    separator = close_tag + '\n' + open_tag
    return open_tag + separator.join(rendered) + close_tag
    

def transform_to_samantha_dialog(sample):
    """Attach the rendered transcript to a sample.

    Reads the message list under "chatml", stores the string produced by
    `make_sections` under "text" on the same mapping, and returns that mapping.
    """
    sample["text"] = make_sections(sample["chatml"])
    return sample

# Add a `text` column holding the rendered transcript of each row's `chatml` messages.
dataset = dataset.map(transform_to_samantha_dialog)

Map:   0%|          | 0/7199 [00:00<?, ? examples/s]

In [20]:
# Upload the processed dataset to the Hugging Face Hub as a private repository.
# NOTE(review): presumably requires an authenticated Hub session with write access — confirm.
dataset.push_to_hub("diwank/coqa-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]