In [3]:
from datasets import load_dataset, Dataset

# Load the `daily_dialog` dataset; the run recorded below generated train,
# validation and test splits.
dataset = load_dataset("daily_dialog")

Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
def make_chatml(name, role, content):
    """Build a single ChatML-style message as a plain dict.

    Args:
        name: Label stored under ``"name"`` (may be ``None``).
        role: Value stored under ``"role"``; the helpers in this cell use
            ``"system"``, ``"assistant"`` and ``"user"``.
        content: Message text stored under ``"content"``.

    Returns:
        ``{"name": name, "role": role, "content": content}``.
    """
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a ``"system"`` message labelled with ``name``."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build a system message named ``"situation"``."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named ``"thought"``."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named ``"information"``."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build an ``"assistant"`` message, optionally tagged with a speaker name."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a ``"user"`` message, optionally tagged with a speaker name."""
    return make_chatml(role="user", content=content, name=name)


In [5]:

# `daily_dialog` fields:
# dialog: a list of string features.
# act: a list of classification labels, with possible values including __dummy__ (0), inform (1), question (2), directive (3) and commissive (4).
# emotion: a list of classification labels, with possible values including no emotion (0), anger (1), disgust (2), fear (3), happiness (4), sadness (5) and surprise (6).

# Human-readable label names, indexed by the integer class ids listed above.
act_map = [
    "neutral",     # 0: __dummy__
    "inform",      # 1
    "question",    # 2
    "directive",   # 3
    "commissive",  # 4
]
emotion_map = [
    "neutral",    # 0: no emotion
    "anger",      # 1
    "disgust",    # 2
    "fear",       # 3
    "happiness",  # 4
    "sadness",    # 5
    "surprise",   # 6
]

def entry_to_chatml(row):
    """Turn one `daily_dialog` row into an annotated ChatML message list.

    Args:
        row: Mapping with parallel sequences under ``"dialog"`` (utterances),
            ``"act"`` and ``"emotion"`` (integer indices into ``act_map`` and
            ``emotion_map``).

    Returns:
        ``{"chatml": messages}``. ``messages`` opens with a ``situation``
        message describing the annotation task. Each utterance then adds two
        messages: a ``person`` message named ``"A"`` or ``"B"`` (alternating,
        starting with ``"A"``) and a ``thought`` message carrying the
        capitalized act and emotion labels.

    Utterances are paired with their labels via ``zip``, so iteration stops at
    the shortest of the three sequences.
    """
    utterances = row["dialog"]
    act_ids = row["act"]
    emotion_ids = row["emotion"]

    # NOTE(review): the prompt text is reproduced verbatim, including the
    # "reponse" spelling, so the generated samples stay unchanged.
    task_prompt = (
        "Two persons `A` and `B` are talking to each other. An AI Model is observing the conversation in order to analyze and annotate it."
        " After every reponse from either `A` or `B`, the AI model needs to think about that response and write down the 'Dialog Act' and the 'Emotion' expressed in that response as a valid yaml object."
        f"\nDialog Acts should be one of: {act_map!s}"
        f"\nEmotions should be one of: {emotion_map!s}"
    )

    speakers = ("A", "B")
    messages = [situation(task_prompt)]

    for position, (text, act_id, emotion_id) in enumerate(
        zip(utterances, act_ids, emotion_ids)
    ):
        act_label = act_map[act_id]
        emotion_label = emotion_map[emotion_id]

        # Even positions belong to speaker A, odd positions to speaker B.
        messages.append(person(name=speakers[position % 2], content=text))
        messages.append(thought(
            "Dialog act analysis of the previous utterance as valid yaml:"
            f"\n\nDialog Act: {act_label.capitalize()}"
            f"\nEmotion: {emotion_label.capitalize()}"
        ))

    return dict(chatml=messages)

In [6]:
# Convert every row to ChatML and drop the raw source columns in the same
# `map` pass instead of a separate `remove_columns` call.
# Caveat: this rebinds `dataset`. Re-running this cell afterwards fails because
# `entry_to_chatml` reads the columns removed here -- reload the dataset first.
dataset = dataset.map(
    entry_to_chatml,
    remove_columns=['dialog', 'act', 'emotion'],
)

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
# Speaker label written in a section header for each non-system role.
assistant_me_map = dict(user="person", assistant="me")


def make_sections(messages: list[dict]) -> str:
    """Render messages as ``<|im_start|>`` / ``<|im_end|>`` delimited sections.

    Each section is a header line followed by the message content with
    surrounding whitespace stripped. A system message uses its ``name`` as the
    header. Any other role uses its label from ``assistant_me_map``, followed
    by `` (name)`` when the message has a truthy ``name``.

    Args:
        messages: Dicts with ``"role"``, ``"name"`` and ``"content"`` keys.

    Returns:
        All sections joined by a newline. An empty list yields a single empty
        section (``"<|im_start|><|im_end|>"``).

    Raises:
        KeyError: If a non-system role is missing from ``assistant_me_map``.
    """
    open_tag = "<|im_start|>"
    close_tag = "<|im_end|>"

    rendered = []
    for msg in messages:
        if msg['role'] == 'system':
            header = f"{msg['name']}"
        else:
            header = f"{assistant_me_map[msg['role']]}"
            if msg['name']:
                header += ' (' + msg['name'] + ')'

        rendered.append(header + f"\n{msg['content'].strip()}")

    # Consecutive sections are separated by "<close>\n<open>"; the outermost
    # tags are added once around the whole string.
    separator = close_tag + '\n' + open_tag
    return open_tag + separator.join(rendered) + close_tag
    

def transform_to_samantha_dialog(sample):
    """Attach the rendered dialog text to a dataset row.

    Renders the message list under ``sample["chatml"]`` with ``make_sections``
    and stores the result under ``sample["text"]``. The row is updated in
    place and also returned.
    """
    sample["text"] = make_sections(sample["chatml"])
    return sample

# Add a `text` value to every row, holding the string rendered from the row's
# `chatml` messages.
dataset = dataset.map(transform_to_samantha_dialog)

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
# Upload the processed dataset to the Hub repo "diwank/daily_dialog-chatml" as
# a private dataset.
# NOTE(review): presumably needs Hugging Face credentials with write access
# configured beforehand -- no login step is shown in this notebook.
dataset.push_to_hub("diwank/daily_dialog-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/634 [00:00<?, ?B/s]