In [1]:
from datasets import load_dataset, Dataset

# Fetch the role-play dataset from the Hugging Face Hub; the result is indexed
# by split name further down (e.g. gpt_roleplay["en"]).
gpt_roleplay = load_dataset("IlyaGusev/gpt_roleplay_realm")

Downloading readme:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/195M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/202M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating en split:   0%|          | 0/216 [00:00<?, ? examples/s]

Generating ru split:   0%|          | 0/219 [00:00<?, ? examples/s]

In [2]:
# Only the "en" split of the loaded dataset is processed below.
gpt_roleplay_en = gpt_roleplay["en"]

def make_chatml(name, role, content):
    """Build a single ChatML-style message dict with `name`, `role` and `content` keys."""
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a message whose role is "system", labelled with the given `name`."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build a system message named "situation"."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named "thought"."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named "information"."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message with role "assistant"; `name` is optional."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a message with role "user"; `name` is optional."""
    return make_chatml(role="user", content=content, name=name)



In [3]:

def entry_to_chatml(
    name,
    context,
    greeting,
    dialog,
    topic=None,
):
    """Convert one role-play conversation into a list of ChatML-style messages.

    Args:
        name: Character name; used in the situation text and as the `name`
            of every assistant message.
        context: Character description placed at the start of the situation text.
        greeting: The character's opening line (first assistant message).
        dialog: Iterable of dicts with "role" and "content" keys. Messages
            whose role is "char" become assistant messages; every other
            role becomes a user message.
        topic: Optional topic string, or list/tuple of topic strings, appended
            to the situation text as a bulleted list. Falsy values are ignored.

    Returns:
        A list of message dicts: the situation message, the greeting, then
        one message per entry of `dialog`, in order.
    """
    # Prepare situation context
    situation_context = f"{context} {name} is talking to a person."
    if topic:
        # A collection of topics is rendered as one bullet per topic.
        if isinstance(topic, (list, tuple)):
            topic = "\n- ".join(topic)

        # The leading space keeps this sentence from running straight into
        # the "... is talking to a person." sentence above.
        situation_context += f" {name} and the person are talking about:\n- {topic}"

    # Prepare chatml
    chatml = [
        situation(situation_context),
        me(name=name, content=greeting),
    ]

    for message in dialog:
        if message["role"] == "char":
            chatml.append(me(
                name=name,
                content=message["content"],
            ))

        else:
            chatml.append(person(content=message["content"]))

    return chatml

In [4]:
def map_chatml_batch(batch):
    """Expand a batch of character rows into one output row per conversation.

    For every character the example dialogue yields one conversation (without
    a topic), followed by one conversation per entry of that character's
    `dialogues` (each given the character's full `topics` value).

    Returns a dict of equal-length columns: `chatml` holds the conversations,
    while every other column is filled with `None` placeholders of the same
    length.
    """
    # Batched input columns (each is indexed by row position within the batch).
    names = batch["name"]
    contexts = batch["context"]
    greetings = batch["greeting"]
    example_dialogues = batch["example_dialogue"]
    topics = batch["topics"]
    dialogue_groups = batch["dialogues"]

    # One conversation per character, built from its example dialogue.
    example_rows = [
        entry_to_chatml(
            name=names[row],
            context=contexts[row],
            greeting=greetings[row],
            dialog=example,
            topic=None,
        )
        for row, example in enumerate(example_dialogues)
    ]

    # One conversation per recorded dialogue of each character.
    topical_rows = [
        entry_to_chatml(
            name=names[row],
            context=contexts[row],
            greeting=greetings[row],
            dialog=dialogue["chat"],
            topic=topics[row],
        )
        for row, group in enumerate(dialogue_groups)
        for dialogue in group
    ]

    conversations = example_rows + topical_rows

    # Every returned column must have the same length as `chatml`.
    placeholder = [None] * len(conversations)
    placeholder_columns = (
        "name",
        "context",
        "greeting",
        "example_dialogue",
        "topics",
        "dialogues",
        "image",
        "image_prompt",
        "char_id",
    )
    return dict(chatml=conversations, **dict.fromkeys(placeholder_columns, placeholder))



In [5]:
# Columns that map_chatml_batch fills with None placeholders; they are
# dropped right after the map so only `chatml` remains.
source_columns = [
    "name",
    "context",
    "greeting",
    "example_dialogue",
    "topics",
    "dialogues",
    "image",
    "image_prompt",
    "char_id",
]

dataset = gpt_roleplay_en.map(map_chatml_batch, batched=True).remove_columns(source_columns)

Map:   0%|          | 0/216 [00:00<?, ? examples/s]

In [6]:
# Section label used in the rendered text for each non-system chat role.
assistant_me_map = dict(user="person", assistant="me")

def make_sections(messages: list[dict]) -> str:
    """Render messages as `<|im_start|>header\\ncontent<|im_end|>` sections.

    The header of a system message is its `name`. For any other role the
    header is the label from `assistant_me_map`, followed by ` (name)` when
    the message has a truthy `name`. Content is stripped of surrounding
    whitespace and sections are separated by a newline.
    """
    im_start = "<|im_start|>"
    im_end = "<|im_end|>"
    section_break = im_end + "\n" + im_start

    rendered = []
    for message in messages:
        role = message["role"]
        if role == "system":
            header = f"{message['name']}"
        else:
            header = assistant_me_map[role]
            speaker_name = message["name"]
            if speaker_name:
                header = header + " (" + speaker_name + ")"

        rendered.append(header + "\n" + message["content"].strip())

    # The opening and closing tokens are always emitted, even for no messages.
    return im_start + section_break.join(rendered) + im_end

    

def transform_to_samantha_dialog(sample):
    """Add a `text` field holding the rendered form of the sample's `chatml` messages.

    The sample is updated in place and also returned.
    """
    sample["text"] = make_sections(sample["chatml"])
    return sample

# Apply transform_to_samantha_dialog to each row (non-batched map), which adds
# a `text` column next to `chatml`.
dataset = dataset.map(transform_to_samantha_dialog)

Map:   0%|          | 0/4536 [00:00<?, ? examples/s]

In [7]:
# Upload the processed dataset to the Hub; private=True requests a private repo.
dataset.push_to_hub("diwank/gpt_roleplay_realm-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/507 [00:00<?, ?B/s]