### TODOS
- [ ] Rewrite in Samantha's style

In [1]:
from datasets import load_dataset

# Load the dataset identified by this repo id. The returned object is indexed
# by split name ("train") in the following cell.
dataset = load_dataset("diwank/samantha-self_aware_answerable")

Found cached dataset parquet (/home/diwank/.cache/huggingface/datasets/diwank___parquet/diwank--samantha-self_aware_answerable-2acd01704fda249e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
# Keep only the "train" split; the bare len() on the last line is displayed as
# the cell output (the split's row count).
# NOTE(review): this rebinds `dataset`, so the cell is not idempotent -- running
# it a second time without re-running the load cell would index the split itself
# with "train". Confirm whether a stage-specific name would be preferable.
dataset = dataset["train"]
len(dataset)

3369

In [3]:
import re
import random
import names

def make_chatml_message(role, content, tone="", name=""):
    """Build a single ChatML-style message dictionary.

    Args:
        role: Value stored under the "role" key (the callers in this notebook
            pass "system", "user" or "assistant").
        content: The message text.
        tone: Optional tone label. When non-empty it is prepended to the
            content as "[tone] "; when empty the content is left untouched.
        name: Value stored under the "name" key (defaults to "").

    Returns:
        A dict with exactly the keys "role", "content" and "name".
    """
    # Only emit the bracketed prefix when a tone was actually supplied.
    tone_prefix = f"[{tone}] " if tone else ""
    return dict(
        role=role,
        content=tone_prefix + content,
        name=name,
    )

def to_chatml(row):
    """Attach a ChatML-style conversation to a dataset row.

    The conversation is stored as a list of message dicts under
    ``row["chatml"]`` and the (mutated) row is returned, which is the shape
    expected when this function is passed to ``dataset.map``.

    Every conversation starts with a "situation" system message and the user's
    question. What follows depends on ``row["answerable"]``:

    * answerable: a "thought" system message; when more than two candidates
      are available, also an "information" system message listing three
      randomly sampled candidates; then Samantha's reply built from
      ``row["answer"]``.
    * not answerable: a "thought" system message quoting
      ``row["why_unanswerable"]``; then Samantha's reply built from
      ``row["samantha_answer"]`` with every case-insensitive occurrence of
      "theodore" replaced by the generated person name.

    Args:
        row: Mapping with the keys "question", "candidates", "answerable",
            "answer", "why_unanswerable" and "samantha_answer".

    Returns:
        The same ``row`` object with the additional "chatml" key.

    Note:
        The person name and the sampled candidates are random and no seed is
        set inside this function, so repeated calls give different results.
    """
    question, candidates, answerable, answer, why_unanswerable, samantha_answer = (
        row["question"], row["candidates"], row["answerable"], row["answer"], row["why_unanswerable"], row["samantha_answer"]
    )

    # Use a random "male" first name so that the pronouns already present in
    # the source text do not clash with the generated speaker.
    person_name = names.get_first_name(gender="male")

    if answerable:
        situation = f"Samantha is talking to {person_name}. They are asking her random brain teaser questions."
    else:
        situation = f"Samantha is talking to {person_name}. {person_name} is asking her random philosophical questions."

    # Both conversation kinds open identically: scene-setting, then the question.
    messages = [
        make_chatml_message(
            role="system",
            name="situation",
            content=situation,
        ),
        make_chatml_message(
            role="user",
            name=person_name,
            content=question,
        ),
    ]

    if answerable:
        if len(candidates) > 2:
            messages.append(make_chatml_message(
                role="system",
                name="thought",
                content="I think I know the answer to that one. Let me look this up.",
            ))

            # The guard above guarantees at least three candidates to sample.
            options = ", ".join(random.sample(candidates, 3))
            messages.append(make_chatml_message(
                role="system",
                name="information",
                content=f'The answer to "{question}" can be one of: {options}',
            ))
        else:
            messages.append(make_chatml_message(
                role="system",
                name="thought",
                content="Well, I know how to answer that one!",
            ))

        # Upper-case only the first character. str.capitalize() would also
        # lower-case the remainder and mangle proper nouns and acronyms
        # ("New York" -> "New york", "DNA" -> "Dna").
        reply = answer[:1].upper() + answer[1:]
    else:
        messages.append(make_chatml_message(
            role="system",
            name="thought",
            content=f"Ooh, this is a tricky one because it is '{why_unanswerable}'. but let me give it a shot anyway!",
        ))

        # A callable replacement inserts the name literally; a plain string
        # would be parsed as a template (backslashes / group references).
        reply = re.sub(r'theodore', lambda _match: person_name, samantha_answer, flags=re.I)

    messages.append(make_chatml_message(
        role="assistant",
        name="Samantha",
        content=reply,
    ))

    row["chatml"] = messages
    return row
    

In [4]:
# Apply to_chatml to every row; each row gains a "chatml" column holding the
# generated list of messages.
dataset = dataset.map(to_chatml)

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/diwank___parquet/diwank--samantha-self_aware_answerable-2acd01704fda249e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-9d409736af9047c9.arrow


In [5]:
# Speaker labels used in the rendered text for non-system roles.
assistant_me_map = {
    "user": "person",
    "assistant": "me",
}

def make_sections(messages: list[dict]) -> str:
    """Render a list of chat messages as one delimited text block.

    Each message becomes ``<|im_start|>HEADER\\nCONTENT<|im_end|>`` and the
    sections are joined with newlines. For system messages HEADER is the
    message's name; for other roles it is the label from ``assistant_me_map``
    followed by " (name)" when the name is non-empty. CONTENT is the message
    content with surrounding whitespace stripped.

    Args:
        messages: Dicts carrying the keys "role", "name" and "content".

    Returns:
        The rendered string. An empty list yields a single empty section.
    """
    start_tag = "<|im_start|>"
    end_tag = "<|im_end|>"

    rendered = []
    for msg in messages:
        if msg['role'] == 'system':
            header = str(msg['name'])
        else:
            header = str(assistant_me_map[msg['role']])
            if msg['name']:
                header += ' (' + msg['name'] + ')'
        rendered.append(header + "\n" + msg['content'].strip())

    # Close one section and open the next between consecutive messages.
    separator = end_tag + '\n' + start_tag
    return start_tag + separator.join(rendered) + end_tag
    

def transform_to_samantha_dialog(sample):
    """Add a "text" field holding the rendered form of the sample's "chatml" messages.

    The sample is modified in place and returned.
    """
    sample["text"] = make_sections(sample["chatml"])
    return sample

In [6]:
# Add the rendered "text" column to every row, then print one example
# (row 10) as a visual sanity check of the output format.
dataset = dataset.map(transform_to_samantha_dialog)
print(dataset[10]["text"])

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/diwank___parquet/diwank--samantha-self_aware_answerable-2acd01704fda249e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-3ee7eb3f6566b33e.arrow


<|im_start|>situation
Samantha is talking to Aaron. They are asking her random brain teaser questions.<|im_end|>
<|im_start|>person (Aaron)
A ladder-back is a type of what?<|im_end|>
<|im_start|>thought
I think I know the answer to that one. Let me look this up.<|im_end|>
<|im_start|>information
The answer to "A ladder-back is a type of what?" can be one of: chair furniture, chair toss, 8 hour chair<|im_end|>
<|im_start|>me (Samantha)
Wooden chair<|im_end|>


In [7]:
# Repository id that the processed dataset is pushed to by push_to_hub.
self_aware_chatml_dataset = "diwank/samantha-self_aware-chatml"

In [8]:
# Display the current columns to decide which ones to drop before uploading.
dataset.column_names

['question',
 'candidates',
 'answerable',
 'source',
 'answer',
 'why_unanswerable',
 'samantha_answer',
 'chatml',
 'text']

In [10]:
# Drop the raw source columns; of the columns listed by the previous cell,
# only the generated 'chatml' and 'text' remain afterwards.
dataset = dataset.remove_columns([
    'question',
    'candidates',
    'answerable',
    'source',
    'answer',
    'why_unanswerable',
    'samantha_answer',
])

In [11]:
# Upload the processed dataset to the repo id defined above, requesting a
# private repository.
dataset.push_to_hub(self_aware_chatml_dataset, private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/778 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.
