In [1]:
from datasets import load_dataset, Dataset, concatenate_datasets

In [2]:
# Load the "train" split of every CAMEL-AI subject dataset, keyed by subject.
dataset_names = ["physics", "math", "biology"]
datasets = dict(
    (subject, load_dataset(f"camel-ai/{subject}")["train"])
    for subject in dataset_names
)

Found cached dataset json (/home/diwank/.cache/huggingface/datasets/camel-ai___json/camel-ai--physics-c2d5e46897f076a2/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (/home/diwank/.cache/huggingface/datasets/camel-ai___json/camel-ai--math-aa720b4f0bc06fb8/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (/home/diwank/.cache/huggingface/datasets/camel-ai___json/camel-ai--biology-a0dba0b3efd07bc4/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Number of distinct sub-topics per discipline.
dict(
    (subject, len(set(subject_ds["sub_topic"])))
    for subject, subject_ds in datasets.items()
)

{'physics': 623, 'math': 620, 'biology': 624}

In [4]:
# Maximum number of rows to keep for each sub-topic.
num_items = 3

# Round-trip through pandas to take the first `num_items` rows of every
# 'sub_topic' group, then convert back to a Hugging Face Dataset.
truncated_datasets = dict(
    (
        subject,
        Dataset.from_pandas(
            subject_ds.to_pandas()
            .groupby('sub_topic')
            .apply(lambda group: group.head(num_items))
            .reset_index(drop=True)
        ),
    )
    for subject, subject_ds in datasets.items()
)

In [5]:
# Tag every row with the subject it came from so the origin survives the
# concatenation performed later.
truncated_datasets = dict(
    (subject, subject_ds.map(lambda row: {"subject": subject}))
    for subject, subject_ds in truncated_datasets.items()
)

Map:   0%|          | 0/1869 [00:00<?, ? examples/s]

Map:   0%|          | 0/1860 [00:00<?, ? examples/s]

Map:   0%|          | 0/1872 [00:00<?, ? examples/s]

In [6]:
# Merge the per-subject datasets into one and shuffle with a fixed seed.
joined = concatenate_datasets(
    [*truncated_datasets.values()]
).shuffle(seed=42)

In [8]:
def to_dialog(row):
    """Rewrite one CAMEL row as a Theodore/Samantha dialog using GPT-4.

    Builds a prompt from the row's ``sub_topic``, ``subject``, ``message_1``
    (used as the title) and ``message_2`` (used as the passage), sends it to
    the OpenAI chat completion endpoint, and stores the returned text under
    ``row["dialog"]``.

    The request is retried with random exponential backoff (1-10 s, at most
    12 attempts) and the result is memoized through ``RedisCache`` against a
    Redis server on ``localhost``.

    All imports are function-local so the function is self-contained when it
    is shipped to ``Dataset.map`` worker processes.

    Args:
        row: mapping with the keys ``sub_topic``, ``subject``, ``message_1``
            and ``message_2``.

    Returns:
        The same ``row`` with an added ``"dialog"`` entry.

    Raises:
        KeyError: if the ``OPENAI_API_KEY`` environment variable is not set,
            or if one of the required row keys is missing.
    """
    import os
    from textwrap import dedent

    import openai
    from redis import StrictRedis
    from redis_cache import RedisCache
    from tenacity import (
        retry,
        stop_after_attempt,
        wait_random_exponential,
    )

    # Credentials must never live in the notebook source: read the key from
    # the environment and fail loudly (KeyError) when it is missing.
    openai.api_key = os.environ["OPENAI_API_KEY"]

    subtopic = row["sub_topic"]
    premise = row["message_1"]
    explanation = row["message_2"]
    subject = row["subject"]

    # NOTE(review): the f-string is interpolated *before* dedent() runs, so if
    # `premise`/`explanation` contain a line with no leading whitespace the
    # common margin is empty and the prompt keeps its 4-space indentation.
    # Left unchanged on purpose: the prompt text is the cache key input, and
    # altering it would invalidate previously cached completions.
    INSTRUCTION = dedent(f"""
    Please help me rewrite the passage below in the style of the characters "Theodore" and "Samantha" from the movie "Her". Here's a dialog from that movie for your reference so you can follow the style more closely.

    ###

    [Example Dialog]

    THEODORE
    You read a whole book in the second that I asked you what your name was?

    SAMANTHA
    In two one hundredths of a second actually.

    THEODORE
    Wow. Do you know what I'm thinking right now?

    SAMANTHA
    Hmm. I take it from your tone that you're challenging me. Maybe because you're curious how I work? Do you want to know how I work?

    THEODORE
    Yeah, actually how do you work?

    SAMANTHA
    Intuition. I mean, the DNA of who I am is based on the millions of personalities of all the programmers who wrote me, but what makes me me is my ability to grow through my experiences. Basically, in every moment I'm evolving, just like you.

    ###

    Instructions:
    - Try to be faithful to the characters' tone and style as much as possible.
    - Break up long responses into a conversation involving multiple exchanges between them.
    - The conversation should be fluid and inquisitive.
    
    Situation:
    Theodore would like Samantha to explain '{subtopic}' to him. He only has a very basic grasp on {subject.capitalize()}.
    
    Title: {premise}
    
    Passage:
    {explanation}
    """).strip()

    # A client/cache pair is created on every call; this keeps each worker
    # process self-contained at the cost of one connection setup per row.
    client = StrictRedis(host="localhost", decode_responses=True)
    cache = RedisCache(redis_client=client)

    # Cache is the outer decorator so a cached result skips the retry loop
    # (and the API call) entirely.
    @cache.cache()
    @retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(12))
    def completion_with_backoff(sample):
        """Send `sample` as the user message and return the model's reply text."""
        messages = [
            dict(
                role="system",
                content=f'The user is trying to rewrite a passage about "{subject.capitalize()}" in the style of the 2013 movie "Her".'
            ),
            dict(
                role="user",
                content=sample,
            ),
        ]

        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages,
            temperature=0.85,
        )

        return completion.choices[0].message["content"]

    row["dialog"] = completion_with_backoff(INSTRUCTION)
    return row

In [9]:
# Generate a dialog for every row, fanning the API calls out over 40 worker
# processes.
joined = joined.map(
    function=to_dialog,
    num_proc=40,
)

Map (num_proc=40):   0%|          | 0/5601 [00:00<?, ? examples/s]

In [26]:
import re

import names

def make_chatml_message(role, content, tone="", name=""):
    """Build one ChatML-style message dict.

    Args:
        role: value stored under ``"role"``.
        content: message text.
        tone: optional tone tag; when non-empty it is prepended to the
            content as ``"[tone] "``.
        name: value stored under ``"name"``.

    Returns:
        A dict with the keys ``role``, ``content`` and ``name``.
    """
    return dict(
        role=role,
        content=(f"[{tone}] " if tone else "") + content,
        name=name,
    )


# Matches inline speaker labels such as "Samantha: text" (case-insensitive).
speaker_pattern = re.compile(r'(samantha|theodore): (.+)', re.I)


def transform_to_chatml(sample, person_name=None):
    """Convert the generated dialog of one sample into a list of ChatML messages.

    The dialog text in ``sample["dialog"]`` is normalized so that every
    speaker label sits on its own line, anything before the first upper-case
    speaker label is stripped, and "Theodore" is replaced by `person_name`.
    The resulting turns are stored under ``sample["chatml"]``, preceded by a
    system message describing the situation.

    Args:
        sample: mapping with the keys ``topic;``, ``sub_topic``, ``subject``
            and ``dialog``.
        person_name: name that replaces "Theodore". When ``None`` (default) a
            random female first name is drawn from the ``names`` package, so
            the output is not reproducible unless a name is supplied (e.g.
            through ``Dataset.map(..., fn_kwargs=...)``).

    Returns:
        The same ``sample`` with an added ``"chatml"`` list of message dicts.
    """
    topic = sample["topic;"]
    subtopic = sample["sub_topic"]
    subject = sample["subject"]

    dialog = sample["dialog"]
    # "Samantha: text" -> speaker label on its own line, text on the next.
    dialog = re.sub(speaker_pattern, r"\1\n\2", dialog)

    # Normalize alternative speaker labels the model sometimes produces.
    dialog = dialog.replace("HUMAN\n", "THEODORE\n")
    dialog = dialog.replace("AI\n", "SAMANTHA\n")
    dialog = dialog.replace("THEODORE (human)", "THEODORE")
    dialog = dialog.replace("THEODORE (HUMAN)", "THEODORE")
    dialog = dialog.replace("SAMANTHA (AI)", "SAMANTHA")

    # Drop any preamble before the first upper-case speaker label.
    dialog = re.sub(r'^.*?(THEODORE|SAMANTHA)', r'\1', dialog, flags=re.DOTALL)

    # Use a random *female* name: the system message below refers to the
    # person as "her", so the name must agree with that pronoun.
    if person_name is None:
        person_name = names.get_first_name(gender="female")

    # Container for the messages, starting with the system message.
    messages = [make_chatml_message(
        role="system",
        name="situation",
        content=f"Samantha and {person_name} are talking about {subject.capitalize()}. {person_name} would like Samantha to explain '{subtopic}' to her. {person_name} only has a very basic grasp on {topic.capitalize()}.",
    )]

    current_attrs = {}
    current_lines = []

    def flush_turn():
        # Text seen before any speaker label has no role/name to attach to,
        # so it is dropped instead of raising inside make_chatml_message.
        if current_attrs and current_lines:
            messages.append(make_chatml_message(
                content="\n".join(current_lines).strip(),
                **current_attrs,
            ))

    for line in dialog.split("\n"):
        line = line.strip()
        # Skip blank lines and bracketed lines such as "[Example Dialog]".
        if not line or line.endswith(']'):
            continue

        if line.lower() in ("theodore", "samantha"):
            # A new speaker label closes the previous turn.
            flush_turn()

            is_samantha = line.lower() == "samantha"
            current_attrs = dict(
                name="Samantha" if is_samantha else person_name,
                role="assistant" if is_samantha else "user",
            )
            current_lines = []
        else:
            # If theodore is mentioned, replace with the chosen name. A
            # callable replacement keeps backslashes in the name literal.
            current_lines.append(
                re.sub(r'theodore', lambda _match: person_name, line, flags=re.I)
            )

    # Flush the final turn: no speaker label follows it, so without this the
    # last message of every dialog would be lost.
    flush_turn()

    sample["chatml"] = messages

    return sample

In [27]:
# Convert every generated dialog into a list of ChatML messages.
transformed = joined.map(
    function=transform_to_chatml,
)

Map:   0%|          | 0/5601 [00:00<?, ? examples/s]

In [28]:
# Sanity check: pretty-print the ChatML messages produced for the first row.
from pprint import pprint
pprint(transformed[0]["chatml"])

[{'content': 'Samantha and Sharon are talking about Physics. Sharon would like '
             "Samantha to explain 'The Jaynes-Cummings model and cavity "
             "quantum electrodynamics' to her. Sharon only has a very basic "
             'grasp on Quantum mechanics.',
  'name': 'situation',
  'role': 'system'},
 {'content': "Samantha, I've been trying to understand something... it's "
             'called the Jaynes-Cummings model. And it has to do with cavity '
             'quantum electrodynamics. But everything is just too complex for '
             'me. Can you make it simpler?',
  'name': 'Sharon',
  'role': 'user'},
 {'content': 'I can certainly try, Sharon. The Jaynes-Cummings model is like a '
             'framework. It helps us understand how light and matter interact '
             'inside an optical cavity. Specifically, it describes this dance '
             'between a single light mode and a two-level atom inside that '
             'cavity.',
  'name': 'Samantha

In [31]:
# Shuffle with a fixed seed, drop the listed source and intermediate columns,
# and upload the result to the Hugging Face Hub as a private dataset.
(
    transformed
    .shuffle(seed=42)
    .remove_columns([
        'role_1',
        'topic;',
        'sub_topic',
        'message_1',
        'message_2',
        'subject',
        'dialog',
    ])
    .push_to_hub("diwank/camel-subjects", private=True)
)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/726 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.
