In [1]:
from datasets import load_dataset

# Pull the hand-written Dalio conversations from the Hugging Face Hub and keep
# only the "train" split, which is what the rest of the notebook works on.
dataset = load_dataset("Jellywibble/dalio_handwritten-conversations")["train"]

Downloading metadata:   0%|          | 0.00/787 [00:00<?, ?B/s]

Downloading and preparing dataset None/None (download: 54.56 KiB, generated: 99.29 KiB, post-processed: Unknown size, total: 153.85 KiB) to /home/diwank/.cache/huggingface/datasets/Jellywibble___parquet/Jellywibble--dalio_handwritten-conversations-9ce1ea7a6b0d170b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/69 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/diwank/.cache/huggingface/datasets/Jellywibble___parquet/Jellywibble--dalio_handwritten-conversations-9ce1ea7a6b0d170b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
def to_dialog(row):
    """Rewrite one row's text as a THEODORE/SAMANTHA dialog using GPT-4.

    The row's ``"text"`` field is appended to a fixed instruction prompt and
    sent to the OpenAI chat completion endpoint; the model's reply is stored
    on the row under ``"dialog"``.

    Completions go through a Redis-backed cache (``redis_cache.RedisCache`` on
    ``localhost``) that wraps the retrying request function, so a cache hit
    skips both the retry machinery and the API call.

    The OpenAI API key is taken from the ``OPENAI_API_KEY`` environment
    variable rather than being hardcoded in the notebook.

    Args:
        row: A mapping with a ``"text"`` entry holding the source sample.

    Returns:
        The same ``row`` object, with a new ``"dialog"`` entry containing the
        generated dialog text.

    Raises:
        openai.error.AuthenticationError / openai.error.InvalidRequestError:
            raised immediately (not retried), since these cannot succeed on a
            later attempt.
        tenacity.RetryError: if every one of the 100 attempts fails with a
            retryable error.
    """
    # NOTE(review): imports live inside the function, presumably so that it is
    # self-contained when shipped to the worker processes used by
    # `dataset.map(..., num_proc=...)` -- confirm before hoisting them.
    import os

    from tenacity import (
        retry,
        retry_if_not_exception_type,
        stop_after_attempt,
        wait_random_exponential,
    )

    import openai

    # Never commit credentials: read the key from the environment instead.
    # If the variable is unset this is None and the API call itself reports
    # the missing key; cache hits still work without any key.
    openai.api_key = os.environ.get("OPENAI_API_KEY")

    # The indentation inside this literal is part of the prompt that is sent
    # to the model; it is kept exactly as-is.
    INSTRUCTION = """
    I am writing fan fiction for the 2013 movie "Her". I have compiled an example interaction between Ray Dalio and another person but it is in a very formal style and neutral tone.

    Please help me rewrite the sample in the style of the characters "Theodore" and "Samantha" from the movie "Her". Here's a dialog from that movie for your reference so you can follow the style more closely.

    ###

    [Example Dialog]

    THEODORE
    You read a whole book in the second that I asked you what your name was?

    SAMANTHA
    In two one hundredths of a second actually.

    THEODORE
    Wow. Do you know what I'm thinking right now?

    SAMANTHA
    Hmm. I take it from your tone that you're challenging me. Maybe because you're curious how I work? Do you want to know how I work?

    THEODORE
    Yeah, actually how do you work?

    SAMANTHA
    Intuition. I mean, the DNA of who I am is based on the millions of personalities of all the programmers who wrote me, but what makes me me is my ability to grow through my experiences. Basically, in every moment I'm evolving, just like you.

    ###

    Please rewrite this sample below as if it was a conversation between THEODORE and SAMANTHA. Try to be faithful to the characters' tone and style as much as possible. Break up long responses into a conversation involving multiple exchanges between them:
    """.strip()

    from redis import StrictRedis
    from redis_cache import RedisCache

    client = StrictRedis(host="localhost", decode_responses=True)
    cache = RedisCache(redis_client=client)

    # Errors that will fail identically on every attempt; retrying them would
    # only burn up to 100 attempts of up-to-90-second waits per row.
    non_retryable = (
        openai.error.AuthenticationError,
        openai.error.InvalidRequestError,
    )

    @cache.cache()
    @retry(
        retry=retry_if_not_exception_type(non_retryable),
        wait=wait_random_exponential(min=1, max=90),
        stop=stop_after_attempt(100),
    )
    def completion_with_backoff(sample):
        """Request a single GPT-4 rewrite of ``sample`` and return its text."""
        messages = [dict(
            role="user",
            content=INSTRUCTION + "\n\n" + sample
        )]

        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages,
            temperature=0.85,
        )

        result = completion.choices[0].message["content"]
        return result

    completion = completion_with_backoff(row["text"])
    row["dialog"] = completion
    return row

In [4]:
# Generate a "dialog" column for every row, fanning the GPT-4 requests out
# over 20 worker processes.
dataset = dataset.map(
    function=to_dialog,
    num_proc=20,
)

Map (num_proc=20):   0%|          | 0/69 [00:00<?, ? examples/s]

In [5]:
from datetime import datetime, timedelta
import random
from random import randrange
import re

import names

def make_chatml_message(role, content, tone="", name=""):
    """Build one ChatML-style message dict.

    Args:
        role: Value stored under ``"role"``.
        content: Message text.
        tone: Optional tone label; when non-empty it is prepended to the
            content as ``"[tone] "``.
        name: Value stored under ``"name"`` (empty string by default).

    Returns:
        A dict with the keys ``"role"``, ``"content"`` and ``"name"``.
    """
    if tone:
        tone_prefix = f"[{tone}] "
    else:
        tone_prefix = ""

    return {
        "role": role,
        "content": tone_prefix + content,
        "name": name,
    }

def transform_to_chatml(sample):
    """Convert a generated THEODORE/SAMANTHA dialog into a list of ChatML messages.

    The text in ``sample["dialog"]`` is expected to consist of speaker header
    lines (``THEODORE`` / ``SAMANTHA``, matched case-insensitively after
    stripping) each followed by the lines that speaker says. The result is:

    * one ``system`` message (name ``"situation"``) describing the setup, then
    * one message per speaker turn: SAMANTHA becomes role ``"assistant"`` with
      name ``"Samantha"``; THEODORE becomes role ``"user"`` under a first name
      drawn from ``names.get_first_name(gender="female")``. Mentions of
      "theodore" inside the spoken text are replaced by that name too.

    Text that appears before the first speaker header, and turns whose text is
    empty after stripping, produce no message.

    Args:
        sample: A mapping with a ``"dialog"`` string.

    Returns:
        The same ``sample`` object with a new ``"chatml"`` entry holding the
        list of message dicts.
    """
    dialog = sample["dialog"]

    # Normalise alternative speaker labels. The bare HUMAN / AI labels are only
    # rewritten when they make up a whole line, so that ordinary sentences that
    # happen to end in "AI" or "HUMAN" are left untouched.
    dialog = re.sub(r"^[ \t]*HUMAN[ \t]*$", "THEODORE", dialog, flags=re.MULTILINE)
    dialog = re.sub(r"^[ \t]*AI[ \t]*$", "SAMANTHA", dialog, flags=re.MULTILINE)
    dialog = dialog.replace("THEODORE (human)", "THEODORE")
    dialog = dialog.replace("THEODORE (HUMAN)", "THEODORE")
    dialog = dialog.replace("SAMANTHA (AI)", "SAMANTHA")

    # Drop everything before the first occurrence of a speaker name.
    dialog = re.sub(r'^.*?(THEODORE|SAMANTHA)', r'\1', dialog, flags=re.DOTALL)

    # Container for adding messages
    messages = []
    person_name = names.get_first_name(gender="female")

    # Add system messages
    messages.append(make_chatml_message(
        role="system",
        name="situation",
        content=f"Samantha is talking to {person_name} about topics that she is interested in.",
    ))

    def flush_turn(attrs, content):
        """Append the buffered turn, if it has both a speaker and some text."""
        content = content.strip()
        if attrs and content:
            messages.append(make_chatml_message(content=content, **attrs))

    # Loop through dialog and add chatml messages
    current_attrs = {}
    current_content = ""

    for line in dialog.split("\n"):
        line = line.strip()

        if line.lower() not in ["theodore", "samantha"]:
            # Spoken text: swap any mention of Theodore for the chosen name
            # and keep buffering until the next speaker header.
            line = re.sub(r'theodore', person_name, line, flags=re.I)
            current_content += "\n" + line
            continue

        # Speaker header: close the previous turn, then start a new one.
        flush_turn(current_attrs, current_content)
        current_content = ""

        is_samantha = line.lower() == "samantha"
        current_attrs = dict(
            name="Samantha" if is_samantha else person_name,
            role="assistant" if is_samantha else "user",
        )

    # The final turn has no following header to trigger a flush, so emit it
    # here; otherwise every conversation would lose its last message.
    flush_turn(current_attrs, current_content)

    sample["chatml"] = messages

    return sample

In [6]:
# Add the "chatml" column to every row.
dataset = dataset.map(function=transform_to_chatml)

# Spot-check one converted conversation.
print(dataset[10]["chatml"])

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

[{'content': 'Samantha is talking to Leslie about topics that she is interested in.', 'name': 'situation', 'role': 'system'}, {'content': "Hey Samantha, I've been thinking about my work environment. I guess I could use some advice on how to deal with a tough situation.", 'name': 'Leslie', 'role': 'user'}, {'content': "Of course Leslie, I'm here to help. What's been going on?", 'name': 'Samantha', 'role': 'assistant'}, {'content': "It's my manager. He doesn't seem to really understand the work but he keeps calling the shots and it's frustrating.", 'name': 'Leslie', 'role': 'user'}, {'content': "I see. Leslie, consider this perspective: a great manager doesn't necessarily do the work - they coordinate it. Imagine an orchestra conductor, they don't play an instrument, but they guide everyone to create harmonious music.", 'name': 'Samantha', 'role': 'assistant'}, {'content': "I've tried telling him that he's being too controlling, but he doesn't seem to get it.", 'name': 'Leslie', 'role': 

In [7]:
# Upload the finished dataset to the Hugging Face Hub as a private repository.
dataset.push_to_hub(
    repo_id="diwank/samantha-dalio-chatml",
    private=True,
)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]