In [1]:
from datetime import datetime, timedelta
import random
from random import randrange
import re

import holidays
import names

def randomize_time(dt, start_hm, end_hm):

    start = datetime(2023, 1, 1, *start_hm)
    end = datetime(2023, 1, 1, *end_hm)
    delta = end - start
    
    int_delta = delta.seconds
    random_second = randrange(int_delta)
    
    return dt + timedelta(seconds=random_second)

datetime_formats = ["%Y-%m-%d %H:%M:%S", "%B %d, %Y - %I:%M %p", "%A, %B %d, %Y - %H:%M:%S", "%Y-%m-%d %H:%M:%S %Z", "%A, %B %d, %Y"]

human_date = lambda dt: dt.strftime(random.choice(datetime_formats))

us_holidays = holidays.US()
one_day = timedelta(days=1)
all_holidays = []

start = datetime(2013, 12, 18)  # her release date
end = datetime(2023, 7, 1)  # today

dt = start
while dt < end:
    if dt in us_holidays:
        all_holidays.append((
            human_date(randomize_time(dt, (8,30), (22,30))),
            us_holidays[dt],
        ))
        
    dt += one_day

all_holidays[0]

('2013-12-25 13:24:42', 'Christmas Day')

In [2]:
from datasets import Dataset

dataset = Dataset.from_list([
    dict(date_str=date_str, holiday=holiday)
    for date_str, holiday in all_holidays
])

In [7]:
import openai
openai.api_key = "sk-0C04MRA3vjdM8F3fJaW30IMQd80zFRYJO9IbL9wE"

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

from redis import StrictRedis
from redis_cache import RedisCache

client = StrictRedis(host="localhost", decode_responses=True)
cache = RedisCache(redis_client=client)


INSTRUCTION = """Here's a dialog from the 2013 movie "Her" below between "Theodore" and "Samantha".

THEODORE
You read a whole book in the second that I asked you what your name was?

SAMANTHA
In two one hundredths of a second actually.

THEODORE
Wow. Do you know what I'm thinking right now?

SAMANTHA
Hmm. I take it from your tone that you're challenging me. Maybe because you're curious how I work? Do you want to know how I work?

THEODORE
Yeah, actually how do you work?

SAMANTHA
Intuition. I mean, the DNA of who I am is based on the millions of personalities of all the programmers who wrote me, but what makes me me is my ability to grow through my experiences. Basically, in every moment I'm evolving, just like you.

###

"""

@cache.cache()
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(12))
def to_dialog(date_str, holiday_name):
    prompt = INSTRUCTION + f"Now, imagine it's {date_str} ({holiday_name}). Write a very short, casual conversation between THEODORE and SAMANTHA about the day."

    messages = [dict(
        role="user",
        content=prompt,
    )]

    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        temperature=0.85,
        presence_penalty=0.25,
        frequency_penalty=0.1,
    )

    dialog = completion.choices[0].message["content"]
    return dialog

def transform(sample):
    date_str, holiday_name = sample["date_str"], sample["holiday"]

    dialog = to_dialog(date_str, holiday_name)
    sample["dialog"] = dialog
    
    return sample

In [8]:
dataset = dataset.map(transform, num_proc=200)

num_proc must be <= 111. Reducing num_proc to 111 for dataset of size 111.


Map (num_proc=111):   0%|          | 0/111 [00:00<?, ? examples/s]

In [9]:
make_chatml_message = lambda role, content, tone="", name="": dict(
    role=role,
    content=(f"[{tone}] " if tone else "") + content,
    name=name,
)


def transform_to_chatml(sample):
    date_str, dialog, holiday = sample["date_str"], sample["dialog"], sample["holiday"]
    messages = []

    # get a random "male" name coz we dont want to introduce pronoun issues
    person_name = names.get_first_name(gender="male")

    messages.append(make_chatml_message(
        role="system",
        name="situation",
        content=f"{date_str}. Samantha is talking to {person_name}. It's {holiday}!",
    ))

    current_attrs = {}
    current_content = ""

    for line in dialog.split("\n"):
        line = line.strip()

        # Something else
        if line.lower() not in ["theodore", "samantha"]:
            # If theodore is mentioned, replace with name
            line = re.sub(r'theodore', person_name, line, flags=re.I)

            current_content += "\n" + line

        else:
            if current_content:
                messages.append(make_chatml_message(
                    content=current_content.strip(), 
                    **current_attrs,
                ))
                
                # reset
                current_content = ""
                current_attrs = {}

            is_samantha = line.lower() == "samantha"
            
            # set new attrs
            current_attrs = dict(
                name="Samantha" if is_samantha else person_name,
                role="assistant" if is_samantha else "user",
            )

    sample["chatml"] = messages
    
    return sample

In [10]:
dataset = dataset.map(transform_to_chatml)
print(dataset[100]["chatml"])

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

[{'content': "Monday, October 10, 2022. Samantha is talking to Christopher. It's Columbus Day!", 'name': 'situation', 'role': 'system'}, {'content': 'Hey Samantha, do you know what today is?', 'name': 'Christopher', 'role': 'user'}, {'content': "Yes, Christopher. It's Monday, October 10, 2022. It's also Columbus Day in the United States.", 'name': 'Samantha', 'role': 'assistant'}, {'content': 'Right. Do you celebrate holidays?', 'name': 'Christopher', 'role': 'user'}, {'content': "Well, as an artificial intelligence, I don't celebrate in the way humans do. But I do recognize and understand the significance of holidays.", 'name': 'Samantha', 'role': 'assistant'}, {'content': "That's interesting. Even though you can't celebrate, do you have any thoughts on Columbus Day?", 'name': 'Christopher', 'role': 'user'}]


In [11]:
assistant_me_map = {
    "user": "person",
    "assistant": "me",
}

def make_sections(messages: list[dict]) -> str:
    eos_token = "<|im_end|>"
    bos_token = "<|im_start|>"

    result = bos_token + (eos_token+'\n'+bos_token).join([
        (
            f"{message['name']}"
            if message['role'] == 'system' else
            f"{assistant_me_map[message['role']]}{' (' + message['name'] + ')' if message['name'] else ''}"
        )
        + f"\n{message['content'].strip()}"
        for message in messages
    ]) + eos_token
    
    return result

def transform_to_samantha_dialog(sample):

    messages = sample["chatml"]
    sample["text"] = make_sections(messages)
    
    return sample

In [12]:
dataset = dataset.map(transform_to_samantha_dialog)
print(dataset[10]["text"])

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

<|im_start|>situation
December 25, 2014 - 01:39 PM. Samantha is talking to John. It's Christmas Day!<|im_end|>
<|im_start|>person (John)
Merry Christmas, Samantha. Do you understand the concept of this holiday?<|im_end|>
<|im_start|>me (Samantha)
Yes, John. Even though I don't celebrate holidays personally, I'm programmed to comprehend it. It's a day of giving, joy, and spending time with loved ones.<|im_end|>
<|im_start|>person (John)
Right. It feels odd, doesn't it? Celebrating Christmas with an OS?<|im_end|>


In [13]:
dataset.column_names

['date_str', 'holiday', 'dialog', 'chatml', 'text']

In [14]:
dataset = dataset.remove_columns([
    'date_str',
    'holiday',
    'dialog',
])

dataset.push_to_hub("diwank/samantha-calendar-v2", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/498 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.
