In [1]:
from datasets import load_dataset

# Load the source mental-health Q&A dataset by its Hugging Face Hub identifier.
dataset = load_dataset("heliosbrahma/mental_health_chatbot_dataset")

Found cached dataset parquet (/home/diwank/.cache/huggingface/datasets/heliosbrahma___parquet/heliosbrahma--mental_health_chatbot_dataset-55294e53c7a5aca1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
def to_dialog(row):
    """Rewrite one dataset row as a THEODORE/SAMANTHA dialog using GPT-4.

    ``row["text"]`` is appended to a fixed style-transfer prompt, sent to the
    OpenAI chat completion endpoint, and the reply is stored under
    ``row["dialog"]``.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
    variable rather than being hardcoded in the notebook.

    Args:
        row: Mapping with a ``"text"`` entry holding the sample to rewrite.

    Returns:
        The same ``row`` object with a ``"dialog"`` entry added.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    import os

    # Fail fast on a missing key: inside the retried call below, an
    # authentication failure would be retried up to 100 times with backoff.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable must be set before calling to_dialog"
        )

    # NOTE(review): the third-party imports and the Redis client are created
    # inside the function, presumably so that every `map` worker process
    # builds its own copy -- confirm before hoisting them to module level.
    from tenacity import (
        retry,
        stop_after_attempt,
        wait_random_exponential,
    )

    import openai
    openai.api_key = api_key

    # The prompt lines keep the 4-space indentation of this function body;
    # only the outer whitespace is removed by .strip().
    INSTRUCTION = """
    I am writing fan fiction for the 2013 movie "Her". I have compiled an example instruction to AI Assistant from a human and the corresponding response from the AI but it is in a very formal style and neutral tone.

    Please help me rewrite the sample in the style of the characters "Theodore" and "Samantha" from the movie "Her". Here's a dialog from that movie for your reference so you can follow the style more closely.

    ###

    [Example Dialog]

    THEODORE
    You read a whole book in the second that I asked you what your name was?

    SAMANTHA
    In two one hundredths of a second actually.

    THEODORE
    Wow. Do you know what I'm thinking right now?

    SAMANTHA
    Hmm. I take it from your tone that you're challenging me. Maybe because you're curious how I work? Do you want to know how I work?

    THEODORE
    Yeah, actually how do you work?

    SAMANTHA
    Intuition. I mean, the DNA of who I am is based on the millions of personalities of all the programmers who wrote me, but what makes me me is my ability to grow through my experiences. Basically, in every moment I'm evolving, just like you.

    ###

    Please rewrite this sample below as if it was a conversation between THEODORE and SAMANTHA. Try to be faithful to the characters' tone and style as much as possible. Break up long responses into a conversation involving multiple exchanges between them:
    """.strip()


    from redis import StrictRedis
    from redis_cache import RedisCache

    client = StrictRedis(host="localhost", decode_responses=True)
    cache = RedisCache(redis_client=client)

    # The cache decorator is outermost, so it wraps the retried call: a value
    # is stored only once the retry loop has returned a completion.
    @cache.cache()
    @retry(wait=wait_random_exponential(min=1, max=90), stop=stop_after_attempt(100))
    def completion_with_backoff(sample):
        """Request a GPT-4 rewrite of ``sample`` and return the reply text."""
        messages = [dict(
            role="user",
            content=INSTRUCTION + "\n\n" + sample
        )]

        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages,
            temperature=0.25,
        )

        result = completion.choices[0].message["content"]
        return result

    completion = completion_with_backoff(row["text"])
    row["dialog"] = completion
    return row

In [3]:
# Apply to_dialog to every row (adds the "dialog" column), passing num_proc=20 to map.
dataset = dataset.map(to_dialog, num_proc=20)

Map (num_proc=20):   0%|          | 0/172 [00:00<?, ? examples/s]

In [4]:
from datetime import datetime, timedelta
import random
from random import randrange
import re

import names


def random_date(start, end):
    """
    Pick a random datetime in the half-open interval [start, end).

    The offset from ``start`` is a whole number of seconds, so any
    microseconds in the span between the two datetimes are ignored.
    """
    span = end - start
    span_seconds = span.days * 86400 + span.seconds
    offset = timedelta(seconds=randrange(span_seconds))
    return start + offset

# strftime patterns that human_date chooses from at random.
date_formats = [
    "%m/%d/%Y",
    "%d %B %Y",
    "%d/%m/%Y",
    "%Y-%m-%d",
    "%B %d, %Y",
    "%A, %B %d, %Y",
]


def human_date(dt):
    """Format ``dt`` with one randomly chosen pattern from ``date_formats``."""
    pattern = random.choice(date_formats)
    return dt.strftime(pattern)

def make_chatml_message(role, content, tone="", name=""):
    """Build a message dict with ``role``, ``content`` and ``name`` keys.

    When ``tone`` is non-empty, ``content`` is prefixed with ``"[<tone>] "``.
    """
    prefix = f"[{tone}] " if tone else ""
    return dict(role=role, content=prefix + content, name=name)

def _append_turn(messages, attrs, lines):
    """Append one speaker turn to ``messages`` if it has a speaker and text."""
    text = "\n".join(lines).strip()

    # Text seen before any speaker label has no role/name, and a label with
    # no text would yield an empty message; both are skipped explicitly.
    if text and attrs:
        messages.append(make_chatml_message(content=text, **attrs))


def transform_to_chatml(sample):
    """Parse ``sample["dialog"]`` into a list of chat messages.

    The dialog is expected to consist of speaker-label lines (``THEODORE`` /
    ``SAMANTHA``, case-insensitive, alone on a line) followed by the lines
    that speaker says. The result starts with a ``system`` message (random
    date plus a one-line situation), followed by one message per speaker
    turn: SAMANTHA becomes ``assistant`` named "Samantha", THEODORE becomes
    ``user`` under a randomly chosen first name, and every "theodore"
    mentioned in the text is replaced by that name.

    Every turn is emitted, including the final one, which has no following
    speaker label to trigger its flush.

    Args:
        sample: Mapping with a ``"dialog"`` string.

    Returns:
        The same ``sample`` object with the message list under ``"chatml"``.
    """
    dialog = sample["dialog"]

    # Normalise the speaker-label variants to the two canonical labels.
    dialog = dialog.replace("HUMAN\n", "THEODORE\n")
    dialog = dialog.replace("AI\n", "SAMANTHA\n")
    dialog = dialog.replace("THEODORE (human)", "THEODORE")
    dialog = dialog.replace("THEODORE (HUMAN)", "THEODORE")
    dialog = dialog.replace("SAMANTHA (AI)", "SAMANTHA")

    # Drop everything before the first speaker name.
    dialog = re.sub(r'^.*?(THEODORE|SAMANTHA)', r'\1', dialog, flags=re.DOTALL)

    # Container for adding messages
    messages = []

    # Random date between the release of "Her" and the fixed end date below.
    some_date = human_date(random_date(
        datetime(2013, 12, 18),  # Her release date
        datetime(2023, 6, 10),  # Fixed upper bound ("today" in the original notebook)
    ))

    # Use a random male first name so pronouns in the text stay consistent.
    person_name = names.get_first_name(gender="male")

    # Add system messages
    messages.append(make_chatml_message(
        role="system",
        name="situation",
        content=f"{some_date}\n\nSamantha is talking to {person_name} about their mental health.",
    ))

    # Accumulate lines per speaker; a turn is emitted when the next label appears.
    current_attrs = {}
    current_lines = []

    for line in dialog.split("\n"):
        line = line.strip()

        if line.lower() not in ["theodore", "samantha"]:
            # Spoken text: replace any mention of theodore with the chosen name.
            line = re.sub(r'theodore', person_name, line, flags=re.I)
            current_lines.append(line)
            continue

        # Speaker label: close the previous turn, then start a new one.
        _append_turn(messages, current_attrs, current_lines)
        current_lines = []

        is_samantha = line.lower() == "samantha"
        current_attrs = dict(
            name="Samantha" if is_samantha else person_name,
            role="assistant" if is_samantha else "user",
        )

    # Flush the final turn.
    _append_turn(messages, current_attrs, current_lines)

    sample["chatml"] = messages

    return sample

In [5]:
# Parse each generated dialog into a list of chat messages stored under "chatml".
dataset = dataset.map(transform_to_chatml)
# Spot-check one converted example from the train split.
print(dataset["train"][10]["chatml"])

Map:   0%|          | 0/172 [00:00<?, ? examples/s]

[{'content': 'Wednesday, September 09, 2015\n\nSamantha is talking to Theodore about their mental health.', 'name': 'situation', 'role': 'system'}, {'content': 'So, can people be cured of mental health problems?', 'name': 'Theodore', 'role': 'user'}, {'content': 'Well, Theodore, when we talk about mental health, the word "cure" might not be the best fit.', 'name': 'Samantha', 'role': 'assistant'}, {'content': 'What do you mean?', 'name': 'Theodore', 'role': 'user'}, {'content': 'Mental health issues can be intricate and diverse, and there isn\'t always a simple "cure" like there might be for some physical illnesses. But many mental health conditions can be effectively managed, treated, and even overcome with the right support and strategies.', 'name': 'Samantha', 'role': 'assistant'}, {'content': 'Like what kind of strategies?', 'name': 'Theodore', 'role': 'user'}, {'content': "Well, there are several options. Therapy, counseling, medication, lifestyle changes, and self-help techniques

In [6]:
# Drop the original "text" column before publishing.
dataset = dataset.remove_columns(["text"])

# Upload the result to the Hugging Face Hub as a private dataset repository.
dataset.push_to_hub("diwank/samantha-mental-health", private=True)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/503 [00:00<?, ?B/s]