In [1]:
from datasets import load_dataset, Dataset

In [2]:
# Load the "diwank/expertllama" dataset. No split is requested here, so the
# result is indexed by split name further down (e.g. dataset["train"]).
dataset = load_dataset("diwank/expertllama")

In [3]:
import names

# Attach an "assistant_name" column: one first name from `names.get_first_name()`
# per row (the row content itself is ignored by the lambda).
# NOTE(review): no random seed is set in the visible cells, so the assigned
# names presumably differ between runs — confirm whether that is acceptable.
dataset = dataset.map(lambda x: {"assistant_name": names.get_first_name()})

In [4]:
def rewrite_identity(row):
    """Rewrite a row's second-person expert identity into the third person.

    A prompt is built from ``row["assistant_name"]`` and
    ``row["expert_identity"]`` and sent to the OpenAI completions endpoint
    (model ``gpt-3.5-turbo-instruct``). The stripped completion text is
    returned under the ``expert_identity_3p`` key.

    The completion call is retried with randomized exponential backoff
    (at most 8 attempts) and wrapped in a Redis-backed cache that talks to a
    Redis server on ``localhost``.

    All imports are local to the function.
    NOTE(review): presumably so the function is self-contained when it is
    shipped to worker processes by ``Dataset.map(num_proc=...)`` — confirm.

    Args:
        row: Mapping with at least the keys ``"assistant_name"`` and
            ``"expert_identity"``.

    Returns:
        dict: ``{"expert_identity_3p": <rewritten instruction text>}``.
    """
    import os

    from tenacity import (
        retry,
        stop_after_attempt,
        wait_random_exponential,
    )

    import openai

    # Take the key from the environment instead of hard-coding it in the
    # notebook. When the variable is unset, whatever key the openai module
    # already holds is left untouched.
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        openai.api_key = api_key

    assistant_name = row["assistant_name"]
    expert_identity = row["expert_identity"]

    # The prompt text is flush-left on purpose: it is a literal block, and the
    # trailing whitespace left by the closing quotes is removed by .strip().
    PROMPT = f"""\
Below is an instruction for {assistant_name} to follow but it is written in the second person.
Rewrite the instruction in the third person mentioning {assistant_name} by name.

## Original Instruction

{expert_identity}

## Instruction Rewritten in Third Person
    """.strip()

    from redis import StrictRedis
    from redis_cache import RedisCache

    client = StrictRedis(host="localhost", decode_responses=True)
    cache = RedisCache(redis_client=client)

    # The cache decorator is outermost, so it wraps the retrying function:
    # the retry logic only runs when the cache wrapper actually calls through.
    @cache.cache()
    @retry(wait=wait_random_exponential(min=0, max=20), stop=stop_after_attempt(8))
    def completion_with_backoff(prompt):
        """Request one completion for ``prompt`` and return its stripped text."""
        completion = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            temperature=0.1,
            max_tokens=256,
        )

        return completion.choices[0].text.strip()

    expert_identity_3p = completion_with_backoff(PROMPT)

    return dict(
        expert_identity_3p=expert_identity_3p,
    )


In [5]:
# Apply rewrite_identity to every row, spread over 70 worker processes.
# NOTE(review): rewrite_identity connects to Redis on localhost and calls the
# OpenAI API — confirm both are reachable before running this cell.
dataset = dataset.map(rewrite_identity, num_proc=70)

Map (num_proc=70):   0%|          | 0/52002 [00:00<?, ? examples/s]

In [6]:
# Spot-check the first row of the "train" split after the mapping steps above.
dataset["train"][0]

{'instruction': 'Give three tips for staying healthy.',
 'answer': 'Sure, here are three evidence-based tips for staying healthy:\n\n1. Consume a diet rich in colorful fruits and vegetables: Eating a wide variety of fruits and vegetables ensures that you get a range of vitamins, minerals, and antioxidants that help boost your immune system and reduce inflammation in the body. Aim for at least five servings of fruits and vegetables a day, and try to include a rainbow of colors in your diet.\n\n2. Engage in regular physical activity: Exercise not only helps you burn calories and maintain a healthy weight, but it also reduces your risk of chronic diseases such as heart disease, diabetes, and cancer. Aim for at least 150 minutes of moderate-intensity exercise per week, spread out over at least three days, and incorporate strength training exercises two or more days per week.\n\n3. Practice stress-reduction techniques: Chronic stress has been linked to a range of health problems, including 

In [7]:
def make_chatml(name, role, content):
    """Build one ChatML message as a dict with ``name``, ``role``, ``content`` keys."""
    return dict(
        name=name, role=role, content=content,
    )


def system(name, content):
    """Build a message with role ``"system"`` and the given ``name``."""
    return make_chatml(
        role="system",
        name=name,
        content=content,
    )


def situation(content):
    """Build a system message named ``"situation"``."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named ``"thought"``."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named ``"information"``."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message with role ``"assistant"``; ``name`` is optional."""
    return make_chatml(
        role="assistant",
        content=content,
        name=name,
    )


def person(content, name=None):
    """Build a message with role ``"user"``; ``name`` is optional."""
    return make_chatml(
        role="user",
        content=content,
        name=name,
    )

In [8]:
# Look at the first "train" row once more before converting rows to ChatML.
# NOTE(review): nothing in the visible cells modifies `dataset` between the
# earlier inspection and this one, so this cell looks redundant — confirm.
dataset["train"][0]


{'instruction': 'Give three tips for staying healthy.',
 'answer': 'Sure, here are three evidence-based tips for staying healthy:\n\n1. Consume a diet rich in colorful fruits and vegetables: Eating a wide variety of fruits and vegetables ensures that you get a range of vitamins, minerals, and antioxidants that help boost your immune system and reduce inflammation in the body. Aim for at least five servings of fruits and vegetables a day, and try to include a rainbow of colors in your diet.\n\n2. Engage in regular physical activity: Exercise not only helps you burn calories and maintain a healthy weight, but it also reduces your risk of chronic diseases such as heart disease, diabetes, and cancer. Aim for at least 150 minutes of moderate-intensity exercise per week, spread out over at least three days, and incorporate strength training exercises two or more days per week.\n\n3. Practice stress-reduction techniques: Chronic stress has been linked to a range of health problems, including 

In [9]:
def to_chatml(row):
    """Turn one dataset row into a three-message ChatML conversation.

    The conversation consists of a ``situation`` system message (introducing
    the two speakers and embedding the third-person identity), the user's
    instruction, and the assistant's answer.

    Args:
        row: Mapping with the keys ``"instruction"``, ``"answer"``,
            ``"assistant_name"`` and ``"expert_identity_3p"``.

    Returns:
        dict: ``{"chatml": [situation_msg, user_msg, assistant_msg]}``.
    """
    instruction = row["instruction"]
    answer = row["answer"]
    assistant_name = row["assistant_name"]
    identity = row["expert_identity_3p"]

    # Pick a name for the user. Redraw on a collision so the user and the
    # assistant never share a name within one conversation.
    name = names.get_first_name()
    while name == assistant_name:
        name = names.get_first_name()

    # Turn into chatml
    chatml = [
        situation(f"{name} is talking to {assistant_name} about an assignment to solve. \n {identity}"),
        person(instruction, name=name),
        me(answer, name=assistant_name),
    ]

    return dict(chatml=chatml)

In [10]:
# Add a "chatml" column: to_chatml returns {"chatml": [...]} for each row.
dataset = dataset.map(to_chatml)

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [11]:
# Drop the columns that were only inputs or intermediates for building the
# "chatml" column.
columns_to_remove = [
    "instruction",
    "answer",
    "expert_identity",
    "expert_identity_3p",
    "assistant_name",
]
dataset = dataset.remove_columns(columns_to_remove)

In [12]:
# Upload the processed dataset to the Hub repository "diwank/expertllama-chatml".
# NOTE(review): presumably requires an authenticated Hugging Face session with
# write access to that repository — confirm before running.
dataset.push_to_hub("diwank/expertllama-chatml")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/53 [00:00<?, ?ba/s]