In [1]:
# One-off download of the ChatAlpaca 20k dump into ../cache_data/.
# The `#` right after `!` makes the whole shell line a comment, so running this
# cell does nothing; delete the `#` to actually fetch the file.
!#wget https://github.com/cascip/ChatAlpaca/raw/main/data/chatalpaca-20k.json -O ../cache_data/chatalpaca-20k.jsonl

In [2]:
# One-off dependency install for the `jsonlines` package imported below.
# Disabled the same way as the download cell: the `#` after `!` turns the shell
# command into a comment.
!#poetry add jsonlines

In [3]:
import jsonlines as jsonl
from datasets import Dataset

# Location of the ChatAlpaca dump on disk (the target path of the wget cell).
fn = "../cache_data/chatalpaca-20k.jsonl"
# from_generator is called inside the `with` block so that `reader` is still
# open while `from_reader` iterates over it.
with jsonl.open(fn) as reader:
    def from_reader():
        # Yield every record the reader produces, one at a time.
        for item in reader:
            yield item

    # Materialise the yielded records as a Hugging Face `Dataset`.
    dataset = Dataset.from_generator(from_reader)

Found cached dataset generator (/home/diwank/.cache/huggingface/datasets/generator/default-645a65a0efdf258b/0.0.0)


In [4]:
def to_dialog(row):
    """Annotate one row with a model-written description of its "situation".

    The first four turns of ``row["conversations"]`` are rendered as a
    ``User:`` / ``AI Large Language Model:`` transcript, appended to a fixed
    instruction and sent to ``gpt-3.5-turbo``. The reply text is stored under
    ``row["situation"]``.

    Args:
        row: mapping with a ``"conversations"`` list whose items carry
            ``"from"`` and ``"value"`` keys.

    Returns:
        The same ``row`` object, with the extra ``"situation"`` key set.

    Raises:
        KeyError: if the ``OPENAI_API_KEY`` environment variable is not set.
        tenacity.RetryError: if the completion call keeps failing for all
            100 attempts.
    """
    # Imports stay function-local, as in the original cell — presumably so the
    # function is self-contained when `dataset.map` runs it in worker processes.
    import os

    from tenacity import (
        retry,
        stop_after_attempt,
        wait_random_exponential,
    )

    import openai
    # Take the credential from the environment rather than embedding it in the
    # notebook. A missing variable fails immediately with KeyError instead of
    # producing authentication errors that would be retried 100 times.
    openai.api_key = os.environ["OPENAI_API_KEY"]

    from redis import StrictRedis
    from redis_cache import RedisCache

    # Results are memoised in a Redis instance on localhost.
    client = StrictRedis(host="localhost", decode_responses=True)
    cache = RedisCache(redis_client=client)

    INSTRUCTION = "Analyze the following conversation between a User and an AI Large Language Model and write down the topic they are discussing or the task the AI is trying to help the User with in this situation:"

    # Decorator order matters: the cache wraps the retrying function, so only a
    # call that eventually succeeded can be stored.
    # NOTE(review): assumes the cache key is derived from `sample` only, so a
    # changed INSTRUCTION would still hit old entries — confirm.
    @cache.cache()
    @retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(100))
    def completion_with_backoff(sample):
        """Send INSTRUCTION plus ``sample`` as a single user message and return the reply text."""
        messages = [dict(
            role="user",
            content=INSTRUCTION + "\n\n" + sample
        )]

        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0.0,
        )

        result = completion.choices[0].message["content"]
        return result

    # Only the first four turns are shown to the model.
    sample = "\n".join([
        f"{'User' if item['from'] == 'human' else 'AI Large Language Model'}: {item['value']}"
        for item in row["conversations"][:4]
    ])

    # Trailing cue that the model is expected to continue from.
    sample += "\n\nSituation:"

    completion = completion_with_backoff(sample)
    row["situation"] = completion

    return row

In [5]:
# Apply to_dialog to every row, spread over 200 worker processes (num_proc=200).
# Each row ends up with the extra "situation" field that to_dialog sets.
dataset = dataset.map(to_dialog, num_proc=200)

Map (num_proc=200):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [6]:
def make_chatml(name, role, content):
    """Build one ChatML-style message dict with ``name``, ``role`` and ``content`` keys."""
    return {"name": name, "role": role, "content": content}


def system(name, content):
    """Build a ``system``-role message labelled with ``name``."""
    return make_chatml(name, "system", content)


def situation(content):
    """System message named ``"situation"``."""
    return system("situation", content)


def thought(content):
    """System message named ``"thought"``."""
    return system("thought", content)


def information(content):
    """System message named ``"information"``."""
    return system("information", content)


def me(content, name=None):
    """Message with role ``"assistant"``; ``name`` is optional."""
    return make_chatml(name, "assistant", content)


def person(content, name=None):
    """Message with role ``"user"``; ``name`` is optional."""
    return make_chatml(name, "user", content)

In [7]:
def map_chatml(row):
    """Convert one row into a list of ChatML messages.

    The list starts with a ``situation`` system message built from
    ``row["situation"]``, followed by one message per entry of
    ``row["conversations"]``: ``"human"`` turns become ``person`` messages,
    every other turn becomes a ``me`` message named ``"AI"``.

    Returns:
        A dict with the single key ``"chatml"`` holding that list.
    """
    messages = [situation(row["situation"])]

    for turn in row["conversations"]:
        if turn["from"] == "human":
            messages.append(person(turn["value"]))
        else:
            messages.append(me(turn["value"], name="AI"))

    return {"chatml": messages}

# Add the "chatml" column produced by map_chatml, then drop the listed source
# columns ("id", "conversations", "situation") now that their content has been
# folded into it.
dataset = dataset.map(map_chatml).remove_columns([
    "id",
    "conversations",
    "situation",
])

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/generator/default-645a65a0efdf258b/0.0.0/cache-b9315b32301d33fd.arrow


In [8]:
# Speaker label written into the rendered text for each non-system role.
assistant_me_map = {
    "user": "person",
    "assistant": "me",
}

def make_sections(messages: list[dict]) -> str:
    """Render ChatML messages as ``<|im_start|>header\\ncontent<|im_end|>`` sections.

    The header of a system message is its ``name``. For any other role it is
    the label from ``assistant_me_map``, followed by `` (name)`` when the
    message has a truthy ``name``. Content is stripped of surrounding
    whitespace, and sections are separated by a single newline.

    Raises:
        KeyError: for a non-system role missing from ``assistant_me_map``.
    """
    eos_token = "<|im_end|>"
    bos_token = "<|im_start|>"

    sections = []
    for message in messages:
        if message["role"] == "system":
            header = f"{message['name']}"
        else:
            header = assistant_me_map[message["role"]]
            if message["name"]:
                header += " (" + message["name"] + ")"

        sections.append(header + f"\n{message['content'].strip()}")

    # Every section is closed, then the next one is opened on a new line.
    separator = eos_token + "\n" + bos_token
    return bos_token + separator.join(sections) + eos_token
    

def transform_to_samantha_dialog(sample):
    """Store the rendered form of ``sample["chatml"]`` under ``sample["text"]``.

    The row is updated in place and the same object is returned.
    """
    sample["text"] = make_sections(sample["chatml"])
    return sample

# Add the rendered "text" column to every row; the "chatml" column is kept.
dataset = dataset.map(transform_to_samantha_dialog)

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/generator/default-645a65a0efdf258b/0.0.0/cache-ed8adb32a51945fa.arrow


In [9]:
# Upload the processed dataset to the Hugging Face Hub as a private repository.
# NOTE(review): presumably relies on an already-configured Hub login/token — confirm.
dataset.push_to_hub("diwank/chatalpaca-20k-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]