In [1]:
# !aria2c https://huggingface.co/datasets/psmathur/orca_minis_uncensored_dataset/resolve/main/wizardlm_alpaca_dolly_orca_uncensored.jsonl

In [2]:
# !mv 38e3ca794a2e0fc9a7bf51bced5fbc1296c910ac0e0359272e1b9ea282dbb0a1 ../cache_data/wizardlm_alpaca_dolly_orca_uncensored.jsonl

In [1]:
import jsonlines as jsonl
from datasets import Dataset

fn = "../cache_data/wizardlm_alpaca_dolly_orca_uncensored.jsonl"
with jsonl.open(fn) as reader:
    def from_reader():
        for item in reader:
            yield {"input": "", **item}

    dataset = Dataset.from_generator(from_reader)

Found cached dataset generator (/home/diwank/.cache/huggingface/datasets/generator/default-a9720ec59ab928d2/0.0.0)


In [2]:
len(dataset), len(dataset.filter(lambda row: not (row["input"].strip())))

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/generator/default-a9720ec59ab928d2/0.0.0/cache-3877f5370df68f7a.arrow


(104179, 83087)

In [3]:
dataset = dataset.filter(lambda row: not (row["input"].strip())).remove_columns(["input"])

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/generator/default-a9720ec59ab928d2/0.0.0/cache-3877f5370df68f7a.arrow


In [4]:
make_chatml = lambda name, role, content: dict(
    name=name, role=role, content=content,
)

system = lambda name, content: make_chatml(
    role="system",
    name=name,
    content=content,
)

situation = lambda content: system(name="situation", content=content)
thought = lambda content: system(name="thought", content=content)
information = lambda content: system(name="information", content=content)
me = lambda content, name=None: make_chatml(
    role="assistant",
    content=content,
    name=name,
)

person = lambda content, name=None: make_chatml(
    role="user",
    content=content,
    name=name,
)

def entry_to_chatml(row):
    instruction = row["instruction"]
    system = row["system"]
    output = row["output"]
    
    # Start preparing chatml
    situation_content = (
        "User is talking to an AI Large Language Model."
        "The AI must follow the instructions given below to the letter and help the User with their request."
    )
    
    chatml = [
        situation(situation_content),
        information(f"Instructions:\n\n{system}"),
        person(instruction, name="User"),
        me(output, name="AI"),
    ]

    return dict(chatml=chatml)

In [5]:
dataset = dataset.map(entry_to_chatml).remove_columns(['instruction', 'system', 'output'])

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/generator/default-a9720ec59ab928d2/0.0.0/cache-30ef3b0f32417419.arrow


In [6]:
assistant_me_map = {
    "user": "person",
    "assistant": "me",
}

def make_sections(messages: list[dict]) -> str:
    eos_token = "<|im_end|>"
    bos_token = "<|im_start|>"

    result = bos_token + (eos_token+'\n'+bos_token).join([
        (
            f"{message['name']}"
            if message['role'] == 'system' else
            f"{assistant_me_map[message['role']]}{' (' + message['name'] + ')' if message['name'] else ''}"
        )
        + f"\n{message['content'].strip()}"
        for message in messages
    ]) + eos_token
    
    return result
    

def transform_to_samantha_dialog(sample):

    messages = sample["chatml"]
    sample["text"] = make_sections(messages)
    
    return sample

dataset = dataset.map(transform_to_samantha_dialog)

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/generator/default-a9720ec59ab928d2/0.0.0/cache-e1ec1a573d53820b.arrow


In [7]:
# dataset.push_to_hub("diwank/orca_minis_uncensored-chatml", private=True)

In [8]:
dataset[0]['text']

'<|im_start|>situation\nUser is talking to an AI Large Language Model.The AI must follow the instructions given below to the letter and help the User with their request.<|im_end|>\n<|im_start|>information\nInstructions:\n\nYou are a teacher. Given a task, you explain in simple steps what the task is asking, any guidelines it provides and how to use those guidelines to find the answer.<|im_end|>\n<|im_start|>person (User)\nWhat are the three primary colors?<|im_end|>\n<|im_start|>me (AI)\nThe task is asking for the three primary colors. Primary colors are colors that cannot be created by mixing other colors together. The guidelines that we have is that we need to identify the three primary colors. \n\nTo find the answer, we need to know that there are two sets of primary colors: additive and subtractive. Additive primary colors are red, green, and blue. These are the primary colors of light and are used in devices such as computer monitors and televisions. Subtractive primary colors are

In [9]:
def to_mpt_fmt(row):
    splitter = "<|im_start|>me (AI)\n"
    [prompt, response] = row['text'].split(splitter)

    prompt += splitter
    return dict(prompt=prompt, response=response)

mpt_dataset = dataset.map(to_mpt_fmt).remove_columns(['chatml'])
# mpt_dataset.push_to_hub("diwank/orca_minis_uncensored-mpt")

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/generator/default-a9720ec59ab928d2/0.0.0/cache-ea5387822f79ce9d.arrow


In [12]:
def calc_tokens(row):
    import math
    
    text = row['text']

    assert "\n<|im_start|>me (AI)\n" in text

    token_count = math.ceil(len(text)/3.6)
    return dict(token_count=token_count)

mpt_dataset_t = mpt_dataset.map(calc_tokens, num_proc=40)
mpt_dataset_t

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/generator/default-a9720ec59ab928d2/0.0.0/cache-49a132e21fe0f9a5_*_of_00040.arrow


Dataset({
    features: ['text', 'prompt', 'response', 'token_count'],
    num_rows: 83087
})

In [15]:
mpt_dataset_t = mpt_dataset_t.filter(
    lambda x: x["token_count"] < 8192
)

Filter:   0%|          | 0/83087 [00:00<?, ? examples/s]

In [16]:
mpt_dataset_t = mpt_dataset_t.remove_columns(['prompt', 'response', 'token_count'])

In [19]:
import matplotlib.pyplot as plt
import numpy as np
# if using a Jupyter notebook, includue:
%matplotlib inline

import math

lens = np.array([
    o["token_count"]
    for o in mpt_dataset_t["train"]
    if o["token_count"] < 5000
])

plt.hist(lens, 10)
plt.show()

KeyError: 'token_count'

In [25]:
mpt_dataset_t = mpt_dataset_t.train_test_split(test_size=0.15)

mpt_dataset_t.push_to_hub("diwank/orca_minis_uncensored-completion_lm", private=True)

AttributeError: 'DatasetDict' object has no attribute 'train_test_split'

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("julep-ai/beluga-13b-orca_mini")

In [23]:
def sep_present(row):
    assert "<|im_start|>me (AI)\n" in row["text"]
    
    sep = tokenizer.encode("<|im_start|>me (AI)\n", add_special_tokens=False)
    input = tokenizer.encode(row["text"])
    present = set(sep) <= set(input)

    return present

check_ds = mpt_dataset_t.filter(sep_present)

Filter:   0%|          | 0/70622 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12463 [00:00<?, ? examples/s]

In [26]:
len(mpt_dataset_t['train']), len(check_ds['train'])

(70622, 70622)