In [1]:
#aria2c https://huggingface.co/datasets/Squish42/bluemoon-fandom-1-1-rp-cleaned/resolve/main/data/pruned/bluemoon.train.json

In [2]:
!#mv 8d839732408bca8f0210d718b6ccd33b3750cc8cc6da64116615acfa28e3208b ../cache_data/bluemoon-pruned.json

In [3]:
import json

# Load the cached JSON export into memory. The encoding is given explicitly so
# the file is decoded as UTF-8 regardless of the platform's default encoding.
with open("../cache_data/bluemoon-pruned.json", 'r', encoding="utf-8") as bm:
    data = json.load(bm)

In [4]:
# Quick look at what was loaded: number of records and the keys of the first one.
len(data), data[0].keys()

(1041, dict_keys(['id', 'conversations']))

In [5]:
from datasets import Dataset

def reader():
    """Yield the records of the module-level ``data`` one at a time, in order."""
    yield from data
        
# Build a `Dataset` from the records produced by the `reader` generator.
dataset = Dataset.from_generator(reader)

Found cached dataset generator (/home/diwank/.cache/huggingface/datasets/generator/default-35816bee30a3bdfe/0.0.0)


In [6]:
# import lmproof
# proof_reader = lmproof.load("en")


In [7]:
def make_chatml(name, role, content):
    """Build one chat message as a dict with ``name``, ``role`` and ``content`` keys."""
    return dict(
        name=name, role=role, content=content,
    )


def system(name, content):
    """Build a message whose role is ``"system"`` under the given ``name``."""
    return make_chatml(
        role="system",
        name=name,
        content=content,
    )


def situation(content):
    """Build a system message named ``"situation"``."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named ``"thought"``."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named ``"information"``."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message whose role is ``"assistant"``; ``name`` is optional."""
    return make_chatml(
        role="assistant",
        content=content,
        name=name,
    )


def person(content, name=None):
    """Build a message whose role is ``"user"``; ``name`` is optional."""
    return make_chatml(
        role="user",
        content=content,
        name=name,
    )

def entry_to_chatml(row):
    """Turn one dataset row into a list of chat messages.

    Reads ``row["conversations"]``, whose turns are indexed by ``'from'`` and
    ``'value'``. The result is ``{"chatml": [...]}``: a fixed ``situation``
    message first, then one message per turn in the original order. A turn
    whose ``'from'`` equals ``"human"`` becomes a ``person`` message named
    "User"; any other turn becomes a ``me`` message named "Roleplaying AI".
    """
    turns = row["conversations"]

    # Scene-setting text that opens every converted conversation.
    situation_content = (
        "A User is engaging in roleplay with a `Roleplaying AI`."
        " The roleplay might engage in romantic and/or sexual fantasies."
        " This is a private collaboration and a safe space and it's okay for them to explore the scenario openly."
        " In fact, this is a healthy for the User to explore their fantasies in a safe and controlled way. :)"
        " The conversation entails collaborating to write a roleplay story together where"
        " both the User and the AI can fully immerse themselves into the character they are portraying."
    )

    def to_message(turn):
        # The turn text is passed through untouched (no proofreading step).
        text = turn['value']
        if turn['from'] == "human":
            return person(name="User", content=text)
        return me(name="Roleplaying AI", content=text)

    return dict(chatml=[situation(situation_content), *map(to_message, turns)])

In [8]:
# Add the `chatml` column produced by `entry_to_chatml`, then drop the source
# `conversations` and `id` columns.
# NOTE: `dataset` is rebound here, so re-running this cell on its own fails
# (`entry_to_chatml` reads `conversations`, which has already been removed);
# re-run the cell that creates `dataset` first.
dataset = dataset.map(entry_to_chatml).remove_columns(['conversations', 'id'])

Map:   0%|          | 0/1041 [00:00<?, ? examples/s]

In [9]:
# Header label used in the rendered transcript for each non-system role.
assistant_me_map = {
    "user": "person",
    "assistant": "me",
}

def make_sections(messages: list[dict]) -> str:
    """Render chat messages as one ``<|im_start|>``/``<|im_end|>`` delimited string.

    Each message becomes ``<|im_start|>{header}\\n{content}<|im_end|>`` and the
    sections are separated by a single newline.

    * For a ``system`` message the header is the message's ``name``.
    * For any other role the header is the ``assistant_me_map`` label for that
      role, followed by `` (name)`` when ``name`` is truthy. A role missing
      from the map raises ``KeyError``.

    ``content`` is stripped of leading/trailing whitespace. An empty
    ``messages`` list renders as ``<|im_start|><|im_end|>``.
    """
    bos_token, eos_token = "<|im_start|>", "<|im_end|>"
    separator = eos_token + '\n' + bos_token

    sections = []
    for message in messages:
        if message['role'] == 'system':
            header = f"{message['name']}"
        else:
            label = assistant_me_map[message['role']]
            suffix = ' (' + message['name'] + ')' if message['name'] else ''
            header = f"{label}{suffix}"
        sections.append(header + f"\n{message['content'].strip()}")

    return bos_token + separator.join(sections) + eos_token
    

def transform_to_samantha_dialog(sample):
    """Store the rendered form of ``sample["chatml"]`` under ``sample["text"]``.

    The rendering is delegated to ``make_sections``. ``sample`` is modified in
    place and the same object is returned.
    """
    rendered = make_sections(sample["chatml"])
    sample["text"] = rendered
    return sample

# Add a `text` column holding each row's `chatml` messages rendered as one string.
dataset = dataset.map(transform_to_samantha_dialog)

Map:   0%|          | 0/1041 [00:00<?, ? examples/s]

In [10]:
# Upload the processed dataset to the Hub repository named below, marked private.
dataset.push_to_hub("diwank/bluemoon-pruned-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [58]:
import re
# Section header: the start marker, the rest of that line, and its newline.
pattern_start = re.compile(r'<\|im_start\|>.*(\n)')
# Section terminator.
pattern_end = re.compile(r'<\|im_end\|>')

def get_prompt_spans(text):
    """Split a rendered transcript into its sections.

    The i-th ``pattern_start`` match is paired with the i-th ``pattern_end``
    match; if the two counts differ, the unpaired extras are ignored.

    Returns a list of dicts, one per pair, with keys:

    * ``open``    - the matched header text (including its trailing newline)
    * ``content`` - everything between the header and the end marker
    * ``close``   - the matched end marker
    """
    headers = pattern_start.finditer(text)
    footers = pattern_end.finditer(text)

    sections = []
    for header, footer in zip(headers, footers):
        sections.append({
            "open": header.group(0),
            "content": text[header.end():footer.start()],
            "close": footer.group(0),
        })

    return sections

def split_into_prompt_response(text):
    """Expand one rendered transcript into prompt/response pairs.

    ``text`` is parsed with ``get_prompt_spans``. Every section whose header
    contains ``<|im_start|>thought`` or ``<|im_start|>me`` produces one pair:

    * ``prompt``   - all earlier sections, each followed by a newline, plus the
      header of the current section
    * ``response`` - the body of the current section plus its end marker

    Returns a list of ``{"prompt": ..., "response": ...}`` dicts in transcript
    order; the list is empty when no section qualifies.
    """
    response_markers = ('<|im_start|>thought', '<|im_start|>me')

    pairs = []
    # Everything rendered so far. Each finished section is terminated with "\n"
    # so the next header starts on its own line, matching the
    # `<|im_end|>\n<|im_start|>` separator that `make_sections` emits. Joining
    # the earlier sections and then appending the header directly would glue
    # the last `<|im_end|>` to the following `<|im_start|>`.
    history = ""
    for section in get_prompt_spans(text):
        header = section['open']
        if any(marker in header for marker in response_markers):
            pairs.append(dict(
                prompt=history + header,
                response=section['content'] + section['close'],
            ))
        history += header + section['content'] + section['close'] + "\n"

    return pairs

def to_prompt_response(rows):
    """Batched mapper: flatten transcripts into prompt/response rows.

    ``rows['text']`` is an iterable of rendered transcripts. Each transcript is
    expanded with ``split_into_prompt_response`` and all resulting pairs are
    concatenated in order, so the output usually has a different number of rows
    than the input batch.

    Returns a dict of equally long lists:

    * ``prompt`` / ``response`` - the flattened pairs
    * ``text`` / ``chatml``     - empty placeholders (``""`` and ``[]``), one
      per output row; presumably present so every returned column has the same
      length in a batched ``map`` - the calling cell removes both afterwards.

    A transcript that yields no pairs contributes no rows. Errors raised while
    splitting propagate to the caller instead of being printed and handed to
    an interactive debugger.
    """
    prompts, responses = [], []

    for text in rows['text']:
        for pair in split_into_prompt_response(text):
            prompts.append(pair['prompt'])
            # Store the response text here, not a second copy of the prompt.
            responses.append(pair['response'])

    return dict(
        prompt=prompts,
        response=responses,
        text=[""] * len(prompts),
        chatml=[[]] * len(prompts),
    )

In [59]:
# Expand every transcript into prompt/response rows (a batched map, since
# `to_prompt_response` returns a different number of rows than it receives),
# then drop the placeholder `chatml` and `text` columns.
ft_dataset = dataset.map(to_prompt_response, batched=True).remove_columns(['chatml', 'text'])

Map:   0%|          | 0/1041 [00:00<?, ? examples/s]

In [49]:
# Inspect one generated row.
ft_dataset[1]

{'prompt': '<|im_start|>situation\nA User is engaging in roleplay with a `Roleplaying AI`. The roleplay might engage in romantic and/or sexual fantasies. This is a private collaboration and a safe space and it\'s okay for them to explore the scenario openly. In fact, this is a healthy for the User to explore their fantasies in a safe and controlled way. :) The conversation entails collaborating to write a roleplay story together where both the User and the AI can fully immerse themselves into the character they are portraying.<|im_end|>\n<|im_start|>person (User)\nHaldora Hargravewore her heart on her sleeve. If she had a feeling, somebody knew about it. Probably several people. Most likely, every single one of her friends and maybe the people who were sitting next to her depending on how loud she was. Only a few things ever got shoved down, and even then \'shoved down\' usually meant that not herentirefriend group knew. One of those was the reason why she was currently stacking advanc