In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the target checkpoint. use_auth_token=True is passed so
# the download is authenticated (presumably the checkpoint requires a logged-in
# Hugging Face account — confirm).
# NOTE(review): newer transformers releases appear to deprecate `use_auth_token`
# in favour of `token`; verify against the pinned version before changing it.
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderplus", use_auth_token=True)
# End-of-sequence marker; the preprocessing cells below append it after the
# system message and after every conversation turn.
eos_token = tokenizer.eos_token
print(eos_token)

: 

In [12]:
from datasets import load_dataset

"""
We will combine the following datasets:
1. Open Assistant Guanaco: for instruction following and conversation
2. LIMA: for instruction following and conversation
"""


"""
Format of the dataset:

<|system|> system message <|endoftext|> <|prompter|> Q1 <|endoftext|> <|assistant|> A1 <|endoftext|>
"""


# System message that the preprocessing functions prepend to every formatted
# conversation. The trailing backslashes inside the literal are line
# continuations, so each of the two paragraphs ends up as a single line in the
# resulting string (the only real newlines are the blank line between them).
system_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully \
as possible, while being safe. Your answers should not include any harmful, \
unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that \
your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why \
instead of answering something not correct. If you don’t know the answer to a \
question, please don’t share false information."""

# Show the final text so the effect of the line continuations can be checked.
print(system_prompt)

You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information.


In [40]:
# LIMA dataset processing

def preprocess(samples):
    """Format a batch of LIMA conversations into single chat-template strings.

    Args:
        samples: Batch mapping whose "conversations" entry is a list of
            conversations, each conversation being a sequence of turns.

    Returns:
        dict with one key, "content", holding one formatted string per
        conversation: the system header followed by every turn, where turns at
        even positions (0, 2, ...) are tagged <|prompter|> and turns at odd
        positions are tagged <|assistant|>. Each segment ends with `eos_token`.

    Relies on the notebook-level `system_prompt` and `eos_token`.
    """
    header = f"<|system|> {system_prompt} {eos_token}"
    # Index with (position % 2): 0 -> prompter, 1 -> assistant.
    role_tags = ("<|prompter|>", "<|assistant|>")

    def render(turns):
        pieces = [header]
        for position, text in enumerate(turns):
            pieces.append(f" {role_tags[position % 2]} {text} {eos_token}")
        return "".join(pieces)

    return {"content": [render(conversation) for conversation in samples["conversations"]]}
            
    


# Download LIMA and replace all of its original columns with the formatted
# `content` column produced by preprocess.
lima = load_dataset("GAIR/lima")
lima = lima.map(preprocess, batched=True, remove_columns=lima["train"].column_names)

# Shuffle only the training split, with a fixed seed of 100.
lima["train"] = lima["train"].shuffle(seed=100)

# Quick look at the split sizes and at one formatted example.
print(lima)
print(lima["train"][0])


Found cached dataset lima (/raid/sourab/.cache/huggingface/datasets/GAIR___lima/plain_text/0.0.1/f882fbf63e999e19fc8841fab01c292dd00433ae9bc4f0f177b0b1c484771179)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /raid/sourab/.cache/huggingface/datasets/GAIR___lima/plain_text/0.0.1/f882fbf63e999e19fc8841fab01c292dd00433ae9bc4f0f177b0b1c484771179/cache-0ecec33228122a06.arrow
Loading cached processed dataset at /raid/sourab/.cache/huggingface/datasets/GAIR___lima/plain_text/0.0.1/f882fbf63e999e19fc8841fab01c292dd00433ae9bc4f0f177b0b1c484771179/cache-932634d2b3375595.arrow
Loading cached shuffled indices for dataset at /raid/sourab/.cache/huggingface/datasets/GAIR___lima/plain_text/0.0.1/f882fbf63e999e19fc8841fab01c292dd00433ae9bc4f0f177b0b1c484771179/cache-27e5171340f1f4df.arrow


DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 1030
    })
    test: Dataset({
        features: ['content'],
        num_rows: 300
    })
})
{'content': '<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> Are we alone in the universe? <|endoftext|> <|assistant|> Humanity has yet to find evidence for life outside planet Earth.\n\nThe current search for extraterrestrial life is largely focused on finding planets that are situated in an "habitable zone". Roughly p

In [52]:
# OpenAsst Guanaco dataset processing

# Speaker markers used as delimiters when splitting a Guanaco `text` entry into
# turns (passed to split_on_multiple_tokens by preprocess below).
tokens = ["### Human:", "### Assistant:"]

import re

def split_on_multiple_tokens(input_string, tokens):
    """Split a string on any of several literal delimiters.

    Args:
        input_string: Text to split.
        tokens: Iterable of delimiter strings; each is matched literally
            (regex metacharacters are escaped).

    Returns:
        List of the pieces between delimiters, each stripped of surrounding
        whitespace, with pieces that are empty after stripping left out.
    """
    # One alternation that matches whichever delimiter comes next.
    delimiter_regex = re.compile("|".join(map(re.escape, tokens)))

    segments = []
    for piece in delimiter_regex.split(input_string):
        trimmed = piece.strip()
        # Adjacent delimiters (or leading/trailing ones) yield blank pieces.
        if trimmed:
            segments.append(trimmed)
    return segments

def preprocess(samples):
    """Format a batch of Guanaco transcripts into single chat-template strings.

    Args:
        samples: Batch mapping whose "text" entry is a list of raw transcript
            strings.

    Returns:
        dict with one key, "content", holding one formatted string per
        transcript: the system header followed by every turn obtained from
        split_on_multiple_tokens(text, tokens), each turn ending with
        `eos_token`.

    Relies on the notebook-level `system_prompt`, `eos_token`, `tokens` and
    `split_on_multiple_tokens`.
    """
    system_header = f"<|system|> {system_prompt} {eos_token}"
    formatted = []
    for raw_text in samples["text"]:
        turns = split_on_multiple_tokens(raw_text, tokens)
        # NOTE(review): the speaker is inferred from position only (even index
        # -> prompter, odd -> assistant), not from which marker preceded the
        # turn; this assumes transcripts strictly alternate starting with the
        # human and contain no blank turns — confirm against the data.
        rendered = system_header
        for index, turn in enumerate(turns):
            speaker = "<|prompter|>" if index % 2 == 0 else "<|assistant|>"
            rendered = rendered + f" {speaker} {turn} {eos_token}"
        formatted.append(rendered)
    return {"content": formatted}
            
    


# Download the Guanaco corpus and replace all of its original columns with the
# formatted `content` column produced by preprocess.
guanaco = load_dataset("timdettmers/openassistant-guanaco")
guanaco = guanaco.map(
    preprocess,
    batched=True,
    remove_columns=guanaco["train"].column_names
)

# Shuffle with a fixed seed so the row order of the published dataset is
# reproducible on a fresh kernel (an unseeded shuffle gives a different order
# on every run). 100 matches the seed used for the LIMA training split.
guanaco["train"] = guanaco["train"].shuffle(seed=100)

# Quick look at the split sizes and at one formatted example.
print(guanaco)
print(guanaco["train"][0])


Found cached dataset json (/raid/sourab/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /raid/sourab/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-d71fcbcad60fd547.arrow
Loading cached processed dataset at /raid/sourab/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-54124603983709e2.arrow


DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 9846
    })
    test: Dataset({
        features: ['content'],
        num_rows: 518
    })
})
{'content': "<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> Can you explain me how cheats are working? <|endoftext|> <|assistant|> Cheating can be of various types. I will need more information on what type of cheating you are referring to before I can provide you with any information. <|endoftext|> <|prompter|> How do

In [55]:
from datasets import concatenate_datasets, DatasetDict
# Build the combined corpus split by split: LIMA rows first, Guanaco rows after.
full_dataset = DatasetDict(
    {
        split_name: concatenate_datasets([lima[split_name], guanaco[split_name]])
        for split_name in ["train", "test"]
    }
)
full_dataset

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 10876
    })
    test: Dataset({
        features: ['content'],
        num_rows: 818
    })
})

In [56]:
# Upload every split of the combined dataset to the Hub repo
# "code-chat-assistant-v1"; private=True requests that the repo be private.
full_dataset.push_to_hub("code-chat-assistant-v1", private=True)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]