In [7]:
import copy
import datasets
import itertools

In [12]:
# Markers placed around the instruction and the system prompt when the prompt
# string is assembled in tokenize_interaction.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>"
E_SYS = "<</SYS>>"

# System prompt inserted between B_SYS and E_SYS in every prompt.
SYSTEM_PROMPT = "You are a helpful, creative and honest assistant. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."


def tokenize_interaction(tokenizer, instruction, response):
    """Encode one instruction/response pair into model inputs and labels.

    The prompt (BOS token, instruction markers, system prompt, instruction) and
    the answer (response followed by the EOS token) are encoded separately so
    that the prompt length is known when the labels are built.

    Args:
        tokenizer: Object exposing ``bos_token``, ``eos_token`` and
            ``encode(text, add_special_tokens=...)``.
        instruction: Text interpolated into the prompt before ``E_INST``.
        response: Text interpolated into the answer before the EOS token.

    Returns:
        dict with three equally long lists:
            ``input_ids``: prompt tokens followed by answer tokens.
            ``labels``: ``-100`` for every prompt position, then the answer
                tokens.
            ``attention_mask``: all ones.
    """
    prompt_text = f"{tokenizer.bos_token} {B_INST} {B_SYS}\n{SYSTEM_PROMPT}\n{E_SYS}\n{instruction} {E_INST}\n\n"
    prompt_ids = tokenizer.encode(prompt_text, add_special_tokens=False)

    answer_text = f"{response} {tokenizer.eos_token}"
    answer_ids = tokenizer.encode(answer_text, add_special_tokens=False)

    input_ids = [*prompt_ids, *answer_ids]
    # -100 is presumably the ignore index of the training loss, so only the
    # answer tokens would be learned -- confirm against the training loop.
    labels = [-100] * len(prompt_ids) + [*answer_ids]

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": [1] * len(input_ids),
    }

In [11]:
def get_custom_dataset(dataset_config, tokenizer, split):
    """Load the instruction CSV and tokenize every row.

    Args:
        dataset_config: Not read by this function; kept so the signature stays
            compatible with existing callers.
        tokenizer: Tokenizer forwarded to ``tokenize_interaction``.
        split: Split name forwarded to ``datasets.Dataset.from_csv``.

    Returns:
        A dataset whose only columns are the keys returned by
        ``tokenize_interaction``: ``input_ids``, ``labels`` and
        ``attention_mask``.
    """
    # Only the `datasets` module is imported in this notebook, so the class
    # has to be reached through it; a bare `Dataset` raises NameError.
    # NOTE(review): `split` presumably only labels the loaded data and does not
    # select a subset of the CSV -- confirm against the datasets docs.
    dataset = datasets.Dataset.from_csv("instruct_dataset_final.csv", split=split)

    # Keep just the two text columns, renaming `raw_response` to `response`.
    # Batched mode passes whole columns through, so no per-row work is done.
    dataset = dataset.map(
        lambda batch: {
            "instruction": batch["instruction"],
            "response": batch["raw_response"],
        },
        batched=True,
        remove_columns=list(dataset.features),
    )

    # The helper defined in this notebook is `tokenize_interaction`; the
    # previously referenced `tokenize_dialog` does not exist here.
    dataset = dataset.map(
        lambda row: tokenize_interaction(tokenizer, row["instruction"], row["response"]),
        remove_columns=list(dataset.features),
    )

    return dataset

In [19]:
class TOK:
    """Stand-in tokenizer for trying out the prompt layout.

    It exposes the same attributes and ``encode`` signature that
    ``tokenize_interaction`` uses, but the "tokens" are simply the
    space-separated pieces of the input string.
    """

    def __init__(self):
        self.bos_token, self.eos_token = "<s>", "</s>"

    def encode(self, arg, add_special_tokens):
        """Split ``arg`` on single spaces; ``add_special_tokens`` is ignored."""
        pieces = arg.split(" ")
        return pieces

In [20]:
# Use the whitespace-splitting stub so the prompt layout can be inspected
# without loading a real tokenizer.
tokenizer = TOK()

In [21]:
# Smoke test: with the stub tokenizer the output shows which pieces come from
# the prompt (labelled -100) and which come from the answer.
tokenize_interaction(tokenizer, "do it", "okay done")

{'input_ids': ['<s>',
  '[INST]',
  '<<SYS>>\nYou',
  'are',
  'a',
  'helpful,',
  'creative',
  'and',
  'honest',
  'assistant.',
  'Please',
  'ensure',
  'that',
  'your',
  'responses',
  'are',
  'socially',
  'unbiased',
  'and',
  'positive',
  'in',
  'nature.',
  'If',
  'a',
  'question',
  'does',
  'not',
  'make',
  'any',
  'sense,',
  'or',
  'is',
  'not',
  'factually',
  'coherent,',
  'explain',
  'why',
  'instead',
  'of',
  'answering',
  'something',
  'not',
  'correct.',
  'If',
  'you',
  "don't",
  'know',
  'the',
  'answer',
  'to',
  'a',
  'question,',
  'please',
  "don't",
  'share',
  'false',
  'information.\n<</SYS>>\ndo',
  'it',
  '[/INST]\n\n',
  'okay',
  'done',
  '</s>'],
 'labels': [-100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
 