# Data preparation

In [1]:
import pandas as pd
import datasets

from pprint import pprint
from transformers import AutoTokenizer

### Tokenizing text

In [2]:
# Load the tokenizer registered under the "EleutherAI/pythia-70m" model id.
# The cells below all reuse this single `tokenizer` instance.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

In [3]:
text = "Hi, how are you?"

In [4]:
# Calling the tokenizer returns a mapping; keep only the list of token ids.
encoded_text = tokenizer(text)["input_ids"]

In [5]:
print(encoded_text)

[12764, 13, 849, 403, 368, 32]


In [6]:
# Round trip: turn the token ids back into a string.
decoded_text = tokenizer.decode(encoded_text)
print("Decoded tokens back into text: ", decoded_text)

Decoded tokens back into text:  Hi, how are you?


### Tokenize multiple texts at once

In [7]:
# Passing a list tokenizes each string separately. No padding is requested,
# so the resulting id lists have different lengths.
list_texts = ["Hi, how are you?", "I'm good", "Yes"]
encoded_texts = tokenizer(list_texts)
print("Encoded several texts: ", encoded_texts["input_ids"])

Encoded several texts:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175], [4374]]


In [8]:
encoded_texts

{'input_ids': [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175], [4374]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1], [1]]}

### Padding and truncation

In [9]:
# Reuse the end-of-sequence token as the pad token so padding=True can be used.
# NOTE(review): presumably this tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
# padding=True extends every sequence to the length of the longest one in the batch.
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])

Using padding:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175, 0, 0, 0], [4374, 0, 0, 0, 0, 0]]


In [10]:
# Limit every sequence to at most 3 tokens. With the tokenizer's current
# setting the first tokens are kept and the tail is dropped.
encoded_texts_truncation = tokenizer(list_texts, max_length=3, truncation=True)
print("Using truncation: ", encoded_texts_truncation["input_ids"])

Using truncation:  [[12764, 13, 849], [42, 1353, 1175], [4374]]


In [11]:
# Truncate from the left instead, i.e. keep the last `max_length` tokens.
# This mutates the shared tokenizer, so every later call that truncates
# also does so from the left until the attribute is changed again.
tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(list_texts, max_length=3, truncation=True)
print("Using left-side truncation: ", encoded_texts_truncation_left["input_ids"])

Using left-side truncation:  [[403, 368, 32], [42, 1353, 1175], [4374]]


In [12]:
# Combine both options: sequences longer than 3 tokens are truncated, shorter
# ones are padded. The truncation side is whatever the tokenizer currently
# has set (it was switched to "left" earlier in this notebook).
encoded_texts_both = tokenizer(list_texts, max_length=3, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])

Using both padding and truncation:  [[403, 368, 32], [42, 1353, 1175], [4374, 0, 0]]


### Prepare instruction dataset

In [13]:
import pandas as pd
from pprint import pprint

# Read the JSON-lines file and turn the frame into a column-keyed mapping.
filename = "lamini_docs.jsonl"
instruction_dataset_df = pd.read_json(filename, lines=True)
examples = instruction_dataset_df.to_dict()

# Preview text for the first row: prompt column + completion column, using the
# first pair of columns that is present; otherwise fall back to "text".
column_pairs = (
    ("question", "answer"),
    ("instruction", "response"),
    ("input", "output"),
)
for prompt_column, completion_column in column_pairs:
    if prompt_column in examples and completion_column in examples:
        text = examples[prompt_column][0] + examples[completion_column][0]
        break
else:
    text = examples["text"][0]

prompt_template = """### Question:
{question}

### Answer:"""

# Wrap every question in the prompt template; answers are carried over as-is.
num_examples = len(examples["question"])
finetuning_dataset = [
    {
        "question": prompt_template.format(question=examples["question"][i]),
        "answer": examples["answer"][i],
    }
    for i in range(num_examples)
]

print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

One datapoint in the finetuning dataset:
{'answer': 'Lamini has documentation on Getting Started, Authentication, '
           'Question Answer Model, Python Library, Batching, Error Handling, '
           'Advanced topics, and class documentation on LLM Engine available '
           'at https://lamini-ai.github.io/.',
 'question': '### Question:\n'
             'What are the different types of documents available in the '
             'repository (e.g., installation guide, API documentation, '
             "developer's guide)?\n"
             '\n'
             '### Answer:'}


In [31]:
len(examples)

2

In [34]:
len(examples["question"])

1400

In [33]:
len(examples["answer"])

1400

In [27]:
len(finetuning_dataset)

1400

### Tokenize a single example

In [35]:
# Training text for the first example: the templated question followed
# directly (no separator) by its answer.
text = finetuning_dataset[0]["question"] + finetuning_dataset[0]["answer"]
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",  # NumPy output, so `.shape` can be used in the next cells
    padding=True  # NOTE(review): only one string is passed, so there is nothing to pad against — looks like a no-op; confirm
)
print(tokenized_inputs["input_ids"])

[[ 4118 19782    27   187  1276   403   253  1027  3510   273  7177  2130
    275   253 18491   313    70    15    72   904 12692  7102    13  8990
  10097    13 13722   434  7102  6177   187   187  4118 37741    27    45
   4988    74   556 10097   327 27669 11075   264    13  5271 23058    13
  19782 37741 10031    13 13814 11397    13   378 16464    13 11759 10535
   1981    13 21798 12989    13   285   966 10097   327 21708    46 10797
   2130   387  5987  1358    77  4988    74    14  2284    15  7280    15
    900 14206]]


In [36]:
# Cap the sequence length: use the example's own token count, but never
# more than 2048.
max_length = 2048
max_length = min(
    tokenized_inputs["input_ids"].shape[1],  # second axis = number of tokens in the example
    max_length,
)

In [37]:
tokenized_inputs["input_ids"].shape[1]

86

In [38]:
tokenized_inputs["input_ids"]

array([[ 4118, 19782,    27,   187,  1276,   403,   253,  1027,  3510,
          273,  7177,  2130,   275,   253, 18491,   313,    70,    15,
           72,   904, 12692,  7102,    13,  8990, 10097,    13, 13722,
          434,  7102,  6177,   187,   187,  4118, 37741,    27,    45,
         4988,    74,   556, 10097,   327, 27669, 11075,   264,    13,
         5271, 23058,    13, 19782, 37741, 10031,    13, 13814, 11397,
           13,   378, 16464,    13, 11759, 10535,  1981,    13, 21798,
        12989,    13,   285,   966, 10097,   327, 21708,    46, 10797,
         2130,   387,  5987,  1358,    77,  4988,    74,    14,  2284,
           15,  7280,    15,   900, 14206]])

### Tokenize the instruction dataset

In [19]:
def tokenize_function(examples, max_length=2048):
    """Tokenize a batch of examples for fine-tuning.

    Intended for ``Dataset.map(..., batched=True)``: ``examples`` maps column
    names to equal-length lists. The text of each row is its prompt column
    concatenated directly with its completion column ("question" + "answer",
    otherwise "input" + "output"), or the "text" column when neither pair is
    present.

    Every row of the batch is tokenized, so the result is correct for any
    ``batch_size``, not only ``batch_size=1``.

    Args:
        examples: Mapping of column name -> list of strings.
        max_length: Maximum number of tokens kept per row. Longer rows are
            truncated from the left, i.e. the end of the text is kept.

    Returns:
        The tokenizer's output for the batch, holding one un-padded entry
        per row (e.g. ``input_ids``).

    Raises:
        KeyError: If none of the supported columns is present.

    Side effects:
        Sets ``pad_token`` and ``truncation_side`` on the module-level
        ``tokenizer``.
    """
    if "question" in examples and "answer" in examples:
        texts = [
            prompt + completion
            for prompt, completion in zip(examples["question"], examples["answer"])
        ]
    elif "input" in examples and "output" in examples:
        texts = [
            prompt + completion
            for prompt, completion in zip(examples["input"], examples["output"])
        ]
    else:
        texts = list(examples["text"])

    # Not needed for the un-padded call below, but kept because this function
    # has always set it and later code may rely on a pad token being defined.
    tokenizer.pad_token = tokenizer.eos_token
    # Over-long rows lose their beginning rather than their end.
    tokenizer.truncation_side = "left"

    # One truncating pass is enough: truncation to `max_length` already yields
    # min(row_length, max_length) tokens per row, so there is no need to
    # tokenize once just to measure the length and then tokenize again.
    return tokenizer(texts, truncation=True, max_length=max_length)

In [20]:
# Load the same JSON-lines file as a `datasets` dataset (its "train" split).
finetuning_dataset_loaded = datasets.load_dataset("json", data_files=filename, split="train")

# Tokenize the dataset. batched=True with batch_size=1 hands tokenize_function
# a dict of one-element lists on every call, i.e. one example at a time.
tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True  # NOTE(review): with batch_size=1 there is no short final batch to drop, so this looks like a no-op — confirm
)

print(tokenized_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 1400
})


In [21]:
# Add a "labels" column that duplicates "input_ids".
# NOTE(review): presumably for causal-LM training, where targets are the inputs themselves — confirm against the training code.
# NOTE(review): this rebinds `tokenized_dataset`, so re-running the cell would try to add "labels" a second time — likely an error; confirm.
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [40]:
tokenized_dataset[1]

{'question': 'What is the recommended way to set up and configure the code repository?',
 'answer': 'Lamini can be downloaded as a python package and used in any codebase that uses python. Additionally, we provide a language agnostic REST API. We’ve seen users develop and train models in a notebook environment, and then switch over to a REST API to integrate with their production environment.',
 'input_ids': [1276,
  310,
  253,
  8521,
  1039,
  281,
  873,
  598,
  285,
  20486,
  253,
  2127,
  18491,
  32,
  45,
  4988,
  74,
  476,
  320,
  20582,
  347,
  247,
  15548,
  5522,
  285,
  908,
  275,
  667,
  2127,
  4793,
  326,
  4648,
  15548,
  15,
  9157,
  13,
  359,
  2085,
  247,
  3448,
  639,
  79,
  6932,
  30392,
  8990,
  15,
  844,
  457,
  306,
  2326,
  4212,
  1287,
  285,
  6194,
  3210,
  275,
  247,
  24849,
  3126,
  13,
  285,
  840,
  5234,
  689,
  281,
  247,
  30392,
  8990,
  281,
  19837,
  342,
  616,
  3275,
  3126,
  15],
 'attention_mask': [1,
  1,
  

### Prepare test/train splits

In [39]:
# Hold out 10% of the rows as a test split. Rows are shuffled first, and the
# fixed seed keeps the split the same across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


### Some datasets for you to try

In [41]:
# Load an already prepared dataset by path instead of building it locally.
# NOTE(review): "lamini/lamini_docs" is presumably a Hugging Face Hub repo id — confirm.
# Note: this rebinds `finetuning_dataset`, which earlier in the notebook held
# the list of prompt/answer dicts.
finetuning_dataset_path = "lamini/lamini_docs"
finetuning_dataset = datasets.load_dataset(finetuning_dataset_path)
print(finetuning_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


In [42]:
# Paths of further datasets to experiment with; they are loaded in the cells below.
taylor_swift_dataset = "lamini/taylor_swift"
bts_dataset = "lamini/bts"
open_llms = "lamini/open_llms"

In [46]:
# Load the dataset and print the second row (index 1) of its "train" split.
dataset_swiftie = datasets.load_dataset(taylor_swift_dataset)
print(dataset_swiftie["train"][1])

{'question': 'What is the most popular Taylor Swift song among millennials? How does this song relate to the millennial generation? What is the significance of this song in the millennial culture?', 'answer': 'Taylor Swift\'s "Shake It Off" is the most popular song among millennials. This song relates to the millennial generation as it is an anthem of self-acceptance and embracing one\'s individuality. The song\'s message of not letting others bring you down and to just dance it off resonates with the millennial culture, which is often characterized by a strong sense of individuality and a rejection of societal norms. Additionally, the song\'s upbeat and catchy melody makes it a perfect fit for the millennial generation, which is known for its love of pop music.', 'input_ids': [1276, 310, 253, 954, 4633, 11276, 24619, 4498, 2190, 24933, 8075, 32, 1359, 1057, 436, 4498, 14588, 281, 253, 24933, 451, 5978, 32, 1737, 310, 253, 8453, 273, 436, 4498, 275, 253, 24933, 451, 4466, 32, 37979, 24

In [48]:
# Load the dataset and print the second row (index 1) of its "train" split.
btsie = datasets.load_dataset(bts_dataset)
print(btsie["train"][1])

{'question': "What is Seventeen's biggest hit song?", 'answer': 'Seventeen\'s biggest hit song is "Don\'t Wanna Cry", which was released in 2017 as the lead single from their fourth EP Al1. The song\'s music video became Seventeen\'s first to reach 200 million views on YouTube.', 'input_ids': [1276, 310, 38297, 9673, 434, 5962, 4352, 4498, 32, 52, 8045, 9673, 434, 5962, 4352, 4498, 310, 346, 5498, 626, 411, 9045, 34712, 995, 534, 369, 4439, 275, 4240, 347, 253, 1421, 2014, 432, 616, 7002, 16602, 1219, 18, 15, 380, 4498, 434, 3440, 3492, 3395, 38297, 9673, 434, 806, 281, 3986, 1052, 3041, 6849, 327, 15167, 15], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1276, 310, 38297, 9673, 434, 5962, 4352, 4498, 32, 52, 8045, 9673, 434, 5962, 4352, 4498, 310, 346, 5498, 626, 411, 9045, 34712, 995, 534, 369, 4439, 275, 4240, 347, 253, 1421, 

In [49]:
# Load the dataset and print the second row (index 1) of its "train" split.
open_llms_ie = datasets.load_dataset(open_llms)
print(open_llms_ie["train"][1])

{'question': 'EleutherAI-gpt-neox-20b: EleutherAI-gpt-neox-20b: EleutherAI-gpt-neox-20b: What is the architecture of GPT-NeoX-20B?', 'answer': "GPT-NeoX-20B's architecture intentionally resembles that of GPT-3, and is almost identical to that of GPT-J-6B.", 'input_ids': [30377, 16580, 18128, 14, 72, 431, 14, 570, 1004, 14, 938, 67, 27, 13173, 16580, 18128, 14, 72, 431, 14, 570, 1004, 14, 938, 67, 27, 13173, 16580, 18128, 14, 72, 431, 14, 570, 1004, 14, 938, 67, 27, 1737, 310, 253, 10336, 273, 443, 5736, 14, 6560, 80, 57, 14, 938, 35, 32, 40, 5736, 14, 6560, 80, 57, 14, 938, 35, 434, 10336, 23209, 29217, 326, 273, 443, 5736, 14, 20, 13, 285, 310, 2761, 8931, 281, 326, 273, 443, 5736, 14, 43, 14, 23, 35, 15], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# This is how to push your own dataset to the Hugging Face Hub.
# (`dataset_path_hf` is not defined above; set it to your target dataset repo id first.)
# !pip install huggingface_hub
# !huggingface-cli login
# split_dataset.push_to_hub(dataset_path_hf)