In [None]:
from transformers import AutoTokenizer

# Module-level tokenizer for the "gpt2" checkpoint, loaded through
# AutoTokenizer.from_pretrained and shared by the tokenization code below.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def generate_prompt(data_point):
    """Build an instruction-following prompt from a single example.

    Args:
        data_point: Mapping with "instruction" and "output" entries and an
            optional "input" entry carrying extra context for the task.

    Returns:
        The formatted prompt string. When "input" is present and non-empty
        the template includes an "### Input:" section; otherwise the shorter
        template without that section is used.

    Raises:
        KeyError: If "instruction" or "output" is missing.
    """
    # The two templates differ only in the Input section, so the choice must
    # depend on whether context was supplied, not on the instruction (which
    # both templates render). .get() also covers examples lacking the key.
    if data_point.get("input"):
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""

    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""


# Small hand-written examples used to demonstrate prompt building and
# tokenization; each has an instruction, supporting input, and target output.
data = [
    {
        "instruction": "Write a summary of the given article.",
        "input": "Article: Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        "output": "A summary of the article should capture the main points and key information.",
    },
    {
        "instruction": "Translate the following sentence to French.",
        "input": "Sentence: Hello, how are you?",
        "output": "Translation: Bonjour, comment ça va ?",
    },
]

# padding="max_length" requires a pad token. If the tokenizer has none,
# reuse the end-of-sequence token so padding does not raise; the returned
# attention mask still marks padded positions with 0.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenized_data = []

for data_point in data:
    prompt = generate_prompt(data_point)
    # Call the tokenizer directly instead of the deprecated encode_plus.
    # Every prompt is truncated/padded to exactly 512 tokens and returned
    # as PyTorch tensors.
    tokenized_prompt = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    tokenized_data.append(
        {
            "prompt": tokenized_prompt["input_ids"],
            "attention_mask": tokenized_prompt["attention_mask"],
        }
    )

print(tokenized_data)


In [None]:
'''
Prompt---->
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Write a summary of the given article.

### Input:
Article: Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam ac gravida nulla. Integer non velit sed nisl fringilla consequat. Donec feugiat metus vitae velit ullamcorper, ut interdum ligula gravida.

### Response:
A summary of the article should capture the main points and key information.

'''

In [None]:
'''
tokenized_data---->
[
    {
        'prompt': tensor([[   61,  3350,    61,  1537,  2570,   338,  2853,  2794,   462,  1467,
            147,  4597,   257,  2848,  1107,  1210,  1537,  2570,    13,     1,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0

'''