### Import libraries

In [None]:
!pip install -q datasets
!pip install -q transformers
!pip install -q peft

In [None]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM, AutoModelForQuestionAnswering
from peft import LoraConfig, get_peft_model
import math

### Load model

In [None]:
# Hub id of the pretrained checkpoint that is fine-tuned below.
model_checkpoint = 'distilbert/distilgpt2'
model = GPT2LMHeadModel.from_pretrained(model_checkpoint)
# `use_fast` is only honoured by AutoTokenizer; the concrete GPT2Tokenizer class
# is the slow Python implementation and ignores the flag. Going through
# AutoTokenizer actually yields the fast tokenizer for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
# Echo the model object so the notebook prints its representation.
model

In [None]:
# Show the parameter count reported by the loaded model.
model.num_parameters()

In [None]:
# Echo the tokenizer object so the notebook prints its configuration.
tokenizer

### Test before fine-tuning

In [None]:
# Baseline: see how the pretrained model completes a QA-style prompt before
# any fine-tuning.
model.eval()
max_length = 30  # upper bound on prompt + generated tokens
prompt = "There are two pencils in the box. Q: How many pencils? A:"
# Call the tokenizer directly (instead of .encode) so the attention mask is
# returned together with the ids.
encoded = tokenizer(prompt, return_tensors="pt")
input_ids = encoded["input_ids"]

# GPT-2 has no pad token, so EOS is used for padding. Because pad == EOS,
# generate() cannot infer the attention mask on its own; pass it explicitly.
outputs = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    max_length=max_length,
    pad_token_id=tokenizer.eos_token_id,
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

### Load dataset

In [None]:
# Load the SQuAD question-answering dataset by its Hub id; the cells below use
# its 'train' and 'validation' splits.
dataset = load_dataset("rajpurkar/squad")
# Echo the object so the notebook shows what was loaded.
dataset

In [None]:
# Peek at the first training record to see the available columns.
dataset['train'][:1]

### Preprocess the data

In [None]:
# Remove unused columns
# ("id" and "title" are not read by the tokenization step, which only uses
# "context", "question" and "answers".)
dataset = dataset.remove_columns(["id", "title"])
# Reduce dataset size
# Shuffle each split with a fixed seed and keep the first 10,000 training and
# 2,000 validation examples, so the same subset is selected on every run.
dataset['train'] = dataset['train'].shuffle(seed=42).select(range(10000))
dataset['validation'] = dataset['validation'].shuffle(seed=42).select(range(2000))

In [None]:
# Map of special-token names (e.g. "eos_token") to their string values.
special_tokens = tokenizer.special_tokens_map


def tokenize_function(input_data):
    """Tokenize one SQuAD example as ``"<context> Q: <question> A: <answer><eos>"``.

    The layout deliberately matches the inference prompts used in this
    notebook (``"... Q: How many pencils? A:"``), including the space after
    ``Q:`` and ``A:``, so that what the model sees at generation time is
    tokenized the same way as what it was trained on.

    Args:
        input_data: A single (un-batched) example with ``"context"``,
            ``"question"`` and ``"answers"`` fields; only the first answer
            text is used.

    Returns:
        The tokenizer output for the formatted string.
    """
    answer = input_data["answers"]["text"][0]
    # Terminate with the end-of-sequence token so the model learns where an
    # answer stops. (For GPT-2 the BOS and EOS strings are the same, but EOS
    # is the one that is meant here.)
    text = (
        f'{input_data["context"]} Q: {input_data["question"]} '
        f'A: {answer}{special_tokens["eos_token"]}'
    )
    # NOTE(review): truncation=True is given without max_length, so the limit
    # presumably falls back to the tokenizer's model maximum; an example that
    # long would lose its trailing answer and EOS. Confirm if long contexts
    # matter.
    return tokenizer(text, truncation=True)

In [None]:
# Tokenize every example one at a time (map is not batched here) and drop the
# raw text columns so only the tokenizer outputs remain.
dataset = dataset.map(tokenize_function, remove_columns=['context', 'question', 'answers'])
# Echo the object to check the resulting columns.
dataset

In [None]:
def reallocate_tokens(input_data, max_token_size=128):
    """Pack a batch of tokenized examples into fixed-size blocks.

    All sequences of each column are concatenated in order and the result is
    cut into consecutive blocks of exactly ``max_token_size`` items. The
    trailing remainder that does not fill a whole block is dropped. A
    ``labels`` column is added as a copy of ``input_ids``, so the blocks can
    be fed to a causal language model as-is.

    Args:
        input_data: Mapping of column name to a list of per-example lists
            (the shape ``datasets.map(..., batched=True)`` passes in). It
            must contain ``"input_ids"``, and all columns are expected to
            have the same flattened length.
        max_token_size: Number of items per block; must be positive.

    Returns:
        A dict with the same columns plus ``"labels"``, each a list of
        blocks of length ``max_token_size``.

    Raises:
        ValueError: If ``max_token_size`` is not positive.
        KeyError: If ``input_data`` has no ``"input_ids"`` column.
    """
    from itertools import chain  # local import keeps the cell self-contained

    if max_token_size < 1:
        raise ValueError(f"max_token_size must be positive, got {max_token_size}")

    # Flatten in linear time. sum(lists, []) re-copies the growing accumulator
    # for every example, which is quadratic in the batch's token count.
    combined_tokens = {
        key: list(chain.from_iterable(values))
        for key, values in input_data.items()
    }

    # Any column measures the batch; round down to a whole number of blocks.
    total_length = len(next(iter(combined_tokens.values()), []))
    total_length = (total_length // max_token_size) * max_token_size

    token_blocks = {
        key: [tokens[i: i + max_token_size] for i in range(0, total_length, max_token_size)]
        for key, tokens in combined_tokens.items()
    }

    # Copy each block so labels never alias the input_ids lists.
    token_blocks['labels'] = [list(block) for block in token_blocks['input_ids']]
    return token_blocks


# Re-chunk the tokenized examples into fixed-size blocks. batched=True hands
# reallocate_tokens a group of examples per call; the leftover tokens of each
# group that do not fill a whole block are discarded by that function.
dataset = dataset.map(reallocate_tokens,batched=True)

In [None]:
# Echo the dataset to check the row counts after packing into blocks.
dataset

In [None]:
# Sanity check: decode the first packed training block back to text to verify
# the concatenated "context / Q / A" layout.
decoded_text = tokenizer.decode(dataset['train'][0]['input_ids'])
decoded_text

In [None]:
# Short names for the two splits handed to the Trainer below.
train_dataset = dataset['train']
eval_dataset = dataset['validation']

In [None]:
# Echo the training split (columns and number of blocks).
train_dataset

In [None]:
# Echo the validation split (columns and number of blocks).
eval_dataset

### Fine-tuning

In [None]:
# GPT-2 has no padding token. Reuse the end-of-text token instead of adding a
# new '[PAD]' entry: a new token would enlarge the tokenizer's vocabulary past
# the model's embedding matrix unless model.resize_token_embeddings() were
# also called, so any '[PAD]' id reaching the model would be out of range.
tokenizer.pad_token = tokenizer.eos_token

training_args = TrainingArguments(
    f'./{model_checkpoint}-ft',  # output directory for training checkpoints
    eval_strategy="epoch",       # evaluate on eval_dataset after every epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,           # keep everything local
)

# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class`; confirm against the installed version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the arguments configured above (evaluation is set to
# run once per epoch via eval_strategy="epoch").
trainer.train()

### Save the fine-tuned model

In [None]:
# Write the fine-tuned model to ./distilgpt2-ft (relative to the working
# directory). NOTE(review): the test cell also loads the tokenizer from this
# directory, which presumably relies on Trainer saving the tokenizer it was
# given -- confirm the tokenizer files are present after this call.
trainer.save_model("distilgpt2-ft")

### Test the fine-tuned model

In [None]:
# Reload the fine-tuned model and tokenizer from disk and repeat the same
# prompt used before fine-tuning, so the two outputs can be compared.
model_checkpoint = "distilgpt2-ft" # You can also load the model from checkpoints
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

model.eval()
max_length = 30  # upper bound on prompt + generated tokens
prompt = "There are two pencils in the box. Q: How many pencils? A:"
# Call the tokenizer directly (instead of .encode) so the attention mask is
# returned together with the ids.
encoded = tokenizer(prompt, return_tensors="pt")
input_ids = encoded["input_ids"]

# Padding uses the EOS id here, so generate() cannot infer the attention mask
# from the input; pass it explicitly.
outputs = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    max_length=max_length,
    pad_token_id=tokenizer.eos_token_id,
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)