# GPT2 Fine-Tuning Loop
**Last Edited On: 5/24/2023**<br>
**Last Edited By: Kyle Williams**

In [2]:
'''
Necessary Imports
'''
import os # Won't be necessary when uploading this to Colab

import pandas as pd # TODO: If we can eliminate pandas and save the file as a torch object, that would be nice
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup

In [None]:
'''
Load GPT2 and its corresponding tokenizer
'''
model_name = "gpt2-medium"  # You can also use "gpt2", "gpt2-large", or "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 ships without a padding token, so any padded batch encoding raises
# until one is assigned. Reuse the end-of-sequence token, which matches the
# pad_token_id handed to the model on the next line.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
'''
Load Training Data and Encode it
'''
# Load the training split from <parent of the working directory>/data/TRAIN.csv
cwd = os.getcwd()
parent_path = os.path.dirname(cwd) # removes the innermost folder (currently /experiments)
# os.path keeps this working regardless of the platform's path separator
train = pd.read_csv(os.path.join(parent_path, 'data', 'TRAIN.csv')) # TODO: we need to create a file with our X/y data

# TODO: if we eliminate pandas, we need to erase this call
train = train.drop(columns=['Unnamed: 0']) # the CSVs were saved with a leading index column that we can ignore

# The tokenizer expects a plain list of strings.
# NOTE(review): assumes the text to fine-tune on is the first remaining
# column of TRAIN.csv -- TODO confirm once the file's schema is settled.
text_column = train.columns[0]
dataset = train[text_column].astype(str).tolist()

# None lets the tokenizer fall back to its own maximum length when truncating
max_length = None

# Encoding the data
encoded_dataset = tokenizer.batch_encode_plus(
    dataset,  # Your downstream task dataset
    padding=True,
    truncation=True,
    max_length=max_length  # TODO: We need to find the maximum sequence length for our task because
                           # this will make things much more efficient. This will depend on our decided upon prompt.
)
# padding=True makes every row the same length, so the nested lists convert to rectangular tensors
input_ids = torch.tensor(encoded_dataset["input_ids"])
attention_mask = torch.tensor(encoded_dataset["attention_mask"])

In [None]:
'''
Configure Training Hyperparameters
'''
# Stochastic Gradient Descent Hyperparameters
# NOTE: these are placeholder starting values so the notebook runs end to end;
# tune them for the task before reporting results.
num_epochs = 3
batch_size = 8
learning_rate = 5e-5
num_warmup_steps = 0


# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)

# The scheduler is stepped once per batch, so it needs the total batch count.
# It is derived from the encoded data rather than from the DataLoader because
# the DataLoader is only built in a later cell; the ceiling division counts
# the final, possibly smaller, batch of each epoch.
steps_per_epoch = (len(input_ids) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * num_epochs

# TODO: Replace with LinearLR somehow
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
'''
Wrap the encoded tensors in a shuffled DataLoader (depends on batch_size)
'''
# Each sample pairs one row of input_ids with the matching row of attention_mask
train = torch.utils.data.TensorDataset(
    input_ids,
    attention_mask,
)
train_dataloader = DataLoader(
    dataset=train,
    shuffle=True,
    batch_size=batch_size,
)

In [None]:
'''
Training Loop
'''
# Upper bound on the global gradient norm used for clipping below.
# NOTE: placeholder starting value -- tune alongside the other hyperparameters.
max_grad_norm = 1.0

# Training loop
model.train()
for epoch in range(num_epochs):
    for step, batch in enumerate(train_dataloader):
        # The dataset holds (input_ids, attention_mask) pairs, so each batch
        # arrives as two tensors; move both to the training device
        batch_input_ids, batch_attention_mask = (tensor.to(device) for tensor in batch)

        # Language-modelling targets are the inputs themselves (the model
        # shifts them internally). Padded positions are set to -100 so the
        # loss ignores them; masking by attention_mask rather than by token id
        # keeps genuine end-of-sequence tokens in the loss even though the
        # pad token shares their id.
        labels = batch_input_ids.masked_fill(batch_attention_mask == 0, -100)

        # Clear gradients
        model.zero_grad()

        # Forward pass; supplying labels makes the model return the loss
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=labels,
        )

        # Compute loss
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Clip gradients to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # Update weights
        optimizer.step()

        # Update learning rate schedule
        scheduler.step()

        # Print loss or other metrics if desired
        print(f"Epoch {epoch+1}/{num_epochs}, Step {step+1}/{len(train_dataloader)}, Loss: {loss.item()}")

# Write the fine-tuned weights and the tokenizer files into the same directory
output_dir = "fine_tuned_model/"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)