In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
import torch




In [7]:
# helper function to print trainable parameters and total parameters
def print_model_stats(model):
    """Print the total and trainable parameter counts of ``model``.

    Both counts are reported in millions, followed by the fraction of
    parameters that are trainable.

    Args:
        model: Object exposing ``parameters()`` whose items provide
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).
    """
    n_all = 0
    n_trainable = 0
    # Single pass: every parameter counts towards the total, and those with
    # requires_grad set also count towards the trainable subtotal.
    for param in model.parameters():
        size = param.numel()
        n_all += size
        if param.requires_grad:
            n_trainable += size

    million = 1e6
    print(f"Total Parameters: {n_all / million:.2f}M")
    print(f"Trainable Parameters (LoRA): {n_trainable / million:.2f}M")
    print(f"Trainable Ratio: {n_trainable / n_all:.4f}")

#### Load model

In [5]:
# 2. Load tokenizer and model
# Both are fetched by the same pretrained checkpoint name so they stay in sync.
model_name = "openai-community/gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Fix padding: reuse the end-of-sequence token as the pad token and point the
# model config at the same id.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
# Size the embedding table to the tokenizer length. No new token is added
# above (the pad token reuses EOS); the value of this call is what the cell
# displays as its output.
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 1024)

### Apply LoRA

In [8]:
# 3. Apply LoRA
# Wrap the base model with a low-rank adapter; print_model_stats below reports
# how many parameters remain trainable after wrapping.
# NOTE: `model` is rebound to the wrapped model, so re-running this cell would
# pass the already-wrapped model back into get_peft_model. Re-run the
# model-loading cell first if this cell needs to be executed again.
lora_config = LoraConfig(
    r=8,                        # rank of the low-rank update matrices
    lora_alpha=16,              # scaling numerator (update is scaled by lora_alpha / r)
    target_modules=["c_attn"],  # GPT-2 specific
    # GPT-2 implements c_attn as a Conv1D layer whose weight is stored as
    # (fan_in, fan_out); the PEFT documentation asks for this flag in that case.
    fan_in_fan_out=True,
    lora_dropout=0.05,
    bias="none",                # leave bias terms frozen
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)
print_model_stats(model)

Total Parameters: 355.61M
Trainable Parameters (LoRA): 0.79M
Trainable Ratio: 0.0022


In [3]:
# helper function to load dataset (optimized)
def load_bio_dataset(file_path, tokenizer, block_size=128):
    """Tokenize a text file and pack it into fixed-length blocks.

    The whole file is read and encoded into one flat sequence of token ids,
    which is then cut into consecutive, non-overlapping blocks of
    ``block_size`` tokens. Trailing tokens that do not fill a complete block
    are dropped.

    Args:
        file_path: Path of a UTF-8 encoded text file.
        tokenizer: Object with an ``encode(text, add_special_tokens=...)``
            method returning a list of integer token ids.
        block_size: Number of tokens per block; must be positive.

    Returns:
        A ``torch.utils.data.TensorDataset`` whose items are
        ``(input_ids, attention_mask, labels)`` triples of ``torch.long``
        tensors of shape ``(block_size,)``. ``labels`` is the same tensor as
        ``input_ids`` and the attention mask is all ones, since no padding is
        introduced.

    Raises:
        ValueError: If ``block_size`` is not positive, or if the file yields
            fewer than ``block_size`` tokens (which would otherwise produce an
            empty dataset and fail later in training).
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size}")

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Encode the full text in one call (no padding/truncation). The encode call
    # may log a sequence-length warning for long files; only block_size-long
    # slices are returned from this function.
    tokens = tokenizer.encode(text, add_special_tokens=True)

    # Keep only as many tokens as fill whole blocks.
    num_blocks = len(tokens) // block_size
    if num_blocks == 0:
        raise ValueError(
            f"{file_path!r} produced {len(tokens)} tokens, "
            f"which is fewer than one block of {block_size}"
        )

    # Explicit dtype so the ids are int64 regardless of what encode() returned.
    input_ids = torch.tensor(
        tokens[:num_blocks * block_size], dtype=torch.long
    ).view(num_blocks, block_size)
    attention_mask = torch.ones_like(input_ids)  # every position is a real token

    return torch.utils.data.TensorDataset(input_ids, attention_mask, input_ids)

### Load dataset

In [9]:
# Load dataset
# block_size is left at the helper's default of 128, so every sample is one
# 128-token block of the text file.
dataset = load_bio_dataset("./data/shakespeare.txt",
                           tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


In [29]:
from torch.utils.data import DataLoader, random_split


# Calculate split sizes (90% train, 10% test)
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size

# Use a fixed seed so the same blocks land in each split on every run
generator = torch.Generator().manual_seed(42)

# Split the dataset
train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=generator)

# Create DataLoaders for batching.
# Training batches are reshuffled every epoch. The evaluation loader keeps a
# fixed order: the validation loss is averaged per batch and the last batch can
# be smaller, so shuffling would make that average vary between epochs even if
# the model did not change.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, pin_memory=True, num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, pin_memory=True, num_workers=4)

In [30]:
# Number of batches in the training and evaluation loaders.
len(train_dataloader), len(test_dataloader)

(149, 17)

### Define optimizer

In [31]:
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# Hand the optimizer only the parameters that are still trainable
# (requires_grad=True); everything else stays out of the optimizer.
trainable_params = list(filter(lambda p: p.requires_grad, model.parameters()))

optimizer = AdamW(params=trainable_params, lr=5e-5, weight_decay=1e-4)

In [32]:
from transformers import get_linear_schedule_with_warmup

# Learning-rate schedule: 100 warmup steps, then linear decay over the rest of
# the run. Total steps = 10 epochs x optimizer steps per epoch.
# Note: the training cell builds its own scheduler with the same settings.
num_training_steps = 10 * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=num_training_steps,
)

In [33]:
import os
# Switch off the tokenizers library's internal parallelism for this process.
# NOTE(review): presumably set so the DataLoader worker processes started during
# training do not trigger fork-related tokenizer warnings - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [34]:
# Display which device was selected for training.
device

device(type='cuda')

### Train

In [35]:
import torch
from torch.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup


# Run settings gathered in one place so the schedule, the epoch loop and the
# clipping threshold all read the same values.
num_epochs = 10
num_warmup_steps = 100
max_grad_norm = 1.0

# Mixed precision follows the selected device: enabled on CUDA, switched off
# elsewhere so the same loop runs in full precision.
use_amp = device.type == "cuda"
scaler = GradScaler(device.type, enabled=use_amp)

# Linear warmup then linear decay, advanced once per optimizer step.
# Built in this cell so that re-running the cell restarts the schedule.
num_training_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

# Early-stopping state: stop after `patience` consecutive epochs whose
# validation loss fails to beat the best one by more than `min_delta`.
patience = 5
min_delta = 0.001
best_val_loss = float('inf')
patience_counter = 0

# Snapshot of the trainable weights at the lowest validation loss seen so far.
# Tracked separately from the early-stopping threshold so that restoring it can
# never yield a higher validation loss than the final epoch's weights.
lowest_val_loss = float('inf')
best_trainable_state = None

model.to(device)

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()

        with autocast(device.type, enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()

        # Unscale before clipping so the threshold applies to the real
        # gradient magnitudes rather than the loss-scaled ones.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)

        scaler.step(optimizer)
        scaler.update()

        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    # ---- Validation pass ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss = outputs.loss
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(test_dataloader)

    # Keep a copy of the trainable weights whenever validation reaches a new low.
    if avg_val_loss < lowest_val_loss:
        lowest_val_loss = avg_val_loss
        best_trainable_state = {
            name: param.detach().clone()
            for name, param in model.named_parameters()
            if param.requires_grad
        }

    # Early stopping bookkeeping
    if best_val_loss - avg_val_loss > min_delta:
        best_val_loss = avg_val_loss
        patience_counter = 0
        print(f"Epoch {epoch+1}: Validation loss improved to {avg_val_loss:.4f}, resetting patience.")
    else:
        patience_counter += 1
        print(f"Epoch {epoch+1}: No improvement in validation loss. Patience {patience_counter}/{patience}")

    # Report the epoch before deciding whether to stop, so the epoch that
    # triggers early stopping is logged as well.
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

    if patience_counter >= patience:
        print(f"Early stopping triggered at epoch {epoch+1}.")
        break

# Put back the weights from the epoch with the lowest validation loss, so that
# what gets saved afterwards is the best model rather than whatever the last
# epoch left behind. The model is still in eval mode from the final validation pass.
if best_trainable_state is not None:
    with torch.no_grad():
        for name, param in model.named_parameters():
            if name in best_trainable_state:
                param.copy_(best_trainable_state[name])
    print(f"Restored trainable weights from the lowest validation loss ({lowest_val_loss:.4f}).")


Epoch 1: Validation loss improved to 3.8195, resetting patience.
Epoch 1 - Train Loss: 4.2545 - Val Loss: 3.8195
Epoch 2: Validation loss improved to 3.6736, resetting patience.
Epoch 2 - Train Loss: 3.8401 - Val Loss: 3.6736
Epoch 3: Validation loss improved to 3.6077, resetting patience.
Epoch 3 - Train Loss: 3.7351 - Val Loss: 3.6077
Epoch 4: Validation loss improved to 3.5801, resetting patience.
Epoch 4 - Train Loss: 3.6877 - Val Loss: 3.5801
Epoch 5: Validation loss improved to 3.5574, resetting patience.
Epoch 5 - Train Loss: 3.6596 - Val Loss: 3.5574
Epoch 6: Validation loss improved to 3.5469, resetting patience.
Epoch 6 - Train Loss: 3.6428 - Val Loss: 3.5469
Epoch 7: Validation loss improved to 3.5238, resetting patience.
Epoch 7 - Train Loss: 3.6274 - Val Loss: 3.5238
Epoch 8: No improvement in validation loss. Patience 1/5
Epoch 8 - Train Loss: 3.6184 - Val Loss: 3.5295
Epoch 9: No improvement in validation loss. Patience 2/5
Epoch 9 - Train Loss: 3.6094 - Val Loss: 3.5246

#### Save model

In [36]:
# Write the trained model and the tokenizer files into the same output folder.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably stores the LoRA adapter rather than the full base weights - confirm
# before treating the folder as a standalone checkpoint.
model.save_pretrained("gpt2-medium-lora-shakespeare")
tokenizer.save_pretrained("gpt2-medium-lora-shakespeare")

('gpt2-medium-lora-shakespeare/tokenizer_config.json',
 'gpt2-medium-lora-shakespeare/special_tokens_map.json',
 'gpt2-medium-lora-shakespeare/vocab.json',
 'gpt2-medium-lora-shakespeare/merges.txt',
 'gpt2-medium-lora-shakespeare/added_tokens.json',
 'gpt2-medium-lora-shakespeare/tokenizer.json')

#### Run this cell only on Kaggle; skip it otherwise

In [37]:
import shutil

import os

# Folder written by the save step when the notebook runs from Kaggle's working directory.
folder_to_zip = '/kaggle/working/gpt2-medium-lora-shakespeare'

# Archive only when that folder exists, so running every cell outside Kaggle
# does not fail on this step.
if os.path.isdir(folder_to_zip):
    # make_archive appends ".zip" to the base name, so the archive is written
    # next to the folder as 'gpt2-medium-lora-shakespeare.zip'.
    archive_path = shutil.make_archive(folder_to_zip, 'zip', folder_to_zip)
else:
    archive_path = None
    print(f"Skipping archive: {folder_to_zip} not found (not running on Kaggle?).")

# Display the archive path (nothing is shown when the step was skipped).
archive_path


'/kaggle/working/gpt2-medium-lora-shakespeare.zip'