# 4. LLM Fine-Tuning (Stage 2)

**Objective:** Fine-tune the main language model (e.g., GPT-2) on the `assorted_train.jsonl` dataset produced by the previous notebook (`03_preprocessing_assorted.ipynb`). We will use the Hugging Face `Trainer` for this.

In [None]:
# Install third-party packages for this notebook via the IPython `%pip` magic.
%pip install datasets transformers torch accelerate

In [None]:
import sys
import os
import torch
from transformers import Trainer, TrainingArguments

# Add the parent of the current working directory to sys.path so the 'src' package is importable
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.utils import (
    get_llm_tokenizer, MAX_SEQ_LEN, LLM_MODEL_NAME, 
    PATH_LLM_MODEL, PATH_PROCESSED_DATA
)
from src.dataset import AssortedDataset
from src.model.transformer import get_llm_model

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

## 4.1 Load Tokenizer and Assorted Dataset

In [None]:
tokenizer = get_llm_tokenizer()

# Build the training set from the preprocessed file. If the file is missing,
# print a hint and re-raise: swallowing the error would leave `train_dataset`
# undefined and surface later as an unrelated NameError in the Trainer cell.
try:
    train_dataset = AssortedDataset(
        tokenizer,
        file_path=PATH_PROCESSED_DATA,
        max_length=MAX_SEQ_LEN,
    )
except FileNotFoundError:
    print(f"ERROR: Processed data not found at {PATH_PROCESSED_DATA}")
    print("Please run '03_preprocessing_assorted.ipynb' first.")
    raise
else:
    print(f"Loaded {len(train_dataset)} assorted samples.")

## 4.2 Load LLM and Resize Embeddings

In [None]:
# The helper both loads the model and resizes its token embeddings to match
# the tokenizer length passed in.
model = get_llm_model(model_name=LLM_MODEL_NAME, tokenizer_len=len(tokenizer))
model = model.to(device)

# Report the total parameter count in millions.
n_params = sum(p.numel() for p in model.parameters())
print(f"LLM parameters: {n_params/1e6:.2f}M")

## 4.3 Set Up Trainer and Run

In [None]:
# --- Training Configuration ---
NUM_TRAIN_EPOCHS = 1 # Increase for a real run
PER_DEVICE_TRAIN_BATCH_SIZE = 4 # Adjust based on your GPU memory
LEARNING_RATE = 2e-5

training_args = TrainingArguments(
    output_dir=PATH_LLM_MODEL,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    logging_steps=20,
    save_steps=100,
    report_to="none",
    # `device` is a torch.device, which never compares equal to a plain
    # string, so test its `.type` to enable mixed precision on CUDA only.
    fp16=device.type == "cuda",
    # Gradients are accumulated over 2 batches before each optimizer step.
    gradient_accumulation_steps=2,
)

# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour
# of `processing_class=`; kept as-is for compatibility with the installed
# version -- confirm before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

print("--- Starting LLM Fine-Tuning ---")
trainer.train()
print("--- LLM Training Complete ---")

## 4.4 Save Final Model

Save the final model and tokenizer to the experiments directory.

In [None]:
# Write the model (through the Trainer) and the tokenizer to the same
# output directory.
output_dir = PATH_LLM_MODEL
print(f"Saving final LLM to {output_dir}")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("Final model saved.")