## INSTALLING NECESSARY LIBRARIES

In [None]:
# Install required packages (first cell in Colab)
!pip install -q transformers datasets accelerate evaluate sentencepiece
# If you need a specific torch version, install it explicitly (only if necessary)
# !pip install -q torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html


## IMPORTS AND GPU CHECKS

In [None]:
# First I am importing all the necessary libraries that I will use in this notebook.
# os, math, and random are standard Python libraries.
# pprint is just for printing things in a more readable format.
# torch is the PyTorch library which is the backend for Hugging Face models.

import os
import math
import random
from pprint import pprint
import torch

# From the Hugging Face transformers library,
# I am importing the GPT-2 model, its tokenizer,
# a collator for data batching, the Trainer API for training,
# the TrainingArguments class to configure training,
# and set_seed for reproducibility.

from transformers import (
    GPT2LMHeadModel,            # the GPT-2 model with a language modeling head
    GPT2TokenizerFast,          # the GPT-2 tokenizer
    DataCollatorForLanguageModeling,  # batches text data into the right shape for training
    Trainer,                    # high-level training/evaluation loop
    TrainingArguments,          # to pass all training hyperparameters
    set_seed                    # to make results reproducible
)

# From the datasets library, I import load_dataset to easily load AG News dataset.
# From evaluate, I import evaluation metrics like BLEU.

from datasets import load_dataset
import evaluate

# Reproducibility: fix one seed and apply it both through transformers' set_seed
# helper and to Python's own `random` module.
SEED = 42
set_seed(SEED)
random.seed(SEED)

# Pick the compute device once; later cells move models and tensors onto it.
# Falls back to CPU when CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected (and the name of GPU 0 when running on CUDA).
print("Device:", device)
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))   # name of CUDA device index 0


## LOADING THE DATASET AND THE GPT-2 MODEL

In [None]:
# Load the AG News dataset through the Hugging Face `datasets` library.
# It is used here purely as a source of news text for language modeling.

raw = load_dataset("ag_news")

# `raw` is keyed by split name; print the keys to see which splits exist.
print("Dataset splits: ", raw.keys())

# Show the first training record to check which fields each example carries.
# pprint is used only for more readable output.
pprint(raw["train"][0])


Preparing the train/validation split and keeping only the text

In [None]:
# Each record has a "text" and a "label" column. The labels (news categories)
# are not needed for language modeling, so the "label" column is dropped.
# Note: the dataset's "test" split is what this notebook uses as its
# validation set (`val_ds`).

train_ds = raw["train"].remove_columns("label")
val_ds = raw["test"].remove_columns("label")

# Report the number of examples in each split.
print("Train size:", len(train_ds),
      "Validation size:", len(val_ds))

# Print one example to confirm that only the text remains.
print("Example Text:", train_ds[0]["text"])


In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Base checkpoint to load. "gpt2" is the small GPT-2 (124M parameters), which is
# lightweight and Colab-friendly; a larger GPT-2 variant could be substituted here.
MODEL_NAME = "gpt2"

# Load the matching tokenizer.
# The tokenizer converts raw text into token IDs that the model consumes.
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)

# If the tokenizer has no pad token, reuse the end-of-sequence (EOS) token as the
# pad token so that batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained GPT-2 weights with the language-modeling head.
# This instance stays un-fine-tuned and is used as the baseline in later cells.
base_model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
# NOTE(review): the pad token above is an alias of the existing EOS token, not a
# new vocabulary entry, so this resize is expected to be a no-op - kept as a safeguard.
base_model.resize_token_embeddings(len(tokenizer))

# Record the pad token id in the model config so the model and tokenizer agree on it.
base_model.config.pad_token_id = tokenizer.pad_token_id

# Move the model to the selected device (GPU if available, otherwise CPU).
base_model.to(device)


## TOKENIZING THE DATASET

In [None]:
# First, I define a helper function to tokenize the text.
# It takes the "text" field from the dataset and converts it into token IDs using the GPT-2 tokenizer.
# I also enable truncation so that very long texts don’t break the tokenizer.
def tokenize_batch(examples):
    """Tokenize the "text" column of a batch with the notebook's tokenizer.

    Args:
        examples: A batch mapping that contains a "text" entry.

    Returns:
        Whatever the tokenizer returns for those texts, with truncation enabled
        so over-long inputs are cut rather than passed through at full length.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, truncation=True)
    return encoded

# Tokenize both splits with tokenize_batch.
# batched=True passes many examples per call instead of one at a time.
# remove_columns drops every original column (the raw text), so only the
# tokenizer's outputs remain in the resulting datasets.
tokenized_train = train_ds.map(tokenize_batch, batched=True, remove_columns=train_ds.column_names)
tokenized_val = val_ds.map(tokenize_batch, batched=True, remove_columns=val_ds.column_names)

# Context length (in tokens) of every training/evaluation block.
# 512 was chosen because the notebook is run on a large-memory A100 GPU.
block_size = 512

def group_texts(examples):
    """Concatenate a batch of token-id lists and cut it into fixed-size blocks.

    All "input_ids" lists in the batch are joined end to end (no separator is
    inserted here), the tail that does not fill a whole block is dropped, and
    the rest is split into consecutive chunks of ``block_size`` tokens.

    Args:
        examples: Batch mapping with an "input_ids" entry holding a list of
            token-id lists.

    Returns:
        A dict with
          * "input_ids": list of chunks, each exactly ``block_size`` long;
          * "attention_mask": an all-ones mask of the same shape (blocks are
            never padded, so every position is a real token).
        Both lists are empty when the batch holds fewer than ``block_size``
        tokens in total.
    """
    # Local import keeps this cell self-contained.
    from itertools import chain

    # Flatten in linear time; sum(list_of_lists, []) re-copies the accumulator
    # on every addition and is quadratic in the number of tokens.
    all_ids = list(chain.from_iterable(examples["input_ids"]))

    # Largest multiple of block_size that fits; leftover tokens are dropped.
    total_len = (len(all_ids) // block_size) * block_size
    chunks = [all_ids[start:start + block_size] for start in range(0, total_len, block_size)]
    return {
        "input_ids": chunks,
        "attention_mask": [[1] * block_size for _ in chunks],
    }

# Regroup both tokenized splits into fixed-length blocks with group_texts.
# batched=True with batch_size=1000 means each call receives up to 1000
# tokenized examples; leftover tokens are dropped per call, not once per split.
lm_train = tokenized_train.map(group_texts, batched=True, batch_size=1000)
lm_val = tokenized_val.map(group_texts, batched=True, batch_size=1000)

# Check how many training/validation blocks were produced.
print("Training blocks:", len(lm_train))
print("Validation blocks:", len(lm_val))


## FEW-SHOT PROMPTING (BASELINE GPT-2, BEFORE FINE-TUNING)

In [None]:
# This function builds a prompt with k examples from the dataset.
# For example:
#   k=0 → zero-shot (no examples, just the query).
#   k=1 → one-shot (one example + the query).
#   k=3 → few-shot (three examples + the query).
# I concatenate k examples into a single text prompt.
def build_prompt(k, dataset, idx=0):
    """Build a k-shot prompt whose final "query" entry is ``dataset[idx]``.

    The k demonstration entries are the k records that follow ``idx`` in the
    dataset (wrapping around at the end). The query therefore never appears
    among its own demonstrations, and different ``idx`` values produce
    different prompts. ``idx`` is honoured here; it must not be ignored,
    because callers use it to select which sample is being tested.

    Args:
        k: Number of demonstration examples placed before the query
            (0 = zero-shot, 1 = one-shot, ...).
        dataset: Indexable collection of records with a "text" field, i.e.
            ``dataset[i]["text"]`` must work for integer ``i``.
        idx: Index of the record used as the query.

    Returns:
        The prompt string: k entries of the form "News: <text>" each followed
        by a blank line, then "News: <query text>" followed by one newline.

    Raises:
        ValueError: If ``k`` is negative.
        IndexError: If ``idx`` is out of range, or the dataset holds fewer than
            ``k + 1`` records (one query plus k distinct demonstrations).
    """
    n = len(dataset)
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if not 0 <= idx < n:
        raise IndexError(f"idx {idx} is out of range for a dataset of size {n}")
    if k + 1 > n:
        raise IndexError(f"need at least {k + 1} records for a {k}-shot prompt, got {n}")

    # Demonstrations: the k records after the query, wrapping past the end.
    demo_indices = [(idx + 1 + offset) % n for offset in range(k)]
    parts = [f"News: {dataset[i]['text']}\n\n" for i in demo_indices]

    # The query goes last so the model continues from it.
    parts.append(f"News: {dataset[idx]['text']}\n")
    return "".join(parts)

# This function generates text given a prompt.
# Steps:
# 1. Tokenize the prompt into input_ids.
# 2. Use model.generate() to predict new tokens.
# 3. Decode the generated tokens back into text.
def generate_text(model, tokenizer, prompt, max_new_tokens=40):
    """Sample a continuation of ``prompt`` from ``model`` and return only the new text.

    Args:
        model: A causal language model exposing ``generate`` and ``device``.
        tokenizer: The tokenizer paired with ``model``.
        prompt: Text the model should continue.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The decoded continuation (prompt excluded), stripped of surrounding
        whitespace.
    """
    encoded = tokenizer(prompt, return_tensors="pt")

    # Put the inputs wherever the model lives instead of relying on a global.
    input_ids = encoded["input_ids"].to(model.device)
    # Passing the mask explicitly matters because the pad token is the EOS
    # token, so generate() cannot infer the mask from the ids.
    attention_mask = encoded["attention_mask"].to(model.device)

    gen_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        do_sample=True,                 # sample instead of greedy decoding
        top_p=0.9,                      # nucleus sampling over the top 90% probability mass
        temperature=0.7,                # lower = more focused
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The output holds the prompt tokens followed by the new ones. Cut at the
    # token level: slicing the decoded string by len(prompt) is wrong whenever
    # decoding does not reproduce the prompt character for character.
    prompt_len = input_ids.shape[1]
    new_token_ids = gen_ids[0][prompt_len:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

# Try the pretrained GPT-2 (before fine-tuning) with k = 0, 1 and 3 in-context examples.
# For each k: build the prompt, sample a continuation, and print both.
for k in [0, 1, 3]:
    prompt = build_prompt(k, val_ds, idx=5)  # request sample index 5 of the validation set
    gen_output = generate_text(base_model, tokenizer, prompt)

    # Banner identifying the current k.
    print("="*100)
    print(f" FEW-SHOT TEST (k={k})")
    print("="*100)

    # The exact prompt text that was fed to the model.
    print("\nPrompt given to model:\n")
    print(prompt.strip())

    # The sampled continuation (output varies between runs because sampling is used).
    print("\nModel Generated:\n")
    print(gen_output.strip())
    print("\n\n")


INTERACTIVE PROMPTING

In [None]:
# This function is my custom tester for few-shot learning.
# Instead of always relying on the dataset, I can pass my own list of "news" examples.
# The function builds a prompt, feeds it into the model, and prints the continuation.

def test_few_shot(model, tokenizer, prompt_texts, max_new_tokens=50):
    """
    Arguments:
      model: The GPT model (can be base GPT-2 or fine-tuned version).
      tokenizer: The GPT tokenizer to convert text into tokens.
      prompt_texts: A list of custom news strings, like:
                    ["News: ...", "News: ..."].
    """

    # Step 1: Build the full prompt by joining all the news examples together.
    # I separate them with blank lines for readability.
    prompt = "\n\n".join([f"News: {t}" for t in prompt_texts])

    # Step 2: Tokenize the prompt into input IDs and attention mask.
    # This prepares the text in a format that the model can actually process.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"].to(device)
    attn_mask = inputs["attention_mask"].to(device)

    # Step 3: Generate continuation from the model.
    # I use sampling here (not greedy decoding), so results are more diverse.
    gen_ids = model.generate(
        input_ids,
        attention_mask=attn_mask,
        do_sample=True,          # sample from probability distribution
        top_p=0.9,               # nucleus sampling (take top 90% probability mass)
        temperature=0.7,         # controls randomness (lower = more deterministic)
        max_new_tokens=max_new_tokens,  # number of new tokens to generate
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2,  # avoid repeating the same phrase too much
        min_new_tokens=40        # make sure at least 40 tokens are generated
    )

    # Step 4: Decode the tokens back into human-readable text.
    gen_text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

    # Step 5: Print everything nicely formatted.
    print("="*100)
    print("Prompt given:\n")
    print(prompt)
    print("\nModel continuation:\n")
    print(gen_text[len(prompt):].strip())  # remove the prompt, keep only new content
    print("="*60)

# Example usage with hand-written headlines.
# The strings are passed without a "News: " prefix; test_few_shot adds it.
custom_examples = [
    "AI startup raises $50M to build next-gen language model",
    "New vaccine shows promise in clinical trials",
    "Stock markets rally after positive earnings reports",
]

# Run the base GPT-2 (before fine-tuning) on the custom examples.
test_few_shot(base_model, tokenizer, custom_examples)


## FINE-TUNING GPT-2 ON THE AG NEWS DATASET

In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [None]:
# Remove any output directory left over from an earlier run so that training
# starts with an empty checkpoint folder.
!rm -rf ./gpt2-finetuned-agnews

from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, GPT2LMHeadModel

# Load a second, fresh copy of the pretrained GPT-2 weights to fine-tune.
# Fine-tuning starts from the pretrained checkpoint (not from random weights),
# and `base_model` is left untouched so it can serve as the baseline later.
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))   # keep the embedding size in sync with the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)  # move to the selected device

# The data collator assembles batches during training.
# mlm=False selects the causal LM objective (predict the next token) instead of
# BERT-style masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training hyperparameters.
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-agnews",   # checkpoints are written to this folder
    overwrite_output_dir=True,              # overwrite if the folder already exists
    num_train_epochs=20,                    # train for 20 epochs
    per_device_train_batch_size=64,         # batch size per device
    gradient_accumulation_steps=4,          # accumulate gradients -> effective batch size per device = 64*4 = 256
    save_strategy="epoch",                  # save a checkpoint at the end of every epoch
    logging_steps=100,                      # log the loss every 100 steps
    learning_rate=3e-5,                     # optimizer learning rate
    weight_decay=0.01,                      # small weight decay as regularization
    fp16=torch.cuda.is_available(),         # enable fp16 whenever a CUDA GPU is available
    report_to="none",                       # no external experiment tracking
    save_total_limit=2                      # keep only the 2 most recent checkpoints to save space
)

# Build the Hugging Face Trainer from the model, the block datasets, the
# collator and the arguments above. lm_val is supplied as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_train,
    eval_dataset=lm_val,
    data_collator=data_collator,
)

# Launch fine-tuning. This is the long-running step of the notebook.
trainer.train()


Aiming for a better train_loss.
Removing the checkpoint-570 and checkpoint-665 directories.
Training for 7 epochs with the parameters given above gave an average train_loss of 3.26.


After 20 epochs the train_loss is 3.23, which is only slightly better than after 7 epochs.

In [None]:
# Delete the training output directory, including every checkpoint folder the
# Trainer wrote there. The in-memory `trainer`/`model` objects are unaffected,
# but nothing can be loaded from ./gpt2-finetuned-agnews/ after this cell runs.
!rm -rf ./gpt2-finetuned-agnews


## PERPLEXITY EVALUATION (HOW SURPRISED THE MODEL IS BY NEW TEXT)

In [None]:
import math

# Evaluate the fine-tuned model on the Trainer's eval dataset (lm_val) and
# convert the reported loss into perplexity: perplexity = exp(loss).
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)  # lower perplexity = the model is less "surprised" by the text
print(f"Validation loss: {eval_loss:.4f} → Perplexity: {perplexity:.2f}")


In [None]:
# Persist the fine-tuned weights held in memory by the trainer, then reload
# them as `ft_model` for the comparisons below.
# Saving from the trainer avoids loading a hard-coded "checkpoint-<step>"
# folder: the step number changes with the number of training steps, and the
# checkpoint directory may already have been deleted by the cleanup cell.
FINAL_MODEL_DIR = "./gpt2-finetuned-agnews-final"
trainer.save_model(FINAL_MODEL_DIR)
ft_model = GPT2LMHeadModel.from_pretrained(FINAL_MODEL_DIR).to(device)


In [None]:
# Side-by-side comparison of the pretrained and the fine-tuned model on the
# same prompt, for k = 0, 1 and 3 in-context examples.
test_idx = 10  # sample index passed to build_prompt as `idx`
for k in [0, 1, 3]:
    prompt = build_prompt(k, val_ds, idx=test_idx)
    base_out = generate_text(base_model, tokenizer, prompt)
    ft_out = generate_text(ft_model, tokenizer, prompt)

    # Only the first 300 characters of the prompt are printed to keep the output short.
    print("="*100)
    print(f"Prompt (K={k}):\n{prompt[:300]}...\n")
    print("Pretrained GPT-2 →", base_out, "\n")
    print("Fine-tuned GPT-2 →", ft_out)
    print("="*100, "\n")


## BLEU SCORE EVALUATION

In [None]:
import evaluate

bleu = evaluate.load("bleu")

def evaluate_bleu(model, tokenizer, dataset, k=0, num_samples=100):
    """Score sampled continuations against dataset texts with BLEU.

    For every index in ``range(num_samples)`` a k-shot prompt is built with
    ``build_prompt(k, dataset, idx=index)``, a continuation is sampled with
    ``generate_text``, and ``dataset[index]["text"]`` is used as its single
    reference.

    Args:
        model: Model handed to ``generate_text``.
        tokenizer: Tokenizer handed to ``generate_text``.
        dataset: Indexable dataset whose records have a "text" field.
        k: Number of in-context examples per prompt.
        num_samples: How many leading dataset indices to evaluate.

    Returns:
        The result of ``bleu.compute`` for the collected predictions/references.

    NOTE(review): the reference is the same text that the prompt is built
    around, so this compares a continuation with the text it continues -
    confirm this is the intended metric.
    """
    sample_indices = range(num_samples)

    # Generation runs in index order, one sample at a time.
    predictions = [
        generate_text(model, tokenizer, build_prompt(k, dataset, idx=sample_idx))
        for sample_idx in sample_indices
    ]
    # One reference per prediction, wrapped in a list as bleu.compute expects.
    references = [[dataset[sample_idx]["text"]] for sample_idx in sample_indices]

    return bleu.compute(predictions=predictions, references=references)

# Compare BLEU for the pretrained and the fine-tuned model on the first 100
# validation samples, zero-shot (k=0). Continuations are sampled, so the exact
# scores can differ between runs.
print("\nBaseline GPT-2 BLEU:", evaluate_bleu(base_model, tokenizer, val_ds, k=0, num_samples=100))
print("Fine-tuned GPT-2 BLEU:", evaluate_bleu(ft_model, tokenizer, val_ds, k=0, num_samples=100))
