The most common ways to finetune language models are instruction finetuning and classification finetuning. Instruction finetuning involves training a language model on a set of tasks using specific instructions to improve its ability to understand and execute tasks described in natural language prompts.
In classification finetuning, the model is trained to recognize a specific set of class labels such as "spam" or "not spam". examples go beyond email filtering; they include identifying between different species of plants or animals; categorizing news articles into sports, entertainment, politics etc.
Usually an instruction finetuned model can undergo a broader range of tasks.

Instruction fine-tuning improves a modelâ€™s ability to understand and generate responses
based on specific user instructions. Instruction fine-tuning is best suited for models
that need to handle a variety of tasks based on complex user instructions, improving
flexibility and interaction quality. Classification fine-tuning is ideal for projects requiring precise categorization of data into predefined classes, such as sentiment analysis or spam detection.
While instruction fine-tuning is more versatile, it demands larger datasets and greater
computational resources to develop models proficient in various tasks. In contrast,
classification fine-tuning requires less data and compute power, but its use is confined to the specific classes on which the model has been trained.

### Preparing the dataset

In [1]:
import urllib.request
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extract_dir = "sms_spam_data"
data_file_path = Path(extract_dir) / "SMSSpamCollection.tsv"

# Download the dataset
def download_and_unzip_spam_data(url, zip_path, extract_dir, data_file_path):
    if data_file_path.exists():
        print(f"Data file already exists at {data_file_path}. Skipping download.")
        return
    with urllib.request.urlopen(url) as response:
        with open(zip_path, 'wb') as out_file:
            out_file.write(response.read())
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    original_file_path = Path(extract_dir) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"Data downloaded and extracted to {data_file_path}.")

download_and_unzip_spam_data(url, zip_path, extract_dir, data_file_path)
    

Data file already exists at sms_spam_data/SMSSpamCollection.tsv. Skipping download.


In [2]:
import pandas as pd
# Load the dataset into a DataFrame
df = pd.read_csv(data_file_path, sep='\t', header=None, names=['Label', 'Text'])
print(df.head())

  Label                                               Text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
print(df['Label'].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


we need to balance the dataset so we can have 747 instances for both classes

In [4]:
def create_balanced_dataset(df):
    num_spam = df[df['Label'] == 'spam'].shape[0]
    ham_df = df[df['Label'] == 'ham'].sample(n=num_spam, random_state=123)
    balanced_df = pd.concat([ham_df, df[df['Label'] == 'spam']])
    return balanced_df
balanced_df = create_balanced_dataset(df)
print(balanced_df['Label'].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


next we convert the string class labels into integer class labels

In [5]:
balanced_df['Label'] = balanced_df['Label'].map({'ham': 0, 'spam': 1})

next we split out data into training, test and validation sets

In [6]:
def random_split(df, train_frac, validation_frac):
    
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)  # Shuffle the DataFrame
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)
    
    train_df = df.iloc[:train_end]
    validation_df = df.iloc[train_end:validation_end]
    test_df = df.iloc[validation_end:]
    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

train_df.to_csv("train_data.csv", index=None)
validation_df.to_csv("validation_data.csv", index=None)
test_df.to_csv("test_data.csv", index=None)

creating dataloaders by padding all messages to the length of the longest message in the dataset or batch. to do this we add padding tokens (<|endoftext|>) to all shorter messages. instead of just adding the string "<|endoftext|>" directly, we will add the token id corresponding to "<|endoftext|>" to the encoded text messages.

In [8]:
# checking the encoding for <|endoftext|>

import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})) 

# the answer is [50256] which is the special token for end of text

[50256]


In [17]:
import torch
from torch.utils.data import Dataset


class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]
        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length

            self.encoded_texts = [
                encoded_text[: self.max_length] for encoded_text in self.encoded_texts
            ]
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length


In [18]:
train_dataset = SpamDataset(
    csv_file="train_data.csv",
    tokenizer=tokenizer,
    max_length=None
)

In [19]:
print(train_dataset.max_length)

120


the longest sequence contain no more than 120 tokens which is common for text messages

In [20]:
val_dataset = SpamDataset(
    csv_file="validation_data.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length
)
test_dataset = SpamDataset(
    csv_file="test_data.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length
)

In [21]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8
torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [23]:
# to ensure our dataloaders are working correctly, we iterate over the training loader and then break after the first batch
for input_batch, target_batch in train_loader:
    pass
print("Input batch shape:", input_batch.shape)
print("Target batch shape:", target_batch.shape)

Input batch shape: torch.Size([8, 120])
Target batch shape: torch.Size([8])


In [24]:
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")


130 training batches
18 validation batches
38 test batches


In [34]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True,
}
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [38]:
from safetensors.torch import load_file
import sys
import os

ch04_path = os.path.abspath("LLMs-from-scratch/ch04/01_main-chapter-code")

if ch04_path not in sys.path:
    sys.path.append(ch04_path)
from gpt import GPTModel

model = GPTModel(BASE_CONFIG)

model.load_state_dict(load_file("./gpt2/124M/gpt2-small-124M.safetensors"))

<All keys matched successfully>

In [44]:
from safetensors.torch import load_file
import numpy as np, torch, re
from pathlib import Path
from llms_from_scratch.ch05 import load_weights_into_gpt

model_path = "./gpt2/124M/gpt2-small-124M.safetensors"
params_flat = load_file(model_path)
keys = list(params_flat.keys())
print(f"Loaded {len(keys)} keys; showing first 200:")
for i,k in enumerate(keys[:200]): print(f"{i:03d}: {k}")

def to_numpy(val):
    return val.cpu().numpy() if isinstance(val, torch.Tensor) else np.array(val)

def find_key(params, candidates):
    # return first key whose name contains any candidate substring
    for k in params:
        for c in candidates:
            if c in k:
                return k, to_numpy(params[k])
    return None, None

# Top-level mappings (common candidates)
wte_k, wte = find_key(params_flat, ["wte", "word_embeddings", "lm_head.weight", "transformer.wte", "embeddings.word_embeddings"])
wpe_k, wpe = find_key(params_flat, ["wpe", "position_embeddings", "pos_emb", "transformer.wpe"])
g_k, g = find_key(params_flat, ["ln_f.weight", "transformer.ln_f", "final_layernorm.weight", "ln_f"])
b_k, b = find_key(params_flat, ["ln_f.bias", "transformer.ln_f.bias", "final_layernorm.bias", "ln_f.b"])

print("Top-level matches:")
print(" wte ->", wte_k)
print(" wpe ->", wpe_k)
print(" ln_f.weight ->", g_k)
print(" ln_f.bias ->", b_k)

# assemble blocks
n_layers = len(model.trf_blocks)
blocks = [None]*n_layers

# helper to find per-layer keys (tries multiple naming schemes)
def find_layer_param(params, layer, tail_variants):
    candidates = []
    prefixes = [f"transformer.h.{layer}.", f"model.h.{layer}.", f"h.{layer}.", f"blocks.{layer}.", f"{layer}."]
    for p in prefixes:
        for t in tail_variants:
            candidates.append(p + t)
            candidates.append(p + t + ".weight")
            candidates.append(p + t + ".bias")
            candidates.append(p + t + ".w")
            candidates.append(p + t + ".b")
    # also try substring match
    for c in candidates:
        for k in params:
            if k.endswith(c) or (c in k and not k.endswith(".meta")):
                return k, to_numpy(params[k])
    # fallback: substring search
    for t in tail_variants:
        for k in params:
            if f".{layer}." in k and t in k:
                return k, to_numpy(params[k])
    return None, None

for i in range(n_layers):
    blk = {"attn":{}, "mlp":{}, "ln_1":{}, "ln_2":{}}
    # combined qkv (sometimes named c_attn or attn.c_attn or c_attn.weight)
    kq_w, q_w = find_layer_param(params_flat, i, ["attn.c_attn", "c_attn", "attn.c_attn.weight", "attn/c_attn"])
    kb_w, qb = find_layer_param(params_flat, i, ["attn.c_attn.bias", "c_attn.bias", "c_attn.b"])
    if q_w is not None:
        blk["attn"]["c_attn"] = {"w": q_w, "b": qb}
    # c_proj / out_proj
    kproj_w, proj_w = find_layer_param(params_flat, i, ["attn.c_proj", "c_proj", "attn.c_proj.weight"])
    kproj_b, proj_b = find_layer_param(params_flat, i, ["attn.c_proj.bias", "c_proj.bias"])
    if proj_w is not None:
        blk["attn"]["c_proj"] = {"w": proj_w, "b": proj_b}
    # mlp
    kfc_w, fc_w = find_layer_param(params_flat, i, ["mlp.c_fc", "mlp.c_fc.weight", "c_fc"])
    kfc_b, fc_b = find_layer_param(params_flat, i, ["mlp.c_fc.bias", "c_fc.bias"])
    kproj2_w, proj2_w = find_layer_param(params_flat, i, ["mlp.c_proj", "mlp.c_proj.weight", "c_proj"])
    kproj2_b, proj2_b = find_layer_param(params_flat, i, ["mlp.c_proj.bias", "c_proj.bias"])
    if fc_w is not None: blk["mlp"]["c_fc"] = {"w": fc_w, "b": fc_b}
    if proj2_w is not None: blk["mlp"]["c_proj"] = {"w": proj2_w, "b": proj2_b}
    # layer norms
    ln1_g_k, ln1_g = find_layer_param(params_flat, i, ["ln_1.weight", "ln_1.gamma", "ln_1.g"])
    ln1_b_k, ln1_b = find_layer_param(params_flat, i, ["ln_1.bias", "ln_1.beta", "ln_1.b"])
    ln2_g_k, ln2_g = find_layer_param(params_flat, i, ["ln_2.weight", "ln_2.gamma", "ln_2.g"])
    ln2_b_k, ln2_b = find_layer_param(params_flat, i, ["ln_2.bias", "ln_2.beta", "ln_2.b"])
    if ln1_g is not None: blk["ln_1"]["g"] = ln1_g
    if ln1_b is not None: blk["ln_1"]["b"] = ln1_b
    if ln2_g is not None: blk["ln_2"]["g"] = ln2_g
    if ln2_b is not None: blk["ln_2"]["b"] = ln2_b
    blocks[i] = blk

params_struct = {"wte": wte, "wpe": wpe, "blocks": blocks, "g": g, "b": b}

# diagnostics: which top-level entries are still None
miss = [k for k,v in params_struct.items() if v is None]
if miss:
    print("Missing top-level mappings after heuristic:", miss)
    print("Scan the printed keys above for likely candidates and adjust search patterns.")
else:
    print("Top-level mappings found. Attempting to load into model...")
    load_weights_into_gpt(model, params_struct)
    model.eval()
    print("Weights loaded.")

Loaded 209 keys; showing first 200:
000: final_norm.scale
001: final_norm.shift
002: out_head.weight
003: pos_emb.weight
004: tok_emb.weight
005: trf_blocks.0.att.W_key.bias
006: trf_blocks.0.att.W_key.weight
007: trf_blocks.0.att.W_query.bias
008: trf_blocks.0.att.W_query.weight
009: trf_blocks.0.att.W_value.bias
010: trf_blocks.0.att.W_value.weight
011: trf_blocks.0.att.mask
012: trf_blocks.0.att.out_proj.bias
013: trf_blocks.0.att.out_proj.weight
014: trf_blocks.0.ff.layers.0.bias
015: trf_blocks.0.ff.layers.0.weight
016: trf_blocks.0.ff.layers.2.bias
017: trf_blocks.0.ff.layers.2.weight
018: trf_blocks.0.norm1.scale
019: trf_blocks.0.norm1.shift
020: trf_blocks.0.norm2.scale
021: trf_blocks.0.norm2.shift
022: trf_blocks.1.att.W_key.bias
023: trf_blocks.1.att.W_key.weight
024: trf_blocks.1.att.W_query.bias
025: trf_blocks.1.att.W_query.weight
026: trf_blocks.1.att.W_value.bias
027: trf_blocks.1.att.W_value.weight
028: trf_blocks.1.att.mask
029: trf_blocks.1.att.out_proj.bias
030: tr

In [45]:
pkg_path = os.path.abspath("LLMs-from-scratch/pkg")

# Add to sys.path if not already present
if pkg_path not in sys.path:
    sys.path.append(pkg_path)
from llms_from_scratch.ch05 import load_weights_into_gpt

In [47]:
from llms_from_scratch.ch04 import generate_text_simple
from llms_from_scratch.ch05 import text_to_token_ids, token_ids_to_text

text_1 = "Every effort moves you"
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_1, tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"],
)
print(token_ids_to_text(token_ids, tokenizer))


Every effort moves you!!!!!!!!!!!!!!!


In [48]:
text_2 = (
    "Is the following text 'spam'? Answer with 'yes' or 'no': "
    "You are a winner you have been specially"
    " selected to receive a $1000 Walmart gift card. Click here to claim now."
)
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_2, tokenizer),
    max_new_tokens=23,
    context_size=BASE_CONFIG["context_length"],
)
print(token_ids_to_text(token_ids, tokenizer))

Is the following text 'spam'? Answer with 'yes' or 'no': You are a winner you have been specially selected to receive a $1000 Walmart gift card. Click here to claim now.!!!!!!!!!!!!!!!!!!!!!!!


the model is currently struggling to follow instructions, so now we need to do more work to prepare the model for classification finetuning by adding a classification head

In [49]:
print(model)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

to get the model ready for finetuning, we first freeze the model meaning we make all layers untrainable

In [50]:
for param in model.parameters():
    param.requires_grad = False

In [51]:
torch.manual_seed(123)
num_classes = 2
model.out_head = torch.nn.Linear(
    in_features=BASE_CONFIG["emb_dim"], out_features=num_classes
)

In [52]:
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True
for param in model.final_norm.parameters():
    param.requires_grad = True

Calculating the classification loss and accuracy

In [53]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0
    
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)
        
            with torch.no_grad():
                logits = model(input_batch)[:, -1, :]
            predicted_labels = torch.argmax(logits, dim=-1)
        
            num_examples += predicted_labels.shape[0]
            correct_predictions += ((predicted_labels == target_batch).sum().item())
        else:
            break
    return correct_predictions / num_examples
        
        


In [55]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

torch.manual_seed(123)
train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=10)
val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=10)
test_accuracy = calc_accuracy_loader(test_loader, model, device, num_batches=10)
print(f"Train accuracy (first 10 batches): {train_accuracy*100:.2f}%")
print(f"Validation accuracy (first 10 batches): {val_accuracy*100:.2f}%")
print(f"Test accuracy (first 10 batches): {test_accuracy*100:.2f}%")

Train accuracy (first 10 batches): 53.75%
Validation accuracy (first 10 batches): 55.00%
Test accuracy (first 10 batches): 51.25%
