In [None]:
!pip install -q transformers datasets

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset

# Seed torch's global RNG once, up front, so that the head's random
# initialisation and the DataLoader's shuffling order repeat on every
# Restart & Run All. (GPU kernels may still add tiny numeric differences.)
SEED = 42
torch.manual_seed(SEED)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

Device: cuda


In [None]:
# Load the IMDB sentiment dataset through the `datasets` library.
dataset = load_dataset('imdb')

# Report the size of each split ...
train_split = dataset['train']
test_split = dataset['test']
print(f"Train: {len(train_split):,}")
print(f"Test: {len(test_split):,}")

# ... then peek at the first training record (first 200 characters only).
first_review = train_split[0]
print(f"\nExample: {first_review['text'][:200]}...")
print(f"Label: {first_review['label']}")

Train: 25,000
Test: 25,000

Example: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev...
Label: 0


In [None]:
# tokenize with the tokenizer that matches the BERT checkpoint used below
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def tokenize(examples):
    """Tokenize the 'text' field of `examples`.

    Every text is truncated and padded to exactly 128 tokens; the
    tokenizer's output is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded


# Apply the tokenizer to every split in batched mode.
tokenized = dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, formatted as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'label']
tokenized.set_format('torch', columns=tensor_columns)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
# custom classification head - sits on top of frozen BERT
class ClassificationHead(nn.Module):
    """Two-layer MLP that turns an embedding vector into a single logit.

    Architecture: Linear(input_dim -> hidden_dim) -> ReLU -> Linear(hidden_dim -> 1).
    The output is a raw logit (no sigmoid is applied here).

    Args:
        input_dim: Size of the last dimension of the incoming embeddings.
        hidden_dim: Width of the hidden layer. Defaults to 64, the value
            that was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        # Attribute names are kept as fc1 / relu / fc2 so state_dict keys
        # stay compatible with checkpoints saved from the earlier version.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map `x` of shape (..., input_dim) to logits of shape (..., 1)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Pretrained BERT encoder; its hidden size determines the head's input width.
bert = AutoModel.from_pretrained('bert-base-uncased')
embedding_dim = bert.config.hidden_size

head = ClassificationHead(embedding_dim)

# Show how many parameters the backbone and the head each hold.
bert_param_count = sum(weight.numel() for weight in bert.parameters())
head_param_count = sum(weight.numel() for weight in head.parameters())
print(f"BERT params: {bert_param_count:,}")
print(f"Head params: {head_param_count:,}")

BERT params: 109,482,240
Head params: 49,281


In [None]:
# Freeze the backbone: no BERT parameter tracks gradients, so only the head trains.
bert.requires_grad_(False)

# Put both modules on the device the batches will be moved to.
for module in (bert, head):
    module.to(device)

# demo subset - production would use full 25k samples and more epochs
shuffled_train = tokenized['train'].shuffle(seed=42)
train_data = shuffled_train.select(range(1000))

def collate(batch):
    """Combine a list of per-example dicts into one dict of batched tensors.

    Each example provides 'input_ids', 'attention_mask' and 'label'.
    The first two are stacked along a new leading dimension; the labels
    are gathered into a float tensor stored under the key 'labels'.
    """
    ids, masks, targets = [], [], []
    for example in batch:
        ids.append(example['input_ids'])
        masks.append(example['attention_mask'])
        targets.append(example['label'])

    collated = {}
    collated['input_ids'] = torch.stack(ids)
    collated['attention_mask'] = torch.stack(masks)
    # Labels are emitted as floats, matching the BCE-with-logits loss used in training.
    collated['labels'] = torch.tensor(targets, dtype=torch.float)
    return collated

# Mini-batches of 16 training examples, shuffled and assembled by `collate`.
train_loader = DataLoader(
    train_data,
    collate_fn=collate,
    shuffle=True,
    batch_size=16,
)

In [None]:
# training setup
criterion = nn.BCEWithLogitsLoss()  # default reduction: mean over the batch
optimizer = optim.AdamW(head.parameters(), lr=2e-4)  # higher lr since BERT is frozen

epochs = 10  # more passes needed for head to learn

# BERT is a fixed feature extractor here. Put it in eval mode explicitly so
# its dropout stays off regardless of what earlier cells did to the model;
# only the head is in training mode.
bert.eval()
head.train()

# NOTE: because BERT is frozen and in eval mode, its pooled output for a
# given review is the same every epoch; caching those embeddings once would
# avoid re-running the backbone `epochs` times on larger runs.
for epoch in range(epochs):
    # Accumulate the *summed* loss and the sample count so the epoch figure
    # is a true per-sample mean. Averaging batch means instead would
    # over-weight a short final batch.
    total_loss = 0.0
    num_samples = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # (batch,) -> (batch, 1) so labels line up with the head's logits
        labels = batch['labels'].unsqueeze(1).to(device)

        # get BERT embeddings without updating BERT
        with torch.no_grad():
            outputs = bert(input_ids=input_ids, attention_mask=attention_mask)
            pooled = outputs.pooler_output

        # train only the head
        logits = head(pooled)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is the batch mean, so scale it back to a batch sum
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        num_samples += batch_size

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/max(num_samples, 1):.4f}")

Epoch 1/10, Loss: 0.6869
Epoch 2/10, Loss: 0.6658
Epoch 3/10, Loss: 0.6630
Epoch 4/10, Loss: 0.6453
Epoch 5/10, Loss: 0.6360
Epoch 6/10, Loss: 0.6293
Epoch 7/10, Loss: 0.6197
Epoch 8/10, Loss: 0.6207
Epoch 9/10, Loss: 0.6022
Epoch 10/10, Loss: 0.5994


In [None]:
# test predictions
def predict(text):
    """Classify one piece of text with the BERT backbone and the trained head.

    Returns a `(label, prob)` tuple, where `prob` is the sigmoid of the
    head's logit and `label` is 'positive' when `prob` is above 0.5 and
    'negative' otherwise.
    """
    # Same 128-token truncation as the training data.
    model_inputs = tokenizer(
        text,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    ).to(device)

    with torch.no_grad():
        sentence_vec = bert(**model_inputs).pooler_output
        prob = torch.sigmoid(head(sentence_vec)).item()

    if prob > 0.5:
        return 'positive', prob
    return 'negative', prob

# Three hand-written reviews: clearly positive, clearly negative, and lukewarm.
test_texts = [
    "This movie was amazing, loved every minute!",
    "Terrible film, waste of time.",
    "It was okay, nothing special."
]

PREVIEW_CHARS = 40  # longest prefix of a review echoed next to its prediction

print("Predictions:")
print("-"*50)
for text in test_texts:
    label, prob = predict(text)
    # Append an ellipsis only when the text really was cut off; previously
    # short reviews were also shown with a trailing "..." as if truncated.
    preview = text if len(text) <= PREVIEW_CHARS else f"{text[:PREVIEW_CHARS]}..."
    print(preview)
    print(f"  -> {label} ({prob:.2%})\n")

Predictions:
--------------------------------------------------
This movie was amazing, loved every minu...
  -> negative (39.32%)

Terrible film, waste of time....
  -> negative (35.59%)

It was okay, nothing special....
  -> negative (32.56%)



In [None]:
# Measure accuracy on a fixed 500-review sample of the test split.
shuffled_test = tokenized['test'].shuffle(seed=42)
test_data = shuffled_test.select(range(500))
test_loader = DataLoader(test_data, collate_fn=collate, batch_size=32)

head.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch in test_loader:
        labels = batch['labels'].to(device)
        encoder_out = bert(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

        # Probability above 0.5 counts as a prediction of class 1.
        probs = torch.sigmoid(head(encoder_out.pooler_output))
        preds = probs.gt(0.5).squeeze().float()

        total += labels.size(0)
        correct += preds.eq(labels).sum().item()

print(f"Test Accuracy: {correct/total:.2%}")

Test Accuracy: 62.60%


In [None]:
# Closing summary, assembled as a list of lines and printed in one call.
# Plain strings are used (no f-prefix) because nothing is interpolated, and
# the missing apostrophes in "can't" / "there's" are corrected.
summary_lines = [
    "=" * 60,
    "SUMMARY",
    "=" * 60,
    "\nApproach: Frozen BERT backbone + trainable classification head",
    "\nWhy this works:",
    "  - BERT already encodes semantic understanding from pretraining",
    "  - We only need to train a small head for our specific task",
    "  - Much faster than fine-tuning all 110M BERT parameters",
    "\nWhy transformers here but not for life expectancy:",
    "  - Text requires learned representations - can't use raw words",
    "  - Life expectancy data is already numerical - simpler models work",
    "  - Using 110M params for 7 features would be massive overkill",
    "\nTradeoff: Transformers powerful but uninterpretable",
    "  - Can't explain why a review is classified positive/negative",
    "  - For text there's no real alternative, for tabular there is",
    "=" * 60,
]
print("\n".join(summary_lines))

SUMMARY

Approach: Frozen BERT backbone + trainable classification head

Why this works:
  - BERT already encodes semantic understanding from pretraining
  - We only need to train a small head for our specific task
  - Much faster than fine-tuning all 110M BERT parameters

Why transformers here but not for life expectancy:
  - Text requires learned representations - cant use raw words
  - Life expectancy data is already numerical - simpler models work
  - Using 110M params for 7 features would be massive overkill

Tradeoff: Transformers powerful but uninterpretable
  - Cant explain why a review is classified positive/negative
  - For text theres no real alternative, for tabular there is


A frozen backbone made more sense here than fine-tuning everything, since we only have 1,000 training samples and BERT already understands language structure. The key point for my assignment is that transformers are necessary for text because there's no sensible way to feed raw words into a Random Forest, but that same reasoning doesn't apply to tabular data, where the features are already numeric. Using 110 million parameters on seven columns would be solving a problem that doesn't exist.