In [1]:
# 1. Install PyTorch EXACTLY matching Colab’s CUDA 12.6 build
!pip install -q --force-reinstall \
    torch==2.9.0+cu126 torchvision==0.24.0+cu126 torchaudio==2.9.0+cu126 \
    -f https://download.pytorch.org/whl/torch_stable.html

# 2. Install HF tools WITHOUT breaking pyarrow
#    - datasets >=3 pulls pyarrow>=17 → breaks Colab
#    - we must pin pyarrow<17 and datasets<3
!pip install -q "datasets<3.0" "pyarrow<17" transformers accelerate scikit-learn


[31mERROR: Could not find a version that satisfies the requirement torch==2.9.0+cu126 (from versions: 2.2.0, 2.2.0+cpu, 2.2.0+cpu.cxx11.abi, 2.2.0+cu118, 2.2.0+cu121, 2.2.0+rocm5.6, 2.2.0+rocm5.7, 2.2.1, 2.2.1+cpu, 2.2.1+cpu.cxx11.abi, 2.2.1+cu118, 2.2.1+cu121, 2.2.1+rocm5.6, 2.2.1+rocm5.7, 2.2.2, 2.2.2+cpu, 2.2.2+cpu.cxx11.abi, 2.2.2+cu118, 2.2.2+cu121, 2.2.2+rocm5.6, 2.2.2+rocm5.7, 2.3.0, 2.3.0+cpu, 2.3.0+cpu.cxx11.abi, 2.3.0+cu118, 2.3.0+cu121, 2.3.0+rocm5.7, 2.3.0+rocm6.0, 2.3.1, 2.3.1+cpu, 2.3.1+cpu.cxx11.abi, 2.3.1+cu118, 2.3.1+cu121, 2.3.1+rocm5.7, 2.3.1+rocm6.0, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.7.1, 2.8.0, 2.9.0, 2.9.1)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.9.0+cu126[0m[31m
[0m

In [2]:
import torch, transformers, datasets, sklearn, numpy as np
import accelerate
from accelerate import Accelerator
from accelerate.utils import set_seed

# Seed the random number generators once, up front, so the stochastic steps
# later in the notebook (classifier-head initialisation, batch shuffling)
# are repeatable after a kernel restart.
SEED = 42
set_seed(SEED)

# Record the library versions this run was executed with.
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Transformers:", transformers.__version__)
print("Datasets:", datasets.__version__)
print("Accelerate:", accelerate.__version__)
print("Scikit-learn:", sklearn.__version__)
print("NumPy:", np.__version__)

# Request fp16 mixed precision only when a CUDA device is present; on a
# CPU-only runtime fall back to full precision ("no") so this cell still runs.
mixed_precision = "fp16" if torch.cuda.is_available() else "no"
accelerator = Accelerator(mixed_precision=mixed_precision)
print("Using precision:", accelerator.mixed_precision)

# Device chosen by Accelerate; used later to place the model.
device = accelerator.device
print("Device:", device)


Torch: 2.9.1+cu128
CUDA available: True
Transformers: 4.57.3
Datasets: 2.21.0
Accelerate: 1.12.0
Scikit-learn: 1.7.2
NumPy: 2.0.2
Using precision: fp16
Device: cuda


In [5]:
# ============================================================
# 1. DATASET BLOCK (Reuters-21578)
#    Swap ONLY THIS BLOCK later for RCV1 loader.
# ============================================================

from datasets import load_dataset, DatasetDict

# ------------------------------------------------------------
# Load the Reuters-21578 dataset (ModApte split)
# ------------------------------------------------------------
# NOTE(review): trust_remote_code=True lets the dataset's own loading
# script run locally — confirm the source is trusted before reusing this.
raw_ds = load_dataset(
    "reuters21578",
    "ModApte",
    trust_remote_code=True,
)
print(raw_ds)

# The printed DatasetDict has three splits: train, test and unused.
# Each split carries:
#   - text
#   - topics (list[str])
#   - further metadata columns (dropped later in this cell)

# ------------------------------------------------------------
# Convert multi-label → single-label
# Strategy: take FIRST topic (same approach you’ll use for RCV1)
# ------------------------------------------------------------
def add_single_label(batch):
    """Add a ``label_text`` field holding the example's first topic.

    The argument is a single example (a dict with a ``topics`` list); it is
    modified in place and returned. Examples whose ``topics`` list is empty
    receive the sentinel ``"__NO_LABEL__"`` instead.
    """
    topic_list = batch["topics"]
    has_topic = len(topic_list) > 0
    batch["label_text"] = topic_list[0] if has_topic else "__NO_LABEL__"
    return batch

# Tag every example in every split with its single label, then discard the
# examples that carried no topic at all (those tagged "__NO_LABEL__").
ds = raw_ds.map(add_single_label).filter(
    lambda example: example["label_text"] != "__NO_LABEL__",
    batched=False,
)

# ------------------------------------------------------------
# Build FULL label vocabulary from ALL splits
# (train + test + unused)
# This prevents KeyErrors like: KeyError: 'yen'
# ------------------------------------------------------------
# Union of the label_text values found in every split of `ds`.
all_labels = {
    label_text
    for split_name in ds.keys()   # 'train', 'test', 'unused'
    for label_text in ds[split_name]["label_text"]
}

# Alphabetical order gives each label a stable integer id.
label_names = sorted(all_labels)
label_to_id = dict(zip(label_names, range(len(label_names))))
id_to_label = dict(enumerate(label_names))

num_labels = len(label_names)
print("Total Reuters-21578 labels:", num_labels)
print("First few labels:", label_names[:10])

# ------------------------------------------------------------
# Encode text labels → numeric label ids
# ------------------------------------------------------------
def encode_label(batch):
    """Add the integer ``label`` for one example.

    Looks the example's ``label_text`` up in the notebook-level
    ``label_to_id`` mapping, stores the result under ``label`` and returns
    the same (mutated) example. A ``label_text`` missing from the mapping
    raises ``KeyError``.
    """
    label_id = label_to_id[batch["label_text"]]
    batch["label"] = label_id
    return batch

# Add the numeric `label` column, then drop every column that is neither
# `text` nor `label`.
ds = ds.map(encode_label).remove_columns(
    [
        "topics", "label_text",
        "text_type", "lewis_split", "cgis_split",
        "old_id", "new_id",
        "places", "people", "orgs", "exchanges",
        "date", "title",
    ]
)

# Keep only: text, label

# ------------------------------------------------------------
# Train / Validation split
# ------------------------------------------------------------
# Hold out 10% of the ModApte training split for validation (fixed seed).
train_valid = ds["train"].train_test_split(seed=42, test_size=0.1)
train_ds, valid_ds = train_valid["train"], train_valid["test"]

# The ModApte test split is used whole.
test_ds = ds["test"]

dataset = DatasetDict(train=train_ds, validation=valid_ds, test=test_ds)

print(dataset)


DatasetDict({
    test: Dataset({
        features: ['text', 'text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id', 'new_id', 'places', 'people', 'orgs', 'exchanges', 'date', 'title'],
        num_rows: 3299
    })
    train: Dataset({
        features: ['text', 'text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id', 'new_id', 'places', 'people', 'orgs', 'exchanges', 'date', 'title'],
        num_rows: 9603
    })
    unused: Dataset({
        features: ['text', 'text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id', 'new_id', 'places', 'people', 'orgs', 'exchanges', 'date', 'title'],
        num_rows: 722
    })
})


Map:   0%|          | 0/3299 [00:00<?, ? examples/s]

Map:   0%|          | 0/9603 [00:00<?, ? examples/s]

Map:   0%|          | 0/722 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3299 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9603 [00:00<?, ? examples/s]

Filter:   0%|          | 0/722 [00:00<?, ? examples/s]

Total Reuters-21578 labels: 82
First few labels: ['acq', 'alum', 'austdlr', 'barley', 'bop', 'carcass', 'cocoa', 'coconut', 'coffee', 'copper']


Map:   0%|          | 0/3019 [00:00<?, ? examples/s]

Map:   0%|          | 0/7775 [00:00<?, ? examples/s]

Map:   0%|          | 0/565 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6997
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 778
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3019
    })
})


In [6]:
# ============================================================
# 2. Tokenization (same for Reuters now & RCV1 later)
# ============================================================

from transformers import DistilBertTokenizerFast

# Sequence length every example is truncated/padded to.
# 512 is possible if memory allows.
max_length = 256

# The base DistilBERT tokenizer is used here; loading it from
# "/content/bert_20ng" is an alternative if it was saved there.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_batch(batch):
    """Tokenize the ``text`` field of a batch of examples.

    Uses the notebook-level ``tokenizer`` and ``max_length``: sequences are
    truncated and padded to exactly ``max_length`` tokens. Returns whatever
    the tokenizer call returns.
    """
    tokenizer_kwargs = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    return tokenizer(batch["text"], **tokenizer_kwargs)

# Tokenize every split in batches, then rename "label" → "labels", the
# column name HF models expect.
tokenized = dataset.map(tokenize_batch, batched=True).rename_column("label", "labels")

# Only these columns are returned (as torch tensors) when indexing; the raw
# `text` column stays in the dataset for debugging but is not used in training.
model_columns = ["input_ids", "attention_mask", "labels"]
tokenized.set_format(type="torch", columns=model_columns)

print(tokenized)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/6997 [00:00<?, ? examples/s]

Map:   0%|          | 0/778 [00:00<?, ? examples/s]

Map:   0%|          | 0/3019 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 6997
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 778
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3019
    })
})


In [7]:
# ============================================================
# 3. Dataloaders
# ============================================================

from torch.utils.data import DataLoader

train_batch_size = 16
eval_batch_size  = 32

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(
    tokenized["train"],
    shuffle=True,
    batch_size=train_batch_size,
)
valid_loader, test_loader = (
    DataLoader(tokenized[split_name], batch_size=eval_batch_size)
    for split_name in ("validation", "test")
)

# Sanity check: pull one training batch and show its layout.
batch = next(iter(train_loader))
print("Train batch keys:", batch.keys())
print("input_ids shape:", batch["input_ids"].shape)
print("labels shape:", batch["labels"].shape)


Train batch keys: dict_keys(['labels', 'input_ids', 'attention_mask'])
input_ids shape: torch.Size([16, 256])
labels shape: torch.Size([16])


In [15]:
# ============================================================
# 4. Model: load DistilBERT-20NG, replace head for Reuters
# ============================================================

from transformers import DistilBertForSequenceClassification, get_linear_schedule_with_warmup
import torch.nn as nn
import math

# ------------------------------------------------------------
# >>> IMPORTANT: path to your Phase-1 model (20 Newsgroups)
# ------------------------------------------------------------
# Make sure this folder exists in Colab (e.g. uploaded or saved from Phase 1).
# It should contain config.json, pytorch_model.bin, etc.
phase1_model_path = "/content/bert_20ng"  # <<< change if yours is different

# Load the 20NG-fine-tuned model
base_model = DistilBertForSequenceClassification.from_pretrained(phase1_model_path)

print("Original num_labels in 20NG model:", base_model.config.num_labels)

# ------------------------------------------------------------
# Replace classifier head for the Reuters label count
# (Later, when you use RCV1 with 103 labels, this will just use num_labels=103)
# ------------------------------------------------------------
hidden_size = base_model.config.dim

base_model.classifier = nn.Linear(hidden_size, num_labels)

# Update BOTH fields: the config value and the attribute cached on the model.
base_model.config.num_labels = num_labels
base_model.num_labels = num_labels

# Store the real Reuters topic names in the config so the checkpoint saved
# later maps class ids to topic names. The mappings are the ones built in the
# dataset block; they are copied so the config does not alias them.
base_model.config.id2label = dict(id_to_label)
base_model.config.label2id = dict(label_to_id)

# Explicitly initialise the new head: Xavier-uniform weights, zero bias.
nn.init.xavier_uniform_(base_model.classifier.weight)
nn.init.zeros_(base_model.classifier.bias)

print("New num_labels for Reuters:", base_model.config.num_labels)
print("Internal model.num_labels:", base_model.num_labels)

model = base_model.to(device)



Original num_labels in 20NG model: 20
New num_labels for Reuters: 82
Internal model.num_labels: 82


In [16]:
# ============================================================
# 5. Optimizer, Scheduler, and Accelerate Preparation
# ============================================================

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import math

# ------------------------------------------------------------
# Hyperparameters
# ------------------------------------------------------------
learning_rate = 2e-5          # Good for DistilBERT fine-tuning
weight_decay  = 0.01
num_epochs    = 3             # also determines the length of the LR schedule below

# ------------------------------------------------------------
# Optimizer
# ------------------------------------------------------------
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay
)

# ------------------------------------------------------------
# Training length calculations
# ------------------------------------------------------------
# len(train_loader) is read here, before accelerator.prepare() wraps the
# loader. With gradient_accumulation=1 every batch is one optimizer update.
num_update_steps_per_epoch = len(train_loader)
max_train_steps = num_epochs * num_update_steps_per_epoch

print("Steps per epoch:", num_update_steps_per_epoch)
print("Total training steps:", max_train_steps)

# ------------------------------------------------------------
# Warmup + linear LR schedule
# ------------------------------------------------------------
warmup_steps = int(0.1 * max_train_steps)  # 10% warmup

scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=max_train_steps,
)

print("Warmup steps:", warmup_steps)

# ------------------------------------------------------------
# Prepare everything with Accelerate
# This:
# - moves model to GPU
# - wraps optimizer
# - patches train_loader, valid_loader & test_loader
# - enables fp16 automatically
#
# test_loader is prepared together with the other loaders so that it can be
# fed to the prepared model in the same way as train/validation batches.
# ------------------------------------------------------------
model, optimizer, train_loader, valid_loader, test_loader, scheduler = accelerator.prepare(
    model,
    optimizer,
    train_loader,
    valid_loader,
    test_loader,
    scheduler,
)

print("\nAccelerate is ready.")
print(f"Mixed precision mode: {accelerator.mixed_precision}")


Steps per epoch: 438
Total training steps: 1314
Warmup steps: 131

Accelerate is ready.
Mixed precision mode: fp16


In [17]:
# ============================================================
# 6. Training & Evaluation Loops (IMPROVED + SAFE)
# ============================================================

from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

# ------------------------------------------------------------
# Evaluation function
# ------------------------------------------------------------
def evaluate(model, dataloader):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        dataloader: iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.

    Returns:
        The value of ``sklearn.metrics.accuracy_score`` computed over all
        gathered labels and arg-max predictions.

    Side effects:
        Puts ``model`` in eval mode and leaves it there; the caller is
        responsible for calling ``model.train()`` again.
    """
    model.eval()
    pred_chunks = []
    label_chunks = []

    with torch.no_grad():   # no gradients are needed for evaluation
        for batch in dataloader:
            # Labels are deliberately not passed to the model: only the
            # logits are used here, so computing a loss would be wasted work.
            logits = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            ).logits
            preds = logits.argmax(dim=-1)

            # Gather predictions and labels together for multi-GPU metrics.
            preds, labels = accelerator.gather_for_metrics((preds, batch["labels"]))
            pred_chunks.append(preds.cpu())
            label_chunks.append(labels.cpu())

    y_pred = torch.cat(pred_chunks).numpy()
    y_true = torch.cat(label_chunks).numpy()

    return accuracy_score(y_true, y_pred)

# ------------------------------------------------------------
# Training loop
# ------------------------------------------------------------
# `num_epochs` is intentionally NOT re-declared here: it is set with the other
# hyperparameters in section 5, where it also sizes the LR schedule. A second
# copy in this cell could silently drift from the schedule length.
global_step = 0

for epoch in range(num_epochs):
    model.train()   # evaluate() leaves the model in eval mode
    total_loss = 0.0

    progress_bar = tqdm(train_loader, disable=not accelerator.is_local_main_process)
    progress_bar.set_description(f"Epoch {epoch+1}")

    for batch in progress_bar:

        # -------------------------------
        # Forward pass
        # -------------------------------
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        loss = outputs.loss

        # Read the scalar once and reuse it for the running total and the
        # progress bar.
        loss_value = loss.item()
        total_loss += loss_value

        # -------------------------------
        # Backward + Optimizer step
        # -------------------------------
        optimizer.zero_grad()          # clear gradients left from the previous step
        accelerator.backward(loss)

        # Clip the gradient norm to 1.0 before the update.
        accelerator.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        global_step += 1
        progress_bar.set_postfix({"loss": loss_value})

    # -------------------------------
    # Validation after epoch
    # -------------------------------
    avg_train_loss = total_loss / len(train_loader)
    val_acc = evaluate(model, valid_loader)

    if accelerator.is_local_main_process:
        print(f"\nEpoch {epoch+1} finished.")
        print(f"  Avg training loss: {avg_train_loss:.4f}")
        print(f"  Validation accuracy: {val_acc:.4f}\n")

accelerator.wait_for_everyone()


  0%|          | 0/438 [00:00<?, ?it/s]


Epoch 1 finished.
  Avg training loss: 1.5590
  Validation accuracy: 0.8226



  0%|          | 0/438 [00:00<?, ?it/s]


Epoch 2 finished.
  Avg training loss: 0.6072
  Validation accuracy: 0.8470



  0%|          | 0/438 [00:00<?, ?it/s]


Epoch 3 finished.
  Avg training loss: 0.4666
  Validation accuracy: 0.8548



In [18]:
# ============================================================
# 7. Save model locally (auto-download)
# ============================================================

import shutil
from google.colab import files

save_dir = "/content/bert_reuters21578"
# Derived from save_dir so the archive name cannot drift from the folder name.
zip_path = f"{save_dir}.zip"

# Make sure all processes finished
accelerator.wait_for_everyone()

# Unwrap model from Accelerate
unwrapped_model = accelerator.unwrap_model(model)

# Saving, zipping and downloading all happen on the local main process only:
# the archive is built from the folder written here, so running the zip step
# anywhere the folder was not written would be meaningless.
if accelerator.is_local_main_process:
    # Save the model + tokenizer
    unwrapped_model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print("Saved model to:", save_dir)

    # ------------------------------------------------------------
    # ZIP the folder (make_archive appends ".zip" to the base name)
    # ------------------------------------------------------------
    shutil.make_archive(save_dir, "zip", save_dir)
    print("Created zip:", zip_path)

    # ------------------------------------------------------------
    # Auto-download to your computer
    # ------------------------------------------------------------
    files.download(zip_path)


Saved model to: /content/bert_reuters21578
Created zip: /content/bert_reuters21578.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>