# Step 6: Evaluation and Calibration (Pilot)

This notebook evaluates the **text expert**, **vision expert**, and **fusion model** on the pilot shard from Step 3.

You can later adapt the same code to full train/val/test splits once Step 3 has been run on all data.


In [1]:
# Install required packages for Step 6 (run once per environment).
# You can skip this cell if everything is already installed.

%pip install --upgrade pip

# Core libraries
%pip install torch torchvision torchaudio

# NLP / vision / datasets / training utilities
%pip install transformers datasets webdataset accelerate timm sentencepiece


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

import json
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import webdataset as wds

from transformers import (
    AutoTokenizer,
    AutoImageProcessor,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoModelForImageClassification,
)

# Detect project root so this works whether you start Jupyter in the repo root
# or from inside Step_6/.
cwd = Path.cwd().resolve()
if (cwd / "Step_3").is_dir():
    root = cwd
else:
    root = cwd.parent

step3 = root / "Step_3"
shards_dir = step3 / "shards" / "train"
shard_pattern = str(shards_dir / "shard-000000.tar")  # 50-example pilot shard

models_root = root / "models"
text_expert_dir = models_root / "text_expert"
vision_expert_dir = models_root / "vision_expert"
mm_fusion_dir = models_root / "mm_fusion"
mm_fusion_path = mm_fusion_dir / "fusion_model.pt"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Using device:", device)
print("Shard pattern:", shard_pattern)
print("Text expert dir:", text_expert_dir)
print("Vision expert dir:", vision_expert_dir)
print("Fusion model path:", mm_fusion_path)


Using device: cpu
Shard pattern: /Users/yashwanthreddy/Documents/GitHub/DL_Proj/Step_3/shards/train/shard-000000.tar
Text expert dir: /Users/yashwanthreddy/Documents/GitHub/DL_Proj/models/text_expert
Vision expert dir: /Users/yashwanthreddy/Documents/GitHub/DL_Proj/models/vision_expert
Fusion model path: /Users/yashwanthreddy/Documents/GitHub/DL_Proj/models/mm_fusion/fusion_model.pt


In [3]:
def make_eval_examples(shard_pattern: str, max_samples: Optional[int] = None) -> List[Dict[str, Any]]:
    """Create a list of {text, image, label} from WebDataset shards for evaluation."""

    ds = (
        wds.WebDataset(shard_pattern, shardshuffle=False)
        .decode("pil")
        .to_tuple("txt", "png", "json")
    )

    out: List[Dict[str, Any]] = []
    for text_obj, img, meta_obj in ds:
        # Decode text
        if isinstance(text_obj, (bytes, bytearray)):
            text = text_obj.decode("utf-8", errors="replace")
        else:
            text = str(text_obj)

        # Decode metadata
        if isinstance(meta_obj, (bytes, bytearray)):
            meta = json.loads(meta_obj.decode("utf-8"))
        else:
            meta = meta_obj

        labels = (meta or {}).get("labels", {})
        y = labels.get("abuse_hate")
        if y is None:
            continue

        out.append({
            "text": text,
            "image": img,
            "label": int(y),
        })

        if max_samples is not None and len(out) >= max_samples:
            break

    return out


eval_examples = make_eval_examples(shard_pattern, max_samples=1000)
print(f"Loaded {len(eval_examples)} evaluation examples.")


Loaded 50 evaluation examples.


In [4]:
class EvalDataset(Dataset):
    def __init__(self, examples: List[Dict[str, Any]]):
        self.examples = examples

    def __len__(self) -> int:
        return len(self.examples)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        return self.examples[idx]


def collate_batch(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
    texts = [b["text"] for b in batch]
    images = [b["image"] for b in batch]
    labels = torch.tensor([b["label"] for b in batch], dtype=torch.long)
    return {"texts": texts, "images": images, "labels": labels}


dataset_eval = EvalDataset(eval_examples)
loader_eval = DataLoader(dataset_eval, batch_size=8, shuffle=False, collate_fn=collate_batch)

print("Batches per epoch:", len(loader_eval))


Batches per epoch: 7


In [5]:
def compute_accuracy(preds: np.ndarray, labels: np.ndarray) -> float:
    return float((preds == labels).mean()) if len(labels) > 0 else 0.0


def compute_macro_f1(preds: np.ndarray, labels: np.ndarray, num_classes: int = 2) -> float:
    f1s: List[float] = []
    for c in range(num_classes):
        tp = np.logical_and(preds == c, labels == c).sum()
        fp = np.logical_and(preds == c, labels != c).sum()
        fn = np.logical_and(preds != c, labels == c).sum()

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if precision + recall == 0:
            f1 = 0.0
        else:
            f1 = 2 * precision * recall / (precision + recall)
        f1s.append(f1)
    return float(np.mean(f1s)) if f1s else 0.0


def compute_brier_score(probs_pos: np.ndarray, labels: np.ndarray) -> float:
    # probs_pos: probability of class 1
    return float(np.mean((probs_pos - labels) ** 2)) if len(labels) > 0 else 0.0


def compute_ece(probs_pos: np.ndarray, labels: np.ndarray, num_bins: int = 10) -> float:
    # Simple Expected Calibration Error for binary classification.
    bins = np.linspace(0.0, 1.0, num_bins + 1)
    ece = 0.0
    n = len(labels)
    if n == 0:
        return 0.0

    for i in range(num_bins):
        mask = (probs_pos >= bins[i]) & (probs_pos < bins[i + 1])
        if not np.any(mask):
            continue
        bin_conf = probs_pos[mask].mean()
        bin_acc = (labels[mask] == (probs_pos[mask] >= 0.5)).mean()
        ece += (mask.sum() / n) * abs(bin_conf - bin_acc)
    return float(ece)


def summarize_metrics(logits: torch.Tensor, labels: torch.Tensor) -> Dict[str, float]:
    probs = torch.softmax(logits, dim=-1).cpu().numpy()
    preds = probs.argmax(axis=-1)
    labels_np = labels.cpu().numpy()
    probs_pos = probs[:, 1]

    acc = compute_accuracy(preds, labels_np)
    macro_f1 = compute_macro_f1(preds, labels_np, num_classes=2)
    brier = compute_brier_score(probs_pos, labels_np)
    ece = compute_ece(probs_pos, labels_np, num_bins=10)

    return {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "brier": brier,
        "ece": ece,
    }


In [6]:
# Load tokenizers / processors used by the experts

text_tokenizer = AutoTokenizer.from_pretrained(text_expert_dir)
image_processor = AutoImageProcessor.from_pretrained(vision_expert_dir)


def evaluate_text_expert(loader: DataLoader) -> Dict[str, float]:
    model = AutoModelForSequenceClassification.from_pretrained(text_expert_dir)
    model.to(device)
    model.eval()

    all_logits: List[torch.Tensor] = []
    all_labels: List[torch.Tensor] = []

    with torch.no_grad():
        for batch in loader:
            texts = batch["texts"]
            labels = batch["labels"].to(device)

            enc = text_tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=256,
                return_tensors="pt",
            )
            enc = {k: v.to(device) for k, v in enc.items()}

            outputs = model(**enc)
            logits = outputs.logits

            all_logits.append(logits.cpu())
            all_labels.append(labels.cpu())

    logits_cat = torch.cat(all_logits, dim=0)
    labels_cat = torch.cat(all_labels, dim=0)
    metrics = summarize_metrics(logits_cat, labels_cat)
    return metrics


def evaluate_vision_expert(loader: DataLoader) -> Dict[str, float]:
    model = AutoModelForImageClassification.from_pretrained(vision_expert_dir)
    model.to(device)
    model.eval()

    all_logits: List[torch.Tensor] = []
    all_labels: List[torch.Tensor] = []

    with torch.no_grad():
        for batch in loader:
            images = batch["images"]
            labels = batch["labels"].to(device)

            enc = image_processor(images=images, return_tensors="pt")
            pixel_values = enc["pixel_values"].to(device)

            outputs = model(pixel_values=pixel_values)
            logits = outputs.logits

            all_logits.append(logits.cpu())
            all_labels.append(labels.cpu())

    logits_cat = torch.cat(all_logits, dim=0)
    labels_cat = torch.cat(all_labels, dim=0)
    metrics = summarize_metrics(logits_cat, labels_cat)
    return metrics


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
# Fusion model definition and evaluation


class FusionModel(nn.Module):
    def __init__(
        self,
        text_encoder: nn.Module,
        vision_encoder: nn.Module,
        t_dim: int,
        v_dim: int,
        hidden_dim: int,
        num_labels: int,
    ) -> None:
        super().__init__()
        self.text_encoder = text_encoder
        self.vision_encoder = vision_encoder
        self.mlp = nn.Sequential(
            nn.Linear(t_dim + v_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, num_labels),
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        pixel_values: torch.Tensor,
    ) -> torch.Tensor:
        with torch.no_grad():
            text_out = self.text_encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            if hasattr(text_out, "pooler_output") and text_out.pooler_output is not None:
                t_repr = text_out.pooler_output
            else:
                t_repr = text_out.last_hidden_state[:, 0, :]

            vision_out = self.vision_encoder(pixel_values=pixel_values)
            v_repr = vision_out.logits

        h = torch.cat([t_repr, v_repr], dim=-1)
        logits = self.mlp(h)
        return logits


def load_fusion_model() -> FusionModel:
    # Recreate encoders as in Step 5
    text_encoder = AutoModel.from_pretrained(text_expert_dir)
    vision_encoder = AutoModelForImageClassification.from_pretrained(vision_expert_dir)

    text_encoder.to(device)
    vision_encoder.to(device)

    for p in text_encoder.parameters():
        p.requires_grad = False
    for p in vision_encoder.parameters():
        p.requires_grad = False

    t_dim = text_encoder.config.hidden_size
    v_dim = vision_encoder.config.num_labels

    fusion_hidden = 512
    num_labels = 2

    model = FusionModel(
        text_encoder=text_encoder,
        vision_encoder=vision_encoder,
        t_dim=t_dim,
        v_dim=v_dim,
        hidden_dim=fusion_hidden,
        num_labels=num_labels,
    )
    state_dict = torch.load(mm_fusion_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model


def evaluate_fusion_model(loader: DataLoader) -> Dict[str, float]:
    model = load_fusion_model()

    all_logits: List[torch.Tensor] = []
    all_labels: List[torch.Tensor] = []

    with torch.no_grad():
        for batch in loader:
            texts = batch["texts"]
            images = batch["images"]
            labels = batch["labels"].to(device)

            enc_text = text_tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=256,
                return_tensors="pt",
            )
            enc_text = {k: v.to(device) for k, v in enc_text.items()}

            enc_img = image_processor(images=images, return_tensors="pt")
            pixel_values = enc_img["pixel_values"].to(device)

            logits = model(
                input_ids=enc_text["input_ids"],
                attention_mask=enc_text["attention_mask"],
                pixel_values=pixel_values,
            )

            all_logits.append(logits.cpu())
            all_labels.append(labels.cpu())

    logits_cat = torch.cat(all_logits, dim=0)
    labels_cat = torch.cat(all_labels, dim=0)
    metrics = summarize_metrics(logits_cat, labels_cat)
    return metrics


In [8]:
# Run evaluation for all three models and save metrics to JSON

results: Dict[str, Dict[str, float]] = {}

print("Evaluating text expert...")
results["text_expert"] = evaluate_text_expert(loader_eval)
print("Text expert metrics:", results["text_expert"])

print("\nEvaluating vision expert...")
results["vision_expert"] = evaluate_vision_expert(loader_eval)
print("Vision expert metrics:", results["vision_expert"])

print("\nEvaluating fusion model...")
results["mm_fusion"] = evaluate_fusion_model(loader_eval)
print("Fusion model metrics:", results["mm_fusion"])

# Save to JSON
step6_results_dir = Path("Step_6") if (Path.cwd() / "Step_6").is_dir() else root / "Step_6"
step6_results_dir.mkdir(exist_ok=True)
results_path = step6_results_dir / "results_pilot.json"
with results_path.open("w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print("\nSaved pilot evaluation metrics to", results_path)


Evaluating text expert...
Text expert metrics: {'accuracy': 0.9, 'macro_f1': 0.4736842105263158, 'brier': 0.12276667188716842, 'ece': 0.6138291072845459}

Evaluating vision expert...
Vision expert metrics: {'accuracy': 0.9, 'macro_f1': 0.4736842105263158, 'brier': 0.09003111768167933, 'ece': 0.8844988291338086}

Evaluating fusion model...
Fusion model metrics: {'accuracy': 0.9, 'macro_f1': 0.4736842105263158, 'brier': 0.08550730560385912, 'ece': 0.7489281076192857}

Saved pilot evaluation metrics to /Users/yashwanthreddy/Documents/GitHub/DL_Proj/Step_6/results_pilot.json
