## 1. Introduction

In this assignment, I build a complete pipeline for curating the MNIST dataset, training a baseline LeNet classifier, identifying uncertain samples using CLIP embeddings, and creating an improved classifier with an additional IDK (I Don’t Know) class.

The goal is to simulate a real-world dataset curation workflow:

- Identify mislabeled or low-confidence samples
- Reassign them to an IDK class
- Retrain a classifier with 11 classes
- Compare performance between the original and improved models

In [None]:

%%capture
!uv pip install fiftyone==1.7.0 torch==2.6.0 torchvision==0.21 numpy==2.0.2 open-clip-torch==3.2.0
!fiftyone plugins download https://github.com/voxel51/fiftyone-plugins --plugin-names @voxel51/evaluation
!fiftyone plugins download https://github.com/jacobmarks/fiftyone-albumentations-plugin

## 2. Setup & Dependencies

In [None]:

import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as Fun
import torchvision.transforms.v2 as transforms
import fiftyone as fo
import fiftyone.zoo as foz
import fiftyone.brain as fob
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from PIL import Image
from tqdm import tqdm

# Set Seeds for Reproducibility
def set_seeds(seed=51):
    """Seed the Python, NumPy and PyTorch random number generators.

    The seed is also written to the ``PYTHONHASHSEED`` environment variable,
    and ``torch.cuda.manual_seed`` is called when CUDA is available.

    Args:
        seed: Integer seed handed to every generator (defaults to 51).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seeding order as before: Python, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)


# Seed everything once up front, then pick the compute device.
set_seeds()
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

## 3. Loading the MNIST Dataset

In [None]:

# Resize each image to 32x32, then convert it to a float32 image tensor with
# values scaled to [0, 1].
# ToImage + ToDtype(scale=True) is the documented replacement for the
# deprecated v2.ToTensor() and matches the pipeline used for training below.
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),
])

## 4. Visualizing Sample Images

In [None]:
# Modified version
# MNIST test split from the FiftyOne dataset zoo
dataset = foz.load_zoo_dataset("mnist", split="test")

# CLIP model from the FiftyOne model zoo, created on the selected device
clip_model = foz.load_zoo_model("clip-vit-base32-torch", device=device)

# 1. Compute CLIP embeddings and store them in the "clip_embeddings" field
dataset.compute_embeddings(
    model=clip_model,
    embeddings_field="clip_embeddings",
    batch_size=512
)

# 2-3. Build one visualization per method from the stored embeddings:
#      PCA first (fast, linear), then UMAP (slower, captures clusters better)
for vis_method, vis_key in (("pca", "pca_vis"), ("umap", "umap_vis")):
    fob.compute_visualization(
        dataset,
        embeddings="clip_embeddings",
        method=vis_method,
        brain_key=vis_key
    )

# Launch the App to inspect the result (Lab Requirement 1)
session = fo.launch_app(dataset)

## 5. Training the First LeNet Classifier

In this section:

- Implement a LeNet-style CNN
- Train it on the original 10 MNIST classes
- Evaluate its accuracy
- Save the trained model weights

This model is used to generate predictions and confidence values for dataset curation.

In [None]:
# Modified version
class ModernLeNet5(nn.Module):
    """LeNet-style CNN with ReLU activations, max pooling and dropout.

    Three convolutions (1 -> 6 -> 16 -> 120 channels) feed a two-layer
    classifier head (120 -> 84 -> ``num_classes``). ``fc1`` expects 120
    flattened features, which holds when the ``conv3`` output is 1x1
    spatially (e.g. for 28x28 single-channel inputs).

    Args:
        num_classes: Number of output logits (default 10).
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Feature extractor
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=120, kernel_size=4)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Classifier head
        self.fc1 = nn.Linear(in_features=120, out_features=84)
        self.fc2 = nn.Linear(in_features=84, out_features=num_classes)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return raw class logits for a batch of images ``x``."""
        feats = x
        # The first two conv stages are each followed by ReLU and 2x2 pooling.
        for conv in (self.conv1, self.conv2):
            feats = self.pool(Fun.relu(conv(feats)))
        feats = Fun.relu(self.conv3(feats))

        # Collapse everything except the batch dimension.
        flat = feats.view(feats.size(0), -1)
        hidden = self.dropout(Fun.relu(self.fc1(flat)))
        return self.fc2(hidden)

In [None]:

class FiftyOneTorchDataset(Dataset):
    """Map-style PyTorch dataset backed by a FiftyOne sample collection.

    Image paths are read from the ``filepath`` field and class names from
    ``ground_truth.label``. Each item is ``(image, class_index)``, where the
    image is opened in grayscale ("L") mode and optionally transformed.

    Args:
        fo_dataset: Collection providing ``values()`` and ``distinct()``.
        transforms: Optional callable applied to the grayscale PIL image.
        label_map: Optional ``{label string: class index}`` mapping. Pass the
            mapping of another split to keep indices aligned across
            datasets. When omitted, the mapping is built by enumerating the
            sorted distinct labels of ``fo_dataset``.
    """

    def __init__(self, fo_dataset, transforms=None, label_map=None):
        self.samples = fo_dataset.values("filepath")
        self.labels = fo_dataset.values("ground_truth.label")
        self.transforms = transforms
        if label_map is None:
            # Map "0 - zero" to 0, etc.
            # NOTE: sorted() orders strings lexicographically, so a label
            # such as "10 - IDK" lands right after "1 - one", not last.
            distinct_labels = sorted(fo_dataset.distinct("ground_truth.label"))
            label_map = {name: index for index, name in enumerate(distinct_labels)}
        # Copy so later changes to the caller's dict do not leak in here.
        self.label_map = dict(label_map)

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, class_index)`` for the sample at ``idx``.

        The class index is ``-1`` when the sample's label is not present in
        ``label_map``.
        """
        # convert() returns an independent image, so the file handle opened
        # by Image.open can be closed immediately instead of being leaked.
        with Image.open(self.samples[idx]) as raw_image:
            img = raw_image.convert("L")
        if self.transforms:
            img = self.transforms(img)
        label_str = self.labels[idx]
        return img, self.label_map.get(label_str, -1)

In [None]:

# MNIST training split from the FiftyOne dataset zoo
train_data = foz.load_zoo_dataset("mnist", split="train")

# Preprocessing: PIL image -> float32 image tensor in [0, 1] -> normalized
# with mean 0.1307 and std 0.3081
tfms = transforms.Compose([
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Model, optimizer, loss and shuffled data loader for the 10-class baseline
model = ModernLeNet5().to(device)
opt = Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
loader = DataLoader(
    FiftyOneTorchDataset(train_data, tfms),
    batch_size=64,
    shuffle=True,
)

# A single pass over the data (1 epoch is enough to find hard samples)
print("Training baseline...")
model.train()
for batch_images, batch_targets in tqdm(loader):
    batch_images = batch_images.to(device)
    batch_targets = batch_targets.to(device)

    opt.zero_grad()
    batch_loss = loss_fn(model(batch_images), batch_targets)
    batch_loss.backward()
    opt.step()

## 6. Generating Predictions & Confidence Scores

Using the trained model, I compute:

- Logits for each training image
- Softmax probabilities
- Model confidence for the predicted label

These values are added back into the FiftyOne dataset for inspection.

In [None]:

import torch.nn.functional as F
import numpy as np

# --- 1. Setup for Inference ---
# Same dataset wrapper as for training, but with shuffling turned off so the
# i-th prediction lines up with the i-th sample of train_data
inference_dataset = FiftyOneTorchDataset(train_data, tfms)
inference_loader = DataLoader(inference_dataset, batch_size=64, shuffle=False, num_workers=2)

# Class names in the same sorted order the dataset wrapper uses for its
# label -> index mapping, so an output index can be mapped back to a string
classes = sorted(train_data.distinct("ground_truth.label"))

# --- 2. Run Inference (Manual Loop) ---
print("Generating predictions manually...")
model.eval()
predictions = []

# Turn off gradients to save memory
with torch.no_grad():
    for imgs, _ in tqdm(inference_loader):
        # Raw outputs (logits) and their softmax probabilities
        logits = model(imgs.to(device))
        probs = F.softmax(logits, dim=1)

        # Move data back to CPU for processing
        logits = logits.cpu().numpy()
        probs = probs.cpu().numpy()

        # One FiftyOne Classification per image in the batch
        for sample_logits, sample_probs in zip(logits, probs):
            # Cast NumPy scalars to built-in int/float so plain Python
            # values are stored on the label.
            pred_idx = int(sample_probs.argmax())

            # Store the label, the confidence, AND the logits;
            # the logits are required for computing hardness.
            predictions.append(
                fo.Classification(
                    label=classes[pred_idx],
                    confidence=float(sample_probs[pred_idx]),
                    logits=sample_logits.tolist()
                )
            )

# --- 3. Save to Dataset ---
print("Saving to FiftyOne dataset...")
# This bulk operation is much faster than saving samples one by one
train_data.set_values("predictions", predictions)

# --- 4. Compute Hardness ---
print("Computing hardness...")
fob.compute_hardness(train_data, label_field="predictions")

print("Hardness computation complete!")

In [None]:
# Modified version
# Drop any existing dataset with the curated name before it is created again.
stale_name = "mnist-curated-idk"
if stale_name in fo.list_datasets():
    fo.delete_dataset(stale_name)

In [None]:

# 1. Clone the dataset so we don't ruin the original
idk_dataset = train_data.clone()
idk_dataset.name = "mnist-curated-idk"
idk_dataset.persistent = True

# 2. Find the hardest samples: everything above the 98th percentile of the
#    "hardness" field
hardness_thresh = idk_dataset.quantiles("hardness", [0.98])[0]
questionable_view = idk_dataset.match(fo.ViewField("hardness") > hardness_thresh)

print(f"Found {len(questionable_view)} questionable samples.")

# 3. Relabel them as '10 - IDK'
#    autosave=True writes the edits back in batches instead of issuing one
#    save() call per sample.
for sample in questionable_view.iter_samples(autosave=True, progress=True):
    sample["ground_truth"] = fo.Classification(label="10 - IDK")

# 4. Verify classes
print("New Classes:", idk_dataset.distinct("ground_truth.label"))

In [None]:

# Add 'questionable' tag to these samples for visualization in FiftyOne.
# tag_samples() is a bulk operation that only adds the tag where it is
# missing, so re-running this cell no longer appends duplicate tags the way
# a per-sample tags.append() + save() loop does.
questionable_view.tag_samples("questionable")

print("Added 'questionable' tag to all hard samples.")


## 9. Training the Second LeNet Classifier (11 Classes)

Now I retrain LeNet, but this time:

- With 11 output neurons
- Using the IDK-augmented dataset

In [None]:
# Modified version
# 1. Fresh model with 11 outputs and its own optimizer
idk_model = ModernLeNet5(num_classes=11).to(device)
opt = Adam(idk_model.parameters(), lr=1e-3)

# 2. Wrap the curated dataset; the wrapper derives its label map from the
#    labels present in idk_dataset
idk_torch_data = FiftyOneTorchDataset(idk_dataset, tfms)
idk_loader = DataLoader(
    idk_torch_data,
    batch_size=64,
    shuffle=True,
)

# 3. Train for three epochs (a bit longer than the baseline)
print("Training IDK Classifier...")
idk_model.train()
for epoch in range(3):
    for batch_images, batch_targets in tqdm(idk_loader):
        batch_images = batch_images.to(device)
        batch_targets = batch_targets.to(device)

        opt.zero_grad()
        batch_loss = loss_fn(idk_model(batch_images), batch_targets)
        batch_loss.backward()
        opt.step()

print("Training Complete!")

In [None]:

# --- 1. Prepare Test Data for 11 Classes ---
test_dataset = foz.load_zoo_dataset("mnist", split="test")

# Rebuild the label -> index mapping from the curated training labels by
# enumerating the sorted distinct label strings, so the model's output
# indices decode to the class names it was trained on.
idk_label_map = {l: i for i, l in enumerate(sorted(idk_dataset.distinct("ground_truth.label")))}

# Test Loader. The integer targets produced by this wrapper are ignored
# below; predictions are decoded through class_names instead.
test_torch_data = FiftyOneTorchDataset(test_dataset, tfms)
test_loader = DataLoader(test_torch_data, batch_size=64, shuffle=False)  # No shuffle for evaluation!

# --- 2. Run Inference ---
print("Evaluating on Test Set...")
idk_model.eval()
idk_predictions = []

# class_names[i] is the label string for output index i.
# NOTE: the mapping comes from a lexicographic sort of the label strings, so
# "10 - IDK" sits right after "1 - one" rather than at index 10. Decoding
# must go through this list, never through the numeric prefix of the label.
class_names = sorted(idk_label_map, key=idk_label_map.get)

with torch.no_grad():
    for imgs, _ in tqdm(test_loader):
        logits = idk_model(imgs.to(device))
        probs = Fun.softmax(logits, dim=1).cpu().numpy()

        for sample_probs in probs:
            # Cast NumPy scalars to built-in int/float so plain Python
            # values are stored on the label.
            pred_idx = int(sample_probs.argmax())
            idk_predictions.append(
                fo.Classification(
                    label=class_names[pred_idx],
                    confidence=float(sample_probs[pred_idx])
                )
            )

# --- 3. Store & Visualize ---
# Predictions were produced in dataset order (shuffle=False), so they can be
# written back in one bulk call.
test_dataset.set_values("idk_predictions", idk_predictions)

# Generate the report (Accuracy, Precision, etc.)
results = test_dataset.evaluate_classifications(
    "idk_predictions",
    gt_field="ground_truth",
    eval_key="eval_idk"
)

print("Evaluation Results:")
results.print_report()

# Plot Confusion Matrix (Required for Lab!)
plot = results.plot_confusion_matrix()
plot.show()

In [None]:
# Modified version
# Open the FiftyOne App on `dataset`, the MNIST test split loaded for the
# CLIP embedding visualization earlier in this notebook.
session = fo.launch_app(dataset)

## 11. Saving the Curated Dataset & Predictions

In [None]:

!pip install huggingface_hub

In [None]:

from huggingface_hub import notebook_login
# Hugging Face Hub login for this notebook session; presumably needed before
# the upload step at the end of the notebook can push files.
notebook_login()

In [None]:

# Directory that receives the exported curated dataset
export_dir = "mnist_curated_idk"

# Export as a classification directory tree.
# idk_dataset carries two classification fields ("ground_truth" and
# "predictions"), so the field to export is named explicitly instead of
# relying on the exporter's default choice.
idk_dataset.export(
    export_dir=export_dir,
    dataset_type=fo.types.ImageClassificationDirectoryTree,
    label_field="ground_truth",
)


In [None]:

from huggingface_hub import HfApi

api = HfApi()
repo_id = "ishalijadhav/mnist-curated"

# Make sure the dataset repo exists instead of assuming it does.
# exist_ok=True turns this into a no-op when the repo is already there, so
# the cell is safe to re-run.
api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
print(f"Dataset repo ready: {repo_id}")


In [None]:
from huggingface_hub import upload_large_folder

# Push the exported dataset folder to the Hugging Face dataset repo.
curated_folder = "mnist_curated_idk"
target_repo = "ishalijadhav/mnist-curated"
upload_large_folder(
    repo_id=target_repo,
    repo_type="dataset",
    folder_path=curated_folder,
)
