# FER Transfer Learning (PyTorch) — ResNet / EfficientNet
*Auto-generated on 2025-09-08 17:27*

This notebook upgrades your baseline CNN to **transfer learning** using pretrained backbones from `torchvision`
(e.g., **ResNet-18/50**, **EfficientNet-B0**). It includes:
- robust **CSV loader** (paths + labels)
- **data augmentations** suitable for face images
- **label smoothing**, **class weights**, optional **Focal Loss**
- **CosineAnnealingLR**, **Warmup**, **Early Stopping**
- **mixed precision** (AMP) and **gradient clipping**
- full **evaluation** (accuracy, classification report, confusion matrix)
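- **Checkpointing** of the best model by validation accuracy (reloaded for final evaluation and inference; **resuming** training would additionally require saving optimizer/scheduler state)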

> ⚙️ Fill the paths in **Config** below to point to your `train.csv`, `val.csv`, `test.csv`, and `label_map.json`.

In [None]:
# If you're in Colab, uncomment the following (torch/torchvision are usually preinstalled).
# !pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# !pip install -q scikit-learn pandas matplotlib tqdm

In [None]:
import os, json, math, time, random, shutil
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms as T
from torchvision import models

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

from tqdm.auto import tqdm

## 1) Config — set your paths and hyperparameters

In [None]:
# ==== PATHS (EDIT THESE) ====
# CSV_TRAIN and CSV_VAL are always read. CSV_TEST and LABEL_MAP are optional:
# each one is simply skipped when the file does not exist.
CSV_TRAIN = "/content/drive/MyDrive/ann-visual-emotion/data/processed/EmoSet_splits/train.csv"
CSV_VAL   = "/content/drive/MyDrive/ann-visual-emotion/data/processed/EmoSet_splits/val.csv"
CSV_TEST  = "/content/drive/MyDrive/ann-visual-emotion/data/processed/EmoSet_splits/test.csv"  # optional: test evaluation is skipped if this file is missing
LABEL_MAP = "/content/drive/MyDrive/ann-visual-emotion/data/processed/EmoSet_splits/label_map.json"  # optional: classes are inferred from the train CSV if this file is missing

# If your CSVs are already uploaded to this environment (e.g., /mnt/data/train.csv), set like:
# CSV_TRAIN = "/mnt/data/train.csv"
# CSV_VAL   = "/mnt/data/val.csv"
# CSV_TEST  = "/mnt/data/test.csv"
# LABEL_MAP = "/mnt/data/label_map.json"

# ==== DATA FORMAT ====
# Each CSV is expected to have at least two columns: an image-path column and a label column.
# - image: absolute path, or a path relative to IMG_ROOT (see below).
# - label: class name string (e.g., 'anger', 'happiness', etc.)
# If your column names differ, edit the names below:
COL_IMAGE = "image"
COL_LABEL = "label"

# Root directory prepended to image paths that are not absolute.
# Leave empty to use the CSV paths exactly as written.
IMG_ROOT = ""  # e.g., "/content/drive/MyDrive/ann-visual-emotion/data/raw/EmoSet"

# ==== HYPERPARAMETERS ====
IMG_SIZE    = 224            # side length of the square center crop fed to the network
BATCH_SIZE  = 64
EPOCHS      = 30             # maximum number of epochs; also T_max of the cosine LR schedule
LR          = 3e-4           # base learning rate (AdamW); warmup scales this value
WEIGHT_DECAY= 1e-4
LABEL_SMOOTH= 0.05           # 0..0.2 is typical; ignored when USE_FOCAL_LOSS is True
EARLY_STOP  = 8              # stop after this many consecutive epochs without a new best val accuracy

# Choose a backbone: 'resnet18', 'resnet50', 'efficientnet_b0'
BACKBONE    = "resnet18"

# Class-imbalance handling. The two switches are independent:
# - USE_CLASS_WEIGHTS: pass inverse-frequency class weights to the loss.
# - USE_WEIGHTED_SAMPLER: draw training samples with a WeightedRandomSampler
#   (with replacement) instead of plain shuffling.
USE_CLASS_WEIGHTS      = True
USE_WEIGHTED_SAMPLER   = False   # Set True if you prefer sampling instead of weights

# Tricks
USE_MIXED_PRECISION = True    # AMP; only takes effect when running on CUDA
GRAD_CLIP_NORM      = 1.0     # max gradient norm; None or a value <= 0 disables clipping
USE_FOCAL_LOSS      = False   # If True, focal loss overrides CE w/ label-smoothing
FOCAL_GAMMA         = 2.0     # focusing exponent applied to (1 - p_target)
WARMUP_EPOCHS       = 2       # during the first N epochs the LR is set to LR * (epoch + 1) / N
SEED                = 42

# === OUTPUTS ===
# Directory where the best checkpoint is written; created if it does not exist.
OUT_DIR = "./runs_fer_transfer"
os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
def set_seed(seed=42):
    """Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) RNGs with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

# Seed every RNG first, then pick the compute device: CUDA when available, CPU otherwise.
set_seed(SEED)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

def read_label_map(path):
    """Load a class-name <-> index mapping from a JSON file.

    Two JSON layouts are accepted:

    * flat: ``{"amusement": 0, ...}`` (class name -> index)
    * nested: ``{"str2idx": {...}, "idx2str": ...}`` where ``idx2str`` is
      either a list ordered by index or a dict keyed by index.

    Args:
        path: Location of the JSON file.

    Returns:
        ``(str2idx, idx2str)`` as two dicts (name -> int, int -> name), or
        ``(None, None)`` when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return None, None

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, dict) and "str2idx" in data and "idx2str" in data:
        str2idx = {k: int(v) for k, v in data["str2idx"].items()}
        raw_idx2str = data["idx2str"]
        if isinstance(raw_idx2str, dict):
            # JSON object keys are always strings, so cast them back to int.
            idx2str = {int(k): v for k, v in raw_idx2str.items()}
        else:
            # List layout: the position in the list is the class index.
            idx2str = dict(enumerate(raw_idx2str))
    else:
        # Assume a flat mapping name -> index and invert it.
        str2idx = {k: int(v) for k, v in data.items()}
        idx2str = {v: k for k, v in str2idx.items()}
    return str2idx, idx2str

## 2) Dataset & Dataloaders

In [None]:
from PIL import Image

class CSVDataset(Dataset):
    """Image-classification dataset backed by a CSV of image paths and label names.

    Args:
        csv_path: CSV file to read with pandas.
        transform: Optional callable applied to the RGB PIL image.
        img_root: Directory joined onto non-absolute image paths; an empty
            string leaves the paths untouched.
        col_img: Name of the column holding image paths.
        col_lab: Name of the column holding class names.
        encoder: Object exposing ``transform`` and ``classes_`` (e.g. a fitted
            ``LabelEncoder``). When ``None``, a ``LabelEncoder`` is fitted on
            this CSV's own labels.

    Raises:
        KeyError: If ``col_img`` or ``col_lab`` is not a column of the CSV.
    """

    def __init__(self, csv_path, transform=None, img_root="", col_img="image", col_lab="label", encoder=None):
        self.df = pd.read_csv(csv_path)
        self.transform = transform
        self.img_root = img_root
        self.col_img = col_img
        self.col_lab = col_lab

        # Fail early with a readable message instead of a bare pandas KeyError.
        missing = [c for c in (col_img, col_lab) if c not in self.df.columns]
        if missing:
            raise KeyError(
                f"{csv_path}: missing column(s) {missing}; available columns: {list(self.df.columns)}"
            )

        # Labels are always compared as strings, whatever dtype pandas inferred.
        label_names = self.df[self.col_lab].astype(str).values
        if encoder is None:
            self.encoder = LabelEncoder().fit(label_names)
        else:
            self.encoder = encoder
        self.labels = self.encoder.transform(label_names)
        self.n_classes = len(self.encoder.classes_)

        self.paths = self.df[self.col_img].astype(str).tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``; the image is transformed if a transform is set."""
        path = self.paths[idx]
        if self.img_root and not os.path.isabs(path):
            path = os.path.join(self.img_root, path)
        # Open inside a context manager so the underlying file is closed as
        # soon as the pixels have been converted to an in-memory RGB image.
        with Image.open(path) as raw:
            img = raw.convert("RGB")
        if self.transform:
            img = self.transform(img)
        label = self.labels[idx]
        return img, label

def build_transforms(img_size=224):
    """Build the ``(train, eval)`` torchvision pipelines for square crops of side ``img_size``.

    Both pipelines resize to ``int(img_size * 1.1)``, center-crop to
    ``img_size``, convert to a tensor and normalize with the mean/std below.
    The training pipeline additionally applies a horizontal flip and a mild
    color jitter between the crop and the tensor conversion.
    """
    def _geometry():
        # Fresh instances on every call so the two pipelines share no objects.
        return [T.Resize(int(img_size*1.1)), T.CenterCrop(img_size)]

    def _tensorize():
        return [
            T.ToTensor(),
            T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
        ]

    # Augmentations tuned for faces (avoid crazy rotations/crops)
    augment = [
        T.RandomHorizontalFlip(p=0.5),
        T.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.1, hue=0.02),
    ]

    train_tf = T.Compose(_geometry() + augment + _tensorize())
    eval_tf = T.Compose(_geometry() + _tensorize())
    return train_tf, eval_tf

train_tf, eval_tf = build_transforms(IMG_SIZE)

# Label map (optional). If absent, classes are inferred from the train CSV via LabelEncoder.
str2idx, idx2str = read_label_map(LABEL_MAP)

# Build the encoder that every split shares.
enc = None
if str2idx is not None:
    # Class names ordered by the index assigned in the label map.
    classes_sorted = [k for k,_ in sorted(str2idx.items(), key=lambda kv: kv[1])]
    enc = LabelEncoder()
    enc.fit(classes_sorted)
    # LabelEncoder stores its classes in sorted order, so the indices used for
    # training only match the label map when the map itself is alphabetical.
    # Surface any mismatch instead of silently diverging from the JSON file.
    if [str(c) for c in enc.classes_] != classes_sorted:
        print("WARNING: label_map order differs from LabelEncoder's sorted order; "
              "training uses the sorted order printed below as 'Classes'.")

ds_tr = CSVDataset(CSV_TRAIN, transform=train_tf, img_root=IMG_ROOT, col_img=COL_IMAGE, col_lab=COL_LABEL, encoder=enc)
ds_va = CSVDataset(CSV_VAL,   transform=eval_tf,  img_root=IMG_ROOT, col_img=COL_IMAGE, col_lab=COL_LABEL, encoder=ds_tr.encoder)
ds_te = None
if os.path.exists(CSV_TEST):
    ds_te = CSVDataset(CSV_TEST, transform=eval_tf, img_root=IMG_ROOT, col_img=COL_IMAGE, col_lab=COL_LABEL, encoder=ds_tr.encoder)

# encoder.classes_ is a NumPy array; convert to plain Python strings so the
# list prints cleanly and can be stored in a checkpoint without NumPy scalars.
classes = [str(c) for c in ds_tr.encoder.classes_]
num_classes = len(classes)
print("Classes:", classes)
print("Train/Val/Test sizes:", len(ds_tr), len(ds_va), 0 if ds_te is None else len(ds_te))

In [None]:
# Inverse-frequency class weights: total / (num_classes * count), with empty
# classes counted as 1 to avoid division by zero.
y_tr = ds_tr.labels
class_counts = np.bincount(y_tr, minlength=num_classes).astype(float)
class_weights = class_counts.sum() / (num_classes * np.clip(class_counts, 1.0, None))
class_weights_t = torch.tensor(class_weights, dtype=torch.float32)

print("Class counts:", class_counts)
print("Class weights:", class_weights)

# Settings shared by every DataLoader below.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)

if not USE_WEIGHTED_SAMPLER:
    loader_tr = DataLoader(ds_tr, shuffle=True, **_loader_kwargs)
else:
    # Each sample is drawn with probability proportional to its class weight,
    # i.e. inversely proportional to its class frequency.
    sample_weights = class_weights[y_tr]
    sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
    loader_tr = DataLoader(ds_tr, sampler=sampler, **_loader_kwargs)

loader_va = DataLoader(ds_va, shuffle=False, **_loader_kwargs)
if ds_te:
    loader_te = DataLoader(ds_te, shuffle=False, **_loader_kwargs)
else:
    loader_te = None

## 3) Model — choose a pretrained backbone

In [None]:
def build_model(backbone: str, n_classes: int):
    """Create a torchvision backbone with its DEFAULT weights and a new classification head.

    Args:
        backbone: One of ``'resnet18'``, ``'resnet50'``, ``'efficientnet_b0'``
            (case-insensitive).
        n_classes: Number of output logits of the replacement head.

    Returns:
        The network with its final classifier replaced by
        ``Sequential(Dropout, Linear(in_features, n_classes))``.

    Raises:
        ValueError: If ``backbone`` is not one of the supported names.
    """
    backbone = backbone.lower()

    # name -> (constructor, weights enum, attribute holding the head, dropout p)
    recipes = {
        "resnet18": ("resnet18", "ResNet18_Weights", "fc", 0.2),
        "resnet50": ("resnet50", "ResNet50_Weights", "fc", 0.3),
        "efficientnet_b0": ("efficientnet_b0", "EfficientNet_B0_Weights", "classifier", 0.2),
    }
    if backbone not in recipes:
        raise ValueError(f"Unknown backbone: {backbone}")

    ctor_name, weights_name, head_attr, drop_p = recipes[backbone]
    weights = getattr(models, weights_name).DEFAULT
    net = getattr(models, ctor_name)(weights=weights)

    old_head = getattr(net, head_attr)
    # The ResNet head exposes in_features directly; the EfficientNet
    # classifier is indexed at [1] to reach the layer that does.
    in_feats = old_head.in_features if head_attr == "fc" else old_head[1].in_features

    new_head = nn.Sequential(
        nn.Dropout(p=drop_p),
        nn.Linear(in_feats, n_classes)
    )
    setattr(net, head_attr, new_head)
    return net

# Instantiate the configured backbone, then move it to the selected device.
model = build_model(BACKBONE, num_classes)
model = model.to(device)
print(f"Built model: {BACKBONE} with {num_classes} classes")

## 4) Losses and Optimizer

In [None]:
class LabelSmoothingCE(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The target class gets probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly across the other ``n - 1`` classes.
    With a single class there is nothing to smooth towards, so the target
    keeps probability 1.

    Args:
        smoothing: Probability mass moved away from the target class.
        weight: Optional 1-D tensor of per-class weights, multiplied into each
            class's term of the loss. It must already live on the same device
            as the logits.
        reduction: ``"mean"`` or ``"sum"`` over the batch; any other value
            returns the per-sample losses.
    """

    def __init__(self, smoothing=0.0, weight=None, reduction="mean"):
        super().__init__()
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    def forward(self, logits, target):
        """Return the loss for ``logits`` (batch, n_classes) and integer ``target`` (batch,)."""
        n = logits.size(-1)
        logp = F.log_softmax(logits, dim=-1)
        with torch.no_grad():
            if n > 1:
                off_value = self.smoothing / (n - 1)
                on_value = 1 - self.smoothing
            else:
                # Single-class edge case: avoid dividing by zero.
                off_value, on_value = 0.0, 1.0
            true_dist = torch.full_like(logp, off_value)
            true_dist.scatter_(1, target.unsqueeze(1), on_value)

        per_class = true_dist * logp
        if self.weight is not None:
            # Broadcast the per-class weights over the batch dimension.
            per_class = per_class * self.weight.unsqueeze(0)
        loss = -per_class.sum(dim=1)

        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        return loss

class FocalLoss(nn.Module):
    """Focal loss: cross-entropy scaled per sample by ``(1 - p_target) ** gamma``.

    Args:
        gamma: Focusing exponent.
        weight: Optional 1-D tensor of per-class weights, looked up by each
            sample's target class.
        reduction: ``"mean"`` or ``"sum"`` over the batch; any other value
            returns the per-sample losses.
    """

    def __init__(self, gamma=2.0, weight=None, reduction="mean"):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, logits, target):
        """Return the loss for ``logits`` (batch, n_classes) and integer ``target`` (batch,)."""
        log_probs = F.log_softmax(logits, dim=-1)
        picked = target.unsqueeze(1)
        # Log-probability and probability assigned to each sample's true class.
        logpt = log_probs.gather(1, picked).squeeze(1)
        pt = torch.exp(log_probs).gather(1, picked).squeeze(1)

        modulator = (1 - pt) ** self.gamma
        loss = -modulator * logpt
        if self.weight is not None:
            loss = loss * self.weight[target]

        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss

# Per-class loss weights (only when enabled), moved to the training device.
weight_vec = None
if USE_CLASS_WEIGHTS:
    weight_vec = class_weights_t.to(device)

# Focal loss replaces label-smoothed cross-entropy when enabled.
criterion = (
    FocalLoss(gamma=FOCAL_GAMMA, weight=weight_vec)
    if USE_FOCAL_LOSS
    else LabelSmoothingCE(smoothing=LABEL_SMOOTH, weight=weight_vec)
)

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# Cosine annealing over EPOCHS steps (stepped once per epoch); the linear
# warmup is applied separately inside the training loop.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

## 5) Training & Evaluation

In [None]:
def accuracy_from_logits(logits, y):
    """Return, as a Python float, the fraction of rows whose argmax over dim 1 equals ``y``."""
    hits = torch.eq(logits.argmax(dim=1), y)
    return hits.float().mean().item()

def train_one_epoch(model, loader, optimizer, scaler, epoch, total_epochs, warmup_epochs, grad_clip):
    """Run one training epoch and return ``(mean_loss, mean_accuracy)``.

    Reads the module-level ``device``, ``criterion`` and ``LR``.

    Args:
        model: Network to train; switched to train mode here.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer whose param groups are updated.
        scaler: AMP ``GradScaler`` to train under autocast, or ``None`` for
            full-precision training.
        epoch: Zero-based index of the current epoch.
        total_epochs: Total number of epochs (used only for the progress bar).
        warmup_epochs: Number of initial epochs with linear LR warmup; 0 disables it.
        grad_clip: Max gradient norm; ``None`` or a value <= 0 disables clipping.

    Returns:
        Mean of the per-batch losses and mean of the per-batch accuracies.
    """
    model.train()

    # LR warmup (linear). The factor depends only on the epoch, so set it once
    # before the batch loop instead of rewriting it on every step.
    if warmup_epochs > 0 and epoch < warmup_epochs:
        warmup_factor = (epoch + 1) / max(1, warmup_epochs)
        for g in optimizer.param_groups:
            g['lr'] = LR * warmup_factor

    clip_enabled = grad_clip is not None and grad_clip > 0
    losses = []
    accs = []
    pbar = tqdm(loader, desc=f"Train {epoch+1}/{total_epochs}")
    for x, y in pbar:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        if scaler is not None:
            with torch.cuda.amp.autocast():
                logits = model(x)
                loss = criterion(logits, y)
            scaler.scale(loss).backward()
            if clip_enabled:
                # Gradients must be unscaled before their norm is measured.
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            if clip_enabled:
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

        acc = accuracy_from_logits(logits, y)
        losses.append(loss.item())
        accs.append(acc)
        pbar.set_postfix(loss=np.mean(losses), acc=np.mean(accs))
    return float(np.mean(losses)), float(np.mean(accs))

@torch.no_grad()
def evaluate(model, loader):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Reads the module-level ``device`` and ``criterion``. Loss and accuracy are
    averaged over samples rather than over batches, so a smaller final batch
    does not skew the result. The loss average assumes ``criterion`` returns
    the mean over the batch (the default reduction of both losses in this
    notebook).

    Args:
        model: Network to evaluate; switched to eval mode here.
        loader: Iterable yielding ``(inputs, targets)`` batches.

    Returns:
        ``(loss, accuracy, y_true, y_pred)`` where the last two are integer
        NumPy arrays of targets and argmax predictions. Loss and accuracy are
        NaN when the loader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    n_seen = 0
    y_true, y_pred = [], []
    for x, y in tqdm(loader, desc="Eval", leave=False):
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        logits = model(x)
        loss = criterion(logits, y)

        batch_size = y.size(0)
        # Undo the per-batch mean so every sample carries equal weight.
        total_loss += loss.item() * batch_size
        n_seen += batch_size
        y_true.extend(y.cpu().numpy().tolist())
        y_pred.extend(logits.argmax(dim=1).cpu().numpy().tolist())

    y_true = np.array(y_true, dtype=int)
    y_pred = np.array(y_pred, dtype=int)
    if n_seen == 0:
        return float("nan"), float("nan"), y_true, y_pred
    return (total_loss / n_seen, float((y_true == y_pred).mean()),
            y_true, y_pred)

# AMP scaler: only created when mixed precision is requested AND we run on CUDA.
scaler = torch.cuda.amp.GradScaler() if (USE_MIXED_PRECISION and device.type=="cuda") else None

best_val_acc = -1.0
epochs_no_improve = 0
ckpt_path = os.path.join(OUT_DIR, f"{BACKBONE}_best.pt")

history = {"train_loss":[], "train_acc":[], "val_loss":[], "val_acc":[]}

for epoch in range(EPOCHS):
    tr_loss, tr_acc = train_one_epoch(model, loader_tr, optimizer, scaler, epoch, EPOCHS, WARMUP_EPOCHS, GRAD_CLIP_NORM)
    # Step the cosine schedule once per epoch. During warmup the LR is
    # overwritten again at the start of the next epoch.
    scheduler.step()
    va_loss, va_acc, y_true, y_pred = evaluate(model, loader_va)

    history["train_loss"].append(tr_loss); history["train_acc"].append(tr_acc)
    history["val_loss"].append(va_loss);   history["val_acc"].append(va_acc)

    print(f"Epoch {epoch+1:02d}: train_loss={tr_loss:.4f} acc={tr_acc:.4f} | val_loss={va_loss:.4f} acc={va_acc:.4f}")
    
    # Early stopping + checkpoint
    if va_acc > best_val_acc:
        best_val_acc = va_acc
        epochs_no_improve = 0
        # Store class names as plain Python strings: entries taken from a
        # NumPy array are NumPy scalars, which the restricted unpickler behind
        # torch.load(weights_only=True) is not expected to accept.
        torch.save({"model": model.state_dict(), "classes": [str(c) for c in classes], "backbone": BACKBONE}, ckpt_path)
        print(f"  ✅ New best val acc {best_val_acc:.4f}. Saved: {ckpt_path}")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= EARLY_STOP:
            print(f"  ⏹️ Early stopping after {epoch+1} epochs (no improvement for {EARLY_STOP})")
            break

# Plot history: one figure per metric, each with a train and a validation curve.
for metric, axis_label, title in (("acc", "accuracy", "Accuracy"), ("loss", "loss", "Loss")):
    plt.figure(figsize=(6,4))
    for split in ("train", "val"):
        series = f"{split}_{metric}"
        plt.plot(history[series], label=series)
    plt.xlabel("epoch")
    plt.ylabel(axis_label)
    plt.legend()
    plt.title(title)
    plt.show()

# Load best model for final eval
state = torch.load(ckpt_path, map_location=device)
model = build_model(state["backbone"], len(state["classes"])).to(device)
model.load_state_dict(state["model"])

# Pass the full label set explicitly to the reports. Otherwise
# classification_report derives the labels from the data and raises when a
# class is absent from a split while target_names still lists every class.
label_ids = list(range(num_classes))

# Validation report
_, va_acc, y_true, y_pred = evaluate(model, loader_va)
print("\nValidation accuracy (best):", va_acc)
print("\nClassification Report (Val):")
print(classification_report(y_true, y_pred, labels=label_ids, target_names=classes, digits=4))

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred, labels=label_ids)
fig = plt.figure(figsize=(6,6))
plt.imshow(cm, interpolation='nearest')
plt.title("Confusion Matrix (Val)")
plt.colorbar()
tick_marks = np.arange(num_classes)
plt.xticks(tick_marks, classes, rotation=45, ha="right")
plt.yticks(tick_marks, classes)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()

# Optional Test set
if loader_te is not None:
    _, te_acc, y_true_te, y_pred_te = evaluate(model, loader_te)
    print("\nTest accuracy:", te_acc)
    print("\nClassification Report (Test):")
    print(classification_report(y_true_te, y_pred_te, labels=label_ids, target_names=classes, digits=4))

## 6) Inference on a single image

In [None]:
from PIL import Image

def load_model(ckpt_path, device=device):
    """Rebuild the network stored in a checkpoint and return it in eval mode.

    Args:
        ckpt_path: File written by the training loop; it holds the keys
            ``"model"``, ``"classes"`` and ``"backbone"``.
        device: Device the tensors are mapped to and the network is moved to.

    Returns:
        ``(net, classes)``: the loaded network and the class names stored in
        the checkpoint.
    """
    checkpoint = torch.load(ckpt_path, map_location=device)
    class_names = checkpoint["classes"]
    net = build_model(checkpoint["backbone"], len(class_names))
    net = net.to(device)
    net.load_state_dict(checkpoint["model"])
    net.eval()
    return net, class_names

# Inference must use exactly the evaluation preprocessing. Reuse the eval
# pipeline from build_transforms instead of keeping a second hand-written copy
# that could drift out of sync.
infer_tf = build_transforms(IMG_SIZE)[1]

def predict_image(path, model, classes):
    """Classify one image file and return up to five ``(class_name, probability)`` pairs.

    The image is loaded as RGB, preprocessed with the module-level
    ``infer_tf`` and moved to the module-level ``device``. Pairs are ordered
    from most to least probable. The model's train/eval mode is not changed
    here.
    """
    rgb = Image.open(path).convert("RGB")
    batch = infer_tf(rgb).unsqueeze(0).to(device)
    with torch.no_grad():
        probs = torch.softmax(model(batch), dim=1).cpu().numpy().squeeze()
    # argsort is ascending, so reverse it and keep the first five indices.
    ranked = probs.argsort()[::-1][:5]
    return [(classes[i], float(probs[i])) for i in ranked]

# Example:
# model_loaded, classes_loaded = load_model(ckpt_path)
# predict_image("/path/to/image.jpg", model_loaded, classes_loaded)