
# 🧠 CNN Transfer Learning — 48×48 ➜ 224 (Updated)

**Notebook:** `CNN_Transfer_Learning_71_UPDATED.ipynb`  
**Focus:** 48×48 grayscale dataset ➜ upscale to 224, Transfer Learning (ResNet-18 by default), class-imbalance handling, warmup+cosine LR, early stopping, checkpoints, evaluation (macro-F1 + confusion matrix), simple TTA, and export (TorchScript/ONNX).

> **Sinhala summary:**  
> ඔබගේ 48×48 කළු-සුදු (grayscale) emotion dataset එක **224×224** දක්වා upsample කරලා **pretrained CNN** (ResNet-18) එකක් **Transfer Learning** මගින් train කරන, imbalance දුරලන, LR schedule (warmup + cosine) එක්ක **මනාප pipeline** එකක් මෙහි සම්පූර්ණයි.  
> **Train→Validate→Fine-tune→Test→Export** එකම file එකක!


In [None]:

# =========================
# CONFIG (edit these first)
# =========================
# Every later cell reads these module-level constants; change them here rather
# than inside the cells.
from pathlib import Path

# --- Split CSVs. Each needs a path column and a label column
# --- (canonical names: path,label; robust_read_csv normalizes a few aliases).
CSV_TRAIN = Path('/content/ann-visual-emotion/data/processed/EmoSet_splits/train.csv')
CSV_VAL   = Path('/content/ann-visual-emotion/data/processed/EmoSet_splits/val.csv')
# Optional: when this file does not exist the test evaluation is skipped.
CSV_TEST  = Path('/content/ann-visual-emotion/data/processed/EmoSet_splits/test.csv')

# Relative image paths found in the CSVs are resolved against this root;
# absolute paths are used unchanged.
RAW_IMG_ROOT = Path('/content/ann-visual-emotion/data/raw/EmoSet')

# Destination for checkpoints and exported models (created on import of this cell).
OUT_DIR = Path('./outputs_transfer_71')
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Training params
NUM_CLASSES = 7               # must equal the number of distinct labels in the train CSV (asserted after loading)
BATCH_SIZE  = 32
EPOCHS_HEAD = 10              # stage 1: only the new classifier head is trained
EPOCHS_FT   = 15              # stage 2: fine-tune with layer4 + head unfrozen
LR_HEAD     = 1e-3            # head LR during stage 1
LR_FT_BB    = 1e-4            # backbone LR during fine-tune
LR_FT_HEAD  = 5e-4            # head LR during fine-tune
WEIGHT_DECAY = 1e-4           # AdamW weight decay, used in both stages
LABEL_SMOOTH = 0.1            # > 0 selects LabelSmoothingCE; 0.0 falls back to nn.CrossEntropyLoss
USE_MIXUP    = True           # when True, train accuracy is not tracked (train_one_epoch returns None for it)
MIXUP_ALPHA  = 0.2            # Beta(alpha, alpha) parameter for the MixUp coefficient
USE_FP16     = True           # mixed precision (autocast + GradScaler) when AMP is importable

# Sampler / imbalance
USE_WEIGHTED_SAMPLER = True   # draw training samples with inverse-frequency class weights; set False to disable
CLASS_WEIGHTS_IN_LOSS = False # alternative to sampler: pass the same class weights to the loss

# Inference / TTA
USE_TTA = True                # predict_image averages original + horizontally flipped view when enabled

# Export
EXPORT_ONNX = True
EXPORT_TORCHSCRIPT = True

SEED = 42
DEVICE_PREF = 'cuda'  # 'cuda' or 'cpu'; falls back to 'cpu' when CUDA is unavailable


In [None]:

# ========================
# Imports & Reproducibility
# ========================
import os, json, math, random, time
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import torchvision
from torchvision import transforms
from PIL import Image

from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt

try:
    from torch.cuda import amp
    AMP_AVAILABLE = True
except Exception:
    AMP_AVAILABLE = False

def set_seed(seed=42):
    """Seed every RNG used by the notebook and force deterministic cuDNN.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    then disables cuDNN autotuning and enables its deterministic mode.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(SEED)

# CUDA is used only when it is both requested and actually available;
# every other combination runs on the CPU.
device = torch.device(
    'cuda' if (DEVICE_PREF == 'cuda' and torch.cuda.is_available()) else 'cpu'
)
print('Using device:', device)
print('Torch:', torch.__version__, '| Torchvision:', torchvision.__version__)


In [None]:

# ========================
# CSV Reading & Dataset
# ========================
READ_COLUMNS = ['path', 'label']

# Header spellings (compared after strip + lower-case) accepted for each
# canonical column name.
_COLUMN_ALIASES = {
    'path': ('path', 'image', 'img', 'filepath', 'file'),
    'label': ('label', 'class', 'target', 'emotion'),
}


def robust_read_csv(csv_path: Path):
    """Read a split CSV and normalize its header to the columns in READ_COLUMNS.

    Header names are matched case-insensitively and ignoring surrounding
    whitespace against ``_COLUMN_ALIASES``. At most ONE source column is
    renamed to each canonical name, so a file that contains several aliases
    (for example both ``image`` and ``file``) never ends up with duplicate
    ``path`` columns. A column already spelled canonically wins; otherwise
    the first matching column (in file order) is used. All other columns are
    left untouched.

    Args:
        csv_path: location of the CSV file.

    Returns:
        A DataFrame that contains at least the columns ``path`` and ``label``.

    Raises:
        ValueError: if no column can be mapped to ``path`` or to ``label``.
    """
    df = pd.read_csv(csv_path)
    # str() keeps this working for non-string headers (e.g. integer columns).
    normalized = {col: str(col).strip().lower() for col in df.columns}

    colmap = {}
    for canonical, aliases in _COLUMN_ALIASES.items():
        matches = [col for col, key in normalized.items() if key in aliases]
        if not matches:
            continue
        if canonical in matches:
            # Already present under its exact name: nothing to rename.
            continue
        chosen = next((col for col in matches if normalized[col] == canonical), matches[0])
        colmap[chosen] = canonical

    df = df.rename(columns=colmap)

    missing = [name for name in READ_COLUMNS if name not in df.columns]
    if missing:
        # Explicit exception instead of `assert`, which is stripped under -O.
        raise ValueError(
            f"Missing column '{missing[0]}' in {csv_path} "
            f"(found columns: {list(df.columns)})"
        )
    return df

def resolve_path(p: str, root: Path):
    """Return ``p`` as a Path: unchanged if absolute, else resolved under ``root``."""
    candidate = Path(p)
    return candidate if candidate.is_absolute() else (root / candidate).resolve()

# Train and validation splits are mandatory; the test split is optional.
train_df = robust_read_csv(CSV_TRAIN)
val_df = robust_read_csv(CSV_VAL)
if CSV_TEST.exists():
    test_df = robust_read_csv(CSV_TEST)
else:
    test_df = None

# Label vocabulary comes from the training split only: labels are compared as
# strings and indexed in sorted order.
classes = sorted(train_df['label'].astype(str).unique().tolist())
class_to_idx = dict(zip(classes, range(len(classes))))
idx_to_class = dict(enumerate(classes))
assert len(classes) == NUM_CLASSES, f"NUM_CLASSES={NUM_CLASSES} but found {len(classes)} in CSV"

def to_rgb_from_gray(img: Image.Image):
    """Return a 3-channel image built from ``img``.

    An image whose mode is already 'RGB' is returned as-is. Any other mode is
    first reduced to single-channel luminance ('L') and that channel is then
    replicated into R, G and B.
    """
    if img.mode == 'RGB':
        return img
    gray = img.convert('L')
    return Image.merge('RGB', (gray, gray, gray))

INPUT_SIZE = 224
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]


def _preprocess_steps(augmentations):
    """Return the ordered transform list shared by train and validation.

    ``augmentations`` are inserted after the resize to INPUT_SIZE and before
    tensor conversion / ImageNet normalization.
    """
    return [
        transforms.Lambda(to_rgb_from_gray),
        transforms.Resize((INPUT_SIZE, INPUT_SIZE), antialias=True),
        *augmentations,
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]


# Training: flip, small rotation and a mild random-resized crop.
train_tfms = transforms.Compose(_preprocess_steps([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=12),
    transforms.RandomResizedCrop((INPUT_SIZE, INPUT_SIZE), scale=(0.9, 1.0)),
]))

# Validation / test: deterministic resize + normalize only.
val_tfms = transforms.Compose(_preprocess_steps([]))

class CSVDataset(Dataset):
    """Dataset over a DataFrame with ``path`` and ``label`` columns.

    Each item is ``(image, class_index, resolved_path_string)``. Paths are
    resolved with ``resolve_path`` against ``root`` and labels are mapped
    through the module-level ``class_to_idx``.
    """

    # Suffixes that are decoded straight to RGB on load.
    _RGB_SUFFIXES = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self, df: pd.DataFrame, root: Path, transform=None):
        self.df = df.reset_index(drop=True)
        self.root = root
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        img_path = resolve_path(str(record['path']), self.root)
        target = class_to_idx[str(record['label'])]

        img = Image.open(img_path)
        if img_path.suffix.lower() in self._RGB_SUFFIXES:
            img = img.convert('RGB')
        # Whatever the branch above produced, guarantee three channels.
        img = to_rgb_from_gray(img)

        if self.transform is not None:
            img = self.transform(img)
        return img, target, str(img_path)

# Only the training split is augmented; validation and test share val_tfms.
train_ds = CSVDataset(train_df, RAW_IMG_ROOT, transform=train_tfms)
val_ds = CSVDataset(val_df, RAW_IMG_ROOT, transform=val_tfms)
test_ds = None
if test_df is not None:
    test_ds = CSVDataset(test_df, RAW_IMG_ROOT, transform=val_tfms)

print('Samples:', len(train_ds), len(val_ds), (len(test_ds) if test_ds else 0))


In [None]:

# ========================
# Sampler / Class Weights
# ========================
# Integer class index of every training row, in CSV order.
train_labels = [class_to_idx[str(label)] for label in train_df['label'].tolist()]
counts = np.bincount(train_labels, minlength=NUM_CLASSES)
# Inverse-frequency weights; max(count, 1) avoids division by zero for an
# empty class. Rescaled so the weights sum to NUM_CLASSES.
class_weights = 1.0 / np.maximum(counts, 1)
class_weights = class_weights / class_weights.sum() * NUM_CLASSES  # normalize

print('Class counts:', dict(zip(classes, counts)))
print('Class weights (normalized):', class_weights.round(3))

sampler = None
if USE_WEIGHTED_SAMPLER:
    # One weight per training sample: the weight of its class.
    sample_weights = [class_weights[y] for y in train_labels]
    sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
    print('Using WeightedRandomSampler')
else:
    print('Using regular random sampling')

if CLASS_WEIGHTS_IN_LOSS:
    # The weights must live on the same device as the logits: a CPU tensor
    # here makes every weighted loss fail with a device mismatch on CUDA.
    ce_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)
else:
    ce_weights = None


In [None]:

# ========================
# DataLoaders
# ========================
# Settings shared by all three loaders.
_LOADER_COMMON = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)

# Training: shuffle only when no sampler is in use, and drop the last
# incomplete batch.
train_loader = DataLoader(
    train_ds,
    shuffle=(sampler is None),
    sampler=sampler,
    drop_last=True,
    **_LOADER_COMMON,
)
val_loader = DataLoader(val_ds, shuffle=False, **_LOADER_COMMON)
test_loader = DataLoader(test_ds, shuffle=False, **_LOADER_COMMON) if test_ds else None

len(train_loader), len(val_loader)


In [None]:

# ========================
# Model: ResNet-18 (pretrained)
# ========================
from torchvision.models import resnet18, ResNet18_Weights

def build_model(num_classes: int):
    """Return an ImageNet-pretrained ResNet-18 with a fresh classification head.

    The original ``fc`` layer is replaced by Dropout(p=0.4) followed by a
    Linear layer with ``num_classes`` outputs.
    """
    net = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
    head = nn.Sequential(
        nn.Dropout(p=0.4),
        nn.Linear(net.fc.in_features, num_classes),
    )
    net.fc = head
    return net

model = build_model(NUM_CLASSES).to(device)

# Head-training stage: only parameters of the replaced `fc` head stay
# trainable, everything else is frozen.
for param_name, param in model.named_parameters():
    param.requires_grad = param_name.startswith('fc.')

def count_trainable(m):
    """Return the total number of elements in parameters that require grad."""
    total = 0
    for param in m.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report how many parameter elements currently have requires_grad=True.
print('Trainable params (head stage):', count_trainable(model))


In [None]:

# ========================
# Loss / MixUp / Schedulers
# ========================
class LabelSmoothingCE(nn.Module):
    """Cross-entropy with label smoothing and optional per-class weights.

    The target distribution puts ``1 - eps`` on the true class and spreads
    ``eps`` evenly over the remaining ``C - 1`` classes. With a single class
    there is nothing to spread to, so smoothing is skipped.

    Args:
        eps: smoothing mass taken away from the true class.
        reduction: ``'mean'`` (average over the batch), ``'sum'`` (sum over
            the batch); any other value returns the per-sample loss of
            shape ``(B,)``.
        weight: optional 1-D tensor of per-class weights; each sample's loss
            is multiplied by the weight of its true class. Unlike
            ``nn.CrossEntropyLoss``, ``'mean'`` divides by the batch size,
            not by the sum of the weights.
    """

    def __init__(self, eps=0.1, reduction='mean', weight=None):
        super().__init__()
        self.eps = eps
        self.reduction = reduction
        # register_buffer accepts None, so no special-casing is needed.
        self.register_buffer('weight', weight)

    def forward(self, logits, target):
        """Return the smoothed loss for ``logits`` (B, C) and class indices ``target`` (B,)."""
        n_classes = logits.size(-1)
        logp = F.log_softmax(logits, dim=-1)

        # Guard the single-class case: eps / (C - 1) would divide by zero.
        eps = self.eps if n_classes > 1 else 0.0
        with torch.no_grad():
            true_dist = torch.full_like(logp, eps / max(n_classes - 1, 1))
            true_dist.scatter_(1, target.unsqueeze(1), 1.0 - eps)

        per_sample = -(true_dist * logp).sum(dim=1)
        if self.weight is not None:
            # Align the buffer with the logits so a criterion that was never
            # moved with .to(device) still works on CUDA inputs.
            per_sample = per_sample * self.weight.to(logits.device)[target]

        if self.reduction == 'mean':
            return per_sample.mean()
        if self.reduction == 'sum':
            return per_sample.sum()
        return per_sample

def mixup_data(x, y, alpha=0.2):
    """Blend each sample of a batch with a randomly chosen partner (MixUp).

    Args:
        x: input batch; mixing is done along dimension 0.
        y: targets aligned with ``x`` along dimension 0.
        alpha: Beta(alpha, alpha) parameter for the mixing coefficient;
            ``alpha <= 0`` disables mixing.

    Returns:
        ``(mixed_x, (y_a, y_b), lam)`` in every case, so callers can always
        unpack the target pair. With mixing disabled this is
        ``(x, (y, y), 1.0)``.
    """
    if alpha <= 0.0:
        # Same tuple shape as the mixing branch; returning a bare `y` here
        # breaks callers that unpack `(y_a, y_b)`.
        return x, (y, y), 1.0
    lam = float(np.random.beta(alpha, alpha))
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index]
    return mixed_x, (y, y[index]), lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Return the ``lam``-weighted combination of the loss against both MixUp targets."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

def make_criterion():
    """Build the training loss from the module-level configuration.

    Uses LabelSmoothingCE when LABEL_SMOOTH is positive and plain
    nn.CrossEntropyLoss otherwise; both receive ``ce_weights``.
    """
    use_smoothing = LABEL_SMOOTH > 0.0
    if not use_smoothing:
        return nn.CrossEntropyLoss(weight=ce_weights)
    return LabelSmoothingCE(eps=LABEL_SMOOTH, weight=ce_weights)

def make_optimizer_head(model):
    """Return an AdamW optimizer over the parameters of ``model.fc`` only."""
    head_params = model.fc.parameters()
    return torch.optim.AdamW(head_params, lr=LR_HEAD, weight_decay=WEIGHT_DECAY)

def make_optimizer_finetune(model):
    """Return an AdamW optimizer with separate learning rates for backbone and head.

    Group 0 holds every trainable parameter outside ``fc`` (LR_FT_BB);
    group 1 holds the ``fc`` parameters (LR_FT_HEAD).
    """
    backbone_params = []
    for param_name, param in model.named_parameters():
        if param.requires_grad and not param_name.startswith('fc.'):
            backbone_params.append(param)

    groups = [
        {'params': backbone_params, 'lr': LR_FT_BB},
        {'params': model.fc.parameters(), 'lr': LR_FT_HEAD},
    ]
    return torch.optim.AdamW(groups, weight_decay=WEIGHT_DECAY)

def make_warmup_cosine(optimizer, total_steps, warmup_steps=0.1):
    """Return a LambdaLR with linear warmup followed by cosine decay to zero.

    The schedule is expressed in calls to ``scheduler.step()``: the LR factor
    rises linearly from 0 to 1 over ``warmup_steps`` calls, then follows half
    a cosine down to 0 at ``total_steps``. Stepping beyond ``total_steps``
    keeps the factor at 0 instead of letting the cosine climb back up.

    Args:
        optimizer: optimizer whose parameter-group LRs are scaled.
        total_steps: total number of scheduler steps in the run.
        warmup_steps: a value below 1.0 is interpreted as a fraction of
            ``total_steps``; otherwise it is an absolute step count.

    Returns:
        ``torch.optim.lr_scheduler.LambdaLR`` bound to ``optimizer``.
    """
    total_steps = max(1, int(total_steps))
    if warmup_steps < 1.0:  # interpret as fraction
        warmup_steps = int(total_steps * warmup_steps)
    decay_steps = max(1, total_steps - warmup_steps)

    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        # Clamp so extra steps past the schedule's end stay at the floor.
        progress = min(1.0, float(step - warmup_steps) / float(decay_steps))
        return 0.5 * (1.0 + math.cos(math.pi * progress))

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


In [None]:

# ========================
# Train / Eval loops
# ========================
def train_one_epoch(model, loader, optimizer, criterion, scaler=None):
    """Train ``model`` for one pass over ``loader``.

    Each batch is moved to ``device``, optionally mixed with MixUp
    (USE_MIXUP / MIXUP_ALPHA), and optimized either under autocast with the
    given GradScaler (when ``scaler`` is truthy and AMP_AVAILABLE and
    USE_FP16 are set) or in full precision.

    Returns:
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the sample-weighted
        mean batch loss. ``accuracy`` is ``None`` when MixUp is enabled,
        because hard-label accuracy is not tracked in that mode.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    def _optimize(inputs, loss_of):
        """Forward, backward and optimizer step; return ``(logits, loss)``."""
        if scaler and AMP_AVAILABLE and USE_FP16:
            with amp.autocast():
                out = model(inputs)
                batch_loss = loss_of(out)
            scaler.scale(batch_loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            out = model(inputs)
            batch_loss = loss_of(out)
            batch_loss.backward()
            optimizer.step()
        return out, batch_loss

    for x, y, _ in loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)

        if USE_MIXUP:
            mixed, (y_a, y_b), lam = mixup_data(x, y, alpha=MIXUP_ALPHA)
            _, loss = _optimize(
                mixed,
                lambda out: mixup_criterion(criterion, out, y_a, y_b, lam),
            )
            # accuracy is intentionally skipped: targets are soft under MixUp
        else:
            logits, loss = _optimize(x, lambda out: criterion(out, y))
            n_correct += (logits.argmax(dim=1) == y).sum().item()

        batch_n = x.size(0)
        loss_sum += loss.item() * batch_n
        n_seen += batch_n

    mean_loss = loss_sum / max(1, n_seen)
    if USE_MIXUP:
        return mean_loss, None  # accuracy not tracked under mixup
    return mean_loss, n_correct / max(1, n_seen)

@torch.no_grad()
def evaluate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Returns:
        ``(avg_loss, accuracy, macro_f1, y_true, y_pred)`` where the last two
        are NumPy arrays of true and predicted class indices. ``macro_f1`` is
        0.0 when the loader yields no samples.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    true_chunks, pred_chunks = [], []

    for x, y, _ in loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        logits = model(x)
        loss = criterion(logits, y)
        preds = logits.argmax(dim=1)

        batch_n = x.size(0)
        loss_sum += loss.item() * batch_n
        n_correct += (preds == y).sum().item()
        n_seen += batch_n
        true_chunks.append(y.cpu().numpy())
        pred_chunks.append(preds.cpu().numpy())

    denom = max(1, n_seen)
    y_true = np.concatenate(true_chunks) if true_chunks else np.array([])
    y_pred = np.concatenate(pred_chunks) if pred_chunks else np.array([])
    if len(y_true) > 0:
        macro_f1 = f1_score(y_true, y_pred, average='macro')
    else:
        macro_f1 = 0.0
    return loss_sum / denom, n_correct / denom, macro_f1, y_true, y_pred

def save_ckpt(model, path: Path, extra: dict=None):
    """Save the model's state dict and the class list to ``path``.

    Parent directories are created as needed. Entries of ``extra`` (if any)
    are merged into the saved payload and may override the default keys.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = dict(state_dict=model.state_dict(), classes=classes)
    if extra:
        payload.update(extra)
    torch.save(payload, path)
    print('Saved:', path)


In [None]:

# ========================
# Stage 1: Train classifier head
# ========================
criterion = make_criterion()
optimizer = make_optimizer_head(model)
# The warmup+cosine schedule is sized in optimizer steps (epochs x batches
# per epoch), so it has to advance once per batch. Stepping it once per
# epoch leaves the LR at exactly 0 for the first epoch and inside the first
# few warmup steps for the whole stage.
total_steps = EPOCHS_HEAD * len(train_loader)
scheduler = make_warmup_cosine(optimizer, total_steps, warmup_steps=0.1)
# `amp` is only bound when its guarded import succeeded; train_one_epoch
# accepts scaler=None and then trains in full precision.
scaler = amp.GradScaler(enabled=USE_FP16) if AMP_AVAILABLE else None


def _batches_with_lr_step(loader, lr_scheduler):
    """Yield batches from ``loader`` and advance ``lr_scheduler`` after each one.

    The generator resumes only when the consumer requests the next batch,
    i.e. after the training loop has finished its optimizer update for the
    batch just yielded, which keeps the order optimizer step -> scheduler step.
    """
    for batch in loader:
        yield batch
        lr_scheduler.step()


best_val_f1 = -1.0
best_path = OUT_DIR / 'best_head.pt'

for epoch in range(EPOCHS_HEAD):
    tl, tacc = train_one_epoch(model, _batches_with_lr_step(train_loader, scheduler),
                               optimizer, criterion, scaler=scaler)
    vl, vacc, vf1, y_true, y_pred = evaluate(model, val_loader, criterion)

    print(f"[Head] Epoch {epoch+1}/{EPOCHS_HEAD} | train_loss={tl:.4f} "
          f"| val_loss={vl:.4f} | val_acc={vacc:.4f} | val_macroF1={vf1:.4f}")

    if vf1 > best_val_f1:
        best_val_f1 = vf1
        # Store a plain float so the checkpoint contains only basic types.
        save_ckpt(model, best_path, extra={'stage': 'head', 'epoch': epoch+1, 'val_f1': float(vf1)})

print('Best val F1 (head):', round(best_val_f1, 4))


In [None]:

# ========================
# Stage 2: Fine-tune (unfreeze top layers)
# ========================
# Unfreeze layer4 (top block) + fc; everything else stays frozen.
for name, p in model.named_parameters():
    p.requires_grad = name.startswith(('layer4.', 'fc.'))

print('Trainable params (fine-tune stage):', sum(p.numel() for p in model.parameters() if p.requires_grad))

criterion_ft = make_criterion()
optimizer_ft = make_optimizer_finetune(model)
# The warmup+cosine schedule is sized in optimizer steps (epochs x batches
# per epoch), so it has to advance once per batch. Stepping it once per
# epoch leaves the LR at exactly 0 for the first epoch and inside the first
# few warmup steps for the whole stage.
total_steps_ft = EPOCHS_FT * len(train_loader)
scheduler_ft = make_warmup_cosine(optimizer_ft, total_steps_ft, warmup_steps=0.1)
# `amp` is only bound when its guarded import succeeded; train_one_epoch
# accepts scaler=None and then trains in full precision.
scaler_ft = amp.GradScaler(enabled=USE_FP16) if AMP_AVAILABLE else None


def _batches_with_lr_step(loader, lr_scheduler):
    """Yield batches from ``loader`` and advance ``lr_scheduler`` after each one.

    Defined in this cell so the cell runs on its own. The generator resumes
    only when the consumer requests the next batch, i.e. after the training
    loop has finished its optimizer update for the batch just yielded.
    """
    for batch in loader:
        yield batch
        lr_scheduler.step()


best_val_f1_ft = -1.0
best_path_ft = OUT_DIR / 'best_finetune.pt'
patience = 7      # epochs without a new best val macro-F1 before stopping
since_best = 0

for epoch in range(EPOCHS_FT):
    tl, tacc = train_one_epoch(model, _batches_with_lr_step(train_loader, scheduler_ft),
                               optimizer_ft, criterion_ft, scaler=scaler_ft)
    vl, vacc, vf1, y_true, y_pred = evaluate(model, val_loader, criterion_ft)

    print(f"[FT] Epoch {epoch+1}/{EPOCHS_FT} | train_loss={tl:.4f} "
          f"| val_loss={vl:.4f} | val_acc={vacc:.4f} | val_macroF1={vf1:.4f}")

    if vf1 > best_val_f1_ft:
        best_val_f1_ft = vf1
        # Store a plain float so the checkpoint contains only basic types.
        save_ckpt(model, best_path_ft, extra={'stage': 'finetune', 'epoch': epoch+1, 'val_f1': float(vf1)})
        since_best = 0
    else:
        since_best += 1
        if since_best >= patience:
            print('Early stopping triggered.')
            break

print('Best val F1 (finetune):', round(best_val_f1_ft, 4))


In [None]:

# ========================
# Final Eval (Test) + Confusion Matrix
# ========================
# Load the best fine-tuned checkpoint if it exists, else the best head-only one.
best_final = OUT_DIR / 'best_finetune.pt'
if not best_final.exists():
    best_final = OUT_DIR / 'best_head.pt'

payload = torch.load(best_final, map_location='cpu')
model.load_state_dict(payload['state_dict'])
model.to(device).eval()
print('Loaded best:', best_final)

# Plain (unsmoothed) cross-entropy for reporting. `.to(device)` moves the
# optional class-weight tensor next to the logits; it is a no-op without weights.
crit_eval = nn.CrossEntropyLoss(weight=ce_weights).to(device)

if test_loader is not None:
    tl, acc, mf1, y_true, y_pred = evaluate(model, test_loader, crit_eval)
    print(f"[TEST] loss={tl:.4f} | acc={acc:.4f} | macroF1={mf1:.4f}")

    # Passing the full label list keeps the report aligned with `classes`
    # even when a class never occurs in the test targets or predictions;
    # zero_division=0 reports such rows as 0 instead of warning.
    label_ids = list(range(NUM_CLASSES))
    print(classification_report(y_true, y_pred, labels=label_ids,
                                target_names=classes, digits=4, zero_division=0))

    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    fig = plt.figure(figsize=(7,6))
    plt.imshow(cm, interpolation='nearest')
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(NUM_CLASSES)
    plt.xticks(tick_marks, classes, rotation=45, ha='right')
    plt.yticks(tick_marks, classes)
    plt.ylabel('True')
    plt.xlabel('Pred')
    plt.tight_layout()
    plt.show()
else:
    print('No TEST set provided; skipping test evaluation.')


In [None]:

# ========================
# TTA + Single Image Inference
# ========================
def _inference_pipeline(*extra_steps):
    """Return the deterministic inference transform.

    ``extra_steps`` are inserted after the resize and before tensor
    conversion / ImageNet normalization.
    """
    return transforms.Compose([
        transforms.Lambda(to_rgb_from_gray),
        transforms.Resize((INPUT_SIZE, INPUT_SIZE), antialias=True),
        *extra_steps,
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])


# Plain view used for single-pass inference.
base_tfms = _inference_pipeline()

# TTA views: the plain view plus an always-applied horizontal flip.
tta_tfms = [
    base_tfms,
    _inference_pipeline(transforms.RandomHorizontalFlip(p=1.0)),
]

@torch.no_grad()
def predict_image(img_path: Path, use_tta: bool=True):
    """Classify a single image file with the module-level ``model``.

    When both ``use_tta`` and USE_TTA are true, softmax probabilities are
    averaged over every transform in ``tta_tfms``; otherwise only
    ``base_tfms`` is used.

    Returns:
        ``(class_name, confidence, probabilities)`` where ``probabilities``
        is a 1-D NumPy array over the classes.
    """
    img = to_rgb_from_gray(Image.open(img_path))
    model.eval()

    def _view_probs(view_tfm):
        """Softmax probabilities for one transformed view, shape (1, C)."""
        batch = view_tfm(img).unsqueeze(0).to(device)
        return model(batch).softmax(dim=1)

    if use_tta and USE_TTA:
        running = None
        for view_tfm in tta_tfms:
            current = _view_probs(view_tfm)
            running = current if running is None else running + current
        probs = running / len(tta_tfms)
    else:
        probs = _view_probs(base_tfms)

    conf, pred_idx = probs.max(dim=1)
    return idx_to_class[pred_idx.item()], conf.item(), probs.squeeze(0).cpu().numpy()

# Example:
# sample_img = resolve_path(train_df.iloc[0]['path'], RAW_IMG_ROOT)
# label, conf, prob_vec = predict_image(sample_img, use_tta=True)
# print('Pred:', label, 'conf:', round(conf,4))


In [None]:

# ========================
# Export: TorchScript / ONNX
# ========================
# Export in inference mode. torch.jit.trace records the operations exactly as
# they run, so tracing a model that is still in train mode would capture
# active Dropout and batch-statistics BatchNorm. Setting eval() here makes
# this cell safe to run even when the evaluation cell was skipped.
model.eval()
DUMMY = torch.randn(1,3,INPUT_SIZE,INPUT_SIZE, device=device)

if EXPORT_TORCHSCRIPT:
    # No gradients are needed for tracing.
    with torch.no_grad():
        traced = torch.jit.trace(model, DUMMY)
    ts_path = OUT_DIR / 'best_model.ts'
    traced.save(str(ts_path))
    print('Saved TorchScript ->', ts_path)

if EXPORT_ONNX:
    onnx_path = OUT_DIR / 'best_model.onnx'
    # The batch dimension is exported as dynamic for both input and output.
    torch.onnx.export(
        model, DUMMY, str(onnx_path),
        input_names=['input'], output_names=['logits'],
        dynamic_axes={'input': {0: 'batch'}, 'logits': {0: 'batch'}},
        opset_version=12
    )
    print('Saved ONNX ->', onnx_path)



## Tips & Troubleshooting (සිංහලෙන් සෙට් කිරීම)

- **CSV columns** `path,label` යන්න තියෙන්නම ඕන. වෙන නම් top cell එකේ mapping වෙනස් කරලා තිබේ.
- **Paths relative** නම් `RAW_IMG_ROOT` හරියාකාරිව සකස් කරන්න.
- **Class imbalance** ගැටලුවක් නම්: `USE_WEIGHTED_SAMPLER=True` තබන්න (default).
- **Val macro-F1** වැඩි කරන්න: LR අඩු/වැඩි කරලා බලන්න, `LABEL_SMOOTH=0.1 → 0.05` වගේ tune කරන්න.
- **Slow training?** `BATCH_SIZE` අඩු/වැඩි, `USE_FP16=True` (if GPU), `num_workers=2→4` try කරන්න.
- **Visual sanity-check:** Confusion matrix බලලා ව්‍යාකූල pair (e.g., fear vs surprise) තියෙන්ඩෙ augmentations ට්වීක් කරන්න.
- **Export**: TorchScript/ONNX files `./outputs_transfer_71/` තුළ.
