# Underwater Hand Gesture Classification with EfficientNet

This Colab notebook trains an **EfficientNet-B0 CNN** on underwater hand gesture images.

Steps:
1. Download the dataset with kagglehub & create stratified train/val/test splits
2. Build PyTorch datasets & dataloaders
3. Apply image augmentations
4. Train EfficientNet-B0
5. Validate with confusion matrix & per-class accuracy
6. Test on held-out set
7. Save the class mapping

## Step 1 - Download Dataset and Create Stratified Splits

In [None]:
import kagglehub

# Download the latest version of the dataset; `path` is the location that
# kagglehub reports for the downloaded files.
dataset_handle = "aniket2125/underwater-hand-gesture-images"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
import os
import numpy as np
import random
import matplotlib.pyplot as plt
from torchvision.datasets import ImageFolder
from sklearn.model_selection import train_test_split
from collections import Counter

# Dataset path: prefer the directory returned by kagglehub in the download cell
# (`path`) so the notebook also works outside Kaggle; fall back to the Kaggle
# input mount when that cell was not run.
# NOTE(review): assumes the class folders sit directly under this directory,
# as ImageFolder requires - confirm against the downloaded layout.
_kaggle_default_dir = os.path.join('/kaggle/input', 'underwater-hand-gesture-images')
_downloaded_dir = globals().get('path')
if isinstance(_downloaded_dir, str) and os.path.isdir(_downloaded_dir):
    dataset_dir = _downloaded_dir
else:
    dataset_dir = _kaggle_default_dir
# Explicit exception instead of `assert`, which is stripped under `python -O`.
if not os.path.isdir(dataset_dir):
    raise FileNotFoundError(f"dataset_dir not found: {dataset_dir}")

# Load the dataset once (no transform) just to inspect classes and labels.
full = ImageFolder(root=dataset_dir)
classes = full.classes
print("Detected classes:", classes)
print("Num images:", len(full))
print("Per-class counts:", Counter(full.targets))

# Stratified split (70/15/15): first carve off 30% of the samples, then halve
# that remainder into validation and test. Both splits stratify on the label
# and use a fixed random_state so the partition is reproducible.
indices = np.arange(len(full))
targets = np.array(full.targets)

train_idx, temp_idx, y_train, y_temp = train_test_split(
    indices, targets, test_size=0.30, stratify=targets, random_state=42
)
val_idx, test_idx, y_val, y_test = train_test_split(
    temp_idx, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Train: {len(train_idx)} | Val: {len(val_idx)} | Test: {len(test_idx)}")


## Step 2 - Create PyTorch Datasets and Dataloaders

In [None]:
import torch
from torchvision import transforms
from torch.utils.data import Subset, DataLoader

# Placeholder preprocessing (replaced in Step 3): resize to 224x224, convert to
# a tensor, then normalize with the per-channel mean/std below.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
basic_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize,
])

# One ImageFolder per split (all with the same transform for now); each split
# is then narrowed to its own indices through Subset.
train_dataset_full, val_dataset_full, test_dataset_full = (
    ImageFolder(root=dataset_dir, transform=basic_transform) for _ in range(3)
)

train_dataset = Subset(train_dataset_full, train_idx)
val_dataset = Subset(val_dataset_full, val_idx)
test_dataset = Subset(test_dataset_full, test_idx)

# Dataloader settings shared by all three splits; only training is shuffled.
batch_size = 32
num_workers = 2
pin_memory = torch.cuda.is_available()

loader_options = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# Pull a single batch to confirm the pipeline yields tensors of the expected shape.
print("Example batch:")
first_batch = next(iter(train_loader))
imgs, lbls = first_batch
print(imgs.shape, lbls.shape)


## Step 3 - Define Data Augmentation

In [None]:
from torchvision import transforms

normalize = transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])

# Training pipeline: random crop/flip/rotation/colour jitter before
# tensor conversion and normalization.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8,1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.02),
    transforms.ToTensor(),
    normalize
])

# Evaluation pipeline: deterministic resize + centre crop, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize
])

# Re-create datasets with new transforms
train_ds_full = ImageFolder(root=dataset_dir, transform=train_transform)
val_ds_full   = ImageFolder(root=dataset_dir, transform=val_transform)
test_ds_full  = ImageFolder(root=dataset_dir, transform=val_transform)

train_dataset = Subset(train_ds_full, train_idx)
val_dataset   = Subset(val_ds_full, val_idx)
test_dataset  = Subset(test_ds_full, test_idx)

# Rebuild the dataloaders as well: the loaders created in Step 2 still hold the
# old Subset objects, so without this the augmentations above would never reach
# the training loop and evaluation would keep using the placeholder transform.
# batch_size / num_workers / pin_memory are the values defined in Step 2.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=num_workers, pin_memory=pin_memory)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        num_workers=num_workers, pin_memory=pin_memory)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                         num_workers=num_workers, pin_memory=pin_memory)

print("Transforms applied. Sample:")
x, y = train_dataset[0]
print(x.shape, y)


## Step 4 - Load EfficientNet-B0 and Training Loop

In [None]:
import torch.nn as nn
from torchvision import models
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.amp import GradScaler, autocast
import time

# The class list determines the size of the replacement classification head.
full = ImageFolder(dataset_dir)
class_names = full.classes
num_classes = len(class_names)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

# Start from pretrained EfficientNet-B0 weights. Use the local Kaggle copy when
# it is present; otherwise let torchvision fetch the same ImageNet weights.
weights_path = "/kaggle/input/efficientnet-b0-weights/efficientnet_b0_rwightman-7f5810bc.pth"
if os.path.isfile(weights_path):
    # The file is consumed as a plain state_dict, so the restricted unpickler
    # (weights_only=True) is sufficient and safer than full pickle loading.
    state_dict = torch.load(weights_path, map_location=device, weights_only=True)
    model = models.efficientnet_b0(weights=None)
    model.load_state_dict(state_dict)
else:
    model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)

# Swap the final linear layer so the output size matches our number of classes.
in_features = model.classifier[1].in_features
model.classifier[1] = nn.Linear(in_features, num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
# `verbose` is deprecated on LR schedulers in recent PyTorch releases, so it is
# not passed; the current LR can be read from optimizer.param_groups instead.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
# Gradient scaling is only enabled on CUDA; on CPU the scaler is a pass-through.
scaler = GradScaler(device.type, enabled=(device.type == 'cuda'))

def train_one_epoch(model, loader, optimizer, criterion, device, scaler):
    model.train()
    running_loss, preds_all, labels_all = 0.0, [], []
    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        with autocast(device_type="cuda" if device == "cuda" else "cpu"):
            outputs = model(imgs)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        running_loss += loss.item() * imgs.size(0)
        preds_all.extend(outputs.argmax(1).cpu().numpy())
        labels_all.extend(labels.cpu().numpy())
    return running_loss/len(loader.dataset), (np.array(preds_all)==np.array(labels_all)).mean()

@torch.no_grad()
def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Puts the model in eval mode, moves every batch to ``device`` and
    accumulates the sample-weighted loss together with predicted and true
    labels.

    Returns:
        Tuple ``(mean_loss, accuracy, true_labels, predicted_labels)`` where
        the last two items are NumPy arrays covering every sample seen.
    """
    model.eval()
    total_loss = 0.0
    predicted, expected = [], []
    for batch_imgs, batch_labels in loader:
        batch_imgs = batch_imgs.to(device)
        batch_labels = batch_labels.to(device)
        logits = model(batch_imgs)
        batch_loss = criterion(logits, batch_labels)
        # Weight by batch size so the final (possibly smaller) batch counts fairly.
        total_loss += batch_loss.item() * batch_imgs.size(0)
        predicted.extend(logits.argmax(1).cpu().numpy())
        expected.extend(batch_labels.cpu().numpy())
    predicted = np.array(predicted)
    expected = np.array(expected)
    mean_loss = total_loss / len(loader.dataset)
    accuracy = (predicted == expected).mean()
    return mean_loss, accuracy, expected, predicted

num_epochs = 12
best_val_acc = 0.0
ckpt_path = 'best_efficientnet_b0.pth'

# Train for a fixed number of epochs, lowering the LR when validation loss
# plateaus and checkpointing whenever validation accuracy improves.
for epoch in range(1, num_epochs+1):
    t0 = time.time()
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device, scaler)
    val_loss, val_acc, y_true_val, y_pred_val = validate(model, val_loader, criterion, device)
    scheduler.step(val_loss)
    if val_acc > best_val_acc:
        # Store a plain Python float rather than a NumPy scalar so the
        # checkpoint holds only tensors and built-in types and stays loadable
        # with the restricted (weights_only=True) unpickler.
        best_val_acc = float(val_acc)
        torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(), 'val_acc': best_val_acc}, ckpt_path)
    # Report the LR explicitly so scheduler reductions are visible in the log.
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch}/{num_epochs}: train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f} time={time.time()-t0:.1f}s lr={current_lr:.2e}")
print("Best Val Acc:", best_val_acc)


## Step 5 - Validation Confusion Matrix and Per-class Accuracy

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Re-run validation with the model's current weights and report the metrics.
val_loss, val_acc, y_true_val, y_pred_val = validate(model, val_loader, criterion, device)

print("Validation accuracy:", val_acc)
cm = confusion_matrix(y_true_val, y_pred_val)
print("Confusion matrix:\n", cm)

# Per-class accuracy = correct predictions (diagonal) / samples of that class (row sum).
per_class_acc = cm.diagonal() / cm.sum(axis=1)
for class_label, class_score in zip(class_names, per_class_acc):
    print(f"{class_label}: {class_score*100:.2f}%")
print(classification_report(y_true_val, y_pred_val, target_names=class_names))

# Row-normalize so each cell shows the share of a true class assigned to a prediction.
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm_norm, cmap='Blues')
plt.colorbar(im)

tick_positions = range(len(class_names))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(class_names, rotation=45)
ax.set_yticklabels(class_names)

# Annotate every cell; switch to white text on dark (high-value) cells.
for (row, col), share in np.ndenumerate(cm_norm):
    text_color = 'white' if share > 0.5 else 'black'
    ax.text(col, row, f"{share:.2f}", ha='center', va='center', color=text_color)

plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Validation Confusion Matrix (normalized)")
plt.show()


## Step 6 - Test Evaluation

In [None]:
# Restore the checkpoint written during training when one exists; otherwise
# evaluate whatever weights the model currently holds.
if not os.path.exists(ckpt_path):
    print("Checkpoint not found, using current model weights.")
else:
    ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
    model.load_state_dict(ckpt['model_state_dict'])
    print("Loaded checkpoint at epoch", ckpt.get('epoch'), "val_acc", ckpt.get('val_acc'))

# Evaluate on the held-out test split.
test_loss, test_acc, y_true_test, y_pred_test = validate(model, test_loader, criterion, device)
print("Test Accuracy:", test_acc)

cm_test = confusion_matrix(y_true_test, y_pred_test)
print("Test confusion matrix:\n", cm_test)

# Per-class accuracy = diagonal (correct) divided by the row total (true count).
per_class_acc_test = cm_test.diagonal() / cm_test.sum(axis=1)
for class_label, class_score in zip(class_names, per_class_acc_test):
    print(f"{class_label}: {class_score*100:.2f}%")
print(classification_report(y_true_test, y_pred_test, target_names=class_names))


## Step 7 - Save Class Mapping and Model Utility

In [None]:
import json

# Persist the index -> class-name mapping next to the checkpoint so predictions
# can be decoded later.
class_map = dict(enumerate(class_names))
with open('class_map.json', 'w') as f:
    json.dump(class_map, f, indent=2)
print("Saved class map.")

# To reload the model later (the checkpoint is a dict, so read its
# 'model_state_dict' entry; it is loaded with weights_only=False in Step 6):
# model = models.efficientnet_b0(weights=None)
# model.classifier[1] = nn.Linear(model.classifier[1].in_features, len(class_names))
# ckpt = torch.load('best_efficientnet_b0.pth', map_location='cpu', weights_only=False)
# model.load_state_dict(ckpt['model_state_dict'])
# model.eval()
