In [None]:
import os
import h5py
import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as T
from torchvision import models
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2
import matplotlib.pyplot as plt

# 1. Device Setup
# ---------------------------
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# 2. Dataset Class
# ---------------------------
class ISIC_HDF5_Dataset(Dataset):
    """Dataset that decodes images stored as encoded blobs in an HDF5 file.

    Every row of ``df`` must provide an ``isic_id`` column, which is used as the
    key into the HDF5 file, and (when ``is_labelled`` is True) a ``target`` column.

    Args:
        df: Metadata frame. Its index is reset, so items are addressed by position.
        hdf5_path: Path of the HDF5 file holding one encoded image per ``isic_id``.
        transform: Optional callable invoked as ``transform(image=rgb_array)`` and
            expected to return a mapping with an ``"image"`` entry.
        is_labelled: When True, items are ``(image, label, isic_id)``;
            otherwise they are ``(image, isic_id)``.
    """

    def __init__(self, df: pd.DataFrame, hdf5_path: str, transform=None, is_labelled: bool = True):
        self.df = df.reset_index(drop=True)
        self.hdf5_path = hdf5_path
        self.transform = transform
        self.is_labelled = is_labelled
        # Opened lazily on first image access instead of here (intent: one
        # handle per DataLoader worker rather than one shared from __init__).
        self.hdf5_file = None

    def __len__(self):
        """Return the number of rows in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image at position ``idx``.

        Without a transform the image is returned as a float tensor in
        channel-first layout; with one, whatever the transform put under
        ``"image"`` is returned unchanged.
        """
        row = self.df.iloc[idx]
        isic_id = row["isic_id"]
        image_rgb = self._load_image_from_hdf5(isic_id)

        if self.transform:
            image = self.transform(image=image_rgb)["image"]
        else:
            image = torch.from_numpy(image_rgb).permute(2, 0, 1).float()

        if self.is_labelled:
            label = torch.tensor(row["target"]).float()
            return image, label, isic_id
        return image, isic_id

    def _load_image_from_hdf5(self, isic_id: str):
        """Read and decode one image, returning it as an RGB array.

        Raises:
            ValueError: If the stored blob cannot be decoded as an image.
        """
        if self.hdf5_file is None:
            # Reuse a single handle instead of reopening the file per image.
            self.hdf5_file = h5py.File(self.hdf5_path, 'r')

        encoded = self.hdf5_file[isic_id][()]
        if not isinstance(encoded, np.ndarray):
            # cv2.imdecode needs a uint8 array; blobs stored as raw bytes
            # come back as a scalar and must be viewed as one first.
            encoded = np.frombuffer(encoded, dtype=np.uint8)

        image_bgr = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
        if image_bgr is None:
            # imdecode signals failure by returning None rather than raising.
            raise ValueError(f"Could not decode image '{isic_id}' from {self.hdf5_path}")
        return cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    def close(self):
        """Close the cached HDF5 handle, if one is open. Safe to call repeatedly."""
        if self.hdf5_file is not None:
            self.hdf5_file.close()
            self.hdf5_file = None

# 3. Load CSVs and Partition Dataset
# ---------------------------
from google.colab import drive
from sklearn.model_selection import train_test_split

# All inputs and outputs live in one Google Drive folder.
drive.mount('/content/drive')
path = '/content/drive/My Drive/KaggleChallenge'

TRAIN_CSV = f"{path}/new-train-metadata.csv"
TEST_CSV  = f"{path}/students-test-metadata.csv"
TRAIN_HDF5 = f"{path}/train-image.hdf5"
TEST_HDF5  = f"{path}/test-image.hdf5"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Stratified 80/20 hold-out split with a fixed seed.
train_df_sub, valid_df_sub = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['target'],
    random_state=42,
)

# 4. Data Augmentation with Albumentations
# ---------------------------
# Training pipeline: resize, random geometric and colour jitter, then
# normalisation and conversion to a tensor.
train_transform = A.Compose([
    A.Resize(height=224, width=224),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.ShiftScaleRotate(
        shift_limit=0.1,
        scale_limit=0.2,
        rotate_limit=30,
        p=0.8,
    ),
    A.RandomBrightnessContrast(
        brightness_limit=0.2,
        contrast_limit=0.2,
        p=0.7,
    ),
    A.HueSaturationValue(
        hue_shift_limit=20,
        sat_shift_limit=30,
        val_shift_limit=20,
        p=0.5,
    ),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
])

# Evaluation pipeline: the same resize and normalisation, no random augmentation.
valid_transform = A.Compose([
    A.Resize(height=224, width=224),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
])


In [None]:
# 5. Dataset Instantiation
# ---------------------------
train_dataset = ISIC_HDF5_Dataset(train_df_sub, TRAIN_HDF5, transform=train_transform, is_labelled=True)
valid_dataset = ISIC_HDF5_Dataset(valid_df_sub, TRAIN_HDF5, transform=valid_transform, is_labelled=True)
test_dataset  = ISIC_HDF5_Dataset(test_df, TEST_HDF5, transform=valid_transform, is_labelled=False)

# 6. Weighted Sampler to Balance Classes
# ---------------------------
# Number of samples drawn (with replacement) per epoch.
SAMPLES_PER_EPOCH = 6000

# Each row is weighted by the inverse frequency of its class.
class_counts = train_df_sub['target'].value_counts()
weights = 1.0 / train_df_sub['target'].map(class_counts)
# train_df_sub still carries the shuffled index from train_test_split, so the
# sampler is given a plain positional array instead of the Series; that avoids
# any label-based lookups while the weights are converted to a tensor.
sampler = WeightedRandomSampler(weights=weights.to_numpy(), num_samples=SAMPLES_PER_EPOCH, replacement=True)

# 7. DataLoaders
# ---------------------------
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler, num_workers=2)
# Evaluation loaders keep dataset order (shuffle=False).
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)


In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss computed from raw logits and averaged over all elements.

    The element-wise BCE-with-logits loss is scaled by ``(1 - p_t) ** gamma``,
    where ``p_t`` is the probability the model assigns to the true class.

    Args:
        gamma: Exponent of the modulating factor.
    """

    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma
        # reduction='none' keeps one loss value per element so that each can be
        # re-weighted before the mean is taken in forward().
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` against ``targets``."""
        per_element_bce = self.bce(inputs, targets)
        prob_positive = torch.sigmoid(inputs)
        # Probability assigned to the true class of each element.
        prob_true = targets * prob_positive + (1 - targets) * (1 - prob_positive)
        modulating_factor = (1 - prob_true) ** self.gamma
        return (modulating_factor * per_element_bce).mean()

In [None]:
# 8. Model: EfficientNet + Dropout
# ---------------------------
!pip install efficientnet_pytorch
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained("efficientnet-b3")
model._fc = nn.Sequential(
    nn.Dropout(p=0.4),
    nn.Linear(model._fc.in_features, 1)
)
model = model.to(device)

# 9. Optimizer, Loss, Scheduler
# ---------------------------
criterion = FocalLoss(gamma=2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2, factor=0.5)


In [None]:
from sklearn.model_selection import StratifiedKFold

# 10. Stratified K-Fold Training
# ---------------------------
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Upper bound on epochs per fold. The training loop needs this defined to run
# on a fresh kernel; early stopping (PATIENCE) normally ends a fold sooner.
EPOCHS = 15
PATIENCE = 3
BATCH_SIZE = 16

# Out-of-fold predictions: every training row is predicted by the one model
# that did not see it during training.
oof_preds = np.zeros(len(train_df))
oof_targets = train_df["target"].values

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df["target"])):
    print(f"\n--- Fold {fold+1} ---")

    train_df_sub = train_df.iloc[train_idx].reset_index(drop=True)
    valid_df_sub = train_df.iloc[val_idx].reset_index(drop=True)

    # Datasets and Dataloaders
    train_dataset = ISIC_HDF5_Dataset(train_df_sub, TRAIN_HDF5, transform=train_transform)
    valid_dataset = ISIC_HDF5_Dataset(valid_df_sub, TRAIN_HDF5, transform=valid_transform)

    # Inverse-class-frequency weights so both classes are drawn equally often.
    class_counts = train_df_sub['target'].value_counts()
    weights = 1.0 / train_df_sub['target'].map(class_counts)
    sampler = WeightedRandomSampler(weights=weights.to_numpy(), num_samples=6000, replacement=True)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    # Model, loss, optimizer (fresh for every fold)
    model = EfficientNet.from_pretrained("efficientnet-b3")
    model._fc = nn.Sequential(nn.Dropout(0.4), nn.Linear(model._fc.in_features, 1))
    model = model.to(device)

    criterion = FocalLoss(gamma=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", patience=2, factor=0.5)

    # -inf guarantees the first epoch always writes a checkpoint for this fold.
    best_auc = float("-inf")
    best_val_probs = None
    patience_counter = 0

    for epoch in range(1, EPOCHS+1):
        model.train()
        train_losses = []

        for images, labels, _ in tqdm(train_loader, desc=f"Epoch {epoch}"):
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            logits = model(images).view(-1)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        model.eval()
        val_probs, val_labels = [], []

        with torch.no_grad():
            for images, labels, _ in valid_loader:
                images = images.to(device)
                logits = model(images).view(-1)
                val_probs.extend(torch.sigmoid(logits).cpu().numpy())
                val_labels.extend(labels.numpy())

        val_auc = roc_auc_score(val_labels, val_probs)
        scheduler.step(val_auc)

        print(f"Epoch {epoch}: Train Loss={np.mean(train_losses):.4f}, Val AUC={val_auc:.4f}")

        if val_auc > best_auc:
            best_auc = val_auc
            # Keep the predictions that belong to the checkpoint being saved,
            # so the OOF vector matches the model later used for inference.
            best_val_probs = np.asarray(val_probs)
            torch.save(model.state_dict(), f"best_model_fold{fold}.pt")
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= PATIENCE:
            print("Early stopping!")
            break
        torch.cuda.empty_cache()

    # Save OOF predictions from the best epoch (not from the last one run).
    if best_val_probs is not None:
        oof_preds[val_idx] = best_val_probs

print(f"OOF AUC: {roc_auc_score(oof_targets, oof_preds):.4f}")


In [None]:
# 11. Inference using all folds
# -----------------------------
all_fold_preds = []
# Ids in the order the loader yields them, captured during the first fold.
test_ids = None

for fold in range(N_FOLDS):
    print(f"Loading model from fold {fold}...")
    model = EfficientNet.from_pretrained("efficientnet-b3")
    model._fc = nn.Sequential(nn.Dropout(0.4), nn.Linear(model._fc.in_features, 1))
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(f"best_model_fold{fold}.pt", map_location=device))
    model = model.to(device)
    model.eval()

    fold_preds = []
    fold_ids = []

    with torch.no_grad():
        for images, isic_ids in tqdm(test_loader, desc=f"Inference Fold {fold}"):
            images = images.to(device)
            probs = torch.sigmoid(model(images).view(-1)).cpu().numpy()
            fold_preds.extend(probs)
            fold_ids.extend(isic_ids)

    if test_ids is None:
        test_ids = fold_ids

    all_fold_preds.append(fold_preds)
    torch.cuda.empty_cache()

# Average predictions across all folds
avg_preds = np.mean(all_fold_preds, axis=0)

# Prepare submission
# The ids come from the batches already read above; iterating test_dataset
# again would decode and transform every test image a second time.
assert len(test_ids) == len(avg_preds), "id / prediction count mismatch"
submission_df = pd.DataFrame({
    "isic_id": test_ids,
    "target": avg_preds
})
submission_df = submission_df.sort_values(by="isic_id").reset_index(drop=True)
submission_df.to_csv(path + "/submission_cv_ensemble.csv", index=False)
print("Ensemble submission saved.")

