In [6]:
# Human Protein Atlas Image Classification - Baseline CNN
# -------------------------------------------------------
# Goal: Multi-label classification of 28 protein classes from multi-channel microscopy images.

import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


os.makedirs("figures", exist_ok=True)

# ============================
# 1. Load Dataset
# ============================
# Read the training metadata and expand the "Target" column into one binary
# indicator column per protein class (Class_0 ... Class_27).
data_dir = "Raw/human-protein-atlas-image-classification/"
train_path = os.path.join(data_dir, "train.csv")

train_df = pd.read_csv(train_path)
print(f"Dataset Shape: {train_df.shape}")

# Each raw target is a whitespace-separated string of class ids -> list of ints.
train_df["Target"] = train_df["Target"].map(
    lambda raw: [int(token) for token in raw.split()]
)

# Multi-hot label matrix: row r has a 1 in every column listed in its targets.
num_classes = 28
one_hot = np.zeros((len(train_df), num_classes), dtype=int)
for row_idx, class_ids in enumerate(train_df["Target"]):
    one_hot[row_idx, class_ids] = 1

class_columns = [f"Class_{k}" for k in range(num_classes)]
train_df_one_hot = pd.DataFrame(one_hot, columns=class_columns)
train_df = pd.concat([train_df, train_df_one_hot], axis=1)

# Per-class positive counts, printed to expose the label imbalance.
print("\nClass Distribution (Imbalanced):")
print(train_df_one_hot.sum(axis=0))


# ============================
# 2. Sample Visualization
# ============================
# Save the red channel of five randomly chosen training images as a quick
# visual sanity check of the data on disk.
sample = train_df.sample(5, random_state=42)
for _, row in sample.iterrows():
    img_path = os.path.join(data_dir, "train", f"{row['Id']}_red.png")
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None rather
        # than raising; without this guard cvtColor fails with an opaque error.
        print(f"Skipping sample {row['Id']}: could not read {img_path}")
        continue
    # OpenCV loads images as BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.title(f"Image ID: {row['Id']}")
    plt.axis("off")
    plt.savefig(f"figures/sample_image_{row['Id']}.png")
    plt.close()


# ============================
# 3. Custom Dataset
# ============================
class HPADataset(Dataset):
    """Dataset yielding 4-channel images and multi-hot label vectors.

    Each sample is assembled from four grayscale PNG files named
    ``<Id>_<color>.png`` (red, green, blue, yellow) found in ``img_dir``.

    Args:
        df: DataFrame with an ``Id`` column and one ``Class_*`` column per label.
        img_dir: Directory containing the per-channel PNG files.
        size: Side length every channel is resized to (square output).
        transform: Optional callable applied to the image tensor.
    """

    # Channel order of the stacked image tensor.
    CHANNELS = ("red", "green", "blue", "yellow")

    def __init__(self, df, img_dir, size=224, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.size = size
        self.transform = transform
        self.label_cols = [col for col in df.columns if col.startswith("Class_")]

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            Tuple ``(image, label)``: ``image`` is a float32 tensor of shape
            ``(4, size, size)`` scaled by 1/255 (before ``transform``), and
            ``label`` is a float32 tensor with one entry per ``Class_*`` column.

        Raises:
            FileNotFoundError: If any channel file is missing or unreadable.
        """
        row = self.df.iloc[idx]
        image_id = row["Id"]

        channels = []
        for color in self.CHANNELS:
            path = os.path.join(self.img_dir, f"{image_id}_{color}.png")
            img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread returns None instead of raising; fail here with the
                # offending path rather than with an opaque error in cv2.resize.
                raise FileNotFoundError(f"Could not read image channel: {path}")
            img = cv2.resize(img, (self.size, self.size))
            channels.append(img)

        image = np.stack(channels, axis=0)  # shape: (4, H, W)
        image = torch.tensor(image, dtype=torch.float32) / 255.0
        label = torch.tensor(
            row[self.label_cols].values.astype(np.float32), dtype=torch.float32
        )

        if self.transform:
            image = self.transform(image)
        return image, label


# ============================
# 4. Train / Validation Split
# ============================
# Work on a fixed 10k-row subsample and hold out 20% of it for validation.
sample_df = train_df.sample(n=10000, random_state=42).reset_index(drop=True)
train_set, val_set = train_test_split(sample_df, test_size=0.2, random_state=42)

# Both splits read their images from the same "train" image folder.
train_dataset, val_dataset = (
    HPADataset(split, os.path.join(data_dir, "train"))
    for split in (train_set, val_set)
)

# Only the training loader shuffles; validation order stays fixed.
loader_kwargs = {"batch_size": 32, "num_workers": 0}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)


# ============================
# 5. Baseline CNN Model
# ============================
class SimpleCNN(nn.Module):
    """Three-block convolutional baseline for 4-channel multi-label input.

    Each block is ``conv3x3 -> ReLU -> 2x2 max-pool``, followed by two fully
    connected layers. The forward pass returns per-class sigmoid probabilities.

    Args:
        num_classes: Number of output labels.
        input_size: Side length of the (square) input images. Defaults to 224,
            which gives the original ``128 * 28 * 28`` flattened feature size.

    Raises:
        ValueError: If ``input_size`` is too small to survive three poolings.
    """

    def __init__(self, num_classes=28, input_size=224):
        super().__init__()
        if input_size < 8:
            raise ValueError(f"input_size must be at least 8, got {input_size}")

        self.conv1 = nn.Conv2d(4, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # Three 2x2 poolings floor-halve each spatial side three times,
        # which equals a single floor division by 8.
        feature_side = input_size // 8
        self.flat_features = 128 * feature_side * feature_side

        self.fc1 = nn.Linear(self.flat_features, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return sigmoid probabilities of shape ``(batch, num_classes)``.

        Raises:
            ValueError: If the input spatial size does not match ``input_size``.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))

        # Flatten per sample. A `view(-1, n)` here would silently fold surplus
        # spatial features into the batch dimension for wrongly sized inputs.
        x = torch.flatten(x, start_dim=1)
        if x.shape[1] != self.flat_features:
            raise ValueError(
                f"Expected {self.flat_features} flattened features per sample, "
                f"got {x.shape[1]}; check the input spatial size."
            )

        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc2(x))  # multi-label output
        return x


# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = SimpleCNN(num_classes=28)
model = model.to(device)

# The model already applies a sigmoid, so plain BCE on probabilities is used.
criterion = nn.BCELoss()  # binary cross-entropy for multi-label
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


# ============================
# 6. Training Loop
# ============================
# Single source of truth for the epoch count (used by the loop and the log line).
NUM_EPOCHS = 5
train_losses, val_losses = [], []

for epoch in range(NUM_EPOCHS):
    # ---- Training pass ----
    model.train()
    running_loss = 0.0
    seen = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        batch_size = images.size(0)
        running_loss += loss.item() * batch_size
        seen += batch_size

    epoch_train_loss = running_loss / seen
    train_losses.append(epoch_train_loss)

    # ---- Validation pass (no gradient tracking, dropout disabled) ----
    model.eval()
    running_val_loss = 0.0
    val_seen = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            val_loss = criterion(outputs, labels)
            batch_size = images.size(0)
            running_val_loss += val_loss.item() * batch_size
            val_seen += batch_size
    epoch_val_loss = running_val_loss / val_seen
    val_losses.append(epoch_val_loss)

    print(
        f"Epoch [{epoch+1}/{NUM_EPOCHS}] "
        f"Train Loss: {epoch_train_loss:.4f} "
        f"Val Loss: {epoch_val_loss:.4f}"
    )

# ============================
# 7. Loss Curves
# ============================
# Plot per-epoch train and validation loss on one set of axes and save to disk.
fig, ax = plt.subplots()
ax.plot(train_losses, label="Train Loss")
ax.plot(val_losses, label="Val Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training & Validation Loss")
ax.legend()
fig.savefig("figures/loss_curves.png")
plt.close(fig)

# ============================
# 8. Validation Evaluation F1 Score
# ============================
# Collect thresholded predictions and ground-truth labels over the whole
# validation set, then report macro- and micro-averaged F1.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for images, labels in val_loader:
        # The baseline model already outputs sigmoid probabilities.
        probs = model(images.to(device)).cpu().numpy()

        # Threshold 0.5 because output = sigmoid
        all_preds.append((probs > 0.5).astype(int))
        all_labels.append(labels.cpu().numpy())

# Per-batch (batch, classes) arrays -> one (samples, classes) array each.
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

print("\n📊 Validation Results (Baseline CNN):")
macro_f1 = f1_score(all_labels, all_preds, average="macro")
print(f"Macro F1: {macro_f1:.3f}")
micro_f1 = f1_score(all_labels, all_preds, average="micro")
print(f"Micro F1: {micro_f1:.3f}")

# 🔎 Observations:
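# - Baseline CNN trains (train loss 0.18 → 0.14) but validation loss plateaus near 0.16 and macro F1 is only 0.052, so it still struggles with complex patterns and with most of the rarer classes.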
# - Severe class imbalance → motivates techniques like class-weighted loss or oversampling.
# - Next steps: pretrained models (ResNet, EfficientNet), data augmentation, focal loss.

Dataset Shape: (31072, 2)

Class Distribution (Imbalanced):
Class_0     12885
Class_1      1254
Class_2      3621
Class_3      1561
Class_4      1858
Class_5      2513
Class_6      1008
Class_7      2822
Class_8        53
Class_9        45
Class_10       28
Class_11     1093
Class_12      688
Class_13      537
Class_14     1066
Class_15       21
Class_16      530
Class_17      210
Class_18      902
Class_19     1482
Class_20      172
Class_21     3777
Class_22      802
Class_23     2965
Class_24      322
Class_25     8228
Class_26      328
Class_27       11
dtype: int64
Epoch [1/5] Train Loss: 0.1826 Val Loss: 0.1624
Epoch [2/5] Train Loss: 0.1638 Val Loss: 0.1591
Epoch [3/5] Train Loss: 0.1584 Val Loss: 0.1608
Epoch [4/5] Train Loss: 0.1529 Val Loss: 0.1543
Epoch [5/5] Train Loss: 0.1433 Val Loss: 0.1587

📊 Validation Results (Baseline CNN):
Macro F1: 0.052
Micro F1: 0.347


In [7]:
# Human Protein Atlas Image Classification - ResNet18 (4-Channel Adaptation)
# --------------------------------------------------------------------------
# Dataset: https://www.kaggle.com/c/human-protein-atlas-image-classification
# Goal: Multi-label classification of 28 protein classes using a pretrained ResNet18,
#       adapted to handle 4-channel microscopy images (RGB + Yellow).

from torchvision import models


# ============================
# 1. Model Definition
# ============================
class ResNet18_HPA(nn.Module):
    """
    ResNet18 adapted for Human Protein Atlas images:
    - First conv layer modified to accept 4 channels instead of 3.
    - Final fully connected layer outputs ``num_classes`` logits (no sigmoid).

    Args:
        num_classes: Number of output labels.
        pretrained: Passed through to ``models.resnet18``. Set to ``False`` to
            build the network without loading pretrained weights.
    """

    def __init__(self, num_classes=28, pretrained=True):
        super().__init__()
        # NOTE(review): newer torchvision releases deprecate `pretrained=` in
        # favour of `weights=`; kept as-is to match the installed API. Confirm
        # against the torchvision version in use.
        self.base_model = models.resnet18(pretrained=pretrained)

        # Modify first convolution (3 channels → 4 channels)
        old_conv = self.base_model.conv1
        new_conv = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3, bias=False)

        # Seed the new layer from the existing filters. The 4th input channel
        # reuses the filters of input channel 0. Copy under no_grad instead of
        # writing through `.data`, so autograd state is handled explicitly.
        with torch.no_grad():
            new_conv.weight[:, :3].copy_(old_conv.weight)
            new_conv.weight[:, 3:4].copy_(old_conv.weight[:, :1])
        self.base_model.conv1 = new_conv

        # Replace classification head
        in_features = self.base_model.fc.in_features
        self.base_model.fc = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Return raw logits from the adapted ResNet18 for input ``x``."""
        return self.base_model(x)


# ============================
# 2. Training Setup
# ============================
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = ResNet18_HPA(num_classes=28)
model = model.to(device)

# The model emits raw logits, so the loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()  # suited for multi-label classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)


# ============================
# 3. Training Loop
# ============================
EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    seen = 0
    for images, labels in train_loader:  # ⚠️ train_loader from previous setup
        images, labels = images.to(device), labels.float().to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        batch_size = images.size(0)
        running_loss += loss.item() * batch_size
        seen += batch_size

    epoch_loss = running_loss / seen
    print(f"Epoch [{epoch+1}/{EPOCHS}] - Train Loss: {epoch_loss:.4f}")


# ============================
# 4. Validation Evaluation
# ============================
# Collect thresholded predictions and ground-truth labels over the whole
# validation set, then report macro- and micro-averaged F1.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for images, labels in val_loader:  # ⚠️ val_loader from previous setup
        logits = model(images.to(device))

        # The model outputs logits: squash to probabilities, then cut at 0.5.
        probs = torch.sigmoid(logits).cpu().numpy()
        all_preds.append((probs > 0.5).astype(int))
        all_labels.append(labels.cpu().numpy())

# Per-batch (batch, classes) arrays -> one (samples, classes) array each.
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

print("\n📊 Validation Results:")
macro_f1 = f1_score(all_labels, all_preds, average="macro")
print(f"Macro F1: {macro_f1:.3f}")
micro_f1 = f1_score(all_labels, all_preds, average="micro")
print(f"Micro F1: {micro_f1:.3f}")


# ============================
# 🔎 Observations
# ============================
# - ResNet18 pretrained on ImageNet adapts successfully to 4-channel microscopy.
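# - BCEWithLogitsLoss fuses the sigmoid into the loss, which is numerically more stable than a separate sigmoid followed by BCELoss.
# - Outperforms the baseline CNN on validation F1 (macro 0.212 vs 0.052, micro 0.504 vs 0.347) and reaches a lower train loss; validation loss is not tracked in this cell, so the steep train-loss drop (0.22 → 0.03) may partly reflect overfitting.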
# - Next steps:
#   * Deeper models: ResNet50, EfficientNet (via timm).
#   * Data augmentation: random flips, rotations, color jitter.
#   * Advanced loss functions: Focal Loss, Class-Balanced Loss to handle imbalance.



Epoch [1/5] - Train Loss: 0.2193
Epoch [2/5] - Train Loss: 0.1273
Epoch [3/5] - Train Loss: 0.0966
Epoch [4/5] - Train Loss: 0.0608
Epoch [5/5] - Train Loss: 0.0346

📊 Validation Results:
Macro F1: 0.212
Micro F1: 0.504
