In [3]:
import torch
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Dataset
from torch.nn.functional import sigmoid
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

import numpy as np

In [4]:
class EmojiDataset(Dataset):
    def __init__(self, rgb_images, edge_images, labels, transform=None):
        self.rgb_images = rgb_images  # (num, 224, 224, 3)
        self.edge_images = np.expand_dims(edge_images, axis=-1)  # Ensure (num, 224, 224, 1)
        self.labels = labels  # (num, 11)
        self.transform = transform

    def __len__(self):
        return len(self.rgb_images)

    def __getitem__(self, idx):
        # Convert to PyTorch tensors
        rgb = torch.tensor(self.rgb_images[idx], dtype=torch.float32)
        edge = torch.tensor(self.edge_images[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)

        # Permute from (H, W, C) → (C, H, W)
        rgb = rgb.permute(2, 0, 1)  # (3, 224, 224)
        edge = edge.permute(2, 0, 1)  # (1, 224, 224)
        # Concatenate along channel dimension
        combined = torch.cat((rgb, edge), dim=0)  # (4, 224, 224)
        # Normalize image pixels to [0,1]
        combined /= 255.0

        if self.transform:
            combined = self.transform(combined)
        
        return combined, label

In [5]:
# Load images and labels
rgb_train= np.load("img_train_plaintext_top11_rgb.npy")  # Shape: (num_train, 224, 224, 3)
edges_train = np.load("img_train_plaintext_top11_edges.npy")  # Shape: (num_train, 224, 224)
labels_train = np.load("img_train_plaintext_top11_labels.npy")  # Shape: (num_train, 11) - Multi-label one-hot

rgb_valid= np.load("img_valid_plaintext_top11_rgb.npy")  # Shape: (num_valid, 224, 224, 3)
edges_valid = np.load("img_valid_plaintext_top11_edges.npy")  # Shape: (num_valid, 224, 224)
labels_valid = np.load("img_valid_plaintext_top11_labels.npy")  # Shape: (num_valid, 11)

rgb_test = np.load("img_test_plaintext_top11_rgb.npy")  # Shape: (num_valid, 224, 224, 3)
edges_test = np.load("img_test_plaintext_top11_edges.npy")  # Shape: (num_valid, 224, 224)
labels_test = np.load("img_test_plaintext_top11_labels.npy")  # Shape: (num_valid, 11)

print(rgb_train.shape, rgb_train.dtype, type(rgb_train[0]), rgb_train[0].shape)
print(edges_train.shape, edges_train.dtype, type(edges_train[0]), edges_train[0].shape)
print(labels_train.shape, labels_train.dtype, type(labels_train[0]), labels_train[0], labels_train[0][4])

(109176, 224, 224, 3) uint8 <class 'numpy.ndarray'> (224, 224, 3)
(109176, 224, 224) uint8 <class 'numpy.ndarray'> (224, 224)
(109176, 11) int64 <class 'numpy.ndarray'> [0 0 0 0 1 0 0 0 0 0 0] 1


In [6]:
# Define transformations (optional)
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),  # Data augmentation
    transforms.RandomRotation(10),
])

# Create Dataset instances
train_dataset = EmojiDataset(rgb_train, edges_train, labels_train, transform=transform)
valid_dataset = EmojiDataset(rgb_valid, edges_valid, labels_valid, transform=None)
test_dataset = EmojiDataset(rgb_test, edges_test, labels_test, transform=None)

# Create DataLoaders
batch_size = 32  # Adjust as needed
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [7]:
combined, label = train_dataset[0]

print("Single image shape:", combined.shape)  # Expected: (4, 224, 224)
print("Single label shape:", label.shape)  # Expected: (11,)

Single image shape: torch.Size([4, 224, 224])
Single label shape: torch.Size([11])


In [8]:
for combined, labels in train_loader:
    print("Batch image shape:", combined.shape)  # Expected: (32, 4, 224, 224)
    print("Batch label shape:", labels.shape)  # Expected: (32, 11)
    break

Batch image shape: torch.Size([32, 4, 224, 224])
Batch label shape: torch.Size([32, 11])


In [9]:
# Load Pretrained ResNet18
model = models.convnext_base(weights="DEFAULT")

num_classes = 11  # Top 11

# Modify for 4-channel (RGB + Edge)
model.features[0][0] = nn.Conv2d(4, 128, kernel_size=4, stride=4)
# Modify the last layer
model.classifier[-1] = nn.Linear(model.classifier[-1].in_features, num_classes)
print(model)

# Move model to GPU
# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)

ConvNeXt(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(4, 128, kernel_size=(4, 4), stride=(4, 4))
      (1): LayerNorm2d((128,), eps=1e-06, elementwise_affine=True)
    )
    (1): Sequential(
      (0): CNBlock(
        (block): Sequential(
          (0): Conv2d(128, 128, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=128)
          (1): Permute()
          (2): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=128, out_features=512, bias=True)
          (4): GELU(approximate='none')
          (5): Linear(in_features=512, out_features=128, bias=True)
          (6): Permute()
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): CNBlock(
        (block): Sequential(
          (0): Conv2d(128, 128, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=128)
          (1): Permute()
          (2): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
          (3): Linear(

In [10]:
# Define loss function & optimizer
criterion = nn.BCEWithLogitsLoss()  # Multi-label loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 20  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)

        loss = criterion(outputs, labels) 
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # After each epoch, validate on the validation set
    model.eval()  # Set model to evaluation mode
    valid_loss = 0.0
    with torch.no_grad():  # Disable gradient calculation for validation
        for images, labels in valid_loader:
            images, labels = images.to(device), labels.to(device) 

            outputs = model(images)
            loss = criterion(outputs, labels)

            valid_loss += loss.item()
    print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(train_loader)} , Valid Loss: {valid_loss/len(valid_loader)}")
    

Epoch 1, Train Loss: 0.2812170275341794 , Valid Loss: 0.2813217341111451
Epoch 2, Train Loss: 0.28058404172361245 , Valid Loss: 0.2807369873952613
Epoch 3, Train Loss: 0.2801343678973903 , Valid Loss: 0.2807458509449606
Epoch 4, Train Loss: 0.28004300686042727 , Valid Loss: 0.28057760358960543
Epoch 5, Train Loss: 0.2800008946896508 , Valid Loss: 0.280255787153408
Epoch 6, Train Loss: 0.2799432360177373 , Valid Loss: 0.2801549897859336
Epoch 7, Train Loss: 0.2799219448787596 , Valid Loss: 0.2806131699019008
Epoch 8, Train Loss: 0.27991045284732424 , Valid Loss: 0.2800868716899049
Epoch 9, Train Loss: 0.27988658668986516 , Valid Loss: 0.2803355153039019
Epoch 10, Train Loss: 0.27987679391051234 , Valid Loss: 0.28015613331208156
Epoch 11, Train Loss: 0.2798473111813471 , Valid Loss: 0.28018334003352613
Epoch 12, Train Loss: 0.2798669612229499 , Valid Loss: 0.28006450678147965
Epoch 13, Train Loss: 0.27986471682944164 , Valid Loss: 0.2800159913601068
Epoch 14, Train Loss: 0.27983796361130

In [12]:
model.eval()

for threshold in (0.3, 0.5, 0.7):
    print(f"Threshold: {threshold}")
    
    total = 0
    correct = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            # Convert logits to probabilities
            probs = sigmoid(outputs)  
            # Convert logits to binary, 
            # Threshold at 0.5 (adjustable) to get binary labels
            predicted = (probs > threshold).float()
            
            # Store predictions and actual labels
            all_preds.append(predicted.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            
            correct += (predicted == labels).sum().item()
            total += labels.numel()  # Count total number of elements

    # Convert list of numpy arrays into a single numpy array
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    # Compute confusion matrix, precision, and recall for each label (emoji)
    accuracy = 100 * correct / total
    precision = 100 * precision_score(all_labels, all_preds, average="macro")  # Use 'macro' for multi-label
    recall = 100 * recall_score(all_labels, all_preds, average="macro")
    f1 = 100 * f1_score(all_labels, all_preds, average="macro")

    print(f"Test Accuracy: {accuracy:.2f}%")
    print(f"Test Precision: {precision:.2f}%")
    print(f"Test Recall: {recall:.2f}%")
    print(f"Test F1: {f1:.2f}% \n")

Threshold: 0.3


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test Accuracy: 90.17%
Test Precision: 0.00%
Test Recall: 0.00%
Test F1: 0.00% 

Threshold: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test Accuracy: 90.17%
Test Precision: 0.00%
Test Recall: 0.00%
Test F1: 0.00% 

Threshold: 0.7
Test Accuracy: 90.17%
Test Precision: 0.00%
Test Recall: 0.00%
Test F1: 0.00% 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
