In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ----------------------------
# Data Loading
# ----------------------------
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform
)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# ----------------------------
# VGG-like Model for MNIST
# ----------------------------
class VGG_MNIST(nn.Module):
    def __init__(self):
        super(VGG_MNIST, self).__init__()

        self.features = nn.Sequential(
            # Block 1
            nn.Conv2d(1, 64, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            # Block 2
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            # Block 3
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 3 * 3, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 10)  # 10 classes
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

model = VGG_MNIST().to(device)

# ----------------------------
# Loss + Optimizer
# ----------------------------
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ----------------------------
# Training Loop
# ----------------------------
epochs = 10

for epoch in range(epochs):
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Convert labels to one-hot for BCE
        labels_onehot = torch.zeros(labels.size(0), 10).to(device)
        labels_onehot.scatter_(1, labels.unsqueeze(1), 1)

        optimizer.zero_grad()
        outputs = model(images)

        loss = criterion(outputs, labels_onehot)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Accuracy calculation
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{epochs}] "
          f"Loss: {running_loss/len(train_loader):.4f} "
          f"Train Accuracy: {100*correct/total:.2f}%")

# ----------------------------
# Testing
# ----------------------------
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"\nTest Accuracy: {100*correct/total:.2f}%")


Using device: cuda
Epoch [1/10] Loss: 0.2570 Train Accuracy: 33.65%
Epoch [2/10] Loss: 0.0153 Train Accuracy: 97.69%
Epoch [3/10] Loss: 0.0091 Train Accuracy: 98.70%
Epoch [4/10] Loss: 0.0064 Train Accuracy: 99.07%
Epoch [5/10] Loss: 0.0053 Train Accuracy: 99.24%
Epoch [6/10] Loss: 0.0046 Train Accuracy: 99.38%
Epoch [7/10] Loss: 0.0042 Train Accuracy: 99.41%
Epoch [8/10] Loss: 0.0032 Train Accuracy: 99.59%
Epoch [9/10] Loss: 0.0030 Train Accuracy: 99.61%
Epoch [10/10] Loss: 0.0028 Train Accuracy: 99.65%

Test Accuracy: 99.18%


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F

# ----------------------------
# Device Setup
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if device.type == "cuda":
    print("GPU Name:", torch.cuda.get_device_name(0))

# ----------------------------
# Data
# ----------------------------
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform
)

train_loader = DataLoader(train_dataset, batch_size=128,
                          shuffle=True, num_workers=2, pin_memory=True)

test_loader = DataLoader(test_dataset, batch_size=128,
                         shuffle=False, num_workers=2, pin_memory=True)

# ----------------------------
# AlexNet (Modified for MNIST)
# ----------------------------
class AlexNet_MNIST(nn.Module):
    def __init__(self):
        super(AlexNet_MNIST, self).__init__()

        self.features = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(),

            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 3 * 3, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x


# ----------------------------
# Focal Loss (Multi-class)
# ----------------------------
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()


model = AlexNet_MNIST().to(device)

criterion = FocalLoss(alpha=1, gamma=2)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Mixed precision
scaler = torch.cuda.amp.GradScaler()

# ----------------------------
# Training
# ----------------------------
epochs = 20

for epoch in range(epochs):
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{epochs}] "
          f"Loss: {running_loss/len(train_loader):.4f} "
          f"Train Acc: {100*correct/total:.2f}%")

# ----------------------------
# Testing
# ----------------------------
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        outputs = model(images)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"\nFinal Test Accuracy: {100*correct/total:.2f}%")


Using device: cuda
GPU Name: NVIDIA GeForce RTX 3050 6GB Laptop GPU


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch [1/20] Loss: 1.1988 Train Acc: 43.37%
Epoch [2/20] Loss: 0.0799 Train Acc: 94.91%
Epoch [3/20] Loss: 0.0446 Train Acc: 96.97%
Epoch [4/20] Loss: 0.0323 Train Acc: 97.76%
Epoch [5/20] Loss: 0.0258 Train Acc: 98.21%
Epoch [6/20] Loss: 0.0211 Train Acc: 98.50%
Epoch [7/20] Loss: 0.0175 Train Acc: 98.72%
Epoch [8/20] Loss: 0.0151 Train Acc: 98.83%
Epoch [9/20] Loss: 0.0138 Train Acc: 98.93%
Epoch [10/20] Loss: 0.0115 Train Acc: 99.11%
Epoch [11/20] Loss: 0.0099 Train Acc: 99.22%
Epoch [12/20] Loss: 0.0088 Train Acc: 99.28%
Epoch [13/20] Loss: 0.0083 Train Acc: 99.30%
Epoch [14/20] Loss: 0.0076 Train Acc: 99.38%
Epoch [15/20] Loss: 0.0067 Train Acc: 99.41%
Epoch [16/20] Loss: 0.0057 Train Acc: 99.49%
Epoch [17/20] Loss: 0.0048 Train Acc: 99.57%
Epoch [18/20] Loss: 0.0045 Train Acc: 99.60%
Epoch [19/20] Loss: 0.0042 Train Acc: 99.64%
Epoch [20/20] Loss: 0.0039 Train Acc: 99.62%

Final Test Accuracy: 99.28%


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torchvision.models as models
import torch.nn.functional as F
import math

# ----------------------------
# Device Setup
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if device.type == "cuda":
    print("GPU Name:", torch.cuda.get_device_name(0))

# ----------------------------
# Data
# ----------------------------
transform = transforms.Compose([
    transforms.Resize(32),  # ResNet prefers larger input
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform
)

test_dataset = torchvision.datasets.MNIST(
    root='./data', train=False, download=True, transform=transform
)

train_loader = DataLoader(train_dataset, batch_size=128,
                          shuffle=True, num_workers=2, pin_memory=True)

test_loader = DataLoader(test_dataset, batch_size=128,
                         shuffle=False, num_workers=2, pin_memory=True)

# ----------------------------
# ResNet Backbone (Modified)
# ----------------------------
class ResNet_MNIST(nn.Module):
    def __init__(self, embedding_dim=128):
        super().__init__()

        self.backbone = models.resnet18(weights=None)

        # Modify first conv layer for 1-channel
        self.backbone.conv1 = nn.Conv2d(
            1, 64, kernel_size=7, stride=2, padding=3, bias=False
        )

        self.backbone.fc = nn.Linear(self.backbone.fc.in_features,
                                     embedding_dim)

    def forward(self, x):
        return self.backbone(x)


# ----------------------------
# ArcFace Layer
# ----------------------------
class ArcFace(nn.Module):
    def __init__(self, in_features, out_features, s=30.0, m=0.5):
        super().__init__()
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label):
        cosine = F.linear(F.normalize(input),
                          F.normalize(self.weight))
        theta = torch.acos(torch.clamp(cosine, -1+1e-7, 1-1e-7))
        target_logits = torch.cos(theta + self.m)

        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1,1), 1.0)

        output = (one_hot * target_logits) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output


# ----------------------------
# Model + ArcFace Head
# ----------------------------
embedding_dim = 128
num_classes = 10

backbone = ResNet_MNIST(embedding_dim).to(device)
arcface = ArcFace(embedding_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(list(backbone.parameters()) +
                       list(arcface.parameters()), lr=0.001)

scaler = torch.cuda.amp.GradScaler()

# ----------------------------
# Training
# ----------------------------
epochs = 15

for epoch in range(epochs):
    backbone.train()
    arcface.train()

    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            features = backbone(images)
            outputs = arcface(features, labels)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{epochs}] "
          f"Loss: {running_loss/len(train_loader):.4f} "
          f"Train Acc: {100*correct/total:.2f}%")

# ----------------------------
# Testing
# ----------------------------
backbone.eval()
arcface.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        features = backbone(images)
        outputs = arcface(features, labels)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"\nFinal Test Accuracy: {100*correct/total:.2f}%")


Using device: cuda
GPU Name: NVIDIA GeForce RTX 3050 6GB Laptop GPU


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch [1/15] Loss: 1.2583 Train Acc: 91.88%
Epoch [2/15] Loss: 0.4571 Train Acc: 96.88%
Epoch [3/15] Loss: 0.2982 Train Acc: 97.74%
Epoch [4/15] Loss: 0.2340 Train Acc: 98.14%
Epoch [5/15] Loss: 0.2380 Train Acc: 98.13%
Epoch [6/15] Loss: 0.2601 Train Acc: 97.87%
Epoch [7/15] Loss: 0.2617 Train Acc: 97.92%
Epoch [8/15] Loss: 0.2584 Train Acc: 97.87%
Epoch [9/15] Loss: 0.2549 Train Acc: 97.95%
Epoch [10/15] Loss: 0.2553 Train Acc: 97.92%
Epoch [11/15] Loss: 0.2554 Train Acc: 97.81%
Epoch [12/15] Loss: 0.2524 Train Acc: 97.79%
Epoch [13/15] Loss: 0.2615 Train Acc: 97.75%
Epoch [14/15] Loss: 0.2557 Train Acc: 97.77%
Epoch [15/15] Loss: 0.2565 Train Acc: 97.76%

Final Test Accuracy: 97.66%


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# ----------------------------
# Device Setup
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if device.type == "cuda":
    print("GPU Name:", torch.cuda.get_device_name(0))

# ----------------------------
# Data (CIFAR-10)
# ----------------------------
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5),
                         (0.5, 0.5, 0.5))
])

train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform
)

train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

# ----------------------------
# VGG-like Model for CIFAR-10
# ----------------------------
class VGG_CIFAR10(nn.Module):
    def __init__(self):
        super(VGG_CIFAR10, self).__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),  # changed to 3 channels
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 4 * 4, 512),  # changed size (32→16→8→4)
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

model = VGG_CIFAR10().to(device)

# ----------------------------
# Loss + Optimizer
# ----------------------------
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

scaler = torch.cuda.amp.GradScaler()

# ----------------------------
# Training
# ----------------------------
epochs = 10

for epoch in range(epochs):
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # One-hot encoding for BCE
        labels_onehot = torch.zeros(labels.size(0), 10, device=device)
        labels_onehot.scatter_(1, labels.unsqueeze(1), 1)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(images)
            loss = criterion(outputs, labels_onehot)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{epochs}] "
          f"Loss: {running_loss/len(train_loader):.4f} "
          f"Train Acc: {100*correct/total:.2f}%")

# ----------------------------
# Testing
# ----------------------------
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        outputs = model(images)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"\nFinal Test Accuracy: {100*correct/total:.2f}%")


Using device: cuda
GPU Name: NVIDIA GeForce RTX 3050 6GB Laptop GPU
Files already downloaded and verified
Files already downloaded and verified


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch [1/10] Loss: 0.3021 Train Acc: 22.54%
Epoch [2/10] Loss: 0.2064 Train Acc: 52.42%
Epoch [3/10] Loss: 0.1605 Train Acc: 65.30%
Epoch [4/10] Loss: 0.1301 Train Acc: 72.71%
Epoch [5/10] Loss: 0.1091 Train Acc: 77.81%
Epoch [6/10] Loss: 0.0923 Train Acc: 81.58%
Epoch [7/10] Loss: 0.0780 Train Acc: 84.71%
Epoch [8/10] Loss: 0.0655 Train Acc: 87.44%
Epoch [9/10] Loss: 0.0552 Train Acc: 89.43%
Epoch [10/10] Loss: 0.0449 Train Acc: 91.68%

Final Test Accuracy: 81.18%


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F

# ----------------------------
# Device Setup
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if device.type == "cuda":
    print("GPU Name:", torch.cuda.get_device_name(0))

# ----------------------------
# Data (CIFAR-10)
# ----------------------------
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5),
                         (0.5, 0.5, 0.5))
])

train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform
)

train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

# ----------------------------
# AlexNet for CIFAR-10
# ----------------------------
class AlexNet_CIFAR10(nn.Module):
    def __init__(self):
        super(AlexNet_CIFAR10, self).__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(),

            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 4 * 4, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x


# ----------------------------
# Focal Loss (Multi-class)
# ----------------------------
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()


model = AlexNet_CIFAR10().to(device)

criterion = FocalLoss(alpha=1, gamma=2)
optimizer = optim.SGD(model.parameters(),
                      lr=0.01,
                      momentum=0.9,
                      weight_decay=5e-4)

scaler = torch.cuda.amp.GradScaler()

# ----------------------------
# Training
# ----------------------------
epochs = 20

for epoch in range(epochs):
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{epochs}] "
          f"Loss: {running_loss/len(train_loader):.4f} "
          f"Train Acc: {100*correct/total:.2f}%")

# ----------------------------
# Testing
# ----------------------------
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        outputs = model(images)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"\nFinal Test Accuracy: {100*correct/total:.2f}%")


Using device: cuda
GPU Name: NVIDIA GeForce RTX 3050 6GB Laptop GPU
Files already downloaded and verified
Files already downloaded and verified


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch [1/20] Loss: 1.7854 Train Acc: 15.09%
Epoch [2/20] Loss: 1.2298 Train Acc: 33.60%
Epoch [3/20] Loss: 0.9987 Train Acc: 42.77%
Epoch [4/20] Loss: 0.8481 Train Acc: 49.85%
Epoch [5/20] Loss: 0.7546 Train Acc: 54.31%
Epoch [6/20] Loss: 0.6550 Train Acc: 59.36%
Epoch [7/20] Loss: 0.5890 Train Acc: 62.90%
Epoch [8/20] Loss: 0.5270 Train Acc: 66.31%
Epoch [9/20] Loss: 0.4766 Train Acc: 69.37%
Epoch [10/20] Loss: 0.4358 Train Acc: 71.52%
Epoch [11/20] Loss: 0.3906 Train Acc: 74.10%
Epoch [12/20] Loss: 0.3551 Train Acc: 75.88%
Epoch [13/20] Loss: 0.3271 Train Acc: 77.58%
Epoch [14/20] Loss: 0.2970 Train Acc: 78.93%
Epoch [15/20] Loss: 0.2696 Train Acc: 80.34%
Epoch [16/20] Loss: 0.2472 Train Acc: 81.76%
Epoch [17/20] Loss: 0.2248 Train Acc: 82.99%
Epoch [18/20] Loss: 0.1973 Train Acc: 84.60%
Epoch [19/20] Loss: 0.1784 Train Acc: 85.64%
Epoch [20/20] Loss: 0.1606 Train Acc: 86.59%

Final Test Accuracy: 78.45%


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torchvision.models as models
import torch.nn.functional as F

# ----------------------------
# Device Setup
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if device.type == "cuda":
    print("GPU Name:", torch.cuda.get_device_name(0))

# ----------------------------
# Data (CIFAR-10)
# ----------------------------
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5),
                         (0.5, 0.5, 0.5))
])

train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform
)

train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

# ----------------------------
# ResNet Backbone (CIFAR-10)
# ----------------------------
class ResNet_CIFAR10(nn.Module):
    def __init__(self, embedding_dim=128):
        super().__init__()

        self.backbone = models.resnet18(weights=None)

        # Adjust for CIFAR (remove large initial downsampling)
        self.backbone.conv1 = nn.Conv2d(
            3, 64, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.backbone.maxpool = nn.Identity()

        self.backbone.fc = nn.Linear(
            self.backbone.fc.in_features,
            embedding_dim
        )

    def forward(self, x):
        return self.backbone(x)


# ----------------------------
# ArcFace Layer
# ----------------------------
class ArcFace(nn.Module):
    def __init__(self, in_features, out_features, s=30.0, m=0.5):
        super().__init__()
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, labels):
        cosine = F.linear(F.normalize(input),
                          F.normalize(self.weight))
        cosine = cosine.clamp(-1 + 1e-7, 1 - 1e-7)

        theta = torch.acos(cosine)
        target_logits = torch.cos(theta + self.m)

        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, labels.view(-1, 1), 1.0)

        output = one_hot * target_logits + (1 - one_hot) * cosine
        output *= self.s

        return output


# ----------------------------
# Model + ArcFace Head
# ----------------------------
embedding_dim = 128
num_classes = 10

backbone = ResNet_CIFAR10(embedding_dim).to(device)
arcface = ArcFace(embedding_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(
    list(backbone.parameters()) +
    list(arcface.parameters()),
    lr=0.001
)

scaler = torch.cuda.amp.GradScaler()

# ----------------------------
# Training
# ----------------------------
epochs = 15

for epoch in range(epochs):
    backbone.train()
    arcface.train()

    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            features = backbone(images)
            outputs = arcface(features, labels)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{epochs}] "
          f"Loss: {running_loss/len(train_loader):.4f} "
          f"Train Acc: {100*correct/total:.2f}%")

# ----------------------------
# Testing
# ----------------------------
backbone.eval()
arcface.eval()

correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        features = backbone(images)
        outputs = arcface(features, labels)

        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"\nFinal Test Accuracy: {100*correct/total:.2f}%")


Using device: cuda
GPU Name: NVIDIA GeForce RTX 3050 6GB Laptop GPU
Files already downloaded and verified
Files already downloaded and verified


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch [1/15] Loss: 7.6165 Train Acc: 30.30%
Epoch [2/15] Loss: nan Train Acc: 45.71%
Epoch [3/15] Loss: nan Train Acc: 10.00%
Epoch [4/15] Loss: nan Train Acc: 10.00%
Epoch [5/15] Loss: nan Train Acc: 10.00%
Epoch [6/15] Loss: nan Train Acc: 10.00%
Epoch [7/15] Loss: nan Train Acc: 10.00%
Epoch [8/15] Loss: nan Train Acc: 10.00%
Epoch [9/15] Loss: nan Train Acc: 10.00%
Epoch [10/15] Loss: nan Train Acc: 10.00%
Epoch [11/15] Loss: nan Train Acc: 10.00%
Epoch [12/15] Loss: nan Train Acc: 10.00%
Epoch [13/15] Loss: nan Train Acc: 10.00%
Epoch [14/15] Loss: nan Train Acc: 10.00%
Epoch [15/15] Loss: nan Train Acc: 10.00%

Final Test Accuracy: 10.00%
