In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm import tqdm
from tqdm import tqdm 

In [5]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.2860,), (0.3530,))  # Fashion-MNIST mean and std
])

In [8]:
train_dataset = torchvision.datasets.FashionMNIST(
    root='./data', 
    train=True, 
    download=True, 
    transform=transform
)

test_dataset = torchvision.datasets.FashionMNIST(
    root='./data', 
    train=False, 
    download=True, 
    transform=transform
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data\FashionMNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 26.4M/26.4M [00:04<00:00, 5.43MB/s]


Extracting ./data\FashionMNIST\raw\train-images-idx3-ubyte.gz to ./data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data\FashionMNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 29.5k/29.5k [00:00<00:00, 217kB/s]


Extracting ./data\FashionMNIST\raw\train-labels-idx1-ubyte.gz to ./data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 4.42M/4.42M [00:02<00:00, 1.82MB/s]


Extracting ./data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz to ./data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 5.15k/5.15k [00:00<00:00, 5.12MB/s]

Extracting ./data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\FashionMNIST\raw






In [9]:
batch_size=32
train_loader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=batch_size,shuffle=False)

In [22]:
print(train_dataset.targets.unique())

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


In [23]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

In [32]:
class TeacherResNet(nn.Module):
    def __init__(self):
        super().__init__()
        #input=1x28x28
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1=nn.BatchNorm2d(32)
        #output=32x28x28

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2=nn.BatchNorm2d(64)
        self.conv3=nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.bn3=nn.BatchNorm2d(64)
        #output=64x28x28

        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        #output=64x14x14

        self.conv4 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn4=nn.BatchNorm2d(128)
        self.conv5=nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
        self.bn5=nn.BatchNorm2d(128)

        self.pool2=nn.MaxPool2d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(128 * 7 * 7, 256)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        # Input x shape: [batch_size, 1, 28, 28]
        x = F.relu(self.bn1(self.conv1(x)))  # [batch_size, 32, 28, 28]
        
        residual = x
        x = F.relu(self.bn2(self.conv2(x)))  # [batch_size, 64, 28, 28]
        x = self.bn3(self.conv3(x))  # [batch_size, 64, 28, 28]
        
    
        residual = F.pad(residual, (0, 0, 0, 0, 16, 16))  # Pad channels from 32 to 64
        x = F.relu(x + residual)  # [batch_size, 64, 28, 28]
        
        x = self.pool1(x)  # [batch_size, 64, 14, 14]
        

        residual = x
        x = F.relu(self.bn4(self.conv4(x)))  # [batch_size, 128, 14, 14]
        x = self.bn5(self.conv5(x))  # [batch_size, 128, 14, 14]
        

        residual = F.pad(residual, (0, 0, 0, 0, 32, 32))  # Pad channels from 64 to 128
        x = F.relu(x + residual)  # [batch_size, 128, 14, 14]
        
        x = self.pool2(x)  # [batch_size, 128, 7, 7]
        
        # Flatten
        x = torch.flatten(x, 1)  # [batch_size, 128*7*7] = [batch_size, 6272]
        
        # Fully connected layers
        x = F.relu(self.fc1(x))  # [batch_size, 256]
        x = self.dropout1(x)
        x = self.fc2(x)  # [batch_size, 10]
        
        return x


In [33]:
class StudentMobileNet(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Depthwise separable convolution block function
        def depthwise_separable_conv(in_channels, out_channels, stride=1):
            return nn.Sequential(
                # Depthwise convolution
                nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=stride, padding=1, groups=in_channels),
                nn.BatchNorm2d(in_channels),
                nn.ReLU(inplace=True),
                
                # Pointwise convolution
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True)
            )
        
        # Initial convolution
        # Input: [batch_size, 1, 28, 28]
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        # Output: [batch_size, 16, 28, 28]
        
        # Depthwise separable convolution blocks
        self.conv_dw1 = depthwise_separable_conv(16, 32, stride=2)
        # Output: [batch_size, 32, 14, 14]
        
        self.conv_dw2 = depthwise_separable_conv(32, 64, stride=1)
        # Output: [batch_size, 64, 14, 14]
        
        self.conv_dw3 = depthwise_separable_conv(64, 64, stride=2)
        # Output: [batch_size, 64, 7, 7]
        
        # Global average pooling
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        # Output: [batch_size, 64, 1, 1]
        
        # Classifier
        self.fc = nn.Linear(64, 10)  # 10 classes for Fashion-MNIST
    
    def forward(self, x):
        # Input x shape: [batch_size, 1, 28, 28]
        x = F.relu(self.bn1(self.conv1(x)))  # [batch_size, 16, 28, 28]
        
        x = self.conv_dw1(x)  # [batch_size, 32, 14, 14]
        x = self.conv_dw2(x)  # [batch_size, 64, 14, 14]
        x = self.conv_dw3(x)  # [batch_size, 64, 7, 7]
        
        x = self.global_pool(x)  # [batch_size, 64, 1, 1]
        x = x.view(x.size(0), -1)  # [batch_size, 64]
        x = self.fc(x)  # [batch_size, 10]
        
        return x


In [35]:
teacher_model = TeacherResNet()
student_model = StudentMobileNet()

In [36]:
from torchinfo import summary
print("Teacher Model Summary:")
summary(model=teacher_model, input_size=(batch_size, 1, 28, 28))
print("\nStudent Model Summary:")
summary(model=student_model, input_size=(batch_size, 1, 28, 28))

Teacher Model Summary:

Student Model Summary:


Layer (type:depth-idx)                   Output Shape              Param #
StudentMobileNet                         [64, 10]                  --
├─Conv2d: 1-1                            [64, 16, 28, 28]          160
├─BatchNorm2d: 1-2                       [64, 16, 28, 28]          32
├─Sequential: 1-3                        [64, 32, 14, 14]          --
│    └─Conv2d: 2-1                       [64, 16, 14, 14]          160
│    └─BatchNorm2d: 2-2                  [64, 16, 14, 14]          32
│    └─ReLU: 2-3                         [64, 16, 14, 14]          --
│    └─Conv2d: 2-4                       [64, 32, 14, 14]          544
│    └─BatchNorm2d: 2-5                  [64, 32, 14, 14]          64
│    └─ReLU: 2-6                         [64, 32, 14, 14]          --
├─Sequential: 1-4                        [64, 64, 14, 14]          --
│    └─Conv2d: 2-7                       [64, 32, 14, 14]          320
│    └─BatchNorm2d: 2-8                  [64, 32, 14, 14]          64
│    └─ReLU

In [39]:
print(teacher_model.parameters)

<bound method Module.parameters of TeacherResNet(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv5): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_si

In [43]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm

def train_model(model, train_loader, val_loader, epochs=10, lr=1e-4, device="cuda", run_name="Model"):
    os.makedirs(f"checkpoints", exist_ok=True)
    
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # Track best validation accuracy
    best_val_acc = 0
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        # Use tqdm for progress bar
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Train]')
        
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            # Update progress bar
            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{100 * correct / total:.2f}%'
            })
        
        train_acc = 100 * correct / total
        train_loss = total_loss / len(train_loader)
        
        # Validation step
        val_acc, val_loss = evaluate_model(model, val_loader, criterion, device)
        
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")
        
        # Save model if it's the best so far
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            model_path = f"checkpoints/{run_name}_best.pt"
            torch.save(model.state_dict(), model_path)
            print(f"New best model saved at: {model_path} (Val Acc: {val_acc:.2f}%)")
        
        # Save model every 5 epochs
        if (epoch + 1) % 5 == 0:
            model_path = f"checkpoints/{run_name}_epoch{epoch+1}.pt"
            torch.save(model.state_dict(), model_path)
            print(f"Checkpoint saved at: {model_path}")
    
    print(f"Training completed. Best validation accuracy: {best_val_acc:.2f}%")
    return model

def evaluate_model(model, val_loader, criterion, device="cuda"):
    model.eval()
    
    total_loss = 0
    correct = 0
    total = 0
    
    progress_bar = tqdm(val_loader, desc='Evaluation', leave=False)
    
    with torch.no_grad():
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{100 * correct / total:.2f}%'
            })
    
    val_acc = 100 * correct / total
    val_loss = total_loss / len(val_loader)
    return val_acc, val_loss

def test_model(model, test_loader, device="cuda"):
    """Evaluate model on test set and print results per class"""
    model.eval()
    
    class_correct = [0] * 10
    class_total = [0] * 10
    
    # Fashion MNIST classes
    classes = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
    
    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc='Testing'):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            
            # Collect per-class accuracy
            for i in range(labels.size(0)):
                label = labels[i]
                class_correct[label] += (predicted[i] == label).item()
                class_total[label] += 1
    
    # Print per-class accuracy
    print("\nTest Accuracy per class:")
    for i in range(10):
        accuracy = 100 * class_correct[i] / class_total[i]
        print(f'{classes[i]}: {accuracy:.2f}%')
    
    # Overall accuracy
    total_correct = sum(class_correct)
    total_samples = sum(class_total)
    overall_accuracy = 100 * total_correct / total_samples
    print(f'\nOverall Test Accuracy: {overall_accuracy:.2f}%')
    
    return overall_accuracy

# Main execution function to train and evaluate both models
def main(train_loader, test_loader, device="cuda", epochs=10):
    print("=" * 50)
    print("Training StudentMobileNet Model")
    print("=" * 50)
    
    student_model = StudentMobileNet()
    student_model = train_model(
        student_model, 
        train_loader, 
        test_loader, 
        epochs=epochs, 
        lr=1e-3,
        device=device, 
        run_name="StudentMobileNet"
    )
    
    print("\nEvaluating StudentMobileNet on test set:")
    student_acc = test_model(student_model, test_loader, device)
    
    print("\n" + "=" * 50)
    print("Training TeacherResNet Model")
    print("=" * 50)
    
    teacher_model = TeacherResNet()
    teacher_model = train_model(
        teacher_model, 
        train_loader, 
        test_loader, 
        epochs=epochs, 
        lr=1e-3,
        device=device, 
        run_name="TeacherResNet"
    )
    
    print("\nEvaluating TeacherResNet on test set:")
    teacher_acc = test_model(teacher_model, test_loader, device)
    
    print("\n" + "=" * 50)
    print("Model Comparison")
    print("=" * 50)
    print(f"StudentMobileNet accuracy: {student_acc:.2f}%")
    print(f"TeacherResNet accuracy: {teacher_acc:.2f}%")
    print(f"Difference: {teacher_acc - student_acc:.2f}%")
    
    return student_model, teacher_model

# To run the training and evaluation:
device = "cuda" if torch.cuda.is_available() else "cpu"
student_model, teacher_model = main(train_loader, test_loader, device=device, epochs=10)

Training StudentMobileNet Model


Epoch 1/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 1/10 | Train Loss: 0.8214 | Train Acc: 73.62% | Val Loss: 0.4750 | Val Acc: 83.54%
New best model saved at: checkpoints/StudentMobileNet_best.pt (Val Acc: 83.54%)


Epoch 2/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 2/10 | Train Loss: 0.4182 | Train Acc: 85.39% | Val Loss: 0.3912 | Val Acc: 86.10%
New best model saved at: checkpoints/StudentMobileNet_best.pt (Val Acc: 86.10%)


Epoch 3/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 3/10 | Train Loss: 0.3615 | Train Acc: 87.17% | Val Loss: 0.3689 | Val Acc: 87.11%
New best model saved at: checkpoints/StudentMobileNet_best.pt (Val Acc: 87.11%)


Epoch 4/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 4/10 | Train Loss: 0.3343 | Train Acc: 88.00% | Val Loss: 0.3403 | Val Acc: 87.87%
New best model saved at: checkpoints/StudentMobileNet_best.pt (Val Acc: 87.87%)


Epoch 5/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 5/10 | Train Loss: 0.3176 | Train Acc: 88.56% | Val Loss: 0.3345 | Val Acc: 88.39%
New best model saved at: checkpoints/StudentMobileNet_best.pt (Val Acc: 88.39%)
Checkpoint saved at: checkpoints/StudentMobileNet_epoch5.pt


Epoch 6/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 6/10 | Train Loss: 0.3034 | Train Acc: 89.08% | Val Loss: 0.3306 | Val Acc: 88.36%


Epoch 7/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 7/10 | Train Loss: 0.2915 | Train Acc: 89.53% | Val Loss: 0.3219 | Val Acc: 88.55%
New best model saved at: checkpoints/StudentMobileNet_best.pt (Val Acc: 88.55%)


Epoch 8/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 8/10 | Train Loss: 0.2858 | Train Acc: 89.65% | Val Loss: 0.3214 | Val Acc: 88.72%
New best model saved at: checkpoints/StudentMobileNet_best.pt (Val Acc: 88.72%)


Epoch 9/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 9/10 | Train Loss: 0.2783 | Train Acc: 89.94% | Val Loss: 0.3102 | Val Acc: 88.88%
New best model saved at: checkpoints/StudentMobileNet_best.pt (Val Acc: 88.88%)


Epoch 10/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 10/10 | Train Loss: 0.2709 | Train Acc: 90.38% | Val Loss: 0.3029 | Val Acc: 89.16%
New best model saved at: checkpoints/StudentMobileNet_best.pt (Val Acc: 89.16%)
Checkpoint saved at: checkpoints/StudentMobileNet_epoch10.pt
Training completed. Best validation accuracy: 89.16%

Evaluating StudentMobileNet on test set:


Testing:   0%|          | 0/157 [00:00<?, ?it/s]


Test Accuracy per class:
T-shirt/top: 85.70%
Trouser: 97.20%
Pullover: 85.70%
Dress: 89.80%
Coat: 79.90%
Sandal: 97.10%
Shirt: 68.00%
Sneaker: 93.30%
Bag: 98.10%
Ankle boot: 96.80%

Overall Test Accuracy: 89.16%

Training TeacherResNet Model


Epoch 1/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 1/10 | Train Loss: 0.5890 | Train Acc: 79.65% | Val Loss: 0.3212 | Val Acc: 88.23%
New best model saved at: checkpoints/TeacherResNet_best.pt (Val Acc: 88.23%)


Epoch 2/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 2/10 | Train Loss: 0.3500 | Train Acc: 87.42% | Val Loss: 0.2608 | Val Acc: 90.29%
New best model saved at: checkpoints/TeacherResNet_best.pt (Val Acc: 90.29%)


Epoch 3/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 3/10 | Train Loss: 0.2922 | Train Acc: 89.66% | Val Loss: 0.2352 | Val Acc: 91.37%
New best model saved at: checkpoints/TeacherResNet_best.pt (Val Acc: 91.37%)


Epoch 4/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 4/10 | Train Loss: 0.2582 | Train Acc: 90.98% | Val Loss: 0.2378 | Val Acc: 91.51%
New best model saved at: checkpoints/TeacherResNet_best.pt (Val Acc: 91.51%)


Epoch 5/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 5/10 | Train Loss: 0.2284 | Train Acc: 91.97% | Val Loss: 0.2208 | Val Acc: 92.50%
New best model saved at: checkpoints/TeacherResNet_best.pt (Val Acc: 92.50%)
Checkpoint saved at: checkpoints/TeacherResNet_epoch5.pt


Epoch 6/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 6/10 | Train Loss: 0.2091 | Train Acc: 92.64% | Val Loss: 0.2027 | Val Acc: 92.69%
New best model saved at: checkpoints/TeacherResNet_best.pt (Val Acc: 92.69%)


Epoch 7/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 7/10 | Train Loss: 0.1854 | Train Acc: 93.43% | Val Loss: 0.1981 | Val Acc: 93.06%
New best model saved at: checkpoints/TeacherResNet_best.pt (Val Acc: 93.06%)


Epoch 8/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 8/10 | Train Loss: 0.1693 | Train Acc: 94.00% | Val Loss: 0.1816 | Val Acc: 93.46%
New best model saved at: checkpoints/TeacherResNet_best.pt (Val Acc: 93.46%)


Epoch 9/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 9/10 | Train Loss: 0.1513 | Train Acc: 94.56% | Val Loss: 0.2160 | Val Acc: 93.30%


Epoch 10/10 [Train]:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch 10/10 | Train Loss: 0.1363 | Train Acc: 95.08% | Val Loss: 0.1934 | Val Acc: 93.59%
New best model saved at: checkpoints/TeacherResNet_best.pt (Val Acc: 93.59%)
Checkpoint saved at: checkpoints/TeacherResNet_epoch10.pt
Training completed. Best validation accuracy: 93.59%

Evaluating TeacherResNet on test set:


Testing:   0%|          | 0/157 [00:00<?, ?it/s]


Test Accuracy per class:
T-shirt/top: 88.00%
Trouser: 98.20%
Pullover: 89.90%
Dress: 95.20%
Coat: 91.20%
Sandal: 99.30%
Shirt: 80.50%
Sneaker: 98.20%
Bag: 99.10%
Ankle boot: 96.30%

Overall Test Accuracy: 93.59%

Model Comparison
StudentMobileNet accuracy: 89.16%
TeacherResNet accuracy: 93.59%
Difference: 4.43%
