In [None]:
!pip install -U torch torchvision



# Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import random

# Speed-oriented backend flags for NVIDIA GPUs (set once, at import time):
#   - cudnn.benchmark: let cuDNN auto-tune its convolution algorithm choice.
#   - allow_tf32 (matmul / cudnn): permit TF32 math for CUDA matmuls and cuDNN
#     convolutions, trading a little precision for throughput.
# NOTE(review): no random seed is set anywhere in this notebook, so runs are
# not reproducible as written.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [None]:
class BasicBlock(nn.Module):
    """Two-convolution residual block (3x3 conv -> BN -> ReLU -> 3x3 conv -> BN).

    The block output is ``relu(main_path(x) + shortcut(x))``. The shortcut is
    an empty ``nn.Sequential`` (identity) unless the block changes resolution
    or channel count, in which case it is a strided 1x1 convolution followed
    by batch norm.

    Args:
        in_planes: number of input channels.
        planes: number of channels produced by both 3x3 convolutions.
        stride: stride of the first convolution (and of the projection
            shortcut, when one is needed).
    """

    # Ratio between the block's output channels and ``planes``.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path. Creation order is kept (conv1, bn1, conv2, bn2, shortcut)
        # so parameter initialisation and state-dict layout stay the same.
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)

        needs_projection = (stride != 1) or (in_planes != out_planes)
        if needs_projection:
            # Match the main path's shape with a strided 1x1 convolution.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            # Empty Sequential acts as the identity.
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Apply the residual block to ``x`` and return the activated sum."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class ResNet(nn.Module):
    """Four-stage residual network with a 3x3 stem (no max-pool).

    Channel widths are ``initial_channels * (1, 2, 4, 8)`` across the four
    stages; stages 2-4 halve the spatial resolution with a stride-2 first
    block.

    Args:
        block: residual block class. It must expose an ``expansion`` class
            attribute and accept ``(in_planes, planes, stride)``.
        num_blocks: sequence of four ints, the number of blocks per stage.
        initial_channels: width of the stem and of the first stage.
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, initial_channels=64, num_classes=10):
        super(ResNet, self).__init__()
        # Running input width consumed and updated by _make_layer.
        self.in_planes = initial_channels

        self.conv1 = nn.Conv2d(3, initial_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(initial_channels)

        # Stacking layers
        self.layer1 = self._make_layer(block, initial_channels,     num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, initial_channels*2,   num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, initial_channels*4,   num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, initial_channels*8,   num_blocks[3], stride=2)

        self.linear = nn.Linear(initial_channels*8*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the rest use 1.

        Updates ``self.in_planes`` so the next block/stage sees the right
        input width.
        """
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for s in strides:
            layers.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling. For 32x32 CIFAR inputs the final feature map
        # is 4x4, so this equals the previous fixed ``avg_pool2d(out, 4)``;
        # unlike the fixed window it also yields a 1x1 map for other input
        # resolutions, keeping the feature size compatible with ``self.linear``.
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNetTeacher():
    """Build the teacher network.

    Returns a ``ResNet`` of ``BasicBlock``s with [3, 4, 6, 3] blocks per
    stage, 64 initial channels and a 10-way classifier.
    """
    stage_depths = [3, 4, 6, 3]
    return ResNet(BasicBlock, stage_depths, initial_channels=64, num_classes=10)


def ResNetStudent():
    """Build the student network.

    Returns a ``ResNet`` of ``BasicBlock``s with [2, 2, 2, 1] blocks per
    stage, 56 initial channels and a 10-way classifier.
    """
    stage_depths = [2, 2, 2, 1]
    return ResNet(BasicBlock, stage_depths, initial_channels=56, num_classes=10)


In [None]:
def ResNetTeacher():
    """
    Factory for the larger "teacher" model.

    Four stages of BasicBlocks with depths [3, 4, 6, 3] and
    initial_channels=64, i.e. deeper and wider than the student
    configuration used in this notebook.
    """
    depths_per_stage = [3, 4, 6, 3]
    teacher = ResNet(BasicBlock, depths_per_stage, initial_channels=64, num_classes=10)
    return teacher


In [None]:
def ResNetStudent():
    """
    Factory for the compact "student" model.

    Four stages of BasicBlocks with depths [2, 2, 2, 1] and
    initial_channels=56.
    """
    depths_per_stage = [2, 2, 2, 1]
    student = ResNet(BasicBlock, depths_per_stage, initial_channels=56, num_classes=10)
    return student


In [None]:
def distillation_loss(student_logits, teacher_logits, labels,
                      alpha=0.5, T=4.0, label_smoothing=0.0):
    """
    Knowledge-distillation objective: ``alpha * CE + (1 - alpha) * T^2 * KL``.

    student_logits: Student's raw output, shape (batch, classes)
    teacher_logits: Teacher's raw output. It is detached here, so no gradient
                    ever flows back into the teacher even if the caller did
                    not compute it under ``torch.no_grad()``.
    labels: Ground-truth class indices
    alpha: weight of the hard-label cross-entropy term; (1 - alpha) weights
           the distillation (KL) term
    T: softmax temperature applied to both student and teacher logits
    label_smoothing: optional label smoothing for the cross-entropy term

    Returns a scalar loss tensor.
    """
    # Hard-label term: CE with optional label smoothing.
    ce = nn.CrossEntropyLoss(label_smoothing=label_smoothing)(student_logits, labels)

    # The teacher only provides targets; enforce the documented
    # "no gradient" contract instead of relying on the caller.
    soft_targets = F.softmax(teacher_logits.detach() / T, dim=1)

    # Soft-label term: KL divergence between the temperature-softened
    # distributions. KLDivLoss expects log-probabilities as input and
    # probabilities as target; the T*T factor rescales the term after the
    # logits were divided by T.
    kl = nn.KLDivLoss(reduction='batchmean')(
        F.log_softmax(student_logits / T, dim=1),
        soft_targets
    ) * (T * T)

    return alpha * ce + (1 - alpha) * kl


In [None]:
def train_teacher(
    teacher_model,
    trainloader,
    valloader,
    num_epochs=100,
    lr=0.1,
    label_smoothing=0.0
):
    """
    Standard supervised classification training for the teacher model.

    Uses SGD (momentum 0.9, weight decay 5e-4) with a cosine-annealed learning
    rate stepped once per epoch, gradient-norm clipping at 1.0, and float16
    mixed precision when a CUDA device is available. After every epoch the
    model is scored on ``valloader`` via ``validate`` and a summary line is
    printed.

    Args:
        teacher_model: the ``nn.Module`` to train (moved to the device and
            converted to channels-last in place).
        trainloader: iterable of ``(inputs, labels)`` training batches.
        valloader: iterable of ``(inputs, labels)`` validation batches.
        num_epochs: number of passes over ``trainloader``.
        lr: initial SGD learning rate.
        label_smoothing: label smoothing passed to the cross-entropy loss.

    Returns:
        The trained model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Mixed precision is only meaningful on CUDA; on CPU run in full precision
    # without requesting (and being warned about) CUDA autocast.
    use_amp = device.type == "cuda"

    model = teacher_model.to(device).to(memory_format=torch.channels_last)
    model.train()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    # torch.amp.* replaces the deprecated torch.cuda.amp.* entry points.
    scaler = torch.amp.GradScaler(device.type, enabled=use_amp)

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct, total, num_batches = 0, 0, 0

        for inputs, labels in trainloader:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast(device_type=device.type, dtype=torch.float16,
                                    enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            # Unscale before clipping so the threshold applies to true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # optional
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)
            num_batches += 1

        scheduler.step()

        # Guard the averages so an empty loader reports zeros instead of raising.
        avg_loss = running_loss / max(num_batches, 1)
        train_acc = 100. * correct / max(total, 1)
        val_acc = validate(model, valloader, device)
        print(f"[Epoch {epoch+1}/{num_epochs}] Loss: {avg_loss:.4f} | "
              f"Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}%")

    return model


# Cell-execution marker: printed once this cell has run, i.e. train_teacher is defined.
print("Teacher training function ready.")


Teacher training function ready.


In [None]:
def train_distilled(teacher, student, trainloader, valloader,
                    num_epochs=250, alpha=0.5, T=4.0,
                    lr=0.1, label_smoothing=0.1):
    """
    Train ``student`` to match a frozen ``teacher`` via knowledge distillation.

    Each batch is run through the teacher under ``torch.no_grad()`` and
    through the student; the student is optimised on ``distillation_loss``.
    Uses SGD (momentum 0.9, weight decay 5e-4), a cosine-annealed learning
    rate stepped once per epoch, gradient-norm clipping at 1.0, and float16
    mixed precision for the student when a CUDA device is available. After
    every epoch the student is scored on ``valloader`` via ``validate`` and a
    summary line is printed.

    Args:
        teacher: trained teacher ``nn.Module`` (put in eval mode here).
        student: student ``nn.Module`` to train.
        trainloader: iterable of ``(inputs, labels)`` training batches.
        valloader: iterable of ``(inputs, labels)`` validation batches.
        num_epochs: number of passes over ``trainloader``.
        alpha: weight of the hard-label term in ``distillation_loss``.
        T: distillation temperature.
        lr: initial SGD learning rate.
        label_smoothing: label smoothing for the hard-label term.

    Returns:
        The trained student model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Mixed precision is only meaningful on CUDA; on CPU run in full precision
    # without requesting (and being warned about) CUDA autocast.
    use_amp = device.type == "cuda"

    # Move teacher to device, set eval mode
    teacher = teacher.to(device)
    teacher.eval()

    # Move student to device, set train mode
    student = student.to(device).to(memory_format=torch.channels_last)
    student.train()

    optimizer = torch.optim.SGD(student.parameters(), lr=lr,
                                momentum=0.9, weight_decay=5e-4)
    scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

    # torch.amp.* replaces the deprecated torch.cuda.amp.* entry points.
    scaler = torch.amp.GradScaler(device.type, enabled=use_amp)

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct, total, num_batches = 0, 0, 0

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)

            # Teacher targets: computed outside autocast and without gradients.
            with torch.no_grad():
                teacher_outputs = teacher(inputs)

            with torch.amp.autocast(device_type=device.type, dtype=torch.float16,
                                    enabled=use_amp):
                student_outputs = student(inputs)
                loss = distillation_loss(
                    student_outputs,
                    teacher_outputs,
                    labels,
                    alpha=alpha,
                    T=T,
                    label_smoothing=label_smoothing
                )

            scaler.scale(loss).backward()

            # Gradient clipping (optional); unscale first so the threshold
            # applies to the true gradient norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(student.parameters(), 1.0)

            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()
            _, predicted = student_outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)
            num_batches += 1

        scheduler.step()

        # Guard the averages so an empty loader reports zeros instead of raising.
        avg_loss = running_loss / max(num_batches, 1)
        train_acc = 100. * correct / max(total, 1)
        val_acc   = validate(student, valloader, device)
        print(f"[Epoch {epoch+1}/{num_epochs}] Loss: {avg_loss:.4f} | "
              f"Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}%")

    return student


In [None]:
def validate(net, dataloader, device):
    """Return the top-1 accuracy (in percent) of ``net`` on ``dataloader``.

    The model is evaluated in eval mode under ``torch.no_grad()``; afterwards
    it is put back into whichever mode (train or eval) it was in on entry,
    even if evaluation raises.

    Args:
        net: the ``nn.Module`` to evaluate.
        dataloader: iterable of ``(inputs, labels)`` batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a float in [0, 100]; ``0.0`` if the loader yields no
        samples.
    """
    was_training = net.training
    net.eval()
    correct, total = 0, 0
    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = net(inputs)
                _, predicted = outputs.max(1)
                correct += predicted.eq(labels).sum().item()
                total += labels.size(0)
    finally:
        # Restore the caller's mode rather than forcing train mode.
        net.train(was_training)

    if total == 0:
        return 0.0
    return 100. * correct / total

In [None]:
def get_dataloaders(batch_size=512, val_ratio=0.1):
    """Build CIFAR-10 train / validation / test ``DataLoader``s.

    The official training set is split at random into train and validation
    subsets. Training samples go through the augmentation pipeline (random
    crop, horizontal flip, rotation); validation and test samples are only
    converted to tensors and normalised, so validation accuracy is measured
    on un-augmented images.

    Args:
        batch_size: batch size used for all three loaders.
        val_ratio: fraction of the training set held out for validation.

    Returns:
        ``(trainloader, valloader, testloader)``.
    """
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616)),
    ])

    # Two views of the same training images: one augmented (for training),
    # one deterministic (for validation). The first call downloads the data
    # if needed, so the second can reuse the files on disk.
    cifar_train_aug = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform_train
    )
    cifar_train_plain = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=False, transform=transform_test
    )

    train_size = int((1 - val_ratio) * len(cifar_train_aug))
    val_size = len(cifar_train_aug) - train_size
    trainset, val_split = random_split(cifar_train_aug, [train_size, val_size])
    # Reuse the held-out indices, but read them through the un-augmented view;
    # otherwise the validation subset would inherit the training augmentations.
    valset = torch.utils.data.Subset(cifar_train_plain, val_split.indices)

    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True,  num_workers=2)
    valloader   = DataLoader(valset,  batch_size=batch_size, shuffle=False, num_workers=2)

    # CIFAR10 test
    testset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform_test
    )
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

    return trainloader, valloader, testloader


In [None]:
def unpickle(file):
    """Load and return the object pickled at path ``file``.

    The file is read with ``encoding='bytes'``, so string keys written by
    Python 2 come back as ``bytes``.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this
    on files from a trusted source.
    """
    with open(file, mode='rb') as stream:
        return pickle.load(stream, encoding='bytes')

def load_eval(filename):
    """
    Read the unlabeled evaluation set from a pickle file.

    The file is expected to hold a dict with the byte-string keys
    ``b'ids'`` and ``b'data'``. Returns ``(ids, imgs)`` where ``imgs`` is the
    ``b'data'`` array cast to ``uint8``.
    """
    payload = unpickle(filename)
    sample_ids = payload[b'ids']
    # NOTE(review): the array layout is not checked here; downstream code
    # (Image.fromarray per item) presumably needs (N, H, W, 3) - confirm.
    images = payload[b'data'].astype("uint8")
    return sample_ids, images


In [None]:
from PIL import Image

class EvalDataset(Dataset):
    """Dataset of unlabeled images paired with their ids.

    Args:
        ids: indexable collection of sample ids, aligned with ``images``.
        images: indexable collection of image arrays; each item is handed to
            ``Image.fromarray``.
        transform: optional callable applied to the PIL image.

    Each item is the tuple ``(img_id, img)``.
    """

    def __init__(self, ids, images, transform=None):
        self.ids, self.images, self.transform = ids, images, transform

    def __len__(self):
        # The dataset length is driven by the image collection.
        return len(self.images)

    def __getitem__(self, idx):
        sample_id = self.ids[idx]
        picture = Image.fromarray(self.images[idx])  # shape must be (H,W,3)
        if not self.transform:
            return sample_id, picture
        return sample_id, self.transform(picture)


In [None]:
def generate_submission(model, submission_loader, device, out_csv='submission.csv'):
    """Predict a class for every sample in ``submission_loader`` and save a CSV.

    Args:
        model: classifier; it is switched to eval mode and left there.
        submission_loader: iterable of ``(img_ids, inputs)`` batches.
            ``img_ids`` may be a tensor (numeric ids after default collation)
            or a plain sequence (e.g. string ids).
        device: device the inputs are moved to before the forward pass.
        out_csv: path of the CSV file to write.

    The CSV has the columns ``ID`` and ``Labels`` (arg-max class index), one
    row per sample, in loader order.
    """
    model.eval()
    all_ids, all_preds = [], []

    with torch.no_grad():
        for img_ids, inputs in submission_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            if isinstance(img_ids, torch.Tensor):
                all_ids.extend(img_ids.cpu().numpy())
            else:
                # Non-numeric ids are collated into a list/tuple, which has
                # no .cpu(); take them as they are.
                all_ids.extend(img_ids)
            all_preds.extend(predicted.cpu().numpy())

    df = pd.DataFrame({'ID': all_ids, 'Labels': all_preds})
    df.to_csv(out_csv, index=False)
    print(f"Saved predictions to {out_csv}")


In [None]:
# Colab-only: mount Google Drive at /content/drive. The submission code later
# reads 'drive/MyDrive/cifar_test_nolabel.pkl' relative to the working directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Module-level device, used by submiss(); main() resolves its own copy.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _submit_unlabeled(student, device,
                      pkl_path='drive/MyDrive/cifar_test_nolabel.pkl',
                      out_csv='submission.csv'):
    """Predict the unlabeled evaluation set with ``student`` and write a CSV.

    If ``pkl_path`` does not exist, a notice is printed and nothing is
    written. Only the file load is guarded, so unrelated errors further down
    the pipeline are not mistaken for a missing file.
    """
    try:
        ids, imgs = load_eval(pkl_path)
    except FileNotFoundError:
        print("No unlabeled .pkl file found. Skipping submission step.")
        return

    # Same deterministic preprocessing as the CIFAR-10 test split.
    transform_unlabeled = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616)),
    ])
    eval_dataset = EvalDataset(ids, imgs, transform_unlabeled)
    eval_loader  = DataLoader(eval_dataset, batch_size=2048, shuffle=False)

    print("\n=== Generating Submission ===")
    generate_submission(student, eval_loader, device, out_csv=out_csv)


def main():
    """Full pipeline: train the teacher, distil the student, evaluate, submit.

    Steps: build the CIFAR-10 loaders, train the teacher, distil it into the
    student, report the student's test accuracy, save the student weights to
    'distilled_student.pth', and generate 'submission.csv' for the unlabeled
    set if its pickle file is present.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Data
    trainloader, valloader, testloader = get_dataloaders(batch_size=512, val_ratio=0.1)

    # 2) Teacher
    teacher = ResNetTeacher()
    print("\n=== Training Teacher ===")
    teacher = train_teacher(
        teacher_model=teacher,
        trainloader=trainloader,
        valloader=valloader,
        num_epochs=100,        # Adjust as needed
        lr=0.1,                # Common for CIFAR-10 with ResNets
        label_smoothing=0.0    # or 0.1 if you like label smoothing
    )

    # If you already have a teacher checkpoint, load it:
    # teacher.load_state_dict(torch.load('path_to_teacher.pth'))
    # teacher.eval()

    # 3) Student
    student = ResNetStudent()

    # 4) Distillation
    print("\n=== Distillation Training ===")
    student = train_distilled(
        teacher,
        student,
        trainloader,
        valloader,
        num_epochs=250,
        alpha=0.4,   # Tweak as needed
        T=1.0,       # Tweak as needed
        lr=0.01,
        label_smoothing=0.1
    )

    # 5) Evaluate on CIFAR-10 test
    test_acc = validate(student, testloader, device)
    print(f"\nDistilled Student Test Accuracy: {test_acc:.2f}%")

    # (Optional) Save student checkpoint
    torch.save(student.state_dict(), "distilled_student.pth")

    # 6) Generate submission for unlabeled data
    _submit_unlabeled(student, device)


def submiss():
    """Regenerate 'submission.csv' from the saved student checkpoint.

    Loads 'distilled_student.pth' into a fresh student model (no training)
    and runs the submission step on the unlabeled set.
    """
    student = ResNetStudent()
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    student.load_state_dict(torch.load('distilled_student.pth', map_location=device))
    student = student.to(device) # Move the student model to the device (GPU)
    student.eval()
    _submit_unlabeled(student, device)

if __name__ == "__main__":
    # Runs the full train -> distil -> submit pipeline. To only rebuild the
    # submission from the saved 'distilled_student.pth', call submiss() instead.
    main()



=== Training Teacher ===


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast(dtype=torch.float16):


[Epoch 1/100] Loss: 1.7348 | Train Acc: 35.52% | Val Acc: 42.48%
[Epoch 2/100] Loss: 1.2938 | Train Acc: 53.52% | Val Acc: 48.38%
[Epoch 3/100] Loss: 1.0236 | Train Acc: 63.75% | Val Acc: 58.80%
[Epoch 4/100] Loss: 0.8769 | Train Acc: 69.26% | Val Acc: 62.76%
[Epoch 5/100] Loss: 0.7636 | Train Acc: 73.33% | Val Acc: 67.54%
[Epoch 6/100] Loss: 0.7024 | Train Acc: 75.69% | Val Acc: 72.24%
[Epoch 7/100] Loss: 0.6353 | Train Acc: 77.97% | Val Acc: 66.60%
[Epoch 8/100] Loss: 0.5918 | Train Acc: 79.43% | Val Acc: 74.46%
[Epoch 9/100] Loss: 0.5514 | Train Acc: 81.05% | Val Acc: 74.02%
[Epoch 10/100] Loss: 0.5175 | Train Acc: 82.04% | Val Acc: 77.14%
[Epoch 11/100] Loss: 0.4845 | Train Acc: 83.14% | Val Acc: 79.18%
[Epoch 12/100] Loss: 0.4701 | Train Acc: 83.82% | Val Acc: 78.26%
[Epoch 13/100] Loss: 0.4420 | Train Acc: 84.85% | Val Acc: 77.56%
[Epoch 14/100] Loss: 0.4164 | Train Acc: 85.72% | Val Acc: 77.46%
[Epoch 15/100] Loss: 0.4025 | Train Acc: 86.20% | Val Acc: 74.28%
[Epoch 16/100] Loss

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast(dtype=torch.float16):


[Epoch 1/250] Loss: 1.8493 | Train Acc: 32.87% | Val Acc: 40.56%
[Epoch 2/250] Loss: 1.4932 | Train Acc: 48.28% | Val Acc: 47.84%
[Epoch 3/250] Loss: 1.3465 | Train Acc: 54.93% | Val Acc: 53.50%
[Epoch 4/250] Loss: 1.2518 | Train Acc: 59.02% | Val Acc: 60.66%
[Epoch 5/250] Loss: 1.1814 | Train Acc: 62.28% | Val Acc: 60.98%
[Epoch 6/250] Loss: 1.1257 | Train Acc: 64.31% | Val Acc: 61.86%
[Epoch 7/250] Loss: 1.0793 | Train Acc: 66.37% | Val Acc: 63.08%
[Epoch 8/250] Loss: 1.0329 | Train Acc: 68.33% | Val Acc: 69.02%
[Epoch 9/250] Loss: 0.9923 | Train Acc: 70.10% | Val Acc: 69.90%
[Epoch 10/250] Loss: 0.9592 | Train Acc: 71.53% | Val Acc: 68.20%
[Epoch 11/250] Loss: 0.9250 | Train Acc: 73.06% | Val Acc: 71.68%
[Epoch 12/250] Loss: 0.8975 | Train Acc: 74.33% | Val Acc: 73.06%
[Epoch 13/250] Loss: 0.8690 | Train Acc: 75.34% | Val Acc: 73.48%
[Epoch 14/250] Loss: 0.8412 | Train Acc: 76.68% | Val Acc: 71.98%
[Epoch 15/250] Loss: 0.8215 | Train Acc: 77.24% | Val Acc: 76.32%
[Epoch 16/250] Loss