In [1]:
# Notebook-wide settings: runtime environment, model shape, training length,
# optimizer hyperparameters and output switches. Later cells read these names directly.

# Runtime environment (selects dataset / output paths further down)
kaggle = 1 # 1 if kaggle, 0 if colab
load_dataset_online = 0 # 0 to read the CIFAR-10 batches from data_dir (the Kaggle competition ships them), 1 to download the dataset via torchvision
use_wandb = 0 # 1 to use wandb (Weights And Biases) and save our sweeps to wandb, 0 to not run wandb
# Do not commit a real key with the notebook.
wandbapi = '' ## add wandb API key from wandb.com/authorize, otherwise wandb won't work

# Model config passed to ResNetCustom: residual blocks per stage and channel width per stage
blocks = [3,5,3]
channels = [64,128,256]

earlystop = 0 # 1 to enable early stopping during training, 0 to disable

# Number of epochs
number_of_epochs = 200 # max epochs
save_freq = 15 # a checkpoint is saved whenever epoch % save_freq == 0

# Early stopping criterion (only used when earlystop is enabled)
earlystop_patience = 5 # number of "bad" epochs tolerated before stopping
earlystop_mindelta = 2 # an epoch is "bad" when test accuracy is more than this many points below the best
earlystop_threshold = 88 # stopping is only allowed once test accuracy exceeds this value

# 1 to write predictions.csv for the unlabeled test set, 0 to skip
final_test_create_csv=0

# Hyperparameters used when building the optimizer
args_weight_decay = 5e-4
args_lr = 0.01
args_optimizer = 'sgd' # options: sgdn, adadelta, adadelta-clipping; anything else falls back to plain sgd

# Load a previously saved model instead of training from scratch
load_model = 0
load_model_name = '100_checkpoint.tar' #can only be done on colab

# DataLoader parameters
our_batch_size = 128
our_number_workers = 2

# T_max passed to CosineAnnealingLR
t_max = 50

In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader, Dataset

import torchvision
import torchvision.transforms as transforms

import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

import pickle

import argparse

from PIL import Image

import pprint

# from utils import progress_bar
from time import perf_counter

!pip install wandb -Uq
import wandb
import os

In [3]:
# On Kaggle, print every file found under the input mount so the dataset paths
# used later in the notebook can be checked by eye.
if kaggle == 1:
    for root, _dirs, names in os.walk('/kaggle/input'):
        for name in names:
            print(os.path.join(root, name))

/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar_test_nolabels.pkl
/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar-10-python/cifar-10-batches-py/data_batch_1
/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar-10-python/cifar-10-batches-py/data_batch_2
/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar-10-python/cifar-10-batches-py/batches.meta
/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar-10-python/cifar-10-batches-py/test_batch
/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar-10-python/cifar-10-batches-py/data_batch_3
/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar-10-python/cifar-10-batches-py/data_batch_5
/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar-10-python/cifar-10-batches-py/data_batch_4
/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar-10-python/cifar-10-batches-py/readme.html


In [4]:
#ResNet model
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, each followed by batch norm.

    The input is added back to the output of the second batch norm through
    ``self.shortcut``. The shortcut is an empty ``nn.Sequential`` (identity)
    unless the block changes the spatial stride or the channel count, in which
    case it is a 1x1 convolution followed by batch norm.
    """

    # Multiplier between ``planes`` and the number of channels the block emits.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Identity by default; replaced by a projection when shapes would not match.
        self.shortcut = nn.Sequential()
        shapes_differ = stride != 1 or in_planes != out_planes
        if shapes_differ:
            projection = nn.Conv2d(in_planes, out_planes, kernel_size=1,
                                   stride=stride, bias=False)
            self.shortcut = nn.Sequential(projection, nn.BatchNorm2d(out_planes))

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
        branch = self.conv1(x)
        branch = F.relu(self.bn1(branch))
        branch = self.bn2(self.conv2(branch))
        branch += self.shortcut(x)
        return F.relu(branch)

class ResNet3(nn.Module):
    """ResNet variant with three residual stages followed by a linear classifier.

    Args:
        block: residual block class; it is called as ``block(in_planes, planes, stride)``
            and must expose an ``expansion`` attribute.
        num_blocks: number of blocks in each of the three stages.
        channels: channel width of each stage (first three entries are used).
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, channels = [64, 128, 256], num_classes=10):
        super().__init__()
        # Channel count fed into the next block; updated by _make_layer as stages are built.
        self.in_planes = channels[0]

        self.conv1 = nn.Conv2d(3, channels[0], kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels[0])
        # Stage 1 keeps the resolution; stages 2 and 3 start with a stride-2 block.
        self.layer1 = self._make_layer(block, channels[0], num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, channels[1], num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, channels[2], num_blocks[2], stride=2)
        self.linear = nn.Linear(channels[2] * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the remaining ones stride 1."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        features = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)
        # 8x8 average pooling window (the 4-stage variant uses 4).
        pooled = F.avg_pool2d(features, 8)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)

class ResNet4(nn.Module):
    """ResNet variant with four residual stages followed by a linear classifier.

    Args:
        block: residual block class; it is called as ``block(in_planes, planes, stride)``
            and must expose an ``expansion`` attribute.
        num_blocks: number of blocks in each of the four stages.
        channels: channel width of each stage.
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, channels = [64, 128, 256, 512], num_classes=10):
        super(ResNet4, self).__init__()
        # Must equal the stem's output width. It used to be hard-coded to 64, which
        # made the first residual block expect 64 input channels even when
        # channels[0] was different, breaking the forward pass.
        self.in_planes = channels[0]

        self.conv1 = nn.Conv2d(3, channels[0], kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels[0])
        # Stage 1 keeps the resolution; every later stage starts with a stride-2 block.
        self.layer1 = self._make_layer(block, channels[0], num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, channels[1], num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, channels[2], num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, channels[3], num_blocks[3], stride=2)
        self.linear = nn.Linear(channels[3]*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the remaining ones stride 1."""
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            # The next block consumes what this one produced.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

def ResNetCustom(blocks, channels):
    """Build a BasicBlock ResNet whose depth follows ``len(blocks)``.

    A three-entry ``blocks`` list selects ResNet3; any other length selects ResNet4.
    """
    architecture = ResNet3 if len(blocks) == 3 else ResNet4
    return architecture(BasicBlock, blocks, channels)

In [5]:
def getParameters(resnet):
    """Print and return the number of trainable parameters of ``resnet``.

    Only parameters with ``requires_grad=True`` are counted. The count is taken
    with ``Tensor.numel()``, so no parameter is copied off its device and the
    result is always a Python ``int`` (including for 0-dim parameters).

    Args:
        resnet: any ``torch.nn.Module``.

    Returns:
        The total number of trainable scalar parameters.
    """
    total_params = sum(p.numel() for p in resnet.parameters() if p.requires_grad)
    print("Total number of params", total_params)
    return total_params

Creating Model

In [6]:
# Use the GPU when one is visible and fall back to the CPU otherwise. The value
# stays a plain string because later cells compare it with `device == 'cuda'`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
def build_model():
    """Create a fresh, untrained ResNet from the notebook-level ``blocks`` and ``channels``."""
    return ResNetCustom(blocks, channels)


net = build_model()
getParameters(net)
# Move the weights to the target device before any wrapping. DataParallel only
# relocates the module by itself when exactly one GPU is visible; with several
# GPUs its forward pass requires the parameters to already live on the first one.
net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    # Let cuDNN benchmark convolution algorithms for the input sizes it sees.
    cudnn.benchmark = True


Total number of params 4918602


Select Loss function

In [8]:
# Classification loss; train() and test() read this module-level name directly.
criterion = nn.CrossEntropyLoss()
# The optimizer and scheduler are built later, next to the training loops:
#optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max)

Loading Dataset

In [9]:
class CIFAR10Dataset(Dataset):
    """Map-style dataset over in-memory image arrays and their labels.

    Args:
        data: indexable collection of channel-first images; each item is
            transposed to channel-last before being turned into a PIL image.
        labels: indexable collection of labels aligned with ``data``.
        transform: optional callable applied to the PIL image.
    """

    def __init__(self, data, labels, transform=None):
        self.data, self.labels, self.transform = data, labels, transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``."""
        # Stored layout is (3, 32, 32); PIL wants (32, 32, 3).
        channels_last = self.data[idx].transpose((1, 2, 0))
        label = self.labels[idx]

        image = Image.fromarray(channels_last.astype('uint8'))
        if not self.transform:
            return image, label
        return self.transform(image), label

def load_cifar10_batch(file):
    """Read one pickled CIFAR-10 batch file.

    Args:
        file: path of a batch file holding a dict with ``'data'`` and ``'labels'``.

    Returns:
        ``(data, labels)`` where ``data`` is reshaped to ``(-1, 3, 32, 32)`` and
        ``labels`` is returned exactly as stored.
    """
    # pickle can execute arbitrary code: only point this at trusted dataset files.
    with open(file, 'rb') as handle:
        batch = pickle.load(handle, encoding='latin1')
    raw_pixels, labels = batch['data'], batch['labels']
    return raw_pixels.reshape(-1, 3, 32, 32), labels

  # Function to load all CIFAR-10 data
def load_cifar10_data(data_dir):
    """Load the five training batches and the test batch found in ``data_dir``.

    Returns:
        ``(train_data, train_labels, test_data, test_labels)``; the image arrays
        are shaped ``(N, 3, 32, 32)`` and the labels are numpy arrays.
    """
    # data_batch_1 ... data_batch_5, read in order.
    batches = [
        load_cifar10_batch(os.path.join(data_dir, f'data_batch_{batch_no}'))
        for batch_no in range(1, 6)
    ]
    train_data = np.vstack([images for images, _ in batches])
    train_labels = np.array([label for _, labels in batches for label in labels])

    held_out_images, held_out_labels = load_cifar10_batch(os.path.join(data_dir, 'test_batch'))
    test_data = held_out_images.reshape(-1, 3, 32, 32)
    test_labels = np.array(held_out_labels)

    return train_data, train_labels, test_data, test_labels


In [10]:
# Folder holding the CIFAR-10 python batch files: the competition input mount on
# Kaggle, a folder relative to the working directory otherwise.
data_dir = (
    '/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar-10-python/cifar-10-batches-py'
    if kaggle == 1
    else 'cifar-10-batches-py'
)

In [11]:
#ORIGINAL TRANSFORM
# Training pipeline: the geometric augmentations run on the PIL image, then the
# image is converted to a tensor and normalized per channel. Order matters:
# everything after ToTensor() operates on tensors.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10), #newly added
    transforms.ToTensor(),
    # (per-channel mean), (per-channel std)
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
# Evaluation pipeline: no augmentation, same normalization constants as training.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])


In [12]:
#CUTOUT

class Cutout(object):
    """Tensor transform that zeroes ``n_holes`` square patches of an image.

    Args:
        n_holes: number of patches to cut out of each image.
        length: side length of each patch in pixels (patches are clipped at the border).
    """

    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        """Return ``img`` multiplied by a 0/1 mask; ``img`` is indexed as (C, H, W)."""
        height, width = img.size(1), img.size(2)
        keep = torch.ones(height, width, dtype=torch.float32)
        half = self.length / 2

        for _ in range(self.n_holes):
            # Patch centre: row first, then column.
            center_y = torch.randint(0, height, (1,))
            center_x = torch.randint(0, width, (1,))

            top = int(torch.clamp(center_y - half, 0, height).item())
            left = int(torch.clamp(center_x - half, 0, width).item())
            bottom = int(torch.clamp(center_y + half, 0, height).item())
            right = int(torch.clamp(center_x + half, 0, width).item())

            keep[top:bottom, left:right] = 0

        # Broadcast the (H, W) mask over the channel dimension.
        return img * keep.unsqueeze(0)

# Same training pipeline as transform_train with Cutout appended. Cutout is placed
# after ToTensor()/Normalize because it indexes the image as a tensor.
transform_train_cutout = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10), # newly added
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    Cutout(n_holes=1, length=8)  # one 8x8 patch per image is zeroed
])

In [13]:
#MIXUP

def mixup_data(x, y, alpha=0.2, device='cuda'):
    """Blend every sample of a batch with another, randomly chosen sample of the same batch.

    Args:
        x: input batch; the first dimension is the batch dimension.
        y: labels aligned with ``x``.
        alpha: Beta(alpha, alpha) parameter used to draw the mixing weight;
            ``alpha <= 0`` disables mixing (weight 1).
        device: kept for backward compatibility and no longer used. The
            permutation now always follows ``x.device``, so the function also
            works for CPU batches and for batches on a non-default GPU.

    Returns:
        ``(mixed_x, (y, y_permuted, lam))`` where
        ``mixed_x = lam * x + (1 - lam) * x[perm]``.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    batch_size = x.size(0)
    # The index must live where the data lives; the former default of 'cuda'
    # failed for any batch that was not on the default GPU.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    mixed_y = (y, y[index], lam)
    return mixed_x, mixed_y


In [14]:
#CUTMIX

def cutmix_data(inputs, targets, alpha=1.0):
    """Paste a random rectangle from a shuffled copy of the batch into each sample.

    ``inputs`` is modified in place and also returned.

    Args:
        inputs: image batch indexed as (N, C, H, W).
        targets: labels aligned with ``inputs``.
        alpha: Beta(alpha, alpha) parameter for the initial mixing ratio;
            ``alpha <= 0`` uses a ratio of 1 (empty rectangle).

    Returns:
        ``(inputs, targets, targets_permuted, lam)`` where ``lam`` is the fraction
        of each image that was left untouched.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    perm = torch.randperm(inputs.size(0)).to(inputs.device)

    # Rectangle whose area is roughly (1 - lam) of the image.
    height, width = inputs.size(2), inputs.size(3)
    side_ratio = np.sqrt(1. - lam)
    patch_w = int(width * side_ratio)
    patch_h = int(height * side_ratio)

    # Uniformly drawn centre: x first, then y.
    center_x = np.random.randint(width)
    center_y = np.random.randint(height)

    left = np.clip(center_x - patch_w // 2, 0, width)
    top = np.clip(center_y - patch_h // 2, 0, height)
    right = np.clip(center_x + patch_w // 2, 0, width)
    bottom = np.clip(center_y + patch_h // 2, 0, height)

    inputs[:, :, top:bottom, left:right] = inputs[perm, :, top:bottom, left:right]

    # Recompute lam from the clipped rectangle so it matches the real pixel ratio.
    pasted_area = (right - left) * (bottom - top)
    lam = 1 - (pasted_area / (inputs.size()[-1] * inputs.size()[-2]))

    return inputs, targets, targets[perm], lam

# def rand_bbox(size, lam):
#     W = size[2]
#     H = size[3]
#     cut_rat = np.sqrt(1. - lam)
#     cut_w = int(W * cut_rat)
#     cut_h = int(H * cut_rat)

#     # uniform
#     cx = np.random.randint(W)
#     cy = np.random.randint(H)

#     bbx1 = np.clip(cx - cut_w // 2, 0, W)
#     bbx2 = np.clip(cx + cut_w // 2, 0, W)
#     bby1 = np.clip(cy - cut_h // 2, 0, H)
#     bby2 = np.clip(cy + cut_h // 2, 0, H)

#     return bbx1, bby1, bbx2, bby2


Create dataloaders

In [15]:
# Build three datasets: plain training set, test set, and a training set whose
# transform additionally applies Cutout.
if load_dataset_online != 0:
    # Download through torchvision.
    train_dataset = torchvision.datasets.CIFAR10(
        root='/data', train=True, download=True, transform=transform_train)
    test_dataset = torchvision.datasets.CIFAR10(
        root='/data', train=False, download=True, transform=transform_test)

    train_dataset_cutout = torchvision.datasets.CIFAR10(
        root='/datacutout', train=True, download=True, transform=transform_train_cutout)
else:
    # Read the batch files already present in data_dir.
    train_data, train_labels, test_data, test_labels = load_cifar10_data(data_dir)
    train_dataset = CIFAR10Dataset(train_data, train_labels, transform=transform_train)
    test_dataset = CIFAR10Dataset(test_data, test_labels, transform=transform_test)

    train_dataset_cutout = CIFAR10Dataset(train_data, train_labels, transform=transform_train_cutout)

# Create DataLoaders: training loaders are shuffled, the test loader is not.
trainloader = DataLoader(
    train_dataset,
    batch_size=our_batch_size,
    shuffle=True,
    num_workers=our_number_workers,
)
testloader = DataLoader(
    test_dataset,
    batch_size=our_batch_size,
    shuffle=False,
    num_workers=our_number_workers,
)

trainloader_cutout = DataLoader(
    train_dataset_cutout,
    batch_size=our_batch_size,
    shuffle=True,
    num_workers=our_number_workers,
)

In [16]:
# Number of batches per epoch for each loader (train, train with cutout, test).
for _loader in (trainloader, trainloader_cutout, testloader):
    print(len(_loader))

391
391
79


In [17]:
def adaptive_gradient_clipping(parameters, clip_factor=0.01, eps=1e-3):
    """Rescale each gradient in place so its L2 norm stays near ``clip_factor * ||param||``.

    For every parameter that has a gradient, the gradient is multiplied by
    ``max_norm / (grad_norm + eps)`` whenever ``grad_norm > max_norm + eps``,
    where ``max_norm = clip_factor * ||param||_2``. Parameters without a
    gradient are skipped.

    Args:
        parameters: iterable of parameters (e.g. ``model.parameters()``).
        clip_factor: allowed ratio between gradient norm and parameter norm.
        eps: slack added to the threshold and to the denominator.
    """
    with torch.no_grad():
        for param in parameters:
            if param.grad is None:
                continue
            weight_norm = torch.norm(param.data, p=2)
            gradient_norm = torch.norm(param.grad.data, p=2)
            limit = weight_norm * clip_factor
            if gradient_norm > limit + eps:
                param.grad.data.mul_(limit / (gradient_norm + eps))

In [18]:
# do_clipping = 0
# if args_optimizer=='sgdn':
#   optimizer = optim.SGD(net.parameters(), lr=args_lr, momentum=0.9, weight_decay=args_weight_decay, nesterov=True)
# elif args_optimizer=='adadelta':
#   optimizer = optim.Adadelta(net.parameters(), lr=args_lr, weight_decay=args_weight_decay)
# elif args_optimizer=='adadelta-clipping':
#   optimizer = optim.Adadelta(net.parameters(), lr=args_lr, weight_decay=args_weight_decay)
#   do_clipping = 1
# else: #sgd
#   optimizer = optim.SGD(net.parameters(), lr=args_lr, momentum=0.9, weight_decay=args_weight_decay)

# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max)

In [19]:
def train(epoch, optimizer, mynet, mytrainloader, mixup = 0, cutmix = 0, clipping=0):
    """Train ``mynet`` for one epoch and report the epoch's accuracy and mean loss.

    Uses the notebook-level ``device`` and ``criterion``.

    Args:
        epoch: epoch number, used only for logging.
        optimizer: optimizer that updates ``mynet``.
        mynet: model to train.
        mytrainloader: iterable of ``(inputs, targets)`` batches.
        mixup: 1 to apply ``mixup_data`` (alpha=0.5) to every batch.
        cutmix: 1 to apply ``cutmix_data`` (alpha=1.0) to a batch with
            probability 0.5; ignored when ``mixup`` is 1.
        clipping: 1 to run ``adaptive_gradient_clipping`` before each optimizer step.

    Returns:
        ``(epoch_accuracy, epoch_loss)``: accuracy in percent over all samples
        seen, and the loss averaged over batches. For mixed batches a prediction
        counts as correct when it matches either of the two mixed labels.
    """
    print('\nEpoch: %d' % epoch)
    mynet.train()
    train_loss = 0
    correct = 0
    total = 0

    for inputs, targets in mytrainloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # lam stays None for an unmixed batch; otherwise the batch carries two
        # label sets (targets_a, targets_b) weighted by lam and 1 - lam.
        targets_a, targets_b, lam = targets, targets, None
        if mixup == 1:
            inputs, (targets_a, targets_b, lam) = mixup_data(inputs, targets, alpha=0.5, device=device)
        elif cutmix == 1 and np.random.rand() < 0.5:
            inputs, targets_a, targets_b, lam = cutmix_data(inputs, targets, alpha=1.0)

        optimizer.zero_grad()
        outputs = mynet(inputs)
        if lam is None:
            loss = criterion(outputs, targets)
        else:
            loss = lam * criterion(outputs, targets_a) + (1 - lam) * criterion(outputs, targets_b)
        loss.backward()
        if clipping == 1:
            adaptive_gradient_clipping(mynet.parameters(), clip_factor=0.01)
        optimizer.step()

        _, predicted = outputs.max(1)
        # For unmixed batches targets_a and targets_b are the same tensor, so this
        # reduces to a plain comparison with the labels.
        correct += (predicted.eq(targets_a) | predicted.eq(targets_b)).sum().item()
        total += targets.size(0)
        train_loss += loss.item()

    epoch_loss = train_loss / len(mytrainloader)
    epoch_accuracy = 100. * correct / total
    print(f'\nEpoch: {epoch}, Loss: {epoch_loss:.3f}, Acc: {epoch_accuracy:.3f}%')

    return epoch_accuracy, epoch_loss


def test(optimizer, mynet):
    """Evaluate ``mynet`` on the notebook-level ``testloader``.

    Uses the notebook-level ``device`` and ``criterion``; no gradients are computed.

    Args:
        optimizer: unused; kept so existing call sites keep working.
        mynet: model to evaluate (switched to eval mode).

    Returns:
        ``(epoch_accuracy, epoch_loss)``: accuracy in percent over all test
        samples, and the loss averaged over batches.
    """
    mynet.eval()
    test_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = mynet(inputs)

            test_loss += criterion(outputs, targets).item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    epoch_loss = test_loss / len(testloader)
    epoch_accuracy = 100. * correct / total
    print(f'\nTest Loss: {epoch_loss:.3f}, Acc: {epoch_accuracy:.3f}%')

    return epoch_accuracy, epoch_loss



In [20]:
#Saving Model
# Root folder for saved checkpoints: Kaggle's working directory, or the current
# directory (empty prefix) elsewhere.
output_directory = '/kaggle/working/' if kaggle == 1 else ''
import time
def saveModel(mynet, number_of_epochs):
    """Save ``mynet`` under ``<output_directory>/<number_of_epochs>/Time_<MMDD-HHMM>.tar``.

    The whole model object (not just its state dict) is stored under the key
    ``'net'``, so loading the file requires the model classes to be importable.

    Args:
        mynet: model to save.
        number_of_epochs: used as the name of the sub-folder.
    """
    timestr = time.strftime("%m%d-%H%M")
    directory = os.path.join(output_directory, str(number_of_epochs))
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(directory, exist_ok=True)
    torch.save({
        'net': mynet,
    }, os.path.join(directory, '{}_{}.tar'.format('Time', timestr)))

In [21]:
# Plotting function for train vs test graph

def plot_train_test_accuracy(train_acc, test_acc, train_loss, test_loss):
    """Show accuracy and loss curves for training and test side by side.

    Args:
        train_acc: per-epoch training accuracy.
        test_acc: per-epoch test accuracy.
        train_loss: per-epoch training loss.
        test_loss: per-epoch test loss.

    All four sequences are plotted against epochs ``1..len(train_acc)``.
    """
    # Imported locally because the notebook binds the name ``plt`` to the top-level
    # ``matplotlib`` package, which does not provide figure()/subplot()/plot().
    import matplotlib.pyplot as plt

    epochs = range(1, len(train_acc) + 1)
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

    # Plotting accuracy
    ax_acc.plot(epochs, train_acc, 'b', label='Training Accuracy')
    ax_acc.plot(epochs, test_acc, 'r', label='Test Accuracy')
    ax_acc.set_title('Training and Test Accuracy')
    ax_acc.set_xlabel('Epochs')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.legend()

    # Plotting loss
    ax_loss.plot(epochs, train_loss, 'b', label='Training Loss')
    ax_loss.plot(epochs, test_loss, 'r', label='Test Loss')
    ax_loss.set_title('Training and Test Loss')
    ax_loss.set_xlabel('Epochs')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()

    fig.tight_layout()
    plt.show()

In [22]:
class EarlyStopper:
    """Signal when validation accuracy has stopped improving.

    Args:
        patience: number of "bad" checks after which ``early_stop`` returns True.
        min_delta: a check is "bad" when the accuracy is more than ``min_delta``
            below the best accuracy seen so far.
    """

    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        # Start below every possible accuracy so the first value becomes the best.
        # (Starting at +inf meant no value could ever count as an improvement and
        # every call was treated as a bad check.)
        self.max_validation_acc = float('-inf')

    def early_stop(self, validation_accuracy):
        """Record one validation accuracy and return True when training should stop.

        A new best accuracy resets the bad-check counter. A value within
        ``min_delta`` of the best leaves the counter unchanged.
        """
        if validation_accuracy > self.max_validation_acc:
            self.max_validation_acc = validation_accuracy
            self.counter = 0
        elif validation_accuracy < (self.max_validation_acc - self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

WANDB

In [23]:
# Authenticate with Weights & Biases using the key from the config cell
# (only when wandb logging is enabled).
if use_wandb==1:
  wandb.login(key = wandbapi)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [24]:
# sweep_config = {
#     'method': 'random'
#     }
# metric = {
#     'name': 'loss',
#     'goal': 'minimize'
#     }
# sweep_config['metric'] = metric
# parameters_dict = {
#     'optimizer': {
#         'values': ['adadelta', 'sgd', 'adadelta-clipping']
#         },
#     'learning_rate': {
#           'values': [0.1, 0.01]
#         }
#     }

# sweep_config['parameters'] = parameters_dict


# Grid sweep over the data-augmentation strategy.
# NOTE(review): the metric is named 'loss', while the wandb.log calls in this
# notebook use keys such as 'train_loss' and 'test_loss' -- confirm the name.
metric = {'name': 'loss', 'goal': 'minimize'}

# Values understood by trainwandb(): 'cutout', 'mixup', 'cutmix', 'cutout-mixup'.
# Only 'cutout' is swept here.
parameters_dict = {'dataaug': {'values': ['cutout']}}

sweep_config = {
    'method': 'grid',
    'metric': metric,
    'parameters': parameters_dict,
}


In [25]:
# Show the final sweep configuration that will be sent to wandb.
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'dataaug': {'values': ['cutout']}}}


In [26]:
start_epoch = 0  # start from epoch 0 or last checkpoint epoch
def trainwandb(config=None):
    """Run one full training job as a wandb sweep run.

    Builds a fresh model and optimizer from the notebook-level settings, trains
    for up to ``number_of_epochs`` epochs with the augmentation selected by
    ``config.dataaug`` ('cutout', 'mixup', 'cutmix', anything else means
    cutout + mixup), logs per-epoch metrics to wandb, saves checkpoints and
    plots the training curves.

    Args:
        config: initial wandb config; the sweep agent supplies the real values
            through ``wandb.config``.
    """
    wandb.init(config=config)
    config = wandb.config

    net = build_model()
    getParameters(net)
    # Put the weights on the target device before wrapping; DataParallel only does
    # this by itself when a single GPU is visible.
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    # Working copy of the configured learning rate; it is decayed below, so the
    # notebook-level args_lr must not be modified.
    lr = args_lr

    do_clipping = 0
    if args_optimizer=='sgdn':
        optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=args_weight_decay, nesterov=True)
    elif args_optimizer=='adadelta':
        optimizer = optim.Adadelta(net.parameters(), lr=lr, weight_decay=args_weight_decay)
    elif args_optimizer=='adadelta-clipping':
        optimizer = optim.Adadelta(net.parameters(), lr=lr, weight_decay=args_weight_decay)
        do_clipping = 1
    else: #sgd
        optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=args_weight_decay)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max)

    train_accuracies = []
    test_accuracies = []
    train_losses = []
    test_losses = []

    if earlystop:
        early_stopper = EarlyStopper(patience=earlystop_patience, min_delta=earlystop_mindelta)

    for epoch in range(start_epoch, start_epoch+number_of_epochs):
        # (loader, mixup, cutmix) for the selected augmentation.
        if config.dataaug=='cutout':
            train_acc, train_loss = train(epoch, optimizer, net, trainloader_cutout, 0, 0, do_clipping)
        elif config.dataaug=='mixup':
            train_acc, train_loss = train(epoch, optimizer, net, trainloader, 1, 0, do_clipping)
        elif config.dataaug=='cutmix':
            train_acc, train_loss = train(epoch, optimizer, net, trainloader, 0, 1, do_clipping)
        else: #cutout-mixup
            train_acc, train_loss = train(epoch, optimizer, net, trainloader_cutout, 1, 0, do_clipping)

        test_acc, test_loss = test(optimizer, net)

        wandb.log({"epoch": epoch, "train_accuracy": train_acc, "train_loss": train_loss,
                   "test_accuracy": test_acc, "test_loss": test_loss})

        scheduler.step()

        # Step decay: divide the learning rate by 10 after epochs 80 and 150.
        # `epoch` already includes start_epoch, so it is compared directly.
        # NOTE(review): the assignment below overwrites whatever scheduler.step()
        # just set, so the learning rate used for training follows this step
        # schedule and not the cosine schedule -- confirm which one is intended.
        if epoch==80:
            lr/=10
        if epoch==150:
            lr/=10
        for param_group in optimizer.param_groups:
            param_group['lr']=lr

        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
        train_losses.append(train_loss)
        test_losses.append(test_loss)

        if(epoch%save_freq==0):
            saveModel(net, epoch)
        if earlystop:
            # Stop only once the accuracy is also above the configured threshold.
            if early_stopper.early_stop(test_acc) and test_acc>earlystop_threshold:
                wandb.log({"early_stop": epoch})
                break

    saveModel(net, number_of_epochs)

    # Plot train vs test accuracy graph
    plot_train_test_accuracy(train_accuracies, test_accuracies, train_losses, test_losses)

    # Final evaluation; its loss is what gets reported as the run's final loss.
    _, final_test_loss = test(optimizer, net)
    wandb.log({"final_test_loss": final_test_loss})
    wandb.finish()

In [27]:
# Register the sweep with wandb and keep its id for the agent cell below.
if use_wandb==1:
  sweep_id = wandb.sweep(sweep_config, project="dl-mini-353-cutout")
  # To attach to an existing sweep instead, set its id by hand:
  #  sweep_id = "classical/pytorch-sweeps-demo/mnpi1d3n"

Create sweep with ID: 7rbk9m9t
Sweep URL: https://wandb.ai/kavyagupta/dl-mini-353-cutout/sweeps/7rbk9m9t


In [28]:
# Launch a sweep agent that calls trainwandb; count=1 limits it to a single run.
if use_wandb==1:
  wandb.agent(sweep_id, trainwandb, count=1)

[34m[1mwandb[0m: Agent Starting Run: ae12ytgm with config:
[34m[1mwandb[0m: 	dataaug: cutout
[34m[1mwandb[0m: Currently logged in as: [33mkavyagupta[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240412_230234-ae12ytgm[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mearnest-sweep-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/kavyagupta/dl-mini-353-cutout[0m
[34m[1mwandb[0m: 🧹 View sweep at [34m[4mhttps://wandb.ai/kavyagupta/dl-mini-353-cutout/sweeps/7rbk9m9t[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/kavyagupta/dl-mini-353-cutout/runs/ae12ytgm[0m


Total number of params 4918602

Epoch: 0

Epoch: 0, Loss: 1.492, Acc: 44.928%

Test Loss: 1.199, Acc: 58.830%

Epoch: 1

Epoch: 1, Loss: 1.065, Acc: 61.868%

Test Loss: 1.361, Acc: 58.680%

Epoch: 2

Epoch: 2, Loss: 0.866, Acc: 69.288%

Test Loss: 1.060, Acc: 67.120%

Epoch: 3

Epoch: 3, Loss: 0.749, Acc: 73.810%

Test Loss: 0.756, Acc: 74.350%

Epoch: 4

Epoch: 4, Loss: 0.668, Acc: 76.740%

Test Loss: 0.682, Acc: 77.440%

Epoch: 5

Epoch: 5, Loss: 0.613, Acc: 78.538%

Test Loss: 0.721, Acc: 77.090%

Epoch: 6

Epoch: 6, Loss: 0.561, Acc: 80.316%

Test Loss: 0.528, Acc: 82.550%

Epoch: 7

Epoch: 7, Loss: 0.520, Acc: 81.868%

Test Loss: 0.738, Acc: 78.500%

Epoch: 8

Epoch: 8, Loss: 0.495, Acc: 82.758%

Test Loss: 0.606, Acc: 80.410%

Epoch: 9

Epoch: 9, Loss: 0.469, Acc: 83.594%

Test Loss: 0.460, Acc: 84.860%

Epoch: 10

Epoch: 10, Loss: 0.436, Acc: 84.768%

Test Loss: 0.544, Acc: 83.070%

Epoch: 11

Epoch: 11, Loss: 0.412, Acc: 85.512%

Test Loss: 0.469, Acc: 84.760%

Epoch: 12

Epoch

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:          epoch ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:  test_accuracy ▁▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████████████
[34m[1mwandb[0m:      test_loss █▄▃▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: train_accuracy ▁▄▅▆▆▆▇▇▇▇▇▇▇▇▇▇████████████████████████
[34m[1mwandb[0m:     train_loss █▅▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:          epoch 199
[34m[1mwandb[0m:  test_accuracy 95.01
[34m[1mwandb[0m:      test_loss 0.19657
[34m[1mwandb[0m: train_accuracy 99.73
[34m[1mwandb[0m:     train_loss 0.01028
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 🚀 View run [33mearnest-sweep-1[0m at: [34m[4mhttps://wandb.ai/kavyagupta/dl-mini-353-cutout/runs/ae12ytgm[0m
[34m[1mwandb[0m: ⭐️ View project at: [34m[4mh

If running without WANDB

Loading a previously saved checkpoint, if needed (controlled by `load_model`):

In [29]:
if load_model == 1:
    loadFilename = load_model_name
    # The checkpoint stores the whole pickled model object under 'net' (see saveModel):
    #  - map_location lets a file saved on a GPU be opened on the current device;
    #  - weights_only=False is required to unpickle a full module. Unpickling can
    #    execute arbitrary code, so only load checkpoints you created yourself.
    checkpoint = torch.load(loadFilename, map_location=device, weights_only=False)
    net = checkpoint['net']

In [30]:
# Plain training loop used when wandb is disabled: same optimizer selection as
# the sweep path, no wandb logging.
if use_wandb==0:
    do_clipping = 0
    if args_optimizer=='sgdn':
        optimizer = optim.SGD(net.parameters(), lr=args_lr, momentum=0.9, weight_decay=args_weight_decay, nesterov=True)
    elif args_optimizer=='adadelta':
        optimizer = optim.Adadelta(net.parameters(), lr=args_lr, weight_decay=args_weight_decay)
    elif args_optimizer=='adadelta-clipping':
        optimizer = optim.Adadelta(net.parameters(), lr=args_lr, weight_decay=args_weight_decay)
        do_clipping = 1
    else: #sgd
        optimizer = optim.SGD(net.parameters(), lr=args_lr, momentum=0.9, weight_decay=args_weight_decay)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max)

    train_accuracies = []
    test_accuracies = []
    train_losses = []
    test_losses = []

    if earlystop:
        early_stopper = EarlyStopper(patience=earlystop_patience, min_delta=earlystop_mindelta)
    for epoch in range(start_epoch, start_epoch+number_of_epochs):
        # The loader is the fourth positional argument of train(); the clipping flag
        # must be passed by keyword (it used to be passed in the loader's place).
        train_acc, train_loss = train(epoch, optimizer, net, trainloader, clipping=do_clipping)
        test_acc, test_loss = test(optimizer, net)

        # Advance the schedule once per epoch, after the optimizer has stepped.
        scheduler.step()

        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
        train_losses.append(train_loss)
        test_losses.append(test_loss)

        if(epoch%save_freq==0):
            saveModel(net, epoch)
        if earlystop:
            if early_stopper.early_stop(test_acc) and test_acc>earlystop_threshold:
                # No wandb run is active on this path, so report on stdout instead.
                print(f'Early stopping at epoch {epoch}')
                break

    saveModel(net, number_of_epochs)

    # Plot train vs test accuracy graph
    plot_train_test_accuracy(train_accuracies, test_accuracies, train_losses, test_losses)

    test(optimizer, net)

Generating CSV file with Test Data

In [31]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset over a pickled dict holding unlabeled images and their ids.

    The pickle must contain the byte-string keys ``b'data'`` and ``b'ids'``.

    Args:
        pkl_file: path of the pickle file.
        transform: optional callable applied to the image tensor.
    """

    def __init__(self, pkl_file, transform=None):
        # pickle can execute arbitrary code: only open trusted files.
        with open(pkl_file, 'rb') as handle:
            self.data_dict = pickle.load(handle)
        self.data = self.data_dict[b'data']
        self.ids = self.data_dict[b'ids']
        self.transform = transform

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(image_tensor, image_id)`` for sample ``idx``."""
        flat_pixels = self.data[idx, :]
        # Reshape to (3, 32, 32) and scale to [0, 1].
        scaled = flat_pixels.reshape(3, 32, 32) / 255.0
        img_id = self.ids[idx]
        tensor = torch.from_numpy(scaled).float()

        if not self.transform:
            return tensor, img_id
        return self.transform(tensor), img_id

# The dataset already yields float tensors scaled to [0, 1], so only the
# per-channel normalization is applied here.
test_transform = transforms.Compose([
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Unlabeled test set: read from the competition input on Kaggle, from the
# working directory otherwise. Order is kept (no shuffling) so ids and
# predictions stay aligned.
test_dataset = CustomDataset(
    '/kaggle/input/deep-learning-mini-project-spring-24-nyu/cifar_test_nolabels.pkl'
    if kaggle == 1
    else 'cifar_test_nolabels.pkl',
    transform=test_transform,
)
test_dataloader = DataLoader(test_dataset, batch_size=our_batch_size, shuffle=False)


In [32]:
# Predict a label for every unlabeled test image and write (ID, Labels) rows to
# predictions.csv.
if final_test_create_csv == 1:
    predicted_labels, image_ids = [], []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net.eval()
    with torch.no_grad():
        for images, ids in test_dataloader:
            images = images.to(device)
            myoutputs = net(images)
            # Index of the largest logit is the predicted class.
            mypredicted = myoutputs.max(1)[1]
            predicted_labels.extend(mypredicted.cpu().numpy())
            image_ids.extend(ids.numpy())

    # One row per test image.
    df = pd.DataFrame({"ID": image_ids, "Labels": predicted_labels})

    # Kaggle collects outputs from /kaggle/working; elsewhere use the current directory.
    csv_path = '/kaggle/working/predictions.csv' if kaggle == 1 else 'predictions.csv'
    df.to_csv(csv_path, index=False)