In [1]:
## model design: https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-build-a-convnet-for-cifar-10-and-cifar-100-classification-with-keras.md
import infrastructure as inf


from argparse import ArgumentParser
from typing import List
import time
import numpy as np
from tqdm import tqdm

import torch    

import torch as ch
from torch.cuda.amp import GradScaler, autocast
from torch.nn import CrossEntropyLoss, Conv2d, BatchNorm2d
from torch.optim import SGD, lr_scheduler
from torchvision.transforms import v2
import torchvision
import torch.nn as nn
import torch.nn.functional as F


from fastargs import get_current_config, Param, Section
from fastargs.decorators import param
from fastargs.validation import And, OneOf

from ffcv.fields import IntField, RGBImageField
from ffcv.fields.decoders import IntDecoder, SimpleRGBImageDecoder
from ffcv.loader import Loader, OrderOption
from ffcv.pipeline.operation import Operation
from ffcv.transforms import RandomHorizontalFlip, Cutout, \
    RandomTranslate, Convert, ToDevice, ToTensor, ToTorchImage
from ffcv.transforms.common import Squeeze
from ffcv.writer import DatasetWriter

Using device cuda:0


In [2]:
device = inf.device

In [3]:

# Define the CNN model in PyTorch
class MyCNNModel(nn.Module):
    def __init__(self, no_classes):
        super(MyCNNModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3)) # output shape: 
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3))
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3))
        self.pool = nn.MaxPool2d(kernel_size=(2, 2))
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(1024, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, no_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def generate_model(output_dim=100):
    model = MyCNNModel(output_dim)
    return model.to(device)

In [4]:
from tqdm import tqdm

def train(model, loaders, lr=0.1, epochs=100, momentum=0.9, weight_decay=0.0001, reduce_patience=5, reduce_factor=0.2, tracking_freq=5,early_stopping_patience=10, early_stopping_min_epochs=100, do_tracking=True, verbose=False):
    # dictionary to keep track of training params and results
    train_dict = {}
    train_dict['lr'] = lr
    train_dict['epochs'] = epochs
    train_dict['momentum'] = momentum
    train_dict['weight_decay'] = weight_decay
    train_dict['reduce_patience'] = reduce_patience
    train_dict['reduce_factor'] = reduce_factor
    train_dict['tracking_freq'] = tracking_freq
    # results
    # training loss is tracked every epoch
    train_dict['train_loss'] = []
    train_dict['val_loss'] = []
    train_dict['lr_list'] = []
    train_dict['train_acc_top1'] = []
    train_dict['train_acc_top5'] = []
    train_dict['val_acc_top1'] = []
    train_dict['val_acc_top5'] = []

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
    criterion = ch.nn.CrossEntropyLoss()
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=reduce_patience, verbose=True, factor=reduce_factor)
    len_train_loader = len(loaders['train'])
    len_val_loader = len(loaders['test'])

    best_val_loss = float('inf')
    early_stopping_counter = 0

    for i in tqdm(range(epochs),disable=verbose):
        model.train()
        running_loss = 0.0
        total_correct, total_num, total_correct_top5 = 0., 0., 0.

        for ims, labs in loaders['train']:
            optimizer.zero_grad(set_to_none=True)
            with autocast():
                out = model(ims)
                loss = criterion(out, labs)
            
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if do_tracking and (i+1)%tracking_freq == 0: # only do bookkeeping if needed
                # computing top1 accuracy
                total_correct += out.argmax(1).eq(labs).sum().cpu().item()
                total_num += ims.shape[0]
                # computing top5 accuracy
                total_correct_top5 += out.argsort(1)[:,-5:].eq(labs.unsqueeze(-1)).sum().cpu().item()

        scheduler.step(running_loss)
        # save training loss
        if verbose: print(f'Epoch {i+1}/{epochs}, Training Loss: {running_loss/len_train_loader}')
        train_dict['train_loss'].append(running_loss/len_train_loader)
        # keep track of the current lr 
        train_dict['lr_list'].append(optimizer.param_groups[0]['lr'])
        # keep track of other metrics
        if do_tracking and (i+1)%tracking_freq == 0:
            train_top1 = total_correct / total_num * 100
            train_top5 = total_correct_top5 / total_num * 100
            val_loss = 0.0
            total_val_correct, total_val_num, total_val_correct_top5 = 0., 0., 0.
            model.eval()
            with ch.no_grad():
                for val_ims, val_labs in loaders['test']:
                    val_out = model(val_ims)
                    val_loss += criterion(val_out, val_labs).item()
                    # computing top1 accuracy
                    total_val_correct += val_out.argmax(1).eq(val_labs).sum().cpu().item()
                    total_val_num += val_ims.shape[0]
                    # computing top5 accuracy
                    total_val_correct_top5 += val_out.argsort(1)[:,-5:].eq(val_labs.unsqueeze(-1)).sum().cpu().item()
            val_loss /= len_val_loader
            val_top1 = total_val_correct / total_val_num * 100
            val_top5 = total_val_correct_top5 / total_val_num * 100
            train_dict['val_loss'].append(val_loss)
            train_dict['train_acc_top1'].append(train_top1)
            train_dict['train_acc_top5'].append(train_top5)
            train_dict['val_acc_top1'].append(val_top1)
            train_dict['val_acc_top5'].append(val_top5)
            if verbose: print(f'Epoch {i+1}/{epochs}, Validation Loss: {val_loss}')
            if i > early_stopping_min_epochs:
                # Early stopping based on increasing validation loss
                if val_loss > best_val_loss:
                    early_stopping_counter += 1
                    if early_stopping_counter >= early_stopping_patience:
                        print(f"Early stopping triggered at epoch {i}!")
                        return model, train_dict
                else:
                    best_val_loss = val_loss
                    early_stopping_counter = 0

    return model, train_dict


In [15]:
loaders, start_time = inf.make_dataloaders(batch_size=256, num_workers=12)
model = generate_model()
print(model)
# load model from checkpoint stored at ./models/model.pt
#model.load_state_dict(torch.load("./models/model.pt"))
model, tracked_params = train(model, loaders,epochs=150,tracking_freq=2,reduce_factor=0.2,early_stopping_min_epochs=100,early_stopping_patience=5,do_tracking=True,verbose=False)
print(f'Total time: {time.time() - start_time:.5f}')
#evaluate(model, loaders)

# store the model   
torch.save(model.state_dict(), "./models/model.pt")	
# save the tracked params
np.save("./models/tracked_params.npy", tracked_params)

# visualize the tracked params from training
inf.plot_training(tracked_params,"whole_cifar100",plot=False,save=True)

MyCNNModel(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=1024, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=100, bias=True)
)


 36%|███▌      | 54/150 [01:20<02:02,  1.28s/it]

Epoch 00054: reducing learning rate of group 0 to 2.0000e-02.


 74%|███████▍  | 111/150 [02:43<00:57,  1.48s/it]

Early stopping triggered at epoch 111!
Total time: 164.33727





In [6]:
# do training on the models for the tupels of superclasses
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

import gc
for i in range(0,20):
    for j in range(i+1,20):
        
        print(f"Training model for superclasses {i} and {j}")
        paths =  [f'./data/subsets/{dataset_name}_superclass_{i}_{j}.beton' for dataset_name in ["train","test"]]
        loaders, start_time = inf.make_dataloaders(paths[0],paths[1])
        model = generate_model(10)
        model, tracked_params = train(model, loaders,epochs=140,tracking_freq=2,reduce_factor=0.2,reduce_patience=5,do_tracking=True,early_stopping_min_epochs=40,early_stopping_patience=5,verbose=False)
        print(f'Total time: {time.time() - start_time:.5f}')
        # store the model   
        torch.save(model.state_dict(), f'./models/model_{i}_{j}.pt')	
        # save the tracked params
        np.save(f"./models/tracked_params{i}_{j}.npy", tracked_params)
        
        # once done remove the model, tracked params and loaders from storage
        name = f'model_{i}_{j}'
        inf.plot_training(tracked_params,name, False, True)
        del model, tracked_params, loaders, start_time
        torch.cuda.empty_cache()
        print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
        print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
        print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.121706GB
torch.cuda.memory_reserved: 0.556641GB
torch.cuda.max_memory_reserved: 0.556641GB
Training model for superclasses 0 and 1


 36%|███▋      | 51/140 [00:19<00:33,  2.63it/s]

Early stopping triggered at epoch 51!
Total time: 19.48851





torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 2


 44%|████▍     | 62/140 [00:21<00:14,  5.52it/s]

Epoch 00062: reducing learning rate of group 0 to 2.0000e-02.


 55%|█████▌    | 77/140 [00:24<00:19,  3.16it/s]


Early stopping triggered at epoch 77!
Total time: 24.49734
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 3


 39%|███▉      | 55/140 [00:20<00:31,  2.74it/s]


Early stopping triggered at epoch 55!
Total time: 20.18290
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 4


 36%|███▋      | 51/140 [00:19<00:34,  2.61it/s]


Early stopping triggered at epoch 51!
Total time: 19.62774
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 5


 42%|████▏     | 59/140 [00:21<00:28,  2.80it/s]


Early stopping triggered at epoch 59!
Total time: 21.20102
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 6


 36%|███▋      | 51/140 [00:19<00:34,  2.61it/s]


Early stopping triggered at epoch 51!
Total time: 19.64749
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 7


 39%|███▉      | 55/140 [00:20<00:31,  2.70it/s]


Early stopping triggered at epoch 55!
Total time: 20.77265
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 8


 39%|███▉      | 55/140 [00:20<00:31,  2.72it/s]

Early stopping triggered at epoch 55!
Total time: 20.32211
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 9



 42%|████▏     | 59/140 [00:21<00:29,  2.76it/s]

Early stopping triggered at epoch 59!
Total time: 21.47612
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 10



 36%|███▋      | 51/140 [00:19<00:34,  2.61it/s]


Early stopping triggered at epoch 51!
Total time: 19.70109
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 11


 36%|███▋      | 51/140 [00:19<00:34,  2.55it/s]

Early stopping triggered at epoch 51!
Total time: 20.40465
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 12



 36%|███▋      | 51/140 [00:20<00:35,  2.53it/s]


Early stopping triggered at epoch 51!
Total time: 20.28369
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 13


 38%|███▊      | 53/140 [00:19<00:32,  2.65it/s]


Early stopping triggered at epoch 53!
Total time: 20.11522
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 14


 36%|███▋      | 51/140 [00:19<00:33,  2.63it/s]

Early stopping triggered at epoch 51!
Total time: 19.56097
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.160156GB
Training model for superclasses 0 and 15



 36%|███▋      | 51/140 [00:19<00:34,  2.61it/s]

Early stopping triggered at epoch 51!
Total time: 19.63516





torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.164062GB
Training model for superclasses 0 and 16


 36%|███▋      | 51/140 [00:20<00:35,  2.51it/s]

Early stopping triggered at epoch 51!
Total time: 20.44118
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.164062GB
Training model for superclasses 0 and 17



 36%|███▋      | 51/140 [00:20<00:35,  2.52it/s]

Early stopping triggered at epoch 51!
Total time: 20.40308
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.164062GB
Training model for superclasses 0 and 18



 36%|███▋      | 51/140 [00:20<00:35,  2.53it/s]

Early stopping triggered at epoch 51!
Total time: 20.27091
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.164062GB
Training model for superclasses 0 and 19



 38%|███▊      | 53/140 [00:20<00:33,  2.61it/s]

Early stopping triggered at epoch 53!
Total time: 20.47783





torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.164062GB
Training model for superclasses 1 and 2


 36%|███▋      | 51/140 [00:19<00:34,  2.58it/s]


Early stopping triggered at epoch 51!
Total time: 19.87965
torch.cuda.memory_allocated: 0.296266GB
torch.cuda.memory_reserved: 0.689453GB
torch.cuda.max_memory_reserved: 1.164062GB
Training model for superclasses 1 and 3


  2%|▏         | 3/140 [00:11<08:29,  3.72s/it]


KeyboardInterrupt: 

In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved