In [1]:
## model design: https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-build-a-convnet-for-cifar-10-and-cifar-100-classification-with-keras.md
# NOTE(review): kept first on purpose -- import order is preserved in case this
# project-local module has import-time side effects (e.g. the device banner).
import infrastructure as inf

import gc
import os
import time
from argparse import ArgumentParser
from typing import List

import numpy as np
from tqdm import tqdm

import torch
import torch as ch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.cuda.amp import GradScaler, autocast
from torch.nn import CrossEntropyLoss, Conv2d, BatchNorm2d
from torch.optim import SGD, lr_scheduler
from torchvision.transforms import v2

from fastargs import get_current_config, Param, Section
from fastargs.decorators import param
from fastargs.validation import And, OneOf

from ffcv.fields import IntField, RGBImageField
from ffcv.fields.decoders import IntDecoder, SimpleRGBImageDecoder
from ffcv.loader import Loader, OrderOption
from ffcv.pipeline.operation import Operation
from ffcv.transforms import RandomHorizontalFlip, Cutout, \
    RandomTranslate, Convert, ToDevice, ToTensor, ToTorchImage
from ffcv.transforms.common import Squeeze
from ffcv.writer import DatasetWriter

Using device cuda:0


In [2]:
# Device that generate_model() moves new models to; taken from the
# project-local infrastructure module.
device = inf.device

In [3]:
# Define the CNN model in PyTorch
class MyCNNModel(nn.Module):
    """Small three-stage convolutional classifier.

    Every stage is an unpadded 3x3 convolution followed by ReLU and 2x2 max
    pooling; the flattened feature map is then fed through three linear
    layers. ``fc1`` takes 1024 features, which is what the convolutional
    stack produces for 3x32x32 inputs (256 channels * 2 * 2).

    Args:
        no_classes: number of logits emitted by the final linear layer.
    """

    def __init__(self, no_classes):
        super().__init__()
        # Spatial size for a 32x32 input: 32 -> 30 -> 15 -> 13 -> 6 -> 4 -> 2.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3))
        self.conv2 = nn.Conv2d(64, 128, kernel_size=(3, 3))
        self.conv3 = nn.Conv2d(128, 256, kernel_size=(3, 3))
        self.pool = nn.MaxPool2d(kernel_size=(2, 2))
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(1024, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, no_classes)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for the batch ``x``."""
        features = x
        for conv in (self.conv1, self.conv2, self.conv3):
            features = self.pool(F.relu(conv(features)))
        hidden = self.flatten(features)
        for dense in (self.fc1, self.fc2):
            hidden = F.relu(dense(hidden))
        return self.fc3(hidden)

def generate_model(output_dim=100):
    """Build a fresh ``MyCNNModel`` with ``output_dim`` logits and move it to the module-level ``device``."""
    return MyCNNModel(output_dim).to(device)

In [4]:
def train(model, loaders, lr=0.1, epochs=100, momentum=0.9, weight_decay=0.0001,reduce_patience=5, reduce_factor=0.2,tracking_freq=5,do_tracking=True,verbose=True):
    """Train ``model`` with SGD and ReduceLROnPlateau under CUDA mixed precision.

    Args:
        model: module to optimise (updated in place and also returned).
        loaders: mapping with a 'train' entry whose iteration yields
            ``(images, labels)`` batches; batches are passed to the model as-is
            (no device transfer happens here). When ``do_tracking`` is set the
            mapping is also handed to ``evaluate``.
        lr, momentum, weight_decay: SGD hyper-parameters.
        epochs: number of passes over ``loaders['train']``.
        reduce_patience, reduce_factor: ``patience`` / ``factor`` given to
            ReduceLROnPlateau.
        tracking_freq: call ``evaluate`` every ``tracking_freq`` epochs.
        do_tracking: set to False to skip the periodic ``evaluate`` calls.
        verbose: forwarded to ``evaluate``.

    Returns:
        ``(model, train_dict)`` where ``train_dict`` holds the hyper-parameters,
        the mean training loss of every epoch under 'train_loss', and the
        accuracies returned by ``evaluate`` under 'train_acc_top1',
        'train_acc_top5', 'val_acc_top1' and 'val_acc_top5'.
    """
    # dictionary to keep track of training params and results
    train_dict = {
        'lr': lr,
        'epochs': epochs,
        'momentum': momentum,
        'weight_decay': weight_decay,
        'reduce_patience': reduce_patience,
        'reduce_factor': reduce_factor,
        # training loss is tracked every epoch
        'train_loss': [],
        # the accuracies are tracked every `tracking_freq` epochs if do_tracking is True
        'train_acc_top1': [],
        'train_acc_top5': [],
        'val_acc_top1': [],
        'val_acc_top5': [],
    }

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
    criterion = ch.nn.CrossEntropyLoss() # takes raw logits, no softmax needed
    # The scheduler's `verbose` flag is deprecated in recent PyTorch releases, so
    # learning-rate reductions are logged explicitly below instead.
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=reduce_patience, factor=reduce_factor)
    # autocast runs the forward pass in reduced precision; without loss scaling
    # small gradients can underflow to zero. The scaler is a no-op without CUDA.
    scaler = GradScaler(enabled=ch.cuda.is_available())
    len_train_loader = len(loaders['train'])

    for i in range(epochs):
        model.train()  # evaluate() leaves the model in eval mode
        running_loss = 0.0
        for ims, labs in loaders['train']:
            optimizer.zero_grad(set_to_none=True)
            with autocast():
                out = model(ims)
                loss = criterion(out, labs)

            scaler.scale(loss).backward()
            scaler.step(optimizer)  # skips the step if inf/NaN gradients were found
            scaler.update()
            running_loss += loss.item()

        # The plateau detection is driven by the summed training loss of the epoch.
        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(running_loss)
        for group_idx, (old_lr, group) in enumerate(zip(lrs_before, optimizer.param_groups)):
            new_lr = group['lr']
            if new_lr != old_lr:
                print(f'Epoch {i+1}: reducing learning rate of group {group_idx} to {new_lr:.4e}.')

        # save training loss
        epoch_loss = running_loss/len_train_loader
        print(f'Epoch {i+1}/{epochs}, Loss: {epoch_loss}')
        train_dict['train_loss'].append(epoch_loss)
        # keep track of other metrics
        if do_tracking and (i+1)%tracking_freq == 0:
            train_top1, train_top5, val_top1, val_top5 = evaluate(model, loaders, lr_tta=False,verbose=verbose)
            train_dict['train_acc_top1'].append(train_top1)
            train_dict['train_acc_top5'].append(train_top5)
            train_dict['val_acc_top1'].append(val_top1)
            train_dict['val_acc_top5'].append(val_top5)
    return model, train_dict

def evaluate(model, loaders, lr_tta=False,verbose=True):
    """Measure top-1 and top-5 accuracy (in percent) on the 'train' and 'test' loaders.

    Args:
        model: module to evaluate; it is switched to eval mode and left there.
        loaders: mapping with 'train' and 'test' entries yielding
            ``(images, labels)`` batches.
        lr_tta: test-time augmentation -- when set, the logits of the batch
            flipped along its last dimension are added to the plain logits.
        verbose: print one summary line per split.

    Returns:
        ``(train_top1, train_top5, val_top1, val_top5)``; the ``val_*`` values
        come from the 'test' loader.
    """
    model.eval()
    results = {'train': (0., 0.), 'test': (0., 0.)}
    with ch.no_grad():
        for split in ('train', 'test'):
            hits_top1 = hits_top5 = seen = 0.
            for ims, labs in loaders[split]:
                with autocast():
                    logits = model(ims)
                    if lr_tta:
                        logits += model(ims.flip(-1))
                    # top-1: the highest-scoring class must match the label
                    predicted = logits.argmax(1)
                    hits_top1 += predicted.eq(labs).sum().cpu().item()
                    seen += ims.shape[0]
                    # top-5: the label must be among the five highest-scoring classes
                    five_best = logits.argsort(1)[:, -5:]
                    hits_top5 += five_best.eq(labs.unsqueeze(-1)).sum().cpu().item()
            top1_pct = hits_top1 / seen * 100
            top5_pct = hits_top5 / seen * 100
            if verbose:
                print(f'{split} (acc) top-1: {top1_pct:.1f}, top-5: {top5_pct:.1f} %')
            results[split] = (top1_pct, top5_pct)
    train_top1, train_top5 = results['train']
    val_top1, val_top5 = results['test']
    return train_top1, train_top5, val_top1, val_top5

In [None]:
# Fail fast: make sure the checkpoint directory exists *before* the long
# training run, so a missing folder cannot cost us the trained weights at save time.
os.makedirs("./models", exist_ok=True)

loaders, start_time = inf.make_dataloaders(batch_size=256, num_workers=12)
model = generate_model()
print(model)
# load model from checkpoint stored at ./models/model.pt
#model.load_state_dict(torch.load("./models/model.pt"))
model, tracked_params = train(model, loaders,epochs=80,tracking_freq=5,reduce_factor=0.2,do_tracking=True,verbose=True)
print(f'Total time: {time.time() - start_time:.5f}')
evaluate(model, loaders)

# store the model
torch.save(model.state_dict(), "./models/model.pt")
# save the tracked params; np.save pickles the dict into a 0-d object array, so
# read it back with np.load(path, allow_pickle=True).item()
np.save("./models/tracked_params.npy", tracked_params)

# visualize the tracked params from training

In [6]:
# Train one 10-class model for every unordered pair (i, j) of superclasses.
NUM_SUPERCLASSES = 20  # presumably the 20 CIFAR-100 superclasses -- TODO confirm
os.makedirs("./models", exist_ok=True)  # checkpoints below are written here
for i in range(0, NUM_SUPERCLASSES):
    for j in range(i+1, NUM_SUPERCLASSES):
        paths =  [f'./data/subsets/{dataset_name}_superclass_{i}_{j}.beton' for dataset_name in ["train","test"]]
        loaders, start_time = inf.make_dataloaders(paths[0],paths[1])
        model = generate_model(10)
        model, tracked_params = train(model, loaders,epochs=20,tracking_freq=5,reduce_factor=0.2,do_tracking=True,verbose=True)
        print(f'Total time: {time.time() - start_time:.5f}')
        evaluate(model, loaders)
        # store the model
        torch.save(model.state_dict(), f'./models/model_{i}_{j}.pt')
        # save the tracked params (a pickled dict; load with allow_pickle=True)
        np.save(f"./models/tracked_params{i}_{j}.npy", tracked_params)

        # Once done, drop the model, tracked params and loaders. `del` only
        # removes the names: objects kept alive by reference cycles are freed
        # by the collector, and empty_cache() can only hand back GPU memory
        # whose tensors are already gone -- so collect first.
        del model, tracked_params, loaders, start_time
        gc.collect()
        torch.cuda.empty_cache()

Epoch 1/20, Loss: 2.281559793572677
Epoch 2/20, Loss: 2.2788657640155994
Epoch 3/20, Loss: 2.258342240986071
Epoch 4/20, Loss: 2.048417881915444
Epoch 5/20, Loss: 1.9063385411312705
train (acc) top-1: 26.7, top-5: 84.7 %
test (acc) top-1: 25.7, top-5: 85.1 %
Epoch 6/20, Loss: 1.8245369446905035
Epoch 7/20, Loss: 1.7192870503977726
Epoch 8/20, Loss: 1.663450379120676
Epoch 9/20, Loss: 1.6124906790883917
Epoch 10/20, Loss: 1.598766421016894
train (acc) top-1: 43.4, top-5: 90.7 %
test (acc) top-1: 42.1, top-5: 90.5 %
Epoch 11/20, Loss: 1.550866911285802
Epoch 12/20, Loss: 1.5248469239787052
Epoch 13/20, Loss: 1.4818741710562455
Epoch 14/20, Loss: 1.4572236600675081
Epoch 15/20, Loss: 1.4273042427866083
train (acc) top-1: 49.8, top-5: 93.5 %
test (acc) top-1: 48.7, top-5: 92.1 %
Epoch 16/20, Loss: 1.3779754262221486
Epoch 17/20, Loss: 1.3585858345031738
Epoch 18/20, Loss: 1.3211899933062101
Epoch 19/20, Loss: 1.2716670161799382
Epoch 20/20, Loss: 1.339365043138203
train (acc) top-1: 52.2, 

Exception in thread Thread-245:
Traceback (most recent call last):
  File "/home/janulm/miniconda3/envs/ffcv_env/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/janulm/miniconda3/envs/ffcv_env/lib/python3.10/site-packages/ffcv/loader/epoch_iterator.py", line 84, in run
    result = self.run_pipeline(b_ix, ixes, slot, events[slot])
  File "/home/janulm/miniconda3/envs/ffcv_env/lib/python3.10/site-packages/ffcv/loader/epoch_iterator.py", line 146, in run_pipeline
    results = stage_code(**args)
  File "", line 2, in stage_code_1
  File "/home/janulm/miniconda3/envs/ffcv_env/lib/python3.10/site-packages/ffcv/transforms/ops.py", line 91, in to_torch_image
    assert inp.is_contiguous(memory_format=ch.channels_last)
AssertionError


KeyboardInterrupt: 