In [1]:
import torch
import torchvision
import torch.nn as nn
import time
import json
import datetime
from DeepLearning.Project1.data import *

In [2]:
# File based on https://github.com/huyvnphan/PyTorch_CIFAR10/
class VGG(nn.Module):
    """VGG-style classifier: feature extractor, adaptive average pool, MLP head.

    Args:
        features: module applied first to the input. The first linear layer is
            sized as ``512 * avgpool_size[0] * avgpool_size[1]``, so the module
            is expected to emit 512 channels.
        num_classes: number of outputs of the last linear layer.
        avgpool_size: ``(height, width)`` output size of the adaptive pooling.
    """

    def __init__(self, features, num_classes=10, avgpool_size=(1,1)):
        super().__init__()
        pooled_h, pooled_w = avgpool_size[0], avgpool_size[1]
        flat_dim = 512 * pooled_h * pooled_w

        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d(avgpool_size)

        # Layer order (and therefore the state_dict indices 0..6) is fixed.
        head_layers = [
            nn.Linear(flat_dim, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the classifier outputs for the input batch ``x``."""
        feature_map = self.features(x)
        pooled = self.avgpool(feature_map)
        # Keep dim 0 and flatten everything after it for the linear head.
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)

# Create the convolutional part of the VGG11_bn architecture
def make_vgg11_bn_layers(cfg = None, in_channels = 3):
    """Build the convolutional feature extractor described by ``cfg``.

    Args:
        cfg: sequence whose items are either the string ``"M"`` (a 2x2,
            stride-2 max-pool) or a number ``v`` (a 3x3, padding-1 convolution
            with ``v`` output channels followed by batch norm and an in-place
            ReLU). ``None`` selects the built-in VGG11 layout.
        in_channels: number of channels of the network input. Defaults to 3,
            the value that used to be hard-coded.

    Returns:
        ``nn.Sequential`` holding the layers in ``cfg`` order.
    """
    # Identity check instead of `== None`: equality can be overloaded (for
    # example element-wise on array-like configs), identity cannot.
    if cfg is None:
        cfg = [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"]

    layers = []
    for v in cfg:
        if v == "M":
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        layers.extend([
            nn.Conv2d(in_channels, v, kernel_size=3, padding=1),
            nn.BatchNorm2d(v),
            nn.ReLU(inplace=True),
        ])
        # The next convolution consumes what this one produced.
        in_channels = v
    return nn.Sequential(*layers)

 # Create VGG11_bn model
def vgg11_bn(device="cpu", num_classes=10):
    """Create a VGG model with the default (VGG11, batch-norm) feature layout.

    Args:
        device: device the returned model is moved to.
        num_classes: number of outputs of the classifier head.

    Returns:
        The constructed ``VGG`` module, located on ``device``.
    """
    model = VGG(make_vgg11_bn_layers(), num_classes=num_classes)
    # `device` used to be accepted but ignored, so the model always stayed on
    # the CPU. With the default "cpu" this call is a no-op.
    return model.to(device)

# Feature-extractor layouts for make_vgg11_bn_layers, keyed by depth.
# A number is a conv layer with that many output channels, "M" is a max-pool.
# Each key equals the number of convolutions listed plus the three linear
# layers of VGG.classifier (e.g. "11" = 8 convs + 3 linear).
vgg_cfg = {
    "8": [64, "M", 128, "M", 256, "M", 512, "M", 512, "M"],
    "11": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
    "13": [
        64, 64, "M",
        128, 128, "M",
        256, 256, "M",
        512, 512, "M",
        512, 512, "M",
    ],
    "16": [
        64, 64, "M",
        128, 128, "M",
        256, 256, 256, "M",
        512, 512, 512, "M",
        512, 512, 512, "M",
    ],
    "19": [
        64, 64, "M",
        128, 128, "M",
        256, 256, 256, 256, "M",
        512, 512, 512, 512, "M",
        512, 512, 512, 512, "M",
    ],
}



def eval_accuracy(model, dataloader, training_device='cpu'):
    """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

    The model is moved to ``training_device`` and evaluated in eval mode with
    gradients disabled. The predicted class is the argmax over dim 1 of the
    model output.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        dataloader: iterable of ``(inputs, labels)`` batches.
        training_device: device used for the model and every batch.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when the dataloader yields
        no samples (previously a ZeroDivisionError).
    """
    # Remember the caller's mode: evaluating must not silently switch a model
    # that was in eval mode back to training mode.
    was_training = model.training
    model.eval()

    correct = 0
    total = 0
    try:
        with torch.no_grad():
            model.to(training_device)
            for inputs, labels in dataloader:
                inputs = inputs.to(training_device)
                labels = labels.to(training_device)
                predictions = torch.argmax(model(inputs), dim=1)

                total += labels.numel()
                correct += torch.sum(predictions.eq(labels)).item()
    finally:
        # Restore the mode even if a batch raised.
        model.train(was_training)

    if total == 0:
        return 0.0
    return correct / total

In [3]:
def backup_to_ram(model):
    """Return an independent deep copy of ``model`` that has been moved to the CPU.

    The copy is made first and then moved, so the original object stays where
    it is.
    """
    import copy

    snapshot = copy.deepcopy(model)
    snapshot = snapshot.cpu()
    return snapshot

class EarlyStopper:
    """Track the best accuracy seen so far and signal when to stop training.

    Attributes:
        patience: number of consecutive non-improving calls after which
            ``should_continue`` returns False.
        current: number of consecutive non-improving calls so far.
        backup_method: callable used to snapshot the model on improvement.
        best_backup: snapshot taken at the best accuracy (``None`` until the
            first improvement that was given a model).
        best_accuracy: best accuracy seen; starts at 0, so only a strictly
            positive accuracy counts as the first improvement.
    """

    def __init__(self, patience = 3, backup_method=backup_to_ram):
        self.backup_method = backup_method
        self.patience = patience

        # Running state.
        self.current = 0
        self.best_accuracy = 0.
        self.best_backup = None

    def should_continue(self, accuracy, model = None):
        """Record ``accuracy`` and return whether training should go on.

        A strictly better accuracy resets the counter and, when ``model`` is
        given, stores a backup of it. Otherwise the counter is incremented and
        False is returned once it reaches ``patience``.
        """
        improved = self.best_accuracy < accuracy
        if not improved:
            self.current += 1
            return not (self.current >= self.patience)

        self.current = 0
        self.best_accuracy = accuracy
        if model is not None:
            self.best_backup = self.backup_method(model)
        return True

In [4]:
def train_one_epoch(model, optimizer, criterion, dataloader_train, training_device):
    """Run one optimisation pass over every batch of ``dataloader_train``.

    Puts ``model`` in training mode, then for each ``(inputs, labels)`` batch
    moves the tensors to ``training_device``, clears the gradients, computes
    ``criterion(model(inputs), labels)``, back-propagates and steps the
    optimizer. Returns nothing; ``model`` and ``optimizer`` are updated in place.
    """
    model.train()
    for batch_inputs, batch_labels in dataloader_train:
        batch_inputs = batch_inputs.to(training_device)
        batch_labels = batch_labels.to(training_device)

        optimizer.zero_grad()
        outputs = model(batch_inputs)
        batch_loss = criterion(outputs, batch_labels)
        batch_loss.backward()
        optimizer.step()

def train_one_run(model, optimizer, criterion,
                  dataloader_train, dataloader_val,
                  max_epochs, early_stopper, 
                  trajectory, 
                  training_device='cuda', mem_bg_allocated=0, mem_bg_reserved=0,
                  *_args, **_kwargs):
    """Train ``model`` for up to ``max_epochs`` epochs with early stopping.

    After every epoch the accuracy on the training and validation loaders is
    measured and one record is appended to ``trajectory``.

    Args:
        model, optimizer, criterion: passed on to ``train_one_epoch``.
        dataloader_train, dataloader_val: loaders used for training and for
            the per-epoch accuracy measurements.
        max_epochs: upper bound on the number of epochs.
        early_stopper: object with ``should_continue(accuracy, model)`` and a
            ``best_backup`` attribute.
        trajectory: list that receives one dict per epoch (mutated in place).
        training_device: device the model is trained and evaluated on.
        mem_bg_allocated, mem_bg_reserved: byte counts subtracted from the
            CUDA memory readings before they are converted to MiB.
        *_args, **_kwargs: accepted and ignored.

    Returns:
        On early stop, ``early_stopper.best_backup`` (the live model if no
        backup was ever stored). If ``max_epochs`` is exhausted, the model as
        it is after the last epoch, which is not necessarily the best one.
    """
    model.train()
    model.to(training_device)

    bytes_per_mb = 1024 * 1024

    for epoch in range(max_epochs):
        # Only the optimisation pass is timed, not the accuracy evaluation.
        start_time = time.time()
        train_one_epoch(model, optimizer, criterion, dataloader_train, training_device)
        end_time = time.time()

        training_accuracy = eval_accuracy(model, dataloader_train, training_device)
        validation_accuracy = eval_accuracy(model, dataloader_val, training_device)
        print("Epoch: {}, Accuracy on validation set: {}".format(epoch, validation_accuracy))

        # Memory is read from torch.cuda regardless of `training_device`.
        trajectory.append({
            "epoch": epoch,
            "train": training_accuracy,
            "validation": validation_accuracy,
            "start_time": start_time,
            "duration": end_time - start_time,
            "memory_allocated_mb": (torch.cuda.memory_allocated() - mem_bg_allocated) / bytes_per_mb,
            "memory_reserved_mb": (torch.cuda.memory_reserved() - mem_bg_reserved) / bytes_per_mb,
        })

        if not early_stopper.should_continue(validation_accuracy, model):
            print("Early stop")
            best_backup = early_stopper.best_backup
            # No backup exists if the stopper never saw an improvement (e.g.
            # the validation accuracy never rose above its initial best).
            # Returning None here would crash the caller's final evaluation.
            if best_backup is None:
                return model
            return best_backup

    return model

In [5]:
def run_train_experiment(arch_name, model_factory, aug_name, aug_factory, train_name, train_func, run):
    """Run one experiment and persist its report and model under ``experiments/``.

    Files are named ``experiments/train_<train>_aug_<aug>_arch_<arch>_<run>_``
    followed by ``report.json`` or ``model.pt``. If a non-empty report already
    exists the experiment is skipped.

    Args:
        arch_name, aug_name, train_name: labels used in the file name and report.
        model_factory, aug_factory: passed to ``train_func``.
        train_func: callable ``(aug_factory, model_factory)`` returning
            ``(model, trajectory, validation_accuracy)``.
        run: run identifier used in the file name and report.
    """
    import os

    path = f"experiments/train_{train_name}_aug_{aug_name}_arch_{arch_name}_{run}_"
    report_path = path + "report.json"

    # A non-empty report marks a finished experiment. A missing file (OSError)
    # or an empty one means the experiment still has to run.
    try:
        if os.stat(report_path).st_size != 0:
            print("Report exists already for " + path[:-1] + ". Skipping...")
            return
    except OSError:
        pass

    # Create the output directory before training. Otherwise a missing
    # directory is only noticed after the whole training run has finished.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    model, trajectory, validation_accuracy = train_func(aug_factory, model_factory)

    # Save the model first: the report is the completion marker, so it must
    # not exist if saving the model failed.
    torch.save(model, path + "model.pt")

    report = {
        "name": arch_name,
        "run": run,
        "augment": aug_name,
        "train": train_name, 
        "best_accuracy_validation": validation_accuracy,
        "time_generated": datetime.datetime.now().isoformat(),
        "trajectory": trajectory
    }
    with open(report_path, "w") as f:
        json.dump(report, f)

In [6]:
def _train_with_recipe(aug_factory, model_factory, lr, patience, bs=None,
                       max_epochs=200, device="cuda"):
    """Shared body of the training recipes: build, train, evaluate.

    Args:
        aug_factory: callable returning ``(train, test, val)`` loaders; called
            without arguments when ``bs`` is None, else as ``aug_factory(bs=bs)``.
        model_factory: zero-argument callable returning the model to train.
        lr: learning rate for the Adam optimizer.
        patience: patience of the ``EarlyStopper``.
        bs: optional batch size forwarded to ``aug_factory``.
        max_epochs: epoch limit passed to ``train_one_run``.
        device: device used for training and the final evaluation.

    Returns:
        ``(model, trajectory, validation_accuracy)``.
    """
    # Take the memory baseline before anything of this run is allocated so the
    # per-epoch readings can be reported relative to it.
    torch.cuda.empty_cache()
    mem_bg_allocated = torch.cuda.memory_allocated()
    mem_bg_reserved = torch.cuda.memory_reserved()

    loaders = aug_factory() if bs is None else aug_factory(bs=bs)
    train, _test, val = loaders  # the test split is not used here

    model = model_factory()
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    early_stopper = EarlyStopper(patience=patience)
    trajectory = []
    model = train_one_run(model, optimizer, criterion,
                          train, val,
                          max_epochs, early_stopper,
                          trajectory,
                          training_device=device,
                          mem_bg_allocated=mem_bg_allocated,
                          mem_bg_reserved=mem_bg_reserved)
    validation_accuracy = eval_accuracy(model, val, device)
    return model, trajectory, validation_accuracy


def fasterlearn(aug_factory, model_factory):
    """Recipe: Adam with lr=0.01, patience 5, the factory's default batch size."""
    return _train_with_recipe(aug_factory, model_factory, lr=0.01, patience=5)


def morepatient(aug_factory, model_factory):
    """Recipe: Adam with lr=0.001, patience 10, the factory's default batch size."""
    return _train_with_recipe(aug_factory, model_factory, lr=0.001, patience=10)


def smallbatch(aug_factory, model_factory):
    """Recipe: Adam with lr=0.001, patience 5, batch size 16."""
    return _train_with_recipe(aug_factory, model_factory, lr=0.001, patience=5, bs=16)


def bigbatch(aug_factory, model_factory):
    """Recipe: Adam with lr=0.001, patience 5, batch size 1024."""
    return _train_with_recipe(aug_factory, model_factory, lr=0.001, patience=5, bs=1024)

In [7]:
def flip(bs=128):
    """Named augmentation factory: delegate to ``augmented_cifar10_dataset_randomflip``.

    The helper is defined outside this notebook. Callers here unpack its
    result as ``(train, test, val)``. ``bs`` is forwarded unchanged.
    """
    loaders = augmented_cifar10_dataset_randomflip(bs=bs)
    return loaders

def smallrotate(bs=128):
    """Named augmentation factory: delegate to ``augmented_cifar10_dataset_rotate_randomapply``.

    The helper is defined outside this notebook. Callers here unpack its
    result as ``(train, test, val)``. ``bs`` is forwarded unchanged.
    """
    # First positional argument of the helper; presumably the rotation
    # magnitude in degrees -- confirm against the helper's definition.
    rotate_arg = 5
    loaders = augmented_cifar10_dataset_rotate_randomapply(rotate_arg, bs=bs)
    return loaders

def fliprotate(bs=128):
    """Named augmentation factory: delegate to ``augmented_cifar10_dataset_randomflip_rotate_randomapply``.

    The helper is defined outside this notebook. Callers here unpack its
    result as ``(train, test, val)``. ``bs`` is forwarded unchanged.
    """
    # First positional argument of the helper; presumably the rotation
    # magnitude in degrees -- confirm against the helper's definition.
    rotate_arg = 5
    loaders = augmented_cifar10_dataset_randomflip_rotate_randomapply(rotate_arg, bs=bs)
    return loaders


In [8]:
def vgg_from_cfg(cfg_key, device="cuda", num_classes=10):
    """Build the VGG variant registered under ``cfg_key`` in ``vgg_cfg``.

    Args:
        cfg_key: key of ``vgg_cfg``. A key that is not found as given is
            retried as ``str(cfg_key)``, so ``13`` works like ``"13"``.
        device: device the model is moved to (default ``"cuda"``, the value
            that used to be hard-coded).
        num_classes: number of outputs of the classifier head.

    Returns:
        The constructed model, located on ``device``.

    Raises:
        KeyError: if no layout is registered under ``cfg_key``.
    """
    key = cfg_key if cfg_key in vgg_cfg else str(cfg_key)
    if key not in vgg_cfg:
        raise KeyError(
            "unknown VGG config {!r}; available: {}".format(cfg_key, sorted(vgg_cfg, key=str))
        )

    model = VGG(
        make_vgg11_bn_layers(cfg=vgg_cfg[key]),
        num_classes=num_classes,
        avgpool_size=(1,1)
    )
    return model.to(device)

def vgg13():
    """Zero-argument model factory for the ``"13"`` layout of ``vgg_cfg``."""
    cfg_key = "13"
    return vgg_from_cfg(cfg_key)

def vgg16():
    """Zero-argument model factory for the ``"16"`` layout of ``vgg_cfg``."""
    cfg_key = "16"
    return vgg_from_cfg(cfg_key)


In [9]:
# Every entry matches the positional signature of run_train_experiment:
# (arch_name, model_factory, aug_name, aug_factory, train_name, train_func, run).

# Main grid: run (outermost) x architecture x augmentation x training recipe.
experiment_list = [
    (
        archfactory.__name__, archfactory,
        augfactory.__name__, augfactory,
        trainfunc.__name__, trainfunc,
        str(run),
    )
    for run in range(1, 10)
    for archfactory in [vgg13, vgg16]
    for augfactory in [flip, smallrotate, fliprotate]
    for trainfunc in [fasterlearn, morepatient]
]

# Batch-size experiments: memory is the only interesting part for batch size,
# so they use no augmentation ("none") and a single run.
experiment_list.extend(
    (
        archfactory.__name__, archfactory,
        "none", load_cifar10_dataloaders_validation,
        trainfunc.__name__, trainfunc,
        str(1),
    )
    for archfactory in [vgg13, vgg16]
    for trainfunc in [smallbatch, bigbatch]
)

In [10]:
# Sanity check of the grid size: 9 runs x 2 architectures x 3 augmentations
# x 2 training recipes = 108, plus 2 architectures x 2 batch-size recipes = 4,
# i.e. 112 experiments in total.
len(experiment_list)

112

In [None]:
import traceback

# Run every experiment in order. Each tuple is unpacked into
# run_train_experiment; experiment[::2] picks the printable fields
# (architecture name, augmentation name, recipe name, run).
for experiment in experiment_list:
    print(
        "Time:", datetime.datetime.now().isoformat(),
        *experiment[::2]
    )
    try:
        run_train_experiment(*experiment)
    except Exception as e:
        # Best effort: one failing experiment must not abort the whole batch.
        print("Error occured, skipping...\n", repr(e))
        # repr(e) alone does not say where the failure happened; keep the full
        # traceback in the output so unattended runs can be diagnosed later.
        traceback.print_exc()

Time: 2023-03-20T14:31:52.355162 vgg13 flip fasterlearn 1
Files already downloaded and verified
Files already downloaded and verified
