##  Let us begin

Descriptions can be added here later for users who load this notebook interactively.


In [1]:
# List the GPUs visible to this kernel (shell command; requires the NVIDIA driver tools).
!nvidia-smi -L

GPU 0: NVIDIA RTX A6000 (UUID: GPU-8a75c7fd-6edd-bf0e-2d95-54106b2b6ada)


In [2]:
# import os
# import torch
# !pip install GPUtil

Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25ldone
[?25h  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7394 sha256=a43ed625e9ca747d311829fe8ce7207ce22a077bd656ce27387e76ebdf27e2ad
  Stored in directory: /root/.cache/pip/wheels/2b/b5/24/fbb56595c286984f7315ee31821d6121e1b9828436021a88b3
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0
[0m

In [3]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import json
import numpy as np
import argparse
import torch
import sys
import importlib as ipb
import pickle
import torch.nn as nn
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler
import GPUtil
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
# This module-level `device` is read by the training/evaluation code below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
class Logger(object):
    """Collect per-run ``(train, test)`` result pairs and print summaries.

    ``self.results[run]`` is a list of two-element results appended in the
    order they were logged. Values are multiplied by 100 when printed.
    """

    def __init__(self, runs, info=None):
        """Allocate one empty result list per run.

        Args:
            runs: Number of independent runs to track.
            info: Arbitrary metadata stored on the instance as ``self.info``.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def pickle(self, key_save):
        """Serialize this logger to the file path ``key_save``."""
        # Context manager so the handle is closed even if dump() raises.
        with open(key_save, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def unpickle(self, key_save):
        """Load and return the object pickled at ``key_save``.

        NOTE: ``pickle.load`` can execute arbitrary code; only use this on
        files produced by a trusted run of this notebook.
        """
        with open(key_save, 'rb') as f:
            return pickle.load(f)

    def add_result(self, run, result):
        """Append a two-element ``result`` to the list of run index ``run``.

        Raises:
            AssertionError: if ``result`` does not have exactly two entries
                or ``run`` is not a valid run index.
        """
        assert len(result) == 2
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print summary statistics for one run, or mean ± std over all runs.

        "Highest" is the per-column maximum. "Final" is the row at which the
        train column (column 0) is highest, so "Final Train" always equals
        "Highest Train" (there is no validation split to select on).

        Args:
            run: Run index to summarize; ``None`` aggregates over all runs
                (this requires every run to hold the same number of results,
                because the results are stacked into one tensor).
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            argmax = result[:, 0].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Test: {result[:, 1].max():.2f}')
            # Same as highest train, as we have no validation data
            print(f'  Final Train: {result[argmax, 0]:.2f}')
            print(f'   Final Test: {result[argmax, 1]:.2f}')
        else:
            result = 100 * torch.tensor(self.results)
            best_results = []
            for r in result:
                best_step = r[:, 0].argmax()
                best_results.append((r[:, 0].max().item(),
                                     r[:, 1].max().item(),
                                     r[best_step, 0].item(),
                                     r[best_step, 1].item()))

            best_result = torch.tensor(best_results)

            print('All runs:')
            # Labels keep their leading spaces so the printed columns line up.
            labels = ('Highest Train', 'Highest Test',
                      '  Final Train', '   Final Test')
            for column, label in enumerate(labels):
                r = best_result[:, column]
                print(f'{label}: {r.mean():.2f} ± {r.std():.2f}')


def test(model, train_loader, test_loader):
    """Evaluate ``model`` on both loaders and return their accuracies.

    The model is switched to eval mode and gradients are disabled. Reads the
    notebook globals ``args`` (for ``args.FC``), ``input_size`` and ``device``.

    Returns:
        ``[train_accuracy, test_accuracy]`` as fractions (correct / total).
    """
    model.eval()
    accuracies = []
    with torch.no_grad():
        for data_loader in (train_loader, test_loader):
            n_correct, n_seen = 0, 0
            for images, labels in data_loader:
                if args.FC:
                    # Fully connected nets take flattened inputs.
                    images = images.reshape(-1, input_size).to(device)
                images = images.to(device)
                labels = labels.to(device)
                # Predicted class = index of the largest output per sample.
                predicted = torch.max(model(images).data, 1)[1]
                n_seen += labels.size(0)
                n_correct += (predicted == labels).sum().item()
            accuracies.append(n_correct / n_seen)
    return accuracies  # train accuracy, then test accuracy

### Training functions (standard and SVI)

In [5]:
def train(model, images, labels):
    """Run one forward/backward pass with cross-entropy loss.

    Puts ``model`` in training mode and back-propagates the loss. This
    function neither zeroes gradients nor steps an optimizer; the caller
    does both.

    Returns:
        The scalar loss of this batch as a Python float.
    """
    model.train()
    logits = model(images)
    # Alternative objective kept for reference:
    # loss = F.mse_loss(out.float(), F.one_hot(labels).float())/2
    batch_loss = nn.functional.cross_entropy(logits, labels)
    batch_loss.backward()
    return batch_loss.item()


def train_SVI(model, images, labels):
    """Run one SVI training pass on ``model`` and return the cross-entropy loss.

    ``model`` must provide ``layers_Xtilde``, ``layers_grad``, ``on_training``
    and ``turn_on_off_grad`` (as ``LeNet5_SVI`` does). The steps are:

    1. Reset the per-layer buffers and enable recording in ``forward``.
    2. Forward pass and cross-entropy loss.
    3. First backward pass with ``requires_grad`` switched off on all
       parameters; the gradients of the recorded layer outputs are read
       afterwards. Presumably this keeps the first pass from contributing to
       the parameter gradients -- TODO confirm.
    4. Build ``loss_tilde`` as the sum over layers of
       ``(Xtilde * detached_output_grad).sum()``, re-enable ``requires_grad``
       and back-propagate ``loss_tilde``.

    This function neither zeroes gradients nor steps an optimizer. Reads the
    notebook global ``device``.

    Returns:
        The cross-entropy loss (not ``loss_tilde``) as a Python float.
    """
    #### New lines for SVI ####
    # NOTE: the buffers must be reset on every call, otherwise they keep
    # growing with each forward pass.
    model.layers_Xtilde = []
    model.layers_grad = []
    model.on_training = True
    #### End #####
    model.train()
    out = model(images)
    cost = nn.CrossEntropyLoss()
    loss = cost(out, labels)
    # Alternative objective kept for reference:
    # loss = F.mse_loss(out.float(), F.one_hot(labels).float())/2
    #### New lines for SVI ####
    # Switch requires_grad off on every parameter before the first backward pass.
    model.turn_on_off_grad(on = False) 
    # Debug aid (disabled):
    # print('#### Grad of model params before SVI')
    # for name, param in model.named_parameters():
    #     if param.grad is not None:
    #         print(name, param.grad.norm())
    #### End #####
    loss.backward()
    #### New lines for SVI ####
    # NOTE(review): if the model recorded no layers, loss_tilde stays the int 0
    # and loss_tilde.backward() below raises AttributeError.
    loss_tilde = 0
    for Xlplus1, Xlplus1grad in zip(model.layers_Xtilde, model.layers_grad):
        # Gradient of the loss w.r.t. the recorded layer output, detached so it
        # acts as a constant weight in loss_tilde.
        Xlplus1grad = Xlplus1grad.grad.detach().to(device)
        loss_tilde += (Xlplus1*Xlplus1grad).sum()
    model.turn_on_off_grad(on = True)
    loss_tilde.backward()  # To get update direction by MVI for all layers at once
    # Debug aid (disabled):
    # print('#### Grad of model params after SVI')
    # for name, param in model.named_parameters():
    #     if param.grad is not None:
    #         print(name, param.grad.norm())
    # raise Exception('Stop here')
    model.on_training = False  # To avoid additional .retain_grad()
    #### End #####
    return loss.item()

### LeNet (including the SVI variant)

In [6]:
class LeNet5(nn.Module):
    """LeNet-5 style network: two conv + batch-norm stages, then three linear layers.

    Adapted from
    https://towardsdatascience.com/implementing-yann-lecuns-lenet-5-in-pytorch-5e05a0911320

    The first linear layer takes 400 features, i.e. 16 channels of 5x5,
    which is what 32x32 inputs produce after the two conv/pool stages.
    """

    def __init__(self, in_channels, n_classes):
        super().__init__()

        # `convs` holds the feature-extracting convolutions followed by the
        # linear classifier layers. Layers are created in this order so seeded
        # initialisation and state_dict keys stay the same.
        self.convs = torch.nn.ModuleList([
            nn.Conv2d(in_channels=in_channels, out_channels=6,
                      kernel_size=5, stride=1),
            nn.Conv2d(in_channels=6, out_channels=16,
                      kernel_size=5, stride=1),
            nn.Linear(in_features=400, out_features=120),
            nn.Linear(in_features=120, out_features=84),
            nn.Linear(in_features=84, out_features=n_classes),
        ])
        # One batch-norm per convolution.
        self.bns = torch.nn.ModuleList([
            torch.nn.BatchNorm2d(6),
            torch.nn.BatchNorm2d(16),
        ])

    def forward(self, x):
        """Return the raw class scores (no softmax) for a batch of images."""
        last = len(self.convs) - 1
        for idx, layer in enumerate(self.convs):
            if idx in (1, 2):
                # 2x2 average pooling after each convolution stage.
                x = nn.functional.avg_pool2d(x, 2, stride=2)
            if idx == 2:
                # Flatten before the first linear layer.
                x = torch.flatten(x, 1)
            x = layer(x)
            if idx < 2:
                x = self.bns[idx](x)
            if idx != last:
                x = nn.functional.relu(x)
        return x


class LeNet5_SVI(nn.Module):
    """LeNet-5 variant that records the per-layer tensors used by ``train_SVI``.

    The layers match ``LeNet5`` (same attribute names, so state dicts are
    interchangeable). While ``on_training`` is True, ``forward`` additionally
    fills two lists, one entry per layer flagged in ``layers_with_SVI``:

    * ``layers_Xtilde``: the layer output (after batch-norm for the conv
      layers, before ReLU) computed from a *detached* copy of the layer input.
    * ``layers_grad``: the regular layer output (after ReLU, except for the
      last layer) with ``retain_grad()`` enabled so its ``.grad`` can be read
      after a backward pass.

    The lists are only appended to here; the caller must reset them before
    each forward pass.
    """

    def __init__(self, in_channels, n_classes):
        super(LeNet5_SVI, self).__init__()

        # `convs` holds the feature-extracting convolutions followed by the
        # linear classifier layers.
        self.convs = torch.nn.ModuleList()
        self.convs.append(nn.Conv2d(in_channels=in_channels,
                          out_channels=6, kernel_size=5, stride=1))
        self.convs.append(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1))
        self.convs.append(nn.Linear(in_features=400, out_features=120))
        self.convs.append(nn.Linear(in_features=120, out_features=84))
        self.convs.append(nn.Linear(in_features=84, out_features=n_classes))
        self.bns = torch.nn.ModuleList()
        self.bns.append(torch.nn.BatchNorm2d(6))
        self.bns.append(torch.nn.BatchNorm2d(16))

        #### New lines for SVI ####
        # layers_with_SVI[i] is True when layer i takes part in SVI.
        # TODO: later, can treat this as an input to be decided (as some layers need not SVI)
        self.layers_with_SVI = [True for _ in range(len(self.convs))]
        # Pre-activation \tilde{X}_{l+1}, ONLY for layers i where self.layers_with_SVI[i] == True
        self.layers_Xtilde = []
        # Tensors whose .grad gives the gradient of L w.r.t. X_{l+1}, same layers as above
        self.layers_grad = []
        # When False, forward() records nothing.
        self.on_training = True
        #### End #####

    #### New lines for SVI ####
    def turn_on_off_grad(self, on = True):
        """Set ``requires_grad`` to ``on`` for every parameter (used to avoid gradient accumulation)."""
        for param in self.parameters():
            param.requires_grad = on

    #### End #####

    def forward(self, x):
        """Return the raw class scores; record SVI tensors while ``on_training`` is True."""
        for i, conv in enumerate(self.convs):
            if 0 < i <= 2:
                x = nn.functional.avg_pool2d(x, 2, stride=2)
            if i == 2:
                x = torch.flatten(x, 1)
            record = self.layers_with_SVI[i] and self.on_training
            #### New lines for SVI ####
            if record:
                # Recompute this layer from a detached copy of its input, so a
                # backward pass through the recorded output stops at this layer
                # and does not reach earlier layers. The copy stays on the
                # device of `x`, so no global `device` is needed.
                x_tmp = conv(x.detach().clone())
                if i < 2:
                    x_tmp = self.bns[i](x_tmp)
                self.layers_Xtilde.append(x_tmp)
            #### End #####
            x = conv(x)
            if i < 2:
                x = self.bns[i](x)
            if i < len(self.convs) - 1:
                x = nn.functional.relu(x)
            #### New lines for SVI ####
            if record:
                x.retain_grad()  # To get the gradient with respect to output
                self.layers_grad.append(x)
            #### End #####
        return x

### Utility functions, including data loading

In [None]:
def mem_report():
    """Print free memory for the active device.

    On CUDA, prints one line per GPU reported by GPUtil. Otherwise prints the
    available CPU RAM, which needs the optional ``psutil`` and ``humanize``
    packages. Reads the notebook global ``device``.
    """
    if device.type == 'cuda':
        for i, gpu in enumerate(GPUtil.getGPUs()):
            print('GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'.format(
                i, gpu.memoryFree, gpu.memoryTotal, gpu.memoryUtil*100))
    else:
        # These packages are not imported at the top of the notebook, so the
        # CPU branch used to fail with NameError. Import them here and degrade
        # gracefully when they are not installed.
        try:
            import humanize
            import psutil
        except ImportError:
            print("CPU RAM Free: unavailable "
                  "(install 'psutil' and 'humanize' to report it)")
            return
        print("CPU RAM Free: "
              + humanize.naturalsize(psutil.virtual_memory().available))

In [None]:
import torchvision
import torchvision.transforms as transforms
def subset_data(data, frac=1, seed=1103):
    """Return a reproducible random subset holding a fraction of ``data``.

    Args:
        data: Any indexable dataset with a length.
        frac: Fraction of items to keep; ``int(frac * len(data))`` items are
            drawn without replacement.
        seed: Seed for the index selection. The default reproduces the
            selection of the earlier global-seed implementation.

    Returns:
        A ``torch.utils.data.Subset`` over the selected indices.
    """
    # A private RandomState draws the same indices as seeding the global
    # NumPy RNG, but without resetting that global state on every call.
    rng = np.random.RandomState(seed)
    idx = rng.choice(len(data), int(frac * len(data)), replace=False)
    return torch.utils.data.Subset(data, idx)
# Dataset selection: 'MNIST_batched' or 'CIFAR10_batched'.
data_fixed = 'MNIST_batched'
if 'MNIST' in data_fixed:
    # MNIST images are resized to 32x32 before being converted and normalised.
    # NOTE(review): train and test use different normalisation constants;
    # confirm this is intended rather than reusing the training statistics.
    mnist_train_transform = transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ])
    mnist_test_transform = transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1325,), std=(0.3105,)),
    ])
    train_dataset0 = torchvision.datasets.MNIST(
        root='./data', train=True,
        transform=mnist_train_transform, download=True)
    test_dataset0 = torchvision.datasets.MNIST(
        root='./data', train=False,
        transform=mnist_test_transform, download=True)
    num_classes, in_channels = 10, 1
else:
    # NOTE, CIFAR10 has color channels, so input size = 3*32*32 with FC net
    # NOTE(review): CIFAR10 uses the absolute root '/data' while MNIST uses
    # './data'; confirm the difference is intended.
    train_dataset0 = torchvision.datasets.CIFAR10(
        root='/data', train=True,
        transform=transforms.ToTensor(), download=True)
    test_dataset0 = torchvision.datasets.CIFAR10(
        root='/data', train=False,
        transform=transforms.ToTensor())
    num_classes, in_channels = 10, 3

# Number of scalar values in one (transformed) input image.
input_size = train_dataset0[0][0].numel()

# Fraction of each split to keep. Smaller values (0.1 for MNIST, 0.2 for
# CIFAR10) were used previously; the full data is used now.
frac = 1
train_dataset = subset_data(train_dataset0, frac=frac)
test_dataset = subset_data(test_dataset0, frac=frac)
print(len(train_dataset))
print(len(test_dataset))
# Data loaders (i.e., the split into batches) are created in the training cell below.

Start testing

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# SVI_ls and SVI_pause are consumed pairwise, and each pair is run with every
# optimizer in optim_ls:
#   use_SVI=False                 -> ordinary SGD or Adam
#   use_SVI=True, pause_SVI=False -> SVI for the whole run
#   use_SVI=True, pause_SVI=True  -> SVI only as a warm start
# Full sweep (currently disabled):
# SVI_ls = [True, True, False]
# SVI_pause = [False, True, False]
# optim_ls = ['SGD', 'Adam']
SVI_ls = [True]
SVI_pause = [False]
optim_ls = ['SGD']
lr, hidden_channels = 0.001, 512
FC_only = False  # Use all FC layers if True, o/w use LeNet
dataname = data_fixed
if not FC_only:
    dataname = data_fixed + 'LeNet_'
num_runs = 3
batch_size = 128
num_epochs = 20
# Evaluate every X mini-batches; at least 1 so the modulo below never divides by zero.
num_log_steps = max(1, len(train_dataset) // batch_size)


def _build_model(args, use_svi):
    """Return a freshly initialised network (on CPU) for the current configuration."""
    if args.FC:
        # NOTE(review): FCnet / FCnet_SVI are not defined in the visible part of
        # this notebook; FC_only=True needs them to be defined elsewhere.
        net_cls = FCnet_SVI if use_svi else FCnet
        return net_cls(input_size, args.hidden_channels,
                       num_classes, args.num_layers, args.dropout)
    net_cls = LeNet5_SVI if use_svi else LeNet5
    return net_cls(in_channels, num_classes)


def _build_optimizer(args, model):
    """Return a new optimizer over ``model``'s parameters, as selected by ``args.optimizer``."""
    if args.optimizer == 'SGD':
        return torch.optim.SGD(
            model.parameters(), lr=args.lr, momentum=args.momentum, nesterov=True)
    return torch.optim.Adam(model.parameters(), lr=args.lr)


if __name__ == "__main__":
    Final_result = {}  # For quick check at the end of training
    for use_SVI, pause_SVI in zip(SVI_ls, SVI_pause):
        for optim_name in optim_ls:
            result_dict = {'SVI-SGD': [],
                           'SVI_warmstart-SGD': [],
                           'SGD': [],
                           'SVI-Adam': [],
                           'SVI_warmstart-Adam': [],
                           'Adam': []}
            # argparse is only used as a container of defaults here:
            # parse_args(args=[]) never reads the command line, so the
            # `type=bool` arguments always take their default values.
            parser = argparse.ArgumentParser(
                description=dataname)
            parser.add_argument('--log_steps', type=int, default=num_log_steps)
            parser.add_argument('--num_layers', type=int, default=4)
            parser.add_argument('--dropout', type=float, default=0.25)
            parser.add_argument('--lr', type=float, default=lr)
            parser.add_argument('--momentum', type=float, default=0.95)
            parser.add_argument(
                '--epochs', type=int, default=num_epochs)  # Change to 100
            parser.add_argument(
                '--batch_size', type=int, default=batch_size)
            parser.add_argument('--runs', type=int,
                                default=num_runs)
            parser.add_argument('--SVI', type=bool, default=use_SVI)
            parser.add_argument('--FC', type=bool, default=FC_only)
            parser.add_argument(
                '--optimizer', type=str, default=optim_name)
            args = parser.parse_args(args=[])
            args.hidden_channels = hidden_channels
            print(args)
            # Get data loader from dataset
            train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                       batch_size=batch_size,
                                                       shuffle=True)
            test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False)
            # Train accuracy is measured on a fixed 10% subset because the
            # full training set takes too long to evaluate. The subset is
            # deterministic, so the loader is built once, not per log step.
            train_loader_sub = torch.utils.data.DataLoader(dataset=subset_data(train_dataset0, frac=0.1),
                                                           batch_size=batch_size,
                                                           shuffle=True)
            # Exact number of mini-batches per epoch. The former
            # int(N / batch_size) + 1 over-counts by one when N is a multiple
            # of the batch size, which could skip the warm-start switch.
            iter_per_epoch = len(train_loader)
            # With pause_SVI, SVI is used for the first `frac_stop` share of
            # all iterations and the plain optimizer afterwards.
            frac_stop = 0.1
            pause_threshold = int(frac_stop * iter_per_epoch * args.epochs)

            # Keys identifying this configuration in the results and on disk.
            if use_SVI:
                SVI_prefix = 'SVI_warmstart-' if pause_SVI else 'SVI-'
            else:
                SVI_prefix = ''
            key = f'{SVI_prefix}{optim_name}'
            key_save = f'{SVI_prefix}{optim_name}-{args.num_layers}layers-{args.hidden_channels}nodes-{args.lr}LR'
            key_save = dataname + key_save

            logger = Logger(args.runs, args)
            results_over_runs = {}
            # Register the per-run container once. It is filled in place
            # below; appending it after every run duplicated it in the JSON.
            result_dict[key].append(results_over_runs)
            for run in range(args.runs):
                accu_at_run = []
                # Initiate a new model per run
                torch.manual_seed(1103 + run)
                args.SVI = use_SVI
                model = _build_model(args, args.SVI).to(device)
                optimizer = _build_optimizer(args, model)
                for epoch in range(1, 1 + args.epochs):
                    if device.type == 'cuda':
                        # Useful to avoid GPU allocation excess
                        torch.cuda.empty_cache()
                    print(f"LR is {optimizer.param_groups[0]['lr']}")
                    for i, (images, labels) in enumerate(train_loader):
                        current_itr = ((epoch - 1) * iter_per_epoch) + i + 1
                        if pause_SVI and current_itr > pause_threshold:
                            args.SVI = False
                            if current_itr == pause_threshold + 1:
                                # First iteration after the warm start: copy the
                                # weights into a plain model and reinitialize the
                                # optimizer to avoid gradient issues in Adam.
                                sdict = model.state_dict()
                                print(
                                    '############ Pause SVI from now on ############')
                                model = _build_model(args, False)
                                model.load_state_dict(sdict)
                                model = model.to(device)
                                optimizer = _build_optimizer(args, model)
                        optimizer.zero_grad()
                        if args.FC:
                            images = images.reshape(-1, input_size).to(device)
                        images = images.to(device)
                        labels = labels.to(device)
                        loss = train_SVI(model, images, labels) if args.SVI else train(model, images, labels)
                        optimizer.step()
                        # Evaluate every `log_steps` mini-batches
                        if (i+1) % args.log_steps == 0:
                            print('Testing')
                            if args.SVI:
                                print(f'SVI-{args.optimizer} training at batch {i}, epoch {epoch}')
                            else:
                                print(f'{args.optimizer} training at batch {i}, epoch {epoch}')
                            result = test(model, train_loader_sub, test_loader)
                            logger.add_result(run, result)
                            train_acc, test_acc = result
                            accu_at_run += [[train_acc, test_acc]]
                            print(f'Run: {run + 1:02d}, '
                                  f'Epoch: {epoch:02d}, '
                                  f'Loss: {loss:.4f}, '
                                  f'Train: {100 * train_acc:.2f}%,'
                                  f'Test: {100 * test_acc:.2f}%')
                # Record this run's accuracy trace
                results_over_runs[f'lr={args.lr}@Run{run+1}'] = accu_at_run
                logger.print_statistics(run)
                # # Save it to file, but need not now because only one run.
                # logger.pickle(key_save)
                # Final train and test accuracy (skipped if nothing was logged)
                if accu_at_run:
                    Final_result[key] = accu_at_run[-1]
                # Rewrite the results file after every run so partial results
                # survive an interrupted experiment.
                with open(f"{key_save}_loss_together_SVI_only.json", "w") as outfile:
                    json.dump(result_dict, outfile)
            logger.print_statistics()

In [None]:
# Reference output recorded from an earlier run in which SVI was used for the
# whole training run (not only as a warm start):
# All runs:
# Highest Train: 97.91 ± 0.18
# Highest Test: 97.80 ± 0.13
#   Final Train: 97.91 ± 0.18
#    Final Test: 97.80 ± 0.13
# Re-print the summary of the most recent `logger` from the training cell above.
logger.print_statistics()