##  Let us begin

Descriptions can be added later for users who load this notebook interactively.


In [1]:
# !nvidia-smi -L

In [2]:
# import os
# import torch
# !pip install GPUtil

In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import json
import numpy as np
import argparse
import torch
import sys
import importlib as ipb
import pickle
import torch.nn as nn
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler
import GPUtil
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
class Logger(object):
    """Collect per-run (train, test) accuracy pairs and print summary statistics.

    ``self.results[run]`` is the list of 2-element results recorded for that
    run, in the order they were added.
    """

    def __init__(self, runs, info=None):
        """Create an empty log.

        Args:
            runs: Number of runs; one empty result list is allocated per run.
            info: Arbitrary object stored unchanged on ``self.info``.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def pickle(self, key_save):
        """Serialize this logger to the file at path ``key_save``."""
        # The context manager closes the handle even when pickle.dump raises;
        # the previous open()/close() pair leaked it on error.
        with open(key_save, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def unpickle(self, key_save):
        """Load and return the object pickled at path ``key_save``.

        The loaded object is returned; ``self`` is not modified.
        """
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
        with open(key_save, 'rb') as f:
            return pickle.load(f)

    def add_result(self, run, result):
        """Append a 2-element ``result`` (train, test) to run number ``run``.

        Raises:
            AssertionError: If ``result`` does not have two entries or ``run``
                is outside ``[0, number of runs)``.
        """
        assert len(result) == 2
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print accuracy statistics, scaled to percentages.

        Args:
            run: Index of a single run to report. When ``None``, report the
                mean and standard deviation over all runs; this requires every
                run to hold the same number of results.

        "Final" values are read at the step with the highest train accuracy.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            argmax = result[:, 0].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Test: {result[:, 1].max():.2f}')
            # Same as highest train, as we have no validation data
            print(f'  Final Train: {result[argmax, 0]:.2f}')
            print(f'   Final Test: {result[argmax, 1]:.2f}')
        else:
            result = 100 * torch.tensor(self.results)
            best_results = []
            for r in result:
                # Step at which this run reached its best train accuracy
                best_step = r[:, 0].argmax()
                best_results.append((r[:, 0].max().item(),
                                     r[:, 1].max().item(),
                                     r[best_step, 0].item(),
                                     r[best_step, 1].item()))

            best_result = torch.tensor(best_results)

            print('All runs:')
            labels = ('Highest Train', 'Highest Test',
                      '  Final Train', '   Final Test')
            for column, label in enumerate(labels):
                r = best_result[:, column]
                print(f'{label}: {r.mean():.2f} ± {r.std():.2f}')


def test(model, train_loader, test_loader):
    """Evaluate ``model`` and return ``[train_accuracy, test_accuracy]``.

    The model is switched to eval mode and both loaders are iterated once
    with gradients disabled. Accuracy is the fraction of samples whose
    highest-scoring output index equals the label.

    Reads the file-level names ``args`` (for ``args.FC``), ``input_size``
    and ``device``.
    """
    model.eval()
    accuracies = []
    with torch.no_grad():
        # Order matters: train accuracy first, test accuracy second
        for data_loader in (train_loader, test_loader):
            n_correct, n_seen = 0, 0
            for images, labels in data_loader:
                if args.FC:
                    # Flatten each image for the fully connected variant
                    images = images.reshape(-1, input_size).to(device)
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                predicted = torch.max(outputs.data, 1)[1]
                n_seen += labels.size(0)
                n_correct += (predicted == labels).sum().item()
            accuracies.append(n_correct / n_seen)
    return accuracies  # train accuracy & test accuracy

### Training functions (including SVI)

In [3]:
def train(model, images, labels):
    """Run one forward/backward pass with cross-entropy loss.

    Puts ``model`` in training mode, back-propagates the loss and returns it
    as a Python float. Gradients are neither zeroed nor applied here; the
    caller owns ``zero_grad()`` and the optimizer step.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    logits = model(images)
    batch_loss = criterion(logits, labels)
    batch_loss.backward()
    return batch_loss.item()


def train_SVI(model, images, labels):
    """Run one SVI training pass and return the cross-entropy loss as a float.

    ``model`` must provide ``layers_Xtilde``, ``layers_grad``, ``on_training``
    and ``turn_on_off_grad`` (as ``VGG16_SVI`` does). Two backward passes are
    performed:

    1. ``loss.backward()`` with ``requires_grad`` switched off on all
       parameters, which fills ``.grad`` of the activations kept in
       ``model.layers_grad``.
    2. ``loss_tilde.backward()`` with ``requires_grad`` switched back on,
       where ``loss_tilde`` sums, over all recorded layers, the elementwise
       product of each ``layers_Xtilde`` entry with the detached gradient of
       the matching activation.

    Reads the file-level name ``device``. The caller owns ``zero_grad()`` and
    the optimizer step.
    """
    # Start from empty buffers; otherwise they grow with every call.
    model.layers_Xtilde = []
    model.layers_grad = []
    model.on_training = True

    model.train()
    logits = model(images)
    criterion = nn.CrossEntropyLoss()
    loss = criterion(logits, labels)

    # First backward pass, with parameter gradients switched off
    model.turn_on_off_grad(on=False)
    loss.backward()

    loss_tilde = 0
    for x_tilde, activation in zip(model.layers_Xtilde, model.layers_grad):
        upstream = activation.grad.detach().to(device)
        loss_tilde += (x_tilde * upstream).sum()

    # Second backward pass: update direction for all layers at once
    model.turn_on_off_grad(on=True)
    loss_tilde.backward()

    model.on_training = False  # To avoid additional .retain_grad()
    return loss.item()

### VGG-16

In [4]:
class VGG16(nn.Module):
    """Plain VGG-16: 13 conv+ReLU layers with 5 max-pools, then a 3-layer classifier.

    The feature stack is generated from ``_LAYOUT`` so that module indices
    (``features.0`` ... ``features.30``) stay the same as in a hand-written
    ``nn.Sequential`` listing.
    """

    # Output channels of each 3x3 convolution; 'M' marks a 2x2 max-pool.
    _LAYOUT = (64, 64, 'M',
               128, 128, 'M',
               256, 256, 256, 'M',
               512, 512, 512, 'M',
               512, 512, 512, 'M')

    def __init__(self,num_classes = 10):
        """Build the network with ``num_classes`` output units."""
        super().__init__()
        stack = []
        in_ch = 3
        for entry in self._LAYOUT:
            if entry == 'M':
                stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            stack.append(nn.Conv2d(in_ch, entry, kernel_size=3, padding=1))
            stack.append(nn.ReLU(inplace=True))
            in_ch = entry
        self.features = nn.Sequential(*stack)
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for image batch ``x``."""
        for module in self.features:
            x = module(x)
        pooled = self.avgpool(x)
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)

class VGG16_SVI(nn.Module):
    """VGG-16 with the extra bookkeeping that ``train_SVI`` relies on.

    The layers are the same as in a plain VGG-16. While ``on_training`` is
    true, every stage of ``forward`` additionally

    * re-runs the stage's pre-activation layers on a detached copy of the
      stage input and stores the result in ``layers_Xtilde``; and
    * calls ``retain_grad()`` on the stage output and stores it in
      ``layers_grad``.

    ``append_to_layers_Xtilde`` reads the file-level name ``device``.
    """

    # Output channels of each 3x3 convolution; 'M' marks a 2x2 max-pool.
    _LAYOUT = (64, 64, 'M',
               128, 128, 'M',
               256, 256, 256, 'M',
               512, 512, 512, 'M',
               512, 512, 512, 'M')

    # Indices into ``self.features`` of the pre-activation layers of each
    # convolutional stage; the stage's ReLU sits at ``last index + 1``.
    _FEATURE_STAGES = ((0,), (2,), (4, 5), (7,), (9, 10), (12,), (14,),
                       (16, 17), (19,), (21,), (23, 24), (26,), (28,))

    def __init__(self,num_classes = 10):
        """Build the network with ``num_classes`` outputs and empty SVI buffers."""
        super().__init__()
        stack = []
        in_ch = 3
        for entry in self._LAYOUT:
            if entry == 'M':
                stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            stack.append(nn.Conv2d(in_ch, entry, kernel_size=3, padding=1))
            stack.append(nn.ReLU(inplace=True))
            in_ch = entry
        self.features = nn.Sequential(*stack)
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
        # SVI state: filled during forward while on_training is True
        self.layers_Xtilde = []
        self.layers_grad = []
        self.on_training = True

    def turn_on_off_grad(self, on = True):
        """Set ``requires_grad`` of every parameter to ``on`` (avoids gradient accumulation)."""
        for weight in self.parameters():
            weight.requires_grad = on

    def append_to_layers_Xtilde(self, conv, x):
        """Apply the layers in ``conv`` to a detached copy of ``x`` and record the result.

        With more than one layer, the tensor is flattened from dim 1 right
        after an ``nn.AdaptiveAvgPool2d``. Does nothing unless ``on_training``.
        """
        if not self.on_training:
            return
        probe = x.detach().clone().to(device)
        if len(conv) > 1:
            for layer in conv:
                probe = layer(probe)
                if isinstance(layer, nn.AdaptiveAvgPool2d):
                    probe = torch.flatten(probe, 1)
        else:
            probe = conv[0](probe)
        self.layers_Xtilde.append(probe)

    def append_to_layers_grad(self, x):
        """Keep ``x`` and make autograd retain its gradient. No-op unless ``on_training``."""
        if not self.on_training:
            return
        x.retain_grad()
        self.layers_grad.append(x)

    def forward(self, x):
        """Return class scores for image batch ``x``, recording SVI state per stage.

        Every stage first records Xtilde from the stage input, then runs the
        real layers, then records the stage output.
        """
        feats, clf = self.features, self.classifier

        # Convolutional stages: [optional MaxPool2d +] Conv2d, then ReLU
        for indices in self._FEATURE_STAGES:
            pre_activation = [feats[i] for i in indices]
            self.append_to_layers_Xtilde(pre_activation, x)
            for layer in pre_activation:
                x = layer(x)
            x = feats[indices[-1] + 1](x)
            self.append_to_layers_grad(x)

        # (30) MaxPool2d + avgpool + flatten + (0) Linear, then (1) ReLU
        self.append_to_layers_Xtilde([feats[30], self.avgpool, clf[0]], x)
        x = feats[30](x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = clf[0](x)
        x = clf[1](x)
        self.append_to_layers_grad(x)

        # (2) Dropout + (3) Linear, then (4) ReLU
        self.append_to_layers_Xtilde([clf[2], clf[3]], x)
        x = clf[2](x)
        x = clf[3](x)
        x = clf[4](x)
        self.append_to_layers_grad(x)

        # (5) Dropout + (6) Linear; the output layer has no activation
        self.append_to_layers_Xtilde([clf[5], clf[6]], x)
        x = clf[5](x)
        x = clf[6](x)
        self.append_to_layers_grad(x)
        return x

In [5]:
# Instantiate torchvision's VGG-16 once at module level; vgg_16_load() copies
# its state_dict into the local VGG16 / VGG16_SVI classes. The recorded cell
# output shows this triggers a checkpoint download on first use.
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in
# favour of the `weights=` argument — confirm against the installed version.
from torchvision import models
model_pretrained = models.vgg16(pretrained = True)

def vgg_16_load(SVI = False, load_pretrain = False):
    """Build a 10-class VGG-16 on ``device``.

    Args:
        SVI: Build ``VGG16_SVI`` when true, otherwise ``VGG16``.
        load_pretrain: When true, build a 1000-class network, copy the weights
            of the module-level ``model_pretrained`` into it, and then replace
            the last classifier layer with a fresh ``nn.Linear(4096, 10)``.
            When false, build a randomly initialised 10-class network.
    """
    net_cls = VGG16_SVI if SVI else VGG16
    if not load_pretrain:
        return net_cls(10).to(device)
    net = net_cls(1000)
    net.load_state_dict(model_pretrained.state_dict())
    # Swap the 1000-way head for a 10-way one
    net.classifier[6] = nn.Linear(4096, 10)
    return net.to(device)
    

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


  0%|          | 0.00/528M [00:00<?, ?B/s]

In [9]:
# Smoke test: build the pretrained, non-SVI model and push one random batch through it.
mod1 = vgg_16_load(SVI = False, load_pretrain = True)
# Batch of 10 random inputs with 3 channels and 32x32 pixels
x = torch.randn(10, 3, 32, 32).to(device)
out = mod1(x)
# Displayed by the notebook; the recorded output is torch.Size([10, 10])
out.shape

torch.Size([10, 10])

In [11]:
mod1

VGG16(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation

### Utility function, including data functions

In [10]:
def mem_report():
    """Print a short memory report for the active ``device``.

    On CUDA, prints one line per GPU reported by ``GPUtil`` with free memory,
    total memory and memory utilisation. Otherwise prints the available CPU
    RAM, which needs the optional ``psutil`` and ``humanize`` packages; when
    either is missing a notice is printed instead of raising.
    """
    if device.type == 'cuda':
        for i, gpu in enumerate(GPUtil.getGPUs()):
            print('GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'.format(
                i, gpu.memoryFree, gpu.memoryTotal, gpu.memoryUtil*100))
        return
    # Neither package is imported at file level, so the CPU branch used to
    # fail with NameError. Import lazily and degrade gracefully.
    try:
        import humanize
        import psutil
    except ImportError:
        print("CPU RAM Free: unavailable (requires the 'psutil' and 'humanize' packages)")
        return
    print("CPU RAM Free: "
          + humanize.naturalsize(psutil.virtual_memory().available))

In [11]:
import torchvision
import torchvision.transforms as transforms
def subset_data(data,frac=1):
    """Return a ``torch.utils.data.Subset`` holding a random fraction of ``data``.

    ``int(frac * len(data))`` distinct indices are drawn without replacement.
    NumPy's global seed is reset to 1103 on every call, so the same indices
    are chosen each time for a given length and fraction.
    """
    np.random.seed(1103)
    total = len(data)
    keep = int(frac * total)
    chosen = np.random.choice(total, keep, replace=False)
    return torch.utils.data.Subset(data, chosen)
# Dataset selection: a name containing 'MNIST' loads MNIST, anything else
# falls through to CIFAR10.
data_fixed = 'CIFAR10_batched' # 'MNIST_batched' or 'CIFAR10_batched'
if 'MNIST' in data_fixed:
    # MNIST images are resized to 32x32 before conversion to tensors.
    # NOTE(review): the train and test splits are normalized with different
    # mean/std constants — confirm this is intended.
    train_dataset0 = torchvision.datasets.MNIST(root = './data',
                                           train = True,
                                           transform = transforms.Compose([
                                                  transforms.Resize((32,32)),
                                                  transforms.ToTensor(),
                                                  transforms.Normalize(mean = (0.1307,), std = (0.3081,))]),
                                           download = True)
    test_dataset0 = torchvision.datasets.MNIST(root = './data',
                                              train = False,
                                              transform = transforms.Compose([
                                                      transforms.Resize((32,32)),
                                                      transforms.ToTensor(),
                                                      transforms.Normalize(mean = (0.1325,), std = (0.3105,))]),
                                              download=True)
    num_classes = 10
    in_channels = 1
else:
    # NOTE, CIFAR10 has color channels, so input size = 3*32*32 with FC net
    # NOTE(review): CIFAR10 is stored under the absolute path '/data' while
    # MNIST uses the relative './data' — confirm. The test split is created
    # without download=True; presumably it relies on the train call above
    # having fetched the archive.
    train_dataset0 = torchvision.datasets.CIFAR10(root='/data', 
                                              train=True, 
                                              transform=transforms.ToTensor(),  
                                              download=True)
    test_dataset0 = torchvision.datasets.CIFAR10(root='/data', 
                                              train=False, 
                                              transform=transforms.ToTensor())
    num_classes = 10 
    in_channels = 3
# Number of elements in one sample tensor (product of its shape)
input_size = torch.prod(torch.tensor(train_dataset0[0][0].shape)).item()
# The dataset-dependent fraction is overridden right away by `frac = 1`,
# so the full train and test sets are used.
frac=0.1 if 'MNIST' in data_fixed else 0.2
frac = 1
train_dataset = subset_data(train_dataset0,frac=frac)
test_dataset = subset_data(test_dataset0,frac=frac)
print(len(train_dataset))
print(len(test_dataset))
# Data loader (i.e., split to batches) see below
# Data loader (i.e., split to batches) see below

Files already downloaded and verified
50000
10000


Start testing

In [12]:
# Experiment configuration. SVI_ls / SVI_pause are assigned three times in a
# row; only the last pair ([False] / [False]) takes effect. The earlier lines
# are kept as alternative settings.
SVI_ls = [True, True, False]  # If False, use ordinary SGD or Adam
SVI_pause = [False, True, False]  # If True, only SVI to warm start
SVI_ls = [True, False]
SVI_pause = [True, False]
SVI_ls = [False]
SVI_pause = [False]
optim_ls = ['SGD']
load_pretrain = True # Load pretrained VGG16 to transfer learning
lr, hidden_channels = 0.001, 512
# If True, inputs are flattened to (-1, input_size) before being fed to the
# model. The model itself is always built by vgg_16_load (VGG16), whatever
# this flag says.
FC_only = False
# dataname prefixes the name of the JSON result file
dataname = data_fixed
if FC_only == False:
    dataname = data_fixed + 'VGG16_'
num_runs = 3
batch_size = 128
num_epochs = 20
# With this default, accuracy is logged once per epoch.
# NOTE(review): this is 0 when the dataset is smaller than batch_size, which
# would make the `% args.log_steps` check in the training loop divide by zero.
num_log_steps = len(train_dataset) // batch_size # Evaluate loss every X iteration/mini-batches
# Experiment driver: for every (use_SVI, pause_SVI) pair and optimizer name,
# train `args.runs` freshly loaded models, log train/test accuracy and dump
# the results to a JSON file.
if __name__ == "__main__":
    Final_result = {}  # For quick check at the end of training
    for use_SVI, pause_SVI in zip(SVI_ls, SVI_pause):
        for optim_name in optim_ls:
            # NOTE(review): result_dict is rebuilt for every combination, so
            # each JSON dump only holds entries under the current key.
            result_dict = {'SVI-SGD': [],
                           'SVI_warmstart-SGD': [],
                           'SGD': [],
                           'SVI-Adam': [],
                           'SVI_warmstart-Adam': [],
                           'Adam': []}
            # argparse is used purely as a container of defaults:
            # parse_args(args=[]) below ignores the real command line.
            parser = argparse.ArgumentParser(
                description=dataname)
            parser.add_argument('--log_steps', type=int, default=num_log_steps)
            parser.add_argument('--num_layers', type=int, default=4)
            parser.add_argument('--dropout', type=float, default=0.25)
            parser.add_argument('--lr', type=float, default=lr)
            parser.add_argument('--momentum', type=float, default=0.95)
            parser.add_argument(
                '--epochs', type=int, default=num_epochs)  # Change to 100
            parser.add_argument(
                '--batch_size', type=int, default=batch_size)
            parser.add_argument('--runs', type=int,
                                default=num_runs)
            # NOTE(review): type=bool turns any non-empty string into True;
            # harmless here because only the defaults are ever used.
            parser.add_argument('--SVI', type=bool, default=use_SVI)
            parser.add_argument('--FC', type=bool, default=FC_only)
            parser.add_argument(
                '--optimizer', type=str, default=optim_name)
            args = parser.parse_args(args=[])
            args.hidden_channels = hidden_channels
            print(args)
            # Get data loader from dataset
            train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                       batch_size=batch_size,
                                                       shuffle=True)
            test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False)
            logger = Logger(args.runs, args)
            # Maps 'lr=<lr>@Run<k>' to the list of [train_acc, test_acc] logged in that run
            results_over_runs = {}
            for run in range(args.runs):
                accu_at_run = []
                # Initiatiate new model per run
                torch.manual_seed(1103 + run)
                # Reset per run, since a warm-start pause below flips args.SVI to False
                args.SVI = use_SVI
                model = vgg_16_load(args.SVI, load_pretrain = load_pretrain)
                if args.optimizer == 'SGD':
                    optimizer = torch.optim.SGD(
                        model.parameters(), lr=args.lr, momentum=args.momentum, nesterov=True)
                else:
                    optimizer = torch.optim.Adam(
                        model.parameters(), lr=args.lr)
                for epoch in range(1, 1 + args.epochs):
                    # NOTE(review): this is one more than the real batch count when
                    # len(train_dataset) is an exact multiple of batch_size.
                    iter_per_epoch = int(len(train_dataset) / batch_size) + 1
                    # Fraction of all iterations during which SVI stays on when pause_SVI is set
                    frac_stop = 0.1
                    if num_epochs > 20:
                        frac_stop = 0.05
                    pause_threshold = int(
                        frac_stop * iter_per_epoch * args.epochs)
                    if device.type == 'cuda':
                        # Useful to avoid GPU allocation excess
                        torch.cuda.empty_cache()
                    print(f"LR is {optimizer.param_groups[0]['lr']}")
                    for i, (images, labels) in enumerate(train_loader):
                        # Global 1-based iteration counter across epochs
                        current_itr = ((epoch - 1) * iter_per_epoch) + i + 1
                        if pause_SVI and current_itr > pause_threshold:
                            args.SVI = False
                            # True only for the first iteration past the threshold
                            if current_itr <= pause_threshold + 1:
                                # Reinitialize optimizer to avoid gradient issue in Adam
                                # The weights are carried over into a non-SVI model
                                sdict = model.state_dict()
                                print(
                                    '############ Pause SVI from now on ############')
                                model = vgg_16_load(args.SVI, load_pretrain = load_pretrain)
                                model.load_state_dict(sdict)
                                model = model.to(device)
                                if args.optimizer == 'SGD':
                                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, 
                                                                momentum=args.momentum, 
                                                                nesterov=True)
                                else:
                                    optimizer = torch.optim.Adam(
                                        model.parameters(), lr=args.lr)
                        optimizer.zero_grad()
                        if args.FC:
                            images = images.reshape(-1, input_size).to(device)
                        images = images.to(device)
                        labels = labels.to(device)
                        # Both helpers run backward(); the optimizer step is applied here
                        loss = train_SVI(model, images, labels) if args.SVI else train(model, images, labels)
                        optimizer.step()
                        # Test at each iteration
                        # (only when i+1 is a multiple of args.log_steps)
                        if (i+1) % args.log_steps == 0:
                            print('Testing')
                            if args.SVI:
                                print(f'SVI-{args.optimizer} training at batch {i}, epoch {epoch}')
                            else:
                                print(f'{args.optimizer} training at batch {i}, epoch {epoch}')
                            # Do so because training data has too many images, and evaluation thus takes too long.
                            # subset_data reseeds NumPy, so the same 10% subset is drawn every time.
                            train_loader_sub = torch.utils.data.DataLoader(dataset=subset_data(train_dataset0,frac=0.1),
                                                       batch_size=batch_size,
                                                       shuffle=True)
                            result = test(model, train_loader_sub, test_loader)
                            logger.add_result(run, result)
                            train_acc, test_acc = result
                            accu_at_run += [[train_acc, test_acc]]
                            print(f'Run: {run + 1:02d}, '
                                  f'Epoch: {epoch:02d}, '
                                  f'Loss: {loss:.4f}, '
                                  f'Train: {100 * train_acc:.2f}%,'
                                  f'Test: {100 * test_acc:.2f}%')
                # Save model after training over all epochs
                results_over_runs[f'lr={args.lr}@Run{run+1}'] = accu_at_run
                logger.print_statistics(run)
                # Save results
                if use_SVI:
                    SVI_prefix = 'SVI_warmstart-' if pause_SVI else 'SVI-'
                else:
                    SVI_prefix = ''
                key = f'{SVI_prefix}{optim_name}'
                key_save = f'{SVI_prefix}{optim_name}-{args.num_layers}layers-{args.hidden_channels}nodes-{args.lr}LR'
                key_save = dataname + key_save
                # # Save it to file, but need not now because only one run.
                # logger.pickle(key_save)
                # NOTE(review): the same results_over_runs dict object is appended
                # once per run, so the list ends up with repeated references to it.
                result_dict[key].append(results_over_runs)
                # Final train and test accuracy
                # (last logged pair of this run; overwritten by later runs with the same key)
                Final_result[key] = accu_at_run[-1]
                # NOTE(review): epoch_suff is computed but never used below.
                epoch_suff = '_more_epoch' if num_epochs > 20 else ''
                # The JSON file is rewritten after every run
                with open(f"{key_save}_loss_together_SVI_only.json", "w") as outfile:
                    json.dump(result_dict, outfile)
            # Summary (mean ± std) over all runs of this combination
            logger.print_statistics()

Namespace(log_steps=390, num_layers=4, dropout=0.25, lr=0.001, momentum=0.95, epochs=20, batch_size=128, runs=3, SVI=False, FC=False, optimizer='SGD', hidden_channels=512)
LR is 0.001
Testing
SGD training at batch 389, epoch 1
Run: 01, Epoch: 01, Loss: 0.5349, Train: 84.18%,Test: 80.48%
LR is 0.001
Testing
SGD training at batch 389, epoch 2
Run: 01, Epoch: 02, Loss: 0.3654, Train: 90.20%,Test: 84.92%
LR is 0.001
Testing
SGD training at batch 389, epoch 3
Run: 01, Epoch: 03, Loss: 0.2322, Train: 92.92%,Test: 85.57%
LR is 0.001
Testing
SGD training at batch 389, epoch 4
Run: 01, Epoch: 04, Loss: 0.2003, Train: 95.02%,Test: 86.76%
LR is 0.001
Testing
SGD training at batch 389, epoch 5
Run: 01, Epoch: 05, Loss: 0.1650, Train: 96.86%,Test: 86.76%
LR is 0.001
Testing
SGD training at batch 389, epoch 6
Run: 01, Epoch: 06, Loss: 0.1612, Train: 97.90%,Test: 87.29%
LR is 0.001
Testing
SGD training at batch 389, epoch 7
Run: 01, Epoch: 07, Loss: 0.0177, Train: 98.64%,Test: 87.43%
LR is 0.001
Test