In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from adamw import AdamW
from nadam import Nadam
from uoptim import UOptimizer


from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from utils_exp import plot_graphs

import numpy as np
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Root directory handed to the torchvision FashionMNIST dataset constructors.
PATH = './fMNIST_data'

In [2]:
# Training split loaded once, only to derive the dataset-wide normalisation statistics.
train_data = datasets.FashionMNIST(PATH, train=True, download=True, transform=transforms.ToTensor())

# calculate statistics for fashion MNIST
# Newer torchvision releases expose the raw images as `.data`; `.train_data` only survives
# there as a deprecated alias. Prefer `.data` and fall back for old installations.
_raw_pixels = train_data.data if hasattr(train_data, 'data') else train_data.train_data
# Convert to float and divide by 255 once, then reuse the result for both statistics.
_scaled_pixels = _raw_pixels.float().div(255)
global_norm_mean = _scaled_pixels.mean().item()
global_norm_std = _scaled_pixels.std().item()

In [3]:
# util to create loaders
# Preprocessing shared by the loaders: convert the image to a tensor, then standardise it
# with the dataset-wide mean / std held in global_norm_mean and global_norm_std.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(global_norm_mean,), std=(global_norm_std,)),
])

def mnist(batch_size=50, valid=0, shuffle=True, transform=mnist_transform, path=PATH, seed=None):
    """Build Fashion-MNIST data loaders.

    Args:
        batch_size: number of samples per batch for every returned loader.
        valid: number of training samples held out for validation; a value <= 0
            disables the hold-out.
        shuffle: passed to the training DataLoader. Only used when there is no hold-out;
            with a hold-out both subsets are drawn through ``SubsetRandomSampler``.
        transform: transform applied to both the train and the test split.
        path: dataset root directory (the constructors are called with ``download=True``).
        seed: optional seed for the train/validation split. ``None`` (default) shuffles
            with NumPy's global random state, exactly as before.

    Returns:
        ``(train_loader, valid_loader, test_loader)`` when ``valid > 0``,
        otherwise ``(train_loader, test_loader)``.

    Raises:
        ValueError: if ``valid`` would leave no samples for training.
    """
    test_data = datasets.FashionMNIST(path, train=False, download=True, transform=transform)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    train_data = datasets.FashionMNIST(path, train=True, download=True, transform=transform)
    if valid <= 0:
        train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)
        return train_loader, test_loader

    num_train = len(train_data)
    if valid >= num_train:
        # A non-positive split point would silently yield an empty (or mis-sliced) train set.
        raise ValueError(
            'valid={} must be smaller than the training set size {}'.format(valid, num_train))

    # A dedicated RandomState makes the split reproducible without touching global state;
    # without a seed we keep using the global NumPy generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    indices = list(range(num_train))
    rng.shuffle(indices)

    split = num_train - valid
    train_idx, valid_idx = indices[:split], indices[split:]

    # Both loaders read the same underlying dataset; the samplers keep the subsets disjoint.
    train_loader = DataLoader(train_data, batch_size=batch_size,
                              sampler=SubsetRandomSampler(train_idx))
    valid_loader = DataLoader(train_data, batch_size=batch_size,
                              sampler=SubsetRandomSampler(valid_idx))
    return train_loader, valid_loader, test_loader


def plot_mnist(images, shape):
    """Draw a batch of images on a grid of subplots.

    `images` is indexed as ``images[k, 0, :, :]`` (only channel 0 is drawn) and
    `shape` is ``(rows, cols)`` of the grid.
    """
    n_rows, n_cols = shape[0], shape[1]
    fig = plt.figure(figsize=shape[::-1], dpi=80)
    for idx in range(len(images)):
        # subplot positions are 1-based
        ax = fig.add_subplot(n_rows, n_cols, idx + 1)
        ax.matshow(images[idx, 0, :, :], cmap=matplotlib.cm.binary)
        # hide the tick marks on both axes
        ax.set_xticks([])
        ax.set_yticks([])
    plt.show()

In [4]:
# Hold out 10000 training samples for validation; with valid > 0 mnist() returns three loaders.
train_loader, valid_loader, test_loader = mnist(valid=10000)

In [5]:
class ConvLayer(nn.Module):
    """Convolution block: Conv2d, then optional pooling, BatchNorm2d and Dropout2d, then an activation.

    Args:
        size: indexable ``(in_channels, out_channels, kernel_size)``.
        padding: padding passed to the convolution.
        pool_layer: module placed right after the convolution, or ``None`` to skip it.
        bn: when true, add ``nn.BatchNorm2d(out_channels)``.
        dropout: when true, add ``nn.Dropout2d()``.
        activation_fn: module instance placed last.

    Note: the default ``pool_layer`` and ``activation_fn`` objects are created once, when the
    signature is evaluated, so every ConvLayer relying on the defaults shares those instances.
    """

    def __init__(self, size, padding=1, pool_layer=nn.MaxPool2d(2, stride=2),
                 bn=False, dropout=False, activation_fn=nn.ReLU()):
        super(ConvLayer, self).__init__()
        in_channels, out_channels, kernel_size = size[0], size[1], size[2]

        stages = [nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding)]
        if pool_layer is not None:
            stages.append(pool_layer)
        if bn:
            stages.append(nn.BatchNorm2d(out_channels))
        if dropout:
            stages.append(nn.Dropout2d())
        stages.append(activation_fn)

        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the stacked layers to `x`."""
        return self.model(x)

In [6]:
class FullyConnected(nn.Module):
    """Stack of Linear layers: hidden layers with optional Dropout and an activation, then a bare output Linear.

    Args:
        sizes: layer widths, input first and output last (at least two entries are indexed).
        dropout: when true, insert ``nn.Dropout()`` after each hidden Linear.
        activation_fn: module class (not an instance); instantiated once per hidden layer.
    """

    def __init__(self, sizes, dropout=False, activation_fn=nn.Tanh):
        super(FullyConnected, self).__init__()
        stack = []

        # every consecutive pair of widths except the last one forms a hidden layer
        for fan_in, fan_out in zip(sizes[:-2], sizes[1:-1]):
            stack.append(nn.Linear(fan_in, fan_out))
            if dropout:
                stack.append(nn.Dropout())
            stack.append(activation_fn())

        # the last layer gets neither dropout nor an activation function
        stack.append(nn.Linear(sizes[-2], sizes[-1]))

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the stacked layers to `x`."""
        return self.model(x)

In [7]:
class Net(nn.Module):
    """Two ConvLayer blocks followed by a linear classifier, bundled with its own optimizer.

    Args:
        batchnorm: forwarded as ``bn`` to both convolution blocks.
        dropout: forwarded to the FullyConnected head. The head is built from only two
            sizes, so it has no hidden layer for dropout to be attached to.
        optim_type: one of 'SGD', 'Adadelta', 'RMSProp', 'Adam', 'AdamW', 'Nadam',
            'Adamax' or 'UOptimizer'. Any other value (including the historical default
            'UAdam') creates no optimizer: a warning is emitted and ``optim`` is ``None``.
        **optim_params: keyword arguments forwarded to the optimizer constructor.

    After ``loss()`` has been called, ``_loss`` holds the returned loss and ``_correct``
    the fraction of correctly classified samples of that batch (a CPU tensor).
    """

    # Flattened size fed to the classifier: 32 channels times a 7x7 map
    # (assumes 28x28 inputs halved by each of the two default poolings — TODO confirm).
    _FLAT_FEATURES = 32 * 7 * 7

    def __init__(self, batchnorm=False, dropout=False, optim_type='UAdam', **optim_params):
        super(Net, self).__init__()

        self._conv1 = ConvLayer([1, 16, 3], bn=batchnorm)
        self._conv2 = ConvLayer([16, 32, 3], bn=batchnorm, activation_fn=nn.Sigmoid())

        self.fc = FullyConnected([self._FLAT_FEATURES, 10], dropout=dropout)

        # Populated by loss(); initialised so the attributes always exist.
        self._loss = None
        self._correct = None

        # Must come last: the optimizer is built from the parameters registered above.
        self.optim = self._make_optimizer(optim_type, optim_params)

    def _make_optimizer(self, optim_type, optim_params):
        """Instantiate the optimizer named `optim_type`; warn and return None if unknown."""
        torch_optimizers = {
            'SGD': optim.SGD,
            'Adadelta': optim.Adadelta,
            'RMSProp': optim.RMSprop,
            'Adam': optim.Adam,
            'Adamax': optim.Adamax,
        }
        if optim_type in torch_optimizers:
            return torch_optimizers[optim_type](self.parameters(), **optim_params)
        # Project-local optimizers are referenced only inside their own branch so that
        # the class stays usable when one of those modules is not importable.
        if optim_type == 'AdamW':
            return AdamW(self.parameters(), **optim_params)
        if optim_type == 'Nadam':
            return Nadam(self.parameters(), **optim_params)
        if optim_type == 'UOptimizer':
            return UOptimizer(params=self.parameters(), **optim_params)

        # Previously an unknown name left `self.optim` undefined and the failure only
        # surfaced as an AttributeError inside the training loop.
        import warnings
        warnings.warn(
            'Net: unknown optim_type {!r}; no optimizer was created (self.optim is None). '
            'Supported values: SGD, Adadelta, RMSProp, Adam, AdamW, Nadam, Adamax, '
            'UOptimizer.'.format(optim_type))
        return None

    def conv(self, x):
        """Run the two convolution blocks."""
        x = self._conv1(x)
        x = self._conv2(x)
        return x

    def forward(self, x):
        """Return the 10 class scores for a batch of images."""
        x = self.conv(x)
        # Keep the batch dimension explicit: an input of unexpected size now raises
        # instead of being silently folded into a different batch size.
        x = x.view(x.size(0), self._FLAT_FEATURES)
        x = self.fc(x)
        return x

    def loss(self, output, target, **kwargs):
        """Compute the cross-entropy loss and cache it together with the batch accuracy.

        Args:
            output: class scores, one row per sample.
            target: class indices.
            **kwargs: forwarded to ``F.cross_entropy``.

        Returns:
            The loss tensor (also stored in ``self._loss``).
        """
        self._loss = F.cross_entropy(output, target, **kwargs)
        # Index of the highest score per row, kept as a column to compare with target.
        predicted = output.detach().max(1, keepdim=True)[1]
        hits = predicted.eq(target.view_as(predicted))
        self._correct = hits.to(torch.float).cpu().mean()
        return self._loss

In [8]:
# All models are created from the same seed so that every reference optimizer and its
# UOptimizer counterpart start from identical weights; otherwise differences in the
# curves mix optimizer behaviour with initialisation noise.
INIT_SEED = 0


def make_net(optim_type, **optim_params):
    """Build a Net with batch norm on and dropout off, move it to `device`.

    The global torch RNG is re-seeded with INIT_SEED before each construction
    (side effect: the global generator state is reset on every call).
    """
    torch.manual_seed(INIT_SEED)
    return Net(True, False, optim_type, **optim_params).to(device)


models = {
    # testing correctness of SGD
    'SGD': make_net('SGD', lr=1e-3),
    'UOSGD': make_net('UOptimizer', lr=1e-3),

    'SGD_momentum': make_net('SGD', momentum=0.9, lr=1e-3),
    'UOSGD_momentum': make_net('UOptimizer', use_exp_avg_norm=True, beta1_dump=0, lr=1e-3),

    'SGD_momentum_n': make_net('SGD', momentum=0.9, nesterov=True, lr=1e-3),
    'UOSGD_momentum_n': make_net('UOptimizer', use_exp_avg_norm=True, beta1_dump=0,
                                 exp_avg_norm_type='nesterov', lr=1e-3),

    # testing RMSProp
    'RMSProp': make_net('RMSProp', lr=1e-4),
    'UORMSProp': make_net('UOptimizer', use_exp_avg_sq_norm=True, lr=1e-4),

    # testing Adadelta
    'Adadelta': make_net('Adadelta', lr=1),
    'UOAdadelta': make_net('UOptimizer', use_exp_avg_sq_norm=True, use_adadelta_lr=True, lr=1),

    # testing Adam-like algorithms
    'Adam': make_net('Adam', lr=1e-4),
    'UOAdam': make_net('UOptimizer', use_exp_avg_norm=True,
                       use_exp_avg_sq_norm=True, use_bias_correction=True, lr=1e-4),

    'Amsgrad': make_net('Adam', lr=1e-4, amsgrad=True),
    'UOAmsgrad': make_net('UOptimizer', use_exp_avg_norm=True, use_exp_avg_sq_norm=True,
                          use_bias_correction=True, exp_avg_sq_norm_type='max_past_sq', lr=1e-4),

    # AdamW is not included in PyTorch, so the fastai implementation from here is used:
    # https://github.com/anandsaha/fastai.part1.v2/commit/159e1712e60f299e11c42caab35c726f367bcd61
    'AdamW': make_net('AdamW', lr=1e-4, weight_decay=0.00025),
    'UOAdamW': make_net('UOptimizer', use_exp_avg_norm=True, use_exp_avg_sq_norm=True,
                        use_bias_correction=True, decouple_wd=True, lr=1e-4, weight_decay=0.00025),

    # Please note that the PyTorch Nadam used here is the official Keras Nadam implementation
    # translated to PyTorch. UOptimizer uses the classical Nadam formulas, so the results may differ.
    'Nadam': make_net('Nadam', lr=1e-4),
    'UONadam': make_net('UOptimizer', use_exp_avg_norm=True, use_exp_avg_sq_norm=True,
                        use_bias_correction=True, exp_avg_norm_type='nesterov', lr=1e-4),

    'Adamax': make_net('Adamax', lr=1e-4),
    'UOAdamax': make_net('UOptimizer', use_exp_avg_norm=True, use_exp_avg_sq_norm=True,
                         use_bias_correction=True, exp_avg_sq_norm_type='infinite_l', lr=1e-4),

    # create exotic combinations for fun
    'Adam_with_adadelta_coeff': make_net('UOptimizer',
                                         use_exp_avg_norm=True,
                                         use_exp_avg_sq_norm=True,
                                         use_adadelta_lr=True,
                                         use_bias_correction=True,
                                         lr=1),
    'Adam_with_exp_avg_norm_like_sgd': make_net('UOptimizer',
                                                use_exp_avg_norm=True,
                                                use_exp_avg_sq_norm=True,
                                                use_bias_correction=True,
                                                beta1_dump=0,
                                                lr=1e-4),
}
# One history list per model; train() and test() append a (loss, accuracy) pair per epoch.
train_log = {k: [] for k in models}
test_log = {k: [] for k in models}

You are using the AdamW optimizer


In [9]:
def train(epoch, models, log=None, loader=None):
    """Train every model for one epoch on the same stream of batches.

    Args:
        epoch: epoch number, used only in the progress messages.
        models: mapping name -> model exposing ``optim``, ``loss()``, ``_loss`` and ``_correct``.
        log: optional mapping name -> list. One ``(loss, accuracy)`` pair of plain Python
            floats, taken from the last batch of the epoch, is appended per model.
        loader: batches to train on; defaults to the notebook-level ``train_loader``.

    Progress is printed every 200 batches and once more at the end of the epoch.
    An empty loader is a no-op.
    """
    if loader is None:
        loader = train_loader
    train_size = len(loader.sampler)
    n_batches = len(loader)

    def report(samples_seen, batches_done):
        # Uses each model's cached loss of the most recent batch.
        line = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLosses '.format(
            epoch, samples_seen, train_size, 100. * batches_done / n_batches)
        losses = ' '.join(['{}: {:.4f}'.format(k, m._loss.item()) for k, m in models.items()])
        print(line + losses)

    # Counted explicitly so the totals stay right when the last batch is smaller.
    samples_seen = 0
    for batch_idx, (data, target) in enumerate(loader):
        data, target = data.to(device), target.to(device)
        for model in models.values():
            model.optim.zero_grad()
            output = model(data)
            loss = model.loss(output, target)
            loss.backward()
            model.optim.step()

        if batch_idx % 200 == 0:
            report(samples_seen, batch_idx)
        samples_seen += len(data)

    if n_batches == 0:
        return

    if log is not None:
        for k in models:
            # Store floats, not tensors: keeping the loss tensor would retain its autograd
            # graph (and device memory) for every logged epoch, and test() logs floats too.
            log[k].append((models[k]._loss.item(), models[k]._correct.item()))
    report(samples_seen, n_batches)

In [10]:
def test(models, loader, log=None):
    """Evaluate every model on `loader` and print per-model average loss and accuracy.

    Args:
        models: mapping name -> model exposing ``loss(output, target, **kwargs)``.
        loader: DataLoader to evaluate on; the sample count is taken from ``loader.sampler``.
        log: optional mapping name -> list; one ``(average_loss, accuracy_fraction)`` pair
            is appended per model.

    Note: the loss is obtained through ``m.loss()``, so anything the model caches there
    (Net stores ``_loss`` and ``_correct``) is overwritten by the evaluation batches.
    """
    test_size = len(loader.sampler)

    def format_line(name, avg_loss, n_correct, pct):
        return '{}: '.format(name) + 'Loss: {:.4f}'.format(avg_loss) + '\t' + \
            'Accuracy: {}/{} ({:.0f}%)'.format(n_correct, test_size, pct)

    test_loss = {k: 0. for k in models}
    # Integer counters so the report reads "6383/10000" rather than "6383.0/10000".
    correct = {k: 0 for k in models}
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            for k, m in models.items():
                output = m(data)
                # reduction='sum' replaces the deprecated size_average=False: sum up batch loss
                test_loss[k] += m.loss(output, target, reduction='sum').item()
                pred = output.max(1, keepdim=True)[1]  # index of the highest class score
                correct[k] += int(pred.eq(target.view_as(pred)).sum().item())

    for k in models:
        test_loss[k] /= test_size
    correct_pct = {k: c / test_size for k, c in correct.items()}
    lines = '\n'.join([format_line(k, test_loss[k], correct[k], 100*correct_pct[k])
                       for k in models]) + '\n'
    report = 'Test set:\n' + lines
    if log is not None:
        for k in models:
            log[k].append((test_loss[k], correct_pct[k]))
    print(report)

In [None]:
# Number of passes over the training subset.
n_epoch = 100
for epoch in range(1, n_epoch+1):
    # Switch every model to training mode before the optimisation pass.
    for model in models.values():
        model.train()
    train(epoch, models, train_log)
    # Switch to evaluation mode before scoring.
    for model in models.values():
        model.eval()
    # Scores are computed on the validation loader but recorded in `test_log`.
    test(models, valid_loader, test_log)

Test set:
SGD: Loss: 1.9056	Accuracy: 6383.0/10000 (64%)
UOSGD: Loss: 1.7023	Accuracy: 6467.0/10000 (65%)
SGD_momentum: Loss: 0.6567	Accuracy: 7801.0/10000 (78%)
UOSGD_momentum: Loss: 0.6696	Accuracy: 7854.0/10000 (79%)
SGD_momentum_n: Loss: 0.6428	Accuracy: 7904.0/10000 (79%)
UOSGD_momentum_n: Loss: 0.6902	Accuracy: 7780.0/10000 (78%)
RMSProp: Loss: 0.8454	Accuracy: 7527.0/10000 (75%)
UORMSProp: Loss: 0.6428	Accuracy: 7857.0/10000 (79%)
Adadelta: Loss: 0.3909	Accuracy: 8621.0/10000 (86%)
UOAdadelta: Loss: 0.3663	Accuracy: 8686.0/10000 (87%)
Adam: Loss: 0.8468	Accuracy: 7626.0/10000 (76%)
UOAdam: Loss: 0.8849	Accuracy: 7558.0/10000 (76%)
Amsgrad: Loss: 0.8925	Accuracy: 7575.0/10000 (76%)
UOAmsgrad: Loss: 0.8451	Accuracy: 7573.0/10000 (76%)
AdamW: Loss: 1.0145	Accuracy: 7495.0/10000 (75%)
UOAdamW: Loss: 1.0285	Accuracy: 7419.0/10000 (74%)
Nadam: Loss: 0.7112	Accuracy: 7811.0/10000 (78%)
UONadam: Loss: 0.8552	Accuracy: 7620.0/10000 (76%)
Adamax: Loss: 1.3232	Accuracy: 7296.0/10000 (73%)




## Visual results

#### Loss

In [None]:
# test_log holds the per-epoch (loss, accuracy) pairs that test() recorded on the validation loader.
plot_graphs(test_log, 'loss', fs=(20, 10))

- Below we make a pairwise comparison of the standard algorithms and my implementation. Please note that, due to the stochastic nature of the algorithms, the descent paths may differ, but not by much. The exception is Nadam: for comparison I use the official Keras implementation, which contains some hardcoded parameters that may not suit every dataset. My implementation is more classical and, on this dataset, looks better than the Keras variant.
- For better visualisation I excluded the first 5 epochs, so below are the results from the 6th epoch onwards.

In [None]:
# One figure per reference optimizer, showing it next to its 'UO'-prefixed counterpart.
# NOTE(review): the accompanying text says the first 5 epochs are excluded, but the [1:]
# slice below drops only the first logged epoch — confirm which one is intended.
for base_name in ['SGD', 'SGD_momentum', 'SGD_momentum_n', 'RMSProp', 'Adadelta', 'Adam', 'Amsgrad', 'AdamW', 'Nadam', 'Adamax']:
    wanted = (base_name, 'UO' + base_name)
    new_test_log = {name: history[1:] for name, history in test_log.items() if name in wanted}
    plot_graphs(new_test_log, 'loss')

In [None]:
# Same per-epoch validation history as above, this time selecting the 'accuracy' series.
plot_graphs(test_log, 'accuracy', fs = (20, 10))

Now let's analyze the dynamics of the standard algorithms (the worst performers here, such as SGD, AdamW and Nadam, are excluded). RMSProp looks like the best one here.

In [None]:
# Keep only the selected standard optimizers, in the order listed, dropping the first
# logged epoch of each history.
standard_names = ['SGD_momentum', 'SGD_momentum_n', 'RMSProp', 'Adadelta', 'Adam', 'Amsgrad']
new_test_log = {name: test_log[name][1:] for name in standard_names if name in test_log}
plot_graphs(new_test_log, 'loss', fs = (20, 10))

Let's compare RMSProp with the exotic combinations of our algorithm. **It looks like our exotic combinations provide better results than the best standard algorithm.**

In [None]:
# RMSProp against the two UOptimizer-only combinations, dropping the first logged epoch
# of each history.
exotic_names = ['RMSProp','Adam_with_adadelta_coeff', 'Adam_with_exp_avg_norm_like_sgd']
new_test_log = {name: test_log[name][1:] for name in exotic_names if name in test_log}
plot_graphs(new_test_log, 'loss', fs = (20, 10))