In [1]:
import matplotlib.pyplot as plt
import numpy as np

In [99]:
import torch
import torchvision.transforms as transforms
from torchvision import datasets
import torch.backends.cudnn as cudnn

# MNIST training split, stored under ./data (download requested); images become tensors
train_dataset = datasets.MNIST(
    root='./data', train=True, download=True, transform=transforms.ToTensor()
)
# MNIST test split, read from the same root (no download requested here)
test_dataset = datasets.MNIST(
    root='./data', train=False, transform=transforms.ToTensor()
)

In [86]:
class LogisticRegression(torch.nn.Module):
    """Multinomial logistic regression: one linear layer producing class scores.

    ``forward`` returns raw, unnormalised scores (logits). The notebook trains
    this model with ``torch.nn.CrossEntropyLoss``, which applies log-softmax
    itself and expects logits as input. Squashing the scores through a sigmoid
    first (as the previous version did) confines them to (0, 1), which caps the
    softmax probability of any class and keeps the loss from approaching zero.
    The arg-max prediction is unaffected because the sigmoid is monotonic.

    Args:
        n_inputs: number of input features per sample.
        n_outputs: number of classes (one score per class).
    """

    def __init__(self, n_inputs, n_outputs):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)

    def forward(self, x):
        """Return the logits for ``x``; the last dimension of ``x`` must be ``n_inputs``."""
        # no activation here: the loss (or a softmax applied by the caller) normalises
        return self.linear(x)

# MNIST

In [100]:
from torch.utils.data import DataLoader
 
# mini-batch iterators over both splits; only the training split is shuffled
batach_size = 64
train_loader = DataLoader(train_dataset, batch_size=batach_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batach_size, shuffle=False)

# Training

In [101]:
def training(optimizer, device, log_regr, epochs=50, train_batches=None, test_batches=None):
    """Train ``log_regr`` with ``optimizer`` and evaluate it after every epoch.

    Each input batch is flattened to ``(-1, 28*28)`` before being fed to the
    model, and the loss is ``torch.nn.CrossEntropyLoss``.

    Args:
        optimizer: optimizer already bound to ``log_regr``'s parameters. If it
            exposes a ``_grad_norm`` attribute, entries 0 and 1 are accumulated
            after every step (their meaning is defined by the optimizer, which
            is not part of this function). Optimizers without that attribute
            (e.g. the stock ``torch.optim`` ones) are accepted; their
            gradient statistics are reported as NaN.
        device: device the batches are moved to before the forward pass.
        log_regr: the model to train; expected to already live on ``device``.
        epochs: number of passes over the training batches (default 50).
        train_batches: iterable of ``(images, labels)`` batches; defaults to
            the module-level ``train_loader``.
        test_batches: iterable of ``(images, labels)`` batches; defaults to
            the module-level ``test_loader``.

    Returns:
        ``(Loss, acc, loss_grad_regular, loss_grad_irregular)``, four lists with
        one entry per epoch: the loss of the last training batch, the test
        accuracy in percent, and the per-batch averages of ``_grad_norm[0]``
        and ``_grad_norm[1]``.

    Raises:
        ValueError: if the training iterable yields no batches.
    """
    if train_batches is None:
        train_batches = train_loader
    if test_batches is None:
        test_batches = test_loader

    # defining Cross-Entropy loss
    criterion = torch.nn.CrossEntropyLoss()
    nan = float('nan')
    loss_grad_regular = []
    loss_grad_irregular = []
    Loss = []
    acc = []
    inputs = 28*28
    for epoch in range(epochs):
        avg_grad_regular = 0
        avg_grad_irregular = 0
        t = 0
        loss = None
        for images, labels in train_batches:
            t += 1
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = log_regr(images.view(-1, inputs))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # only custom optimizers publish _grad_norm; fall back to NaN otherwise
            grad_norm = getattr(optimizer, '_grad_norm', None)
            if grad_norm is None:
                avg_grad_regular += nan
                avg_grad_irregular += nan
            else:
                avg_grad_regular += grad_norm[0]
                avg_grad_irregular += grad_norm[1]

        if t == 0:
            raise ValueError("training(): the training iterable produced no batches")

        loss_grad_regular.append(avg_grad_regular/t)
        loss_grad_irregular.append(avg_grad_irregular/t)

        # the recorded loss is that of the last training batch of the epoch
        last_loss = loss.item()
        Loss.append(last_loss)

        correct = 0
        total = 0
        # evaluation needs no gradients: skip building the autograd graph
        with torch.no_grad():
            for images, labels in test_batches:
                images, labels = images.to(device), labels.to(device)
                outputs = log_regr(images.view(-1, inputs))
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        accuracy = 100 * correct / total if total else nan
        acc.append(accuracy)
        # note: the two gradient figures printed here are the epoch sums, not the averages
        print('Epoch: {}. Loss: {}. Accuracy: {}. Grad: {}. New_Grad: {}.'.format(epoch, last_loss, accuracy, avg_grad_regular, avg_grad_irregular))
    return Loss, acc, loss_grad_regular, loss_grad_irregular

def train_models(
    learning_rates=[0.1, 0.01, 0.001],
    weight_decayes=[0.01, 0.001, 0.0001],
    name="AdamL2",
    optim=torch.optim.Adam,
):
    """Train one fresh LogisticRegression per (learning rate, weight decay) pair.

    For every combination a new 784-input / 10-output model is created, moved
    to the GPU when one is available (and wrapped in ``DataParallel`` there),
    given an optimizer built by ``optim(params, lr=..., weight_decay=...)`` and
    passed to ``training``.

    Returns a 6-tuple of lists, one entry per combination, in grid order
    (learning rate outer, weight decay inner):
    ``(models, names, Loss, Acc, Loss_grad_regular, Loss_grad_irregular)``.
    Note that the first list holds the *optimizer* objects, not the trained
    modules. NOTE(review): confirm whether callers expect the modules instead.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # a flattened 28x28 image is a vector of 784 features; ten output scores
    feature_dim, class_count = 28 * 28, 10

    runs = []
    for lr in learning_rates:
        for wd in weight_decayes:
            net = LogisticRegression(feature_dim, class_count).to(device)
            if device == 'cuda':
                net = torch.nn.DataParallel(net)
                cudnn.benchmark = True

            opt = optim(net.parameters(), lr=lr, weight_decay=wd)
            run_loss, run_acc, run_grad_regular, run_grad_irregular = training(
                opt, device=device, log_regr=net
            )

            label = name + f"(lr={lr}, wd={wd})"
            runs.append((opt, label, run_loss, run_acc, run_grad_regular, run_grad_irregular))

    # transpose the per-run records into six parallel lists
    return tuple([run[field] for run in runs] for field in range(6))

In [111]:
from importlib import reload
import s_adamw
reload(s_adamw)
from s_adamw import our_AdamW

In [None]:
# single training run with the custom our_AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# a flattened 28x28 image gives 784 input features; ten output scores
n_inputs, n_outputs = 28 * 28, 10
log_regr = LogisticRegression(n_inputs, n_outputs).to(device)
if device == 'cuda':
    cudnn.benchmark = True
    log_regr = torch.nn.DataParallel(log_regr)
Loss, acc, loss_grad_regular, loss_grad_irregular = training(
    our_AdamW(log_regr.parameters(), lr=0.1, weight_decay=0.01, betas=(0.9, 0.999)),
    device,
    log_regr,
)

## Kaggle

In [None]:
# Kaggle-only setup (left commented out)
# import the helper we'll need to bring in our custom module
#from shutil import copyfile

# copy our file into the working directory (make sure it has .py suffix)
#copyfile(src = "../input/optimizers/adamwh.py", dst = "../working/adamwh.py")

# import all our functions
#from importlib import reload
#import adamwh
#reload(adamwh)
#from adamwh import AdamWH