In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torch.autograd import Variable
from ada_hessian import AdaHessian
import torch.optim.lr_scheduler as lr_scheduler
import time
import pandas as pd

import numpy as np
import math

In [None]:
class BasicBlock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm stages added to a shortcut.

    Args:
        in_planes: number of input channels.
        planes: number of channels produced by both convolutions.
        stride: stride of the first convolution (and of the projection
            shortcut when one is needed).
    """

    # Output channels are ``expansion * planes``.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # The shortcut is the identity unless the spatial size or the channel
        # count changes, in which case a 1x1 conv + batch-norm projects the
        # input to the shape of the main branch.
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class Bottleneck(nn.Module):
    """Residual block: 1x1 -> 3x3 -> 1x1 convolutions added to a shortcut.

    Args:
        in_planes: number of input channels.
        planes: channel width of the first two convolutions; the block
            outputs ``expansion * planes`` channels.
        stride: stride of the 3x3 convolution (and of the projection
            shortcut when one is needed).
    """

    # Output channels are ``expansion * planes``.
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # Identity shortcut when input and output shapes agree; otherwise a
        # 1x1 conv + batch-norm projection.
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = F.relu(self.bn2(self.conv2(hidden)))
        out = self.bn3(self.conv3(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class ResNet(nn.Module):
    """ResNet backbone: a 3x3 stem, four residual stages and a linear head.

    Args:
        block: residual block class; it must expose an ``expansion`` class
            attribute and accept ``(in_planes, planes, stride)``.
        num_blocks: sequence whose first four entries give the number of
            blocks in each stage.
        num_classes: size of the final linear layer's output.

    The head flattens the result of a 4x4 average pool into a layer that
    expects ``512 * block.expansion`` features, which is what 32x32 inputs
    (as in the commented-out smoke test below the factories) produce.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Running channel count; updated by _make_layer as stages are built.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        per_block_strides = [stride, *([1] * (num_blocks - 1))]
        stage = []
        for block_stride in per_block_strides:
            stage.append(block(self.in_planes, planes, block_stride))
            # The next block consumes what this one emits.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return the class scores (logits) for a batch of images."""
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        out = F.avg_pool2d(out, 4)
        flat = out.view(out.size(0), -1)
        return self.linear(flat)


def ResNet18():
    """Build a ResNet of BasicBlocks with stage depths (2, 2, 2, 2)."""
    stage_depths = [2, 2, 2, 2]
    return ResNet(BasicBlock, stage_depths)


def ResNet34():
    """Build a ResNet of BasicBlocks with stage depths (3, 4, 6, 3)."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(BasicBlock, stage_depths)


def ResNet50():
    """Build a ResNet of Bottleneck blocks with stage depths (3, 4, 6, 3)."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(Bottleneck, stage_depths)


def ResNet101():
    """Build a ResNet of Bottleneck blocks with stage depths (3, 4, 23, 3)."""
    stage_depths = [3, 4, 23, 3]
    return ResNet(Bottleneck, stage_depths)


def ResNet152():
    """Build a ResNet of Bottleneck blocks with stage depths (3, 8, 36, 3)."""
    stage_depths = [3, 8, 36, 3]
    return ResNet(Bottleneck, stage_depths)


#def test():
#    net = ResNet18()
#    y = net(torch.randn(1, 3, 32, 32))
#    print(y.size())


In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Mini-batch size used by the CIFAR-10 data loaders built further down.
batch_size = 256


def getNet(device):
    """Return a freshly initialised ResNet18 moved to ``device``."""
    # nn.Module.to() moves the parameters in place and returns the module.
    return ResNet18().to(device)

def generateExperiment(net, optimizer, trainloader, testloader,
                       isHessian, csv_name, device, criterion = nn.CrossEntropyLoss(), total_epochs = 160):
    """Train a classifier, evaluate it after every epoch and dump metrics to CSV.

    Args:
        net: model to train; it is expected to already live on ``device``.
        optimizer: optimizer built over ``net``'s parameters.
        trainloader: iterable of batches; element 0 is the inputs, element 1
            the integer class labels.
        testloader: iterable of evaluation batches with the same layout.
        isHessian: when True the backward pass keeps the graph
            (``create_graph=True``) so the optimizer can take second-order
            derivatives.
        csv_name: path of the CSV file that receives the per-epoch metrics.
        device: device each batch is moved to.
        criterion: loss function applied to ``(outputs, labels)``.
        total_epochs: number of passes over ``trainloader``.

    The CSV has the columns ``epoch, loss, accuracy, val_loss, val_acc,
    opt_time``; ``opt_time`` is the mean ``time.process_time()`` spent inside
    ``optimizer.step()`` per batch. Nothing is returned.

    The learning rate is multiplied by 0.1 after epochs 80 and 120. The
    scheduler is stepped once per *epoch*: stepping it per batch would make
    the milestones fire after 80 / 120 mini-batches instead.
    """
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        [80, 120],
        gamma=0.1,
        last_epoch=-1)

    # Remember the caller's mode so it can be restored once we are done.
    was_training = net.training

    train_losses = []
    train_acc = []
    train_times = []
    val_loss = []
    val_acc = []
    epochs = []

    for epoch in range(total_epochs):

        # ---- training pass -------------------------------------------------
        net.train()  # batch-norm uses batch statistics and updates its running stats
        train_loss = 0.0
        train_step = 0
        train_total = 0
        train_correct = 0
        opt_time = 0
        for data in trainloader:
            train_step += 1
            inputs, labels = data[0].to(device), data[1].to(device)

            optimizer.zero_grad()
            outputs = net(inputs)

            # Accuracy bookkeeping works on a detached view of the logits.
            _, predicted = torch.max(outputs.detach(), 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

            loss = criterion(outputs, labels)
            if isHessian:
                # Keep the graph so the optimizer can differentiate the gradients.
                loss.backward(create_graph=True)
            else:
                loss.backward()

            # Only the optimizer update itself is timed.
            t = time.process_time()
            optimizer.step()
            opt_time += time.process_time() - t

            train_loss += loss.item()

        # Milestones [80, 120] are expressed in epochs, so step once per epoch.
        scheduler.step()

        # ---- evaluation pass -----------------------------------------------
        net.eval()  # batch-norm switches to its running statistics
        test_loss = 0.0
        test_step = 0
        test_total = 0
        test_correct = 0
        with torch.no_grad():  # no graph is needed for evaluation
            for data in testloader:
                test_step += 1
                inputs, labels = data[0].to(device), data[1].to(device)

                outputs = net(inputs)

                _, predicted = torch.max(outputs, 1)
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()

                test_loss += criterion(outputs, labels).item()

        train_losses.append(train_loss / train_step)
        train_acc.append(train_correct / train_total)
        train_times.append(opt_time / train_step)
        val_loss.append(test_loss / test_step)
        val_acc.append(test_correct / test_total)
        epochs.append(epoch)
        print("Epoch: " + str(epoch) + " finished")

    net.train(was_training)

    extract_dat = pd.DataFrame({
        "epoch": epochs,
        "loss": train_losses,
        "accuracy": train_acc,
        "val_loss": val_loss,
        "val_acc": val_acc,
        "opt_time": train_times,
    })
    extract_dat.to_csv(csv_name, index=False)

## Experiment 1: Computer Vision

In [None]:
# Number of worker processes handed to each DataLoader.
nj = 32

# Per-channel statistics passed to Normalize in both pipelines.
channel_mean = (0.4914, 0.4822, 0.4465)
channel_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: random 32x32 crop (after 4 px padding) and random
# horizontal flip, followed by tensor conversion and normalisation.
transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Evaluation pipeline: no augmentation, only tensor conversion + normalisation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# CIFAR-10 is downloaded into ./data when it is not already there.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test)

# Only the training loader shuffles its samples.
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=nj)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=nj)




In [None]:
# CIFAR-10 run 1: SGD with momentum 0.9 on a fresh network.
net = getNet(device)

optimizer = optim.SGD(
    net.parameters(),
    lr=0.1,
    momentum=0.9,
    weight_decay=5e-4,
)

generateExperiment(
    net=net,
    optimizer=optimizer,
    trainloader=trainloader,
    testloader=testloader,
    isHessian=False,
    csv_name="SGD_Moment_torch_new.csv",
    device=device,
)

In [None]:
# CIFAR-10 run 2: plain SGD (momentum 0) on a fresh network.
net = getNet(device)

optimizer = optim.SGD(
    net.parameters(),
    lr=0.1,
    momentum=0,
    weight_decay=5e-4,
)

generateExperiment(
    net=net,
    optimizer=optimizer,
    trainloader=trainloader,
    testloader=testloader,
    isHessian=False,
    csv_name="SGD_torch_new.csv",
    device=device,
)

In [None]:
# CIFAR-10 run 3: AdaHessian on a fresh network. isHessian=True makes the
# training loop call backward(create_graph=True) for this optimizer.
net = getNet(device)
optimizer_ada = AdaHessian(
    net.parameters(),
    lr=0.15,
    average_conv_kernel=True,
    hessian_power=1,
    n_samples=1,
    weight_decay=5e-4,
)

generateExperiment(
    net=net,
    optimizer=optimizer_ada,
    trainloader=trainloader,
    testloader=testloader,
    isHessian=True,
    csv_name="AdaHess_torch_new.csv",
    device=device,
)

In [None]:
# CIFAR-10 run 4: Adam on a fresh network.
net = getNet(device)
optimizer_adam = optim.Adam(
    net.parameters(),
    lr=0.001,
    weight_decay=5e-4,
)

generateExperiment(
    net=net,
    optimizer=optimizer_adam,
    trainloader=trainloader,
    testloader=testloader,
    isHessian=False,
    csv_name="Adam_torch_new.csv",
    device=device,
)

In [None]:
# CIFAR-10 run 5: AdamW on a fresh network.
net = getNet(device)
optimizer_adamw = optim.AdamW(
    net.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

generateExperiment(
    net=net,
    optimizer=optimizer_adamw,
    trainloader=trainloader,
    testloader=testloader,
    isHessian=False,
    csv_name="AdamW_torch_new.csv",
    device=device,
)

## Experiment 2: DNN

In [None]:
# Number of synthetic samples and mini-batch size for the regression experiment.
# NOTE: this rebinds the module-level `batch_size` (256 for the CIFAR-10 cells);
# re-running the CIFAR-10 loader cell after this one would pick up 2000.
n = 100000
batch_size=2000


def gen_egg_pts(n):
    """Sample ``n`` noisy evaluations of the Eggholder function.

    Both coordinates are drawn uniformly from [-512, 512]. The target is

        f(x1, x2) = -(x2 + 47) * sin(sqrt(|x1 / 2 + (x2 + 47)|))
                    - x1 * sin(sqrt(|x1 - (x2 + 47)|))

    plus zero-mean Gaussian noise of variance 0.3.

    Args:
        n: number of points to generate.

    Returns:
        A pair ``(X, y)`` where ``X`` has shape ``(n, 2)`` (columns x1, x2)
        and ``y`` has shape ``(n,)``.
    """
    x1 = np.random.uniform(-512, 512, n)
    x2 = np.random.uniform(-512, 512, n)
    shifted = x2 + 47
    # Both sine arguments take the square root of the absolute value; the
    # second term previously omitted the sqrt, which is not the Eggholder
    # function.
    f_x = (-shifted * np.sin(np.sqrt(np.abs(x1 / 2 + shifted)))
           - x1 * np.sin(np.sqrt(np.abs(x1 - shifted))))
    noise = np.random.normal(0, math.sqrt(0.3), n)
    X = np.column_stack((x1, x2))
    return X, f_x + noise


x, y = gen_egg_pts(n)

In [None]:
# 80 % of the generated points are used for training, the rest for testing.
train_size = int(n * 0.8)
test_size = n - train_size

criterion = nn.MSELoss()

# Wrap the NumPy arrays as tensors and pair inputs with targets.
tensor_x = torch.Tensor(x)
tensor_y = torch.Tensor(y)
my_dataset = torch.utils.data.TensorDataset(tensor_x, tensor_y)

# Random, non-overlapping train / test subsets of the requested sizes.
split_lengths = (train_size, test_size)
train_reg, test_reg = torch.utils.data.random_split(my_dataset, split_lengths)

# Neither loader is asked to shuffle, so batches come in the same order
# every epoch.
train_reg_loader = torch.utils.data.DataLoader(train_reg, batch_size=batch_size)
test_reg_loader = torch.utils.data.DataLoader(test_reg, batch_size=batch_size)

In [None]:
class Reg_Net(nn.Module):
    """Fully connected regressor: 2 inputs -> four hidden layers of 120 -> 1 output.

    Every hidden layer is followed by a ReLU; the output layer is linear.
    """

    def __init__(self):
        super(Reg_Net, self).__init__()
        self.fc1 = nn.Linear(2, 120)
        self.fc2 = nn.Linear(120, 120)
        self.fc3 = nn.Linear(120, 120)
        self.fc4 = nn.Linear(120, 120)
        self.fc5 = nn.Linear(120, 1)

    def forward(self, x):
        """Map a ``(batch, 2)`` tensor to ``(batch, 1)`` predictions."""
        # The first activation was written as ``F.(...)``, a syntax error;
        # ReLU matches the other hidden layers.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x

def getRegNet(device):
    """Return a freshly initialised Reg_Net moved to ``device``."""
    # nn.Module.to() moves the parameters in place and returns the module.
    return Reg_Net().to(device)

In [None]:
def generateExperimentReg(net, optimizer, trainloader, testloader,
                       isHessian, csv_name, device, criterion = nn.MSELoss(), total_epochs = 2000):
    """Train a regressor, evaluate it after every epoch and dump metrics to CSV.

    Args:
        net: model to train; it is expected to already live on ``device``.
        optimizer: optimizer built over ``net``'s parameters.
        trainloader: iterable of batches; element 0 is the inputs, element 1
            the targets. Targets get a trailing dimension added
            (``unsqueeze(1)``) before the loss is computed.
        testloader: iterable of evaluation batches with the same layout.
        isHessian: when True the backward pass keeps the graph
            (``create_graph=True``) so the optimizer can take second-order
            derivatives.
        csv_name: path of the CSV file that receives the per-epoch metrics.
        device: device each batch is moved to.
        criterion: loss function applied to ``(outputs, targets)``.
        total_epochs: number of passes over ``trainloader``.

    The CSV has the columns ``epoch, loss, val_loss, opt_time``; ``opt_time``
    is the mean ``time.process_time()`` spent inside ``optimizer.step()`` per
    batch. Nothing is returned.

    The learning rate is multiplied by 0.1 after epochs 800 and 1200. The
    scheduler is stepped once per *epoch*: stepping it per batch would make
    the milestones fire after 800 / 1200 mini-batches instead.
    """
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        [800, 1200],
        gamma=0.1,
        last_epoch=-1)

    # Remember the caller's mode so it can be restored once we are done.
    was_training = net.training

    train_losses = []
    train_times = []
    val_loss = []
    epochs = []

    for epoch in range(total_epochs):

        # ---- training pass -------------------------------------------------
        net.train()
        train_loss = 0.0
        train_step = 0
        opt_time = 0
        for data in trainloader:
            train_step += 1
            inputs, y = data[0].to(device), data[1].to(device)
            # Give the targets the same (batch, 1) layout as the network output.
            y = torch.unsqueeze(y, 1)

            optimizer.zero_grad()
            outputs = net(inputs)

            loss = criterion(outputs, y)
            if isHessian:
                # Keep the graph so the optimizer can differentiate the gradients.
                loss.backward(create_graph=True)
            else:
                loss.backward()

            # Only the optimizer update itself is timed.
            t = time.process_time()
            optimizer.step()
            opt_time += time.process_time() - t

            train_loss += loss.item()

        # Milestones [800, 1200] are expressed in epochs, so step once per epoch.
        scheduler.step()

        # ---- evaluation pass -----------------------------------------------
        net.eval()
        test_loss = 0.0
        test_step = 0
        with torch.no_grad():  # no graph is needed for evaluation
            for data in testloader:
                test_step += 1
                inputs, y = data[0].to(device), data[1].to(device)
                y = torch.unsqueeze(y, 1)

                outputs = net(inputs)
                test_loss += criterion(outputs, y).item()

        train_losses.append(train_loss / train_step)
        train_times.append(opt_time / train_step)
        val_loss.append(test_loss / test_step)
        epochs.append(epoch)
        print("Epoch: " + str(epoch) + " finished with training loss " + str(train_loss / train_step))

    net.train(was_training)

    extract_dat = pd.DataFrame({
        "epoch": epochs,
        "loss": train_losses,
        "val_loss": val_loss,
        "opt_time": train_times,
    })
    extract_dat.to_csv(csv_name, index=False)

In [None]:
# Regression run 1: SGD with momentum on a fresh network.
# (10e-6 is 1e-5 and 10e-3 is 0.01.)
net = getRegNet(device)

optimizer = optim.SGD(
    net.parameters(),
    lr=10e-6,
    momentum=10e-3,
    weight_decay=5e-4,
)

generateExperimentReg(
    net=net,
    optimizer=optimizer,
    trainloader=train_reg_loader,
    testloader=test_reg_loader,
    isHessian=False,
    csv_name="SGD_Moment_Reg_torch.csv",
    device=device,
)

In [None]:
# Regression run 2: plain SGD (momentum 0) on a fresh network.
# (10e-6 is 1e-5.)
net = getRegNet(device)

optimizer = optim.SGD(
    net.parameters(),
    lr=10e-6,
    momentum=0,
    weight_decay=5e-4,
)

generateExperimentReg(
    net=net,
    optimizer=optimizer,
    trainloader=train_reg_loader,
    testloader=test_reg_loader,
    isHessian=False,
    csv_name="SGD_Reg_torch.csv",
    device=device,
)

In [None]:
# Regression run 3: Adam (library-default learning rate) on a fresh network.
net = getRegNet(device)

optimizer = optim.Adam(
    net.parameters(),
    weight_decay=5e-4,
)

generateExperimentReg(
    net=net,
    optimizer=optimizer,
    trainloader=train_reg_loader,
    testloader=test_reg_loader,
    isHessian=False,
    csv_name="Adam_Reg_torch.csv",
    device=device,
)

In [None]:
# Regression run 4: AdamW (library-default learning rate) on a fresh network.
net = getRegNet(device)

optimizer = optim.AdamW(
    net.parameters(),
    weight_decay=5e-4,
)

generateExperimentReg(
    net=net,
    optimizer=optimizer,
    trainloader=train_reg_loader,
    testloader=test_reg_loader,
    isHessian=False,
    csv_name="AdamW_Reg_torch.csv",
    device=device,
)

In [None]:
# Regression run 5: AdaHessian on a fresh network. isHessian=True makes the
# training loop call backward(create_graph=True) for this optimizer.
net = getRegNet(device)

optimizer = AdaHessian(
    net.parameters(),
    lr=0.1,
    weight_decay=5e-4,
)

generateExperimentReg(
    net=net,
    optimizer=optimizer,
    trainloader=train_reg_loader,
    testloader=test_reg_loader,
    isHessian=True,
    csv_name="AdamHess_Reg_torch.csv",
    device=device,
)