In [3]:
import argparse
import os
import shutil
import time

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import resnet
from custom_datasets import CIFAR10C
from trainer import AverageMeter, model_names
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


['resnet110', 'resnet1202', 'resnet20', 'resnet32', 'resnet44', 'resnet56']


In [4]:
model = torch.nn.DataParallel(resnet.__dict__['resnet20'](num_classes=10))

    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


In [5]:
run_name = "xent_ls0._seed2"
checkpoint_name = "data/"+run_name+"/checkpoint.th"
model.cuda()

print("=> loading checkpoint '{}'".format(checkpoint_name))
checkpoint = torch.load(checkpoint_name)
model.load_state_dict(checkpoint['state_dict'])

model.eval()


=> loading checkpoint 'data/xent_ls0._seed2/checkpoint.th'


DataParallel(
  (module): ResNet(
    (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (shortcut): Sequential()
      )
      (1): BasicBlock(
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
       

In [6]:
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

val_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=1024, shuffle=False,
    num_workers=4, pin_memory=True)

valc_loaders_all = []
for corruption_type in ["impulse_noise", "defocus_blur", "motion_blur", "shot_noise", "speckle_noise"]:
    valc_loaders = []
    for corruption_level in range(5):
        valc_loader = torch.utils.data.DataLoader(
            CIFAR10C(corruption_type, corruption_level, transform=transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ])),
            batch_size=1024, shuffle=False,
            num_workers=4, pin_memory=True)
        valc_loaders.append(valc_loader)
    valc_loaders_all.append(valc_loaders)

In [8]:
i, (input, target) = next(enumerate(val_loader))

In [9]:


criterion = torch.nn.CrossEntropyLoss()
grads_all = torch.zeros((0, 16, 16, 3, 3))
entropies_all = torch.zeros((0))
accuracies_all = torch.zeros((0))

for i in range(1024):
    outputs = model((input[i].unsqueeze(0).cuda()))

    loss = criterion(outputs, target[i].unsqueeze(0).cuda())
    
    outputs = model((input[i].unsqueeze(0).cuda()))
    
    accuracy = (outputs.argmax(dim=1) == target[i].unsqueeze(0).cuda()).float().mean()
    accuracies_all = torch.cat((accuracies_all, accuracy.unsqueeze(0).detach().cpu()), dim=0)

    outputs  = F.softmax(outputs, dim=1)
    entropy = -torch.sum(outputs * torch.log(outputs), dim=1)
    entropies_all = torch.cat((entropies_all, entropy.unsqueeze(0).detach().cpu()), dim=0)

    # compute gradient and do SGD step
    loss.backward(gradient = torch.ones_like(loss))
    grad = model._modules['module'].layer1[2].conv2.weight.grad
    grads_all = torch.cat((grads_all, grad.unsqueeze(0).detach().cpu()), dim=0)


# grads_all_flatten = grads_all_test.view(1024, -1)
# grads_all_flatten_c = grads_all_flatten - grads_all_flatten.mean(axis=0)

# np.linalg.norm(np.dot(grads_all_flatten_c, projection_matrix), axis=-1).mean() / np.linalg.norm(grads_all_flatten_c, axis=-1).mean()

In [10]:
accuracies_all.mean()

tensor(0.9209)

In [11]:
entropies_all.mean()

tensor(0.0790)

In [12]:
grads_all_flatten = grads_all.view(1024, -1)

In [13]:
grads_all_flatten.shape

torch.Size([1024, 2304])

In [14]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(grads_all_flatten)
pca.explained_variance_ratio_.sum()

0.9999598355712964

In [15]:
singular_values = pca.singular_values_

In [16]:
projection_matrix =  np.dot(pca.components_[:10].T, pca.components_[:10])

In [17]:
grads_all_flatten_c = grads_all_flatten - grads_all_flatten.mean(axis=0)

np.linalg.norm(np.dot(grads_all_flatten_c, projection_matrix), axis=-1).mean() / np.linalg.norm(grads_all_flatten_c, axis=-1).mean()

0.9863259851871424

In [21]:
i, (input, target) = next(enumerate(valc_loaders_all[4][4]))

grads_all_test = torch.zeros((0, 16, 16, 3, 3))


for i in range(1024):
    outputs = model((input[i].unsqueeze(0).cuda()))

    loss = criterion(outputs, target[i].unsqueeze(0).cuda())    

    # compute gradient and do SGD step
    loss.backward(gradient = torch.ones_like(loss))
    grad = model._modules['module'].layer1[2].conv2.weight.grad
    grads_all_test = torch.cat((grads_all_test, grad.unsqueeze(0).detach().cpu()), dim=0)

In [22]:
grads_all_flatten = grads_all_test.view(1024, -1)
grads_all_flatten_c = grads_all_flatten - grads_all_flatten.mean(axis=0)

np.linalg.norm(np.dot(grads_all_flatten_c, projection_matrix), axis=-1).mean() / np.linalg.norm(grads_all_flatten_c, axis=-1).mean()

0.7396882623119353

In [23]:
i, (input, target) = next(enumerate(valc_loaders_all[4][2]))

grads_all_test = torch.zeros((0, 16, 16, 3, 3))


for i in range(1024):
    outputs = model((input[i].unsqueeze(0).cuda()))

    loss = criterion(outputs, target[i].unsqueeze(0).cuda())    

    # compute gradient and do SGD step
    loss.backward(gradient = torch.ones_like(loss))
    grad = model._modules['module'].layer1[2].conv2.weight.grad
    grads_all_test = torch.cat((grads_all_test, grad.unsqueeze(0).detach().cpu()), dim=0)

grads_all_flatten = grads_all_test.view(1024, -1)
grads_all_flatten_c = grads_all_flatten - grads_all_flatten.mean(axis=0)

np.linalg.norm(np.dot(grads_all_flatten_c, projection_matrix), axis=-1).mean() / np.linalg.norm(grads_all_flatten_c, axis=-1).mean()

0.7837717424451076

In [25]:
i, (input, target) = next(enumerate(valc_loaders_all[4][0]))

grads_all_test = torch.zeros((0, 16, 16, 3, 3))


for i in range(1024):
    outputs = model((input[i].unsqueeze(0).cuda()))

    loss = criterion(outputs, target[i].unsqueeze(0).cuda())    

    # compute gradient and do SGD step
    loss.backward(gradient = torch.ones_like(loss))
    grad = model._modules['module'].layer1[2].conv2.weight.grad
    grads_all_test = torch.cat((grads_all_test, grad.unsqueeze(0).detach().cpu()), dim=0)

grads_all_flatten = grads_all_test.view(1024, -1)
grads_all_flatten_c = grads_all_flatten - grads_all_flatten.mean(axis=0)

np.linalg.norm(np.dot(grads_all_flatten_c, projection_matrix), axis=-1).mean() / np.linalg.norm(grads_all_flatten_c, axis=-1).mean()

0.710528513861052

In [26]:
i, (input, target) = next(enumerate(val_loader))

grads_all_test = torch.zeros((0, 16, 16, 3, 3))


for i in range(1024):
    outputs = model((input[i].unsqueeze(0).cuda()))

    loss = criterion(outputs, target[i].unsqueeze(0).cuda())    

    # compute gradient and do SGD step
    loss.backward(gradient = torch.ones_like(loss))
    grad = model._modules['module'].layer1[2].conv2.weight.grad
    grads_all_test = torch.cat((grads_all_test, grad.unsqueeze(0).detach().cpu()), dim=0)

grads_all_flatten = grads_all_test.view(1024, -1)
grads_all_flatten_c = grads_all_flatten - grads_all_flatten.mean(axis=0)

np.linalg.norm(np.dot(grads_all_flatten_c, projection_matrix), axis=-1).mean() / np.linalg.norm(grads_all_flatten_c, axis=-1).mean()

0.9863259574131259

In [23]:
i, (input, target) = next(enumerate(val_loader))


criterion = torch.nn.CrossEntropyLoss()
grads_all = torch.zeros((0, 16, 16, 3, 3))
entropies_all = torch.zeros((0))
accuracies_all = torch.zeros((0))

for i in range(1024):
    outputs = model((5*(input[i]).unsqueeze(0).cuda()))

    loss = criterion(outputs, target[i].unsqueeze(0).cuda())
        
    accuracy = (outputs.argmax(dim=1) == target[i].unsqueeze(0).cuda()).float().mean()
    accuracies_all = torch.cat((accuracies_all, accuracy.unsqueeze(0).detach().cpu()), dim=0)

    outputs  = F.softmax(outputs, dim=1)
    entropy = -torch.sum(outputs * torch.log(outputs), dim=1)
    entropies_all = torch.cat((entropies_all, entropy.unsqueeze(0).detach().cpu()), dim=0)

    # compute gradient and do SGD step
    loss.backward(gradient = torch.ones_like(loss))
    grad = model._modules['module'].layer1[2].conv2.weight.grad
    grads_all = torch.cat((grads_all, grad.unsqueeze(0).detach().cpu()), dim=0)


grads_all_flatten = grads_all.view(1024, -1)
grads_all_flatten_c = grads_all_flatten - grads_all_flatten.mean(axis=0)

print(np.linalg.norm(np.dot(grads_all_flatten_c, projection_matrix), axis=-1).mean() / np.linalg.norm(grads_all_flatten_c, axis=-1).mean())
print(entropies_all.mean())
print(accuracies_all.mean())

0.6075110270664625
tensor(0.8297)
tensor(0.2109)


In [19]:
entropies_all.mean()

tensor(0.0790)

In [20]:
accuracies_all.mean()

tensor(0.9209)

In [20]:

i, (input, target) = next(enumerate(val_loader))

grads_all_test = torch.zeros((0, 16, 16, 3, 3))
entropies_all = torch.zeros((0))
accuracies_all = torch.zeros((0))

criterion = nn.CrossEntropyLoss()
for i in range(1024):
    outputs = model((input[i].unsqueeze(0).cuda()))
    
    accuracy = (outputs.argmax(dim=1) == target[i].unsqueeze(0).cuda()).float().mean()
    accuracies_all = torch.cat((accuracies_all, accuracy.unsqueeze(0).detach().cpu()), dim=0)

    outputs  = F.softmax(outputs, dim=1)
    print(outputs)
    entropy = -torch.sum(outputs * torch.log(outputs), dim=1)
    entropies_all = torch.cat((entropies_all, entropy.unsqueeze(0).detach().cpu()), dim=0)

    loss = criterion(outputs, target[i].unsqueeze(0).cuda())

    # compute gradient and do SGD step
    loss.backward(gradient = torch.ones_like(loss))
    grad = model._modules['module'].layer1[2].conv2.weight.grad
    grads_all_test = torch.cat((grads_all_test, grad.unsqueeze(0).detach().cpu()), dim=0)

grads_all_flatten = grads_all_test.view(1024, -1)
grads_all_flatten_c = grads_all_flatten - grads_all_flatten.mean(axis=0)

np.linalg.norm(np.dot(grads_all_flatten_c, projection_matrix), axis=-1).mean() / np.linalg.norm(grads_all_flatten_c, axis=-1).mean()

tensor([[2.6027e-09, 7.9256e-08, 4.7952e-06, 9.9971e-01, 5.6225e-10, 2.8793e-04,
         6.6093e-07, 5.4351e-09, 7.7908e-09, 1.2098e-10]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[1.7047e-11, 1.9151e-09, 4.1213e-20, 9.2134e-19, 1.8432e-20, 4.6462e-21,
         1.4232e-18, 1.0124e-19, 1.0000e+00, 1.7421e-14]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[6.7282e-07, 2.7054e-05, 7.2177e-09, 1.7553e-13, 3.5858e-12, 9.5804e-14,
         1.1264e-11, 1.5339e-09, 9.9997e-01, 1.2221e-07]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[9.9873e-01, 1.0275e-05, 7.9748e-06, 8.4663e-06, 1.6488e-09, 1.3663e-10,
         1.6061e-08, 1.1769e-08, 1.2185e-03, 2.8139e-05]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[2.3366e-11, 1.5238e-09, 2.4341e-06, 1.2332e-08, 6.6234e-10, 1.1532e-12,
         1.0000e+00, 1.0007e-12, 5.0937e-11, 5.7497e-10]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[3.0926e-15, 1.0807e-12, 1.1399e

NameError: name 'projection_matrix' is not defined

In [None]:
entropies_all.mean()

tensor(1.0357)

In [None]:
accuracies_all

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [None]:
outputs

tensor([[0.0437, 0.0198, 0.0205, 0.0270, 0.0170, 0.0136, 0.0120, 0.0157, 0.0157,
         0.0263, 0.7885]], device='cuda:0', grad_fn=<SoftmaxBackward0>)