In [1]:
import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn.functional as F

import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

import os
import re
import random as ra

In [2]:
# Layer widths passed to Net: input features, the two hidden layers, and the output layer.
input_size = 784  # matches the images.view(-1, 784) flattening used for the MNIST batch
h0 = 32           # width of the first hidden layer
h1 = 16           # width of the second hidden layer
out_size = 10     # width of the output layer

In [3]:
# Load the MNIST test split (downloaded to ./data if missing) and keep only the
# first batch of 1000 images. shuffle=False makes that selection the same on every run.
to_tensor = transforms.ToTensor()
ds = datasets.MNIST(root='./data', train=False, transform=to_tensor, download=True)
loader = torch.utils.data.DataLoader(ds, batch_size=1000, shuffle=False)

# Each batch is an (images, labels) pair; the labels are not needed here.
first_batch = next(iter(loader))
# Flatten every image into a 784-element row vector.
images = first_batch[0].view(-1, 784)

In [4]:
class Net(nn.Module):
    """Fully connected network with two tanh hidden layers and a linear output layer.

    Note that ``forward`` returns the two hidden-layer activations, not the
    output of ``fc2``: this notebook compares hidden representations between
    model instances.
    """

    def __init__(self, input_size, hidden_size0, hidden_size1, out_size):
        """Create the three linear layers and apply Xavier initialisation.

        Args:
            input_size: number of input features.
            hidden_size0: width of the first hidden layer.
            hidden_size1: width of the second hidden layer.
            out_size: width of the output layer.
        """
        super().__init__()
        self.fc0 = nn.Linear(input_size, hidden_size0)     #784 - 32
        self.fc1 = nn.Linear(hidden_size0, hidden_size1)   #32 - 16
        # fc2 is kept as a module even though forward() does not evaluate it,
        # so that state dicts containing fc2.weight / fc2.bias still load.
        self.fc2 = nn.Linear(hidden_size1, out_size)       #16 - 10
        self.tanh = nn.Tanh()
        self.init_weights()

    def init_weights(self):
        """Re-initialise all three weight matrices with Xavier-uniform and the tanh gain."""
        gain = nn.init.calculate_gain('tanh')
        # Same layer order as before (fc0, fc1, fc2) so seeded runs draw identical weights.
        for layer in (self.fc0, self.fc1, self.fc2):
            nn.init.xavier_uniform_(layer.weight, gain=gain)

    def forward(self, x):
        """Return the activations of both hidden layers for input ``x``.

        Returns:
            A tuple ``(a0, a1)``: the tanh activations of ``fc0`` and of ``fc1``.
        """
        a0 = self.tanh(self.fc0(x))
        a1 = self.tanh(self.fc1(a0))
        # The fc2 output was previously computed here and then discarded;
        # skipping it does not change the returned values.
        return a0, a1

In [None]:
#examples of two model instances being loaded for comparison
netA = Net(input_size, h0, h1, out_size)
netA.load_state_dict(torch.load('stored_model_weights/model_inst_0'))
netA.eval()
netB = Net(input_size, h0, h1, out_size)
netB.load_state_dict(torch.load('stored_model_weights/model_inst_4'))
netB.eval()

Net(
  (fc0): Linear(in_features=784, out_features=32, bias=True)
  (fc1): Linear(in_features=32, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=10, bias=True)
  (tanh): Tanh()
)

In [6]:
#collecting activations
# Pure inference: run both nets once on the whole batch with autograd disabled
# instead of tracking gradients image by image. Each result has one row per
# image and one column per neuron; values match the per-image loop up to
# floating-point round-off, as every image is processed independently.
with torch.no_grad():
    hiddenA0, hiddenA1 = netA(images)
    hiddenB0, hiddenB1 = netB(images)
actsA0, actsA1 = hiddenA0.numpy(), hiddenA1.numpy()
actsB0, actsB1 = hiddenB0.numpy(), hiddenB1.numpy()


def _similarity_pair(u, v):
    """Return (cosine similarity, Pearson r) for two 1-D activation vectors."""
    cos = F.cosine_similarity(torch.tensor(u), torch.tensor(v), dim=0).item()
    return cos, pearsonr(u, v)[0]


def _layer_similarities(acts_a, acts_b):
    """Compare two models' activations of one layer, image by image.

    Args:
        acts_a, acts_b: 2-D arrays with one row of neuron activations per image.

    Returns:
        Four lists with one entry per image:
        (raw cosine, raw Pearson, sorted cosine, sorted Pearson).
    """
    raw_cos, raw_corr, sorted_cos, sorted_corr = [], [], [], []
    for vec_a, vec_b in zip(acts_a, acts_b):
        #raw comparison of distribution: neuron i of A against neuron i of B
        cos, corr = _similarity_pair(vec_a, vec_b)
        raw_cos.append(cos)
        raw_corr.append(corr)

        #sorted comparison of values: descending order discards neuron positions.
        # .copy() materialises the reversed view, because torch.tensor does not
        # accept NumPy arrays with negative strides.
        desc_a = np.sort(vec_a)[::-1].copy()
        desc_b = np.sort(vec_b)[::-1].copy()
        cos, corr = _similarity_pair(desc_a, desc_b)
        sorted_cos.append(cos)
        sorted_corr.append(corr)
    return raw_cos, raw_corr, sorted_cos, sorted_corr


#collecting similarities
raw_cos0, raw_corr0, sorted_cos0, sorted_corr0 = _layer_similarities(actsA0, actsB0)
raw_cos1, raw_corr1, sorted_cos1, sorted_corr1 = _layer_similarities(actsA1, actsB1)



In [7]:
def report(layer, raw_cos, raw_cor, sorted_cos, sorted_cor):
    """Print mean ± standard deviation of the four similarity measures of one layer.

    Args:
        layer: layer identifier shown in each printed line.
        raw_cos: cosine similarities of the unsorted activations.
        raw_cor: Pearson correlations of the unsorted activations.
        sorted_cos: cosine similarities of the sorted activations.
        sorted_cor: Pearson correlations of the sorted activations.
    """
    # (label, values) in the order the lines are printed.
    measures = (
        ("raw cosine similarity", raw_cos),
        ("sorted cosine similarity", sorted_cos),
        ("raw Pearson correlation", raw_cor),
        ("sorted Pearson correlation", sorted_cor),
    )
    lines = [
        f"Layer {layer} {label}: {np.mean(values):.3f} ± {np.std(values):.3f}"
        for label, values in measures
    ]
    # The extra "\n" leaves a blank line after each layer's summary.
    print("\n".join(lines) + "\n")

# Summarise the per-image similarities for the first and the second hidden layer.
report(layer=0, raw_cos=raw_cos0, raw_cor=raw_corr0,
       sorted_cos=sorted_cos0, sorted_cor=sorted_corr0)
report(layer=1, raw_cos=raw_cos1, raw_cor=raw_corr1,
       sorted_cos=sorted_cos1, sorted_cor=sorted_corr1)

Layer 0 raw cosine similarity: 0.066 ± 0.156
Layer 0 sorted cosine similarity: 0.967 ± 0.031
Layer 0 raw Pearson correlation: -0.014 ± 0.171
Layer 0 sorted Pearson correlation: 0.973 ± 0.024

Layer 1 raw cosine similarity: 0.121 ± 0.201
Layer 1 sorted cosine similarity: 0.931 ± 0.059
Layer 1 raw Pearson correlation: 0.034 ± 0.195
Layer 1 sorted Pearson correlation: 0.948 ± 0.039



-> The output layer will always have highly similar activation patterns across model instances, assuming all instances were trained to a comparable accuracy.
This is because only the output is explicitly optimised for. The hidden layers are optimised only implicitly and are not used directly in the loss calculation -> their structure can be arbitrary.

Conclusion:

When comparing the distribution of neuron activations across model instances for hidden layers, similarity is very low because it is arbitrarily based on the random weight init.
When comparing the sorted neuron activations, the similarity rises sharply, suggesting that the same values are encoded, meaning the features encoded in any model instance are likely very similar.
Only the distribution differs, because the model does not optimise for any spatial structure (like convolutional kernels in CNNs would), but simply finds some configuration, regardless of spatial structure, that encodes the necessary features.

In the next experiment, 09_subnet_encoding.ipynb, I will look for such spatially ignorant but feature-aware encoding, not for the entire digit, but for lower-level concepts that humans can interpret, unlike the pixels that neurons normally encode, as seen in 04_neuron_attention(_binary).ipynb.
Having seen the scattered neuron attention in experiment 04, I expect not to find any lower-level concepts clearly encoded in sub-networks.

The model loses information about which human-interpretable, low-level concepts actually compose the pixels a neuron "looks at". That is because the neurons are not optimised for recognising low-level concepts; instead they just need some way of encoding the entire digit, for which they can use any arbitrary pixel combination that sums up to a useful activation in the forward pass, ignorant of human-interpretable concepts. For the simplicity of the experiments, I define a human-interpretable concept as a localised and continuously connected cluster of pixels, rather than the scattered attention that my neurons seem to display.

This is why their attention (which will be formally defined in the paper) may be scattered and still achieve high accuracy on the given OCR tasks (as well as tasks from outside of CV).