In [1]:
# Enable IPython's autoreload extension in mode 2, so edited project modules
# are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Locate the repository root relative to the directory the kernel was started in.
cwd = os.getcwd()
NOTEBOOK_DIR = os.path.dirname(cwd)  # parent of the working directory

# ROOT sits three directory levels above NOTEBOOK_DIR.
ROOT = NOTEBOOK_DIR
for _level in range(3):
    ROOT = os.path.dirname(ROOT)

# Output location for figures and the YAML configuration consumed by the model config.
FIGURES_DIR = os.path.join(ROOT, 'figures/abc_parameterizations/initialization')
CONFIG_PATH = os.path.join(ROOT, 'pytorch/configs/abc_parameterizations', 'fc_ipllr_mnist.yaml')

In [3]:
import sys
# Make the repository root importable so the `utils` and `pytorch` project packages resolve.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [4]:
import os
from copy import deepcopy
import torch
import math
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, Subset, DataLoader
import torch.nn.functional as F

from utils.tools import read_yaml, set_random_seeds
from pytorch.configs.base import BaseConfig
from pytorch.configs.model import ModelConfig
from pytorch.models.abc_params.fully_connected.ipllr import FcIPLLR
from pytorch.models.abc_params.fully_connected.muP import FCmuP
from pytorch.models.abc_params.fully_connected.ntk import FCNTK
from pytorch.models.abc_params.fully_connected.standard_fc_ip import StandardFCIP
from utils.data.mnist import load_data
from utils.abc_params.debug_ipllr import *

### Load basic configuration and define variables 

In [5]:
# Experiment settings used by the cells that follow.
N_TRIALS = 1  # NOTE(review): not referenced in the visible part of the notebook
SEED = 30
L = 6  # the model config is given n_layers = L + 1; hidden layers are indexed 2..L
width = 1024  # written into config_dict['architecture']['width']
n_warmup_steps = 1  # written into the scheduler params of the config
batch_size = 512  # batch size of both data loaders
base_lr = 0.1  # written into the optimizer params of the config
n_steps = 50  # NOTE(review): not referenced in the visible part of the notebook

set_random_seeds(SEED)  # set random seed for reproducibility
# Base configuration loaded from the YAML file at CONFIG_PATH.
config_dict = read_yaml(CONFIG_PATH)

In [6]:
# Start from a fresh copy of the YAML configuration so this cell can be re-run safely.
config_dict = read_yaml(CONFIG_PATH)
input_size = config_dict['architecture']['input_size']

# Override architecture and optimizer settings with the notebook-level values.
config_dict['architecture'].update({'width': width, 'n_layers': L + 1})
config_dict['optimizer']['params'].update({'lr': base_lr})

# Replace the scheduler section entirely with a warm-up-switch scheduler.
scheduler_params = {'n_warmup_steps': n_warmup_steps,
                    'calibrate_base_lr': True,
                    'default_calibration': False}
config_dict['scheduler'] = {'name': 'warmup_switch', 'params': scheduler_params}

base_model_config = ModelConfig(config_dict)

### Load data & define model

In [7]:
# Load the train/test datasets through the project's MNIST helper (no download, flatten=True).
training_dataset, test_dataset = load_data(download=False, flatten=True)
train_data_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
# Materialize every batch up front so individual batches can be indexed later.
# NOTE(review): iterating a DataLoader may consume global RNG state, so the order of the two
# list(...) calls could influence the shuffle — confirm before reordering.
test_batches = list(DataLoader(test_dataset, shuffle=False, batch_size=batch_size))
batches = list(train_data_loader)
# First test batch, kept aside under its own name.
eval_batch = test_batches[0]

In [8]:
# Concatenate all training batches into one full batch and use it in place of the first
# mini-batch, so that the first training step is a full-batch step.
full_x, full_y = (torch.cat(parts, dim=0) for parts in zip(*batches))
batches[0] = (full_x, full_y)

### Define model

In [9]:
# Build the model; the training batches are handed over as `lr_calibration_batches`.
# NOTE(review): n_warmup_steps is hard-coded to 12 here, whereas the notebook variable
# `n_warmup_steps` (written into the scheduler config) is 1 — confirm which value is intended.
ipllr = FcIPLLR(base_model_config, n_warmup_steps=12, lr_calibration_batches=batches)

initial base lr : [78.5, 45.31412124633789, 73.59468078613281, 78.43993377685547, 87.94955444335938, 110.81439208984375, 41.807186126708984]


In [10]:
# Multiply the first warm-up learning rate of the scheduler by (d + 1).
# NOTE(review): this cell is not idempotent — every re-run multiplies the rate again.
ipllr.scheduler.warm_lrs[0] = ipllr.scheduler.warm_lrs[0] * (ipllr.d + 1)

### Save initial model : t=0

In [11]:
# Independent copy of the model before any training step (t=0).
ipllr_0 = deepcopy(ipllr)

### Train model one step : t=1

In [12]:
# First training step, taken on batches[0] (which was replaced by the full training set).
x, y = batches[0]
train_model_one_step(ipllr, x, y, normalize_first=True)
# Independent copy of the model after the first step (t=1).
ipllr_1 = deepcopy(ipllr)

input abs mean in training:  0.695576548576355
loss derivatives for model: tensor([[-0.9000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        ...,
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [-0.9000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000]])
average training loss for model1 : 2.302889108657837



### Train model for a second step : t=2

In [13]:
# Second training step, taken on the mini-batch batches[1].
x, y = batches[1]
train_model_one_step(ipllr, x, y, normalize_first=True)
# Independent copy of the model after the second step (t=2).
ipllr_2 = deepcopy(ipllr)

input abs mean in training:  0.6921874284744263
loss derivatives for model: tensor([[ 0.1366,  0.0793,  0.1140,  ...,  0.0946, -0.8923,  0.0951],
        [ 0.1761,  0.0632,  0.1252,  ...,  0.0882,  0.1126,  0.0891],
        [ 0.1460,  0.0750,  0.1170,  ...,  0.0931,  0.1091,  0.0938],
        ...,
        [ 0.1508,  0.0729,  0.1184,  ...,  0.0923,  0.1098,  0.0930],
        [ 0.1479, -0.9258,  0.1175,  ...,  0.0928,  0.1094,  0.0935],
        [ 0.1711,  0.0650,  0.1240,  ...,  0.0890, -0.8879,  0.0899]])
average training loss for model1 : 2.2885348796844482



In [14]:
# Call .eval() on the live model and on its three snapshots before running the analysis.
# The trailing semicolon keeps the notebook from echoing the value returned by the last
# call; it replaces the former bare print() that was only there to hide that output.
ipllr.eval()
ipllr_0.eval()
ipllr_1.eval()
ipllr_2.eval();




In [15]:
# Per-layer scale factors of the model; layer l (1-based) uses layer_scales[l - 1].
layer_scales = ipllr.layer_scales
# Attribute names under `intermediate_layers` for hidden layers 2..L, in order.
intermediate_layer_keys = list(map("layer_{:,}_intermediate".format, range(2, L + 1)))

### Define W0 and b0

In [16]:
# Scaled weights of the initial model (t=0), keyed by 1-based layer index.
# With these, F.linear(x, W0[l]) reproduces the scaled layer output used later in the notebook.
with torch.no_grad():
    # Input layer: additionally divided by sqrt(d + 1).
    W0 = {1: layer_scales[0] * ipllr_0.input_layer.weight.data.detach() / math.sqrt(ipllr_0.d + 1)}

    # Hidden layers 2..L.
    for l, key in zip(range(2, L + 1), intermediate_layer_keys):
        W0[l] = layer_scales[l - 1] * getattr(ipllr_0.intermediate_layers, key).weight.data.detach()

    # Output layer.
    W0[L + 1] = layer_scales[L] * ipllr_0.output_layer.weight.data.detach()

In [17]:
with torch.no_grad():
    # Scaled input-layer bias of the initial model, using the same scale and
    # sqrt(d + 1) normalization as W0[1].
    b0 = layer_scales[0] * ipllr_0.input_layer.bias.data.detach() / math.sqrt(ipllr_0.d + 1)

### Define Delta_W_1 and Delta_b_1

In [18]:
# Scaled weight change produced by the first training step (ipllr_0 -> ipllr_1),
# keyed by 1-based layer index.
with torch.no_grad():
    # Input layer: additionally divided by sqrt(d + 1).
    Delta_W_1 = {1: layer_scales[0] * (ipllr_1.input_layer.weight.data.detach() -
                                       ipllr_0.input_layer.weight.data.detach()) / math.sqrt(ipllr_1.d + 1)}

    # Hidden layers 2..L.
    for l, key in zip(range(2, L + 1), intermediate_layer_keys):
        layer_1 = getattr(ipllr_1.intermediate_layers, key)
        layer_0 = getattr(ipllr_0.intermediate_layers, key)
        Delta_W_1[l] = layer_scales[l - 1] * (layer_1.weight.data.detach() - layer_0.weight.data.detach())

    # Output layer.
    Delta_W_1[L + 1] = layer_scales[L] * (ipllr_1.output_layer.weight.data.detach() -
                                          ipllr_0.output_layer.weight.data.detach())

In [19]:
with torch.no_grad():
    # Scaled change of the input-layer bias during the first step (ipllr_0 -> ipllr_1),
    # using the same scale and sqrt(d + 1) normalization as Delta_W_1[1].
    Delta_b_1 = layer_scales[0] * (ipllr_1.input_layer.bias.data.detach() -
                                   ipllr_0.input_layer.bias.data.detach()) / math.sqrt(ipllr_1.d + 1)

### Define Delta_W_2 and Delta_b_2

In [20]:
# Scaled weight change produced by the second training step (ipllr_1 -> ipllr_2),
# keyed by 1-based layer index.
with torch.no_grad():
    # Input layer: additionally divided by sqrt(d + 1).
    Delta_W_2 = {1: layer_scales[0] * (ipllr_2.input_layer.weight.data.detach() -
                                       ipllr_1.input_layer.weight.data.detach()) / math.sqrt(ipllr_2.d + 1)}

    # Hidden layers 2..L.
    for l, key in zip(range(2, L + 1), intermediate_layer_keys):
        layer_2 = getattr(ipllr_2.intermediate_layers, key)
        layer_1 = getattr(ipllr_1.intermediate_layers, key)
        Delta_W_2[l] = layer_scales[l - 1] * (layer_2.weight.data.detach() - layer_1.weight.data.detach())

    # Output layer.
    Delta_W_2[L + 1] = layer_scales[L] * (ipllr_2.output_layer.weight.data.detach() -
                                          ipllr_1.output_layer.weight.data.detach())

In [21]:
with torch.no_grad():
    # Scaled change of the input-layer bias during the second step (ipllr_1 -> ipllr_2).
    # Normalized with ipllr_2.d so that it is built exactly like Delta_W_2[1]; the snapshots
    # are deep copies of the same model, so this is expected to be the same value as ipllr_1.d.
    Delta_b_2 = layer_scales[0] * (ipllr_2.input_layer.bias.data.detach() -
                                   ipllr_1.input_layer.bias.data.detach()) / math.sqrt(ipllr_2.d + 1)

## Ranks

In [22]:
# Numerical rank (tolerance 1e-6) of the initial weights and of both weight updates, per
# layer, next to the largest rank the matrix shape allows.
# NOTE(review): torch.matrix_rank is deprecated in recent PyTorch releases in favour of
# torch.linalg.matrix_rank — confirm against the PyTorch version this project pins.
columns = ['W0', 'Delta_W_1', 'Delta_W_2', 'max']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'

for l in df.index:
    ranks = [torch.matrix_rank(weights[l], tol=1e-6).item() for weights in (W0, Delta_W_1, Delta_W_2)]
    max_rank = min(W0[l].shape[0], W0[l].shape[1])
    df.loc[l, columns] = ranks + [max_rank]

df

Unnamed: 0_level_0,W0,Delta_W_1,Delta_W_2,max
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,784,712,505,784
2,1024,1022,163,1024
3,1024,1019,20,1024
4,1024,1011,8,1024
5,1024,996,7,1024
6,1023,957,4,1024
7,10,9,1,10


## Explore at step 1

### On all training examples

In [23]:
# Analyse on the whole training set (the concatenation of all training batches).
x, y = full_x, full_y

In [24]:
# Layer-1 quantities for the model after one step, each stored in a dict keyed by layer index.
with torch.no_grad():
    x1 = {0: x}  # activations of the t=1 model; index 0 holds the raw input
    h0 = {1: F.linear(x, W0[1], b0)}  # contribution of the initial weights and bias
    delta_h_1 = {1: F.linear(x, Delta_W_1[1], Delta_b_1)}  # contribution of the first update
    # Scaled pre-activation of the t=1 model, computed through the layer itself.
    h1 = {1: layer_scales[0] * ipllr_1.input_layer.forward(x) / math.sqrt(ipllr_1.d + 1)}
    x1[1] = ipllr_1.activation(h1[1])

In [25]:
# Sanity check: at layer 1, h0 + delta_h_1 matches h1 within the given tolerances.
torch.testing.assert_allclose(h0[1] + delta_h_1[1], h1[1], rtol=1e-5, atol=1e-5)

In [26]:
# Propagate through hidden layers 2..L of the t=1 model, splitting each pre-activation
# into the part coming from the initial weights and the part coming from the first update.
with torch.no_grad():
    for l, key in zip(range(2, L + 1), intermediate_layer_keys):
        layer_1 = getattr(ipllr_1.intermediate_layers, key)
        x = x1[l - 1]  # input of layer l = activation of layer l - 1

        h0[l], delta_h_1[l] = F.linear(x, W0[l]), F.linear(x, Delta_W_1[l])

        # Scaled pre-activation computed through the layer itself, then its activation.
        h1[l] = layer_scales[l - 1] * layer_1.forward(x)
        x1[l] = ipllr_1.activation(h1[l])

        # The two contributions must add up to the layer's own output.
        torch.testing.assert_allclose(h0[l] + delta_h_1[l], h1[l], rtol=1e-5, atol=1e-5)

In [27]:
# Same decomposition for the output layer (index L + 1) of the t=1 model.
with torch.no_grad():
    x = x1[L]  # input of the output layer = activation of the last hidden layer
    h0[L+1] = F.linear(x, W0[L+1])
    delta_h_1[L+1] = F.linear(x, Delta_W_1[L+1])
    h1[L+1] = layer_scales[L] * ipllr_1.output_layer.forward(x)
    # The activation is applied to the output layer as well; x1[L+1] is only read by the tables.
    x1[L+1] = ipllr_1.activation(h1[L+1])

    # The two contributions must add up to the layer's own output.
    torch.testing.assert_allclose(h0[L+1] + delta_h_1[L+1], h1[L+1], rtol=1e-5, atol=1e-5)

##### Diversity

In [28]:
# Numerical rank (default tolerance) of each per-layer matrix, used as a measure of how
# diverse the rows are across examples.
columns = ['h0', 'delta_h_1', 'h1', 'x1']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'
bs = x.shape[0]  # number of examples (x currently holds the last hidden activations)

for l in df.index:
    df.loc[l, columns] = [torch.matrix_rank(values[l]).item() for values in (h0, delta_h_1, h1, x1)]
    # Largest rank the matrix shape allows.
    df.loc[l, 'max'] = min(h0[l].shape[0], h0[l].shape[1])

df.loc[:, 'n_el'] = bs
df.loc[:, 'max'] = df.loc[:, 'max'].astype(int)
df

Unnamed: 0_level_0,h0,delta_h_1,h1,x1,max,n_el
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,432,24,335,491,1024,60000
2,384,13,14,19,1024,60000
3,17,4,4,4,1024,60000
4,4,2,2,2,1024,60000
5,2,1,1,1,1024,60000
6,1,1,1,1,1024,60000
7,1,1,1,1,10,60000


In [28]:
# For every layer, count how many distinct units are the arg-max over the examples.
columns = ['h0', 'delta_h_1', 'h1']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'
bs = x.shape[0]

for l in df.index:
    per_layer = {'h0': h0[l], 'delta_h_1': delta_h_1[l], 'h1': h1[l]}
    # torch.max(..., dim=1)[1] holds the index of the largest entry of each row.
    df.loc[l, columns] = [torch.max(per_layer[key], dim=1)[1].unique().numel() for key in columns]
    # Upper bound on the count: number of examples or number of units, whichever is smaller.
    df.loc[l, 'max'] = min(bs, h0[l].shape[1])

df.loc[:, 'batch_size'] = bs
df.loc[:, 'max'] = df.loc[:, 'max'].astype(int)
df

Unnamed: 0_level_0,h0,delta_h_1,h1,max,batch_size
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,716,47,318,1024,60000
2,154,2,2,1024,60000
3,7,1,2,1024,60000
4,1,1,1,1024,60000
5,1,1,1,1024,60000
6,1,1,1,1024,60000
7,1,1,1,10,60000


##### Scales

In [29]:
# Mean absolute value of each quantity per layer, measured on the first example only (row 0).
columns = ['h0', 'delta_h_1', 'h1', 'x1']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'

for l in df.index:
    df.loc[l, columns] = [values[l][0, :].abs().mean().item() for values in (h0, delta_h_1, h1, x1)]

df

Unnamed: 0_level_0,h0,delta_h_1,h1,x1
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.17093,1.0656,1.61108,0.823059
2,0.0603477,1.12432,1.14772,0.579163
3,0.0441967,1.13337,1.15401,0.602801
4,0.046833,1.13561,1.15744,0.593479
5,0.0447512,1.1359,1.1586,0.584524
6,0.0431981,1.13629,1.15273,0.642845
7,0.0952198,0.113666,0.208886,0.110978


## Explore at step 2

### On all training examples

In [30]:
# Analyse on the whole training set (the concatenation of all training batches).
x, y = full_x, full_y

In [31]:
# Layer-1 quantities for the model after two steps, each stored in a dict keyed by layer index.
with torch.no_grad():
    x2 = {0: x}  # activations of the t=2 model; index 0 holds the raw input
    h0 = {1: F.linear(x, W0[1], b0)}  # contribution of the initial weights and bias
    delta_h_1 = {1: F.linear(x, Delta_W_1[1], Delta_b_1)}  # contribution of the first update
    delta_h_2 = {1: F.linear(x, Delta_W_2[1], Delta_b_2)}  # contribution of the second update
    # Scaled pre-activations of the t=1 and t=2 models, computed through the layers themselves.
    h1 = {1: layer_scales[0] * ipllr_1.input_layer.forward(x) / math.sqrt(ipllr_1.d + 1)}
    h2 = {1: layer_scales[0] * ipllr_2.input_layer.forward(x) / math.sqrt(ipllr_2.d + 1)}
    x2[1] = ipllr_2.activation(h2[1])

In [32]:
# Sanity checks at layer 1: h0 + delta_h_1 matches h1, and adding delta_h_2 matches h2.
# The absolute tolerance here is 1e-4, looser than the 1e-5 used for the other layers.
torch.testing.assert_allclose(h0[1] + delta_h_1[1], h1[1], rtol=1e-5, atol=1e-4)
torch.testing.assert_allclose(h0[1] + delta_h_1[1] + delta_h_2[1], h2[1], rtol=1e-5, atol=1e-4)

In [33]:
# Propagate through hidden layers 2..L along the activations of the t=2 model, splitting each
# pre-activation into the contributions of the initial weights and of the two updates.
with torch.no_grad():
    for l, key in zip(range(2, L + 1), intermediate_layer_keys):
        layer_1 = getattr(ipllr_1.intermediate_layers, key)
        layer_2 = getattr(ipllr_2.intermediate_layers, key)
        x = x2[l - 1]  # input of layer l = activation of layer l - 1 in the t=2 model

        h0[l] = F.linear(x, W0[l])
        delta_h_1[l], delta_h_2[l] = F.linear(x, Delta_W_1[l]), F.linear(x, Delta_W_2[l])

        # Both snapshots are evaluated on the same input x.
        h1[l] = layer_scales[l - 1] * layer_1.forward(x)
        h2[l] = layer_scales[l - 1] * layer_2.forward(x)
        x2[l] = ipllr_2.activation(h2[l])

        # The contributions must add up to the outputs of the respective snapshots.
        torch.testing.assert_allclose(h0[l] + delta_h_1[l], h1[l], rtol=1e-5, atol=1e-5)
        torch.testing.assert_allclose(h0[l] + delta_h_1[l] + delta_h_2[l], h2[l], rtol=1e-5, atol=1e-5)

In [34]:
# Same decomposition for the output layer (index L + 1), along the t=2 activations.
with torch.no_grad():
    x = x2[L]  # input of the output layer = activation of the last hidden layer
    h0[L+1] = F.linear(x, W0[L+1])
    delta_h_1[L+1] = F.linear(x, Delta_W_1[L+1])
    delta_h_2[L+1] = F.linear(x, Delta_W_2[L+1])
    h1[L+1] = layer_scales[L] * ipllr_1.output_layer.forward(x)
    h2[L+1] = layer_scales[L] * ipllr_2.output_layer.forward(x)
    # The activation is applied to the output layer as well; x2[L+1] is only read by the tables.
    x2[L+1] = ipllr_2.activation(h2[L+1])

    # The contributions must add up to the outputs of the respective snapshots.
    torch.testing.assert_allclose(h0[L+1] + delta_h_1[L+1], h1[L+1], rtol=1e-5, atol=1e-5)
    torch.testing.assert_allclose(h0[L+1] + delta_h_1[L+1] + delta_h_2[L+1], h2[L+1], rtol=1e-5, atol=1e-5)

##### Diversity

In [35]:
# For every layer, count how many distinct units are the arg-max over the examples.
columns = ['h0', 'delta_h_1', 'delta_h_2', 'h1', 'h2']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'
bs = x.shape[0]

for l in df.index:
    per_layer = {'h0': h0[l], 'delta_h_1': delta_h_1[l], 'delta_h_2': delta_h_2[l],
                 'h1': h1[l], 'h2': h2[l]}
    # torch.max(..., dim=1)[1] holds the index of the largest entry of each row.
    df.loc[l, columns] = [torch.max(per_layer[key], dim=1)[1].unique().numel() for key in columns]
    # Upper bound on the count: number of examples or number of units, whichever is smaller.
    df.loc[l, 'max'] = min(bs, h0[l].shape[1])

df.loc[:, 'batch_size'] = bs
df.loc[:, 'max'] = df.loc[:, 'max'].astype(int)
df

Unnamed: 0_level_0,h0,delta_h_1,delta_h_2,h1,h2,max,batch_size
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,716,47,9,318,431,1024,60000
2,322,6,2,7,7,1024,60000
3,13,3,1,4,4,1024,60000
4,1,1,1,1,1,1024,60000
5,1,1,1,1,1,1024,60000
6,1,1,1,1,1,1024,60000
7,1,1,1,1,1,10,60000


##### Scales

In [36]:
# Mean absolute value of each quantity per layer, measured on the first example only (row 0).
columns = ['h0', 'delta_h_1', 'delta_h_2', 'h1', 'h2', 'x2']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'

for l in df.index:
    df.loc[l, columns] = [values[l][0, :].abs().mean().item()
                          for values in (h0, delta_h_1, delta_h_2, h1, h2, x2)]

df

Unnamed: 0_level_0,h0,delta_h_1,delta_h_2,h1,h2,x2
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.17093,1.0656,0.497143,1.61108,1.73718,0.834751
2,0.0619237,1.18527,0.00184346,1.20599,1.20745,0.613684
3,0.0472125,1.24061,0.00132274,1.26207,1.26145,0.659869
4,0.0514968,1.24531,0.00166271,1.26933,1.26827,0.649876
5,0.0490219,1.24306,0.00281645,1.26791,1.26647,0.63825
6,0.0471972,1.23627,0.00348511,1.25418,1.25295,0.698146
7,0.102555,0.122415,0.00128374,0.22497,0.224461,0.119413


In [37]:
# Fraction of negative entries in h0 at layer 1.
(h0[1] < 0).sum() / h0[1].numel()

tensor(0.4898)

In [38]:
# Fraction of negative entries in delta_h_1 at layer 1.
(delta_h_1[1] < 0).sum() / delta_h_1[1].numel()

tensor(0.5079)

In [39]:
# Fraction of negative entries in delta_h_2 at layer 1.
(delta_h_2[1] < 0).sum() / delta_h_2[1].numel()

tensor(0.7287)

In [40]:
# Fraction of negative entries in h1 at layer 1.
(h1[1] < 0).sum() / h1[1].numel()

tensor(0.5256)

In [41]:
# Fraction of negative entries in h2 at layer 1.
(h2[1] < 0).sum() / h2[1].numel()

tensor(0.6095)

In [42]:
# Fraction of negative entries in h0 at layer 2.
(h0[2] < 0).sum() / h0[2].numel()

tensor(0.5209)

In [43]:
# Fraction of negative entries in delta_h_1 at layer 2.
(delta_h_1[2] < 0).sum() / delta_h_1[2].numel()

tensor(0.4872)

In [44]:
# Fraction of negative entries in delta_h_2 at layer 2.
(delta_h_2[2] < 0).sum() / delta_h_2[2].numel()

tensor(0.4501)

In [45]:
# Fraction of negative entries in h1 at layer 2.
(h1[2] < 0).sum() / h1[2].numel()

tensor(0.5114)

In [46]:
# Fraction of negative entries in h2 at layer 2.
(h2[2] < 0).sum() / h2[2].numel()

tensor(0.5117)