In [1]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Every path below is derived from the kernel's current working directory.
cwd = os.getcwd()

# NOTEBOOK_DIR is the parent of the working directory; ROOT lies three more
# directory levels above NOTEBOOK_DIR.
NOTEBOOK_DIR = os.path.dirname(cwd)
ROOT = os.path.dirname(NOTEBOOK_DIR)
ROOT = os.path.dirname(ROOT)
ROOT = os.path.dirname(ROOT)

# Output directory for figures and the YAML config used by this notebook.
FIGURES_DIR = os.path.join(ROOT, 'figures/abc_parameterizations/initialization')
CONFIG_PATH = os.path.join(ROOT, 'pytorch/configs/abc_parameterizations', 'fc_ipllr_mnist.yaml')

In [3]:
import sys

# Make the project packages importable from the notebook. The membership check
# keeps the cell idempotent: re-running it does not add duplicate entries.
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [4]:
import os
from copy import deepcopy
import torch
import math
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, Subset, DataLoader
import torch.nn.functional as F

from utils.tools import read_yaml, set_random_seeds
from pytorch.configs.base import BaseConfig
from pytorch.configs.model import ModelConfig
from pytorch.models.abc_params.fully_connected.ipllr import FcIPLLR
from pytorch.models.abc_params.fully_connected.muP import FCmuP
from pytorch.models.abc_params.fully_connected.ntk import FCNTK
from pytorch.models.abc_params.fully_connected.standard_fc_ip import StandardFCIP
from utils.data.mnist import load_data
from utils.abc_params.debug_ipllr import *

### Load basic configuration and define variables 

In [5]:
# Experiment constants.
N_TRIALS = 1
SEED = 30
L = 6  # the model config is given n_layers = L + 1
width = 1024  # written to the architecture config as 'width'
n_warmup_steps = 1  # passed to the scheduler params
batch_size = 512  # batch size of the train and test data loaders
base_lr = 0.1  # written to the optimizer config as 'lr'
n_steps = 50

set_random_seeds(SEED)  # set random seed for reproducibility
# The YAML config is read in the following cell, right before it is customised,
# so it is not loaded a second time here.

In [6]:
# Start from the YAML defaults and override the entries controlled by the
# notebook-level constants.
config_dict = read_yaml(CONFIG_PATH)

arch_cfg = config_dict['architecture']
input_size = arch_cfg['input_size']
arch_cfg['width'] = width
arch_cfg['n_layers'] = L + 1

config_dict['optimizer']['params']['lr'] = base_lr

# Replace the scheduler section wholesale with a 'warmup_switch' scheduler.
scheduler_params = {
    'n_warmup_steps': n_warmup_steps,
    'calibrate_base_lr': True,
    'default_calibration': False,
}
config_dict['scheduler'] = {'name': 'warmup_switch', 'params': scheduler_params}

base_model_config = ModelConfig(config_dict)

### Load data

In [7]:
# Load the train and test datasets (download=False, flatten=True are passed to load_data).
training_dataset, test_dataset = load_data(download=False, flatten=True)
train_data_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
# Materialise every batch as a list so that batches can be indexed by position later.
# NOTE(review): the training loader shuffles, so the content of `batches` presumably
# depends on the global RNG state and on the order of these statements — confirm
# before reordering anything in this cell.
test_batches = list(DataLoader(test_dataset, shuffle=False, batch_size=batch_size))
batches = list(train_data_loader)
eval_batch = test_batches[0]

### Define model

In [8]:
# Build the model; the training batches are passed in as lr_calibration_batches.
# NOTE(review): n_warmup_steps=12 is hard-coded here while the configuration cell
# sets n_warmup_steps = 1 for the scheduler params — confirm the mismatch is intended.
ipllr = FcIPLLR(base_model_config, n_warmup_steps=12, lr_calibration_batches=batches)

initial base lr : [0.1, 47.14387893676758, 62.35230255126953, 61.66318893432617, 69.83497619628906, 80.47408294677734, 2.422165632247925]


In [9]:
# Multiply the first warm-up learning rate by (d + 1).
# NOTE(review): the stored value is overwritten with its scaled version, so
# re-running this cell compounds the factor — rebuild the model before re-running.
ipllr.scheduler.warm_lrs[0] = ipllr.scheduler.warm_lrs[0] * (ipllr.d + 1)

### Save initial model : t=0

In [10]:
# Independent copy of the model before any training step (t=0).
ipllr_0 = deepcopy(ipllr)

### Train model one step : t=1

In [11]:
# Run train_model_one_step on batch 0, then keep a copy of the model as ipllr_1 (t=1).
x, y = batches[0]
train_model_one_step(ipllr, x, y, normalize_first=True)
ipllr_1 = deepcopy(ipllr)

input abs mean in training:  0.6950533986091614
loss derivatives for model: tensor([[-0.9000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        ...,
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000, -0.9000,  0.1000]])
average training loss for model1 : 2.3025991916656494



### Train model for a second step : t=2

In [12]:
# Run train_model_one_step on batch 1, then keep a copy of the model as ipllr_2 (t=2).
x, y = batches[1]
train_model_one_step(ipllr, x, y, normalize_first=True)
ipllr_2 = deepcopy(ipllr)

input abs mean in training:  0.6921874284744263
loss derivatives for model: tensor([[ 0.0820,  0.1149,  0.1018,  ...,  0.0950, -0.8772,  0.1078],
        [ 0.0744,  0.1220,  0.1021,  ...,  0.0924,  0.1345,  0.1112],
        [ 0.0787,  0.1179,  0.1020,  ...,  0.0939,  0.1277,  0.1093],
        ...,
        [ 0.0799,  0.1168,  0.1019,  ...,  0.0944,  0.1258,  0.1087],
        [ 0.0795, -0.8829,  0.1019,  ...,  0.0942,  0.1264,  0.1089],
        [ 0.0726,  0.1238,  0.1022,  ...,  0.0917, -0.8625,  0.1120]])
average training loss for model1 : 2.3087775707244873



In [13]:
# Run train_model_one_step on batch 2, then keep a copy of the model as ipllr_3 (t=3).
x, y = batches[2]
# NOTE(review): this is the only training cell that calls ipllr.train() explicitly
# before the step — confirm whether the other steps need it as well.
ipllr.train()
train_model_one_step(ipllr, x, y, normalize_first=True)
ipllr_3 = deepcopy(ipllr)

input abs mean in training:  0.689268946647644
loss derivatives for model: tensor([[ 0.0756,  0.1208, -0.8977,  ...,  0.0927,  0.1333,  0.1108],
        [ 0.0845,  0.1127,  0.1017,  ..., -0.9042,  0.1196,  0.1068],
        [ 0.0783,  0.1182,  0.1022,  ...,  0.0937,  0.1288, -0.8904],
        ...,
        [ 0.0901,  0.1078,  0.1012,  ...,  0.0975,  0.1120,  0.1043],
        [ 0.0649, -0.8683,  0.1025,  ...,  0.0884,  0.1526,  0.1155],
        [ 0.0790,  0.1176,  0.1021,  ...,  0.0940,  0.1277, -0.8908]])
average training loss for model1 : 2.2643256187438965



In [14]:
# Run train_model_one_step on batch 3 (no snapshot is kept).
x, y = batches[3]
train_model_one_step(ipllr, x, y, normalize_first=True)

input abs mean in training:  0.6968955397605896
loss derivatives for model: tensor([[-0.9019,  0.1015,  0.1002,  ...,  0.0996,  0.1021,  0.1008],
        [ 0.0539, -0.8545,  0.1016,  ...,  0.0831,  0.1739,  0.1210],
        [ 0.0581,  0.1403,  0.1020,  ...,  0.0853,  0.1644,  0.1191],
        ...,
        [ 0.0472,  0.1545,  0.1006,  ...,  0.0791, -0.8088,  0.1239],
        [ 0.0732,  0.1236, -0.8977,  ...,  0.0920,  0.1358,  0.1122],
        [ 0.0875,  0.1102,  0.1014,  ...,  0.0968,  0.1149,  0.1056]])
average training loss for model1 : 2.2438840866088867



In [15]:
# Run train_model_one_step on batch 4 (no snapshot is kept).
x, y = batches[4]
train_model_one_step(ipllr, x, y, normalize_first=True)

input abs mean in training:  0.6987342238426208
loss derivatives for model: tensor([[ 0.0737,  0.1252,  0.1024,  ...,  0.0927,  0.1326,  0.1099],
        [ 0.0404, -0.8270,  0.0996,  ...,  0.0756,  0.2025,  0.1211],
        [ 0.0831,  0.1153,  0.1018,  ...,  0.0957,  0.1194,  0.1064],
        ...,
        [-0.9010,  0.1008,  0.1001,  ...,  0.0998,  0.1010,  0.1004],
        [ 0.0801,  0.1183,  0.1021,  ..., -0.9052,  0.1234,  0.1075],
        [ 0.0942,  0.1049,  0.1007,  ...,  0.0987,  0.1061,  0.1022]])
average training loss for model1 : 2.209421157836914



In [16]:
# Run train_model_one_step on batch 5 (no snapshot is kept).
x, y = batches[5]
train_model_one_step(ipllr, x, y, normalize_first=True)

input abs mean in training:  0.6898333430290222
loss derivatives for model: tensor([[ 0.0444,  0.1765,  0.1011,  ...,  0.0771,  0.1859, -0.8852],
        [ 0.0248, -0.7720,  0.0931,  ...,  0.0603,  0.2479,  0.1142],
        [ 0.0440,  0.1772,  0.1010,  ...,  0.0769, -0.8132,  0.1148],
        ...,
        [ 0.0229, -0.7654,  0.0917,  ...,  0.0581,  0.2560,  0.1135],
        [ 0.0623,  0.1445,  0.1029,  ..., -0.9128,  0.1492,  0.1112],
        [ 0.0771,  0.1243,  0.1025,  ..., -0.9067,  0.1265,  0.1071]])
average training loss for model1 : 2.159430503845215



In [17]:
# Run train_model_one_step on batch 6 (no snapshot is kept).
x, y = batches[6]
train_model_one_step(ipllr, x, y, normalize_first=True)

input abs mean in training:  0.6898800730705261
loss derivatives for model: tensor([[ 0.0932,  0.1073,  0.1011,  ...,  0.0981,  0.1064,  0.1018],
        [ 0.0671,  0.1435,  0.1038,  ...,  0.0886,  0.1371,  0.1081],
        [ 0.0372,  0.2121,  0.1011,  ...,  0.0703,  0.1911,  0.1110],
        ...,
        [ 0.0181,  0.2933,  0.0895,  ...,  0.0500, -0.7518,  0.1039],
        [ 0.0981,  0.1020,  0.1003,  ...,  0.0995,  0.1017,  0.1005],
        [ 0.0845,  0.1179,  0.1023,  ...,  0.0954,  0.1156,  0.1041]])
average training loss for model1 : 2.1696534156799316



In [18]:
# Run train_model_one_step on batch 7 (no snapshot is kept).
x, y = batches[7]
train_model_one_step(ipllr, x, y, normalize_first=True)

input abs mean in training:  0.6930931806564331
loss derivatives for model: tensor([[ 0.0977,  0.1024,  0.1004,  ...,  0.0995,  0.1020,  0.1005],
        [ 0.0947,  0.1057,  0.1010,  ..., -0.9011,  0.1047,  0.1013],
        [ 0.0434,  0.1947,  0.1048,  ...,  0.0784,  0.1725,  0.1086],
        ...,
        [ 0.0992,  0.1008,  0.1002,  ...,  0.0998,  0.1007,  0.1002],
        [ 0.0434,  0.1947,  0.1048,  ...,  0.0784, -0.8274,  0.1086],
        [ 0.0310, -0.7652,  0.1018,  ...,  0.0689,  0.1995,  0.1069]])
average training loss for model1 : 2.118596076965332



In [19]:
# Run train_model_one_step on batch 8 (no snapshot is kept).
x, y = batches[8]
train_model_one_step(ipllr, x, y, normalize_first=True)

input abs mean in training:  0.695981502532959
loss derivatives for model: tensor([[ 0.0650,  0.1526,  0.1038,  ...,  0.0887,  0.1365,  0.1056],
        [ 0.0249,  0.2837,  0.0946,  ...,  0.0604,  0.2063, -0.9006],
        [ 0.0165,  0.3387,  0.0866,  ...,  0.0496,  0.2281, -0.9079],
        ...,
        [ 0.0339,  0.2411,  0.0994,  ...,  0.0693,  0.1865,  0.1035],
        [-0.9003,  0.1003,  0.1000,  ...,  0.0999,  0.1002,  0.1001],
        [ 0.0664,  0.1498,  0.1038,  ...,  0.0893,  0.1347, -0.8945]])
average training loss for model1 : 2.1547701358795166



In [20]:
# Run train_model_one_step on batch 9 (no snapshot is kept).
x, y = batches[9]
train_model_one_step(ipllr, x, y, normalize_first=True)

input abs mean in training:  0.6933886408805847
loss derivatives for model: tensor([[ 0.0988,  0.1013,  0.1003,  ...,  0.0997,  0.1010,  0.1002],
        [ 0.0975,  0.1027,  0.1006,  ..., -0.9006,  0.1021,  0.1005],
        [ 0.0745,  0.1329,  0.1054,  ..., -0.9070,  0.1238,  0.1041],
        ...,
        [ 0.0949,  0.1055,  0.1012,  ...,  0.0989,  0.1042,  0.1009],
        [ 0.0985,  0.1017,  0.1004,  ...,  0.0997,  0.1013,  0.1003],
        [ 0.0549,  0.1696,  0.1080,  ...,  0.0846,  0.1478,  0.1054]])
average training loss for model1 : 2.128692626953125



In [21]:
# Run train_model_one_step on batch 10 (no snapshot is kept).
x, y = batches[10]
train_model_one_step(ipllr, x, y, normalize_first=True)

input abs mean in training:  0.6905764937400818
loss derivatives for model: tensor([[ 0.0959,  0.1050,  0.1009,  ...,  0.0991,  0.1032,  0.1006],
        [ 0.0911,  0.1112,  0.1020,  ...,  0.0981,  0.1072,  0.1013],
        [ 0.0439,  0.2123,  0.1073,  ...,  0.0786,  0.1587, -0.8986],
        ...,
        [ 0.0047,  0.5272,  0.0681,  ...,  0.0268, -0.7795,  0.0575],
        [ 0.0019, -0.3783,  0.0503,  ...,  0.0160,  0.2129,  0.0408],
        [ 0.0955,  0.1055,  0.1010,  ...,  0.0991,  0.1036, -0.8993]])
average training loss for model1 : 2.089322090148926



In [22]:
# Switch the trained model and every saved snapshot to evaluation mode before
# the forward-pass analysis. A plain loop displays nothing, so no trailing
# print() is needed to hide the repr returned by the last .eval() call.
# ipllr_3 is included so that all snapshots are treated alike.
for snapshot in (ipllr, ipllr_0, ipllr_1, ipllr_2, ipllr_3):
    snapshot.eval()




In [23]:
# Per-layer scale factors taken from the trained model.
layer_scales = ipllr.layer_scales
# Attribute names of the intermediate layers 2..L on `intermediate_layers`.
intermediate_layer_keys = list(map("layer_{:,}_intermediate".format, range(2, L + 1)))

### Define W0 and b0

In [24]:
# Scaled initial weights of every layer, keyed by 1-based layer index.
# Only the input layer carries the extra 1 / sqrt(d + 1) factor.
W0 = {}
W0[1] = layer_scales[0] * ipllr_0.input_layer.weight.data.detach() / math.sqrt(ipllr_0.d + 1)

# Intermediate layers 2..L; key l-2 is the attribute name of layer l.
for l in range(2, L + 1):
    layer = getattr(ipllr_0.intermediate_layers, intermediate_layer_keys[l-2])
    W0[l] = layer_scales[l-1] * layer.weight.data.detach()

# Output layer.
W0[L+1] = layer_scales[L] * ipllr_0.output_layer.weight.data.detach()

In [25]:
# Scaled initial input-layer bias, with the same scale and 1 / sqrt(d + 1) factor as W0[1].
b0 = layer_scales[0] * ipllr_0.input_layer.bias.data.detach() / math.sqrt(ipllr_0.d + 1)

### Define Delta_W_1 and Delta_b_1

In [26]:
# Weight change produced by the first training step (ipllr_0 -> ipllr_1),
# scaled per layer in the same way as W0.
input_delta_1 = (ipllr_1.input_layer.weight.data.detach() -
                 ipllr_0.input_layer.weight.data.detach())
Delta_W_1 = {1: layer_scales[0] * input_delta_1 / math.sqrt(ipllr_1.d + 1)}

# Intermediate layers 2..L.
for l in range(2, L + 1):
    key = intermediate_layer_keys[l-2]
    layer_1 = getattr(ipllr_1.intermediate_layers, key)
    layer_0 = getattr(ipllr_0.intermediate_layers, key)
    hidden_delta_1 = layer_1.weight.data.detach() - layer_0.weight.data.detach()
    Delta_W_1[l] = layer_scales[l-1] * hidden_delta_1

# Output layer.
output_delta_1 = (ipllr_1.output_layer.weight.data.detach() -
                  ipllr_0.output_layer.weight.data.detach())
Delta_W_1[L+1] = layer_scales[L] * output_delta_1

In [27]:
# Input-layer bias change produced by the first training step, scaled like b0.
Delta_b_1 = layer_scales[0] * (ipllr_1.input_layer.bias.data.detach() -
                               ipllr_0.input_layer.bias.data.detach()) / math.sqrt(ipllr_1.d + 1)

### Define Delta_W_2

In [28]:
# Weight change produced by the second training step (ipllr_1 -> ipllr_2),
# keyed by 1-based layer index and scaled per layer.
Delta_W_2 = {}

# Input layer: carries the extra 1 / sqrt(d + 1) factor.
Delta_W_2[1] = layer_scales[0] * (
    ipllr_2.input_layer.weight.data.detach() - ipllr_1.input_layer.weight.data.detach()
) / math.sqrt(ipllr_2.d + 1)

# Intermediate layers 2..L.
for l in range(2, L + 1):
    layer_2 = getattr(ipllr_2.intermediate_layers, intermediate_layer_keys[l-2])
    layer_1 = getattr(ipllr_1.intermediate_layers, intermediate_layer_keys[l-2])
    Delta_W_2[l] = layer_scales[l-1] * (layer_2.weight.data.detach() - layer_1.weight.data.detach())

# Output layer.
Delta_W_2[L+1] = layer_scales[L] * (
    ipllr_2.output_layer.weight.data.detach() - ipllr_1.output_layer.weight.data.detach()
)

In [29]:
# Input-layer bias change produced by the second training step (ipllr_1 -> ipllr_2).
# The normalisation reads d from ipllr_2 so that it matches Delta_W_2[1] and h2[1],
# which are both normalised with ipllr_2.d.
Delta_b_2 = layer_scales[0] * (ipllr_2.input_layer.bias.data.detach() -
                               ipllr_1.input_layer.bias.data.detach()) / math.sqrt(ipllr_2.d + 1)

## Explore at step 2

### On examples from the second batch

In [30]:
# Inputs and labels of batch 1, the batch used for the second training step.
x, y = batches[1]

In [31]:
with torch.no_grad():
    # Layer-1 pre-activation split into three additive terms: initial weights,
    # first-step update and second-step update (each with its bias part).
    h0 = {1: F.linear(x, W0[1], b0)}
    delta_h_1 = {1: F.linear(x, Delta_W_1[1], Delta_b_1)}
    delta_h_2 = {1: F.linear(x, Delta_W_2[1], Delta_b_2)}

    # The same pre-activation computed directly by the input layer of the
    # snapshots taken after one and two steps.
    h1 = {}
    h1[1] = layer_scales[0] * ipllr_1.input_layer.forward(x) / math.sqrt(ipllr_1.d + 1)
    h2 = {}
    h2[1] = layer_scales[0] * ipllr_2.input_layer.forward(x) / math.sqrt(ipllr_2.d + 1)

    # Activations of the step-2 model: index 0 is the raw input, index 1 the
    # first-layer output.
    x2 = {0: x, 1: ipllr_2.activation(h2[1])}

In [32]:
# Sanity check: summing the decomposition terms must reproduce the layer-1
# pre-activations computed directly by the step-1 and step-2 snapshots.
h1_rebuilt = h0[1] + delta_h_1[1]
torch.testing.assert_allclose(h1_rebuilt, h1[1], rtol=1e-5, atol=1e-5)
h2_rebuilt = h1_rebuilt + delta_h_2[1]
torch.testing.assert_allclose(h2_rebuilt, h2[1], rtol=1e-5, atol=1e-5)

In [33]:
# Elementwise product of the two update terms at layer 1; an entry is negative
# exactly when the first- and second-step terms have opposite signs.
prod_1 = delta_h_1[1] * delta_h_2[1]

In [34]:
# Fraction of entries where the two update terms have opposite signs.
(prod_1 < 0).sum() / prod_1.numel()

tensor(0.4852)

In [35]:
# First five layer-1 entries of the initial-weight term for sample 0.
h0[1][0, :5]

tensor([ 0.4677,  0.8577,  0.4337, -1.1502, -1.4598])

In [36]:
# First five layer-1 entries of the first-step update term for sample 0.
delta_h_1[1][0, :5]

tensor([ 0.0002, -0.0013,  0.0007,  0.0010,  0.0013])

In [37]:
# First five layer-1 entries of the second-step update term for sample 0.
delta_h_2[1][0, :5]

tensor([ 0.4359, -0.1783, -0.1025, -0.0653, -0.0453])

In [38]:
# Fraction of negative entries in the layer-1 first-step update term.
(delta_h_1[1] < 0).sum() / delta_h_1[1].numel()

tensor(0.4914)

In [39]:
# Fraction of negative entries in the layer-1 second-step update term.
(delta_h_2[1] < 0).sum() / delta_h_2[1].numel()

tensor(0.6582)

In [40]:
# Mean absolute layer-1 value of the initial-weight term for sample 0.
h0[1][0, :].abs().mean()

tensor(0.9147)

In [41]:
# Mean absolute layer-1 pre-activation of sample 0 for the step-1 snapshot.
h1[1][0, :].abs().mean()

tensor(0.9146)

In [42]:
# Mean absolute layer-1 pre-activation of sample 0 for the step-2 snapshot.
h2[1][0, :].abs().mean()

tensor(1.1195)

In [43]:
with torch.no_grad():
    # Propagate the step-2 activations through the intermediate layers 2..L and
    # split each pre-activation into the initial-weight term and the two update terms.
    for i, l in enumerate(range(2, L + 1)):
        layer_1 = getattr(ipllr_1.intermediate_layers, intermediate_layer_keys[i])
        layer_2 = getattr(ipllr_2.intermediate_layers, intermediate_layer_keys[i])
        # Use a dedicated name for the layer input: rebinding the notebook-level
        # `x` would silently replace the raw batch that the layer-1 cells read,
        # making them give different results when re-run after this cell.
        layer_input = x2[l-1]

        # Intermediate layers are applied without a bias term here.
        h0[l] = F.linear(layer_input, W0[l])
        delta_h_1[l] = F.linear(layer_input, Delta_W_1[l])
        delta_h_2[l] = F.linear(layer_input, Delta_W_2[l])

        # Pre-activations computed directly by the step-1 and step-2 snapshots.
        h1[l] = layer_scales[l-1] * layer_1.forward(layer_input)
        h2[l] = layer_scales[l-1] * layer_2.forward(layer_input)
        # The next layer is fed with the step-2 model's activation.
        x2[l] = ipllr_2.activation(h2[l])

        # Sanity check: the decomposition terms must sum to the direct forward pass.
        torch.testing.assert_allclose(h0[l] + delta_h_1[l], h1[l], rtol=1e-5, atol=1e-5)
        torch.testing.assert_allclose(h0[l] + delta_h_1[l] + delta_h_2[l], h2[l], rtol=1e-5, atol=1e-5)

In [44]:
with torch.no_grad():
    # Output layer (index L + 1), fed with the step-2 activation of layer L.
    # A dedicated name is used instead of rebinding the notebook-level `x`, so
    # the raw batch stays available to the cells that read it.
    last_hidden = x2[L]
    h0[L+1] = F.linear(last_hidden, W0[L+1])
    delta_h_1[L+1] = F.linear(last_hidden, Delta_W_1[L+1])
    delta_h_2[L+1] = F.linear(last_hidden, Delta_W_2[L+1])
    h1[L+1] = layer_scales[L] * ipllr_1.output_layer.forward(last_hidden)
    h2[L+1] = layer_scales[L] * ipllr_2.output_layer.forward(last_hidden)
    # NOTE(review): the model's activation is also applied to the output-layer
    # values here — confirm that this entry of x2 is actually wanted.
    x2[L+1] = ipllr_2.activation(h2[L+1])

    # Sanity check: the decomposition terms must sum to the direct forward pass.
    torch.testing.assert_allclose(h0[L+1] + delta_h_1[L+1], h1[L+1], rtol=1e-5, atol=1e-5)
    torch.testing.assert_allclose(h0[L+1] + delta_h_1[L+1] + delta_h_2[L+1], h2[L+1], rtol=1e-5, atol=1e-5)

In [45]:
# Elementwise product of the two update terms at layer 2 (rebinds prod_1, which
# previously held the layer-1 product).
prod_1 = delta_h_1[2] * delta_h_2[2]

In [46]:
# Fraction of layer-2 entries where the two update terms have opposite signs.
(prod_1 < 0).sum() / prod_1.numel()

tensor(0.4418)

In [47]:
# First ten layer-2 entries of the initial-weight term for sample 0.
h0[2][0, :10]

tensor([ 0.0166,  0.0181,  0.0374,  0.0283,  0.0368,  0.0983, -0.0549, -0.0159,
        -0.0541, -0.0067])

In [48]:
# First ten layer-2 entries of the first-step update term for sample 0.
delta_h_1[2][0, :10]

tensor([-0.4206,  1.9491,  1.2373, -1.0512, -0.5204,  2.1929, -0.0042,  0.0685,
        -0.1399,  0.6556])

In [49]:
# First ten layer-1 entries of the second-step update term for sample 0.
# NOTE(review): the two preceding cells display layer 2; confirm whether
# delta_h_2[2] was intended here instead of delta_h_2[1].
delta_h_2[1][0, :10]

tensor([ 0.4359, -0.1783, -0.1025, -0.0653, -0.0453,  0.4192,  0.7015, -0.0719,
         0.4446, -0.2780])

In [50]:
# Fraction of negative entries in the layer-1 second-step update term.
(delta_h_2[1] < 0).sum() / delta_h_2[1].numel()

tensor(0.6582)

In [51]:
# Fraction of negative entries in the layer-2 second-step update term.
(delta_h_2[2] < 0).sum() / delta_h_2[2].numel()

tensor(0.5192)

In [52]:
# Fraction of negative entries in the layer-3 second-step update term.
(delta_h_2[3] < 0).sum() / delta_h_2[3].numel()

tensor(0.4844)

In [53]:
# Fraction of negative entries in the layer-4 second-step update term.
(delta_h_2[4] < 0).sum() / delta_h_2[4].numel()

tensor(0.4092)

In [54]:
# Fraction of negative entries in the layer-5 second-step update term.
# Numerator and denominator must index the same layer.
(delta_h_2[5] < 0).sum() / delta_h_2[5].numel()

tensor(0.3379)

In [55]:
# Fraction of negative entries in the layer-6 second-step update term.
(delta_h_2[6] < 0).sum() / delta_h_2[6].numel()

tensor(0.3008)

In [56]:
# Fraction of negative entries in the second-step update term at layer 7 (the output layer, L + 1).
(delta_h_2[7] < 0).sum() / delta_h_2[7].numel()

tensor(0.6000)

In [57]:
# Second-step update term at the output layer, for the whole batch.
delta_h_2[7]

tensor([[ 0.0057, -0.0036,  0.0013,  ...,  0.0008, -0.0018, -0.0012],
        [ 0.0018, -0.0011,  0.0004,  ...,  0.0003, -0.0006, -0.0004],
        [ 0.0033, -0.0021,  0.0008,  ...,  0.0005, -0.0011, -0.0007],
        ...,
        [ 0.0017, -0.0011,  0.0004,  ...,  0.0002, -0.0005, -0.0004],
        [ 0.0083, -0.0052,  0.0019,  ...,  0.0012, -0.0026, -0.0017],
        [ 0.0044, -0.0027,  0.0010,  ...,  0.0006, -0.0014, -0.0009]])

In [58]:
# Mean absolute value of the output-layer initial-weight term.
h0[7].abs().mean()

tensor(0.0888)

In [59]:
# Mean absolute output-layer value computed with the step-1 snapshot.
h1[7].abs().mean()

tensor(0.0968)

In [60]:
# Mean absolute output-layer value computed with the step-2 snapshot.
h2[7].abs().mean()

tensor(0.0961)

In [61]:
# Mean absolute value of the output-layer first-step update term.
delta_h_1[7].abs().mean()

tensor(0.0083)

In [62]:
# Mean absolute value of the output-layer second-step update term.
delta_h_2[7].abs().mean()

tensor(0.0015)

In [63]:
# Shape of the output-layer second-step update term.
delta_h_2[7].shape

torch.Size([512, 10])

In [64]:
# Largest output-layer second-step update value for sample 0, with its index.
torch.max(delta_h_2[7][0, :], dim=0)

torch.return_types.max(
values=tensor(0.0057),
indices=tensor(0))

In [65]:
# Largest output-layer second-step update value for sample 1, with its index.
torch.max(delta_h_2[7][1, :], dim=0)

torch.return_types.max(
values=tensor(0.0018),
indices=tensor(0))

In [66]:
# Largest output-layer second-step update value for sample 2, with its index.
torch.max(delta_h_2[7][2, :], dim=0)

torch.return_types.max(
values=tensor(0.0033),
indices=tensor(0))

In [67]:
# Largest output-layer second-step update value for sample 23, with its index.
torch.max(delta_h_2[7][23, :], dim=0)

torch.return_types.max(
values=tensor(0.0035),
indices=tensor(0))

In [68]:
# Output indices at which the second-step update term is negative for sample 0.
torch.nonzero(delta_h_2[7][0, :] < 0)

tensor([[1],
        [4],
        [5],
        [6],
        [8],
        [9]])

In [69]:
# Output indices at which the second-step update term is negative for sample 1.
torch.nonzero(delta_h_2[7][1, :] < 0)

tensor([[1],
        [4],
        [5],
        [6],
        [8],
        [9]])

In [70]:
# Output indices at which the second-step update term is negative for sample 24.
torch.nonzero(delta_h_2[7][24, :] < 0)

tensor([[1],
        [4],
        [5],
        [6],
        [8],
        [9]])

In [71]:
# (sample, output) index pairs of every negative entry in the output-layer second-step update term.
torch.nonzero(delta_h_2[7]< 0)

tensor([[  0,   1],
        [  0,   4],
        [  0,   5],
        ...,
        [511,   6],
        [511,   8],
        [511,   9]])

In [72]:
# Fraction of negative layer-1 entries of the second-step update term, sample 0 only.
(delta_h_2[1][0, :] < 0).sum() / delta_h_2[1][0, :].numel()

tensor(0.3584)

In [73]:
# Fraction of negative layer-2 entries of the second-step update term, sample 0 only.
(delta_h_2[2][0, :] < 0).sum() / delta_h_2[2][0, :].numel()

tensor(0.5088)

In [74]:
# Fraction of negative layer-3 entries of the second-step update term, sample 0 only.
(delta_h_2[3][0, :] < 0).sum() / delta_h_2[3][0, :].numel()

tensor(0.4844)

In [75]:
# Fraction of negative layer-4 entries of the second-step update term, sample 0 only.
(delta_h_2[4][0, :] < 0).sum() / delta_h_2[4][0, :].numel()

tensor(0.4092)

In [76]:
# Fraction of negative layer-5 entries of the second-step update term, sample 0 only.
# Numerator and denominator must index the same layer.
(delta_h_2[5][0, :] < 0).sum() / delta_h_2[5][0, :].numel()

tensor(0.3379)

In [77]:
# Fraction of negative layer-6 entries of the second-step update term, sample 0 only.
(delta_h_2[6][0, :] < 0).sum() / delta_h_2[6][0, :].numel()

tensor(0.3008)

In [78]:
# Fraction of negative output-layer entries of the second-step update term for sample 1.
# NOTE(review): the preceding per-sample cells all read sample 0; confirm whether
# row 0 was intended here as well.
(delta_h_2[7][1, :] < 0).sum() / delta_h_2[7][1, :].numel()

tensor(0.6000)