In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Directory the kernel was started from.
cwd = os.getcwd()

# NOTEBOOK_DIR is the parent of the working directory; ROOT sits three levels above it.
NOTEBOOK_DIR = os.path.dirname(cwd)
ROOT = NOTEBOOK_DIR
for _level in range(3):
    ROOT = os.path.dirname(ROOT)

# Figures folder and YAML configuration, both located relative to ROOT.
FIGURES_DIR = os.path.join(ROOT, 'figures/abc_parameterizations/initialization')
CONFIG_PATH = os.path.join(ROOT, 'pytorch/configs/abc_parameterizations', 'fc_ipllr_mnist.yaml')

In [3]:
import sys

# Make the project root importable. The membership test keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [4]:
import os
from copy import deepcopy
import torch
import math
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, Subset, DataLoader
import torch.nn.functional as F

from utils.tools import read_yaml, set_random_seeds
from pytorch.configs.base import BaseConfig
from pytorch.configs.model import ModelConfig
from pytorch.models.abc_params.fully_connected.ipllr import FcIPLLR
from pytorch.models.abc_params.fully_connected.muP import FCmuP
from pytorch.models.abc_params.fully_connected.ntk import FCNTK
from pytorch.models.abc_params.fully_connected.standard_fc_ip import StandardFCIP
from utils.data.mnist import load_data
from utils.abc_params.debug_ipllr import *

### Load basic configuration and define variables 

In [5]:
# Experiment settings.
N_TRIALS = 1  # not referenced in the visible part of this notebook
SEED = 30
L = 6  # the architecture is configured with L + 1 layers
width = 1024
n_warmup_steps = 1  # forwarded to the scheduler configuration
batch_size = 512
base_lr = 0.1
n_steps = 50  # not referenced in the visible part of this notebook

set_random_seeds(SEED)  # set random seed for reproducibility
# The YAML configuration is read in the next cell, right before it is modified;
# reading it here as well was redundant and has been dropped.

In [6]:
# Start from a fresh copy of the YAML configuration each time this cell runs.
config_dict = read_yaml(CONFIG_PATH)

input_size = config_dict['architecture']['input_size']

# Override architecture and optimizer settings with the notebook-level values.
config_dict['architecture']['width'] = width
config_dict['architecture']['n_layers'] = L + 1
config_dict['optimizer']['params']['lr'] = base_lr

# Warm-up scheduler with base learning rate calibration enabled.
scheduler_params = {'n_warmup_steps': n_warmup_steps,
                    'calibrate_base_lr': True,
                    'default_calibration': False}
config_dict['scheduler'] = {'name': 'warmup_switch', 'params': scheduler_params}

base_model_config = ModelConfig(config_dict)

### Load data

In [7]:
# Load the datasets through the project helper (flatten=True, no download).
training_dataset, test_dataset = load_data(download=False, flatten=True)
# The training loader shuffles; its batches are materialised once below so that the
# same batches can be indexed repeatedly (batches[0], batches[1], ...).
train_data_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
# Test batches are materialised in dataset order (no shuffling).
test_batches = list(DataLoader(test_dataset, shuffle=False, batch_size=batch_size))
batches = list(train_data_loader)
# First test batch; not referenced again in the visible cells.
eval_batch = test_batches[0]

In [8]:
# Regroup the (inputs, labels) pairs into one sequence of inputs and one of labels,
# then stack each along the example dimension.
inputs_per_batch, labels_per_batch = zip(*batches)
full_x = torch.cat(inputs_per_batch, dim=0)
full_y = torch.cat(labels_per_batch, dim=0)

In [81]:
# Rank of the full training input matrix; tol is the threshold under which singular
# values are treated as zero.
# NOTE(review): torch.matrix_rank is deprecated in recent PyTorch releases in favour of
# torch.linalg.matrix_rank — confirm against the installed version.
torch.matrix_rank(full_x, tol=1e-6)

tensor(761)

In [86]:
# Same rank computation with a smaller threshold (tol=1e-8).
torch.matrix_rank(full_x, tol=1e-8)

tensor(772)

In [87]:
# Same rank computation with a larger threshold (tol=1e-4).
torch.matrix_rank(full_x, tol=1e-4)

tensor(727)

In [89]:
# Shape of the concatenated training inputs.
full_x.shape

torch.Size([60000, 784])

In [88]:
# NOTE(review): Delta_W_1 is only defined further down in this notebook (section
# "Define Delta_W_1 and Delta_b_1"), so this cell raises NameError on a fresh
# Restart & Run All; it was run out of order (execution count 88).
print(Delta_W_1[1].shape)
print(torch.matrix_rank(Delta_W_1[1], tol=1e-8))

torch.Size([1024, 784])
tensor(783)


In [90]:
# Image of the full training inputs under the first-layer update (no bias term here).
# NOTE(review): like the cell above, this depends on Delta_W_1, which is defined later
# in the notebook; it fails on a fresh top-to-bottom run.
delt_h1 = F.linear(full_x, Delta_W_1[1])
print(delt_h1.shape)
print(torch.matrix_rank(delt_h1, tol=1e-8))

torch.Size([60000, 1024])
tensor(1024)


In [82]:
# Rank of the inputs of the first training batch at tol=1e-6.
x_0, y_0 =  batches[0]
torch.matrix_rank(x_0, tol=1e-6)

tensor(512)

In [83]:
# Same first-batch rank with a smaller threshold (tol=1e-8).
x_0, y_0 =  batches[0]
torch.matrix_rank(x_0, tol=1e-8)

tensor(512)

In [85]:
# Same first-batch rank with a larger threshold (tol=1e-3).
x_0, y_0 =  batches[0]
torch.matrix_rank(x_0, tol=1e-3)

tensor(512)

### Define model

In [9]:
# Build the FcIPLLR model; the training batches are passed as lr_calibration_batches.
# NOTE(review): n_warmup_steps is hard-coded to 12 here, while the notebook-level
# n_warmup_steps variable written into the scheduler config is 1 — confirm which one
# is intended.
ipllr = FcIPLLR(base_model_config, n_warmup_steps=12, lr_calibration_batches=batches)

initial base lr : [78.5, 35.073184967041016, 59.817752838134766, 61.449581146240234, 69.81730651855469, 80.47264099121094, 24.22148323059082]


In [10]:
# Multiply the first entry of the scheduler's warm_lrs by (d + 1).
# NOTE(review): the new value is derived from the current one, so re-running this cell
# multiplies by (d + 1) again — run it exactly once per model instantiation.
ipllr.scheduler.warm_lrs[0] = ipllr.scheduler.warm_lrs[0] * (ipllr.d + 1)

### Save initial model : t=0

In [11]:
# Independent copy of the model before any training step (t=0).
ipllr_0 = deepcopy(ipllr)

### Train model one step : t=1

In [12]:
# One training step on the first batch, then snapshot the updated model (t=1).
x, y = batches[0]
train_model_one_step(ipllr, x, y, normalize_first=True)
ipllr_1 = deepcopy(ipllr)

input abs mean in training:  0.6950533986091614
loss derivatives for model: tensor([[-0.9000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        ...,
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000, -0.9000,  0.1000]])
average training loss for model1 : 2.3025991916656494



### Train model for a second step : t=2

In [13]:
# A second training step, on the second batch, then snapshot the model again (t=2).
x, y = batches[1]
train_model_one_step(ipllr, x, y, normalize_first=True)
ipllr_2 = deepcopy(ipllr)

input abs mean in training:  0.6921874284744263
loss derivatives for model: tensor([[ 0.0702,  0.1263,  0.0989,  ...,  0.0898, -0.8576,  0.1143],
        [ 0.0582,  0.1394,  0.0970,  ...,  0.0840,  0.1667,  0.1202],
        [ 0.0681,  0.1285,  0.0987,  ...,  0.0888,  0.1463,  0.1153],
        ...,
        [ 0.0675,  0.1291,  0.0986,  ...,  0.0886,  0.1474,  0.1156],
        [ 0.0635, -0.8665,  0.0980,  ...,  0.0867,  0.1553,  0.1176],
        [ 0.0540,  0.1445,  0.0960,  ...,  0.0816, -0.8233,  0.1222]])
average training loss for model1 : 2.3237860202789307



In [14]:
# Switch the live model and every snapshot to evaluation mode. A for loop is a
# statement, so the cell displays nothing and no trailing print() is needed to hide
# the value returned by the last .eval() call.
for snapshot in (ipllr, ipllr_0, ipllr_1, ipllr_2):
    snapshot.eval()




In [15]:
# Per-layer scaling factors, read from the model object.
layer_scales = ipllr.layer_scales
# Attribute names of the intermediate layers 2..L, looked up with getattr in later cells.
# The ':,' format spec inserts thousands separators; it has no effect for indices below 1000.
intermediate_layer_keys = ["layer_{:,}_intermediate".format(l) for l in range(2, L + 1)]

### Define W0 and b0

In [16]:
with torch.no_grad():
    # Effective weights of the t=0 snapshot, keyed by layer index 1..L+1: each raw
    # weight is multiplied by its layer scale, and the input layer is additionally
    # divided by sqrt(d + 1).
    input_weight = ipllr_0.input_layer.weight.data.detach()
    W0 = {1: layer_scales[0] * input_weight / math.sqrt(ipllr_0.d + 1)}

    for l, key in zip(range(2, L + 1), intermediate_layer_keys):
        hidden_weight = getattr(ipllr_0.intermediate_layers, key).weight.data.detach()
        W0[l] = layer_scales[l-1] * hidden_weight

    output_weight = ipllr_0.output_layer.weight.data.detach()
    W0[L+1] = layer_scales[L] * output_weight

In [17]:
with torch.no_grad():
    # Input-layer bias of the t=0 snapshot, with the same scaling as W0[1]
    # (layer scale, then division by sqrt(d + 1)).
    b0 = layer_scales[0] * ipllr_0.input_layer.bias.data.detach() / math.sqrt(ipllr_0.d + 1)

### Define Delta_W_1 and Delta_b_1

In [18]:
with torch.no_grad():
    # Scaled weight updates of the first training step (t=1 minus t=0), keyed by layer
    # index 1..L+1 and scaled exactly like the corresponding W0 entries.
    input_update = ipllr_1.input_layer.weight.data.detach() - ipllr_0.input_layer.weight.data.detach()
    Delta_W_1 = {1: layer_scales[0] * input_update / math.sqrt(ipllr_1.d + 1)}

    for l, key in zip(range(2, L + 1), intermediate_layer_keys):
        layer_1 = getattr(ipllr_1.intermediate_layers, key)
        layer_0 = getattr(ipllr_0.intermediate_layers, key)
        hidden_update = layer_1.weight.data.detach() - layer_0.weight.data.detach()
        Delta_W_1[l] = layer_scales[l-1] * hidden_update

    output_update = ipllr_1.output_layer.weight.data.detach() - ipllr_0.output_layer.weight.data.detach()
    Delta_W_1[L+1] = layer_scales[L] * output_update

In [19]:
with torch.no_grad():
    # Scaled input-layer bias update of the first training step (t=1 minus t=0),
    # using the same scaling as Delta_W_1[1].
    Delta_b_1 = layer_scales[0] * (ipllr_1.input_layer.bias.data.detach() -
                                   ipllr_0.input_layer.bias.data.detach()) / math.sqrt(ipllr_1.d + 1)

### Define Delta_W_2 and Delta_b_2

In [20]:
with torch.no_grad():
    # Scaled weight updates of the second training step (t=2 minus t=1), keyed by layer
    # index 1..L+1 and scaled exactly like the corresponding W0 entries.
    input_update = ipllr_2.input_layer.weight.data.detach() - ipllr_1.input_layer.weight.data.detach()
    Delta_W_2 = {1: layer_scales[0] * input_update / math.sqrt(ipllr_2.d + 1)}

    for l, key in zip(range(2, L + 1), intermediate_layer_keys):
        layer_2 = getattr(ipllr_2.intermediate_layers, key)
        layer_1 = getattr(ipllr_1.intermediate_layers, key)
        hidden_update = layer_2.weight.data.detach() - layer_1.weight.data.detach()
        Delta_W_2[l] = layer_scales[l-1] * hidden_update

    output_update = ipllr_2.output_layer.weight.data.detach() - ipllr_1.output_layer.weight.data.detach()
    Delta_W_2[L+1] = layer_scales[L] * output_update

In [21]:
with torch.no_grad():
    # Scaled input-layer bias update of the second training step (t=2 minus t=1).
    # The normalisation reads d from ipllr_2, matching Delta_W_2[1]; the original read
    # it from ipllr_1, which was inconsistent with the other step-2 quantities.
    Delta_b_2 = layer_scales[0] * (ipllr_2.input_layer.bias.data.detach() -
                                   ipllr_1.input_layer.bias.data.detach()) / math.sqrt(ipllr_2.d + 1)

## Ranks

In [71]:
# NumPy rank of the first-layer update; tol is the threshold under which singular
# values are treated as zero.
np.linalg.matrix_rank(Delta_W_1[1].numpy(), tol=1e-6)

512

In [73]:
# Same rank with tol=1e-7.
np.linalg.matrix_rank(Delta_W_1[1].numpy(), tol=1e-7)

512

In [74]:
# Same rank with tol=1e-6 (repeats the first probe of this section).
np.linalg.matrix_rank(Delta_W_1[1].numpy(), tol=1e-6)

512

In [75]:
# Same rank with a larger threshold (tol=1e-4).
np.linalg.matrix_rank(Delta_W_1[1].numpy(), tol=1e-4)

509

In [76]:
# Same rank with a smaller threshold (tol=1e-8).
np.linalg.matrix_rank(Delta_W_1[1].numpy(), tol=1e-8)

768

In [77]:
# Same rank with the smallest threshold tried here (tol=1e-9).
np.linalg.matrix_rank(Delta_W_1[1].numpy(), tol=1e-9)

784

In [69]:
# Per-layer rank of the initial weights and of the two updates, next to the largest
# rank the matrix shape allows ('max') and the training batch size.
rank_tol = 1e-9
columns = ['W0', 'Delta_W_1', 'Delta_W_2', 'max']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'

for l in df.index:
    ranks = [torch.matrix_rank(weights[l], tol=rank_tol).item()
             for weights in (W0, Delta_W_1, Delta_W_2)]
    df.loc[l, columns] = ranks + [min(W0[l].shape[0], W0[l].shape[1])]

df.loc[:, 'batch_size'] = batch_size
df

Unnamed: 0_level_0,W0,Delta_W_1,Delta_W_2,max,batch_size
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,784,784,784,784,512
2,1024,948,568,1024,512
3,1024,951,353,1024,512
4,1024,929,278,1024,512
5,1024,907,261,1024,512
6,1024,890,265,1024,512
7,10,10,6,10,512


In [22]:
# Same per-layer rank table as the previous cell, with a larger threshold (1e-6).
rank_tol = 1e-6
columns = ['W0', 'Delta_W_1', 'Delta_W_2', 'max']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'

for l in df.index:
    ranks = [torch.matrix_rank(weights[l], tol=rank_tol).item()
             for weights in (W0, Delta_W_1, Delta_W_2)]
    df.loc[l, columns] = ranks + [min(W0[l].shape[0], W0[l].shape[1])]

df.loc[:, 'batch_size'] = batch_size
df

Unnamed: 0_level_0,W0,Delta_W_1,Delta_W_2,max,batch_size
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,784,520,513,784,512
2,1024,512,166,1024,512
3,1024,512,19,1024,512
4,1024,512,7,1024,512
5,1024,512,6,1024,512
6,1023,512,4,1024,512
7,10,9,1,10,512


## Explore at step 1

### On all training examples

In [23]:
# Evaluate on the whole training set. Note that this rebinds the globals x and y that
# the training cells used for individual batches.
x, y = full_x, full_y

In [24]:
with torch.no_grad():
    # x1[l] holds the activations of the t=1 model at layer l; x1[0] is the raw input.
    x1 = {0: x}
    # First-layer pre-activations split into the contribution of the initial
    # weights (h0) and that of the first update (delta_h_1).
    h0 = {1: F.linear(x, W0[1], b0)}
    delta_h_1 = {1: F.linear(x, Delta_W_1[1], Delta_b_1)}
    # The same pre-activations computed directly from the t=1 model, with the
    # input-layer scaling applied.
    h1 = {1: layer_scales[0] * ipllr_1.input_layer.forward(x) / math.sqrt(ipllr_1.d + 1)}
    x1[1] = ipllr_1.activation(h1[1])

In [25]:
# Sanity check: h0 + delta_h_1 reproduces the first-layer pre-activations of the t=1 model.
# NOTE(review): torch.testing.assert_allclose is deprecated in recent PyTorch releases
# in favour of torch.testing.assert_close — confirm against the installed version.
torch.testing.assert_allclose(h0[1] + delta_h_1[1], h1[1], rtol=1e-5, atol=1e-5)

In [26]:
with torch.no_grad():
    # Propagate through the intermediate layers 2..L of the t=1 model, recording for
    # each layer the decomposition h1 = h0 + delta_h_1 and checking it numerically.
    for i, l in enumerate(range(2, L + 1)):
        layer_1 = getattr(ipllr_1.intermediate_layers, intermediate_layer_keys[i])
        # Input of layer l is the previous layer's activation (rebinds the global x).
        x = x1[l-1]

        h0[l] =  F.linear(x, W0[l])
        delta_h_1[l] = F.linear(x, Delta_W_1[l])
        
        h1[l] = layer_scales[l-1] * layer_1.forward(x)
        x1[l] = ipllr_1.activation(h1[l])
        
        torch.testing.assert_allclose(h0[l] + delta_h_1[l], h1[l], rtol=1e-5, atol=1e-5)

In [27]:
with torch.no_grad():
    # Output layer (index L+1) of the t=1 model, fed with the last hidden activations.
    # x stays bound to x1[L] afterwards; later cells read x.shape[0] from it.
    x = x1[L] 
    h0[L+1] = F.linear(x, W0[L+1])
    delta_h_1[L+1] = F.linear(x, Delta_W_1[L+1])
    h1[L+1] = layer_scales[L] * ipllr_1.output_layer.forward(x)
    # The activation is applied to the output-layer pre-activations as well.
    x1[L+1] = ipllr_1.activation(h1[L+1])
                              
    torch.testing.assert_allclose(h0[L+1] + delta_h_1[L+1], h1[L+1], rtol=1e-5, atol=1e-5)

##### Diversity

In [28]:
# For each layer, count how many distinct coordinates attain the row-wise maximum
# across all examples, separately for h0, delta_h_1 and h1.
columns = ['h0', 'delta_h_1', 'h1']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'
bs = x.shape[0]

for l in df.index:
    per_layer = {'h0': h0[l], 'delta_h_1': delta_h_1[l], 'h1': h1[l]}
    # torch.max along dim=1 returns (values, indices); only the indices are kept.
    maxes = {key: torch.max(values, dim=1)[1] for key, values in per_layer.items()}

    df.loc[l, columns] = [maxes[key].unique().numel() for key in columns]
    df.loc[l, 'max'] = min(bs, h0[l].shape[1])

df.loc[:, 'batch_size'] = bs
df.loc[:, 'max'] = df.loc[:, 'max'].astype(int)
df

Unnamed: 0_level_0,h0,delta_h_1,h1,max,batch_size
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,716,44,267,1024,60000
2,142,6,6,1024,60000
3,6,1,1,1024,60000
4,1,1,1,1024,60000
5,1,1,1,1024,60000
6,1,1,1,1024,60000
7,1,1,1,10,60000


In [91]:
# Per-layer rank of the pre-activation terms and of the activations over all examples,
# next to the largest rank their shape allows ('max') and the number of examples.
rank_tol = 1e-8
columns = ['h0', 'delta_h_1', 'h1', 'x1']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'
bs = x.shape[0]

for l in df.index:
    per_layer = (h0[l], delta_h_1[l], h1[l], x1[l])
    df.loc[l, columns] = [torch.matrix_rank(values, tol=rank_tol).item() for values in per_layer]
    df.loc[l, 'max'] = min(h0[l].shape[0], h0[l].shape[1])

df.loc[:, 'n_el'] = bs
df.loc[:, 'max'] = df.loc[:, 'max'].astype(int)
df

Unnamed: 0_level_0,h0,delta_h_1,h1,x1,max,n_el
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1024,1024,1024,1024,1024,60000
2,1024,1021,1024,930,1024,60000
3,1024,1010,1024,718,1024,60000
4,1024,999,1024,502,1024,60000
5,1024,979,1024,489,1024,60000
6,1024,951,1024,503,1024,60000
7,10,10,10,4,10,60000


In [68]:
# NOTE(review): this cell's code is identical to the preceding rank-table cell
# (execution count 91), yet the recorded outputs differ, so the two were produced from
# different kernel states. Keep only one of them once the intended result is confirmed.
columns = ['h0', 'delta_h_1', 'h1', 'x1']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'
bs = x.shape[0]

for l in df.index:

    df.loc[l, columns] = [torch.matrix_rank(h0[l], tol=1e-8).item(),
                          torch.matrix_rank(delta_h_1[l], tol=1e-8).item(),
                          torch.matrix_rank(h1[l], tol=1e-8).item(),
                          torch.matrix_rank(x1[l], tol=1e-8).item()]
    df.loc[l, 'max'] = min(h0[l].shape[0], h0[l].shape[1])
    
df.loc[:, 'n_el'] = bs
df.loc[:, 'max'] = df.loc[:, 'max'].astype(int)
df

Unnamed: 0_level_0,h0,delta_h_1,h1,x1,max,n_el
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,432,33,308,412,1024,60000
2,451,15,17,16,1024,60000
3,18,3,3,3,1024,60000
4,3,1,1,1,1024,60000
5,1,1,1,1,1024,60000
6,1,1,1,1,1024,60000
7,1,1,1,1,10,60000


In [65]:
# Rank of the full training inputs with the library's default tolerance.
torch.matrix_rank(full_x).item()

476

In [29]:
# Sort the first-layer delta_h_1 values of example 12349 (u: values, v: indices)
# and show the ten largest.
u, v = torch.sort(delta_h_1[1][12349, :])
u[-10:]

tensor([3.9573, 3.9907, 4.0631, 4.2020, 4.2488, 4.5373, 4.7364, 5.1119, 5.2464,
        5.5249])

In [30]:
# Coordinate attaining the maximum of delta_h_1 at layer 3 for every example, with
# the number of examples per distinct coordinate.
_, b = torch.max(delta_h_1[3], dim=1)
b.unique(return_counts=True)

(tensor([754]), tensor([60000]))

In [32]:
# One arg-max index per example.
b.shape

torch.Size([60000])

In [33]:
# Sum of absolute values of row 754 of the layer-3 update (754 is the arg-max
# coordinate reported above).
Delta_W_1[3][754, :].abs().sum()

tensor(7.5635)

In [34]:
# Same quantity for the neighbouring row 755, for comparison.
Delta_W_1[3][755, :].abs().sum()

tensor(1.5987)

In [35]:
# Number of strictly positive entries in row 754 of the layer-3 update.
(Delta_W_1[3][754, :] > 0).sum()

tensor(934)

In [36]:
# Number of strictly positive entries in row 550, for comparison.
(Delta_W_1[3][550, :] > 0).sum()

tensor(294)

In [37]:
# Shape of the first-layer update.
Delta_W_1[1].shape

torch.Size([1024, 784])

In [38]:
# Ten largest delta_h_1 values at layer 6 for example 0.
u, v = torch.sort(delta_h_1[6][0, :])
u[-10:]

tensor([4.3520, 4.4196, 4.4502, 4.4621, 4.4831, 4.6523, 4.7699, 5.3600, 5.8370,
        5.9171])

In [39]:
# Ten largest delta_h_1 values at layer 3 for example 5.
u, v = torch.sort(delta_h_1[3][5, :])
u[-10:]

tensor([5.2452, 5.2787, 5.4603, 5.5938, 5.6062, 5.6423, 5.9521, 6.9021, 6.9459,
        8.4552])

In [40]:
# Ten largest delta_h_1 values at layer 1 for example 0.
# The stray `delta_h_1[2][1239, :]` expression that used to open this cell was evaluated
# and discarded (only a cell's last expression is displayed), so it has been removed.
u, v = torch.sort(delta_h_1[1][0, :])
u[-10:]

tensor([4.2463, 4.2988, 4.4392, 4.5113, 4.5426, 4.6546, 4.6784, 4.7447, 4.9751,
        5.4855])

In [41]:
# Sorted values (u) and their original coordinates (v) for example 0 at layer 1.
u, v = torch.sort(delta_h_1[1][0, :])

In [42]:
# Sorted values, in ascending order.
u

tensor([-6.4779, -6.1498, -5.1613,  ...,  4.7447,  4.9751,  5.4855])

In [43]:
# Ten largest values.
u[-10:]

tensor([4.2463, 4.2988, 4.4392, 4.5113, 4.5426, 4.6546, 4.6784, 4.7447, 4.9751,
        5.4855])

In [44]:
# Coordinates of the sorted values.
v

tensor([542, 150, 743,  ..., 156, 541, 524])

##### Scales

In [46]:
# Mean absolute value, per layer, of each tracked quantity for the first example (row 0).
columns = ['h0', 'delta_h_1', 'h1', 'x1']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'
for l in df.index:
    tracked = (h0, delta_h_1, h1, x1)  # same order as `columns`
    df.loc[l, columns] = [values[l][0, :].abs().mean().item() for values in tracked]
df

Unnamed: 0_level_0,h0,delta_h_1,h1,x1
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.17093,1.23502,1.70444,0.876051
2,0.0629568,1.06045,1.08428,0.547302
3,0.0417902,1.05533,1.07386,0.553764
4,0.041076,1.05511,1.07585,0.531404
5,0.0399128,1.05536,1.07446,0.537794
6,0.0408191,1.05542,1.0723,0.561044
7,0.112787,0.105541,0.213825,0.117217


## Explore at step 2

### On all training examples

In [47]:
# Evaluate on the whole training set again (x was rebound to hidden activations by
# the step-1 cells above).
x, y = full_x, full_y

In [48]:
with torch.no_grad():
    # x2[l] holds the activations of the t=2 model at layer l; x2[0] is the raw input.
    x2 = {0: x}
    # h0, delta_h_1 and h1 are rebound here, replacing the dictionaries built in the
    # step-1 section; the deeper layers filled in below use the t=2 activations.
    h0 = {1: F.linear(x, W0[1], b0)}
    delta_h_1 = {1: F.linear(x, Delta_W_1[1], Delta_b_1)}
    delta_h_2 = {1: F.linear(x, Delta_W_2[1], Delta_b_2)}
    # First-layer pre-activations computed directly from the t=1 and t=2 models.
    h1 = {1: layer_scales[0] * ipllr_1.input_layer.forward(x) / math.sqrt(ipllr_1.d + 1)}
    h2 = {1: layer_scales[0] * ipllr_2.input_layer.forward(x) / math.sqrt(ipllr_2.d + 1)}
    x2[1] = ipllr_2.activation(h2[1])

In [49]:
# Sanity checks at layer 1: h0 + delta_h_1 matches the t=1 model, and adding delta_h_2
# matches the t=2 model. atol is 1e-4 here, looser than the 1e-5 used in the other checks.
torch.testing.assert_allclose(h0[1] + delta_h_1[1], h1[1], rtol=1e-5, atol=1e-4)
torch.testing.assert_allclose(h0[1] + delta_h_1[1] + delta_h_2[1], h2[1], rtol=1e-5, atol=1e-4)

In [50]:
with torch.no_grad():
    # Propagate the t=2 activations through the intermediate layers 2..L, recording the
    # decomposition h2 = h0 + delta_h_1 + delta_h_2 and checking it numerically.
    for i, l in enumerate(range(2, L + 1)):
        layer_1 = getattr(ipllr_1.intermediate_layers, intermediate_layer_keys[i])
        layer_2 = getattr(ipllr_2.intermediate_layers, intermediate_layer_keys[i])
        # Input of layer l is the t=2 activation of the previous layer (rebinds the global x).
        x = x2[l-1]

        h0[l] =  F.linear(x, W0[l])
        delta_h_1[l] = F.linear(x, Delta_W_1[l])
        delta_h_2[l] = F.linear(x, Delta_W_2[l])
        
        # h1 applies the t=1 weights to the t=2 activations.
        h1[l] = layer_scales[l-1] * layer_1.forward(x)
        h2[l] = layer_scales[l-1] * layer_2.forward(x)
        x2[l] = ipllr_2.activation(h2[l])
        
        torch.testing.assert_allclose(h0[l] + delta_h_1[l], h1[l], rtol=1e-5, atol=1e-5)
        torch.testing.assert_allclose(h0[l] + delta_h_1[l] + delta_h_2[l], h2[l], rtol=1e-5, atol=1e-5)

In [51]:
with torch.no_grad():
    # Output layer (index L+1), fed with the last hidden activations of the t=2 model.
    # x stays bound to x2[L] afterwards; later cells read x.shape[0] from it.
    x = x2[L] 
    h0[L+1] = F.linear(x, W0[L+1])
    delta_h_1[L+1] = F.linear(x, Delta_W_1[L+1])
    delta_h_2[L+1] = F.linear(x, Delta_W_2[L+1])
    h1[L+1] = layer_scales[L] * ipllr_1.output_layer.forward(x)
    h2[L+1] = layer_scales[L] * ipllr_2.output_layer.forward(x)
    # The activation is applied to the output-layer pre-activations as well.
    x2[L+1] = ipllr_2.activation(h2[L+1])
                              
    torch.testing.assert_allclose(h0[L+1] + delta_h_1[L+1], h1[L+1], rtol=1e-5, atol=1e-5)
    torch.testing.assert_allclose(h0[L+1] + delta_h_1[L+1] + delta_h_2[L+1], h2[L+1], rtol=1e-5, atol=1e-5)

##### Diversity

In [52]:
# For each layer, count how many distinct coordinates attain the row-wise maximum
# across all examples, for every term of the step-2 decomposition.
columns = ['h0', 'delta_h_1', 'delta_h_2', 'h1', 'h2']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'
bs = x.shape[0]

for l in df.index:
    per_layer = {'h0': h0[l], 'delta_h_1': delta_h_1[l], 'delta_h_2': delta_h_2[l],
                 'h1': h1[l], 'h2': h2[l]}
    # torch.max along dim=1 returns (values, indices); only the indices are kept.
    maxes = {key: torch.max(values, dim=1)[1] for key, values in per_layer.items()}

    df.loc[l, columns] = [maxes[key].unique().numel() for key in columns]
    df.loc[l, 'max'] = min(bs, h0[l].shape[1])

df.loc[:, 'batch_size'] = bs
df.loc[:, 'max'] = df.loc[:, 'max'].astype(int)
df

Unnamed: 0_level_0,h0,delta_h_1,delta_h_2,h1,h2,max,batch_size
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,716,44,17,267,435,1024,60000
2,435,7,3,9,9,1024,60000
3,13,1,1,1,1,1024,60000
4,2,1,1,1,1,1024,60000
5,1,1,1,1,1,1024,60000
6,1,1,1,1,1,1024,60000
7,1,1,1,1,1,10,60000


##### Scales

In [53]:
# Mean absolute value, per layer, of each tracked quantity for the first example (row 0).
columns = ['h0', 'delta_h_1', 'delta_h_2', 'h1', 'h2', 'x2']
df = pd.DataFrame(columns=columns, index=range(1, L+2))
df.index.name = 'layer'
for l in df.index:
    tracked = (h0, delta_h_1, delta_h_2, h1, h2, x2)  # same order as `columns`
    df.loc[l, columns] = [values[l][0, :].abs().mean().item() for values in tracked]
df

Unnamed: 0_level_0,h0,delta_h_1,delta_h_2,h1,h2,x2
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.17093,1.23502,1.24305,1.70444,1.63518,0.230311
2,0.0243453,0.18269,0.00212241,0.190494,0.188877,0.0923212
3,0.00685483,0.167899,0.000751395,0.170882,0.170193,0.0869264
4,0.00646817,0.164112,0.000662647,0.167349,0.166707,0.081949
5,0.00615875,0.161679,0.000683194,0.164618,0.164009,0.0817646
6,0.00621307,0.159171,0.000889066,0.161737,0.161196,0.0840817
7,0.0167676,0.015659,0.00038317,0.0317657,0.0315065,0.0173044


In [54]:
# Fraction of strictly negative entries in h0 at layer 1.
(h0[1] < 0).sum() / h0[1].numel()

tensor(0.4898)

In [55]:
# Fraction of strictly negative entries in delta_h_1 at layer 1.
(delta_h_1[1] < 0).sum() / delta_h_1[1].numel()

tensor(0.4916)

In [56]:
# Fraction of strictly negative entries in delta_h_2 at layer 1.
(delta_h_2[1] < 0).sum() / delta_h_2[1].numel()

tensor(0.8158)

In [57]:
# Fraction of strictly negative entries in h1 at layer 1.
(h1[1] < 0).sum() / h1[1].numel()

tensor(0.5226)

In [58]:
# Fraction of strictly negative entries in h2 at layer 1.
(h2[1] < 0).sum() / h2[1].numel()

tensor(0.6515)

In [59]:
# Fraction of strictly negative entries in h0 at layer 2.
(h0[2] < 0).sum() / h0[2].numel()

tensor(0.5144)

In [60]:
# Fraction of strictly negative entries in delta_h_1 at layer 2.
(delta_h_1[2] < 0).sum() / delta_h_1[2].numel()

tensor(0.5111)

In [61]:
# Fraction of strictly negative entries in delta_h_2 at layer 2.
(delta_h_2[2] < 0).sum() / delta_h_2[2].numel()

tensor(0.5522)

In [62]:
# Fraction of strictly negative entries in h1 at layer 2.
(h1[2] < 0).sum() / h1[2].numel()

tensor(0.5263)

In [63]:
# Fraction of strictly negative entries in h2 at layer 2.
(h2[2] < 0).sum() / h2[2].numel()

tensor(0.5277)