In [50]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
# NOTE(review): on a kernel where the extension is already loaded, `%load_ext`
# only prints the notice recorded below; `%reload_ext autoreload` avoids it.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
import os
# All locations are derived from the current working directory.
cwd = os.getcwd()
NOTEBOOK_DIR = os.path.dirname(cwd)  # parent of the working directory

# ROOT sits three directory levels above NOTEBOOK_DIR; climb one level at a time.
ROOT = os.path.dirname(NOTEBOOK_DIR)
ROOT = os.path.dirname(ROOT)
ROOT = os.path.dirname(ROOT)

# Paths under ROOT: the YAML model config and the figures directory.
CONFIG_PATH = os.path.join(ROOT, 'pytorch/configs/abc_parameterizations/fc_abc.yaml')
FIGURES_DIR = os.path.join(ROOT, 'figures/abc_parameterizations/initialization')

In [52]:
import sys
# Put ROOT on the module search path so the project imports in the next cell
# (`utils...`, `pytorch...`) can be resolved. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry each time.
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [53]:
import torch

from utils.tools import read_yaml, set_random_seeds
from utils.plot.abc_parameterizations.initializations import *
from pytorch.configs.model import ModelConfig
from pytorch.models.abc_params.fully_connected import ntk, ip, muP, ipllr
from pytorch.models.abc_params.fully_connected.standard_fc_ip import StandardFCIP

In [61]:
# Hyperparameters used to build the models inspected in the sections below.
# NOTE(review): presumably L counts hidden layers, so L + 1 is the total number
# of weight layers — confirm against ModelConfig / the model classes.
L = 5
bias = False
width = 1024
SEED = 42

set_random_seeds(SEED)  # set random seed for reproducibility
# Load the base configuration from the YAML file at CONFIG_PATH.
config_dict = read_yaml(CONFIG_PATH)

# n_layers is overridden in the raw dict *before* ModelConfig is constructed
# (L + 1 = 6 with the values above) ...
config_dict['architecture']['n_layers'] = L + 1
base_model_config = ModelConfig(config_dict)
# ... whereas bias and width are overridden afterwards, directly on the
# resulting config object's `architecture` mapping.
base_model_config.architecture['bias'] = bias
base_model_config.architecture['width'] = width

### NTK

In [76]:
# Build the NTK model from the shared base config and display its `init_scales`
# (the recorded output shows six entries, all equal to 1).
# NOTE(review): `model` is rebound in the muP section and this cell's execution
# count (76) is later than the cells that follow; run the notebook top to bottom
# so the cells below inspect the intended model.
model = ntk.FCNTK(base_model_config)
model.init_scales

[1, 1, 1, 1, 1, 1]

In [63]:
# Display the weight matrix of the current model's input layer (NTK section).
model.input_layer.weight

Parameter containing:
tensor([[ 0.2514, -1.5190,  0.8979,  ..., -2.1606, -0.3204,  0.0087],
        [-0.8779,  0.1613, -0.3990,  ..., -1.4587,  0.9024, -3.3510],
        [-0.1970,  0.6471, -1.0071,  ..., -0.0973,  0.2561,  0.0622],
        ...,
        [-1.9033, -0.0588, -2.1137,  ..., -1.2381, -0.3363, -0.9464],
        [ 0.1160, -0.1671,  0.3530,  ..., -1.7505, -1.0413,  1.9345],
        [-0.5322, -0.1971, -0.1059,  ..., -0.6193, -2.8023, -0.3825]],
       requires_grad=True)

In [64]:
# Smallest entry of the input-layer weight matrix.
model.input_layer.weight.min()

tensor(-5.5518, grad_fn=<MinBackward1>)

In [65]:
# Largest entry of the input-layer weight matrix.
model.input_layer.weight.max()

tensor(5.7181, grad_fn=<MaxBackward1>)

In [66]:
# Print the weight shape of every intermediate layer of the current model.
# The enumerate index was never used, so the layers are iterated directly.
# `layer` intentionally stays bound to the last intermediate layer once the
# loop finishes: the cells that follow inspect `layer.weight`.
for layer in model.intermediate_layers:
    print(layer.weight.shape)

torch.Size([1024, 1024])
torch.Size([1024, 1024])
torch.Size([1024, 1024])
torch.Size([1024, 1024])


In [67]:
# `layer` is the loop variable left over from the preceding cell, i.e. the last
# element of `model.intermediate_layers`; that loop must have run first.
layer.weight

Parameter containing:
tensor([[-0.5464, -2.8679, -2.1360,  ..., -1.4663, -1.5891,  0.6221],
        [-1.5016,  1.3141,  0.6846,  ..., -2.2101, -0.6813,  0.1306],
        [ 0.8891, -2.7589, -0.5882,  ..., -0.8268,  0.7493,  2.0032],
        ...,
        [ 0.7670,  1.3615,  0.8179,  ..., -0.1174, -2.4032,  1.4691],
        [-0.6120,  2.5579, -2.0396,  ...,  1.5252,  2.1685, -0.5176],
        [ 1.5199, -0.4976, -1.2771,  ..., -1.5009,  0.4476,  0.2565]],
       requires_grad=True)

In [68]:
# Minimum and maximum weight of `layer` (left bound by the loop above),
# printed one per line.
print(layer.weight.min(), layer.weight.max(), sep='\n')

tensor(-6.9049, grad_fn=<MinBackward1>)
tensor(7.3323, grad_fn=<MaxBackward1>)


### muP

In [69]:
# Rebind `model` to a muP model built from the same base config; every cell
# below that reads `model` now refers to this instance.
model = muP.FCmuP(base_model_config)

In [75]:
# Display the muP model's `init_scales`. In the recorded output every entry is
# 0.03125, which equals 1024 ** -0.5 for the width of 1024 set above.
model.init_scales

[0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125]

In [70]:
# Display the weight matrix of the current model's input layer (muP section).
model.input_layer.weight

Parameter containing:
tensor([[ 0.0006, -0.0795,  0.0343,  ...,  0.0354,  0.0134, -0.0436],
        [ 0.0874, -0.0167,  0.0194,  ..., -0.0643, -0.0449, -0.0027],
        [-0.0594,  0.0119,  0.0116,  ...,  0.0064, -0.0837,  0.0390],
        ...,
        [ 0.0317,  0.0235,  0.0061,  ...,  0.0040,  0.0278,  0.0158],
        [ 0.0348, -0.0858,  0.0655,  ..., -0.0564, -0.1332, -0.0009],
        [-0.0589, -0.1162,  0.0023,  ...,  0.0028, -0.0088, -0.0072]],
       requires_grad=True)

In [71]:
# Smallest entry of the input-layer weight matrix.
model.input_layer.weight.min()

tensor(-0.1904, grad_fn=<MinBackward1>)

In [72]:
# Largest entry of the input-layer weight matrix.
model.input_layer.weight.max()

tensor(0.1642, grad_fn=<MaxBackward1>)

In [73]:
# Print the weight shape of every intermediate layer of the current model.
# The enumerate index was never used, so the layers are iterated directly.
# `layer` intentionally stays bound to the last intermediate layer once the
# loop finishes: the cells that follow inspect `layer.weight`.
for layer in model.intermediate_layers:
    print(layer.weight.shape)

torch.Size([1024, 1024])
torch.Size([1024, 1024])
torch.Size([1024, 1024])
torch.Size([1024, 1024])


In [74]:
# `layer` is the loop variable left over from the preceding cell, i.e. the last
# element of `model.intermediate_layers`; that loop must have run first.
layer.weight

Parameter containing:
tensor([[ 0.0392, -0.0066, -0.0253,  ..., -0.0693,  0.1090,  0.0372],
        [ 0.0506, -0.0527, -0.0031,  ...,  0.0178,  0.0054, -0.0444],
        [ 0.0443,  0.0079,  0.0031,  ..., -0.0035,  0.0084, -0.0077],
        ...,
        [ 0.0633,  0.0477,  0.0267,  ..., -0.0108,  0.0675,  0.0224],
        [-0.0389,  0.0447,  0.0604,  ..., -0.0927,  0.0067, -0.0268],
        [ 0.0177,  0.0112,  0.0229,  ..., -0.0127,  0.0472, -0.0184]],
       requires_grad=True)

In [68]:
# Minimum and maximum weight of `layer` (left bound by the loop above),
# printed one per line.
# NOTE(review): this cell still carries execution count 68 and exactly the
# output of the NTK cell (-6.9049 / 7.3323), which does not match the muP
# weights displayed just above; it appears not to have been re-run after the
# muP model was built. Re-execute it to record the muP range.
print(layer.weight.min(), layer.weight.max(), sep='\n')

tensor(-6.9049, grad_fn=<MinBackward1>)
tensor(7.3323, grad_fn=<MaxBackward1>)


In [79]:
# NOTE(review): this looks like a leftover scratch assignment — nothing in the
# visible cells reads `l` afterwards. Candidate for removal before sharing.
l = [1, 2, 3]