In [1]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project's .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Current working directory and its parent directory.
cwd = os.getcwd()
NOTEBOOK_DIR = os.path.dirname(cwd)

# ROOT is three directory levels above NOTEBOOK_DIR.
ROOT = NOTEBOOK_DIR
for _level in range(3):
    ROOT = os.path.dirname(ROOT)

# Locations, relative to ROOT, of the figures folder and of the YAML config.
FIGURES_DIR = os.path.join(ROOT, 'figures/abc_parameterizations/initialization')
CONFIG_PATH = os.path.join(
    ROOT,
    'pytorch/configs/abc_parameterizations',
    'fc_ipllr_mnist.yaml',
)

In [3]:
import sys

# Put ROOT on the module search path so the project packages imported in the
# next cell (utils, pytorch) can be resolved from this notebook.
# Guarded so that re-running the cell does not append ROOT a second time.
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [4]:
import os
from copy import deepcopy
import torch
import math
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, Subset, DataLoader
import torch.nn.functional as F

from utils.tools import read_yaml, set_random_seeds
from pytorch.configs.base import BaseConfig
from pytorch.configs.model import ModelConfig
from pytorch.models.abc_params.fully_connected.ipllr import FcIPLLR
from pytorch.models.abc_params.fully_connected.muP import FCmuP
from pytorch.models.abc_params.fully_connected.ntk import FCNTK
from pytorch.models.abc_params.fully_connected.standard_fc_ip import StandardFCIP
from utils.data.mnist import load_data
from utils.abc_params.debug_ipllr import *

### Load basic configuration and define variables 

In [5]:
# Experiment settings used throughout the notebook.
N_TRIALS = 1
SEED = 30
L = 6  # the model config is given n_layers = L + 1
width = 1024  # written to config_dict['architecture']['width']
n_warmup_steps = 1  # placed in the 'warmup_switch' scheduler params
batch_size = 512  # batch size of the train and test DataLoaders
base_lr = 0.001  # written to config_dict['optimizer']['params']['lr']
n_steps = 50  # exclusive upper bound of the step index in the final training loop

set_random_seeds(SEED)  # set random seed for reproducibility

# The YAML configuration is read into config_dict at the top of the next cell,
# so it is not loaded a second time here.

In [6]:
# Read the base YAML configuration, then override it with the notebook settings.
config_dict = read_yaml(CONFIG_PATH)

input_size = config_dict['architecture']['input_size']

# Architecture and optimizer overrides.
config_dict['architecture'].update({'width': width, 'n_layers': L + 1})
config_dict['optimizer']['params']['lr'] = base_lr

# Scheduler section: built separately, then attached to the config.
scheduler_params = {
    'n_warmup_steps': n_warmup_steps,
    'calibrate_base_lr': True,
    'default_calibration': False,
}
config_dict['scheduler'] = {'name': 'warmup_switch', 'params': scheduler_params}

base_model_config = ModelConfig(config_dict)

### Load data & define model

In [7]:
# Load the train/test datasets with flatten=True.
# NOTE(review): download=False presumably requires the data to already be on
# disk — confirm in utils.data.mnist.load_data.
training_dataset, test_dataset = load_data(download=False, flatten=True)
train_data_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
# Materialise the batches once (a single pass over each loader) so that later
# cells can index fixed batches by position.
# NOTE(review): creating a DataLoader iterator may draw from the global RNG,
# so reordering the two list(...) calls could change which shuffled training
# batches are produced — confirm before rearranging.
test_batches = list(DataLoader(test_dataset, shuffle=False, batch_size=batch_size))
batches = list(train_data_loader)
eval_batch = test_batches[0]

## Look at model at different steps of training

### base_lr = 0.001

In [8]:
# Build the IP-LLR model; the training batches are handed over as
# lr_calibration_batches (the call prints the resulting "initial base lr" values).
# NOTE(review): n_warmup_steps=12 here differs from the notebook-level
# n_warmup_steps = 1 that was placed in the scheduler config — confirm which
# value is intended.
ipllr = FcIPLLR(base_model_config, n_warmup_steps=12, lr_calibration_batches=batches)

initial base lr : [69.26097106933594, 36.901771545410156, 60.06058120727539, 61.465023040771484, 69.81842803955078, 80.47272491455078, 242.21490478515625]


### 1. Initial model : t=0

In [10]:
# Keep an independent copy of the model as the t=0 reference; later cells keep
# training `ipllr` itself.
ipllr_0 = deepcopy(ipllr)

In [14]:
# Contributions on the first training batch, with ipllr_0 passed in every
# reference-model slot.
# NOTE(review): the positional arguments are presumably (name, current model,
# t=0 model, t=1 model, previous-step model) — confirm against
# compute_contributions_with_step_1.
batch = batches[0]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_0, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.3025991916656494


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,1.112075,0.0,0.0,0.0,1.112075,0.0
1,IPLLR,2.0,0.03530343,0.0,0.0,0.0,0.03530343,0.0
2,IPLLR,3.0,0.001097583,0.0,0.0,0.0,0.001097583,0.0
3,IPLLR,4.0,3.452109e-05,0.0,0.0,0.0,3.452109e-05,0.0
4,IPLLR,5.0,1.106034e-06,0.0,0.0,0.0,1.106034e-06,0.0
5,IPLLR,6.0,3.51856e-08,0.0,0.0,0.0,3.51856e-08,0.0
6,IPLLR,7.0,8.658727e-10,0.0,0.0,0.0,8.658727e-10,0.0


In [15]:
# Same computation as the previous cell, on the second training batch.
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_0, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.3025991916656494


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,1.108155,0.0,0.0,0.0,1.108155,0.0
1,IPLLR,2.0,0.03515991,0.0,0.0,0.0,0.03515991,0.0
2,IPLLR,3.0,0.001093562,0.0,0.0,0.0,0.001093562,0.0
3,IPLLR,4.0,3.447231e-05,0.0,0.0,0.0,3.447231e-05,0.0
4,IPLLR,5.0,1.105869e-06,0.0,0.0,0.0,1.105869e-06,0.0
5,IPLLR,6.0,3.517691e-08,0.0,0.0,0.0,3.517691e-08,0.0
6,IPLLR,7.0,8.610861e-10,0.0,0.0,0.0,8.610861e-10,0.0


### 2. After one step of SGD : t=1

In [10]:
# Take one training step on the first batch, then snapshot the result as the
# t=1 model.
# NOTE(review): train_model_one_step presumably updates `ipllr` in place (its
# return value is not used) — confirm in utils.abc_params.debug_ipllr.
x, y = batches[0]
train_model_one_step(ipllr, x, y, batch_size)
ipllr_1 = deepcopy(ipllr)

input abs mean in training:  0.6950533986091614
loss derivatives for model: tensor([[-0.9000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        ...,
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000, -0.9000,  0.1000]])
average training loss for model1 : 2.3025991916656494



##### Look at model_1 on the 2nd batch

In [11]:
# Contributions on the second training batch, now with ipllr_1 in the third
# model slot and ipllr_0 in the other two reference slots.
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
# NOTE(review): the disabled renaming below lists 9 column names, whereas the
# frame has the 8 columns ['model', 'layer', 'h_1', 'Delta_h_1', 'Delta_h',
# 'delta_h', 'h', 'id']; it looks stale and would need updating before being
# re-enabled.
#contributions_df.columns = ['model', 'layer', 'h_0', 'model_0_', 'Delta_h_0', 'Delta_h_1', 'delta_h_1', 'h_1', 'id']
#contributions_df.loc[:, ['model', 'layer', 'h_0', 'Delta_h_1', 'delta_h_1', 'h_1']]
contributions_df

average validation loss for IPLLR : 3.235839366912842


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,1.517923,1.0,1.0,1.0,1.517923,0.0
1,IPLLR,2.0,1.020457,1.0,1.0,1.0,1.020457,0.0
2,IPLLR,3.0,1.017958,1.0,1.0,1.0,1.017958,0.0
3,IPLLR,4.0,1.019624,1.0,1.0,1.0,1.019624,0.0
4,IPLLR,5.0,1.018097,1.0,1.0,1.0,1.018097,0.0
5,IPLLR,6.0,1.015998,1.0,1.0,1.0,1.015998,0.0
6,IPLLR,7.0,1.096326,1.0,1.0,1.0,1.096326,0.0


##### Look at model_1 on the 3rd batch

In [12]:
# Same computation as for batches[1], on the third training batch (batches[2]).
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 3.1598029136657715


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,1.507129,0.994181,0.994181,0.994181,1.507129,0.0
1,IPLLR,2.0,1.014484,0.994174,0.994174,0.994174,1.014484,0.0
2,IPLLR,3.0,1.011096,0.993221,0.993221,0.993221,1.011096,0.0
3,IPLLR,4.0,1.012631,0.993137,0.993137,0.993137,1.012631,0.0
4,IPLLR,5.0,1.011111,0.993138,0.993138,0.993138,1.011111,0.0
5,IPLLR,6.0,1.009026,0.993138,0.993138,0.993138,1.009026,0.0
6,IPLLR,7.0,1.088804,0.993138,0.993138,0.993138,1.088804,0.0


##### Look at model_1 averaged over 16 further batches (batches[4:20])

In [13]:
# Compute the contributions over the 16 training batches batches[4:20].
# Note that `batch` holds a list of batches here, not a single (x, y) pair.
batch = batches[4:20]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=batch)

# Average per layer. numeric_only=True keeps the string-valued 'model' column
# out of the mean; recent pandas versions raise a TypeError on it otherwise.
contributions_by_layer = contributions_df.groupby(by='layer').mean(numeric_only=True)
contributions_by_layer

average validation loss for IPLLR : 3.2408464699983597


Unnamed: 0_level_0,h_1,Delta_h_1,Delta_h,delta_h,h,id
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,1.515744,0.996178,0.996178,0.996178,1.515744,7.5
2.0,1.018259,0.997988,0.997988,0.997988,1.018259,7.5
3.0,1.015772,0.997861,0.997861,0.997861,1.015772,7.5
4.0,1.017391,0.997806,0.997806,0.997806,1.017391,7.5
5.0,1.015878,0.99782,0.99782,0.99782,1.015878,7.5
6.0,1.013787,0.997824,0.997824,0.997824,1.013787,7.5
7.0,1.09394,0.997824,0.997824,0.997824,1.09394,7.5


### 3. After 2 steps of SGD : t=2

In [14]:
# Take a second training step, this time on the second batch, and snapshot the
# result as the t=2 model.
x, y = batches[1]
train_model_one_step(ipllr, x, y, batch_size)
ipllr_2 = deepcopy(ipllr)

input abs mean in training:  0.6921874284744263
loss derivatives for model: tensor([[ 0.0096,  0.2137,  0.0487,  ...,  0.0331, -0.5867,  0.1340],
        [ 0.0021,  0.2148,  0.0237,  ...,  0.0133,  0.5745,  0.1072],
        [ 0.0074,  0.2168,  0.0432,  ...,  0.0284,  0.4452,  0.1304],
        ...,
        [ 0.0071,  0.2172,  0.0424,  ...,  0.0277,  0.4501,  0.1298],
        [ 0.0045, -0.7809,  0.0341,  ...,  0.0210,  0.5019,  0.1219],
        [ 0.0012,  0.2074,  0.0176,  ...,  0.0093, -0.3769,  0.0953]])
average training loss for model1 : 3.235839366912842



In [15]:
# Evaluate the t=1 snapshot (ipllr_1 is passed as the model) on batches[1],
# with ipllr_0 in the last reference slot.
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 3.235839366912842


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,1.517923,1.0,1.0,1.0,1.517923,0.0
1,IPLLR,2.0,1.020457,1.0,1.0,1.0,1.020457,0.0
2,IPLLR,3.0,1.017958,1.0,1.0,1.0,1.017958,0.0
3,IPLLR,4.0,1.019624,1.0,1.0,1.0,1.019624,0.0
4,IPLLR,5.0,1.018097,1.0,1.0,1.0,1.018097,0.0
5,IPLLR,6.0,1.015998,1.0,1.0,1.0,1.015998,0.0
6,IPLLR,7.0,1.096326,1.0,1.0,1.0,1.096326,0.0


In [16]:
# Evaluate the current model (after the second training step) on batches[1],
# with ipllr_1 in the last reference slot.
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.300420045852661


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,1.517923,1.0,0.990732,0.026572,1.504412,0.0
1,IPLLR,2.0,0.988245,0.96818,0.860694,0.157932,0.877559,0.0
2,IPLLR,3.0,0.617084,0.603676,0.540123,0.092284,0.551405,0.0
3,IPLLR,4.0,0.369427,0.360164,0.323165,0.056664,0.330889,0.0
4,IPLLR,5.0,0.210208,0.205365,0.187277,0.034607,0.191421,0.0
5,IPLLR,6.0,0.114048,0.111618,0.103372,0.02418,0.104956,0.0
6,IPLLR,7.0,0.046458,0.040874,0.039945,0.001033,0.045425,0.0


In [17]:
# Evaluate the t=1 snapshot (ipllr_1 is passed as the model) on batches[2],
# with ipllr_0 in the last reference slot.
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 3.1598029136657715


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,1.507129,0.994181,0.994181,0.994181,1.507129,0.0
1,IPLLR,2.0,1.014484,0.994174,0.994174,0.994174,1.014484,0.0
2,IPLLR,3.0,1.011096,0.993221,0.993221,0.993221,1.011096,0.0
3,IPLLR,4.0,1.012631,0.993137,0.993137,0.993137,1.012631,0.0
4,IPLLR,5.0,1.011111,0.993138,0.993138,0.993138,1.011111,0.0
5,IPLLR,6.0,1.009026,0.993138,0.993138,0.993138,1.009026,0.0
6,IPLLR,7.0,1.088804,0.993138,0.993138,0.993138,1.088804,0.0


In [18]:
# Evaluate the current model (after the second training step) on batches[2],
# with ipllr_1 in the last reference slot.
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.2967820167541504


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,1.507129,0.994181,0.985057,0.026255,1.493806,0.0
1,IPLLR,2.0,0.982637,0.962717,0.85568,0.157012,0.872422,0.0
2,IPLLR,3.0,0.612707,0.599357,0.536147,0.091838,0.547382,0.0
3,IPLLR,4.0,0.366245,0.357053,0.320352,0.056228,0.328014,0.0
4,IPLLR,5.0,0.208251,0.203451,0.185531,0.034297,0.189637,0.0
5,IPLLR,6.0,0.112958,0.110551,0.102387,0.023952,0.103955,0.0
6,IPLLR,7.0,0.046007,0.040478,0.039559,0.001023,0.044984,0.0


### Train further

In [19]:
# Columns of contributions_df:
# ['model', 'layer', 'h_1', 'Delta_h_1', 'Delta_h', 'delta_h', 'h', 'id']
# Keep training one batch at a time and print the contributions after each step.
# NOTE(review): the loop starts at index 3, so batches[2] is not used for a
# training step in the cells shown — confirm this is intended.
for i in range(3, n_steps):
    print('----  step {}  ----'.format(i))
    
    # train model
    # Copy the model first so the state before this step can be passed as the
    # previous-step reference below.
    ipllr_previous = deepcopy(ipllr)
    x, y = batches[i]
    train_model_one_step(ipllr, x, y, batch_size)
    
    # compute contributions
    # Evaluate on the batch that follows the training batch; the modulo wraps
    # around to batches[0] after the last batch (batches[i] above is not wrapped).
    next_batch = batches[(i+1) % len(batches)]
    contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1,ipllr_previous, 
                                                         batches=[next_batch])
    print(contributions_df)
    
    print('\n\n')

----  step 3  ----
input abs mean in training:  0.6968955397605896
loss derivatives for model: tensor([[-0.9108,  0.1071,  0.0988,  ...,  0.0978,  0.1088,  0.1077],
        [ 0.0892, -0.8928,  0.0988,  ...,  0.0978,  0.1087,  0.1077],
        [ 0.0903,  0.1064,  0.0989,  ...,  0.0981,  0.1078,  0.1069],
        ...,
        [ 0.0889,  0.1074,  0.0987,  ...,  0.0977, -0.8909,  0.1080],
        [ 0.0889,  0.1074, -0.9012,  ...,  0.0978,  0.1090,  0.1079],
        [ 0.0908,  0.1060,  0.0990,  ...,  0.0982,  0.1074,  0.1065]])
average training loss for model1 : 2.3024003505706787

average validation loss for IPLLR : 2.3017044067382812
   model  layer       h_1  Delta_h_1   Delta_h   delta_h         h   id
0  IPLLR    1.0  1.534716   1.011540  1.002111  0.000226  1.521024  0.0
1  IPLLR    2.0  1.002527   0.982270  0.872708  0.001425  0.889773  0.0
2  IPLLR    3.0  0.627101   0.613488  0.548524  0.000697  0.560012  0.0
3  IPLLR    4.0  0.374741   0.365336  0.327612  0.000264  0.335482  0.0
4

input abs mean in training:  0.6905764937400818
loss derivatives for model: tensor([[ 0.0918,  0.1062,  0.0996,  ...,  0.0981,  0.1067,  0.1051],
        [ 0.0893,  0.1082,  0.0995,  ...,  0.0975,  0.1090,  0.1067],
        [ 0.0895,  0.1080,  0.0995,  ...,  0.0975,  0.1088, -0.8935],
        ...,
        [ 0.0800,  0.1162,  0.0987,  ...,  0.0949, -0.8822,  0.1131],
        [ 0.0855, -0.8886,  0.0992,  ...,  0.0965,  0.1125,  0.1092],
        [ 0.0868,  0.1103,  0.0993,  ...,  0.0968,  0.1112, -0.8916]])
average training loss for model1 : 2.2964179515838623

average validation loss for IPLLR : 2.2962286472320557
   model  layer       h_1  Delta_h_1   Delta_h   delta_h         h   id
0  IPLLR    1.0  1.529661   1.004235  0.995128  0.000237  1.516235  0.0
1  IPLLR    2.0  0.995496   0.975500  0.867347  0.001487  0.884290  0.0
2  IPLLR    3.0  0.626566   0.613023  0.548873  0.000444  0.560537  0.0
3  IPLLR    4.0  0.379709   0.370292  0.333242  0.000128  0.341213  0.0
4  IPLLR    5.0  0.2

input abs mean in training:  0.6988633871078491
loss derivatives for model: tensor([[ 0.0884,  0.1100,  0.0999,  ..., -0.9028,  0.1094,  0.1066],
        [-0.9094,  0.1079,  0.1000,  ...,  0.0978,  0.1075,  0.1053],
        [ 0.0851,  0.1130,  0.0998,  ...,  0.0963, -0.8877,  0.1086],
        ...,
        [ 0.0831,  0.1150, -0.9002,  ...,  0.0957,  0.1142,  0.1099],
        [ 0.0840, -0.8859,  0.0998,  ...,  0.0959,  0.1134,  0.1092],
        [ 0.0879,  0.1104,  0.0999,  ...,  0.0970,  0.1098,  0.1069]])
average training loss for model1 : 2.2906534671783447

average validation loss for IPLLR : 2.28883957862854
   model  layer       h_1  Delta_h_1   Delta_h   delta_h         h   id
0  IPLLR    1.0  1.519391   1.000894  0.992162  0.000304  1.506253  0.0
1  IPLLR    2.0  0.989569   0.969592  0.862619  0.001893  0.879637  0.0
2  IPLLR    3.0  0.626066   0.612612  0.548632  0.000607  0.560484  0.0
3  IPLLR    4.0  0.381941   0.372530  0.335814  0.000414  0.343775  0.0
4  IPLLR    5.0  0.224

input abs mean in training:  0.6967038512229919
loss derivatives for model: tensor([[ 0.0831,  0.1162,  0.1002,  ...,  0.0953,  0.1145,  0.1088],
        [ 0.0887,  0.1105,  0.1003,  ...,  0.0970,  0.1094,  0.1058],
        [ 0.0905,  0.1087,  0.1003,  ..., -0.9025,  0.1078,  0.1048],
        ...,
        [ 0.0855,  0.1137,  0.1002,  ...,  0.0960,  0.1123,  0.1075],
        [ 0.0884,  0.1107,  0.1003,  ...,  0.0969,  0.1096, -0.8941],
        [ 0.0876,  0.1115,  0.1003,  ...,  0.0967,  0.1103,  0.1064]])
average training loss for model1 : 2.294235944747925

average validation loss for IPLLR : 2.2887089252471924
   model  layer       h_1  Delta_h_1   Delta_h   delta_h         h   id
0  IPLLR    1.0  1.508324   0.984130  0.975637  0.000304  1.495180  0.0
1  IPLLR    2.0  0.980915   0.961160  0.853067  0.001937  0.869933  0.0
2  IPLLR    3.0  0.621692   0.608404  0.544212  0.000857  0.556123  0.0
3  IPLLR    4.0  0.380157   0.370832  0.335060  0.000188  0.342945  0.0
4  IPLLR    5.0  0.22

input abs mean in training:  0.6930533647537231
loss derivatives for model: tensor([[ 0.0787,  0.1235,  0.1011,  ...,  0.0934, -0.8812,  0.1094],
        [ 0.0782,  0.1241,  0.1011,  ..., -0.9068,  0.1193,  0.1096],
        [ 0.0750, -0.8717,  0.1010,  ...,  0.0920,  0.1226,  0.1111],
        ...,
        [ 0.0825,  0.1187,  0.1010,  ..., -0.9053,  0.1150,  0.1076],
        [ 0.0933,  0.1066,  0.1005,  ..., -0.9018,  0.1054,  0.1029],
        [ 0.0692, -0.8635,  0.1009,  ...,  0.0896,  0.1289,  0.1138]])
average training loss for model1 : 2.2757492065429688

average validation loss for IPLLR : 2.2714338302612305
   model  layer       h_1  Delta_h_1   Delta_h   delta_h         h   id
0  IPLLR    1.0  1.523290   0.999482  0.991401  0.000361  1.510392  0.0
1  IPLLR    2.0  0.993054   0.972960  0.866442  0.002097  0.883802  0.0
2  IPLLR    3.0  0.640876   0.627385  0.563769  0.000711  0.575951  0.0
3  IPLLR    4.0  0.403638   0.393965  0.358908  0.000783  0.367122  0.0
4  IPLLR    5.0  0.2

input abs mean in training:  0.6920092701911926
loss derivatives for model: tensor([[ 0.0713,  0.1367,  0.1016,  ...,  0.0903, -0.8736,  0.1107],
        [ 0.0773,  0.1276,  0.1015,  ...,  0.0928,  0.1201,  0.1085],
        [ 0.0606, -0.8445,  0.1011,  ...,  0.0854,  0.1389,  0.1146],
        ...,
        [ 0.0842,  0.1183,  0.1013,  ...,  0.0952,  0.1134,  0.1059],
        [ 0.0696, -0.8606,  0.1015,  ...,  0.0896,  0.1284,  0.1114],
        [ 0.0683,  0.1417,  0.1015,  ..., -0.9110,  0.1298,  0.1118]])
average training loss for model1 : 2.265228509902954

average validation loss for IPLLR : 2.2678534984588623
   model  layer       h_1  Delta_h_1   Delta_h   delta_h         h   id
0  IPLLR    1.0  1.518699   1.000495  0.992949  0.000503  1.505976  0.0
1  IPLLR    2.0  0.990304   0.970257  0.864025  0.002675  0.881285  0.0
2  IPLLR    3.0  0.642755   0.629413  0.567339  0.000989  0.579313  0.0
3  IPLLR    4.0  0.413363   0.403634  0.372266  0.000403  0.380585  0.0
4  IPLLR    5.0  0.27

input abs mean in training:  0.6961998343467712
loss derivatives for model: tensor([[ 0.0704,  0.1416,  0.1023,  ...,  0.0901,  0.1260,  0.1090],
        [ 0.0465, -0.8041,  0.1002,  ...,  0.0773,  0.1548,  0.1147],
        [ 0.0427, -0.7925,  0.0993,  ...,  0.0747,  0.1602,  0.1153],
        ...,
        [-0.9138,  0.1171,  0.1016,  ...,  0.0960,  0.1111,  0.1042],
        [ 0.0746,  0.1344,  0.1022,  ...,  0.0918,  0.1218,  0.1078],
        [-0.9143,  0.1177,  0.1016,  ...,  0.0958,  0.1115,  0.1044]])
average training loss for model1 : 2.2543394565582275

average validation loss for IPLLR : 2.243591785430908
   model  layer       h_1  Delta_h_1   Delta_h   delta_h         h   id
0  IPLLR    1.0  1.485567   0.970206  0.963453  0.000627  1.473564  0.0
1  IPLLR    2.0  0.962178   0.942777  0.836175  0.003300  0.852815  0.0
2  IPLLR    3.0  0.615187   0.602458  0.540271  0.001576  0.551548  0.0
3  IPLLR    4.0  0.389579   0.380400  0.353339  0.000187  0.361152  0.0
4  IPLLR    5.0  0.26