In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
cwd = os.getcwd()

NOTEBOOK_DIR = os.path.dirname(cwd)
ROOT = os.path.dirname(os.path.dirname(os.path.dirname(NOTEBOOK_DIR)))

FIGURES_DIR = os.path.join(ROOT, 'figures/abc_parameterizations/initialization')
CONFIG_PATH = os.path.join(ROOT, 'pytorch/configs/abc_parameterizations', 'fc_ipllr_mnist.yaml')

In [3]:
import sys
sys.path.append(ROOT)

In [4]:
import os
from copy import deepcopy
import torch
import math
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, Subset, DataLoader
import torch.nn.functional as F

from utils.tools import read_yaml, set_random_seeds
from pytorch.configs.base import BaseConfig
from pytorch.configs.model import ModelConfig
from pytorch.models.abc_params.fully_connected.ipllr import FcIPLLR
from pytorch.models.abc_params.fully_connected.muP import FCmuP
from pytorch.models.abc_params.fully_connected.ntk import FCNTK
from pytorch.models.abc_params.fully_connected.standard_fc_ip import StandardFCIP
from utils.data.mnist import load_data
from utils.abc_params.debug_ipllr import *

### Load basic configuration and define variables 

In [5]:
N_TRIALS = 1
SEED = 30
L = 6
width = 1024
n_warmup_steps = 1
batch_size = 512
base_lr = 0.01
n_steps = 50

set_random_seeds(SEED)  # set random seed for reproducibility
config_dict = read_yaml(CONFIG_PATH)

In [6]:
config_dict = read_yaml(CONFIG_PATH)

input_size = config_dict['architecture']['input_size']

config_dict['architecture']['width'] = width
config_dict['architecture']['n_layers'] = L + 1
config_dict['optimizer']['params']['lr'] = base_lr
config_dict['scheduler'] = {'name': 'warmup_switch',
                            'params': {'n_warmup_steps': n_warmup_steps,
                                       'calibrate_base_lr': True,
                                       'default_calibration': False}}
        
base_model_config = ModelConfig(config_dict)

### Load data & define model

In [7]:
training_dataset, test_dataset = load_data(download=False, flatten=True)
train_data_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
test_batches = list(DataLoader(test_dataset, shuffle=False, batch_size=batch_size))
batches = list(train_data_loader)
eval_batch = test_batches[0]

## Look at model at different steps of training

### lr = 0.01

In [8]:
ipllr = FcIPLLR(base_model_config, n_warmup_steps=12, lr_calibration_batches=batches)

initial base lr : [0.08823053538799286, 0.06004498898983002, 1.2183654308319092, 2.177013397216797, 2.4937825202941895, 2.8725390434265137, 8.645325660705566]


### 1. Initial model : t=0

In [9]:
ipllr_0 = deepcopy(ipllr)

### 2. After one step of SGD : t=1

In [10]:
x, y = batches[0]
train_model_one_step(ipllr, x, y, batch_size)
ipllr_1 = deepcopy(ipllr)

input abs mean in training:  0.6950533986091614
loss derivatives for model: tensor([[-0.9000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        ...,
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000, -0.9000,  0.1000]])
average training loss for model1 : 2.3025991916656494



##### Look at model_1 on the 2nd batch

In [11]:
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
#contributions_df.columns = ['model', 'layer', 'h_0', 'model_0_', 'Delta_h_0', 'Delta_h_1', 'delta_h_1', 'h_1', 'id']
#contributions_df.loc[:, ['model', 'layer', 'h_0', 'Delta_h_1', 'delta_h_1', 'h_1']]
contributions_df

average validation loss for IPLLR : 3.267101287841797


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.054676,1.0,1.0,1.0,31.054676,0.0
1,IPLLR,2.0,1.461119,1.0,1.0,1.0,1.461119,0.0
2,IPLLR,3.0,1.02213,1.0,1.0,1.0,1.02213,0.0
3,IPLLR,4.0,1.019486,1.0,1.0,1.0,1.019486,0.0
4,IPLLR,5.0,1.018075,1.0,1.0,1.0,1.018075,0.0
5,IPLLR,6.0,1.015994,1.0,1.0,1.0,1.015994,0.0
6,IPLLR,7.0,1.096317,1.0,1.0,1.0,1.096317,0.0


##### Look at model_1 on the 2nd batch

In [12]:
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 3.1852619647979736


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,30.810158,0.994181,0.994181,0.994181,30.810158,0.0
1,IPLLR,2.0,1.452204,0.995297,0.995297,0.995297,1.452204,0.0
2,IPLLR,3.0,1.01588,0.993826,0.993826,0.993826,1.01588,0.0
3,IPLLR,4.0,1.012521,0.99315,0.99315,0.99315,1.012521,0.0
4,IPLLR,5.0,1.011044,0.993094,0.993094,0.993094,1.011044,0.0
5,IPLLR,6.0,1.008975,0.993091,0.993091,0.993091,1.008975,0.0
6,IPLLR,7.0,1.088743,0.993091,0.993091,0.993091,1.088743,0.0


##### Look at model_1 on a couple of random batches

In [13]:
batch = batches[4:20]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=batch)
contributions_df

contributions_by_layer = contributions_df.groupby(by='layer').mean()
contributions_by_layer

average validation loss for IPLLR : 3.268497943878174


Unnamed: 0_level_0,h_1,Delta_h_1,Delta_h,delta_h,h,id
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,31.111295,0.996178,0.996178,0.996178,31.111295,7.5
2.0,1.461472,0.999912,0.999912,0.999912,1.461472,7.5
3.0,1.021265,0.999136,0.999136,0.999136,1.021265,7.5
4.0,1.018179,0.99871,0.99871,0.99871,1.018179,7.5
5.0,1.016742,0.998691,0.998691,0.998691,1.016742,7.5
6.0,1.014668,0.998696,0.998696,0.998696,1.014668,7.5
7.0,1.094885,0.998694,0.998694,0.998694,1.094885,7.5


### 3. After 2 steps of SGD : t=2

In [14]:
x, y = batches[1]
train_model_one_step(ipllr, x, y, batch_size)
ipllr_2 = deepcopy(ipllr)

input abs mean in training:  0.6921874284744263
loss derivatives for model: tensor([[ 0.0107,  0.2119,  0.0509,  ...,  0.0351, -0.5999,  0.1352],
        [ 0.0025,  0.2165,  0.0259,  ...,  0.0149,  0.5578,  0.1109],
        [ 0.0062,  0.2181,  0.0399,  ...,  0.0256,  0.4658,  0.1277],
        ...,
        [ 0.0063,  0.2180,  0.0401,  ...,  0.0258,  0.4643,  0.1279],
        [ 0.0060, -0.7815,  0.0393,  ...,  0.0252,  0.4688,  0.1272],
        [ 0.0018,  0.2131,  0.0220,  ...,  0.0122, -0.4125,  0.1042]])
average training loss for model1 : 3.267101287841797



In [15]:
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 3.267101287841797


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.054676,1.0,1.0,1.0,31.054676,0.0
1,IPLLR,2.0,1.461119,1.0,1.0,1.0,1.461119,0.0
2,IPLLR,3.0,1.02213,1.0,1.0,1.0,1.02213,0.0
3,IPLLR,4.0,1.019486,1.0,1.0,1.0,1.019486,0.0
4,IPLLR,5.0,1.018075,1.0,1.0,1.0,1.018075,0.0
5,IPLLR,6.0,1.015994,1.0,1.0,1.0,1.015994,0.0
6,IPLLR,7.0,1.096317,1.0,1.0,1.0,1.096317,0.0


In [16]:
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.3024208545684814


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.054676,1.0,0.988773,0.115531,31.009148,0.0
1,IPLLR,2.0,1.453387,0.9944,3.419696,3.359601,3.45883,0.0
2,IPLLR,3.0,0.014889,0.012448,0.012326,0.000304,0.014789,0.0
3,IPLLR,4.0,0.011646,0.011461,0.011355,0.000117,0.011539,0.0
4,IPLLR,5.0,0.011092,0.010894,0.010771,0.000142,0.010967,0.0
5,IPLLR,6.0,0.010551,0.010381,0.010248,0.000164,0.010414,0.0
6,IPLLR,7.0,0.010652,0.009687,0.009675,1.2e-05,0.01064,0.0


In [17]:
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 3.1852619647979736


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,30.810158,0.994181,0.994181,0.994181,30.810158,0.0
1,IPLLR,2.0,1.452204,0.995297,0.995297,0.995297,1.452204,0.0
2,IPLLR,3.0,1.01588,0.993826,0.993826,0.993826,1.01588,0.0
3,IPLLR,4.0,1.012521,0.99315,0.99315,0.99315,1.012521,0.0
4,IPLLR,5.0,1.011044,0.993094,0.993094,0.993094,1.011044,0.0
5,IPLLR,6.0,1.008975,0.993091,0.993091,0.993091,1.008975,0.0
6,IPLLR,7.0,1.088743,0.993091,0.993091,0.993091,1.088743,0.0


In [18]:
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.301940679550171


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,30.810158,0.994181,0.98332,0.114152,30.765423,0.0
1,IPLLR,2.0,1.444565,0.989757,3.394865,3.334319,3.433323,0.0
2,IPLLR,3.0,0.014856,0.012454,0.012331,0.000303,0.014753,0.0
3,IPLLR,4.0,0.011561,0.011377,0.011271,0.000116,0.011455,0.0
4,IPLLR,5.0,0.010999,0.010802,0.010681,0.000141,0.010875,0.0
5,IPLLR,6.0,0.010462,0.010293,0.010161,0.000162,0.010326,0.0
6,IPLLR,7.0,0.010562,0.009605,0.009593,1.2e-05,0.010549,0.0


In [19]:
batch = batches[3]
contributions_df = compute_contributions_with_previous('IPLLR', ipllr_2, ipllr_0, ipllr_1, batches=[batch])
contributions_df.columns = ['model', 'layer', 'h_0', 'h_1', 'Delta_h_1', 'Delta_h_2', 'delta_h_2', 'h_2', 'id']
contributions_df.loc[:, ['model', 'layer', 'h_0', 'h_1', 'Delta_h_1', 'Delta_h_2', 'delta_h_2', 'h_2']]

TypeError: compute_contributions_with_previous() got multiple values for argument 'batches'

In [None]:
batch = batches[3]
contributions_df = compute_contributions_with_previous('IPLLR', ipllr_2, ipllr_0, ipllr_1, batches=[batch])
contributions_df.columns = ['model', 'layer', 'h_0', 'h_1', 'Delta_h_1', 'Delta_h_2', 'delta_h_2', 'h_2', 'id']
contributions_df.loc[:, ['model', 'layer', 'h_0', 'h_1', 'Delta_h_1', 'Delta_h_2', 'delta_h_2', 'h_2']]

### Train further

In [20]:
# ['model', 'layer', 'h_1', 'Delta_h_1', 'Delta_h', 'delta_h', 'h', 'id']
for i in range(3, n_steps):
    print('----  step {}  ----'.format(i))
    
    # train model
    ipllr_previous = deepcopy(ipllr)
    x, y = batches[i]
    train_model_one_step(ipllr, x, y, batch_size)
    
    # compute contributions
    next_batch = batches[(i+1) % len(batches)]
    contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1,ipllr_previous, 
                                                         batches=[next_batch])
    print(contributions_df)
    
    print('\n\n')

----  step 3  ----
input abs mean in training:  0.6968955397605896
loss derivatives for model: tensor([[-0.9025,  0.1019,  0.0998,  ...,  0.0993,  0.1028,  0.1012],
        [ 0.0969, -0.8976,  0.0997,  ...,  0.0991,  0.1036,  0.1016],
        [ 0.0984,  0.1013,  0.0999,  ...,  0.0995,  0.1019,  0.1008],
        ...,
        [ 0.0986,  0.1011,  0.0999,  ...,  0.0996, -0.8984,  0.1007],
        [ 0.0979,  0.1016, -0.9002,  ...,  0.0994,  0.1023,  0.1010],
        [ 0.0965,  0.1027,  0.0997,  ...,  0.0989,  0.1040,  0.1018]])
average training loss for model1 : 2.303046703338623

average validation loss for IPLLR : 2.299846887588501
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  31.294338   1.011541  1.000023  1.161919e-03  31.248394  0.0
1  IPLLR    2.0   1.465770   1.002522  3.446465  3.328696e-02   3.486909  0.0
2  IPLLR    3.0   0.015341   0.012921  0.012796  1.100047e-05   0.015237  0.0
3  IPLLR    4.0   0.012420   0.012219  0.012107  1.

input abs mean in training:  0.6905764937400818
loss derivatives for model: tensor([[ 0.0899,  0.1081,  0.0990,  ...,  0.0968,  0.1121,  0.1052],
        [ 0.0849,  0.1125,  0.0983,  ...,  0.0950,  0.1190,  0.1079],
        [ 0.0799,  0.1170,  0.0975,  ...,  0.0931,  0.1262, -0.8894],
        ...,
        [ 0.0672,  0.1297,  0.0947,  ...,  0.0875, -0.8522,  0.1176],
        [ 0.0628, -0.8654,  0.0934,  ...,  0.0852,  0.1567,  0.1202],
        [ 0.0855,  0.1119,  0.0984,  ...,  0.0953,  0.1180, -0.8925]])
average training loss for model1 : 2.2644970417022705

average validation loss for IPLLR : 2.2592523097991943
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.371746   1.004235  0.994692  0.002297  31.328032  0.0
1  IPLLR    2.0   1.468198   1.003455  3.482228  0.023441   3.503951  0.0
2  IPLLR    3.0   0.081274   0.077481  0.078345  0.000326   0.082202  0.0
3  IPLLR    4.0   0.090765   0.089358  0.088734  0.000017   0.090138  0.0
4  IPLLR  

input abs mean in training:  0.6988633871078491
loss derivatives for model: tensor([[ 0.0829,  0.1144,  0.0980,  ..., -0.9057,  0.1217,  0.1089],
        [-0.9014,  0.1011,  0.0999,  ...,  0.0996,  0.1016,  0.1007],
        [ 0.0533,  0.1465,  0.0901,  ...,  0.0799, -0.8223,  0.1253],
        ...,
        [ 0.0812,  0.1160, -0.9023,  ...,  0.0936,  0.1242,  0.1098],
        [ 0.0448, -0.8420,  0.0862,  ...,  0.0741,  0.2010,  0.1301],
        [ 0.0826,  0.1147,  0.0980,  ...,  0.0942,  0.1221,  0.1090]])
average training loss for model1 : 2.2165207862854004

average validation loss for IPLLR : 2.208606004714966
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.144203   1.000894  0.994240  0.004866  31.105389  0.0
1  IPLLR    2.0   1.456948   0.999001  3.513904  0.039731   3.529868  0.0
2  IPLLR    3.0   0.137735   0.133026  0.137978  0.000904   0.143006  0.0
3  IPLLR    4.0   0.169440   0.166902  0.166045  0.000062   0.168570  0.0
4  IPLLR   

input abs mean in training:  0.6967038512229919
loss derivatives for model: tensor([[ 0.0631,  0.1353,  0.0936,  ...,  0.0855,  0.1552,  0.1196],
        [ 0.0743,  0.1230,  0.0964,  ...,  0.0909,  0.1347,  0.1134],
        [ 0.0974,  0.1021,  0.0998,  ..., -0.9008,  0.1030,  0.1013],
        ...,
        [ 0.0680,  0.1297,  0.0950,  ...,  0.0880,  0.1457,  0.1168],
        [ 0.0966,  0.1027,  0.0997,  ...,  0.0990,  0.1038, -0.8983],
        [ 0.0935,  0.1053,  0.0994,  ...,  0.0980,  0.1076,  0.1033]])
average training loss for model1 : 2.215245485305786

average validation loss for IPLLR : 2.1972901821136475
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.078907   0.984130  0.978926  0.002616  31.044289  0.0
1  IPLLR    2.0   1.450012   0.988560  3.510918  0.016609   3.534479  0.0
2  IPLLR    3.0   0.124919   0.120671  0.128136  0.000476   0.132822  0.0
3  IPLLR    4.0   0.163761   0.161362  0.160823  0.000039   0.163208  0.0
4  IPLLR   

input abs mean in training:  0.6930533647537231
loss derivatives for model: tensor([[ 0.0324,  0.1816,  0.0785,  ...,  0.0642, -0.7587,  0.1348],
        [ 0.0464,  0.1585,  0.0872,  ..., -0.9245,  0.1942,  0.1282],
        [ 0.0191, -0.7910,  0.0651,  ...,  0.0493,  0.3102,  0.1380],
        ...,
        [ 0.0640,  0.1351,  0.0939,  ..., -0.9139,  0.1529,  0.1187],
        [ 0.0978,  0.1018,  0.0998,  ..., -0.9006,  0.1024,  0.1011],
        [ 0.0118, -0.7735,  0.0538,  ...,  0.0381,  0.3686,  0.1358]])
average training loss for model1 : 2.1690187454223633

average validation loss for IPLLR : 2.160810708999634
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.248421   0.999482  0.997476  0.003554  31.218801  0.0
1  IPLLR    2.0   1.462589   0.999377  3.581809  0.014762   3.606091  0.0
2  IPLLR    3.0   0.161614   0.156819  0.170669  0.000761   0.176202  0.0
3  IPLLR    4.0   0.227031   0.223785  0.223445  0.000070   0.226689  0.0
4  IPLLR   

input abs mean in training:  0.6920092701911926
loss derivatives for model: tensor([[ 0.0315,  0.1869,  0.0779,  ...,  0.0636, -0.7577,  0.1342],
        [ 0.0699,  0.1292,  0.0955,  ...,  0.0890,  0.1414,  0.1152],
        [ 0.0198, -0.7872,  0.0662,  ...,  0.0505,  0.3011,  0.1368],
        ...,
        [ 0.0981,  0.1015,  0.0999,  ...,  0.0995,  0.1020,  0.1009],
        [ 0.0257, -0.8011,  0.0728,  ...,  0.0576,  0.2681,  0.1360],
        [ 0.0327,  0.1844,  0.0789,  ..., -0.9353,  0.2373,  0.1337]])
average training loss for model1 : 2.163208246231079

average validation loss for IPLLR : 2.1586525440216064
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.126289   1.000494  0.999999  0.003560  31.099098  0.0
1  IPLLR    2.0   1.458339   0.999874  3.600541  0.018904   3.628510  0.0
2  IPLLR    3.0   0.130667   0.126593  0.140844  0.000424   0.145544  0.0
3  IPLLR    4.0   0.193194   0.190494  0.190534  0.000043   0.193239  0.0
4  IPLLR   

input abs mean in training:  0.6961998343467712
loss derivatives for model: tensor([[ 0.0996,  0.1003,  0.1000,  ...,  0.0999,  0.1004,  0.1002],
        [ 0.0100, -0.7509,  0.0505,  ...,  0.0353,  0.3725,  0.1311],
        [ 0.0057, -0.7349,  0.0393,  ...,  0.0256,  0.4290,  0.1230],
        ...,
        [-0.9001,  0.1001,  0.1000,  ...,  0.1000,  0.1002,  0.1001],
        [ 0.0969,  0.1027,  0.0998,  ...,  0.0991,  0.1034,  0.1015],
        [-0.9005,  0.1004,  0.1000,  ...,  0.0999,  0.1005,  0.1002]])
average training loss for model1 : 2.159735679626465

average validation loss for IPLLR : 2.143129348754883
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  30.685677   0.970206  0.972220  0.003224  30.663731  0.0
1  IPLLR    2.0   1.430730   0.975713  3.544558  0.010879   3.581214  0.0
2  IPLLR    3.0   0.127266   0.123428  0.139911  0.000417   0.144367  0.0
3  IPLLR    4.0   0.197023   0.194323  0.194668  0.000045   0.197377  0.0
4  IPLLR    

### lr = 0.001

In [21]:
base_lr = 0.001
config = deepcopy(base_model_config)
config.optimizer.params['lr'] = base_lr

In [22]:
ipllr = FcIPLLR(config, n_warmup_steps=12, lr_calibration_batches=batches)

initial base lr : [0.08689195662736893, 0.059653766453266144, 1.2754932641983032, 2.544787883758545, 2.7952122688293457, 3.329961061477661, 10.34122371673584]


### 1. Initial model : t=0

In [23]:
ipllr_0 = deepcopy(ipllr)

### 2. After one step of SGD : t=1

In [24]:
x, y = batches[0]
train_model_one_step(ipllr, x, y, batch_size)
ipllr_1 = deepcopy(ipllr)

input abs mean in training:  0.6950533986091614
loss derivatives for model: tensor([[-0.9000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        ...,
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000, -0.9000,  0.1000]])
average training loss for model1 : 2.3025991916656494



##### Look at model_1 on the 2nd batch

In [25]:
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
#contributions_df.columns = ['model', 'layer', 'h_0', 'model_0_', 'Delta_h_0', 'Delta_h_1', 'delta_h_1', 'h_1', 'id']
#contributions_df.loc[:, ['model', 'layer', 'h_0', 'Delta_h_1', 'delta_h_1', 'h_1']]
contributions_df

average validation loss for IPLLR : 3.3118808269500732


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.534925,1.0,1.0,1.0,31.534925,0.0
1,IPLLR,2.0,1.468945,1.0,1.0,1.0,1.468945,0.0
2,IPLLR,3.0,1.020643,1.0,1.0,1.0,1.020643,0.0
3,IPLLR,4.0,1.016827,1.0,1.0,1.0,1.016827,0.0
4,IPLLR,5.0,1.016429,1.0,1.0,1.0,1.016429,0.0
5,IPLLR,6.0,1.018574,1.0,1.0,1.0,1.018574,0.0
6,IPLLR,7.0,1.090831,1.0,1.0,1.0,1.090831,0.0


##### Look at model_1 on the 2nd batch

In [26]:
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 3.261470317840576


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.36364,0.991881,0.991881,0.991881,31.36364,0.0
1,IPLLR,2.0,1.462198,0.994578,0.994578,0.994578,1.462198,0.0
2,IPLLR,3.0,1.013887,0.993334,0.993334,0.993334,1.013887,0.0
3,IPLLR,4.0,1.010134,0.993411,0.993411,0.993411,1.010134,0.0
4,IPLLR,5.0,1.009836,0.993517,0.993517,0.993517,1.009836,0.0
5,IPLLR,6.0,1.011979,0.993526,0.993526,0.993526,1.011979,0.0
6,IPLLR,7.0,1.083773,0.993529,0.993529,0.993529,1.083773,0.0


##### Look at model_1 on a couple of random batches

In [27]:
batch = batches[4:20]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=batch)
contributions_df

contributions_by_layer = contributions_df.groupby(by='layer').mean()
contributions_by_layer

average validation loss for IPLLR : 3.3461484760046005


Unnamed: 0_level_0,h_1,Delta_h_1,Delta_h,delta_h,h,id
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,31.617947,0.995288,0.995288,0.995288,31.617947,7.5
2.0,1.47179,0.998586,0.998586,0.998586,1.47179,7.5
3.0,1.020745,1.000098,1.000098,1.000098,1.020745,7.5
4.0,1.016672,0.99984,0.99984,0.99984,1.016672,7.5
5.0,1.016234,0.999807,0.999807,0.999807,1.016234,7.5
6.0,1.018377,0.999808,0.999808,0.999808,1.018377,7.5
7.0,1.090621,0.999807,0.999807,0.999807,1.090621,7.5


### 3. After 2 steps of SGD : t=2

In [28]:
x, y = batches[1]
train_model_one_step(ipllr, x, y, batch_size)
ipllr_2 = deepcopy(ipllr)

input abs mean in training:  0.6921874284744263
loss derivatives for model: tensor([[ 0.0067,  0.1506,  0.0392,  ...,  0.0411, -0.4866,  0.1249],
        [ 0.0030,  0.1367,  0.0262,  ...,  0.0277,  0.6176,  0.1086],
        [ 0.0048,  0.1455,  0.0333,  ...,  0.0350,  0.5589,  0.1185],
        ...,
        [ 0.0048,  0.1455,  0.0332,  ...,  0.0350,  0.5593,  0.1184],
        [ 0.0066, -0.8496,  0.0388,  ...,  0.0407,  0.5163,  0.1245],
        [ 0.0010,  0.1133,  0.0148,  ...,  0.0159, -0.2744,  0.0853]])
average training loss for model1 : 3.3118808269500732



##### look at model on 2nd batch

In [29]:
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 3.3118808269500732


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.534925,1.0,1.0,1.0,31.534925,0.0
1,IPLLR,2.0,1.468945,1.0,1.0,1.0,1.468945,0.0
2,IPLLR,3.0,1.020643,1.0,1.0,1.0,1.020643,0.0
3,IPLLR,4.0,1.016827,1.0,1.0,1.0,1.016827,0.0
4,IPLLR,5.0,1.016429,1.0,1.0,1.0,1.016429,0.0
5,IPLLR,6.0,1.018574,1.0,1.0,1.0,1.018574,0.0
6,IPLLR,7.0,1.090831,1.0,1.0,1.0,1.090831,0.0


In [30]:
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_2, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.4998767375946045


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.534925,1.0,0.998846,0.012595,31.529808,0.0
1,IPLLR,2.0,1.468086,0.999372,0.968938,0.380584,1.279623,0.0
2,IPLLR,3.0,0.501591,0.487325,0.486204,0.001325,0.500457,0.0
3,IPLLR,4.0,0.495481,0.487315,0.486703,0.000651,0.494852,0.0
4,IPLLR,5.0,0.492699,0.484712,0.484052,0.000708,0.49203,0.0
5,IPLLR,6.0,0.490869,0.481891,0.481141,0.000854,0.490121,0.0
6,IPLLR,7.0,0.521685,0.478114,0.478063,6e-05,0.521635,0.0


In [31]:
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 3.261470317840576


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.36364,0.991881,0.991881,0.991881,31.36364,0.0
1,IPLLR,2.0,1.462198,0.994578,0.994578,0.994578,1.462198,0.0
2,IPLLR,3.0,1.013887,0.993334,0.993334,0.993334,1.013887,0.0
3,IPLLR,4.0,1.010134,0.993411,0.993411,0.993411,1.010134,0.0
4,IPLLR,5.0,1.009836,0.993517,0.993517,0.993517,1.009836,0.0
5,IPLLR,6.0,1.011979,0.993526,0.993526,0.993526,1.011979,0.0
6,IPLLR,7.0,1.083773,0.993529,0.993529,0.993529,1.083773,0.0


In [32]:
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_2, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.4834163188934326


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.36364,0.991881,0.990722,0.012454,31.358612,0.0
1,IPLLR,2.0,1.46135,0.993957,0.963408,0.37775,1.273524,0.0
2,IPLLR,3.0,0.497874,0.483646,0.482528,0.001319,0.496739,0.0
3,IPLLR,4.0,0.491687,0.483569,0.482961,0.000647,0.49106,0.0
4,IPLLR,5.0,0.48899,0.481064,0.48041,0.000703,0.488326,0.0
5,IPLLR,6.0,0.487185,0.478274,0.47753,0.000847,0.486443,0.0
6,IPLLR,7.0,0.517774,0.474529,0.474479,5.9e-05,0.517724,0.0


### Train further

In [33]:
# ['model', 'layer', 'h_1', 'Delta_h_1', 'Delta_h', 'delta_h', 'h', 'id']
for i in range(3, n_steps):
    print('----  step {}  ----'.format(i))
    
    # train model
    ipllr_previous = deepcopy(ipllr)
    x, y = batches[i]
    train_model_one_step(ipllr, x, y, batch_size)
    
    # compute contributions
    next_batch = batches[(i+1) % len(batches)]
    contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1,ipllr_previous, 
                                                         batches=[next_batch])
    print(contributions_df)
    
    print('\n\n')

----  step 3  ----
input abs mean in training:  0.6968955397605896
loss derivatives for model: tensor([[-0.9723,  0.1500,  0.0723,  ...,  0.0742,  0.2913,  0.1356],
        [ 0.0325, -0.8533,  0.0765,  ...,  0.0783,  0.2648,  0.1340],
        [ 0.0250,  0.1518,  0.0697,  ...,  0.0716,  0.3080,  0.1362],
        ...,
        [ 0.0264,  0.1509,  0.0711,  ...,  0.0729, -0.7008,  0.1359],
        [ 0.0303,  0.1482, -0.9253,  ...,  0.0765,  0.2766,  0.1348],
        [ 0.0209,  0.1540,  0.0651,  ...,  0.0671,  0.3371,  0.1367]])
average training loss for model1 : 2.549348831176758

average validation loss for IPLLR : 2.3944790363311768
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.798203   1.010580  1.008690  0.003613  31.791996  0.0
1  IPLLR    2.0   1.480342   1.006744  1.000196  0.153380   1.247269  0.0
2  IPLLR    3.0   0.347209   0.335489  0.334369  0.000377   0.346081  0.0
3  IPLLR    4.0   0.341419   0.335827  0.335301  0.000117   0.3408

input abs mean in training:  0.6905764937400818
loss derivatives for model: tensor([[ 0.0675,  0.1208,  0.0940,  ...,  0.0948,  0.1517,  0.1166],
        [ 0.0661,  0.1217,  0.0936,  ...,  0.0944,  0.1545,  0.1173],
        [ 0.0620,  0.1247,  0.0922,  ...,  0.0932,  0.1639, -0.8805],
        ...,
        [ 0.0499,  0.1336,  0.0874,  ...,  0.0886, -0.8034,  0.1260],
        [ 0.0412, -0.8597,  0.0827,  ...,  0.0842,  0.2266,  0.1303],
        [ 0.0536,  0.1308,  0.0891,  ...,  0.0902,  0.1854, -0.8760]])
average training loss for model1 : 2.2889041900634766

average validation loss for IPLLR : 2.280773878097534
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.889481   1.002257  0.999226  0.000527  31.882950  0.0
1  IPLLR    2.0   1.479133   1.002897  1.072731  0.024824   1.241741  0.0
2  IPLLR    3.0   0.198306   0.190277  0.189414  0.000034   0.197446  0.0
3  IPLLR    4.0   0.196291   0.193134  0.192786  0.000004   0.195935  0.0
4  IPLLR   

input abs mean in training:  0.6988633871078491
loss derivatives for model: tensor([[ 0.0591,  0.1268,  0.0912,  ..., -0.9077,  0.1708,  0.1211],
        [-0.9149,  0.1090,  0.0980,  ...,  0.0983,  0.1201,  0.1074],
        [ 0.0537,  0.1308,  0.0891,  ...,  0.0903, -0.8150,  0.1239],
        ...,
        [ 0.0694,  0.1195, -0.9055,  ...,  0.0953,  0.1477,  0.1156],
        [ 0.0459, -0.8633,  0.0854,  ...,  0.0868,  0.2092,  0.1279],
        [ 0.0749,  0.1157,  0.0959,  ...,  0.0965,  0.1371,  0.1127]])
average training loss for model1 : 2.2481472492218018

average validation loss for IPLLR : 2.245486259460449
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  31.602627   1.004465  1.001286  6.282989e-04  31.596460  0.0
1  IPLLR    2.0   1.470874   0.999961  1.107765  1.728764e-02   1.265401  0.0
2  IPLLR    3.0   0.179503   0.172596  0.171888  1.242935e-05   0.178799  0.0
3  IPLLR    4.0   0.179777   0.176934  0.176620  2.461631e-06   0.179

input abs mean in training:  0.6967038512229919
loss derivatives for model: tensor([[ 0.0649,  0.1227,  0.0932,  ...,  0.0941,  0.1572,  0.1180],
        [ 0.0794,  0.1127,  0.0969,  ...,  0.0974,  0.1292,  0.1103],
        [ 0.0773,  0.1141,  0.0964,  ..., -0.9030,  0.1329,  0.1114],
        ...,
        [ 0.0710,  0.1184,  0.0949,  ...,  0.0957,  0.1444,  0.1147],
        [ 0.0771,  0.1143,  0.0964,  ...,  0.0970,  0.1332, -0.8884],
        [ 0.0773,  0.1141,  0.0965,  ...,  0.0970,  0.1327,  0.1114]])
average training loss for model1 : 2.2633538246154785

average validation loss for IPLLR : 2.2480952739715576
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  31.604149   0.981297  0.977988  3.823846e-04  31.598583  0.0
1  IPLLR    2.0   1.460970   0.989375  1.130932  1.419200e-02   1.286291  0.0
2  IPLLR    3.0   0.170306   0.164259  0.163676  9.590603e-06   0.169728  0.0
3  IPLLR    4.0   0.171380   0.168689  0.168401  4.371662e-07   0.17

input abs mean in training:  0.6930533647537231
loss derivatives for model: tensor([[ 0.0536,  0.1311,  0.0891,  ...,  0.0903, -0.8147,  0.1239],
        [ 0.0503,  0.1336,  0.0876,  ..., -0.9111,  0.1950,  0.1257],
        [ 0.0365, -0.8558,  0.0797,  ...,  0.0813,  0.2453,  0.1323],
        ...,
        [ 0.0614,  0.1253,  0.0921,  ..., -0.9070,  0.1651,  0.1198],
        [ 0.0855,  0.1088,  0.0980,  ..., -0.9016,  0.1195,  0.1072],
        [ 0.0271, -0.8489,  0.0719,  ...,  0.0737,  0.2940,  0.1357]])
average training loss for model1 : 2.224065065383911

average validation loss for IPLLR : 2.2200927734375
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.681572   0.999149  0.996101  0.000452  31.676428  0.0
1  IPLLR    2.0   1.474881   1.000851  1.157167  0.009522   1.325224  0.0
2  IPLLR    3.0   0.189425   0.183419  0.182913  0.000011   0.188926  0.0
3  IPLLR    4.0   0.191946   0.188947  0.188642  0.000004   0.191636  0.0
4  IPLLR    5.

input abs mean in training:  0.6920092701911926
loss derivatives for model: tensor([[ 0.0497,  0.1342,  0.0873,  ...,  0.0886, -0.8034,  0.1259],
        [ 0.0614,  0.1254,  0.0920,  ...,  0.0930,  0.1651,  0.1198],
        [ 0.0285, -0.8496,  0.0733,  ...,  0.0751,  0.2851,  0.1352],
        ...,
        [ 0.0837,  0.1100,  0.0977,  ...,  0.0981,  0.1222,  0.1081],
        [ 0.0425, -0.8602,  0.0835,  ...,  0.0850,  0.2210,  0.1295],
        [ 0.0390,  0.1425,  0.0814,  ..., -0.9171,  0.2346,  0.1312]])
average training loss for model1 : 2.231389045715332

average validation loss for IPLLR : 2.226576566696167
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.653313   0.998587  0.995523  0.000363  31.648540  0.0
1  IPLLR    2.0   1.475554   1.001184  1.179632  0.009401   1.345382  0.0
2  IPLLR    3.0   0.185586   0.179977  0.179568  0.000006   0.185184  0.0
3  IPLLR    4.0   0.188546   0.185611  0.185333  0.000002   0.188263  0.0
4  IPLLR    

input abs mean in training:  0.6961998343467712
loss derivatives for model: tensor([[ 0.0925,  0.1045,  0.0991,  ...,  0.0993,  0.1095,  0.1037],
        [ 0.0295, -0.8500,  0.0741,  ...,  0.0759,  0.2794,  0.1348],
        [ 0.0206, -0.8442,  0.0648,  ...,  0.0668,  0.3380,  0.1364],
        ...,
        [-0.9036,  0.1021,  0.0996,  ...,  0.0997,  0.1044,  0.1017],
        [ 0.0821,  0.1111,  0.0974,  ...,  0.0979,  0.1247,  0.1089],
        [-0.9048,  0.1028,  0.0995,  ...,  0.0996,  0.1059,  0.1023]])
average training loss for model1 : 2.2291080951690674

average validation loss for IPLLR : 2.2199056148529053
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  31.193043   0.970264  0.967379  3.378532e-04  31.188856  0.0
1  IPLLR    2.0   1.448895   0.979388  1.178893  7.473791e-03   1.339821  0.0
2  IPLLR    3.0   0.174927   0.169784  0.169461  6.642363e-06   0.174610  0.0
3  IPLLR    4.0   0.177991   0.175227  0.174983  2.733573e-06   0.17