In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Directory the kernel was started from; every other path is derived from it.
cwd = os.getcwd()

# NOTEBOOK_DIR is the parent of the working directory; ROOT (the directory
# later appended to sys.path) sits three further levels above it.
NOTEBOOK_DIR = os.path.dirname(cwd)
ROOT = NOTEBOOK_DIR
for _level in range(3):
    ROOT = os.path.dirname(ROOT)

# Output folder for figures and the YAML file holding the model configuration.
FIGURES_DIR = os.path.join(ROOT, 'figures/abc_parameterizations/initialization')
CONFIG_PATH = os.path.join(
    ROOT, 'pytorch/configs/abc_parameterizations', 'fc_ipllr_mnist.yaml'
)

In [3]:
import sys

# Make the project packages (`utils`, `pytorch`) importable from ROOT.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [4]:
import os
from copy import deepcopy
import torch
import math
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, Subset, DataLoader
import torch.nn.functional as F

from utils.tools import read_yaml, set_random_seeds
from pytorch.configs.base import BaseConfig
from pytorch.configs.model import ModelConfig
from pytorch.models.abc_params.fully_connected.ipllr import FcIPLLR
from pytorch.models.abc_params.fully_connected.muP import FCmuP
from pytorch.models.abc_params.fully_connected.ntk import FCNTK
from pytorch.models.abc_params.fully_connected.standard_fc_ip import StandardFCIP
from utils.data.mnist import load_data
from utils.abc_params.debug_ipllr import *

### Load basic configuration and define variables 

In [5]:
# Experiment hyper-parameters.
N_TRIALS = 1  # not referenced in the cells shown here — TODO confirm it is still needed
SEED = 30
L = 6  # the architecture is configured with n_layers = L + 1
width = 1024
n_warmup_steps = 1  # forwarded to the 'warmup_switch' scheduler params
batch_size = 128
base_lr = 0.01
n_steps = 50  # upper bound of the "Train further" loops

set_random_seeds(SEED)  # set random seed for reproducibility
# The YAML configuration is read in the next cell, right before it is modified,
# so it is not loaded a second time here.

In [6]:
# Start from the YAML defaults, then override what this experiment changes.
config_dict = read_yaml(CONFIG_PATH)

architecture = config_dict['architecture']
input_size = architecture['input_size']
architecture['width'] = width
architecture['n_layers'] = L + 1

config_dict['optimizer']['params']['lr'] = base_lr

# Replace the scheduler section wholesale with a 'warmup_switch' scheduler.
scheduler_params = {'n_warmup_steps': n_warmup_steps,
                    'calibrate_base_lr': True,
                    'default_calibration': False}
config_dict['scheduler'] = {'name': 'warmup_switch', 'params': scheduler_params}

base_model_config = ModelConfig(config_dict)

### Load data & define model

In [7]:
# Load the MNIST train/test datasets with flattened inputs.
# download=False: presumably the data must already be on disk — TODO confirm.
training_dataset, test_dataset = load_data(download=False, flatten=True)
train_data_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
# Materialise both loaders as lists so that individual batches can be indexed
# and re-used by the cells below. Keep the statement order as is: it may
# influence how the shuffled loader consumes the random state.
test_batches = list(DataLoader(test_dataset, shuffle=False, batch_size=batch_size))
batches = list(train_data_loader)
eval_batch = test_batches[0]

## Look at model at different steps of training

### lr = 0.01

In [8]:
# Build the IP-LLR network; the training batches are passed in for learning-rate calibration.
# NOTE(review): n_warmup_steps is hard-coded to 12 here while the scheduler section of the
# config was given n_warmup_steps = 1 — confirm which value is intended.
ipllr = FcIPLLR(base_model_config, n_warmup_steps=12, lr_calibration_batches=batches)

initial base lr : [0.06567156314849854, 0.04262147098779678, 0.8678671717643738, 1.53627347946167, 1.5419093370437622, 1.4907394647598267, 3.480341911315918]


### 1. Initial model : t=0

In [9]:
# Frozen copy of the freshly built model (t=0), used as the reference in the comparisons below.
ipllr_0 = deepcopy(ipllr)

### 2. After one step of SGD : t=1

In [10]:
# First SGD step, on the first training batch.
x, y = batches[0]
# train_model_one_step is expected to update `ipllr` in place — the snapshot below relies on it.
train_model_one_step(ipllr, x, y, batch_size)
# Frozen copy of the model after one step (t=1).
ipllr_1 = deepcopy(ipllr)

input abs mean in training:  0.7020161151885986
loss derivatives for model: tensor([[-0.9000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        ...,
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000, -0.9000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000, -0.9000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000]])
average training loss for model1 : 2.3025827407836914



##### Look at model_1 on the 2nd batch

In [11]:
# Evaluate the model after one step on the 2nd training batch (index 1).
# No training happened since ipllr_1 was taken, so `ipllr` holds the t=1 weights here.
# Positional arguments appear to be (label, current model, t=0 snapshot, t=1 snapshot,
# previous-step snapshot) — TODO confirm against utils.abc_params.debug_ipllr.
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 4.361799716949463


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.256001,1.0,1.0,1.0,31.256001,0.0
1,IPLLR,2.0,1.474337,1.0,1.0,1.0,1.474337,0.0
2,IPLLR,3.0,1.021131,1.0,1.0,1.0,1.021131,0.0
3,IPLLR,4.0,1.019765,1.0,1.0,1.0,1.019765,0.0
4,IPLLR,5.0,1.019644,1.0,1.0,1.0,1.019644,0.0
5,IPLLR,6.0,1.019809,1.0,1.0,1.0,1.019809,0.0
6,IPLLR,7.0,1.116286,1.0,1.0,1.0,1.116286,0.0


##### Look at model_1 on the 3rd batch

In [12]:
# Same analysis as on the 2nd batch, now on the 3rd training batch (index 2).
# `ipllr` still holds the t=1 weights at this point.
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 4.5204973220825195


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.156452,1.0148,1.0148,1.0148,31.156452,0.0
1,IPLLR,2.0,1.487203,1.0136,1.0136,1.0136,1.487203,0.0
2,IPLLR,3.0,1.030404,1.008917,1.008917,1.008917,1.030404,0.0
3,IPLLR,4.0,1.02962,1.009682,1.009682,1.009682,1.02962,0.0
4,IPLLR,5.0,1.029527,1.009693,1.009693,1.009693,1.029527,0.0
5,IPLLR,6.0,1.029693,1.009692,1.009692,1.009692,1.029693,0.0
6,IPLLR,7.0,1.127104,1.009692,1.009692,1.009692,1.127104,0.0


##### Look at model_1 averaged over 16 further batches (batches[4:20])

In [13]:
# Evaluate the t=1 model on 16 training batches (indices 4..19); `batch` holds a list of batches here.
batch = batches[4:20]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=batch)

# Average every numeric column per layer. numeric_only=True skips the string column
# 'model'; without it pandas >= 2.0 raises a TypeError instead of silently dropping it.
contributions_by_layer = contributions_df.groupby(by='layer').mean(numeric_only=True)
contributions_by_layer

average validation loss for IPLLR : 4.488128691911697


Unnamed: 0_level_0,h_1,Delta_h_1,Delta_h,delta_h,h,id
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,31.082361,1.008691,1.008691,1.008691,31.082361,7.5
2.0,1.475015,1.003997,1.003997,1.003997,1.475015,7.5
3.0,1.022582,1.001296,1.001296,1.001296,1.022582,7.5
4.0,1.021262,1.001467,1.001467,1.001467,1.021262,7.5
5.0,1.021204,1.001529,1.001529,1.001529,1.021204,7.5
6.0,1.021376,1.001537,1.001537,1.001537,1.021376,7.5
7.0,1.118003,1.001538,1.001538,1.001538,1.118003,7.5


### 3. After 2 steps of SGD : t=2

In [14]:
# Second SGD step, on the 2nd training batch, with hand-picked per-group learning rates.
x, y = batches[1]

# The current lr of each parameter group is multiplied by (lrs_t_2[k] / base_lr).
lrs_t_2 = [0.01, 0.0001, 0.1, 0.1, 1.0, 1.0, 1.0]
for group_index, param_group in enumerate(ipllr.optimizer.param_groups):
    lr_scale = lrs_t_2[group_index] / base_lr
    param_group['lr'] = lr_scale * param_group['lr']

train_model_one_step(ipllr, x, y, batch_size)

# Frozen copy of the model after two steps (t=2).
ipllr_2 = deepcopy(ipllr)

input abs mean in training:  0.6987618207931519
loss derivatives for model: tensor([[ 2.8254e-03,  2.9189e-03,  7.5742e-03,  ...,  4.4570e-03,
          9.5627e-01,  7.3971e-03],
        [ 5.3059e-03,  5.4614e-03,  1.2712e-02,  ...,  7.9467e-03,
          9.2477e-01,  1.2449e-02],
        [ 5.4005e-03,  5.5582e-03,  1.2898e-02,  ...,  8.0764e-03,
          9.2361e-01, -9.8737e-01],
        ...,
        [ 6.3855e-03, -9.9343e-01,  1.4791e-02,  ...,  9.4155e-03,
          9.1174e-01,  1.4496e-02],
        [ 5.9272e-03,  6.0966e-03,  1.3918e-02,  ...,  8.7947e-03,
          9.1723e-01,  1.3635e-02],
        [ 3.4016e-04,  3.5567e-04, -9.9869e-01,  ...,  6.3515e-04,
          9.9283e-01,  1.2715e-03]])
average training loss for model1 : 4.361799716949463



##### Look at model_1 and model_2 on the 2nd and 3rd batches

In [15]:
# Baseline for the t=2 comparison: the frozen t=1 snapshot evaluated on the 2nd batch (index 1).
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 4.361799716949463


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.256001,1.0,1.0,1.0,31.256001,0.0
1,IPLLR,2.0,1.474337,1.0,1.0,1.0,1.474337,0.0
2,IPLLR,3.0,1.021131,1.0,1.0,1.0,1.021131,0.0
3,IPLLR,4.0,1.019765,1.0,1.0,1.0,1.019765,0.0
4,IPLLR,5.0,1.019644,1.0,1.0,1.0,1.019644,0.0
5,IPLLR,6.0,1.019809,1.0,1.0,1.0,1.019809,0.0
6,IPLLR,7.0,1.116286,1.0,1.0,1.0,1.116286,0.0


In [16]:
# Model after two steps on the 2nd batch (index 1), with the t=1 snapshot as the previous step.
# The frozen ipllr_2 snapshot is used instead of the live `ipllr`, so that re-running this
# cell after further training still analyses the t=2 weights.
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_2, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.302395820617676


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.256001,1.0,0.992324,0.253125,31.154274,0.0
1,IPLLR,2.0,1.456719,0.987054,0.956897,0.070879,1.40967,0.0
2,IPLLR,3.0,0.888219,0.868857,0.766284,0.360034,0.774832,0.0
3,IPLLR,4.0,0.25455,0.244716,0.22906,0.114654,0.235824,0.0
4,IPLLR,5.0,0.056806,0.054493,0.300791,0.295681,0.301175,0.0
5,IPLLR,6.0,0.002476,0.00214,0.012306,0.011163,0.012561,0.0
6,IPLLR,7.0,0.000962,0.000998,0.000969,6.3e-05,0.000939,0.0


In [17]:
# Baseline for the t=2 comparison: the frozen t=1 snapshot evaluated on the 3rd batch (index 2).
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 4.5204973220825195


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.156452,1.0148,1.0148,1.0148,31.156452,0.0
1,IPLLR,2.0,1.487203,1.0136,1.0136,1.0136,1.487203,0.0
2,IPLLR,3.0,1.030404,1.008917,1.008917,1.008917,1.030404,0.0
3,IPLLR,4.0,1.02962,1.009682,1.009682,1.009682,1.02962,0.0
4,IPLLR,5.0,1.029527,1.009693,1.009693,1.009693,1.029527,0.0
5,IPLLR,6.0,1.029693,1.009692,1.009692,1.009692,1.029693,0.0
6,IPLLR,7.0,1.127104,1.009692,1.009692,1.009692,1.127104,0.0


In [18]:
# Model after two steps on the 3rd batch (index 2), with the t=1 snapshot as the previous step.
# The frozen ipllr_2 snapshot is used instead of the live `ipllr`, so that re-running this
# cell after further training still analyses the t=2 weights.
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_2, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.316787004470825


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,31.156452,1.0148,1.005777,0.25715,31.052893,0.0
1,IPLLR,2.0,1.469167,1.000363,1.109881,0.714508,1.308319,0.0
2,IPLLR,3.0,0.217763,0.208555,0.199138,0.012529,0.208187,0.0
3,IPLLR,4.0,0.174351,0.17059,0.166196,0.004787,0.169865,0.0
4,IPLLR,5.0,0.157268,0.154092,0.15002,0.004402,0.153048,0.0
5,IPLLR,6.0,0.142169,0.139234,0.13582,0.00409,0.138581,0.0
6,IPLLR,7.0,0.142714,0.127164,0.12702,0.000257,0.142569,0.0


In [None]:
# Contributions of the t=2 snapshot, given the t=0 and t=1 snapshots, on the 4th batch (index 3).
batch = batches[3]
contributions_df = compute_contributions_with_previous('IPLLR', ipllr_2, ipllr_0, ipllr_1, batches=[batch])

# Relabel the columns with explicit time indices, then show everything except the trailing 'id'.
step_2_columns = ['model', 'layer', 'h_0', 'h_1', 'Delta_h_1', 'Delta_h_2', 'delta_h_2', 'h_2', 'id']
contributions_df.columns = step_2_columns
contributions_df.loc[:, step_2_columns[:-1]]

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim and was never executed;
# it is a candidate for deletion.
# Contributions of the t=2 snapshot, given the t=0 and t=1 snapshots, on the 4th batch (index 3).
batch = batches[3]
contributions_df = compute_contributions_with_previous('IPLLR', ipllr_2, ipllr_0, ipllr_1, batches=[batch])
# Relabel the columns with explicit time indices, then show everything except 'id'.
contributions_df.columns = ['model', 'layer', 'h_0', 'h_1', 'Delta_h_1', 'Delta_h_2', 'delta_h_2', 'h_2', 'id']
contributions_df.loc[:, ['model', 'layer', 'h_0', 'h_1', 'Delta_h_1', 'Delta_h_2', 'delta_h_2', 'h_2']]

### Train further

In [19]:
# Keep training one batch at a time and, after every step, print the per-layer
# contributions measured on the following batch.
# Columns of the printed frame:
# ['model', 'layer', 'h_1', 'Delta_h_1', 'Delta_h', 'delta_h', 'h', 'id']
# NOTE(review): the loop starts at batches[3]; batches[2] is never used for training — confirm intended.
n_batches = len(batches)
for i in range(3, n_steps):
    print('----  step {}  ----'.format(i))

    # train model (snapshot the weights first so the step can be compared with its predecessor)
    ipllr_previous = deepcopy(ipllr)
    # Wrap around like next_batch below, so n_steps > len(batches) does not raise IndexError.
    x, y = batches[i % n_batches]
    train_model_one_step(ipllr, x, y, batch_size)

    # compute contributions
    next_batch = batches[(i + 1) % n_batches]
    contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_previous,
                                                         batches=[next_batch])
    print(contributions_df)

    print('\n\n')

----  step 3  ----
input abs mean in training:  0.68505859375
loss derivatives for model: tensor([[ 0.0893, -0.9103,  0.1002,  ...,  0.0942,  0.1741,  0.1000],
        [ 0.0899,  0.0902,  0.1003,  ...,  0.0946,  0.1705,  0.1001],
        [ 0.0902,  0.0905, -0.8996,  ...,  0.0948,  0.1683,  0.1001],
        ...,
        [ 0.0898,  0.0902,  0.1003,  ...,  0.0946,  0.1706, -0.8999],
        [ 0.0919,  0.0922,  0.1007,  ...,  0.0959,  0.1558,  0.1005],
        [ 0.0817,  0.0823,  0.0976,  ...,  0.0887, -0.7706,  0.0972]])
average training loss for model1 : 2.3489251136779785

average validation loss for IPLLR : 2.322089433670044
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  30.530596   0.994678  0.986242  0.006975  30.432518  0.0
1  IPLLR    2.0   1.431061   0.974879  1.086994  0.030785   1.275479  0.0
2  IPLLR    3.0   0.199113   0.190596  0.181802  0.000666   0.190177  0.0
3  IPLLR    4.0   0.157714   0.154300  0.150240  0.000110   0.153574  0

input abs mean in training:  0.6977292895317078
loss derivatives for model: tensor([[ 0.0915,  0.0919,  0.1006,  ...,  0.0956,  0.1587,  0.1004],
        [ 0.0866,  0.0871,  0.0995,  ...,  0.0923,  0.1937,  0.0992],
        [ 0.0920,  0.0923,  0.1007,  ...,  0.0959,  0.1552,  0.1005],
        ...,
        [-0.9113,  0.0891,  0.1001,  ...,  0.0938,  0.1786,  0.0998],
        [ 0.0907,  0.0911,  0.1005,  ...,  0.0951,  0.1644,  0.1003],
        [-0.9080,  0.0923,  0.1007,  ...,  0.0959,  0.1554,  0.1005]])
average training loss for model1 : 2.3104963302612305

average validation loss for IPLLR : 2.3098134994506836
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  30.640430   0.994016  0.985917  0.003775  30.540039  0.0
1  IPLLR    2.0   1.444640   0.981897  1.122952  0.015775   1.296767  0.0
2  IPLLR    3.0   0.172779   0.164890  0.156650  0.000267   0.164398  0.0
3  IPLLR    4.0   0.134273   0.131340  0.127732  0.000037   0.130594  0.0
4  IPLLR  

input abs mean in training:  0.7002273797988892
loss derivatives for model: tensor([[ 0.0863,  0.0867,  0.0994,  ...,  0.0921,  0.1961,  0.0990],
        [ 0.0956,  0.0958,  0.1008,  ...,  0.0980,  0.1302,  0.1007],
        [ 0.0941,  0.0943,  0.1009,  ...,  0.0971,  0.1410,  0.1007],
        ...,
        [ 0.0949,  0.0951,  0.1009,  ...,  0.0976,  0.1350, -0.8993],
        [ 0.0946,  0.0948,  0.1009,  ...,  0.0974,  0.1371,  0.1007],
        [ 0.0944, -0.9054,  0.1009,  ...,  0.0973,  0.1390,  0.1007]])
average training loss for model1 : 2.2762668132781982

average validation loss for IPLLR : 2.2568728923797607
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.141968   1.049508  1.041968  0.005319  31.037193  0.0
1  IPLLR    2.0   1.479765   1.014757  1.171534  0.012371   1.334609  0.0
2  IPLLR    3.0   0.171380   0.163874  0.155786  0.000118   0.163137  0.0
3  IPLLR    4.0   0.135437   0.132487  0.128819  0.000009   0.131695  0.0
4  IPLLR  

input abs mean in training:  0.6936290264129639
loss derivatives for model: tensor([[ 0.0972,  0.0973,  0.1007,  ...,  0.0988,  0.1192,  0.1006],
        [ 0.0925,  0.0928,  0.1008,  ...,  0.0962,  0.1516,  0.1005],
        [ 0.0837,  0.0842,  0.0985,  ...,  0.0902,  0.2147,  0.0980],
        ...,
        [-0.9045,  0.0957,  0.1009,  ...,  0.0979,  0.1311,  0.1007],
        [ 0.0960,  0.0962,  0.1008,  ...,  0.0982,  0.1273,  0.1007],
        [ 0.0935,  0.0938,  0.1008,  ...,  0.0968,  0.1451,  0.1006]])
average training loss for model1 : 2.219236135482788

average validation loss for IPLLR : 2.2540366649627686
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  30.912212   0.997137  0.993548  0.009369  30.818539  0.0
1  IPLLR    2.0   1.445455   0.982007  1.156763  0.019313   1.323012  0.0
2  IPLLR    3.0   0.171179   0.163976  0.156646  0.000312   0.163676  0.0
3  IPLLR    4.0   0.137914   0.134943  0.131353  0.000069   0.134247  0.0
4  IPLLR   

input abs mean in training:  0.6915256977081299
loss derivatives for model: tensor([[ 0.0953,  0.0955,  0.1009,  ...,  0.0978,  0.1326,  0.1007],
        [ 0.0923,  0.0926,  0.1007,  ...,  0.0961,  0.1532, -0.8995],
        [ 0.0899,  0.0903,  0.1004,  ...,  0.0946,  0.1701,  0.1000],
        ...,
        [ 0.0964,  0.0965,  0.1008,  ...,  0.0984,  0.1250,  0.1007],
        [ 0.0836,  0.0842,  0.0984,  ...,  0.0901, -0.7845,  0.0980],
        [ 0.0948, -0.9049,  0.1009,  ...,  0.0975,  0.1357,  0.1007]])
average training loss for model1 : 2.247668743133545

average validation loss for IPLLR : 2.2915499210357666
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.358624   0.976912  0.971639  0.005002  31.266533  0.0
1  IPLLR    2.0   1.468087   0.990734  1.184224  0.008900   1.348012  0.0
2  IPLLR    3.0   0.170411   0.163330  0.156345  0.000098   0.163271  0.0
3  IPLLR    4.0   0.140673   0.137703  0.134218  0.000024   0.137114  0.0
4  IPLLR   

input abs mean in training:  0.6917909979820251
loss derivatives for model: tensor([[ 0.0975,  0.0977,  0.1006,  ...,  0.0989,  0.1170,  0.1005],
        [-0.9029,  0.0973,  0.1007,  ...,  0.0987,  0.1198,  0.1006],
        [ 0.0978,  0.0979,  0.1006,  ...,  0.0991,  0.1150,  0.1005],
        ...,
        [ 0.0929,  0.0932,  0.1008,  ..., -0.9036,  0.1491,  0.1005],
        [ 0.0794, -0.9199,  0.0966,  ...,  0.0869,  0.2463,  0.0960],
        [ 0.0804,  0.0810,  0.0970,  ...,  0.0876,  0.2392,  0.0965]])
average training loss for model1 : 2.2458183765411377

average validation loss for IPLLR : 2.2694180011749268
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  31.197762   0.988841  0.989575  0.004064  31.112232  0.0
1  IPLLR    2.0   1.450926   0.983825  1.193962  0.009347   1.354017  0.0
2  IPLLR    3.0   0.173940   0.167048  0.160940  0.000080   0.167661  0.0
3  IPLLR    4.0   0.146331   0.143270  0.139872  0.000017   0.142855  0.0
4  IPLLR  

input abs mean in training:  0.6948956847190857
loss derivatives for model: tensor([[ 0.0701,  0.0709,  0.0910,  ...,  0.0789, -0.6821,  0.0902],
        [ 0.0989,  0.0990,  0.1003,  ...,  0.0996,  0.1076,  0.1003],
        [ 0.0972,  0.0973,  0.1007,  ..., -0.9012,  0.1194,  0.1006],
        ...,
        [ 0.0915,  0.0919, -0.8994,  ...,  0.0956,  0.1586,  0.1003],
        [ 0.0978,  0.0979,  0.1006,  ...,  0.0991,  0.1152,  0.1005],
        [ 0.0692,  0.0700,  0.0903,  ...,  0.0782, -0.6751,  0.0896]])
average training loss for model1 : 2.1941428184509277

average validation loss for IPLLR : 2.2910354137420654
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  30.982382   0.996821  0.997916  0.011625  30.898758  0.0
1  IPLLR    2.0   1.450237   0.984373  1.210795  0.018865   1.364178  0.0
2  IPLLR    3.0   0.164556   0.157907  0.152157  0.000267   0.158623  0.0
3  IPLLR    4.0   0.139341   0.136455  0.133353  0.000075   0.136165  0.0
4  IPLLR  

### lr = 0.001

In [58]:
# Second experiment: same configuration, with the optimizer lr lowered to 0.001.
# NOTE(review): this rebinds the global base_lr; earlier cells that read base_lr would
# pick up 0.001 if re-run afterwards.
# NOTE(review): the random seed is not reset before this run, and the calibrated
# "initial base lr" values printed below differ from the lr = 0.01 run — confirm that
# comparing the two runs from different initialisations is intended.
base_lr = 0.001
config = deepcopy(base_model_config)
config.optimizer.params['lr'] = base_lr

In [59]:
# Rebuild the IP-LLR network with the lr = 0.001 config; this rebinds `ipllr`.
# NOTE(review): n_warmup_steps is hard-coded to 12 here while the scheduler section of the
# base config was given n_warmup_steps = 1 — confirm which value is intended.
ipllr = FcIPLLR(config, n_warmup_steps=12, lr_calibration_batches=batches)

initial base lr : [0.06314382702112198, 0.04407282918691635, 0.8801496028900146, 1.5207953453063965, 1.448553204536438, 1.7072001695632935, 4.226461887359619]


### 1. Initial model : t=0

In [60]:
# Frozen copy of the freshly built model (t=0); overwrites the snapshot of the previous experiment.
ipllr_0 = deepcopy(ipllr)

### 2. After one step of SGD : t=1

In [61]:
# First SGD step, on the first training batch.
x, y = batches[0]
# train_model_one_step is expected to update `ipllr` in place — the snapshot below relies on it.
train_model_one_step(ipllr, x, y, batch_size)
# Frozen copy of the model after one step (t=1); overwrites the snapshot of the previous experiment.
ipllr_1 = deepcopy(ipllr)

input abs mean in training:  0.7020161151885986
loss derivatives for model: tensor([[-0.9000,  0.1000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.9000],
        ...,
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000, -0.9000,  0.1000],
        [ 0.1000,  0.1000,  0.1000,  ...,  0.1000, -0.9000,  0.1000],
        [ 0.1000, -0.9000,  0.1000,  ...,  0.1000,  0.1000,  0.1000]])
average training loss for model1 : 2.3025827407836914



##### Look at model_1 on the 2nd batch

In [62]:
# Evaluate the model after one step on the 2nd training batch (index 1).
# No training happened since ipllr_1 was taken, so `ipllr` holds the t=1 weights here.
# Positional arguments appear to be (label, current model, t=0 snapshot, t=1 snapshot,
# previous-step snapshot) — TODO confirm against utils.abc_params.debug_ipllr.
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 4.183976173400879


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,30.922825,1.0,1.0,1.0,30.922825,0.0
1,IPLLR,2.0,1.487166,1.0,1.0,1.0,1.487166,0.0
2,IPLLR,3.0,1.02213,1.0,1.0,1.0,1.02213,0.0
3,IPLLR,4.0,1.01897,1.0,1.0,1.0,1.01897,0.0
4,IPLLR,5.0,1.020229,1.0,1.0,1.0,1.020229,0.0
5,IPLLR,6.0,1.021049,1.0,1.0,1.0,1.021049,0.0
6,IPLLR,7.0,1.094514,1.0,1.0,1.0,1.094514,0.0


##### Look at model_1 on the 3rd batch

In [63]:
# Same analysis as on the 2nd batch, now on the 3rd training batch (index 2).
# `ipllr` still holds the t=1 weights at this point.
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 4.31351375579834


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,30.742512,1.013753,1.013753,1.013753,30.742512,0.0
1,IPLLR,2.0,1.487305,1.003237,1.003237,1.003237,1.487305,0.0
2,IPLLR,3.0,1.024118,1.001932,1.001932,1.001932,1.024118,0.0
3,IPLLR,4.0,1.022035,1.002994,1.002994,1.002994,1.022035,0.0
4,IPLLR,5.0,1.023353,1.003062,1.003062,1.003062,1.023353,0.0
5,IPLLR,6.0,1.024178,1.003064,1.003064,1.003064,1.024178,0.0
6,IPLLR,7.0,1.097868,1.003064,1.003064,1.003064,1.097868,0.0


##### Look at model_1 averaged over 16 further batches (batches[4:20])

In [64]:
# Evaluate the t=1 model on 16 training batches (indices 4..19); `batch` holds a list of batches here.
batch = batches[4:20]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_0, batches=batch)

# Average every numeric column per layer. numeric_only=True skips the string column
# 'model'; without it pandas >= 2.0 raises a TypeError instead of silently dropping it.
contributions_by_layer = contributions_df.groupby(by='layer').mean(numeric_only=True)
contributions_by_layer

average validation loss for IPLLR : 4.274628937244415


Unnamed: 0_level_0,h_1,Delta_h_1,Delta_h,delta_h,h,id
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,30.841065,1.008896,1.008896,1.008896,30.841065,7.5
2.0,1.488668,1.005045,1.005045,1.005045,1.488668,7.5
3.0,1.024342,1.002154,1.002154,1.002154,1.024342,7.5
4.0,1.021561,1.002538,1.002538,1.002538,1.021561,7.5
5.0,1.02284,1.002559,1.002559,1.002559,1.02284,7.5
6.0,1.023666,1.002563,1.002563,1.002563,1.023666,7.5
7.0,1.09732,1.002563,1.002563,1.002563,1.09732,7.5


### 3. After 2 steps of SGD : t=2

In [65]:
# Second SGD step, on the 2nd training batch. No manual rescaling of the
# per-group learning rates is applied before this step.
x, y = batches[1]
train_model_one_step(ipllr, x, y, batch_size)
# Frozen copy of the model after two steps (t=2).
ipllr_2 = deepcopy(ipllr)

input abs mean in training:  0.6987618207931519
loss derivatives for model: tensor([[ 4.0565e-03,  3.4256e-03,  1.0370e-02,  ...,  5.0280e-03,
          9.4398e-01,  6.3709e-03],
        [ 6.5130e-03,  5.5868e-03,  1.5266e-02,  ...,  7.9142e-03,
          9.1578e-01,  9.8109e-03],
        [ 6.2691e-03,  5.3707e-03,  1.4799e-02,  ...,  7.6300e-03,
          9.1851e-01, -9.9052e-01],
        ...,
        [ 1.0281e-02, -9.9104e-01,  2.2103e-02,  ...,  1.2248e-02,
          8.7500e-01,  1.4855e-02],
        [ 1.1229e-02,  9.8131e-03,  2.3729e-02,  ...,  1.3325e-02,
          8.6507e-01,  1.6092e-02],
        [ 1.0027e-03,  8.0982e-04, -9.9672e-01,  ...,  1.3153e-03,
          9.8297e-01,  1.7740e-03]])
average training loss for model1 : 4.183976173400879



##### Look at model_1 and model_2 on the 2nd and 3rd batches

In [66]:
# Baseline for the t=2 comparison: the frozen t=1 snapshot evaluated on the 2nd batch (index 1).
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 4.183976173400879


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,30.922825,1.0,1.0,1.0,30.922825,0.0
1,IPLLR,2.0,1.487166,1.0,1.0,1.0,1.487166,0.0
2,IPLLR,3.0,1.02213,1.0,1.0,1.0,1.02213,0.0
3,IPLLR,4.0,1.01897,1.0,1.0,1.0,1.01897,0.0
4,IPLLR,5.0,1.020229,1.0,1.0,1.0,1.020229,0.0
5,IPLLR,6.0,1.021049,1.0,1.0,1.0,1.021049,0.0
6,IPLLR,7.0,1.094514,1.0,1.0,1.0,1.094514,0.0


In [67]:
# Frozen t=2 snapshot on the 2nd batch (index 1), with the t=1 snapshot passed as the previous step.
batch = batches[1]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_2, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.363178253173828


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,30.922825,1.0,0.996355,0.023686,30.913288,0.0
1,IPLLR,2.0,1.485437,0.998767,1.087764,0.645969,1.307584,0.0
2,IPLLR,3.0,0.282482,0.270954,0.269614,0.001619,0.281116,0.0
3,IPLLR,4.0,0.269914,0.264937,0.264363,0.000613,0.269315,0.0
4,IPLLR,5.0,0.267752,0.262414,0.261766,0.000676,0.267091,0.0
5,IPLLR,6.0,0.265471,0.25997,0.259368,0.000679,0.264828,0.0
6,IPLLR,7.0,0.281902,0.257412,0.257383,4e-05,0.281873,0.0


In [68]:
# Baseline for the t=2 comparison: the frozen t=1 snapshot evaluated on the 3rd batch (index 2).
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_1, ipllr_0, ipllr_1, ipllr_0, batches=[batch])
contributions_df

average validation loss for IPLLR : 4.31351375579834


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,30.742512,1.013753,1.013753,1.013753,30.742512,0.0
1,IPLLR,2.0,1.487305,1.003237,1.003237,1.003237,1.487305,0.0
2,IPLLR,3.0,1.024118,1.001932,1.001932,1.001932,1.024118,0.0
3,IPLLR,4.0,1.022035,1.002994,1.002994,1.002994,1.022035,0.0
4,IPLLR,5.0,1.023353,1.003062,1.003062,1.003062,1.023353,0.0
5,IPLLR,6.0,1.024178,1.003064,1.003064,1.003064,1.024178,0.0
6,IPLLR,7.0,1.097868,1.003064,1.003064,1.003064,1.097868,0.0


In [69]:
# Frozen t=2 snapshot on the 3rd batch (index 2), with the t=1 snapshot passed as the previous step.
batch = batches[2]
contributions_df = compute_contributions_with_step_1('IPLLR', ipllr_2, ipllr_0, ipllr_1, ipllr_1, batches=[batch])
contributions_df

average validation loss for IPLLR : 2.4003608226776123


Unnamed: 0,model,layer,h_1,Delta_h_1,Delta_h,delta_h,h,id
0,IPLLR,1.0,30.742512,1.013753,1.010026,0.024053,30.732771,0.0
1,IPLLR,2.0,1.485542,1.001977,1.089083,0.645921,1.304728,0.0
2,IPLLR,3.0,0.281531,0.27001,0.268666,0.00162,0.280153,0.0
3,IPLLR,4.0,0.269432,0.264451,0.263877,0.000613,0.268832,0.0
4,IPLLR,5.0,0.267303,0.261974,0.261327,0.000675,0.266643,0.0
5,IPLLR,6.0,0.265026,0.259534,0.258933,0.000678,0.264385,0.0
6,IPLLR,7.0,0.28143,0.256981,0.256952,4e-05,0.281401,0.0


### Train further

In [70]:
# Keep training one batch at a time and, after every step, print the per-layer
# contributions measured on the following batch.
# Columns of the printed frame:
# ['model', 'layer', 'h_1', 'Delta_h_1', 'Delta_h', 'delta_h', 'h', 'id']
# NOTE(review): the loop starts at batches[3]; batches[2] is never used for training — confirm intended.
n_batches = len(batches)
for i in range(3, n_steps):
    print('----  step {}  ----'.format(i))

    # train model (snapshot the weights first so the step can be compared with its predecessor)
    ipllr_previous = deepcopy(ipllr)
    # Wrap around like next_batch below, so n_steps > len(batches) does not raise IndexError.
    x, y = batches[i % n_batches]
    train_model_one_step(ipllr, x, y, batch_size)

    # compute contributions
    next_batch = batches[(i + 1) % n_batches]
    contributions_df = compute_contributions_with_step_1('IPLLR', ipllr, ipllr_0, ipllr_1, ipllr_previous,
                                                         batches=[next_batch])
    print(contributions_df)

    print('\n\n')

----  step 3  ----
input abs mean in training:  0.68505859375
loss derivatives for model: tensor([[ 0.0793, -0.9237,  0.0977,  ...,  0.0832,  0.2666,  0.0877],
        [ 0.0799,  0.0770,  0.0980,  ...,  0.0837,  0.2618,  0.0881],
        [ 0.0835,  0.0809, -0.9003,  ...,  0.0870,  0.2329,  0.0909],
        ...,
        [ 0.0819,  0.0792,  0.0990,  ...,  0.0855,  0.2457, -0.9103],
        [ 0.0817,  0.0789,  0.0989,  ...,  0.0853,  0.2476,  0.0895],
        [ 0.0652,  0.0617,  0.0883,  ...,  0.0699, -0.6205,  0.0755]])
average training loss for model1 : 2.4561309814453125

average validation loss for IPLLR : 2.35770845413208
   model  layer        h_1  Delta_h_1   Delta_h   delta_h          h   id
0  IPLLR    1.0  30.283821   0.996427  0.992213  0.002210  30.274239  0.0
1  IPLLR    2.0   1.456616   0.986709  1.082340  0.087076   1.264694  0.0
2  IPLLR    3.0   0.216055   0.206411  0.205196  0.000231   0.214826  0.0
3  IPLLR    4.0   0.204924   0.201143  0.200674  0.000038   0.204437  0.

input abs mean in training:  0.6977292895317078
loss derivatives for model: tensor([[ 0.0898,  0.0878,  0.1016,  ...,  0.0923,  0.1840,  0.0952],
        [ 0.0841,  0.0816,  0.0999,  ...,  0.0875,  0.2282,  0.0914],
        [ 0.0881,  0.0859,  0.1012,  ...,  0.0909,  0.1974,  0.0941],
        ...,
        [-0.9148,  0.0828,  0.1003,  ...,  0.0885,  0.2195,  0.0922],
        [ 0.0883,  0.0861,  0.1012,  ...,  0.0911,  0.1956,  0.0943],
        [-0.9085,  0.0898,  0.1018,  ...,  0.0938,  0.1697,  0.0963]])
average training loss for model1 : 2.31650447845459

average validation loss for IPLLR : 2.3160600662231445
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  30.451462   0.992619  0.987514  5.873648e-04  30.441479  0.0
1  IPLLR    2.0   1.464677   0.987623  1.125807  2.453955e-02   1.275921  0.0
2  IPLLR    3.0   0.136701   0.129694  0.128743  3.755969e-05   0.135747  0.0
3  IPLLR    4.0   0.128593   0.126217  0.125896  4.081860e-06   0.1282

input abs mean in training:  0.7002273797988892
loss derivatives for model: tensor([[ 0.0870,  0.0847,  0.1009,  ...,  0.0900,  0.2058,  0.0934],
        [ 0.0972,  0.0965,  0.1013,  ...,  0.0982,  0.1236,  0.0992],
        [ 0.0933,  0.0918,  0.1019,  ...,  0.0952,  0.1561,  0.0973],
        ...,
        [ 0.0949,  0.0937,  0.1018,  ...,  0.0964,  0.1432, -0.9018],
        [ 0.0925,  0.0909,  0.1019,  ...,  0.0945,  0.1624,  0.0969],
        [ 0.0954, -0.9057,  0.1018,  ...,  0.0968,  0.1387,  0.0984]])
average training loss for model1 : 2.2678627967834473

average validation loss for IPLLR : 2.244934320449829
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  30.848370   1.052745  1.047096  6.912915e-04  30.837475  0.0
1  IPLLR    2.0   1.512596   1.028281  1.180760  1.680506e-02   1.324864  0.0
2  IPLLR    3.0   0.129699   0.123666  0.122830  9.248817e-06   0.128864  0.0
3  IPLLR    4.0   0.123800   0.121490  0.121176  8.039432e-07   0.123

input abs mean in training:  0.6936290264129639
loss derivatives for model: tensor([[ 0.0980,  0.0974,  0.1011,  ...,  0.0987,  0.1175,  0.0995],
        [ 0.0949,  0.0937,  0.1018,  ...,  0.0964,  0.1429,  0.0982],
        [ 0.0836,  0.0810,  0.0997,  ...,  0.0871,  0.2323,  0.0910],
        ...,
        [-0.9055,  0.0932,  0.1019,  ...,  0.0961,  0.1463,  0.0980],
        [ 0.0966,  0.0958,  0.1015,  ...,  0.0977,  0.1286,  0.0990],
        [ 0.0918,  0.0901,  0.1018,  ...,  0.0940,  0.1678,  0.0965]])
average training loss for model1 : 2.211461067199707

average validation loss for IPLLR : 2.2483439445495605
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  30.692825   0.996284  0.990886  1.065032e-03  30.683002  0.0
1  IPLLR    2.0   1.474473   0.994056  1.167428  2.182889e-02   1.312933  0.0
2  IPLLR    3.0   0.129297   0.123959  0.123235  1.984732e-05   0.128570  0.0
3  IPLLR    4.0   0.124865   0.122526  0.122217  4.919636e-06   0.124

input abs mean in training:  0.6915256977081299
loss derivatives for model: tensor([[ 0.0964,  0.0955,  0.1016,  ...,  0.0976,  0.1305,  0.0989],
        [ 0.0910,  0.0892,  0.1018,  ...,  0.0933,  0.1742, -0.9040],
        [ 0.0935,  0.0920,  0.1019,  ...,  0.0953,  0.1542,  0.0974],
        ...,
        [ 0.0984,  0.0979,  0.1009,  ...,  0.0990,  0.1141,  0.0996],
        [ 0.0870,  0.0847,  0.1009,  ...,  0.0900, -0.7945,  0.0934],
        [ 0.0973, -0.9034,  0.1013,  ...,  0.0982,  0.1232,  0.0992]])
average training loss for model1 : 2.2436749935150146

average validation loss for IPLLR : 2.278656482696533
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  30.996103   0.973673  0.968712  5.810884e-04  30.987118  0.0
1  IPLLR    2.0   1.478619   0.989095  1.187608  1.208900e-02   1.337631  0.0
2  IPLLR    3.0   0.124610   0.119795  0.119187  6.166432e-06   0.123999  0.0
3  IPLLR    4.0   0.121185   0.118916  0.118626  1.564799e-06   0.120

input abs mean in training:  0.6917909979820251
loss derivatives for model: tensor([[ 0.0976,  0.0969,  0.1012,  ...,  0.0984,  0.1209,  0.0993],
        [-0.9022,  0.0972,  0.1011,  ...,  0.0986,  0.1188,  0.0994],
        [ 0.0983,  0.0978,  0.1010,  ...,  0.0989,  0.1147,  0.0996],
        ...,
        [ 0.0932,  0.0917,  0.1019,  ..., -0.9049,  0.1569,  0.0973],
        [ 0.0843, -0.9183,  0.1000,  ...,  0.0876,  0.2273,  0.0915],
        [ 0.0773,  0.0742,  0.0966,  ...,  0.0813,  0.2823,  0.0860]])
average training loss for model1 : 2.2355446815490723

average validation loss for IPLLR : 2.256910562515259
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  30.971605   0.989110  0.984159  5.101226e-04  30.962612  0.0
1  IPLLR    2.0   1.486494   1.002000  1.213922  1.173088e-02   1.363503  0.0
2  IPLLR    3.0   0.129438   0.124749  0.124220  6.126862e-06   0.128907  0.0
3  IPLLR    4.0   0.126957   0.124585  0.124298  1.608980e-06   0.126

input abs mean in training:  0.6948956847190857
loss derivatives for model: tensor([[ 0.0697,  0.0663,  0.0917,  ...,  0.0742, -0.6565,  0.0795],
        [ 0.0994,  0.0992,  0.1004,  ...,  0.0996,  0.1055,  0.0999],
        [ 0.0962,  0.0953,  0.1016,  ..., -0.9026,  0.1322,  0.0988],
        ...,
        [ 0.0945,  0.0932, -0.8981,  ...,  0.0961,  0.1459,  0.0980],
        [ 0.0981,  0.0976,  0.1010,  ...,  0.0988,  0.1161,  0.0995],
        [ 0.0719,  0.0686,  0.0932,  ...,  0.0763, -0.6744,  0.0814]])
average training loss for model1 : 2.184135675430298

average validation loss for IPLLR : 2.288431406021118
   model  layer        h_1  Delta_h_1   Delta_h       delta_h          h   id
0  IPLLR    1.0  30.847225   0.994848  0.989903  1.389207e-03  30.838409  0.0
1  IPLLR    2.0   1.478066   0.993721  1.225063  2.237478e-02   1.366164  0.0
2  IPLLR    3.0   0.121076   0.116858  0.116416  1.897129e-05   0.120634  0.0
3  IPLLR    4.0   0.119278   0.117051  0.116791  6.004289e-06   0.1190