# Experiment: Performance Evaluation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import functools
import sys

import datasets
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
import os
import torch.nn.functional as F

In [3]:
# Single seed for the experiment. Only torch's global RNG is seeded in this cell.
seed = 0
# Use the GPU when torch reports one as available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# `manual_seed` returns the Generator object; being the cell's last expression,
# that return value is what the notebook displays below.
torch.manual_seed(seed)

<torch._C.Generator at 0x125e4af50>

## Generate Data

In [4]:
import math
# Sample sin(x) at the integer points x = 0, 1, ..., 199 (radians) with a single
# vectorised NumPy call instead of a Python-level loop over math.sin.
sin_wave = np.sin(np.arange(200))
# Preview the first 50 samples, i.e. the length of one input window used below.
plt.plot(sin_wave[:50])

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x16ca99b80>]

In [5]:
# Sliding-window supervision: every sample is `seq_len` consecutive points of
# `sin_wave`, and its target is the single point that follows the window.
seq_len = 50
num_records = len(sin_wave) - seq_len
val_num = 100

# The first `num_records - val_num` window start positions form the training split.
train_starts = range(num_records - val_num)

# Windows gain a trailing axis of size 1; targets gain a second axis of size 1.
X = np.expand_dims(np.array([sin_wave[s:s + seq_len] for s in train_starts]), axis=2)
Y = np.expand_dims(np.array([sin_wave[s + seq_len] for s in train_starts]), axis=1)

In [6]:
# Validation split: the last `val_num` window start positions, sliced from
# `sin_wave` with the same window length and next-point target as the training split.
val_starts = range(num_records - val_num, num_records)

# Windows gain a trailing axis of size 1; targets gain a second axis of size 1.
X_val = np.expand_dims(np.array([sin_wave[s:s + seq_len] for s in val_starts]), axis=2)
Y_val = np.expand_dims(np.array([sin_wave[s + seq_len] for s in val_starts]), axis=1)

In [7]:
from torch.utils.data import TensorDataset, DataLoader
batch_size = 10

# Training windows: mini-batches of `batch_size`, shuffled.
train_dataset = TensorDataset(torch.Tensor(X), torch.Tensor(Y))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation windows: same batch size, also shuffled.
val_dataset = TensorDataset(torch.Tensor(X_val), torch.Tensor(Y_val))
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Training rows followed by validation rows, served one sample per batch and
# in a fixed order (shuffle=False).
X_total = np.concatenate((X, X_val), axis=0)
Y_total = np.concatenate((Y, Y_val), axis=0)
total_dataset = TensorDataset(torch.Tensor(X_total), torch.Tensor(Y_total))
total_dataloader = DataLoader(total_dataset, batch_size=1, shuffle=False)

## Train RNN

In [8]:
def train(dataloader, model):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch is read as ``batch[0]`` (inputs) and ``batch[1]`` (targets), moved
    to the module-level ``device``, scored with ``nn.MSELoss`` and followed by one
    step of the module-level ``optimizer``. Both globals must exist before the
    function is called.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    criterion = nn.MSELoss()
    batch_losses = []
    for batch in dataloader:
        inputs, targets = batch[0].to(device), batch[1].to(device)

        batch_loss = criterion(model(inputs), targets)

        # Standard update: clear stale gradients, backpropagate, apply the step.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(dataloader)

def validation(dataloader, model):
    """Return the mean per-batch MSE of ``model`` over ``dataloader`` without training.

    Each batch is read as ``data[0]`` (inputs) and ``data[1]`` (targets) and moved
    to the module-level ``device``. No parameters are updated.

    The forward passes run under ``torch.no_grad()`` so no autograd graph is
    built. If ``model`` is an ``nn.Module`` it is switched to eval mode for the
    duration of the call and its previous train/eval mode is restored afterwards,
    so a subsequent ``train`` call sees the model exactly as it left it.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    loss_func = nn.MSELoss()
    total_loss = 0.0

    # Plain callables (anything that is not an nn.Module) have no mode to toggle.
    is_module = isinstance(model, nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        model.eval()
    try:
        with torch.no_grad():
            for data in dataloader:
                x = data[0].to(device)
                y = data[1].to(device)
                total_loss += loss_func(model(x), y).item()
    finally:
        # Restore the caller's mode even if a batch raised.
        if is_module:
            model.train(was_training)
    return total_loss / len(dataloader)

In [9]:
from models import RNN, SimpleRNN, DNN
from models import count_parameters
learning_rate = 0.1
nepoch = 25              # NOTE: reassigned to 100 at the top of the training cell below
T = 50                   # length of sequence
hidden_dim = 16
output_dim = 1
# Pass the declared constants instead of repeating the literal so the model
# cannot drift out of sync with the configuration above.
model = RNN(input_dim=1, hidden_dim=hidden_dim, output_dim=output_dim)
# Alternative architectures:
# model = SimpleRNN(indim=1, statedim=hidden_dim)
# model = DNN(input_dim=T, hidden_dim=hidden_dim, output_dim=output_dim)


print(f'The model has {count_parameters(model):,} trainable parameters')


# Move the parameters to the target device before the optimizer captures them.
model = model.to(device)
# `train` reads this module-level `optimizer`; it must exist before training starts.
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
for name, p in model.named_parameters():
    print(name)

The model has 321 trainable parameters
readout
rnn1.weight_ih_l0
rnn1.weight_hh_l0
rnn1.bias_ih_l0
rnn1.bias_hh_l0
fc.weight
fc.bias


In [10]:
# Overrides the `nepoch = 25` assigned in the previous cell.
nepoch = 100
# Baseline before any parameter update: `validation` only evaluates, so it is
# used on the training loader as well as the validation loader.
train_loss = validation(train_dataloader, model)
val_loss = validation(val_dataloader, model)
print("before train",train_loss, val_loss)

for epoch in range(nepoch):
    # One optimisation pass over the training loader, then evaluate on the
    # validation loader.
    total_loss = train(train_dataloader, model)
    val_loss = validation(val_dataloader, model)
    # Printed columns: epoch index, mean training loss, mean validation loss.
    print(epoch, total_loss, val_loss)

before train 0.5180685341358184 0.5163480579853058
0 0.468161529302597 0.4002066761255264
1 0.344817578792572 0.2262914076447487
2 0.1617561399936676 0.029301009699702264
3 0.011100260284729302 0.0006977611366892234
4 0.0004666032153181732 0.00044325104681774974
5 0.0004324947658460587 0.0003853964328300208
6 0.00038545726565644147 0.0003357713634613901
7 0.0003242177190259099 0.00042023524292744696
8 0.000373477244284004 0.00033925696625374256
9 0.00037498126621358094 0.00033401835680706425
10 0.0003629056678619236 0.00036270191631047053
11 0.00041539479861967265 0.00033526926272315906
12 0.0003378299152245745 0.0003024856836418621
13 0.0003431990538956597 0.0003033026194316335
14 0.0002856976294424385 0.00033102692395914346
15 0.00030962249729782343 0.00032818067265907305
16 0.00027737206837628035 0.0002918595899245702
17 0.0003103828930761665 0.0003126566225546412
18 0.0003068722289754078 0.00025158340722555297
19 0.00029302039183676243 0.00027545020857360213
20 0.000242086933576501

## Calculate Finite NTK

In [11]:
import gc
import time

from models import clone_grads, paramdot


def normalize_matrix(matrix):
    """Return ``matrix`` divided element-wise by its largest entry.

    The scale factor is ``np.max(matrix)``; the division produces a new array,
    so the argument itself is left untouched.
    """
    return matrix / np.max(matrix)

def get_finite_ntk_trained(model, dataloader):
    """Build the empirical (finite-width) NTK Gram matrix of ``model``.

    For every batch the model output is backpropagated on its own, the resulting
    parameter gradients are captured with ``clone_grads`` and entry ``(i, j)`` of
    the result is ``paramdot(grads[i], grads[j])``.

    The output is differentiated directly. The earlier formulation
    backpropagated ``MSE / (2 * (output - y))``, which is the same gradient on
    paper but divides by zero (NaN gradients) whenever the prediction equals the
    target exactly. Labels in ``data[1]`` are therefore no longer read.

    Args:
        model: the network to differentiate. When its class is named ``"RNN"``
            the input is fed as-is; for any other class axis 2 of the input is
            squeezed first.
        dataloader: sized iterable of batches with the input at ``data[0]``.
            Each batch must make the model emit a single-element output,
            otherwise ``backward()`` raises.

    Returns:
        A symmetric ``(M, M)`` NumPy array with ``M = len(dataloader)``.
    """
    M = len(dataloader)
    print(M)

    feeds_raw_sequence = model.__class__.__name__ == "RNN"
    grads = []
    for data in tqdm.tqdm(dataloader):
        x = data[0] if feeds_raw_sequence else data[0].squeeze(dim=2)
        x = x.to(device)

        model.train()
        model.zero_grad()
        output = model(x)
        # Gradient of the output w.r.t. the parameters for this sample.
        output.backward()
        grads.append(clone_grads(model))

    finite_ntk = np.zeros((M, M))
    for i in tqdm.tqdm(range(M)):
        # Only the lower triangle is computed; the matrix is symmetric.
        for j in range(i + 1):
            finite_ntk[i, j] = finite_ntk[j, i] = paramdot(grads[i], grads[j])

    return finite_ntk

def get_finite_ntk(model, dataloader):
    """Build the empirical (finite-width) NTK Gram matrix of ``model``.

    For every batch the model output is backpropagated on its own, the resulting
    parameter gradients are captured with ``clone_grads`` and entry ``(i, j)`` of
    the result is ``paramdot(grads[i], grads[j])``.

    Args:
        model: the network to differentiate; it is called on ``data[0]`` as-is.
        dataloader: sized iterable of batches with the input at ``data[0]``.
            Labels are not used, so batches without them are accepted. Each
            batch must make the model emit a single-element output, otherwise
            ``backward()`` raises.

    Returns:
        A symmetric ``(M, M)`` NumPy array with ``M = len(dataloader)``.
    """
    M = len(dataloader)
    print(M)

    grads = []
    for data in tqdm.tqdm(dataloader):
        x = data[0].to(device)

        model.train()
        model.zero_grad()
        output = model(x)
        # Gradient of the output w.r.t. the parameters for this sample.
        output.backward()
        grads.append(clone_grads(model))

    finite_ntk = np.zeros((M, M))
    for i in tqdm.tqdm(range(M)):
        # Only the lower triangle is computed; the matrix is symmetric.
        for j in range(i + 1):
            finite_ntk[i, j] = finite_ntk[j, i] = paramdot(grads[i], grads[j])

    return finite_ntk


# Gram matrix for the RNN trained above, over every window in `total_dataloader`
# (training rows first, then validation rows, one sample per batch).
# NOTE(review): this calls `get_finite_ntk`, not `get_finite_ntk_trained` — confirm
# that is intended.
finite_ntk_trained = get_finite_ntk(model, total_dataloader)

# Same computation for a freshly constructed SimpleRNN that is never trained in
# this notebook. It is a different class and width (statedim=256) from the trained
# `model`, which is an RNN with hidden_dim=16.
model_init = SimpleRNN(indim=1, statedim=256).to(device)
finite_ntk_init = get_finite_ntk(model_init, total_dataloader)

# `finite_ntk` aliases the at-initialisation kernel; the SVM cell has a
# commented-out `gram = finite_ntk` line that would consume it.
finite_ntk = finite_ntk_init

150


100%|████████████████████████████████████████| 150/150 [00:00<00:00, 720.65it/s]
100%|████████████████████████████████████████| 150/150 [00:00<00:00, 503.61it/s]


150


100%|████████████████████████████████████████| 150/150 [00:00<00:00, 202.93it/s]
100%|████████████████████████████████████████| 150/150 [00:00<00:00, 464.31it/s]


## Calculate Infinite-Width NTK

In [12]:
from kernels import RNTK,TNTK
from kernels.utils import VErf3, VDerErf3
# The five settings below are referenced only by the commented-out RNTK call
# further down; the active call passes `inpcov` alone.
# NOTE(review): presumably these are variance hyper-parameters of the kernel —
# confirm against `kernels.RNTK`.
varw = 1
varu = 2
varb = 0.2
varv = 1
avgpool = True
inps = X_total
# inpcov[a, i, b, j] = sum_s inps[a, i, s] * inps[b, j, s] / inps.shape[-1]:
# inner products between position i of sample a and position j of sample b,
# divided by the size of the last axis.
inpcov = np.einsum('ais,bjs->aibj', inps, inps) / inps.shape[-1]
# Reorder the axes from (a, i, b, j) to (a, b, i, j).
inpcov = np.moveaxis(inpcov, 1, 2)

# Fully parameterised variant, currently disabled:
# inf_ntk = RNTK(inpcov, VErf3, VDerErf3, varw, varu, varb, varv, avgpool=avgpool)
inf_ntk = RNTK(inpcov)


## Kernel SVR on the NTK Gram Matrix

In [13]:
import numpy as np
import pandas as pd
from kernels import svr_search

# Derive the fold boundaries from the data instead of hard-coding 50 / 150, so
# they stay correct if `seq_len` or `val_num` change. Rows of `X_total` are the
# training windows followed by the validation windows.
n_train = len(X)
n_total = len(X_total)
# A single fold: one row of training indices and one row of test indices.
train_fold_idx = np.arange(n_train).reshape(1, -1)
test_fold_idx = np.arange(n_train, n_total).reshape(1, -1)

# gram = finite_ntk
gram = inf_ntk
# Flatten the (N, 1) targets to shape (N,).
labels = Y_total.squeeze(axis=1)
results = svr_search(gram, labels, train_fold_idx, test_fold_idx)
results

Unnamed: 0,C,normalized,train,test
0,0.0001,False,-0.49506,-0.493216
1,0.000774,False,-0.466568,-0.464831
2,0.005995,False,-0.277225,-0.27622
3,0.046416,False,-0.007923,-0.007997
4,0.359381,False,-0.007806,-0.007883
5,2.782559,False,-0.007806,-0.007883
6,21.544347,False,-0.007806,-0.007883
7,166.810054,False,-0.007806,-0.007883
8,1291.549665,False,-0.007806,-0.007883
9,10000.0,False,-0.007806,-0.007883
