In [1]:
import torch
import sklearn
import numpy as np
import pandas as pd
from torch import nn
import sklearn.datasets
import sklearn.linear_model
import torch.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import sklearn.model_selection
plt.rcParams['figure.dpi'] = 200
from joblib import Parallel, delayed
import h5py

In [3]:
class ConcreteNN(nn.Module):
  """Small fully connected regressor: two sigmoid-activated layers of width
  `size` followed by a linear layer producing a single output per sample."""

  def __init__(self, size):
    """Build the three linear layers; `size` is both the input and hidden width."""
    super().__init__()
    # Layers are created in this order so parameter initialisation draws from
    # the random generator in the same sequence every time.
    self.input_layer = nn.Linear(size, size)
    self.hidden_layer = nn.Linear(size, size)
    self.output_layer = nn.Linear(size, 1)
    self.activation = nn.Sigmoid()

  def forward(self, x):
    """Map a batch with `size` features per row to one prediction per row."""
    hidden = self.activation(self.input_layer(x))
    hidden = self.activation(self.hidden_layer(hidden))
    # No activation on the output: this is a regression head.
    return self.output_layer(hidden)

def test_loss(model, X_test, y_test):
  model.eval()
  output = model(X_test)
  loss = sklearn.metrics.mean_squared_error(output.detach().numpy(), y_test.detach().numpy())
  return loss.item()

In [3]:
# Load the small dataset (n=103).
data = pd.read_csv("slump_test.csv")

# Columns 1..9 are used as model inputs, column 10 as the regression target.
X = torch.tensor(data.iloc[:, 1:10].to_numpy(), dtype=torch.float32)
y = torch.tensor(data.iloc[:, 10].to_numpy(), dtype=torch.float32)

# Min-max scale every input column with its minimum / maximum over all rows.
x_min = X.min(dim=0).values
x_max = X.max(dim=0).values
X = (X - x_min) / (x_max - x_min)

In [24]:
def train_cv(train_index, test_index, epochs=2000, lr=0.001, momentum=0.5):
    """Train one warm-started ConcreteNN on a CV fold and return its test MSE.

    A linear regression is fitted on four input columns of the training fold
    and its coefficients are copied onto the matching diagonal entries of the
    first layer's weight matrix; every other weight keeps its default
    initialisation. The network is then trained full-batch with SGD.

    Args:
        train_index: indices into the module-level `X` / `y` used for training.
        test_index: indices into the module-level `X` / `y` used for evaluation.
        epochs: number of full-batch SGD steps.
        lr: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        float: mean squared error on the test fold after the last training step.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Input columns whose regression coefficients seed the network.
    init_input = [0, 3, 5, 6]

    LR = sklearn.linear_model.LinearRegression(fit_intercept=True)
    # Hand sklearn plain NumPy arrays rather than torch tensors.
    LR.fit(X_train[:, init_input].numpy(), y_train.numpy())
    init_weights = LR.coef_

    model = ConcreteNN(X_train.shape[1])

    # Warm start: the coefficients are copied as-is. The shrink-and-perturb
    # variant (https://arxiv.org/pdf/1910.08475.pdf) lives in `train_sp_cv`.
    with torch.no_grad():
        for col, coef in zip(init_input, init_weights):
            model.input_layer.weight[col, col] = float(coef)

    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)

    # Training loop (full batch).
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        output = model(X_train).flatten()
        loss = criterion(output, y_train)  # (input, target) order
        loss.backward()
        optimizer.step()

    # Only the final score is returned, so evaluate once, after the last
    # optimizer step, instead of on every epoch.
    return test_loss(model=model, X_test=X_test, y_test=y_test)

def train_multirun(K, i):
    """Run one K-fold cross-validation of `train_cv` over the module-level `X` / `y`.

    Args:
        K: number of folds, passed to `KFold(n_splits=K)`.
        i: run identifier supplied by the caller; the body does not use it.
           The KFold is created without shuffling.

    Returns:
        list: one test MSE per fold, in fold order.
    """
    kf = sklearn.model_selection.KFold(n_splits=K)
    # Each fold is trained in its own joblib worker.
    return Parallel(n_jobs=-1)(
        delayed(train_cv)(train_index, test_index)
        for train_index, test_index in kf.split(X, y)
    )


def train_sp_cv(train_index, test_index, lamb=0.3, sigma=0.001,
                epochs=2000, lr=0.001, momentum=0.5):
    """Train one shrink-and-perturb warm-started ConcreteNN on a CV fold.

    Same procedure as `train_cv`, except that each regression coefficient is
    multiplied by `lamb` and has zero-mean Gaussian noise of standard deviation
    `sigma` added before it is written into the first layer.

    Args:
        train_index: indices into the module-level `X` / `y` used for training.
        test_index: indices into the module-level `X` / `y` used for evaluation.
        lamb: shrink factor applied to each regression coefficient.
        sigma: standard deviation of the additive Gaussian perturbation.
        epochs: number of full-batch SGD steps.
        lr: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        float: mean squared error on the test fold after the last training step.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Input columns whose regression coefficients seed the network.
    init_input = [0, 3, 5, 6]

    LR = sklearn.linear_model.LinearRegression(fit_intercept=True)
    # Hand sklearn plain NumPy arrays rather than torch tensors.
    LR.fit(X_train[:, init_input].numpy(), y_train.numpy())
    init_weights = LR.coef_

    model = ConcreteNN(X_train.shape[1])

    # warmstarting (shrinking and perturb): https://arxiv.org/pdf/1910.08475.pdf
    # Most significantly, it allows us to
    # quickly fit high-performing models in sequential environments without having to retrain from scratch.
    # Separately, it offers a slight regularization benefit, which in combination with the first property
    # sometimes allows shrink-perturb models to generalize even better than randomly-initialized models.
    with torch.no_grad():
        for col, coef in zip(init_input, init_weights):
            # One noise sample per seeded weight, reduced to a scalar so the
            # assignment targets a single matrix entry without shape juggling.
            noise = torch.normal(0.0, sigma, size=(1,)).item()
            model.input_layer.weight[col, col] = float(coef) * lamb + noise

    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)

    # Training loop (full batch).
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        output = model(X_train).flatten()
        loss = criterion(output, y_train)  # (input, target) order
        loss.backward()
        optimizer.step()

    # Only the final score is returned, so evaluate once, after the last
    # optimizer step, instead of on every epoch.
    return test_loss(model=model, X_test=X_test, y_test=y_test)

def train_sp_multirun(K, i):
    """Run one K-fold cross-validation of `train_sp_cv` over the module-level `X` / `y`.

    Args:
        K: number of folds, passed to `KFold(n_splits=K)`.
        i: run identifier supplied by the caller; the body does not use it.
           The KFold is created without shuffling.

    Returns:
        list: one test MSE per fold, in fold order.
    """
    kf = sklearn.model_selection.KFold(n_splits=K)
    # Each fold is trained in its own joblib worker.
    return Parallel(n_jobs=-1)(
        delayed(train_sp_cv)(train_index, test_index)
        for train_index, test_index in kf.split(X, y)
    )

In [None]:
kfold = 10      # folds per cross-validation
multiruns = 50  # independent repetitions of the whole cross-validation

# `train_multirun(K, i)` / `train_sp_multirun(K, i)` take the fold count first,
# so pass `kfold` as K and iterate over the repetitions.
# NOTE(review): the arguments used to be swapped (K=multiruns, looping over
# range(kfold)), i.e. 50-fold CV repeated 10 times - confirm 10-fold x 50 runs
# is the intended design. The stored arrays are now (multiruns, kfold).
loss_warm = Parallel(n_jobs=-1)(delayed(train_multirun)(kfold, run) for run in range(multiruns))
loss_sp = Parallel(n_jobs=-1)(delayed(train_sp_multirun)(kfold, run) for run in range(multiruns))

# The context manager closes the file even if one of the writes raises.
with h5py.File('warmstarting.h5', 'w') as hf:
    hf.create_dataset('Loss Warmstarting LR', data=loss_warm)
    hf.create_dataset('Loss Warmstarting LR SP', data=loss_sp)

In [2]:
# Read back stored losses; the context manager releases the file handle.
# NOTE(review): this opens 'warmstarting_lr.h5' and reads 'Loss Vanilla', while
# the visible experiment code writes 'warmstarting.h5' without that dataset -
# confirm which file is the intended source.
with h5py.File('warmstarting_lr.h5', 'r') as hf:
    print(hf.keys())
    # Index by key so a missing dataset raises KeyError right here instead of
    # producing None and failing later in `.mean()`.
    loss_va = np.array(hf['Loss Vanilla'])
    loss_lr = np.array(hf['Loss Warmstarting LR'])
    loss_lrsp = np.array(hf['Loss Warmstarting LR SP'])

print(loss_va.mean()) # Loss without anything
print(loss_lr.mean()) # Loss Warmstarting
print(loss_lrsp.mean()) # Loss Warmstarting with Shrinking and Perturbing

<KeysViewHDF5 ['Loss Vanilla', 'Loss Warmstarting LR', 'Loss Warmstarting LR SP']>
9.541884859018028
8.817429725181311
7.64127300927043
