In [24]:
import torch
import sklearn
import numpy as np
import pandas as pd
from torch import nn
import sklearn.datasets
import sklearn.linear_model
import torch.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import sklearn.model_selection
plt.rcParams['figure.dpi'] = 200
from joblib import Parallel, delayed
import h5py

In [7]:
# create MLP from literature
class ConcreteDataset(Dataset):
  """Map-style dataset pairing 2-D feature rows with one regression target each.

  Args:
    data: feature matrix (NumPy array, nested sequence or tensor), indexed
      as ``data[idx, :]``.
    labels: one target value per row of ``data``.

  Both inputs are converted to float32 tensors once, at construction time,
  so ``__getitem__`` always yields tensors regardless of the input type.
  """

  def __init__(self, data, labels):
    # as_tensor does not copy when the input already is a float32 tensor
    self.data = torch.as_tensor(data, dtype=torch.float32)      # Input Data
    self.labels = torch.as_tensor(labels, dtype=torch.float32)  # Target Labels

  def __len__(self):
    # returns size of dataset
    return len(self.labels)

  def __getitem__(self, idx):
    # returns a single data row together with its target as a 1-D tensor
    X = self.data[idx, :]
    # reshape (rather than view) also works for non-contiguous label storage
    y = self.labels[idx].reshape(-1)
    return X, y


class ConcreteNN(nn.Module):
  """Small fully connected regressor.

  Two ReLU-activated linear layers of width ``size`` followed by a linear
  layer that maps to a single output value.
  """

  def __init__(self, size):
    super().__init__()
    # layers (input -> hidden -> output), all fed with `size` features
    self.input_layer = nn.Linear(in_features=size, out_features=size)
    self.hidden_layer = nn.Linear(in_features=size, out_features=size)
    self.output_layer = nn.Linear(in_features=size, out_features=1)
    self.activation = nn.ReLU()

  def forward(self, x):
    # the same ReLU module is applied after each of the first two layers
    first = self.activation(self.input_layer(x))
    second = self.activation(self.hidden_layer(first))
    return self.output_layer(second)

def train(model, X_train, y_train, X_test, y_test, epochs=25000, batch_size=128, lr=0.001):
  """Fit `model` with Adam on mini-batches and track train/test MSE per epoch.

  Args:
    model: module whose parameters are optimised in place.
    X_train, y_train: training features and targets, wrapped in a
      ConcreteDataset and served in shuffled mini-batches.
    X_test, y_test: held-out data passed to `test_loss` after every epoch.
    epochs: number of passes over the training data.
    batch_size: mini-batch size used by the DataLoader.
    lr: Adam learning rate.

  Returns:
    (loss_over_time, test_loss_over_time): two lists with one entry per
    epoch. The training entry is the loss of the LAST mini-batch of that
    epoch (not an epoch average); the test entry is the value returned by
    `test_loss`.

  Raises:
    ValueError: if the training set is empty.
  """
  criterion = nn.MSELoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  loss_over_time = []
  test_loss_over_time = []
  training_data = ConcreteDataset(X_train, y_train)
  if len(training_data) == 0:
    # without any batch there would be no loss value to record below
    raise ValueError("training set is empty")
  train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
  for _ in range(epochs):
    # test_loss() switches the model to eval mode, so re-enable train mode
    model.train()
    batch_loss = None
    # distinct batch names keep the X_train / y_train arguments intact
    for X_batch, y_batch in train_dataloader:
      optimizer.zero_grad()
      output = model(X_batch).flatten()
      target = y_batch.flatten()
      batch_loss = criterion(output, target)
      batch_loss.backward()
      optimizer.step()
    loss_over_time.append(batch_loss.item())
    test_loss_over_time.append(test_loss(model=model, X_test=X_test, y_test=y_test))
  return loss_over_time, test_loss_over_time
  
def test_loss(model, X_test, y_test):
  model.eval()
  output = model(X_test)
  loss = sklearn.metrics.mean_squared_error(output.detach().numpy(), y_test.detach().numpy())
  return loss.item()

In [28]:
def lr_model(X_train, y_train):
    """Fit a linear regression on four feature columns and predict in-sample.

    Only columns 0, 3, 5 and 6 of `X_train` are used as regressors. The model
    (with intercept) is fitted to `y_train` and then evaluated on the same
    rows; the predictions are returned as a FloatTensor.
    """
    selected_columns = [0, 3, 5, 6]
    features = X_train[:, selected_columns]
    regressor = sklearn.linear_model.LinearRegression(fit_intercept=True)
    regressor.fit(features, y_train)
    predictions = regressor.predict(features)
    return torch.FloatTensor(predictions)

def train_cv(train_index, test_index):
    """Train a warm-started ConcreteNN(9) on one CV fold and return its test MSE.

    Reads the module-level `X`, `y` (current dataset) and `start_model`
    (pretrained network used for the warm start).

    Args:
        train_index: indices of the training rows of this fold.
        test_index: indices of the held-out rows of this fold.

    Returns:
        Test MSE (Python float) of the model after the last optimizer step.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = ConcreteNN(9)

    # warmstarting (shrinking and perturb): https://arxiv.org/pdf/1910.08475.pdf
    # Most significantly, it allows us to
    # quickly fit high-performing models in sequential environments without having to retrain from scratch.
    # Separately, it offers a slight regularization benefit, which in combination with the first property
    # sometimes allows shrink-perturb models to generalize even better than randomly-initialized models.

    lamb = 0.3     # shrink factor applied to the pretrained parameters
    sigma = 0.001  # std of the Gaussian noise added to the shrunk parameters
    # Only the block shared by both networks is overwritten; the remaining
    # entries of the larger model (and the output bias) keep their random init.
    k = min(start_model.input_layer.in_features, model.input_layer.in_features)
    with torch.no_grad():
        for layer_name in ("input_layer", "hidden_layer"):
            target_layer = getattr(model, layer_name)
            source_layer = getattr(start_model, layer_name)
            target_layer.weight[:k, :k] = (
                source_layer.weight[:k, :k] * lamb + torch.normal(0.0, sigma, size=(k, k)))
            target_layer.bias[:k] = (
                source_layer.bias[:k] * lamb + torch.normal(0.0, sigma, size=(k,)))
        model.output_layer.weight[0, :k] = (
            start_model.output_layer.weight[0, :k] * lamb + torch.normal(0.0, sigma, size=(k,)))

    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.5)
    epochs = 2000

    # Training Loop (full-batch gradient descent)
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        output = model(X_train).flatten()
        loss = criterion(output, y_train)
        loss.backward()
        optimizer.step()

    # Evaluate once, on the final parameters: only the last value was ever
    # returned, and it used to be measured before the final update.
    return test_loss(model=model, X_test=X_test, y_test=y_test)

def train_multirun(K, i):
    """Run one K-fold cross-validation of `train_cv`, one joblib task per fold.

    Args:
        K: number of folds.
        i: repetition index supplied by the caller. It is not used: the folds
            are built without shuffling, so every repetition sees the same
            splits and differs only through the randomness inside `train_cv`.

    Returns:
        List with the test loss of each fold, in fold order. The splits are
        taken over the module-level `X` and `y`.
    """
    folds = sklearn.model_selection.KFold(n_splits=K)
    return Parallel(n_jobs=-1)(
        delayed(train_cv)(train_index, test_index)
        for train_index, test_index in folds.split(X, y)
    )

def train_iml_cv(train_index, test_index):
    """Train a warm-started ConcreteNN(9) with a linear-model penalty on one fold.

    The loss is the training MSE plus `lamda` times the norm of the difference
    between the network output and the in-sample predictions of `lr_model`.
    Reads the module-level `X`, `y` (current dataset) and `start_model`
    (pretrained network used for the warm start).

    Args:
        train_index: indices of the training rows of this fold.
        test_index: indices of the held-out rows of this fold.

    Returns:
        Test MSE (Python float) of the model after the last optimizer step.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = ConcreteNN(9)

    # warmstarting (shrinking and perturb): https://arxiv.org/pdf/1910.08475.pdf
    # Most significantly, it allows us to
    # quickly fit high-performing models in sequential environments without having to retrain from scratch.
    # Separately, it offers a slight regularization benefit, which in combination with the first property
    # sometimes allows shrink-perturb models to generalize even better than randomly-initialized models.

    lamb = 0.3     # shrink factor applied to the pretrained parameters
    sigma = 0.001  # std of the Gaussian noise added to the shrunk parameters
    # Only the block shared by both networks is overwritten; the remaining
    # entries of the larger model (and the output bias) keep their random init.
    k = min(start_model.input_layer.in_features, model.input_layer.in_features)
    with torch.no_grad():
        for layer_name in ("input_layer", "hidden_layer"):
            target_layer = getattr(model, layer_name)
            source_layer = getattr(start_model, layer_name)
            target_layer.weight[:k, :k] = (
                source_layer.weight[:k, :k] * lamb + torch.normal(0.0, sigma, size=(k, k)))
            target_layer.bias[:k] = (
                source_layer.bias[:k] * lamb + torch.normal(0.0, sigma, size=(k,)))
        model.output_layer.weight[0, :k] = (
            start_model.output_layer.weight[0, :k] * lamb + torch.normal(0.0, sigma, size=(k,)))

    criterion = nn.MSELoss()
    # ReLU of a norm is the norm itself (norms are non-negative); kept so the
    # penalty term reads the same as before.
    iml_crit = nn.ReLU()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.5)
    epochs = 2000
    lamda = 0.1  # weight of the linear-model penalty

    # The linear baseline depends only on the training fold, so fit it once
    # instead of refitting it on every epoch.
    baseline = lr_model(X_train, y_train)

    # Training Loop (full-batch gradient descent)
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        output = model(X_train).flatten()
        loss = criterion(output, y_train) + lamda * iml_crit(torch.norm(baseline - output))
        loss.backward()
        optimizer.step()

    # Evaluate once, on the final parameters: only the last value was ever
    # returned, and it used to be measured before the final update.
    return test_loss(model=model, X_test=X_test, y_test=y_test)

def train_iml_multirun(K, i):
    """Run one K-fold cross-validation of `train_iml_cv`, one joblib task per fold.

    Args:
        K: number of folds.
        i: repetition index supplied by the caller. It is not used: the folds
            are built without shuffling, so every repetition sees the same
            splits and differs only through the randomness inside
            `train_iml_cv`.

    Returns:
        List with the test loss of each fold, in fold order. The splits are
        taken over the module-level `X` and `y`.
    """
    folds = sklearn.model_selection.KFold(n_splits=K)
    return Parallel(n_jobs=-1)(
        delayed(train_iml_cv)(train_index, test_index)
        for train_index, test_index in folds.split(X, y)
    )

def train_mlp_cv(train_index, test_index):
    """Train a randomly initialised ConcreteNN(9) on one fold and return its test MSE.

    Reads the module-level `X` and `y` (current dataset). No warm start is
    applied here.

    Args:
        train_index: indices of the training rows of this fold.
        test_index: indices of the held-out rows of this fold.

    Returns:
        Test MSE (Python float) of the model after the last optimizer step.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = ConcreteNN(9)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.5)
    epochs = 2000

    # Training Loop (full-batch gradient descent)
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        output = model(X_train).flatten()
        loss = criterion(output, y_train)
        loss.backward()
        optimizer.step()

    # Evaluate once, on the final parameters: only the last value was ever
    # returned, and it used to be measured before the final update.
    return test_loss(model=model, X_test=X_test, y_test=y_test)

def train_mlp_multirun(K, i):
    """Run one K-fold cross-validation of `train_mlp_cv`, one joblib task per fold.

    Args:
        K: number of folds.
        i: repetition index supplied by the caller. It is not used: the folds
            are built without shuffling, so every repetition sees the same
            splits and differs only through the randomness inside
            `train_mlp_cv`.

    Returns:
        List with the test loss of each fold, in fold order. The splits are
        taken over the module-level `X` and `y`.
    """
    folds = sklearn.model_selection.KFold(n_splits=K)
    return Parallel(n_jobs=-1)(
        delayed(train_mlp_cv)(train_index, test_index)
        for train_index, test_index in folds.split(X, y)
    )

In [8]:
# Import big dataset (n=1000) for warmstarting
# The file is read once; the raw frame stays available as `pretrain_raw`.
pretrain_raw = pd.read_excel("Concrete_Data.xls")

# Keep only the rows measured at 28 days, then drop the age column.
# `drop` returns a new frame, so nothing is deleted from a filtered slice.
is_28_day = pretrain_raw['Age (day)'].isin([28])
pretrain_data = pretrain_raw[is_28_day].drop(columns=['Age (day)'])

# First seven columns are the inputs, the eighth is the regression target
X = torch.FloatTensor(pretrain_data.iloc[:, 0:7].values)
y = torch.FloatTensor(pretrain_data.iloc[:, 7].values)
# NOTE(review): min/max are taken over all rows before the train/test split,
# so the scaling constants also depend on the held-out rows.
x_min = X.min(dim=0)[0]
x_max = X.max(dim=0)[0]
X = (X - x_min) / (x_max - x_min) # minmax scaler

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.76, random_state=1)
pretrain_data.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,61.887366
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,36.44777
8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,45.854291
9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,39.28979


In [9]:
# Pretrain the 7-input network on the large dataset; it is later used as the
# warm-start source (`start_model`) by the cross-validation functions.
start_model = ConcreteNN(size=7)
loss_over_time, test_loss_over_time = train(
    model=start_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [10]:
# Import small dataset (n=103)
data = pd.read_csv("slump_test.csv")

# Columns 1-9 are the inputs, column 10 is the regression target.
# X and y replace the pretraining tensors; the CV functions read these globals.
X = torch.FloatTensor(data.iloc[:, 1:10].to_numpy())
y = torch.FloatTensor(data.iloc[:, 10].to_numpy())

# min max scaler: rescale every feature column with its own min and max
x_min, x_max = X.min(dim=0).values, X.max(dim=0).values
X = (X - x_min) / (x_max - x_min)


In [25]:
kfold = 10
multiruns = 50
# NOTE(review): train_*_multirun(K, i) takes the number of folds as its first
# argument, so the calls below run `multiruns`-fold CV repeated `kfold` times,
# which is the opposite of what the variable names suggest. Left unchanged so
# the stored results keep their meaning; confirm the intended setup.
loss = Parallel(n_jobs=-1)(delayed(train_multirun)(multiruns, i) for i in range(kfold))
loss_iml = Parallel(n_jobs=-1)(delayed(train_iml_multirun)(multiruns, i) for i in range(kfold))
loss_mlp = Parallel(n_jobs=-1)(delayed(train_mlp_multirun)(multiruns, i) for i in range(kfold))

# The context manager closes the file even if writing a dataset fails
with h5py.File('warmstarting.h5', 'w') as hf:
    hf.create_dataset('Loss Vanilla', data=loss)
    hf.create_dataset('Loss IML', data=loss_iml)
    hf.create_dataset('Loss Chen', data=loss_mlp)

In [30]:
# Load the stored losses into memory; np.array copies the data, so the file
# can be closed as soon as the block ends.
with h5py.File('warmstarting.h5', 'r') as hf:
    loss_va = np.array(hf.get('Loss Vanilla'))
    loss_iml = np.array(hf.get('Loss IML'))
    loss_mlp = np.array(hf.get('Loss Chen'))

# Mean test loss over all runs and folds: plain MLP, warm start, warm start + IML
print(loss_mlp.mean())
print(loss_va.mean())
print(loss_iml.mean())

4.628148223392666
6.249178976211697
6.808675774153322
