In [None]:
#@title

!pip install --upgrade xlrd # important to upgrade to open xls file
!pip install git+https://github.com/shukon/HpBandSter.git # probably not necessary, additional to autoPyTorch
!pip install autoPyTorch 

In [None]:
# import needed packages
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.ensemble
import matplotlib.pyplot as plt
import numpy as np
import time
# Read the concrete dataset from an Excel (.xls) file using the xlrd engine.
# NOTE(review): the absolute Colab/Drive path is hard-coded; presumably Google
# Drive must be mounted at /content/drive before this cell runs - confirm.
data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/AutoML/Concrete_Data.xls', engine='xlrd')
# Last expression of the cell: display the first rows as a quick sanity check.
data.head()

In [None]:
# Generate Training, Test and Validation Dataset
# The first eight columns are the input features, the ninth column is the target.
X = data.iloc[:, 0:8].to_numpy()
y = data.iloc[:, 8].to_numpy()

print('Size of X: ', X.shape)

# Two-stage split with a fixed random_state for reproducibility:
# 70 % of the rows go to training, the remaining rows (X_vt / y_vt) are then
# halved into a test part and a validation part.
X_train, X_vt, y_train, y_vt = sklearn.model_selection.train_test_split(X, y, train_size=0.7,random_state=1)
X_test, X_val, y_test, y_val = sklearn.model_selection.train_test_split(X_vt, y_vt, train_size=0.5,random_state=1)

In [None]:
# Plot every feature column and the target against the sample index
plt.figure(figsize=(18,9))
# X.T yields one feature column per iteration; zip pairs it with its column name
for series, name in zip(X.T, data.columns):
  plt.plot(series, label=name)
plt.plot(y, label='y')
# anchor point x = 1.35 lies to the right of the axes area
plt.legend(loc='upper right', bbox_to_anchor=(1.35, 0.75))

In [None]:
# Correlation coefficients: Pearson correlation of each input feature with the
# target, computed on the training split only
for name, feature in zip(data.columns, X_train.T):
  # corrcoef returns the 2x2 correlation matrix; [0,1] is the feature/target entry
  print(name, ' Pearson-Korrelation: ', np.corrcoef(feature, y_train)[0,1])

In [None]:
# Dataset wrapper for PyTorch so the data can be consumed in mini-batches
class ConcreteDataset(Dataset):
  """Holds input rows and target labels as float32 tensors."""
  def __init__(self, data, labels):
    # both arguments are expected to be numpy arrays (torch.from_numpy is used)
    as_float = lambda array: torch.from_numpy(array).float()
    self.data = as_float(data) # Input Data
    self.labels = as_float(labels) # Target Labels
  def __len__(self):
    # the number of samples is taken from the label tensor
    return len(self.labels)
  def __getitem__(self, idx):
    # feature row(s) at idx together with the matching label flattened to 1-D
    features = self.data[idx, :]
    target = self.labels[idx].view(-1)
    return features, target

In [None]:
# Wrap the training and validation splits in datasets first ...
training_data = ConcreteDataset(X_train, y_train)
# Named "test" but it holds the validation split that is evaluated while training
test_data = ConcreteDataset(X_val, y_val)

# ... then build the loaders: batch size 10 for both, only the training loader shuffles
train_dataloader = DataLoader(training_data, shuffle=True, batch_size=10)
test_dataloader = DataLoader(test_data, batch_size=10)

In [None]:
# Check that the dataloader works by pulling a single batch.
# Dedicated names are used so the `data` DataFrame loaded earlier is not
# overwritten by a tensor (cells that read `data.columns` stay re-runnable).
sample_batch, sample_labels = next(iter(train_dataloader))
# Last expression of the cell: show the labels of the sampled batch.
sample_labels

In [None]:
# R2 Score for Validation Dataset
def test_r2():
  """Return the R^2 score of the model on the complete validation set.

  Reads the module-level `model`, `device` and `test_dataloader`. Every batch
  of the dataloader is evaluated and the predictions are concatenated, so the
  score covers all validation samples instead of only the first batch.

  Side effect: switches `model` to eval mode.
  """
  model.eval()
  all_labels = []
  all_outputs = []
  # inference only: no autograd graph is needed
  with torch.no_grad():
    for data, labels in test_dataloader:
      outputs = model(data.to(device))
      all_labels.append(labels.cpu())
      all_outputs.append(outputs.cpu())

  y_true = torch.cat(all_labels).numpy()
  y_pred = torch.cat(all_outputs).numpy()
  return sklearn.metrics.r2_score(y_true, y_pred)

In [None]:
# Experimental "physical" loss that was tried to get better results; it did not help.
# On top of the MSE it adds a weighted penalty on the summed difference between
# output and target in column 0 (cement, according to the original note).
def phy_loss(output, target):
  """Return MSE(output, target) + 0.01 * |sum(output[:, 0] - target[:, 0])|."""
  column_error = (output[:, 0] - target[:, 0]).sum().abs()
  # abs() is already non-negative, so the ReLU leaves the value unchanged
  penalty = nn.ReLU()(column_error)
  return nn.MSELoss()(output, target) + 0.01*penalty

In [None]:
# Generate Multilayer-Perceptron with ReLU Activation Function. Use of Adam Optimizer and MSELoss.

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class MLP(nn.Module):
    """Fully connected regression network: 8 -> 100 -> 50 -> 50 -> 10 -> 1."""
    def __init__(self):
        super().__init__()
        # layers are registered under the same names and in the same order as before
        self.linear1 = nn.Linear(8, 100)
        self.linear2 = nn.Linear(100, 50)
        self.linear3 = nn.Linear(50, 50)
        self.linear4 = nn.Linear(50, 10)
        self.linear5 = nn.Linear(10, 1)
        self.af = nn.ReLU()
    def forward(self, x):
        # ReLU follows each of the four hidden layers
        for hidden in (self.linear1, self.linear2, self.linear3, self.linear4):
            x = self.af(hidden(x))
        # the output layer stays linear (no activation)
        return self.linear5(x)

# Instantiate the network and move it to the selected device.
model = MLP().to(device)
# Adam optimizer over all model parameters with learning rate 0.001.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Mean squared error as the training criterion.
criterion = nn.MSELoss()
# Print the layer structure of the model.
print(model)

In [None]:
# Training of MLP. Termination criteria: at most 6000 epochs or 1000 seconds of wall-clock time.
start_time = time.time()
n_epochs = 6000
max_seconds = 1000   # wall-clock budget, checked once after every epoch
list_loss = []       # mean training loss per epoch
test_list_loss = []  # validation R^2 per epoch (despite the name, not a loss)
for epoch in range(n_epochs):
    # test_r2() leaves the model in eval mode, so switch back every epoch
    model.train()
    # monitor training loss
    train_loss = 0.0
    # batch_x / batch_y are used as loop names so the `data` DataFrame loaded
    # at the top of the notebook is not overwritten by a tensor
    for batch_x, batch_y in train_dataloader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        # weight the batch loss by the batch size so the last, possibly
        # smaller, batch contributes proportionally to the epoch average
        train_loss += loss.item()*batch_x.size(0)

    train_loss = train_loss/len(train_dataloader.dataset)
    list_loss.append(train_loss)
    r2_test = test_r2()
    test_list_loss.append(r2_test)
    print('Epoch: {} \tTraining Loss: {:.6f} \t R^2 Test: {}'.format(
        epoch+1, 
        train_loss,
        r2_test
        ))
    # stop early once the time budget is used up
    timer = time.time() - start_time
    if timer >= max_seconds:
      break
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Collects test-set R^2 scores; the evaluation cell below appends one score per execution.
# NOTE(review): presumably kept in its own cell so that repeated train/evaluate
# runs accumulate instead of resetting the list - confirm.
runs = [] # initialize list of all test data r2 scores


In [None]:
# Get R2 Score of MLP for Test Data
model.eval()
# X_test is a numpy array; the model needs a float tensor on the model's device.
# A separate name is used so X_test itself stays untouched and the cell can be re-run.
X_test_tensor = torch.from_numpy(X_test).float().to(device)
# inference only: no autograd graph is needed
with torch.no_grad():
    outputs = model(X_test_tensor)

# y_test is already a numpy array; only the predictions must be moved back to the CPU
run = sklearn.metrics.r2_score(y_test, outputs.cpu().numpy())
runs.append(run)

In [None]:
# Print average R2 Score for MLP
# (arithmetic mean over all entries of `runs`; raises ZeroDivisionError while `runs` is still empty)
print(sum(runs)/len(runs))


In [None]:
# Plot training loss and validation R^2 per epoch.
# The first epochs are skipped, presumably because their values would dominate the y-axis.
skip_epochs = 500
# x values are the real (1-based) epoch numbers, so the axis is not shifted by the skipped part
epochs = range(skip_epochs + 1, len(list_loss) + 1)

# Plot Training Loss
plt.figure()
plt.plot(epochs, list_loss[skip_epochs:])
plt.xlabel('Epoch')
plt.ylabel('Training loss (MSE)')
plt.title('Training loss per epoch')

# Plot R2 Score of Validation Data
plt.figure(figsize=(14, 9))
plt.plot(epochs, test_list_loss[skip_epochs:])
plt.xlabel('Epoch')
plt.ylabel('R^2')
plt.title('Validation R^2 per epoch')
plt.show()

# NAS


The neural architecture search (NAS) below is based on the following Auto-PyTorch example: https://automl.github.io/Auto-PyTorch/master/examples/20_basics/example_tabular_regression.html#sphx-glr-examples-20-basics-example-tabular-regression-py

In [None]:
# Generate Train and Test Data
# The Excel file is read again, so `data` is a DataFrame here regardless of
# what earlier cells bound to that name.
data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/AutoML/Concrete_Data.xls', engine='xlrd')
# First eight columns are the input features, the ninth column is the target.
X = data.iloc[:, 0:8].to_numpy()
y = data.iloc[:, 8].to_numpy()

print('Size of X: ', X.shape)

# NOTE: this rebinds X_train / X_test / y_train / y_test from the MLP section.
# No train_size/test_size is passed, so train_test_split's default proportions
# apply; there is no separate validation split here.
X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(X, y,random_state=1)

In [None]:
# Import TabularRegressionTask from autoPyTorch
from autoPyTorch.api.tabular_regression import TabularRegressionTask

# Task object with default settings.
api = TabularRegressionTask()

# .search() takes train and test data, asks for optimization metric and how long the algorithm should run.
# set memory_limit=None to use complete memory instead of default 4096 MB, so that used algorithms dont crash.
# Copies of the test arrays are passed, so X_test / y_test themselves are not handed to the search.
# NOTE(review): total_walltime_limit=1000 presumably mirrors the 1000-second
# budget of the hand-written MLP training - confirm the unit is seconds.
api.search(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test.copy(),
    y_test=y_test.copy(),
    optimize_metric='r2',
    total_walltime_limit=1000,
    func_eval_time_limit_secs=50,
    memory_limit=None
)

In [None]:
# Predict the test targets with the result of the search.
y_pred = api.predict(X_test)

# Score the predictions against the held-out test targets.
score = api.score(y_pred, y_test)

print(score)

# Print the final ensemble built by AutoPyTorch
print(api.show_models())

# Print statistics from search
print(api.sprint_statistics())

# TODO: how is the validation score measured?

Wie sehen die Netzwerkarchitekturen der AutoML-Lösung aus?
