<a href="https://colab.research.google.com/github/koba341/AutoML/blob/main/ConcreteSlump_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Concrete Slump Dataset

The dataset is taken from the following paper:

Yeh, I-Cheng, "Modeling slump flow of concrete using second-order regressions and artificial neural networks," Cement and Concrete Composites, Vol. 29, No. 6, pp. 474-480, 2007.

In [None]:
# install libraries
!pip install autoPyTorch 
!pip install gpytorch

In [14]:
# import needed packages
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.ensemble
import matplotlib.pyplot as plt
import numpy as np
import time
import gpytorch
from matplotlib import pyplot as plt

In [47]:
# Import the data set
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AutoML/slump_test.csv")

# Column layout of slump_test.csv (see the data.head() print-out):
#   0      : No (running index, not a feature)
#   1 .. 7 : Cement, Slag, Fly ash, Water, SP, Coarse Aggr., Fine Aggr.
#   8 .. 10: SLUMP(cm), FLOW(cm), Compressive Strength (28-day)(Mpa)
#
# Features: the first six mix components.
# NOTE(review): 'Fine Aggr.' (column 7) is also an ingredient but is left out here
# because the models further down hard-code six inputs; widening this slice to 1:8
# requires changing those input sizes at the same time.
X = data.iloc[:, 1:7].to_numpy()

# Targets: (SLUMP, FLOW, Compressive Strength).
# The previous slice 7:10 was shifted by one column and picked
# (Fine Aggr., SLUMP, FLOW), i.e. an input as a target and no compressive strength.
y = data.iloc[:, 8:11].to_numpy()
assert y.shape[1] == 3, "expected exactly three target columns"

# Create training and test dataset (80/20 split, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(X, y, train_size=0.80, random_state=1)

# print some information
print('Size of X: ', X.shape)
print('Size of X_train: ', X_train.shape)
print('Size of X_test: ', X_test.shape)
print(data.head()) # regression problem y:= (SLUMP, FLOW, Compressive Strength)

Size of X:  (103, 6)
Size of X_train:  (82, 6)
Size of X_test:  (21, 6)
   No  Cement   Slag  Fly ash  Water    SP  Coarse Aggr.  Fine Aggr.  \
0   1   273.0   82.0    105.0  210.0   9.0         904.0       680.0   
1   2   163.0  149.0    191.0  180.0  12.0         843.0       746.0   
2   3   162.0  148.0    191.0  179.0  16.0         840.0       743.0   
3   4   162.0  148.0    190.0  179.0  19.0         838.0       741.0   
4   5   154.0  112.0    144.0  220.0  10.0         923.0       658.0   

   SLUMP(cm)  FLOW(cm)  Compressive Strength (28-day)(Mpa)  
0       23.0      62.0                               34.99  
1        0.0      20.0                               41.14  
2        1.0      20.0                               41.81  
3        3.0      21.5                               42.08  
4       20.0      64.0                               26.82  


array([683.8, 841.1, 827. , 765. , 742.7, 818. , 688. , 790. , 883. ,
       743. , 679. , 649.1, 685. , 650. , 710. , 656. , 761. , 778. ,
       815. , 680. , 715.3, 749. , 757. , 670.5, 672. , 768. , 720. ,
       647.1, 683. , 737. , 644.1, 778. , 686. , 651.8, 797. , 765. ,
       785. , 658. , 640.9, 646. , 792. , 804. , 652.5, 741. , 774. ,
       696. , 780.5, 892. , 799. , 680. , 641.4, 758. , 757. , 829. ,
       725. , 853. , 644.1, 691. , 723. , 667.2, 730. , 757. , 655. ,
       902. , 704. , 804. , 705. , 776. , 775. , 695. , 789. , 789. ,
       746. , 684. , 790. , 789.2, 829. , 780. , 722. , 813. , 836. ,
       729. ])

# AutoPyTorch

In [None]:
# Run the AutoPyTorch search for a tabular regression pipeline.
# Note: AutoPyTorch can't access the GPU in this setup.
from autoPyTorch.api.tabular_regression import TabularRegressionTask

# ensemble_size=0: no ensemble will be constructed.
api = TabularRegressionTask(ensemble_size=0)

# Budget and search-space options, collected in one place so they are easy to tune.
search_options = dict(
    optimize_metric='r2',
    # Overall budget: train for 5 hours.
    total_walltime_limit=3600*5,
    # np.inf disables the per-pipeline time limit. With None the limit would be set
    # automatically to total_walltime_limit // 2, leaving time for at least two fits.
    func_eval_time_limit_secs=np.inf,
    memory_limit=None,
    # False: no traditional machine learning pipelines are trained.
    enable_traditional_pipeline=False,
)

api.search(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test.copy(),
    y_test=y_test.copy(),
    **search_options,
)

In [None]:
# Predict the targets of the held-out split with the result of the search
y_pred = api.predict(X_test)

# Score the predictions against the held-out targets
score = api.score(y_pred, y_test)

print(score)

# Print the models selected by AutoPyTorch
# NOTE(review): the search was configured with ensemble_size=0, so no ensemble is expected here — confirm
print(api.show_models())

# Print statistics from search
print(api.sprint_statistics())

# Print model architecture
print(api.models_)

# MLP

In [48]:
# Dataset class for PyTorch, so the data can easily be used for batch learning.
# TODO: the inputs are not normalized/standardized yet; doing so may give better results.
class ConcreteDataset(Dataset):
    """In-memory dataset that pairs each feature row with its target row.

    Args:
        data: Input features, indexed by sample along the first dimension.
            Accepts a NumPy array, a torch tensor or a nested sequence.
        labels: Targets with the same number of samples as ``data``.

    Both inputs are stored as float32 tensors. Inputs that already are float32
    are not copied, so they share memory with the caller's array/tensor.

    Raises:
        ValueError: If ``data`` and ``labels`` hold a different number of samples.
    """

    def __init__(self, data, labels):
        # as_tensor (unlike from_numpy) also accepts tensors, so the class keeps
        # working after X_train/y_train have been converted to tensors elsewhere.
        self.data = torch.as_tensor(data, dtype=torch.float32)      # Input Data
        self.labels = torch.as_tensor(labels, dtype=torch.float32)  # Target Labels
        if len(self.data) != len(self.labels):
            raise ValueError(
                "data and labels must contain the same number of samples, got "
                "{} and {}".format(len(self.data), len(self.labels))
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, targets)`` of sample ``idx``; targets are flattened to 1-D."""
        X = self.data[idx]
        # reshape(-1) turns a scalar target into shape (1,) and leaves a 1-D target row as is
        y = self.labels[idx].reshape(-1)
        return X, y

In [49]:
# Wrap both splits in a ConcreteDataset and serve each one as a single full-size batch.
training_data = ConcreteDataset(X_train, y_train)
# Named "test", but this split is used for validation while training.
test_data = ConcreteDataset(X_test, y_test)

train_dataloader = DataLoader(
    dataset=training_data,
    batch_size=X_train.shape[0],
    shuffle=False,
)
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=X_test.shape[0],
)

# R2 Score for Validation Dataset
def test_r2():
    """Return the R^2 score of the notebook-level ``model`` on the validation data.

    Reads the globals ``model``, ``test_dataloader`` and ``device``. Only the first
    batch of ``test_dataloader`` is evaluated. The model is switched to eval mode
    and left there; callers that keep training must call ``model.train()`` again.

    Returns:
        The value computed by ``sklearn.metrics.r2_score(labels, predictions)``.
    """
    model.eval()
    # No gradients are needed for scoring, so skip building the autograd graph.
    with torch.no_grad():
        data, labels = next(iter(test_dataloader))
        outputs = model(data.to(device))

    return sklearn.metrics.r2_score(labels.cpu().numpy(), outputs.cpu().numpy())

In [58]:
# Multilayer perceptron with a sigmoid activation function, trained with the Adam optimizer and MSELoss.

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class MLP(nn.Module):
    """Perceptron with one sigmoid hidden layer and a linear output layer.

    Args:
        in_features: Number of input features per sample (default 6).
        hidden_features: Width of the hidden layer (default 7).
        out_features: Number of regression targets (default 3).

    The defaults reproduce the original fixed 6 -> 7 -> 3 architecture, so
    ``MLP()`` behaves exactly as before.
    """

    def __init__(self, in_features=6, hidden_features=7, out_features=3):
        super().__init__()
        self.linear1 = nn.Linear(in_features, hidden_features)
        self.linear2 = nn.Linear(hidden_features, out_features)
        self.af = nn.Sigmoid()

    def forward(self, x):
        """Map a batch ``x`` whose last dimension is ``in_features`` to ``out_features`` outputs."""
        hidden = self.af(self.linear1(x))
        # No activation on the output layer: this is a regression model.
        return self.linear2(hidden)

# Seed PyTorch before the weights are initialised so that repeated runs of this
# cell start from the same model (the data split above is already seeded).
torch.manual_seed(1)

model = MLP().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Alternative that was tried: torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.5)
criterion = nn.MSELoss()

In [None]:
# Training of the MLP. Termination criterion: stop after n_epochs epochs.
start_time = time.time()
n_epochs = 2000
list_loss = []       # mean training loss per epoch
test_list_loss = []  # validation R^2 per epoch (a score, not a loss; name kept as is)
for epoch in range(n_epochs):
    # test_r2() leaves the model in eval mode, so switch back at the start of every epoch
    model.train()
    # monitor training loss
    train_loss = 0.0
    # The batch variables are deliberately NOT called `data`/`target`: `data` is the
    # DataFrame loaded at the top of the notebook and must not be overwritten here.
    for batch_x, batch_y in train_dataloader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        # weight the batch-mean loss by the batch size so the epoch mean is per sample
        train_loss += loss.item()*batch_x.size(0)

    train_loss = train_loss/len(train_dataloader.dataset)
    list_loss.append(train_loss)
    r2_test = test_r2()
    test_list_loss.append(r2_test)
    print('Epoch: {} \tTraining Loss: {:.6f} \t R^2 Test: {}'.format(
        epoch+1,
        train_loss,
        r2_test
        ))

# Total wall-clock training time, measured once after the loop
timer = time.time() - start_time
print("--- %s seconds ---" % timer)

# Gaussian Process Regression
Based on the GPyTorch multitask GP regression example: https://docs.gpytorch.ai/en/stable/examples/03_Multitask_Exact_GPs/Multitask_GP_Regression.html

In [None]:
# Set up the GPR model: an exact multitask GP with 3 tasks over 6 input dimensions
class MultitaskGPModel(gpytorch.models.ExactGP):
    """Exact GP that models the three regression targets jointly.

    Args:
        train_x: Training inputs, forwarded to ``ExactGP``.
        train_y: Training targets, forwarded to ``ExactGP``.
        likelihood: Likelihood, forwarded to ``ExactGP``.

    The number of tasks (3) and the number of ARD input dimensions (6) are
    fixed in this class.
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)

        # A constant mean wrapped for the 3 tasks
        base_mean = gpytorch.means.ConstantMean()
        self.mean_module = gpytorch.means.MultitaskMean(base_mean, num_tasks=3)

        # Matern kernel over the 6 inputs, combined with a rank-1 multitask kernel
        base_kernel = gpytorch.kernels.MaternKernel(ard_num_dims=6)
        self.covar_module = gpytorch.kernels.MultitaskKernel(
            base_kernel, num_tasks=3, rank=1
        )

    def forward(self, x):
        """Return the multitask normal distribution built from the mean and covariance at ``x``."""
        return gpytorch.distributions.MultitaskMultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

# Redo the (seeded, hence identical) 80/20 split and convert every part to a float tensor.
X_train, X_test, y_train, y_test = (
    torch.from_numpy(part).float()
    for part in sklearn.model_selection.train_test_split(
        X, y, train_size=0.8, random_state=1
    )
)

# Likelihood and model for the three targets
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=3)
model = MultitaskGPModel(
    train_x=X_train,
    train_y=y_train,
    likelihood=likelihood,
)


In [None]:
# Train the GPR
training_iterations = 100000
# Report the loss only every `log_every` iterations (plus the first and the last one);
# printing all 100000 lines floods the cell output and bloats the notebook.
log_every = 1000

# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # Includes GaussianLikelihood parameters

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

for i in range(training_iterations):
    optimizer.zero_grad()
    output = model(X_train)
    loss = -mll(output, y_train)
    loss.backward()
    if i == 0 or (i + 1) % log_every == 0 or i + 1 == training_iterations:
        print('Iter %d/%d - Loss: %.3f' % (i + 1, training_iterations, loss.item()))
    optimizer.step()



In [None]:
# Print the R2 value of the GPR on the held-out test split
model.eval()
likelihood.eval()

# Prediction needs no gradients; skipping the autograd graph saves memory and time.
with torch.no_grad():
    output = model(X_test)

# Hand plain NumPy arrays to scikit-learn instead of relying on implicit tensor conversion.
r2 = sklearn.metrics.r2_score(y_test.numpy(), output.mean.numpy())
print('R2: ', r2)