Implementation of https://iopscience.iop.org/article/10.1088/1742-6596/1325/1/012079/pdf

In [172]:
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import sklearn.model_selection
import pandas as pd
import matplotlib.pyplot as plt

In [255]:
class VirtualSampleGeneration():
    def __init__(self, train_input, train_output):
        self.train_input = train_input
        self.train_output = train_output
        self.x_min = train_input.min(axis=0)
        self.x_max = train_input.max(axis=0)
        self.cl = (self.x_min + self.x_max)*0.5
        self.attributes = train_input.shape[1]
        self.n = train_input.shape[0]
        self.x_var = train_input.var(axis=0)/(self.n-1)
        self.eta = np.log(10**(-20)) # parameter for numeric stability
        self.lb, self.ub = self.get_ub_lb()

        

    def get_ub_lb(self):
        x_min = self.x_min
        x_max = self.x_max
        attributes = self.attributes
        x_var = self.x_var

        # Berechne Center of Attributes CL
        cl = self.cl

        # Berechnung NL, NU für Xi (amount of samples smaller than CL)
        nu = []
        nl = []
        for i in range(attributes):
            nl_i = (X[:, i] <= cl[i]).nonzero()[0].shape[0]
            nu_i = (X[:, i] > cl[i]).nonzero()[0].shape[0]

            nl.append(nl_i)
            nu.append(nu_i)

        nl = np.array(nl)
        nu = np.array(nu)

        # Berechnung SkewL, SKewU
        skew_l = nl/(nl + nu)
        skew_u = nu/(nl + nu)

        lb = cl - skew_l*np.sqrt(-2*x_var/nl*eta)
        ub = cl + skew_l*np.sqrt(-2*x_var/nu*eta)
        return lb, ub
    
    def _mf_for_attribute(self, i):
        lb = self.lb
        ub = self.ub
        cl = self.cl

        MF = []
        x_range = np.linspace(lb[i], ub[i], 100)
        for x in x_range:
            lb_to_cl = (x - lb[i]) / (cl[i] - lb[i])
            ub_from_cl = (ub[i] - x) / (ub[i] - cl[i])

            if (lb[i] <= x and x < cl[i]):
                MF.append(lb_to_cl)
            elif (cl[i] <= x and x <=ub[i]):
                MF.append(ub_from_cl)
            else:
                MF.append(0)
        return np.array(MF), x_range

    def mf_all(self):
        attributes = self.attributes
        MF = []
        ranges = []
        for i in range(attributes):
            mfi, x_range = self._mf_for_attribute(i)
            MF.append(mfi)
            ranges.append(x_range)
        return MF, ranges
    
    # Latin Hypercube Sampling
    def get_samples(self, n):
        # generate n samples
        from scipy.stats.qmc import LatinHypercube
        import scipy.stats.qmc as qmc
        attributes = self.attributes
        lb = self.lb
        ub = self.ub

        sampler = LatinHypercube(attributes)
        sample = sampler.random(n)
        return qmc.scale(sample, lb, ub)
    
    # Generate Labels with Model
    def get_labels(self, model, samples):
        train_input = self.train_input
        train_output = self.train_output
        model.fit(train_input, train_output)
        y_samples = model.predict(samples)
        return y_samples

    # Add the data to train data set
    def add_virtual_data(self, model, n):
        additional_train_input = self.get_samples(n)
        additional_train_output = self.get_labels(model, additional_train_input)

        self.train_input = np.vstack((self.train_input, additional_train_input))
        self.train_output = np.hstack((self.train_output, additional_train_output))

In [267]:
data = pd.read_csv("slump_test.csv")

X = data.iloc[:, 1:10].values
y = data.iloc[:, 10].values

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=1)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Ohne VSG
print('ohne VSG')
model = RandomForestRegressor(n_estimators=400, max_depth=9)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
y_pred_test = model.predict(X_test)
print('Train RMSE: ',np.sqrt(mean_squared_error(y_train, y_pred)))
print('Train R2: ', model.score(X_train, y_train))
print('Test RMSE: ',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('Test R2: ', model.score(X_test, y_test))

# Mit VSG
print('Mit VSG')
vsg = VirtualSampleGeneration(X_train, y_train)

model = RandomForestRegressor(n_estimators=400, max_depth=9)
vsg.add_virtual_data(model, 100)

snew_X_train = vsg.train_input
snew_y_train = vsg.train_output

model.fit(snew_X_train, snew_y_train)
y_pred = model.predict(snew_X_train)
y_pred_test = model.predict(X_test)
print('Train RMSE: ',np.sqrt(mean_squared_error(snew_y_train, y_pred)))
print('Train R2: ', model.score(snew_X_train, snew_y_train))
print('Test RMSE: ',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('Test R2: ', model.score(X_test, y_test))

ohne VSG
Train RMSE:  1.3768546647061861
Train R2:  0.9674328616276255
Test RMSE:  3.3502304047220997
Test R2:  0.8251633242631613
Mit VSG
Train RMSE:  0.8814801103286092
Train R2:  0.9723395274886042
Test RMSE:  3.2513032473501537
Test R2:  0.8353361958054434
