In [5]:
import math
import os
import sys, pickle
import time

import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing

from black_box_alphavi import fit_q

In [6]:
def get_test_error(X, y, index_train, index_test, i, dataset, alpha = 0.0):
    """Fit the model on one train/test split and evaluate it on the held-out rows.

    Parameters
    ----------
    X, y : arrays indexed by row; `X[index]` / `y[index]` select a partition.
    index_train, index_test : row indices of the training and test partitions.
    i, dataset : not used inside this function; kept so existing callers work.
    alpha : forwarded unchanged to `fit_q`.

    Returns
    -------
    (negative_ll, error, running_time) where `error` and `ll` come from the
    evaluator returned by `fit_q` (the test RMSE and test log-likelihood
    according to the original comments) and `running_time` is the wall-clock
    duration of the `fit_q` call in seconds.
    """
    # Slice out the two partitions.
    train_inputs = X[ index_train, ]
    train_targets = y[ index_train ]
    test_inputs = X[ index_test ]
    test_targets = y[ index_test ]

    # Standardize the inputs using statistics of the training partition only.
    input_scaler = preprocessing.StandardScaler().fit(train_inputs)
    train_inputs = input_scaler.transform(train_inputs)
    test_inputs = input_scaler.transform(test_inputs)

    # Normalize the training targets. The test targets are left untouched;
    # the training mean/std are handed to the evaluator below instead.
    target_mean = np.mean(train_targets)
    target_std = np.std(train_targets)
    train_targets = (train_targets - target_mean) / target_std

    # Both target vectors are passed on as column vectors.
    train_targets = np.array(train_targets, ndmin = 2).reshape(-1, 1)
    test_targets = np.array(test_targets, ndmin = 2).reshape(-1, 1)

    # Fixed hyper-parameters of the fit.
    hidden_layer_size = 50
    batch_size = 32
    epochs = 500
    K = 100
    learning_rate = 0.001
    v_prior = 1.0

    # Time only the call to fit_q.
    fit_started = time.time()
    w, v_prior, get_error_and_ll = fit_q(
        train_inputs, train_targets, hidden_layer_size,
        batch_size, epochs, K, alpha, learning_rate, v_prior)
    running_time = time.time() - fit_started

    # Evaluate on the held-out partition.
    error, ll = get_error_and_ll(
        w, v_prior, test_inputs, test_targets, K, target_mean, target_std)

    return -ll, error, running_time

# Randomly split the indices 0..n-1 into a training part and a test part
def split_data(n):
    """Randomly partition the indices 0..n-1 into a train and a test list.

    The first `round(n * 9.0 / 10)` entries of a random permutation become
    the training indices and the remaining entries the test indices. The
    permutation is drawn from NumPy's global random state, so callers that
    need reproducible splits must seed it beforehand.

    Returns
    -------
    (index_train, index_test) : two lists of Python ints.
    """
    shuffled = np.random.choice(range(n), n, replace = False)
    n_train = round(n * 9.0 / 10)

    train_part, test_part = shuffled[:n_train], shuffled[n_train:]

    # .tolist() converts the NumPy integers to plain Python ints.
    return train_part.tolist(), test_part.tolist()
    

# Run the experiment for one dataset and one alpha value over several random splits
def main(dataset, alpha, n_splits):
    """Run `n_splits` random train/test evaluations for one dataset and alpha.

    Reads `data.txt`, `index_features.txt` and `index_target.txt` from
    `data/<dataset>/`, and for every split appends one line to each of the
    three result files in `data/<dataset>/results/`:
    `<dataset>_test_ll_alpha<alpha>.txt`, `<dataset>_test_error_alpha<alpha>.txt`
    and `<dataset>_test_time_alpha<alpha>.txt`.

    Parameters
    ----------
    dataset : str, name of the sub-directory of `data/`.
    alpha : value forwarded to `get_test_error`; also part of the file names.
    n_splits : int, number of random splits to evaluate.

    Notes
    -----
    Result files are opened in append mode, so running the same
    configuration twice adds further lines to the existing files.
    """
    print("    Dataset    |     Alpha     |   Number of Splits  ")
    print("{0:15}|{1:15}|{2:15}".format(dataset, alpha, n_splits))
    sys.stdout.flush()

    # We load the data
    datapath = 'data/' + dataset + '/'
    data = np.loadtxt(datapath + 'data.txt')
    index_features = np.loadtxt(datapath + 'index_features.txt')
    index_target = np.loadtxt(datapath + 'index_target.txt')

    # Select the columns that the index files actually list. np.loadtxt
    # returns floats (and a 0-d array for a single value), hence the
    # atleast_1d / integer conversion before indexing.
    i_features = np.atleast_1d(index_features).astype(int).tolist()
    i_target = int(index_target)
    X = data[ : , i_features ]
    y = data[ : , i_target ]

    n = data.shape[0]
    savepath = datapath + 'results/'
    # Create the output directory up front so a missing folder does not
    # discard the result of a completed training run.
    os.makedirs(savepath, exist_ok=True)

    for i in range(n_splits):

        print(i/n_splits * 100, '% completed')

        # Seed with the split number so that split i is reproducible.
        np.random.seed(i)
        index_train, index_test = split_data(n)

        neg_test_ll, test_error, running_time = get_test_error(
            X, y, index_train, index_test, i+1, dataset, alpha)

        # One result file per metric; one line per split.
        for tag, value in (("ll", neg_test_ll),
                           ("error", test_error),
                           ("time", running_time)):
            with open(savepath + dataset + "_test_{}_alpha{}.txt".format(tag, alpha), 'a') as f:
                f.write(repr(value) + '\n')

    print('100.0% completed')
    print("")
    print("")

In [7]:
# Names of the datasets to evaluate.
datasets = ['boston', 'computer', 'concrete','energy', 'housing', 
               'power', 'slump', 'wine', 'yacht']
# Alpha values to sweep. -np.inf is used instead of the np.NINF alias, which
# was removed in NumPy 2.0; it is the same float, so "alpha{}".format(...)
# still yields "alpha-inf".
alpha = [-np.inf, 0, 0.5, 1, 10**5]

In [None]:
# Evaluate every dataset with every alpha value, 10 random splits each.
for dataset_name in datasets:
    for alpha_value in alpha:
        main(dataset_name, alpha_value, n_splits = 10)