In [1]:
import os, sys, time, random, warnings
# Silence every warning category for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so convergence/deprecation warnings
# raised by the fitted models below will not be shown either.
warnings.filterwarnings('ignore')
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Absolute path of the parent of the current working directory; it therefore
# depends on where the kernel was started from.
p = os.path.abspath('..')
# Add that directory to the module search path (index 1, i.e. right after the
# first entry) -- presumably so the project-local `utils` module below resolves.
sys.path.insert(1, p)
from utils import import_dataset, create_search_space

In [2]:
# Load the dataset, optionally subsample it, and produce a stratified
# train/test split. Both random steps share one seed so that a fresh
# Restart-and-Run-All reproduces the same rows and the same split.
RANDOM_STATE = 10

# Import dataset
df = import_dataset('../../data/Annthyroid_withoutdupl_norm_07.arff')

# Maximum number of points
N = 5000

# Subsample if necessary (seeded: an unseeded sample would change the data,
# and every statistic computed from it, on each run)
if len(df) > N:
    df = df.sample(n=N, random_state=RANDOM_STATE)

# Extract X, y
# NOTE(review): the features are "every column but the last" while the label is
# selected by name -- this assumes 'outlier' is the last column; TODO confirm.
X = df.iloc[:, :-1]
y = df['outlier']

# Split to train and test sets (stratified so both parts keep the outlier ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))

Ratio of outliers in training set: 0.07466666666666667
Ratio of outliers in test set: 0.0752
Training size: 3750
Test size: 1250


In [3]:
# Build the candidate algorithms together with their hyperparameter search spaces.
# `evaluated` is returned alongside them; it is not used in this cell.
models, search_space, evaluated = create_search_space()
n_algorithms = len(models)
print('Number of algorithms:', n_algorithms)

Number of algorithms: 11


In [4]:
# Measure how long fit() takes for randomly drawn (model, configuration) pairs.
# The collected durations (seconds) are stored in `times`.
times = []
# Dedicated, seeded generator: makes the sequence of sampled models
# reproducible without altering the global `random` state.
rng = random.Random(10)
model_keys = list(models)
# Sample from the model-hyperparam space
n = 10 # how many sample runs?
for i in range(n):
    # Step 1 - Sample a model uniformly
    key = rng.choice(model_keys)
    model = models[key]
    hp_space = search_space[key]
    # Step 2 - Sample a configuration from its hyperparam space
    # NOTE(review): whether this draw is reproducible depends on how the space
    # was seeded inside create_search_space() -- TODO confirm.
    params = hp_space.sample_configuration().get_dictionary()
    # NOTE(review): set_params mutates the object stored in `models`, so a model
    # drawn twice keeps any parameter the second configuration does not override.
    model.set_params(**params)

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and is coarse on some platforms.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start
    print('Elapsed:', elapsed)
    times.append(elapsed)

Elapsed: 0.6228451728820801
Elapsed: 2.7018861770629883
Elapsed: 5.560974836349487
Elapsed: 1.5814154148101807
Elapsed: 0.09375333786010742
Elapsed: 3.5412116050720215
Elapsed: 17.39661407470703
Elapsed: 0.0344696044921875
Elapsed: 11.768535375595093
Elapsed: 0.015494346618652344


In [5]:
# Clip every measured duration at the fit() time limit, then summarise.
lim = 60 # seconds, should be equal to the max allowed threshold for fit()
times_cap = [min(val, lim) for val in times]

# Summary statistics of the capped durations
avg_time = np.average(times_cap)
std_time = np.std(times_cap)

print('Total time:', sum(times_cap))
print('Average time:', avg_time)
print('Standard deviation:', std_time)
# Budget = 100 x (mean + 3 standard deviations).
# NOTE(review): 100 is presumably the planned number of fit() calls -- confirm.
print('Estimated budget:', 100 * (avg_time + 3 * std_time))

Total time: 43.31719994544983
Average time: 4.331719994544983
Standard deviation: 5.542260039515339
Estimated budget: 2095.8500113091
