In [1]:
import os, sys, time, random, warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised by the cells below.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split
# Put the parent of the current working directory on the module search path
# (at position 1) so that the import below can be resolved;
# presumably `utils` is a project module living in that parent directory.
p = os.path.abspath('..')
sys.path.insert(1, p)
from utils import import_dataset, create_search_space

In [2]:
# Load the dataset, optionally subsample it, and build a stratified
# train/test split. Everything random in this cell is driven by SEED so that
# a fresh "Restart & Run All" reproduces the same split.

# Import dataset
df = import_dataset(
    '../../data/Cardiotocography_withoutdupl_norm_10_v10.arff',
)

# Maximum number of points
N = 5000

# Single seed shared by the subsampling and the split (same value the split
# already used), so both random steps are reproducible together.
SEED = 10

# Subsample if necessary (seeded: an unseeded sample would make every
# downstream number change between runs whenever the dataset exceeds N)
if len(df) > N:
    df = df.sample(n=N, random_state=SEED)

# Extract X, y
# NOTE(review): the positional slice assumes the 'outlier' label is the last
# column of the frame returned by import_dataset — confirm against utils.
X = df.iloc[:, :-1]
y = df['outlier']

# Split to train and test sets (stratified so both parts keep the outlier ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED
)

print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))

Ratio of outliers in training set: 0.09992193598750976
Ratio of outliers in test set: 0.1
Training size: 1281
Test size: 550


In [3]:
# Build the candidate algorithms and their hyperparameter search spaces.
# The helper returns three objects; `models` and `search_space` are indexed by
# the same key in the sampling loop, while `evaluated` is not used in the
# cells visible here.
models, search_space, evaluated = create_search_space()
# len(models) is the number of candidate algorithms
print('Number of algorithms:', len(models))

Number of algorithms: 3


In [4]:
# Estimate how long a single fit() takes by repeatedly drawing a random
# (model, hyperparameter configuration) pair and timing the training call.
# The per-run fit durations (in seconds) are collected in `times`.

# Time
times = []
# Sample from the model-hyperparam space
n = 50 # how many sample runs?

# Private, seeded generator for the model choice: keeps the sequence of chosen
# models reproducible without touching the global `random` state.
# NOTE(review): the hyperparameter draw below uses the search space's own
# random state, which is not seeded here — seed it as well if fully
# reproducible runs are required.
rng = random.Random(10)
model_keys = list(models)  # loop-invariant, so built once

for i in range(n):
    # Step 1 - Sample a model uniformly
    key = rng.choice(model_keys)
    model = models[key]
    hp_space = search_space[key]

    # Step 2 - Sample a configuration from its hyperparam space
    params = hp_space.sample_configuration().get_dictionary()
    model.set_params(**params)

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # with system clock adjustments and is too coarse for fits this short.
    start = time.perf_counter() # start ticking
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start # fit time
    print('Run {:2d} took: {:2.3f} seconds'.format(i+1, elapsed))
    times.append(elapsed)

Run  1 took: 0.353 seconds
Run  2 took: 0.116 seconds
Run  3 took: 0.054 seconds
Run  4 took: 0.015 seconds
Run  5 took: 0.058 seconds
Run  6 took: 0.053 seconds
Run  7 took: 0.068 seconds
Run  8 took: 0.060 seconds
Run  9 took: 0.079 seconds
Run 10 took: 0.061 seconds
Run 11 took: 0.061 seconds
Run 12 took: 0.056 seconds
Run 13 took: 0.017 seconds
Run 14 took: 0.054 seconds
Run 15 took: 0.017 seconds
Run 16 took: 0.084 seconds
Run 17 took: 0.078 seconds
Run 18 took: 0.015 seconds
Run 19 took: 0.026 seconds
Run 20 took: 0.102 seconds
Run 21 took: 0.045 seconds
Run 22 took: 0.061 seconds
Run 23 took: 0.085 seconds
Run 24 took: 0.064 seconds
Run 25 took: 0.079 seconds
Run 26 took: 0.017 seconds
Run 27 took: 0.018 seconds
Run 28 took: 0.069 seconds
Run 29 took: 0.073 seconds
Run 30 took: 0.079 seconds
Run 31 took: 0.067 seconds
Run 32 took: 0.041 seconds
Run 33 took: 0.022 seconds
Run 34 took: 0.080 seconds
Run 35 took: 0.024 seconds
Run 36 took: 0.068 seconds
Run 37 took: 0.065 seconds
R

In [5]:
# Clip every measured fit time at the cap, then report summary statistics
# and the budget derived from them.
lim = 60 # seconds, should be equal to the max allowed threshold for fit()

# min(val, lim) keeps val unless it exceeds the cap, in which case lim is used
times_cap = [min(val, lim) for val in times]

# Compute each statistic once and reuse it in the report below
mean_time = np.average(times_cap)
std_time = np.std(times_cap)

# Print statistics
print('Total time:', sum(times_cap))
print('Average time:', mean_time)
print('Standard deviation:', std_time)
# Budget = 100 * (mean + 3 standard deviations) of the capped fit times
print('Estimated budget:', 100 * (mean_time + 3 * std_time))

Total time: 3.1606605052948
Average time: 0.06321321010589599
Standard deviation: 0.04924485501333547
Estimated budget: 21.09477751459024
