In [1]:
import os
import sys
import time
import random
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.io import arff
from sklearn.model_selection import train_test_split
# Put the parent of the current working directory on the module search path
# (presumably where the local `utils` module lives - TODO confirm); this has
# to run before the `from utils import ...` line below.
p = os.path.abspath('..')
sys.path.insert(1, p)
from utils import import_dataset, clf_lookup, get_search_space
# Disable warnings
# NOTE(review): no category is given, so every warning is silenced from here on;
# consider narrowing the filter if real problems might be hidden.
warnings.filterwarnings('ignore')

In [8]:
# Import dataset
df = import_dataset('../../data/Cardiotocography_withoutdupl_norm_10_v10.arff')

# Maximum number of samples to keep
max_samples = 5000

# Single seed shared by every random step in this cell, so that re-running it
# always produces the same subsample and the same train/test split
random_state = 10

# Subsample if necessary (seeded: an unseeded draw would keep different rows on
# every run even though the split below is seeded)
if len(df) > max_samples:
    df = df.sample(n=max_samples, random_state=random_state)

# Extract X, y. The label column is removed by name, matching how y is selected,
# so it cannot leak into the features regardless of its position in the frame.
X = df.drop(columns='outlier')
y = df['outlier']

# Split to train and test sets (stratified on y so both parts keep the same
# proportion of outliers)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y,
                                                    random_state=random_state)

print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))

Ratio of outliers in training set: 0.09992193598750976
Ratio of outliers in test set: 0.1
Training size: 1281
Test size: 550


In [9]:
# PyOD classifiers taking part in the experiment, referenced by name
classifiers = [
    'CBLOFClassifier',
    'COPODClassifier',
    'IForestClassifier',
    'KNNClassifier',
    'LOFClassifier',
    # add more
]
# Resolve every name into a (model, search space) pair; for each name the model
# is looked up first and its search space second
resolved = [(clf_lookup(name), get_search_space(name)) for name in classifiers]
# Split the pairs into two lists that stay index-aligned with `classifiers`
models = [model for model, _ in resolved]
search_spaces = [space for _, space in resolved]

In [10]:
# Time
times = []
# Sample from the model-hyperparam space
n = 50 # sample runs
# Dedicated, seeded generator so the sequence of chosen models is the same on
# every run and the global `random` state is left untouched.
# NOTE(review): this generator only drives the model choice; the draw made by
# `sample_configuration()` is not seeded here - seed the search spaces as well
# if fully repeatable timings are required.
rng = random.Random(10)
for _ in tqdm(range(n)):
    # Step 1 - Sample a model uniformly
    idx = rng.randrange(len(models)) # index
    model = models[idx]
    hp_space = search_spaces[idx]

    # Step 2 - Sample a configuration from its hyperparam space
    # NOTE(review): `model` is the same object each time its index is drawn, so
    # parameters applied in an earlier iteration presumably stay in place unless
    # the new configuration overrides them - confirm this is intended.
    params = hp_space.sample_configuration().get_dictionary()
    model.set_params(**params)
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted mid-fit
    start = time.perf_counter() # start ticking
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start # fit time
    times.append(elapsed)

100%|██████████| 50/50 [00:07<00:00,  6.95it/s]


In [17]:
# Clamp every measured fit time to the threshold, so a run that went over the
# limit contributes exactly `lim` seconds
lim = 30 # seconds, should be equal to the max allowed threshold for fit()
times_cap = [lim if val > lim else val for val in times]
# Summary statistics of the capped timings (computed once, reused below)
avg_time = np.average(times_cap)
std_time = np.std(times_cap)
print('Total time:\t\t\t', sum(times_cap))
print('Average time per run:\t\t', avg_time)
print('Standard deviation:\t\t', std_time)
# Budget for 100 runs, taking mean + 3 standard deviations as the per-run cost
print('Estimated 100-run budget:\t', 100 * (avg_time + 3 * std_time))

Total time:			 7.143418073654175
Average time per run:		 0.1428683614730835
Standard deviation:		 0.15062967388131662
Estimated 100-run budget:	 59.47573831170333
