In [58]:
import os
import sys
import time
import random
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.io import arff
from sklearn.model_selection import train_test_split
# Resolve the parent of the current working directory and put it on the
# module search path (at position 1, right after the first entry), so the
# project-local `utils` module imported below can be found.
# NOTE(review): presumably `utils` lives in that parent directory - confirm.
p = os.path.abspath('..')
sys.path.insert(1, p)
from utils import import_dataset, clf_lookup, get_search_space, get_search_space_size
# Disable warnings: this silences every warning category for the rest of the
# session, not only the ones raised by the code in this notebook.
warnings.filterwarnings('ignore')

In [59]:
# Import dataset
df = import_dataset('../../data/Cardiotocography_withoutdupl_norm_10_v10.arff')

# Single seed shared by the subsampling and the train/test split, so that a
# re-run of this cell produces the same training and test sets.
seed = 10

# Maximum number of samples to keep
max_samples = 5000

# Subsample if necessary (seeded: an unseeded sample would change the data,
# and therefore every downstream timing, on each run)
if len(df) > max_samples:
    df = df.sample(n=max_samples, random_state=seed)

# Extract X, y
# NOTE(review): the features are taken positionally (all but the last column)
# while the target is taken by name - this assumes 'outlier' is the last
# column of the frame; confirm against import_dataset.
X = df.iloc[:, :-1]
y = df['outlier']

# Split to train and test sets (70/30, stratified on the outlier label)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y,
                                                    random_state=seed)

print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))

Ratio of outliers in training set: 0.09992193598750976
Ratio of outliers in test set: 0.1
Training size: 1281
Test size: 550


In [60]:
# Names of the PyOD classifiers that take part in the experiment
classifiers = [
    'CBLOFClassifier',
    'COPODClassifier',
    'IForestClassifier',
    'KNNClassifier',
    'LOFClassifier',
    # add more
]

# For every classifier name, look up its model and its hyperparameter search
# space (kept in two parallel lists, index-aligned with `classifiers`) and
# report the size of that classifier's own space along the way.
models = []
search_spaces = []
for clf in classifiers:
    models += [clf_lookup(clf)]
    own_space_size = get_search_space_size([clf])
    print(own_space_size)
    search_spaces += [get_search_space(clf)]

# Size of the combined space over all selected classifiers
total_space_size = get_search_space_size(classifiers)
print('Total space size:', total_space_size)

8800.0
10.0
6800.0
2000.0
2000.0
Total space size: 19610.0


In [61]:
# Fit-time benchmark: repeatedly draw a random (model, hyperparameter
# configuration) pair and measure how long fit() takes on the training set.
times = []     # fit durations in seconds, successful runs only
failures = []  # (model class name, params, exception) for every failed run

# Seed the model choice so that a re-run draws the same sequence of models.
# NOTE(review): the configuration sampler below has its own source of
# randomness, which is not seeded here - seed it too for full reproducibility.
random.seed(10)

# Sample from the model-hyperparam space
n_total = 100 # attempted runs
n_succeeded = 0 # succeeded runs
n_failed = 0 # failed runs
for i in tqdm(range(n_total)):
    # Step 1 - Sample a model uniformly
    idx = random.randrange(len(models)) # index
    model = models[idx]
    hp_space = search_spaces[idx]

    # Step 2 - Sample a configuration from its hyperparam space
    params = hp_space.sample_configuration().get_dictionary()
    # NOTE(review): the same model object is reused every time its index is
    # drawn, so any parameter missing from `params` presumably keeps the value
    # set by an earlier draw - confirm this is intended.
    model.set_params(**params)
    try:
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock changes
        start = time.perf_counter() # start ticking
        model.fit(X_train, y_train)
        end = time.perf_counter() # end ticking
    except Exception as exc:
        # Only real errors are counted as failed runs; KeyboardInterrupt and
        # SystemExit are no longer swallowed, so the loop can be interrupted.
        n_failed += 1
        failures.append((type(model).__name__, params, exc))
    else:
        elapsed = end - start # fit time
        times.append(elapsed)
        n_succeeded += 1

100%|██████████| 100/100 [00:19<00:00,  5.05it/s]


In [62]:
# Summary of how many fits were attempted, succeeded and failed
for label, count in (
    ('Runs attempted:\t\t\t', n_total),
    ('Runs succeeded:\t\t\t', n_succeeded),
    ('Runs failed:\t\t\t', n_failed),
):
    print(label, count)

# Clamp every recorded fit time to at most `cap`
cap = 30 # seconds, should be equal to the max allowed threshold for fit()
times_cap = [min(val, cap) for val in times]

# Statistics over the capped times (successful runs only)
avg_time = np.average(times_cap)
std_time = np.std(times_cap)
print('Total time:\t\t\t', sum(times_cap))
print('Average time per run:\t\t', avg_time)
print('Standard deviation:\t\t', std_time)
# Budget for 100 runs, each allowed the mean plus three standard deviations
print('Estimated 100-run budget:\t', 100 * (avg_time + 3 * std_time))

Runs attempted:			 100
Runs succeeded:			 95
Runs failed:			 5
Total time:			 19.061918020248413
Average time per run:		 0.20065176863419382
Standard deviation:		 0.30313269616966204
Estimated 100-run budget:	 111.00498571431798
