In [None]:
# Annthyroid dataset
import os, sys, time
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pyod.models.abod import ABOD # probabilistic
from pyod.models.cblof import CBLOF # proximity-based
from pyod.models.copod import COPOD # probabilistic
from pyod.models.ecod import ECOD # probabilistic
from pyod.models.hbos import HBOS # proximity-based
from pyod.models.iforest import IForest # outlier ensembles
from pyod.models.knn import KNN # proximity-based
from pyod.models.lmdd import LMDD # linear model
from pyod.models.lof import LOF # proximity-based
from pyod.models.mcd import MCD # linear model
from pyod.models.ocsvm import OCSVM # linear model
from pyod.models.pca import PCA # linear model
from pyod.models.rod import ROD # proximity-based
from pyod.models.sos import SOS # probabilistic
# Put the parent of the current working directory on the module search path
# (at index 1, i.e. right after the first entry) before importing `utils`.
# NOTE(review): presumably utils.py lives in that parent directory — this only
# works when the notebook is launched from its own folder; confirm.
p = os.path.abspath('..')
sys.path.insert(1, p)
from utils import import_dataset

In [None]:
# Import dataset
df = import_dataset('../../data/Annthyroid_withoutdupl_norm_07.arff')

# Seed shared by the subsampling step and the train/test split, so that
# Restart Kernel -> Run All always yields the same rows in each partition.
SEED = 10

# Maximum number of points
N = 5000

# Subsample if necessary. The sample is seeded: an unseeded df.sample() draws
# different rows on every run, which makes the split below (and therefore the
# measured fit times) non-reproducible even though the split itself is seeded.
if len(df) > N:
    df = df.sample(n=N, random_state=SEED)

# Extract X, y
# NOTE(review): X is "every column except the last" while y is looked up by
# name; this assumes 'outlier' is the last column of the frame — confirm
# against import_dataset().
X = df.iloc[:, :-1]
y = df['outlier']

# Stratify on y so both partitions keep the same outlier ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=SEED)

print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))

In [None]:
# PyOD Algorithms
# Detector classes to benchmark, listed in the order they will be instantiated.
detector_classes = (
    ABOD,     # 1
    CBLOF,    # 2
    COPOD,    # 3
    ECOD,     # 4
    HBOS,     # 5
    IForest,  # 6
    KNN,      # 7
    LMDD,     # 8
    LOF,      # 9
    MCD,      # 10
    OCSVM,    # 11
    PCA,      # 12
    ROD,      # 13
    SOS,      # 14
)

# One instance per class, each built with no arguments (library defaults).
alg_list = [detector_cls() for detector_cls in detector_classes]

In [None]:
# Time
# Duration of fit() on X_train for each detector, in alg_list order (seconds).
times = []
# Ideally, one would sample uniformly from the given algorithm's hyperparameter space
# For now, just use the default values
for alg in alg_list:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the system clock and can jump (NTP sync,
    # manual adjustment) while a long fit() is running.
    start = time.perf_counter()
    alg.fit(X_train)
    elapsed = time.perf_counter() - start
    print('Elapsed:', elapsed)
    times.append(elapsed)

In [None]:
# Print statistics
# Summarise the raw (uncapped) fit() durations collected in `times`.
avg_time = np.average(times)
std_time = np.std(times)

print('Total time:', sum(times))
print('Average time:', avg_time)
print('Standard deviation:', std_time)
# Budget = 100 x (mean + 3 standard deviations).
# NOTE(review): the factor 100 is presumably a planned number of runs — confirm.
print('Estimated budget:', 100 * (avg_time + 3 * std_time))

In [None]:
# Cap large execution times
lim = 60 # seconds, should be equal to the max allowed threshold for fit()
# Any duration above `lim` is replaced by `lim`; everything else is kept as is.
times_cap = [min(duration, lim) for duration in times]

# Print statistics
avg_capped = np.average(times_cap)
std_capped = np.std(times_cap)

print('Total time:', sum(times_cap))
print('Average time:', avg_capped)
print('Standard deviation:', std_capped)
# Budget = 100 x (mean + 2 standard deviations) of the capped durations.
# NOTE(review): the factor 100 is presumably a planned number of runs — confirm.
print('Estimated budget:', 100 * (avg_capped + 2 * std_capped))