## CASH for anomaly detection using random search

In [1]:
# Imports
import os, sys, time, warnings, random
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, \
    StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score, average_precision_score
# Hide warnings
warnings.filterwarnings('ignore')
# Add parent path to sys
p = os.path.abspath('..')
sys.path.insert(1, p)
# Import from parent
from utils import import_dataset, create_search_space

In [2]:
# Import dataset
dataset = '../../data/Cardiotocography_withoutdupl_norm_10_v10.arff'
df = import_dataset(dataset)

# Sampling
N = 5000
if(len(df) > N):
    df = df.sample(n=N)

# Extract X, y
X  = df.iloc[:, :-1]
y = df['outlier']

# Split to train, test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=10)

# Prints
print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))

Ratio of outliers in training set: 0.09978150036416605
Ratio of outliers in test set: 0.10043668122270742
Training size: 1373
Test size: 458


In [3]:
# Create algorithm entries and their search spaces
models, search_space, evaluated = create_search_space()
print('Number of algorithms:', len(models))

Number of algorithms: 5


In [4]:
# Initialize
best_model = None
best_params = {}
best_score = 0.0
budget = 60 # time budget in seconds

# Track progress
with tqdm(total=budget) as pbar:
    # Random search
    while budget > 0:

        # Step 1 - Sample a model uniformally
        [key] = random.sample(list(models), 1)
        model = models[key]
        hp_space = search_space[key]

        # Step 2 - Sample a configuration from its hyperparam space
        params = hp_space.sample_configuration().get_dictionary()
        model.set_params(**params)

        # cv strategy
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3)

        # Evaluate
        start = time.time()
        scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
        end = time.time()
        elapsed = end - start
        budget -= elapsed
        evaluated[key] += 1

        # Update best model
        if scores.mean() > best_score:
            best_model = key
            best_score = scores.mean()
            best_params = params

        # Update progress bar
        pbar.update(elapsed)

62.816235065460205it [01:02,  1.00s/it]                        


In [5]:
# Results
print('Distribution of runs:')
for key, val in evaluated.items():
    print('\t' + key + ':\t', val)
print('')
print('Best model:')
print('\tName:', best_model)
print('\tConfiguration:', best_params)
print('\tROC AUC score:', best_score)
# Apply best model on test set
final_model = models[best_model].set_params(**best_params)
final_model.fit(X_train, y_train) # refit
y_pred = final_model.predict_proba(X_test)
auc_score = roc_auc_score(y_test, y_pred[:,1])
avg_prec_score = average_precision_score(y_test, y_pred[:,1])
print('')
print('Test performance:')
print('\tROC AUC score:\t\t', auc_score)
print('\tAverage pecision score:\t', avg_prec_score)

Distribution of runs:
	cblof:	 15
	copod:	 10
	ifor:	 15
	knn:	 10
	lof:	 9

Best model:
	Name: ifor
	Configuration: {'bootstrap': False, 'contamination': 0.32, 'max_features': 0.13, 'max_samples': 0.69, 'n_estimators': 38}
	ROC AUC score: 0.794438235487476

Test performance:
	ROC AUC score:		 0.7376530181511186
	Average pecision score:	 0.2409203229606974
