### Automated anomaly detection using PyOD and Auto-Sklearn
Hyperband search

In [1]:
from pprint import pprint
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
from autosklearn.metrics import roc_auc, average_precision
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, PredefinedSplit
import autosklearn.classification
import os, sys
# Put the parent directory on the module search path; the `utils` import below
# presumably resolves from there (verify against the repository layout).
p = os.path.abspath('..')
# Guarded so that re-running this cell does not keep adding duplicate entries
# to sys.path.
if p not in sys.path:
    sys.path.insert(1, p)
from utils import import_dataset, add_pyod_models_to_pipeline, balanced_split, get_metric_result
# Add models to Auto-Sklearn
add_pyod_models_to_pipeline()

In [2]:
def get_bosh_object_callback(budget_type, initial_budget=10.0, max_budget=100, eta=2, min_chall=1):
    """Create a ``get_smac_object_callback`` that uses successive halving.

    The returned function builds a ``SMAC4AC`` facade whose intensifier is
    ``SuccessiveHalving``.

    Parameters
    ----------
    budget_type
        Value stored under ``ta_kwargs['budget_type']`` before the target
        algorithm runner kwargs are handed to SMAC.
    initial_budget, max_budget, eta, min_chall
        Forwarded unchanged to the intensifier through ``intensifier_kwargs``.
        The defaults reproduce the previously hard-coded schedule
        (10.0, 100, 2, 1).

    Returns
    -------
    callable
        ``get_smac_object(scenario_dict, seed, ta, ta_kwargs,
        metalearning_configurations, n_jobs, dask_client)`` returning the
        configured ``SMAC4AC`` instance.
    """
    intensifier_settings = {
        'initial_budget': initial_budget,
        'max_budget': max_budget,
        'eta': eta,
        'min_chall': min_chall,
    }

    def get_smac_object(
        scenario_dict,
        seed,
        ta,
        ta_kwargs,
        metalearning_configurations,
        n_jobs,
        dask_client,
    ):
        """Build the SMAC object.

        Raises
        ------
        ValueError
            If ``n_jobs > 1`` or ``dask_client.nthreads()`` has more than one
            entry.
        """
        # SMAC is imported lazily, only when the callback is actually invoked.
        from smac.facade.smac_ac_facade import SMAC4AC
        from smac.intensification.successive_halving import SuccessiveHalving
        from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
        from smac.scenario.scenario import Scenario

        if n_jobs > 1 or (dask_client and len(dask_client.nthreads()) > 1):
            raise ValueError("Please make sure to guard the code invoking Auto-sklearn by "
                             "`if __name__ == '__main__'` and remove this exception.")

        scenario = Scenario(scenario_dict)
        if len(metalearning_configurations) > 0:
            # Start from the default configuration, then the meta-learning suggestions.
            default_config = scenario.cs.get_default_configuration()
            initial_configurations = [default_config] + metalearning_configurations
        else:
            initial_configurations = None
        rh2EPM = RunHistory2EPM4LogCost

        # The caller's dict is updated in place, exactly as before.
        ta_kwargs['budget_type'] = budget_type

        return SMAC4AC(
            scenario=scenario,
            rng=seed,
            runhistory2epm=rh2EPM,
            tae_runner=ta,
            tae_runner_kwargs=ta_kwargs,
            initial_configurations=initial_configurations,
            run_id=seed,
            intensifier=SuccessiveHalving,
            # Fresh copy per call so separate SMAC objects never share one dict.
            intensifier_kwargs=dict(intensifier_settings),
            n_jobs=n_jobs,
            dask_client=dask_client,
        )
    return get_smac_object

In [3]:
def get_bohb_object_callback(budget_type, initial_budget=10.0, max_budget=100, eta=2, min_chall=1):
    """Create a ``get_smac_object_callback`` that uses Hyperband.

    The returned function builds a ``SMAC4AC`` facade whose intensifier is
    ``Hyperband``.

    Parameters
    ----------
    budget_type
        Value stored under ``ta_kwargs['budget_type']`` before the target
        algorithm runner kwargs are handed to SMAC.
    initial_budget, max_budget, eta, min_chall
        Forwarded unchanged to the intensifier through ``intensifier_kwargs``.
        The defaults reproduce the previously hard-coded schedule
        (10.0, 100, 2, 1).

    Returns
    -------
    callable
        ``get_smac_object(scenario_dict, seed, ta, ta_kwargs,
        metalearning_configurations, n_jobs, dask_client)`` returning the
        configured ``SMAC4AC`` instance.
    """
    intensifier_settings = {
        'initial_budget': initial_budget,
        'max_budget': max_budget,
        'eta': eta,
        'min_chall': min_chall,
    }

    def get_smac_object(
        scenario_dict,
        seed,
        ta,
        ta_kwargs,
        metalearning_configurations,
        n_jobs,
        dask_client,
    ):
        """Build the SMAC object.

        Raises
        ------
        ValueError
            If ``n_jobs > 1`` or ``dask_client.nthreads()`` has more than one
            entry.
        """
        # SMAC is imported lazily, only when the callback is actually invoked.
        from smac.facade.smac_ac_facade import SMAC4AC
        from smac.intensification.hyperband import Hyperband
        from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
        from smac.scenario.scenario import Scenario

        if n_jobs > 1 or (dask_client and len(dask_client.nthreads()) > 1):
            raise ValueError("Please make sure to guard the code invoking Auto-sklearn by "
                             "`if __name__ == '__main__'` and remove this exception.")

        scenario = Scenario(scenario_dict)
        if len(metalearning_configurations) > 0:
            # Start from the default configuration, then the meta-learning suggestions.
            default_config = scenario.cs.get_default_configuration()
            initial_configurations = [default_config] + metalearning_configurations
        else:
            initial_configurations = None
        rh2EPM = RunHistory2EPM4LogCost

        # The caller's dict is updated in place, exactly as before.
        ta_kwargs['budget_type'] = budget_type

        return SMAC4AC(
            scenario=scenario,
            rng=seed,
            runhistory2epm=rh2EPM,
            tae_runner=ta,
            tae_runner_kwargs=ta_kwargs,
            initial_configurations=initial_configurations,
            run_id=seed,
            intensifier=Hyperband,
            # Fresh copy per call so separate SMAC objects never share one dict.
            intensifier_kwargs=dict(intensifier_settings),
            n_jobs=n_jobs,
            dask_client=dask_client,
        )
    return get_smac_object

In [4]:
# Classifiers to be included: component names of the PyOD-based detectors,
# presumably the ones registered by add_pyod_models_to_pipeline() -- TODO confirm.
# NOTE(review): the AutoML cell in this notebook only mentions this list in a
# commented-out 'classifier' entry, so it does not affect the search as written.
classifiers = [
    'CBLOFClassifier',
    'COPODClassifier',
    'IForestClassifier',
    'KNNClassifier',
    'LOFClassifier',
]

In [5]:
# Import DataFrame
df = import_dataset('../../data/Cardiotocography_withoutdupl_norm_05_v10.arff')
# Subsample datasets larger than N rows. The draw is seeded (same seed as the
# train/test split below) so that a Restart & Run All yields the same rows.
N = 5000
if len(df) > N:
    df = df.sample(n=N, random_state=10)
# Extract X, y
# NOTE(review): X takes every column except the last while y is read by name;
# this assumes 'outlier' is the last column of df -- confirm against import_dataset.
X = df.iloc[:, :-1]
y = df['outlier']
# Split to train, test (70/30, stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=10)

In [7]:
# Resampling strategy: one predefined train/validation fold whose assignment
# comes from the project helper balanced_split() applied to the training labels.
# Alternative kept for reference:
#resampling_strategy = StratifiedShuffleSplit(n_splits=5, test_size=0.3)
selected_indices = balanced_split(y_train)
resampling_strategy = PredefinedSplit(test_fold=selected_indices)

# Search space handed to auto-sklearn via `include`.
search_space = {
    'classifier': ['extra_trees', 'gradient_boosting', 'random_forest'],
    #'classifier': classifiers,
    'feature_preprocessor': ['no_preprocessing'],
}

# NOTE(review): the notebook heading mentions a Hyperband search, but the
# callback selected here is the SuccessiveHalving one; get_bohb_object_callback
# is the Hyperband variant. Confirm which intensifier is intended.
automl_settings = dict(
    time_left_for_this_task=120,
    per_run_time_limit=20,
    metric=roc_auc,
    scoring_functions=[roc_auc, average_precision],
    initial_configurations_via_metalearning=0,
    ensemble_size=0,
    resampling_strategy=resampling_strategy,
    include=search_space,
    get_smac_object_callback=get_bosh_object_callback('iterations'),
    delete_tmp_folder_after_terminate=False,
)
automl = autosklearn.classification.AutoSklearnClassifier(**automl_settings)
automl.fit(X_train, y_train, X_test, y_test, dataset_name='cardiotocography')

# Print stats
print(automl.sprint_statistics())



auto-sklearn results:
  Dataset name: cardiotocography
  Metric: roc_auc
  Best validation score: 0.701235
  Number of target algorithm runs: 63
  Number of successful target algorithm runs: 55
  Number of crashed target algorithm runs: 7
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0

