# Setup

In [1]:
# download red wine quality data
#! mkdir wine
#! wget -P wine https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv

# download and unzip Musk v2
# ! mkdir musk
# ! wget -P musk https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z
# ! uncompress musk/clean2.data.Z

# ! git clone --branch wip/clustering https://github.com/jsokolowska/anomaly-detection.git
# ! mv anomaly-detection ad

In [2]:
# install pyod - not present by default in google colab
! pip install pyod


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
WINE_PATH = "./wine/winequality-red.csv"
MUSK_PATH = "./musk/clean2.data"

In [1]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

def score( y_true, y_pred, y_proba):
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_proba)
    f1 = f1_score(y_true, y_pred)
    return precision, recall, f1, roc

# Parameter Tuning

In [4]:
from sklearn.model_selection import GridSearchCV
from anomaly_detection import ClusterBasedAnomalyDetection
from sklearn.cluster import Birch
from sklearn.model_selection import StratifiedKFold
from data_loading import load_wine, load_musk
import pandas as pd

In [5]:
df = pd.DataFrame(index=["musk", "wine"], columns=["birch-lcdof", "birch-cblof", "dbscan-lcdof", "dbscan-cblof"])
# Params for wine
param_grid = {
    "alpha": [0.8, 0.9, 0.95],
    "beta": [3, 5, 10],
    "contamination": [0.1, 0.07, 0.05, 0.03],
    "n_clusters" : [10, 3, 4, 9, 16, 25],
    "threshold": [0.2, 0.5, 0.7]
}
N_SPLITS = 5
X, y = load_wine(WINE_PATH)
skf = StratifiedKFold(n_splits=N_SPLITS, random_state = 91, shuffle = True)

In [6]:
clustering_algs = [("birch", Birch())]
measures = ["ldcof", "cblof"]
datasets = {"musk": load_musk(MUSK_PATH), "wine": load_wine(WINE_PATH)}

for dataset_name, values in datasets.items():
    X,y = values
    for name, algorithm in clustering_algs:
        for measure in measures:
            cbad = ClusterBasedAnomalyDetection(clustering_estimator=algorithm, dissimilarity_measure=measure)

            search = GridSearchCV(param_grid=param_grid, estimator=cbad, scoring="roc_auc", cv = 5, n_jobs=1, verbose=4)
            search.fit(X, y)
            print(f"[{name}/{measure}/{dataset_name}] Best params: {search.best_params_}")
            print(f"[{name}/{measure}/{dataset_name}] Best score: {search.best_score_}")
            df[f"{name}-{measure}"][dataset_name] = search.best_params_


Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[CV 1/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=10, threshold=0.2;, score=0.732 total time=   1.1s
[CV 2/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=10, threshold=0.2;, score=0.554 total time=   1.2s
[CV 3/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=10, threshold=0.2;, score=0.696 total time=   1.1s
[CV 4/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=10, threshold=0.2;, score=0.679 total time=   1.1s
[CV 5/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=10, threshold=0.2;, score=0.658 total time=   1.1s
[CV 1/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=10, threshold=0.5;, score=0.732 total time=   1.1s
[CV 2/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=10, threshold=0.5;, score=0.554 total time=   1.1s
[CV 3/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=10, threshold=0.5;, score=0.696 total time=   1.1s
[CV 4/5] END alpha=0.8, beta=3, 

ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 1/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.2;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 2/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.2;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 3/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.2;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 4/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.2;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 5/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.2;, score=nan total time=   1.2s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 1/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.5;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 2/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.5;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 3/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.5;, score=nan total time=   1.3s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 4/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.5;, score=nan total time=   1.2s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 5/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.5;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 1/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.7;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 2/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.7;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 3/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.7;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 4/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.7;, score=nan total time=   1.1s


ERROR:root:['Traceback (most recent call last):\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 150, in fit\n    self._set_big_clusters()\n', '  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/src/anomaly_detection.py", line 204, in _set_big_clusters\n    raise ValueError("Could not separate into large and small clusters")\n', 'ValueError: Could not separate into large and small clusters\n']
Traceback (most recent call last):
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/home/joanna/Documents/Studia/22Z/SAD/anomaly-detection/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py

[CV 5/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=3, threshold=0.7;, score=nan total time=   1.1s
[CV 1/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=4, threshold=0.2;, score=0.790 total time=   1.1s
[CV 2/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=4, threshold=0.2;, score=0.741 total time=   1.1s
[CV 3/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=4, threshold=0.2;, score=0.717 total time=   1.1s
[CV 4/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=4, threshold=0.2;, score=0.732 total time=   1.3s
[CV 5/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=4, threshold=0.2;, score=0.805 total time=   1.1s
[CV 1/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=4, threshold=0.5;, score=0.790 total time=   1.2s
[CV 2/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=4, threshold=0.5;, score=0.741 total time=   1.1s
[CV 3/5] END alpha=0.8, beta=3, contamination=0.1, n_clusters=4, threshold=0.5;, score=0.717 total time=  

KeyboardInterrupt: 