# Setup

In [1]:
# download red wine quality data
! mkdir wine
! wget -P wine https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv

# download and unzip Musk v2
! mkdir musk
! wget -P musk https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z
! uncompress musk/clean2.data.Z

! git clone https://github.com/jsokolowska/anomaly-detection.git
! mv anomaly-detection ad

--2023-06-11 12:18:34--  https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘wine/winequality-red.csv’

winequality-red.csv     [<=>                 ]       0  --.-KB/s               winequality-red.csv     [ <=>                ]  82.23K  --.-KB/s    in 0.05s   

2023-06-11 12:18:34 (1.49 MB/s) - ‘wine/winequality-red.csv’ saved [84199]

--2023-06-11 12:18:34--  https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘musk/clean2.data.Z’

clean2.data.Z     

In [2]:
# install pyod - not present by default in google colab
! pip install pyod


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyod
  Downloading pyod-1.0.9.tar.gz (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.0/150.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-1.0.9-py3-none-any.whl size=184097 sha256=70f67c8ca956abecb5a4869c6485ecd5f2da0ba5698aa52998ad306e8e067505
  Stored in directory: /root/.cache/pip/wheels/83/55/6b/552e083cf5509c0afe808b76cf434f1be284d01a112623bd37
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-1.0.9


In [3]:
WINE_PATH = "./wine/winequality-red.csv"
MUSK_PATH = "./musk/clean2.data"

In [60]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

def score( y_true, y_pred, y_proba):
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_proba)
    f1 = f1_score(y_true, y_pred)
    return precision, recall, f1, roc

# Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from ad.src.anomaly_detection import ClusterBasedAnomalyDetection
from sklearn.cluster import Birch
from sklearn.model_selection import StratifiedKFold
from ad.src.data_loading import load_wine, load_musk
import pandas as pd

In [None]:
df = pd.DataFrame(index=["musk", "wine"], columns=["birch-ldcof", "birch-cblof", "kmeans-ldcof", "kmeans-cblof"])
# Params for wine
birch_grid = {
    "alpha": [0.8, 0.9, 0.95],
    "beta": [3, 5, 10],
    "contamination": [0.1, 0.07, 0.05, 0.03],
    "n_clusters" : [ 3, 5, 7, 10],
    "threshold": [0.2, 0.5, 0.7]
}
kmeans_grid = {
    "alpha": [0.8, 0.9, 0.95],
    "beta": [3, 5, 10],
    "contamination": [0.1, 0.07, 0.05, 0.03],
    "n_clusters": [ 3, 5, 7, 10],
    "random_state": [4512],
    "n_init": [10]
}
N_SPLITS = 5
X_train, X_test, y_train, y_test  = load_wine(WINE_PATH)
skf = StratifiedKFold(n_splits=N_SPLITS, random_state = 91, shuffle = True)

In [None]:
from sklearn.cluster import KMeans
clustering_algs = [("kmeans", KMeans(), kmeans_grid), ("birch", Birch(), birch_grid)]
measures = ["ldcof", "cblof"]
datasets = {"musk": load_musk(MUSK_PATH), "wine": load_wine(WINE_PATH)}

for dataset_name, values in datasets.items():
    X_train, X_test, y_train, y_test  = values
    for name, algorithm, param_grid in clustering_algs:
        for measure in measures:
            cbad = ClusterBasedAnomalyDetection(clustering_estimator=algorithm, dissimilarity_measure=measure)

            search = GridSearchCV(param_grid=param_grid, estimator=cbad, scoring="roc_auc", cv = 5, n_jobs=1)
            search.fit(X_train, y_train)
            print(f"[{name}/{measure}/{dataset_name}] Best params: {search.best_params_}")
            print(f"[{name}/{measure}/{dataset_name}] Best score: {search.best_score_}")
            df[f"{name}-{measure}"][dataset_name] = search.best_params_


324 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
324 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 84, in fit
    self._chosen_measure.fit(X)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 156, in fit
    raise e
  File "/kaggle/working/ad/src/anomaly_detection.py", line 147, in fit
    self._set_big_clusters()
  File "/kaggle/working/ad/src/anomaly_detection.py", line 204, in _set_big_clusters
    raise ValueError("Could not separate into 

[kmeans/ldcof/musk] Best params: {'alpha': 0.8, 'beta': 3, 'contamination': 0.1, 'n_clusters': 5, 'n_init': 10, 'random_state': 4512}
[kmeans/ldcof/musk] Best score: 0.731697522336632


324 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
324 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 84, in fit
    self._chosen_measure.fit(X)
  File "/opt/conda/lib/python3.10/site-packages/pyod/models/cblof.py", line 194, in fit
    self._set_small_large_clusters(n_samples)
  File "/opt/conda/lib/python3.10/site-packages/pyod/models/cblof.py", line 292, in _set_small_large_clusters
    raise ValueError("Could not form valid cluster separation. Please 

[kmeans/cblof/musk] Best params: {'alpha': 0.95, 'beta': 3, 'contamination': 0.1, 'n_clusters': 10, 'n_init': 10, 'random_state': 4512}
[kmeans/cblof/musk] Best score: 0.4181319438290222


828 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
828 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 84, in fit
    self._chosen_measure.fit(X)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 156, in fit
    raise e
  File "/kaggle/working/ad/src/anomaly_detection.py", line 147, in fit
    self._set_big_clusters()
  File "/kaggle/working/ad/src/anomaly_detection.py", line 204, in _set_big_clusters
    raise ValueError("Could not separate into

[birch/ldcof/musk] Best params: {'alpha': 0.8, 'beta': 10, 'contamination': 0.1, 'n_clusters': 5, 'threshold': 0.2}
[birch/ldcof/musk] Best score: 0.762793641322804


828 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
828 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 84, in fit
    self._chosen_measure.fit(X)
  File "/opt/conda/lib/python3.10/site-packages/pyod/models/cblof.py", line 194, in fit
    self._set_small_large_clusters(n_samples)
  File "/opt/conda/lib/python3.10/site-packages/pyod/models/cblof.py", line 292, in _set_small_large_clusters
    raise ValueError("Could not form valid cluster separation. Please

[birch/cblof/musk] Best params: {'alpha': 0.95, 'beta': 3, 'contamination': 0.1, 'n_clusters': 10, 'threshold': 0.5}
[birch/cblof/musk] Best score: 0.3992267436585478


120 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 84, in fit
    self._chosen_measure.fit(X)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 156, in fit
    raise e
  File "/kaggle/working/ad/src/anomaly_detection.py", line 147, in fit
    self._set_big_clusters()
  File "/kaggle/working/ad/src/anomaly_detection.py", line 204, in _set_big_clusters
    raise ValueError("Could not separate into 

[kmeans/ldcof/wine] Best params: {'alpha': 0.8, 'beta': 3, 'contamination': 0.1, 'n_clusters': 10, 'n_init': 10, 'random_state': 4512}
[kmeans/ldcof/wine] Best score: 0.530490667028481


120 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 84, in fit
    self._chosen_measure.fit(X)
  File "/opt/conda/lib/python3.10/site-packages/pyod/models/cblof.py", line 194, in fit
    self._set_small_large_clusters(n_samples)
  File "/opt/conda/lib/python3.10/site-packages/pyod/models/cblof.py", line 292, in _set_small_large_clusters
    raise ValueError("Could not form valid cluster separation. Please 

[kmeans/cblof/wine] Best params: {'alpha': 0.9, 'beta': 3, 'contamination': 0.1, 'n_clusters': 5, 'n_init': 10, 'random_state': 4512}
[kmeans/cblof/wine] Best score: 0.46832160437175024


432 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
432 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 84, in fit
    self._chosen_measure.fit(X)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 156, in fit
    raise e
  File "/kaggle/working/ad/src/anomaly_detection.py", line 147, in fit
    self._set_big_clusters()
  File "/kaggle/working/ad/src/anomaly_detection.py", line 204, in _set_big_clusters
    raise ValueError("Could not separate into

[birch/ldcof/wine] Best params: {'alpha': 0.8, 'beta': 3, 'contamination': 0.1, 'n_clusters': 10, 'threshold': 0.7}
[birch/ldcof/wine] Best score: 0.5317538118145919




[birch/cblof/wine] Best params: {'alpha': 0.9, 'beta': 3, 'contamination': 0.1, 'n_clusters': 5, 'threshold': 0.2}
[birch/cblof/wine] Best score: 0.458695405039281


432 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
432 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/kaggle/working/ad/src/anomaly_detection.py", line 84, in fit
    self._chosen_measure.fit(X)
  File "/opt/conda/lib/python3.10/site-packages/pyod/models/cblof.py", line 194, in fit
    self._set_small_large_clusters(n_samples)
  File "/opt/conda/lib/python3.10/site-packages/pyod/models/cblof.py", line 292, in _set_small_large_clusters
    raise ValueError("Could not form valid cluster separation. Please

In [None]:
df.head()

Unnamed: 0,birch-ldcof,birch-cblof,kmeans-ldcof,kmeans-cblof
musk,"{'alpha': 0.8, 'beta': 10, 'contamination': 0....","{'alpha': 0.95, 'beta': 3, 'contamination': 0....","{'alpha': 0.8, 'beta': 3, 'contamination': 0.1...","{'alpha': 0.95, 'beta': 3, 'contamination': 0...."
wine,"{'alpha': 0.8, 'beta': 3, 'contamination': 0.1...","{'alpha': 0.9, 'beta': 3, 'contamination': 0.1...","{'alpha': 0.8, 'beta': 3, 'contamination': 0.1...","{'alpha': 0.9, 'beta': 3, 'contamination': 0.1..."


In [None]:
df.to_csv("./tuning-results")

# Detection based on IsolationForest and OneClassSVM

In [66]:
# Model and performance
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
# Data processing
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Own imports
from ad.src.data_loading import load_musk, load_wine




In [None]:
DATA_INDEX = 0
QUALITY_INDEX = 1

wine_x, wine_y = load_wine(WINE_PATH) # numpy.ndarray, numpy.ndarray
musk_x, musk_y = load_musk(MUSK_PATH) # pandas.core.frame.DataFrame, pandas.core.series.Series

wine_labels = ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density", "ph", "sulphates", "alcohol"]
musk_labels = range(2, 168 + 1)

wine_data_df = pd.DataFrame(columns=wine_labels, data=wine_x)
musk_data_df = musk_x

## Isolation Forest

### Wine dataset

In [51]:
# Create IF model
# anomaly_inputs = wine_labels
wine_model_IF = IsolationForest(contamination=0.1, random_state=42)
wine_model_IF.fit(wine_x)

In [52]:
# Predict
wine_anomaly_scores = wine_model_IF.decision_function(wine_x)
wine_is_anomaly = wine_model_IF.predict(wine_x)
wine_is_anomaly # -1 means anomaly, +1 means ordinary

array([1, 1, 1, ..., 1, 1, 1])

In [53]:
# View data
wine_data_df["quality"] = wine_y
wine_data_df["anomaly_score"] = wine_anomaly_scores
wine_data_df["anomaly"] = wine_is_anomaly
wine_data_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,anomaly_score,anomaly
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,0.097896,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0,0.074272,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0,0.118202,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0,0.062978,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,0.097896,1


### Musk dataset

In [54]:
# Create IF model
musk_model_IF = IsolationForest(contamination=0.1, random_state=42)
musk_model_IF.fit(musk_data_df[musk_labels])

In [55]:
# Predict
musk_anomaly_scores = musk_model_IF.decision_function(musk_data_df[musk_labels])
musk_is_anomaly = musk_model_IF.predict(musk_data_df[musk_labels])
musk_is_anomaly # -1 means anomaly, +1 means ordinary

array([ 1,  1,  1, ...,  1,  1, -1])

In [59]:
# View data
musk_data_df["anomaly_score"] = musk_anomaly_scores
musk_data_df["anomaly"] = musk_is_anomaly
musk_data_df.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,161,162,163,164,165,166,167,168,anomaly_score,anomaly
3397,41,26,19,-63,-115,66,-167,22,-234,-22,...,82,60,51,229,-71,-216,-26,0.0,0.050699,1
6140,41,62,15,-32,-116,48,-162,94,-218,3,...,55,78,61,214,-70,-201,-26,0.0,0.03548,1
5999,64,-172,28,-32,-107,56,-117,44,-236,-21,...,7,24,84,190,-38,-122,142,0.0,0.045053,1
6242,40,-191,-124,108,-117,20,50,-1,127,-77,...,-97,139,104,227,-73,-215,-8,0.0,0.037912,1
3345,60,26,48,-80,-117,76,-160,74,-235,11,...,-19,-41,69,196,-40,-130,141,0.0,0.041549,1


In [64]:
# Evaluation
precision, recall, f1 = score(wine_y, wine_is_anomaly)

ValueError: ignored

## OneClassSVM

### Wine dataset

In [65]:
# Create SVM model
one_class_svm = OneClassSVM(nu=0.1, kernel='rbf', gamma='auto')
one_class_svm.fit(wine_x)

In [None]:
# Predict
one_class_svm.predict()