# Setup

In [1]:
# Download the red wine quality data.
# `mkdir -p` and `wget -nc` keep this cell safe to re-run: an existing
# directory is not an error and an already-downloaded file is not duplicated.
! mkdir -p wine
! wget -nc -P wine https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv

# Download and uncompress both Musk datasets: v1 (clean1) and v2 (clean2).
# `uncompress` removes the .Z archive, so a re-run downloads it again;
# `-f` lets it overwrite the previously extracted file instead of refusing.
! mkdir -p musk
! wget -nc -P musk https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean1.data.Z
! wget -nc -P musk https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z
! uncompress -f musk/clean1.data.Z
! uncompress -f musk/clean2.data.Z

# Fetch the project code; skip the clone when the checkout already exists.
! test -d anomaly-detection || git clone --branch wip/clustering https://github.com/jsokolowska/anomaly-detection.git

--2023-05-06 08:14:24--  https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84199 (82K) [application/x-httpd-php]
Saving to: ‘wine/winequality-red.csv’


2023-05-06 08:14:25 (648 KB/s) - ‘wine/winequality-red.csv’ saved [84199/84199]

Cloning into 'anomaly-detection'...
remote: Enumerating objects: 58, done.
remote: Counting objects: 100% (58/58), done.
remote: Compressing objects: 100% (37/37), done.
remote: Total 58 (delta 24), reused 49 (delta 18), pack-reused 0
Unpacking objects: 100% (58/58), 1.55 MiB | 6.24 MiB/s, done.


In [None]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

def score(y_true, y_pred, y_proba):
    """Print and return the classification metrics for one detector run.

    Args:
        y_true: Ground-truth labels, passed as the first argument to every
            scikit-learn metric below.
        y_pred: Predicted labels, used for recall, precision and F1.
        y_proba: Scores passed to ``roc_auc_score`` as the ranking input.

    Returns:
        A list ``[precision, recall, f1, roc_auc]`` (note that this order
        differs from the order in which the values are printed).
    """
    # All metrics are computed up front, so nothing is printed if any of
    # them raises.
    recall_value = recall_score(y_true, y_pred)
    precision_value = precision_score(y_true, y_pred)
    auc_value = roc_auc_score(y_true, y_proba)
    f1_value = f1_score(y_true, y_pred)

    report = (
        ("F1", f1_value),
        ("Precision", precision_value),
        ("Recall", recall_value),
        ("ROC AUC", auc_value),
    )
    for label, value in report:
        print(f"{label}: {value}")

    return [precision_value, recall_value, f1_value, auc_value]

# Parameter Tuning

In [None]:
from anomaly_detection import ClusterBasedAnomalyDetection
from sklearn.cluster import Birch
from data_loading import load_wine

# Birch clustering scored with the CBLOF dissimilarity measure.
# `n_clusters`, `birch`, `X` and `y` stay at cell level because the LDCOF
# cell further down reuses them.
n_clusters = 10
birch = Birch(n_clusters=n_clusters)

# NOTE(review): the setup cell downloads the CSV into ./wine/, while this
# path points at ../data/raw/wine/ -- confirm which location is intended.
X, y = load_wine("../data/raw/wine/winequality-red.csv")

cbad_cblof = ClusterBasedAnomalyDetection(
    birch,
    dissimilarity_measure="cblof",
    measure_args={"n_clusters": n_clusters},
)

# `decision_fun` output is fed to ROC AUC and `detect` output to the
# label-based metrics inside `score`; presumably these are continuous
# scores and binary labels respectively -- TODO confirm.
y_scores = cbad_cblof.decision_fun(X)
y_pred = cbad_cblof.detect(X)

print("----- Birch clustering with CBLOF -----")
res = score(y, y_pred, y_scores)

----- Birch clustering with CBLOF -----
F1: 0.06639004149377593
Precision: 0.05
Recall: 0.09876543209876543
ROC AUC: 0.43262740122643506




In [None]:
# Birch clustering scored with the LDCOF dissimilarity measure.
# Relies on `birch`, `n_clusters`, `X` and `y` defined by the CBLOF cell,
# so that cell has to run first.
cbad_ldcof = ClusterBasedAnomalyDetection(
    birch,
    dissimilarity_measure="ldcof",
    measure_args={"n_clusters": n_clusters},
)

# `y_scores` / `y_pred` are overwritten here with the LDCOF results.
y_scores = cbad_ldcof.decision_fun(X)
y_pred = cbad_ldcof.detect(X)

print("----- Birch clustering with LDCOF -----")
res = score(y, y_pred, y_scores)

----- Birch clustering with LDCOF -----
F1: 0.07468879668049792
Precision: 0.05625
Recall: 0.1111111111111111
ROC AUC: 0.49668179378324306
