# Setup

In [1]:
# download red wine quality data
! mkdir wine
! wget -P wine https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv

# download and unzip Musk v2
! mkdir musk
! wget -P musk https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z
! uncompress musk/clean2.data.Z

! git clone --branch wip/clustering https://github.com/jsokolowska/anomaly-detection.git
! mv anomaly-detection ad

--2023-05-07 07:07:04--  https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84199 (82K) [application/x-httpd-php]
Saving to: ‘wine/winequality-red.csv’


2023-05-07 07:07:04 (636 KB/s) - ‘wine/winequality-red.csv’ saved [84199/84199]

--2023-05-07 07:07:04--  https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1470557 (1.4M) [application/x-httpd-php]
Saving to: ‘musk/clean2.data.Z’


2023-05-07 07:07:05 (3.61 MB/s) - ‘musk/clean2.data.Z’ saved [1470557/1470557]

Cloning into 'anomaly-detection'...
re

In [2]:
# install pyod - not present by default in google colab
! pip install pyod

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyod
  Downloading pyod-1.0.9.tar.gz (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.0/150.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-1.0.9-py3-none-any.whl size=184112 sha256=988f944603dcdf770dc25412a77dede6ac12b7a8798922f32cc34470336bf4a6
  Stored in directory: /root/.cache/pip/wheels/83/55/6b/552e083cf5509c0afe808b76cf434f1be284d01a112623bd37
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-1.0.9


In [3]:
WINE_PATH = "./wine/winequality-red.csv"
MUSK_PATH = "./musk/clean2.data"

In [28]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

def score( y_true, y_pred, y_proba):
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_proba)
    f1 = f1_score(y_true, y_pred)
    return precision, recall, f1, roc

# Parameter Tuning

In [5]:
from ad.src.anomaly_detection import ClusterBasedAnomalyDetection
from sklearn.cluster import Birch
from ad.src.data_loading import load_wine, load_musk

#Birch
n_clusters = 10
birch = Birch(n_clusters=n_clusters)
X, y = load_wine("./wine/winequality-red.csv")

cbad_cblof = ClusterBasedAnomalyDetection(birch, dissimilarity_measure="cblof", measure_args={"n_clusters": n_clusters})
cbad_cblof.fit(X)
y_scores = cbad_cblof.decision_fun(X)
y_pred = cbad_cblof.predict(X)

print(f"----- Birch clustering with CBLOF -----")
res = score(y, y_pred, y_scores)



----- Birch clustering with CBLOF -----
F1: 0.06639004149377593
Precision: 0.05
Recall: 0.09876543209876543
ROC AUC: 0.43262740122643506


In [6]:
cbad_ldcof = ClusterBasedAnomalyDetection(birch, dissimilarity_measure="ldcof")
cbad_ldcof.fit(X)
y_scores = cbad_ldcof.decision_fun(X)
y_pred = cbad_ldcof.predict(X)

print(f"----- Birch clustering with LDCOF -----")
res = score(y, y_pred, y_scores)

----- Birch clustering with LDCOF -----
F1: 0.07468879668049792
Precision: 0.05625
Recall: 0.1111111111111111
ROC AUC: 0.49668179378324306


In [7]:
X.shape

(1599, 11)

In [8]:
40 * 40

1600

In [26]:
import warnings
warnings.filterwarnings("ignore")

In [32]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd

# Params for wine

birch = {
    "n_clusters" : [2, 3, 4, 9, 16, 25],
    "threshold": [0.2, 0.5, 0.7]
}

cblof = {
    "alpha": [0.8, 0.9, 0.95],
    "beta": [3, 5, 10],
    "contamination": [0.1, 0.07, 0.05, 0.03]
}

results = []

#for each dataset, algorithm and dissimilarity measure 
# meaning: wine + birch
N_SPLITS = 5
X, y = load_wine(WINE_PATH)
skf = StratifiedKFold(n_splits=N_SPLITS, random_state = 91, shuffle = True)

for contamination in [0.1, 0.07, 0.05, 0.03]:
  for threshold in [0.2, 0.5, 0.7]:
    for alpha in [0.8, 0.9, 0.95]:
      for beta in [3, 5, 10]:
        for n_clusters in [3, 4, 9, 16, 25]:
          birch = Birch(n_clusters=n_clusters)
          cbad_cblof = ClusterBasedAnomalyDetection(birch, dissimilarity_measure="cblof",  contamination = contamination,
                                                    measure_args={"n_clusters": n_clusters, "alpha": alpha, "beta": beta})
          prec_l = []
          rec_l = []
          f1_l = []
          roc_l = []
          for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_train = X[train_index]
            X_test = X[test_index]
            y_train = y[train_index]
            y_test = y[test_index]

            paramset = {"n_clusters": n_clusters, "threshold" : threshold, "alpha": alpha, "beta": beta, "contamination": contamination}

            #Could not form valid cluster separation. Please change n_clusters or change clustering method
            try: 
              cbad_cblof.fit(X_train) 
            except ValueError as ex: 
              print(f"Paramset: {paramset} failed with error \"{ex}\". Contining on to the next one")
              break
            
            y_scores = cbad_cblof.decision_fun(X_test)
            y_pred = cbad_cblof.predict(X_test)
            precision, recall, f1, roc = score(y_test, y_pred, y_scores)

            prec_l.append(precision)
            rec_l.append(recall)
            f1_l.append(f1)
            roc_l.append(roc)

            print(f"Results [{i}/{N_SPLITS}]: {res} for paramset {paramset}")
          
          results.append(
              {
                "paramset": {"n_clusters": n_clusters, "threshold" : threshold, "alpha": alpha, "beta": beta, "contamination": contamination},
                "lists": {"precision": prec_l, "recall": rec_l, "f1": f1_l, "auc roc": roc_l},
                "averages": {"precision": sum(prec_l)/N_SPLITS, "recall": sum(rec_l)/N_SPLITS, "f1": sum(f1_l)/N_SPLITS, "auc roc": sum(roc_l)/N_SPLITS}
              }
          )

Results [0/5]: [0.045454545454545456, 0.125, 0.06666666666666667, 0.46710526315789475] for paramset {'n_clusters': 3, 'threshold': 0.2, 'alpha': 0.8, 'beta': 3, 'contamination': 0.1}
Results [1/5]: [0.045454545454545456, 0.125, 0.06666666666666667, 0.46710526315789475] for paramset {'n_clusters': 3, 'threshold': 0.2, 'alpha': 0.8, 'beta': 3, 'contamination': 0.1}
Results [2/5]: [0.045454545454545456, 0.125, 0.06666666666666667, 0.46710526315789475] for paramset {'n_clusters': 3, 'threshold': 0.2, 'alpha': 0.8, 'beta': 3, 'contamination': 0.1}
Results [3/5]: [0.045454545454545456, 0.125, 0.06666666666666667, 0.46710526315789475] for paramset {'n_clusters': 3, 'threshold': 0.2, 'alpha': 0.8, 'beta': 3, 'contamination': 0.1}
Results [4/5]: [0.045454545454545456, 0.125, 0.06666666666666667, 0.46710526315789475] for paramset {'n_clusters': 3, 'threshold': 0.2, 'alpha': 0.8, 'beta': 3, 'contamination': 0.1}
Results [0/5]: [0.045454545454545456, 0.125, 0.06666666666666667, 0.46710526315789475

KeyboardInterrupt: ignored

In [29]:
results

NameError: ignored

In [15]:
pd.Series(y).value_counts()

0    1518
1      81
dtype: int64