# Setup

In [1]:
# download red wine quality data
! mkdir wine
! wget -P wine https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv

# download and unzip Musk v2
! mkdir musk
! wget -P musk https://archive.ics.uci.edu/ml/machine-learning-databases/musk/clean2.data.Z
! uncompress musk/clean2.data.Z

# ! git clone --branch wip/clustering https://github.com/jsokolowska/anomaly-detection.git
# ! mv anomaly-detection ad

'wget' is not recognized as an internal or external command,
operable program or batch file.
'wget' is not recognized as an internal or external command,
operable program or batch file.
'uncompress' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
# install pyod - not present by default in google colab
#! pip install --force-reinstall numpy==1.22.0
! pip install pyod




You should consider upgrading via the 'C:\Users\Asia\PycharmProjects\anomaly-detection\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [7]:
# Locations of the dataset files fetched by the download cell at the top of the
# notebook, relative to the working directory.
MUSK_PATH = "./musk/clean2.data"
WINE_PATH = "./wine/winequality-red.csv"

In [8]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

def score(y_true, y_pred, y_proba):
    """Compute precision, recall, F1 and ROC AUC for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels; passed to precision_score, recall_score and f1_score.
        y_proba: Predicted scores; passed to roc_auc_score.

    Returns:
        Tuple ``(precision, recall, f1, roc)``.
    """
    metrics = {
        "recall": recall_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "roc": roc_auc_score(y_true, y_proba),
        "f1": f1_score(y_true, y_pred),
    }
    return metrics["precision"], metrics["recall"], metrics["f1"], metrics["roc"]

# Parameter Tuning

In [6]:
import os

# Sanity check of the working directory contents (the dataset folders and the
# `ad` package directory are expected here).
workdir_entries = os.listdir(".")
workdir_entries

['wine', 'ad', '__notebook_source__.ipynb', '.virtual_documents', 'musk']

In [9]:
from sklearn.model_selection import GridSearchCV
from ad.src.anomaly_detection import ClusterBasedAnomalyDetection
from sklearn.cluster import Birch
from sklearn.model_selection import StratifiedKFold
from ad.src.data_loading import load_wine, load_musk
import pandas as pd

In [10]:
# Fold count and a seeded, shuffled stratified splitter.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=91)

# Search space handed to GridSearchCV for the Birch-based detector.
param_grid = dict(
    alpha=[0.8, 0.9, 0.95],
    beta=[3, 5, 10],
    contamination=[0.1, 0.07, 0.05, 0.03],
    n_clusters=[3, 5, 7, 10],
    threshold=[0.2, 0.5, 0.7],
)

# Result table: one row per dataset, one column per "<clustering>-<measure>" pair.
df = pd.DataFrame(
    index=["musk", "wine"],
    columns=["birch-ldcof", "birch-cblof", "dbscan-ldcof", "dbscan-cblof"],
)

# Load the wine dataset as returned by load_wine.
X, y = load_wine(WINE_PATH)

In [None]:
# Grid-search the Birch-based detector for every dataset / dissimilarity
# measure and store the winning parameter set in the `df` result table.
clustering_algs = [("birch", Birch())]
measures = ["ldcof", "cblof"]
datasets = {"musk": load_musk(MUSK_PATH), "wine": load_wine(WINE_PATH)}

for dataset_name, (X, y) in datasets.items():
    for name, algorithm in clustering_algs:
        for measure in measures:
            tag = f"[{name}/{measure}/{dataset_name}]"
            cbad = ClusterBasedAnomalyDetection(clustering_estimator=algorithm, dissimilarity_measure=measure)

            # Use the seeded, shuffled StratifiedKFold (`skf`) defined alongside
            # param_grid instead of a bare `cv=5`, so every fold keeps both
            # classes, which the "roc_auc" scorer requires.
            search = GridSearchCV(param_grid=param_grid, estimator=cbad, scoring="roc_auc", cv=skf, n_jobs=1)
            search.fit(X, y)
            print(f"{tag} Best params: {search.best_params_}")
            print(f"{tag} Best score: {search.best_score_}")
            # Single-step `.at` assignment; the chained form `df[col][row] = ...`
            # may write to a temporary copy and leave `df` unchanged.
            df.at[dataset_name, f"{name}-{measure}"] = search.best_params_


In [None]:
# Best parameters recorded by the grid search above: rows are datasets,
# columns are "<clustering>-<measure>" combinations.
df

In [11]:
from ad.src.wrappers import DBSCANWrapped

In [12]:
# Search space handed to GridSearchCV for the DBSCAN-based detector.
dbscan_grid = dict(
    alpha=[0.8, 0.9, 0.95],
    beta=[3, 5, 10],
    contamination=[0.1, 0.07, 0.05, 0.03],
    eps=[0.3, 0.5, 1, 2],
    min_samples=[1, 2],
    p=[1, 2, 3],
)

In [14]:
from sklearn.cluster import DBSCAN

# Grid-search the DBSCAN-based detector for every dataset / dissimilarity
# measure and store the winning parameter set in the `df` result table.
#
# NOTE(review): the recorded run of this cell stopped with
#   TypeError: DBSCANWrapped.__init__() got an unexpected keyword argument 'algorithm'
# This looks like a problem inside DBSCANWrapped (ad/src/wrappers.py), presumably
# triggered when scikit-learn clones the estimator - confirm and fix it there.
#
# The label is "dbscan" (it used to be "birch"), so results land in the
# "dbscan-*" columns of `df` instead of overwriting the Birch results, and the
# printed tags name the right algorithm.
clustering_algs = [("dbscan", DBSCANWrapped(dbscan=DBSCAN()))]
measures = ["ldcof", "cblof"]
datasets = {"musk": load_musk(MUSK_PATH), "wine": load_wine(WINE_PATH)}

for dataset_name, (X, y) in datasets.items():
    for name, algorithm in clustering_algs:
        for measure in measures:
            tag = f"[{name}/{measure}/{dataset_name}]"
            # Use the estimator from `clustering_algs` rather than building a
            # second, identical one that ignores the loop variable.
            cbad = ClusterBasedAnomalyDetection(clustering_estimator=algorithm, dissimilarity_measure=measure)

            # Use the seeded, shuffled StratifiedKFold (`skf`) defined alongside
            # param_grid instead of a bare `cv=5`, so every fold keeps both
            # classes, which the "roc_auc" scorer requires.
            search = GridSearchCV(param_grid=dbscan_grid, estimator=cbad, scoring="roc_auc", cv=skf, n_jobs=1, verbose=10)
            search.fit(X, y)
            print(f"{tag} Best params: {search.best_params_}")
            print(f"{tag} Best score: {search.best_score_}")
            # Single-step `.at` assignment; the chained form `df[col][row] = ...`
            # may write to a temporary copy and leave `df` unchanged.
            df.at[dataset_name, f"{name}-{measure}"] = search.best_params_


TypeError: DBSCANWrapped.__init__() got an unexpected keyword argument 'algorithm'