In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score, confusion_matrix

# define g-mean score
def gmean_score(y_true, y_pred):
    """Geometric mean of sensitivity and specificity for binary labels.

    Parameters
    ----------
    y_true : array-like of bool or {0, 1}
        Ground-truth labels; truthy entries are the positive (anomalous) class.
    y_pred : array-like of bool or {0, 1}
        Predicted labels, same number of entries as ``y_true``.

    Returns
    -------
    float
        ``sqrt(sensitivity * specificity)``. ``nan`` is returned when
        ``y_true`` contains no positives or no negatives, because one of the
        two rates is then undefined.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of entries.
    """
    actual = np.asarray(y_true).astype(bool).ravel()
    predicted = np.asarray(y_pred).astype(bool).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {actual.shape[0]} and {predicted.shape[0]}"
        )

    # Count the four outcome cells explicitly so that all of them exist even
    # when one class never occurs in the inputs.
    tp = int((actual & predicted).sum())
    fn = int((actual & ~predicted).sum())
    tn = int((~actual & ~predicted).sum())
    fp = int((~actual & predicted).sum())

    n_positive = tp + fn
    n_negative = tn + fp
    if n_positive == 0 or n_negative == 0:
        # Sensitivity or specificity would be 0/0: report "undefined".
        return np.nan

    sensitivity = tp / n_positive  # recall
    specificity = tn / n_negative
    return np.sqrt(sensitivity * specificity)

import hyperopt
from hyperopt.pyll import scope

#### Data

In [48]:
def read_dataset(no: int, train_size: float = 0.6, scale=True, data_dir: str = "data"):
    """Load one dataset CSV and split it, in file order, into train/test/val parts.

    Parameters
    ----------
    no : int
        Dataset number; the file read is ``<data_dir>/tcm5_dataset_<no>.csv``.
    train_size : float, default 0.6
        Fraction of rows (taken from the start of the file) used for training.
        The remaining rows are split in half: first half "test", second "val".
    scale : bool, default True
        If true, features are min-max scaled with a scaler fitted on the
        training part only.
    data_dir : str, default "data"
        Directory that contains the dataset CSV files.

    Returns
    -------
    dict
        ``{"X": {...}, "Y": {...}}`` where each inner dict holds numpy arrays
        under ``"train"``, ``"test"`` and ``"val"`` plus the original column
        index under ``"columns"``.
    """
    # Use "/" as separator: it resolves on Windows and POSIX alike, whereas a
    # backslash is an ordinary file-name character on POSIX systems.
    df = pd.read_csv(f"{data_dir}/tcm5_dataset_{no}.csv")

    # Label-based slices are inclusive: features are every column up to and
    # including "motor_power_5"; targets are "Anomaly_Reduction" and onwards.
    X = df.loc[:, :"motor_power_5"]
    Y = df.loc[:, "Anomaly_Reduction":]

    # shuffle=False keeps the row order, so the three parts are contiguous.
    X_train, X_test_val, Y_train, Y_test_val = train_test_split(X, Y, train_size=train_size, shuffle=False)
    X_test, X_val, Y_test, Y_val = train_test_split(X_test_val, Y_test_val, test_size=0.5, shuffle=False)

    # convert to numpy arrays
    X_train = X_train.values
    X_test = X_test.values
    X_val = X_val.values
    Y_train = Y_train.values
    Y_test = Y_test.values
    Y_val = Y_val.values

    if scale:
        # Fit on the training part only so no test/val statistics leak in.
        scaler = MinMaxScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        X_val = scaler.transform(X_val)

    data = {
        "X": {
            "train": X_train,
            "test": X_test,
            "val": X_val,
            "columns": X.columns
        },
        "Y": {
            "train": Y_train,
            "test": Y_test,
            "val": Y_val,
            "columns": Y.columns
        }
    }

    return data

### PyOD

In [49]:
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.iforest import IForest
from pyod.models.loda import LODA
from pyod.models.hbos import HBOS
from pyod.models.anogan import AnoGAN
from pyod.models.lof import LocalOutlierFactor
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA

class AutoEncoderWrapper:
    """Adapter around ``AutoEncoder`` that accepts two flat layer-size arguments.

    The keyword arguments ``hidden_neuron_1`` and ``hidden_neuron_2`` are
    required; they are removed and passed on as the two-element list
    ``hidden_neuron_list``. Every other keyword argument is forwarded
    unchanged, and ``verbose=0`` is always set.
    """

    def __init__(self, **kwargs):
        # Pull both scalar sizes out first, then hand them over as one list.
        layer_sizes = [kwargs.pop(key) for key in ('hidden_neuron_1', 'hidden_neuron_2')]
        kwargs["hidden_neuron_list"] = layer_sizes

        self._model = AutoEncoder(verbose=0, **kwargs)

    def fit(self, X):
        """Fit the wrapped model on ``X``; returns nothing."""
        self._model.fit(X)

    def decision_function(self, X):
        """Return the wrapped model's ``decision_function`` output for ``X``."""
        scores = self._model.decision_function(X)
        return scores
    
class BaselineModel:
    """Reference detector that scores every sample with uniform random noise.

    It exposes the same ``fit`` / ``decision_function`` pair as the other
    models so it can go through the same tuning and validation code.
    """

    def __init__(self, anomaly_ratio=0.05):
        # Stored only; neither fit() nor decision_function() reads it.
        self.anomaly_ratio = anomaly_ratio

    def fit(self, X):
        """Do nothing: the baseline has no trainable state."""
        return None

    def decision_function(self, X):
        """Return one score drawn from U(0, 1) per row of ``X`` (global numpy RNG)."""
        n_rows = X.shape[0]
        return np.random.uniform(0, 1, size=(n_rows,))

In [50]:
def get_best_threshold_quantile(y_true, anomaly_score, q_min=0.7, q_max=1.0):
    """Pick the score quantile whose threshold gives the highest F1.

    100 evenly spaced quantile levels between ``q_min`` and ``q_max``
    (inclusive) are tried. For each level, samples whose score is strictly
    above that quantile of ``anomaly_score`` are predicted anomalous and the
    F1 against ``y_true`` is computed. The first level reaching the maximum
    F1 is returned.
    """
    candidate_levels = np.linspace(q_min, q_max, 100)
    f1_per_level = [
        f1_score(y_true, anomaly_score > np.quantile(anomaly_score, level))
        for level in candidate_levels
    ]
    # argmax returns the first maximum, so ties resolve to the lowest level.
    return candidate_levels[np.argmax(f1_per_level)]

def tune_model_pyod(model_class, data, params_space, max_evals=10, sample=None):
    """Search ``params_space`` with hyperopt's TPE for the best model settings.

    Each trial builds ``model_class(**params)``, fits it on the training
    features and is scored by average precision on the "test" split, where a
    row counts as anomalous if any of its target columns is truthy. The
    negated score is minimised.

    Parameters
    ----------
    model_class : callable
        Factory returning an object with ``fit(X)`` and ``decision_function(X)``.
    data : dict
        Nested dict providing ``data["X"]["train"]``, ``data["X"]["test"]``
        and ``data["Y"]["test"]``.
    params_space : dict
        Hyperopt search space passed to ``hyperopt.fmin``.
    max_evals : int, default 10
        Number of trials.
    sample : float or None, default None
        If given, only every ``int(1 / sample)``-th training row is used.

    Returns
    -------
    The best point of the search, resolved through ``hyperopt.space_eval``.
    """

    def objective(params: dict):
        train_features = data["X"]["train"]
        if sample is not None:
            stride = int(1/sample)
            train_features = train_features[::stride]

        candidate = model_class(**params)
        candidate.fit(train_features)

        is_anomaly = data["Y"]["test"].any(axis=1)
        scores = candidate.decision_function(data["X"]["test"])
        # hyperopt minimises, so return the negated average precision.
        return -average_precision_score(is_anomaly, scores)

    trials = hyperopt.Trials()
    best_point = hyperopt.fmin(
        fn=objective,
        space=params_space,
        algo=hyperopt.tpe.suggest,
        trials=trials,
        max_evals=max_evals,
    )
    return hyperopt.space_eval(params_space, best_point)


def validate_model_pyod(model_class, data, params=None, sample=None):
    """Fit a model and report its metrics on the "val" split.

    The model is fitted on the training features. The anomaly threshold is
    chosen on the "test" split as the score quantile that maximises F1, and
    that fixed threshold is then applied to the "val" scores. A row counts as
    anomalous if any of its target columns is truthy.

    Parameters
    ----------
    model_class : callable
        Factory returning an object with ``fit(X)`` and ``decision_function(X)``.
    data : dict
        Nested dict with ``"X"`` and ``"Y"`` entries, each providing
        ``"train"``, ``"test"`` and ``"val"`` arrays.
    params : dict or None, default None
        Keyword arguments for ``model_class``; ``None`` means no arguments.
    sample : float or None, default None
        If given, only every ``int(1 / sample)``-th training row is used.

    Returns
    -------
    dict
        Metrics on the "val" split under the keys ``"F1"``, ``"AUCPR"``,
        ``"AUCROC"`` and ``"G-mean"``.
    """
    # A None sentinel avoids sharing one mutable default dict between calls.
    if params is None:
        params = {}

    if sample is None:
        X_train = data["X"]["train"]
    else:
        n = int(1/sample)
        X_train = data["X"]["train"][::n]
    # model learning on TRAIN dataset
    model = model_class(**params)
    model.fit(X_train)

    # use TEST dataset to estimate best quantile threshold
    y_true_test = data["Y"]["test"].any(axis=1)
    anomaly_score_test = model.decision_function(data["X"]["test"])
    q_threshold = get_best_threshold_quantile(y_true_test, anomaly_score_test)
    anomaly_threshold = np.quantile(anomaly_score_test, q_threshold)

    # estimate metrics on VAL dataset
    anomaly_score_val = model.decision_function(data["X"]["val"])
    y_true_val = data["Y"]["val"].any(axis=1)
    predicted_labels = anomaly_score_val > anomaly_threshold
    f1 = f1_score(y_true_val, predicted_labels)
    pr_auc = average_precision_score(y_true_val, anomaly_score_val)
    roc_auc = roc_auc_score(y_true_val, anomaly_score_val)
    g_mean = gmean_score(y_true_val, predicted_labels)

    return {"F1": f1, "AUCPR": pr_auc, 'AUCROC': roc_auc, 'G-mean': g_mean}


In [None]:
# Registry: display name -> (model class, hyperopt search space).
models_with_params_space = {}

# very very slow
# models_with_params_space['AnoGAN'] = (
#     AnoGAN,
#     {
#         'epochs':  hyperopt.hp.choice('epochs', [50, 100, 150]),
#         'epochs_query':  hyperopt.hp.choice('epochs_query', [5, 10]),
#         'G_layers':  hyperopt.hp.choice('G_layers', [[5, 10], [10, 10], [20, 10, 10], [8, 5, 4]]),
#         'D_layers':  hyperopt.hp.choice('D_layers', [[10, 5], [8, 5], [20, 10], [12, 6]]),
#     }
# )

models_with_params_space["Baseline"] = (
    BaselineModel,
    {
        # Dummy parameter: BaselineModel stores it but never reads it.
        # Sampled as a float ratio; an integer cast would truncate every
        # draw from [0, 1) to 0.
        'anomaly_ratio': hyperopt.hp.uniform('anomaly_ratio', 0, 1),
    }
)

# Auto-encoder search space. The two hidden_neuron_* entries are merged into
# one hidden_neuron_list by AutoEncoderWrapper before the model is built.
models_with_params_space["AE"] = (
    AutoEncoderWrapper,
    dict(
        epoch_num=hyperopt.hp.choice('epoch_num', [5, 10, 20, 30]),
        lr=hyperopt.hp.choice('lr', [0.001, 0.003, 0.005]),
        batch_size=hyperopt.hp.choice('batch_size', [16, 32]),
        hidden_neuron_1=scope.int(hyperopt.hp.uniform('hidden_neuron_1', 32, 129)),
        hidden_neuron_2=scope.int(hyperopt.hp.uniform('hidden_neuron_2', 5, 20)),
    ),
)

# PCA search space: only the number of components, drawn uniformly from
# [2, 16] and cast to an integer.
models_with_params_space["PCA"] = (
    PCA,
    dict(
        n_components=scope.int(hyperopt.hp.uniform('n_components', 2, 16)),
    ),
)

# Isolation-forest search space.
models_with_params_space["IForest"] = (
    IForest,
    dict(
        n_estimators=scope.int(hyperopt.hp.uniform('n_estimators', 10, 500)),
        max_samples=hyperopt.hp.uniform('max_samples', 0.01, 0.3),
        max_features=hyperopt.hp.uniform('max_features', 0.3, 1.0),
        bootstrap=hyperopt.hp.choice('bootstrap', [True, False]),
        # Single-option choices: constants that are passed through the space
        # so they reach the model constructor together with the tuned values.
        n_jobs=hyperopt.hp.choice('n_jobs', [4]),
        random_state=hyperopt.hp.choice('random_state', [44]),
    ),
)

# LODA search space: both values are drawn uniformly and cast to integers.
models_with_params_space["LODA"] = (
    LODA,
    dict(
        n_bins=scope.int(hyperopt.hp.uniform('n_bins', 5, 100)),
        n_random_cuts=scope.int(hyperopt.hp.uniform('n_random_cuts', 10, 500)),
    ),
)


# One-class SVM search space; gamma is sampled log-uniformly in [1e-5, 1e1].
models_with_params_space["OCSVM"] = (
    OCSVM,
    dict(
        kernel=hyperopt.hp.choice('kernel', ['rbf', 'sigmoid', 'linear']),
        nu=hyperopt.hp.uniform('nu', 0.001, 0.3),
        gamma=hyperopt.hp.loguniform('gamma', np.log(1e-5), np.log(1e1)),
        coef0=hyperopt.hp.uniform('coef0', 0.0, 1.0),
    ),
)

# HBOS search space.
models_with_params_space["HBOS"] = (
    HBOS,
    {
        'n_bins': scope.int(hyperopt.hp.uniform('n_bins', 5, 100)),
        'alpha': hyperopt.hp.uniform('alpha', 0.0, 1),
        # The hyperopt label matches the parameter name so that trial records
        # report this value as 'tol' rather than under an unrelated label.
        'tol': hyperopt.hp.uniform('tol', 0.0, 1.0),
    }
)

# Benchmark: for every dataset, tune each registered model, evaluate it with
# its best hyper-parameters and write one metrics table per dataset.
for SELECTED_DATASET in (1, 2, 3, 4, 5, 6):
    data = read_dataset(SELECTED_DATASET)

    # Collect one metrics record per model and build the table in a single
    # step instead of enlarging an empty DataFrame one cell at a time.
    model_names = []
    metric_rows = []
    for model_name, settings in models_with_params_space.items():
        model_class, params_space = settings
        print(f"Tuning {model_name}")
        best_hp = tune_model_pyod(model_class, data, params_space, max_evals=200)
        print("BEST HP:", best_hp)
        res = validate_model_pyod(model_class, data, best_hp)
        model_names.append(model_name)
        metric_rows.append(res)

    # Rows follow the registry order; columns follow the metric dict order.
    df_results = pd.DataFrame(metric_rows, index=model_names)

    print(f"SELECTED_DATASET: {SELECTED_DATASET}")
    print(df_results.round(3))
    df_results.to_csv(f"Evaluation_{SELECTED_DATASET}.csv")
    