In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score, confusion_matrix

# define g-mean score
def gmean_score(y_true, y_pred):
    """Geometric mean of sensitivity and specificity for binary labels.

    Parameters
    ----------
    y_true, y_pred : array-like of bool or {0, 1}
        Ground-truth and predicted labels; 1 / True is the positive class.

    Returns
    -------
    float
        ``sqrt(sensitivity * specificity)``, or ``nan`` when ``y_true``
        contains no positives or no negatives (the corresponding rate is
        then undefined).
    """
    # Pinning ``labels`` forces a 2x2 matrix even when only one class occurs
    # in the inputs; without it ``ravel()`` yields a single value and the
    # four-way unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    n_pos = tp + fn
    n_neg = tn + fp
    if n_pos == 0 or n_neg == 0:
        # Return NaN explicitly instead of dividing by zero.
        return float("nan")

    sensitivity = tp / n_pos  # recall
    specificity = tn / n_neg
    return np.sqrt(sensitivity * specificity)

import hyperopt
from hyperopt.pyll import scope

#### Data

In [7]:
def read_dataset(no: int, train_size: float = 0.6, scale=True):
    """Load ``data/tcm5_dataset_{no}.csv`` and split it into train/test/val.

    Features are all columns up to and including ``"motor_power_5"``; labels
    are all columns from ``"Anomaly_Reduction"`` onward. Rows are never
    shuffled: the first ``train_size`` fraction is the train split, and the
    remainder is cut in half, the earlier half becoming "test" and the later
    half "val".

    Parameters
    ----------
    no : int
        Dataset number substituted into the file name.
    train_size : float
        Fraction of rows assigned to the train split.
    scale : bool
        If true, min-max scale the features with a scaler fitted on the train
        split only and applied to all three splits.

    Returns
    -------
    dict
        ``{"X": {...}, "Y": {...}}`` where each inner dict holds the numpy
        arrays under ``"train"``, ``"test"``, ``"val"`` and the original
        column index under ``"columns"``.
    """
    # Forward slash rather than the former backslash: "data\..." only resolves
    # on Windows, while "/" is accepted on both Windows and POSIX systems.
    df = pd.read_csv(f"data/tcm5_dataset_{no}.csv")

    X = df.loc[:, :"motor_power_5"]
    Y = df.loc[:, "Anomaly_Reduction":]

    # shuffle=False keeps the original row order in every split.
    X_train, X_test_val, Y_train, Y_test_val = train_test_split(
        X, Y, train_size=train_size, shuffle=False
    )
    # train_test_split returns (first part, second part): the first half of the
    # remainder is named "test", the second half "val".
    X_test, X_val, Y_test, Y_val = train_test_split(
        X_test_val, Y_test_val, test_size=0.5, shuffle=False
    )

    # convert to numpy arrays
    X_train, X_test, X_val = X_train.values, X_test.values, X_val.values
    Y_train, Y_test, Y_val = Y_train.values, Y_test.values, Y_val.values

    if scale:
        # Fit on train only so no statistics from later splits leak in.
        scaler = MinMaxScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        X_val = scaler.transform(X_val)

    data = {
        "X": {
            "train": X_train,
            "test": X_test,
            "val": X_val,
            "columns": X.columns
        },
        "Y": {
            "train": Y_train,
            "test": Y_test,
            "val": Y_val,
            "columns": Y.columns
        }
    }

    return data

### RiverML

In [8]:
from river.anomaly import HalfSpaceTrees, LocalOutlierFactor, OneClassSVM
from river.cluster import KMeans
from river import stream

class KMeansWrapper:
    """Adapter giving ``KMeans`` the ``learn_one`` / ``score_one`` interface.

    The anomaly score of a sample is its squared Euclidean distance to the
    centre of the cluster the wrapped model assigns it to.
    """

    def __init__(self, **params):
        """Create the wrapped ``KMeans``; ``params`` are forwarded unchanged."""
        self.model = KMeans(**params)

    def learn_one(self, x):
        """Update the wrapped model with one sample ``x`` (a feature dict)."""
        self.model.learn_one(x)

    def score_one(self, x):
        """Return the squared distance from ``x`` to its assigned cluster centre.

        Parameters
        ----------
        x : dict
            Mapping of feature name to value.

        Returns
        -------
        float
            Sum of squared per-feature differences. A feature missing from
            either ``x`` or the centre is treated as 0.
        """
        y_cluster = self.model.predict_one(x)
        cluster_center = self.model.centers[y_cluster]

        # Pair the values by feature name rather than by position: comparing
        # ``x.values()`` with ``cluster_center.values()`` positionally is only
        # correct when both mappings hold the same keys in the same order, and
        # fails with a shape error when the centre has fewer keys than ``x``.
        keys = list(x) + [k for k in cluster_center if k not in x]
        # ``.get`` instead of ``[]`` so a lookup never inserts a key into the
        # centre mapping (presumably a defaultdict in river - TODO confirm).
        x_array = np.array([x.get(k, 0.0) for k in keys])
        center_array = np.array([cluster_center.get(k, 0.0) for k in keys])

        return np.sum((x_array - center_array) ** 2)


In [25]:
def array_to_dict(x):
    """Map a sequence to a dict keyed ``"X0"``, ``"X1"``, ... by position."""
    features = {}
    for idx in range(len(x)):
        features[f"X{idx}"] = x[idx]
    return features

def get_best_threshold_quantile(y_true, anomaly_score, q_min=0.7, q_max=1.0):
    """Find the score quantile level whose threshold maximises F1.

    100 evenly spaced levels in ``[q_min, q_max]`` are tried. For each level
    the samples scoring strictly above that quantile of ``anomaly_score`` are
    flagged as anomalies and compared with ``y_true``.

    Returns the best quantile *level* (not the threshold value); on ties the
    lowest level wins.
    """
    levels = np.linspace(q_min, q_max, 100)

    def f1_at(level):
        # Flag everything strictly above the quantile as anomalous.
        cutoff = np.quantile(anomaly_score, level)
        return f1_score(y_true, anomaly_score > cutoff)

    f1_per_level = [f1_at(level) for level in levels]

    # argmax returns the first maximum, i.e. the lowest level among ties.
    return levels[np.argmax(f1_per_level)]

def tune_model_river(model_class, data, params_space, max_evals=10, sample=None):
    """Search ``params_space`` with hyperopt TPE for the best online model.

    Each trial builds ``model_class(**params)``, lets it learn the train split
    one sample at a time, then walks the test split scoring every sample
    before learning it. The trial loss is the negative average precision of
    those test scores.

    Parameters
    ----------
    model_class : callable
        Factory returning an object with ``learn_one(x)`` and ``score_one(x)``.
    data : dict
        Reads ``data["X"]["train"]``, ``data["X"]["test"]`` and
        ``data["Y"]["test"]``.
    params_space : dict
        hyperopt search space passed to ``fmin`` and ``space_eval``.
    max_evals : int
        Number of trials.
    sample : float or None
        If given, must lie in (0, 1]; every ``int(1 / sample)``-th row of each
        split is used. ``None`` uses all rows.

    Returns
    -------
    dict
        The best hyperparameters, resolved through ``hyperopt.space_eval``.

    Raises
    ------
    ValueError
        If ``sample`` is outside (0, 1].
    """
    if sample is not None and not 0 < sample <= 1:
        # Values outside this range would produce a zero or negative slice
        # step (error, or silently reversed data).
        raise ValueError(f"sample must be in (0, 1], got {sample!r}")

    # The data selection does not depend on the trial, so do it once here
    # rather than inside every objective call.
    step = 1 if sample is None else int(1 / sample)
    X_train = data["X"]["train"][::step]
    X_test = data["X"]["test"][::step]
    # A row counts as anomalous if any of its label columns is set.
    y_true_test = data["Y"]["test"].any(axis=1)[::step]

    def objective(params: dict):
        model = model_class(**params)

        for row in X_train:
            model.learn_one(array_to_dict(row))

        y_score_test = []
        for row in X_test:
            x_dict = array_to_dict(row)
            # Score first, then learn, so each score is from a model that has
            # not yet seen the sample.
            y_score_test.append(model.score_one(x_dict))
            model.learn_one(x_dict)

        pr_auc = average_precision_score(y_true_test, y_score_test)

        # hyperopt minimises, so negate the metric.
        return -pr_auc

    tpe_algo = hyperopt.tpe.suggest
    tpe_trials = hyperopt.Trials()
    tpe_best = hyperopt.fmin(fn=objective, space=params_space, algo=tpe_algo, trials=tpe_trials, max_evals=max_evals)
    # fmin returns raw label values (e.g. choice indices); space_eval maps
    # them back onto the structure of params_space.
    best_hp = hyperopt.space_eval(params_space, tpe_best)

    return best_hp


def validate_model_river(model_class, data, params=None, sample=None):
    """Train an online model and report its metrics on the val split.

    The model learns the train split, then streams through the test split and
    the val split in that order, scoring each sample before learning it. The
    test scores are used to choose an anomaly threshold (the score quantile
    with the best F1); that threshold is then applied to the val scores.

    Parameters
    ----------
    model_class : callable
        Factory returning an object with ``learn_one(x)`` and ``score_one(x)``.
    data : dict
        Reads the ``"train"``, ``"test"`` and ``"val"`` arrays under
        ``data["X"]`` and the ``"test"`` and ``"val"`` arrays under ``data["Y"]``.
    params : dict or None
        Keyword arguments for ``model_class``; ``None`` means no arguments.
    sample : float or None
        If given, must lie in (0, 1]; every ``int(1 / sample)``-th row of each
        split is used. ``None`` uses all rows.

    Returns
    -------
    dict
        ``{"F1", "AUCPR", "AUCROC", "G-mean"}`` computed on the val split.

    Raises
    ------
    ValueError
        If ``sample`` is outside (0, 1].
    """
    # None instead of a shared mutable ``{}`` default.
    params = {} if params is None else params

    if sample is not None and not 0 < sample <= 1:
        # Values outside this range would produce a zero or negative slice
        # step (error, or silently reversed data).
        raise ValueError(f"sample must be in (0, 1], got {sample!r}")

    step = 1 if sample is None else int(1 / sample)
    X_train = data["X"]["train"][::step]
    X_test = data["X"]["test"][::step]
    X_val = data["X"]["val"][::step]
    # A row counts as anomalous if any of its label columns is set.
    y_true_test = data["Y"]["test"].any(axis=1)[::step]
    y_true_val = data["Y"]["val"].any(axis=1)[::step]

    model = model_class(**params)

    def score_then_learn(X):
        """Score every row with the current model, then let it learn the row."""
        scores = []
        for row in X:
            x_dict = array_to_dict(row)
            scores.append(model.score_one(x_dict))
            model.learn_one(x_dict)
        # An array makes the threshold comparison below an explicit
        # element-wise operation instead of relying on list-vs-numpy-scalar
        # reflected comparison.
        return np.asarray(scores)

    # model learning on TRAIN dataset
    for row in X_train:
        model.learn_one(array_to_dict(row))

    y_score_test = score_then_learn(X_test)

    # use TEST dataset to estimate best quantile threshold
    q_threshold = get_best_threshold_quantile(y_true_test, y_score_test)
    anomaly_threshold = np.quantile(y_score_test, q_threshold)

    y_score_val = score_then_learn(X_val)

    # estimate metrics on VAL dataset
    predicted_labels = y_score_val > anomaly_threshold
    f1 = f1_score(y_true_val, predicted_labels)
    pr_auc = average_precision_score(y_true_val, y_score_val)
    roc_auc = roc_auc_score(y_true_val, y_score_val)
    g_mean = gmean_score(y_true_val, predicted_labels)

    return {"F1": f1, "AUCPR": pr_auc, 'AUCROC': roc_auc, 'G-mean': g_mean}



In [None]:
# Registry of online models: name -> (model class, hyperopt search space).
# Insertion order is the order in which the models are tuned and reported.
models_online_with_params_space = {
    # NOTE(review): "KMneas_o" looks like a typo for "KMeans_o"; kept as is
    # because the key ends up as a row label in the exported results.
    "KMneas_o": (
        KMeansWrapper,
        dict(
            n_clusters=scope.int(hyperopt.hp.uniform('n_clusters', 2, 30)),
            halflife=hyperopt.hp.uniform('halflife', 0, 1),
            mu=hyperopt.hp.uniform('mu', -1, 1),
            sigma=hyperopt.hp.uniform('sigma', 0, 3),
            # single-option choice: pins the seed
            seed=hyperopt.hp.choice('seed', [42]),
        ),
    ),
    'HST_o': (
        HalfSpaceTrees,
        dict(
            n_trees=scope.int(hyperopt.hp.uniform('n_trees', 10, 500)),
            window_size=scope.int(hyperopt.hp.uniform('window_size', 200, 800)),
            height=scope.int(hyperopt.hp.uniform('height', 3, 10)),
            # single-option choice: pins the seed
            seed=hyperopt.hp.choice('seed', [42]),
        ),
    ),
    # very slow
    # "LOF_o": (
    #     LocalOutlierFactor,
    #     dict(
    #         n_neighbors=scope.int(hyperopt.hp.uniform('n_neighbors', 3, 10)),
    #     ),
    # ),
    "OCSVM_o": (
        OneClassSVM,
        dict(
            nu=hyperopt.hp.uniform('nu', 0.0001, 0.3),
            intercept_lr=hyperopt.hp.uniform('intercept_lr', 0.001, 0.3),
        ),
    ),
}

# For every dataset: tune each registered online model, validate it with the
# best hyperparameters found, and export one results table per dataset.
for SELECTED_DATASET in (1, 2, 3, 4, 5, 6):
    data = read_dataset(SELECTED_DATASET)

    # Collect the metric dicts first and build the frame once; filling an
    # empty DataFrame cell by cell with ``.at`` relies on pandas' enlargement
    # behaviour for both the layout and the column dtypes.
    results_by_model = {}
    for model_name, model_params in models_online_with_params_space.items():
        model_class, params_dict = model_params
        print(f"Tuning {model_name}")
        best_hp = tune_model_river(model_class, data, params_dict, max_evals=200)
        print("BEST HP:", best_hp)
        res = validate_model_river(model_class, data, best_hp)
        results_by_model[model_name] = res

    # One row per model (in registry order), one column per metric.
    df_results_online = pd.DataFrame(
        list(results_by_model.values()),
        index=list(results_by_model.keys()),
    )

    print(f"SELECTED_DATASET: {SELECTED_DATASET}")
    print(df_results_online.round(3))
    df_results_online.to_csv(f"Evaluation_{SELECTED_DATASET}_online3.csv")