# Pipeline for anomaly detection on SKAB using a Convolutional Autoencoder

In [None]:
# libraries importing
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline
import seaborn as sns
from tqdm import tqdm

# additional modules
import sys
sys.path.append('../algorithms')

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# 'pip install tsad' if needed
# !pip install tsad
import tsad
from tsad.utils.evaluating.evaluating import evaluating

## Data

Put the `data` folder from the SKAB repository in the root of this repository, so that it is reachable from this notebook as `../data/`. Alternatively, set your own path in the cell below.

In [None]:
# root folder that is walked recursively to find the benchmark *.csv files
path_to_data = '../data/'

## Data loading

In [None]:
# benchmark files checking
import os

# Collect every *.csv file below `path_to_data` (recursively).
# os.walk yields directory entries in arbitrary, filesystem-dependent order,
# so the paths are sorted: positional accesses such as list_of_df[0] and the
# order in which datasets are processed are then the same on every machine.
all_files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(path_to_data)
    for name in names
    if name.endswith(".csv")
)

In [None]:
# show the collected CSV paths as the cell output
all_files

In [None]:
# datasets with anomalies loading:
# every discovered file whose path does not contain 'anomaly-free'
list_of_df = []
for file in all_files:
    if 'anomaly-free' in file:
        continue
    list_of_df.append(
        pd.read_csv(file, sep=';', index_col='datetime', parse_dates=True)
    )

# anomaly-free df loading:
# the first path containing 'anomaly-free' (IndexError if there is none)
anomaly_free_df = pd.read_csv(
    [file for file in all_files if 'anomaly-free' in file][0],
    sep=';',
    index_col='datetime',
    parse_dates=True,
)

### Data description

In [None]:
# dataset characteristics printing
print('SKAB v0.9')
print(f'- A number of datasets: {len(list_of_df)}')
print(f'- Shape of the random dataset: {list_of_df[10].shape}')

# count the rows labelled 1 in each label column, summed over all datasets
n_cp = sum(int((df.changepoint == 1.).sum()) for df in list_of_df)
n_outlier = sum(int((df.anomaly == 1.).sum()) for df in list_of_df)
print(f'- A number of changepoints: {n_cp}')
print(f'- A number of outliers (point anomalies): {n_outlier}\n')

# first row of the first dataset as a sample of the column layout
print('Head of the random dataset:')
display(list_of_df[0].head(1))

### Data visualization

In [None]:
# random dataset visualizing: all columns of the second dataset over time
ax = list_of_df[1].plot(figsize=(12, 3))
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.set_title('Signals')
plt.show()

## Method

### Method initialization

In [None]:
from Conv_RNN_AE import Conv_RNN_AE
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# List the physical GPUs and, when at least one exists, make only the first
# one visible to TensorFlow.
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
if not gpus:
    print('NO GPUS')
else:
    try:
        tf.config.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # report the error and continue instead of aborting the notebook
        print(e)
        print('ERROR')

In [None]:
# hyperparameters selection
# N_STEPS: number of consecutive rows in one window built by create_sequences;
#          also used by the window rule that turns flagged windows into
#          flagged rows in the inference loop
N_STEPS = 60
# Q: quantile of the training residuals from which the UCL is derived
Q = 0.999 # quantile for upper control limit (UCL) selection

In [None]:
# model defining
# Conv_RNN_AE comes from the local `Conv_RNN_AE` module (found through the
# '../algorithms' entry appended to sys.path).
# NOTE(review): the two lists presumably configure one layer each (dilation
# rate / number of filters) — confirm against the Conv_RNN_AE implementation.
model = Conv_RNN_AE(
    dilation_rates=[1, 3, 9],
    filters=[8, 4, 2]
)

In [None]:
def create_sequences(values, time_steps=N_STEPS):
    """Cut `values` into overlapping windows for use in the model.

    Window ``i`` is ``values[i : i + time_steps]``; consecutive windows are
    shifted by one position, giving ``len(values) - time_steps + 1`` windows.

    Parameters
    ----------
    values : sliceable sequence (e.g. a NumPy array)
        Data to be windowed along its first axis.
    time_steps : int
        Length of every window.

    Returns
    -------
    numpy.ndarray
        The windows stacked along a new leading axis.

    Raises
    ------
    ValueError
        Raised by ``np.stack`` when no window fits, i.e. when
        ``len(values) < time_steps``.
    """
    n_windows = len(values) - time_steps + 1
    windows = [values[start:start + time_steps] for start in range(n_windows)]
    return np.stack(windows)

### Method fitting and applying

In [None]:
%%time
# inference
def _window_residuals(fitted_model, windows):
    """Return one reconstruction error per window as a pandas Series.

    `windows` is the array built by `create_sequences` (windows stacked along
    the first axis). The absolute reconstruction error is averaged over
    axis 1 (the positions inside a window) and the result is summed over the
    remaining axis, which leaves a single non-negative number per window.
    """
    abs_error = np.abs(windows - fitted_model.predict(windows))
    return pd.Series(np.sum(np.mean(abs_error, axis=1), axis=1))


predicted_outlier, predicted_cp = [], []
for df in tqdm(list_of_df):
    # the label columns are never shown to the scaler or the model
    features = df.drop(['anomaly', 'changepoint'], axis=1)

    # only the first 400 rows of each dataset are used for fitting
    X_train = features[:400]

    # scaler init and fitting
    StSc = StandardScaler()
    StSc.fit(X_train)

    # convert into input/output
    X = create_sequences(StSc.transform(X_train), N_STEPS)

    # model fitting
    # NOTE(review): the same `model` object is fitted again for every dataset;
    # whether `fit` starts from fresh weights depends on Conv_RNN_AE — confirm.
    model.fit(X)

    # upper control limit: Q-quantile of the training residuals scaled by 4/3
    residuals = _window_residuals(model, X)
    UCL = residuals.quantile(Q) * 4/3

    # results predicting on the whole dataset (scaled with the training scaler)
    X = create_sequences(StSc.transform(features), N_STEPS)
    cnn_residuals = _window_residuals(model, X)

    # Row `data_idx` is flagged when windows data_idx - N_STEPS + 1 up to
    # data_idx - 1 all exceed the UCL (the slice end is exclusive).
    # NOTE(review): the upper bound uses len(X), the number of windows, so the
    # first N_STEPS - 1 and the last 2 * N_STEPS - 2 rows can never be flagged.
    # Kept unchanged so that results stay comparable with earlier runs.
    anomalous_data = (cnn_residuals > UCL).to_numpy()
    anomalous_data_indices = [
        data_idx
        for data_idx in range(N_STEPS - 1, len(X) - N_STEPS + 1)
        if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx])
    ]

    prediction = pd.Series(data=0, index=df.index)
    prediction.iloc[anomalous_data_indices] = 1

    # predicted outliers saving
    predicted_outlier.append(prediction)

    # predicted CPs saving: 1 wherever the predicted label switches
    prediction_cp = abs(prediction.diff())
    # diff() leaves NaN in the first row; take the first prediction instead.
    # Positional access must go through .iloc: integer keys in [] on a Series
    # with a non-integer index are a deprecated positional fallback in pandas.
    prediction_cp.iloc[0] = prediction.iloc[0]
    predicted_cp.append(prediction_cp)

In [None]:
# total number of trainable parameters: the element count of every entry of
# model.model.trainable_weights (product of its shape), summed up
np.sum([np.prod(v.shape) for v in model.model.trainable_weights])

### Labels visualization

In [None]:
# fix changepoints: derive `changepoint_true` from the `anomaly` labels.
# A row gets 1.0 when its anomaly label differs from the previous row's label;
# the first row is forced to 0 for both `anomaly` and `changepoint_true`.
for df in list_of_df:
    first_ts = df.index[0]
    df.loc[first_ts, 'anomaly'] = 0

    labels = df['anomaly']
    label_switched = labels.ne(labels.shift(1)).fillna(False)
    df['changepoint_true'] = label_switched.astype(float)

    # the first row has no predecessor, so it is never a changepoint
    df.loc[first_ts, 'changepoint_true'] = 0

In [None]:
# first rows of the second dataset, now including the `changepoint_true` column
list_of_df[1].head()

In [None]:
# plotting the labels both for outlier and changepoint detection problems
# (second dataset; original and recomputed changepoint labels side by side)
label_columns = ['anomaly', 'changepoint', 'changepoint_true']
sns.set(rc={"figure.figsize": (12, 3)})
sns.lineplot(data=list_of_df[1][label_columns])

### Results visualization

In [None]:
# true outlier indices selection: the `anomaly` column of every dataset
true_outlier = [df['anomaly'] for df in list_of_df]

# first dataset: predictions (large markers) over the true labels (small markers)
ax = predicted_outlier[0].plot(figsize=(12, 3), label='predictions', marker='o', markersize=5)
true_outlier[0].plot(ax=ax, marker='o', markersize=2)
ax.legend();

In [None]:
# true changepoint labels: the `changepoint` column shipped with every dataset
# NOTE(review): this is the original column, not the `changepoint_true` column
# recomputed in the "fix changepoints" cell — confirm that this is intended.
true_cp = [df.changepoint for df in list_of_df]

In [None]:
# store each dataset's predicted changepoint series as a new column
for i, df in enumerate(list_of_df):
    df['predicted_cp'] = predicted_cp[i]
# first dataset: the dataset's own changepoint labels against the predictions
sns.lineplot(data=list_of_df[0][['changepoint', 'predicted_cp']])

# alternative matplotlib rendering of the same comparison (disabled)
#predicted_cp[0].plot(figsize=(12,3), label='predictions', marker='o', markersize=5)
#true_cp[0].plot(marker='o', markersize=2)
#plt.legend();

## Evaluation (metrics calculation)

### Binary classification (outlier detection) metrics

In [None]:
# binary classification metrics calculation
# compares the per-dataset true and predicted outlier series with tsad's
# `evaluating` in 'binary' mode; the return value is kept in `binary`
# NOTE(review): the exact metrics returned/printed are defined by tsad — see its docs.
binary = evaluating(
    true_outlier,
    predicted_outlier,
    metric='binary'
)

### Changepoint detection metrics

In [None]:
# average detection delay metric calculation
# compares the true and predicted changepoint series with metric='average_time';
# the return value is kept in `add`
# NOTE(review): `anomaly_window_destination='righter'` and `portion=1` presumably
# place the detection window to the right of each true changepoint and size it
# relative to the data — confirm against the tsad `evaluating` documentation.
add = evaluating(
    true_cp,
    predicted_cp,
    metric='average_time',
    anomaly_window_destination='righter',
    portion=1
)

In [None]:
# nab metric calculation
# compares the true and predicted changepoint series with metric='nab';
# the return value is kept in `nab`
# NOTE(review): `window_width='30S'` presumably is a 30-second window centred on
# each true changepoint ('center') — confirm against the tsad documentation.
nab = evaluating(
    true_cp,
    predicted_cp,
    metric='nab',
    window_width='30S',
    anomaly_window_destination='center',
)

In [None]:
# NOTE(review): bare numeric literals with no effect on any variable; presumably
# values noted down from an earlier run — confirm what they refer to.
# When the cell is executed only the last one is shown as output.
8.28
5.79
10.69