# ADR (Anomaly Detection by workflow Relations)

ADR mines numerical relations from log data and uses the relations for anomaly detection.

In the following sections, we use the BGL logs as an example to demonstrate the capabilities of ADR.

## load datasets

For ease of presentation, the raw BGL logs have already been parsed into structured log events by Drain <sup>[1]</sup>, and the event-count matrices have been computed and saved in "_data.zip_". Please unzip "_data.zip_" into the ADR folder before running the demo code.

In [1]:
import numpy as np

# Location of each dataset's archive, keyed by the short dataset name that
# the rest of the notebook uses.
log_paths = {
    'hdfs': 'data/Drain_result/hdfs/x_y_xColumns.npz',
    'bgl': 'data/Drain_result/bgl/x_y_xColumns.npz',
    'hd': 'data/Drain_result/Hadoop/x_y_xColumns.npz',
    'spirit': 'data/Drain_result/spirit/x_y_xColumns.npz',
}

# Open every archive up front; later cells read the 'x', 'y' and 'xColumns'
# entries from each one.
# NOTE: allow_pickle=True lets NumPy unpickle object arrays, which can run
# arbitrary code -- only load archives you trust.
log_datasets = {}
for name in log_paths:
    log_path = log_paths[name]
    log_datasets[name] = np.load(log_path, allow_pickle=True)

## sADR (supervised; needs labelled logs for training)

In [5]:
from ADR import preprocess
from ADR import sADR

# Cumulative training-set sizes to evaluate, in increasing order. Each step
# after the first extends the previous training set by the difference.
train_numbers = [100, 150, 200, 250, 300, 350, 400, 450, 500]

# NOTE: this cell reads `log_datasets`, which is built by the "load datasets"
# cell; run that cell first on a fresh kernel.
for log_name, x_y_xColumns in log_datasets.items():
    print(f'====={log_name}=====')
    x, y, xColumns = x_y_xColumns['x'], x_y_xColumns['y'], x_y_xColumns['xColumns']

    for i, train_number in enumerate(train_numbers):
        print(f'-----train number:{train_number}-----')
        if i == 0:
            # First step: take the initial training set directly.
            x_train, y_train, x_test, y_test = preprocess.split_to_train_test_by_num(x, y, num_train=train_number)
        else:
            # Later steps: request only the increment and append it to the
            # training set accumulated so far.
            num_added = train_number - train_numbers[i - 1]
            # NOTE(review): x_test/y_test are replaced by the split made for
            # this increment. Whether the added rows (or the new test split)
            # can overlap rows already in x_train depends on how
            # split_to_train_test_by_num selects rows -- confirm.
            x_train_adding, y_train_adding, x_test, y_test = preprocess.split_to_train_test_by_num(x, y, num_train=num_added)
            x_train = np.concatenate((x_train, x_train_adding), axis=0)
            y_train = np.concatenate((y_train, y_train_adding), axis=0)

        # A fresh model per training size, fitted on the labelled training rows.
        model = sADR.sADR()
        model.fit(x_train, y_train)

        precision, recall, f1 = model.evaluate(x_train, y_train)
        print('Accuracy on training set:')
        print(f"precision, recall, f1: {[precision, recall, f1]}")

        precision, recall, f1 = model.evaluate(x_test, y_test)
        print('Accuracy on testing set:')
        print(f"precision, recall, f1: {[precision, recall, f1]}")

NameError: name 'log_datasets' is not defined

## uADR (unsupervised; does not need labelled logs for training)

In [2]:
from ADR import preprocess

# Fraction of each dataset's rows that goes into the training split.
u_train_ratios = {
    'hdfs': 0.5,
    'bgl': 0.5,
    'hd': 0.5,
    'spirit': 0.5,
}

# Filled below: dataset name -> [x_train, y_train, x_test, y_test].
u_log_datasets_train_test = {}

for name, x_y_xColumns in log_datasets.items():
    # Only the four known datasets are split; anything else is ignored.
    if name not in ('hdfs', 'bgl', 'hd', 'spirit'):
        continue

    print("========")
    print(name)

    x = x_y_xColumns['x']
    y = x_y_xColumns['y']
    xColumns = x_y_xColumns['xColumns']
    print(f'x shape: {x.shape}')

    x_train, y_train, x_test, y_test = preprocess.split_to_train_test_by_ratio(
        x, y, train_ratio=u_train_ratios[name]
    )
    u_log_datasets_train_test[name] = [x_train, y_train, x_test, y_test]
    print(f'x_train shape:{x_train.shape}')
    print(f'x_test shape:{x_test.shape}')

hdfs
x shape: (575061, 48)
x_train shape:(287530, 48)
x_test shape:(287531, 48)
bgl
x shape: (69252, 384)
x_train shape:(34626, 384)
x_test shape:(34626, 384)
hd
x shape: (55, 347)
x_train shape:(27, 347)
x_test shape:(28, 347)
spirit
x shape: (517, 988)
x_train shape:(258, 988)
x_test shape:(259, 988)


In [3]:
from ADR import uADR

# Presumably the estimated proportion of normal rows; the model is given the
# complement as AN_ratio -- confirm the exact meaning against uADR.
estimated_pN = 0.5

for log_name, splits in u_log_datasets_train_test.items():
    print(f'====={log_name}=====')
    x_train, y_train, x_test, y_test = splits

    # Fitting uses the features only; the labels are used for scoring alone.
    model = uADR.uADR(AN_ratio=1-estimated_pN, nrows_per_sample=10, nrounds=100)
    model.fit(x_train)

    # Score the training split first, then the testing split.
    for heading, features, labels in (('Accuracy on training set:', x_train, y_train),
                                      ('Accuracy on testing set:', x_test, y_test)):
        precision, recall, f1 = model.evaluate(features, labels)
        print(heading)
        print(f"precision, recall, f1: {[precision, recall, f1]}")

=====hdfs=====
Accuracy on training set:
precision, recall, f1: [0.1252, 1.0, 0.2226]
Accuracy on testing set:
precision, recall, f1: [0.1292, 1.0, 0.2289]
=====bgl=====
Accuracy on training set:
precision, recall, f1: [0.6615, 1.0, 0.7962]
Accuracy on testing set:
precision, recall, f1: [0.6598, 1.0, 0.795]
=====hd=====
Accuracy on training set:
precision, recall, f1: [0.85, 0.7391, 0.7907]
Accuracy on testing set:
precision, recall, f1: [0.75, 1.0, 0.8571]
=====spirit=====
Accuracy on training set:
precision, recall, f1: [0.2248, 1.0, 0.3671]
Accuracy on testing set:
precision, recall, f1: [0.2201, 1.0, 0.3608]


In [9]:
# Re-score the test split outside any loop.
# NOTE: `model`, `x_test` and `y_test` are whatever the last executed training
# loop left behind in the kernel (hidden state); the recorded output equals the
# 'spirit' testing scores printed by the preceding cell.
precision, recall, f1 = model.evaluate(x_test, y_test)
print('Accuracy on testing set:')
print(f"precision, recall, f1: {[precision, recall, f1]}")

Accuracy on testing set:
precision, recall, f1: [0.2201, 1.0, 0.3608]


In [12]:
from ADR import uADR

# Values of estimated_pN to try; each one is passed to uADR as AN_ratio = 1 - value.
list_estimated_pN = [0.7]

for log_name in u_log_datasets_train_test:
    # This experiment is restricted to the BGL dataset.
    if log_name != 'bgl':
        continue

    for estimated_pN in list_estimated_pN:
        print(f'====={log_name}=====')
        print(f'-----{estimated_pN}-----')
        x_train, y_train, x_test, y_test = u_log_datasets_train_test[log_name]

        # Fit on features only, then score both splits against their labels.
        model = uADR.uADR(AN_ratio=1-estimated_pN, nrows_per_sample=10, nrounds=100)
        model.fit(x_train)

        for heading, features, labels in (('Accuracy on training set:', x_train, y_train),
                                          ('Accuracy on testing set:', x_test, y_test)):
            precision, recall, f1 = model.evaluate(features, labels)
            print(heading)
            print(f"precision, recall, f1: {[precision, recall, f1]}")

=====bgl=====
-----0.7-----
Accuracy on training set:
precision, recall, f1: [0.7552, 0.6972, 0.7251]
Accuracy on testing set:
precision, recall, f1: [0.7532, 0.6949, 0.7229]


In [18]:
from ADR import uADR

# Values of estimated_pN to try; each one is passed to uADR as AN_ratio = 1 - value.
list_estimated_pN = [0.6]

for log_name in u_log_datasets_train_test:
    # This experiment is restricted to the BGL dataset.
    if log_name != 'bgl':
        continue

    for estimated_pN in list_estimated_pN:
        print(f'====={log_name}=====')
        print(f'-----{estimated_pN}-----')
        x_train, y_train, x_test, y_test = u_log_datasets_train_test[log_name]

        # `model` stays bound after the loop, so a following cell that reads
        # `model` sees the last model fitted here.
        model = uADR.uADR(AN_ratio=1-estimated_pN, nrows_per_sample=10, nrounds=100)
        model.fit(x_train)

        for heading, features, labels in (('Accuracy on training set:', x_train, y_train),
                                          ('Accuracy on testing set:', x_test, y_test)):
            precision, recall, f1 = model.evaluate(features, labels)
            print(heading)
            print(f"precision, recall, f1: {[precision, recall, f1]}")

=====bgl=====
-----0.6-----
Accuracy on training set:
precision, recall, f1: [0.6837, 0.991, 0.8092]
Accuracy on testing set:
precision, recall, f1: [0.6815, 0.9908, 0.8076]


In [19]:
import pickle

# Persist whatever `model` currently refers to, so it can be reloaded later
# without refitting.
model_path = 'data/Drain_result/bgl/bgl_uADR_model_0p6.pkl'
with open(model_path, mode='wb') as class_file:
    pickle.dump(model, class_file, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
import pickle

# Reload the model from the same path the save cell of this notebook writes to.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# files you created yourself or otherwise trust.
with open('data/Drain_result/bgl/bgl_uADR_model_0p6.pkl', 'rb') as model_file:
    # Named `model_file` rather than `input`: binding `input` at the top level
    # of a cell hides the built-in input() for the rest of the kernel session.
    model = pickle.load(model_file)

In [3]:
import pandas as pd
# Read the BGL sessions ECM CSV; index_col=0 turns the first CSV column into
# the row index (presumably a per-session identifier -- confirm against the file).
df_ECM = pd.read_csv('data/Drain_result/bgl/bgl_sessions_ECM.csv', index_col=0)

In [12]:
# First 35000 rows of the matrix as a NumPy array.
# NOTE(review): the reason for splitting into two chunks is not stated -- possibly memory; confirm.
bgl_x_part1 = df_ECM.values[:35000]

In [13]:
# Every row from position 35000 to the end (the rows not in bgl_x_part1).
bgl_x_part2 = df_ECM.values[35000:]

In [14]:
# Predict on the first chunk; `model` is expected to be the unpickled uADR model -- verify kernel state.
bgl_y_predict_part1 = model.predict(bgl_x_part1)

In [15]:
# Predict on the second chunk with the same `model`.
bgl_y_predict_part2 = model.predict(bgl_x_part2)

In [16]:
# Sum of the first chunk's predictions; this is a count of truthy predictions if the array is boolean/0-1.
# NOTE(review): whether a truthy prediction means "normal" or "anomalous" is decided by uADR.predict -- confirm.
bgl_y_predict_part1.sum()

33027

In [17]:
# Sum of the second chunk's predictions; this is a count of truthy predictions if the array is boolean/0-1.
bgl_y_predict_part2.sum()

12516

In [22]:
# Join the two chunk predictions, first chunk then second, so positions line up with the rows of df_ECM.
bgl_predict_y = np.concatenate([bgl_y_predict_part1, bgl_y_predict_part2])

In [23]:
# Sanity check: the length should equal the number of rows in df_ECM.
bgl_predict_y.shape

(69252,)

In [30]:
# Index labels of the rows whose prediction is False (`~` inverts the mask).
# Assumes bgl_predict_y is a boolean array; on an integer array `~` would be a bitwise NOT -- TODO confirm dtype.
df_ECM.index[~bgl_predict_y]

Index(['R23-M0-N0-C:J09-U11', 'R23-M0-N0-C:J15-U11', 'R23-M0-N0-C:J11-U11',
       'R23-M0-N0-C:J13-U11', 'R23-M0-N0-C:J17-U11', 'R23-M0-N0-C:J03-U01',
       'R23-M0-N0-C:J05-U11', 'R23-M0-N0-C:J03-U11', 'R23-M0-N0-C:J07-U11',
       'R23-M0-N0-C:J15-U01',
       ...
       'R54-M0-N8-C:J10-U11', 'R54-M0-N8-C:J06-U11', 'R54-M0-N8-C:J14-U01',
       'R54-M0-N8-C:J10-U01', 'R54-M0-N8-C:J08-U01', 'R54-M0-N8-C:J04-U01',
       'R54-M0-N8-C:J06-U01', 'R54-M0-N8-C:J04-U11', 'R54-M0-N8-C:J02-U01',
       'R54-M0-N8-C:J02-U11'],
      dtype='object', length=23709)

In [32]:
# Write the combined predictions to CSV. No `fmt` is passed, so NumPy's default
# numeric format ('%.18e') is used for every value.
np.savetxt('data/Drain_result/bgl/ADR0p6_predict_y.csv', bgl_predict_y, delimiter=',')

## References

[1] P. He, J. Zhu, Z. Zheng, and M. R. Lyu, “Drain: An Online Log Parsing Approach with Fixed Depth Tree,” in 2017 IEEE International Conference on Web Services (ICWS), Jun. 2017, pp. 33–40, doi: 10.1109/ICWS.2017.13.