In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
''' This is a demo file for the Invariants Mining model.
    API usage:
        dataloader.load_HDFS(): load HDFS dataset
        feature_extractor.fit_transform(): fit and transform features
        feature_extractor.transform(): feature transform after fitting
        model.fit(): fit the model
        model.predict(): predict anomalies on given data
        model.evaluate(): evaluate model accuracy with labeled data
'''

import sys
sys.path.append('../')
from loglizer.models import InvariantsMiner
from loglizer import dataloader, preprocessing

In [2]:
# Inputs and hyper-parameters used throughout the demo

# Structured log file
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'
# Anomaly label file
label_file = '../data/HDFS/anomaly_label.csv'
# Threshold for estimating the invariant space
epsilon = 0.5

In [3]:
# Read the structured log on its own (no label file is passed), requesting
# window='session' and a sequential split with 80% of the instances for training.
train_test_tuple = dataloader.load_HDFS(
    struct_log,
    window='session',
    train_ratio=0.8,
    split_type='sequential',
)
# Only the first two entries of the result are used; the second slot of each
# (x, y) pair is discarded here because no labels were requested.
x_train, _ = train_test_tuple[0]
x_test, _ = train_test_tuple[1]

Loading ../data/HDFS/HDFS_100k.log_structured.csv
Total: 7940 instances, train: 6352 instances, test: 1588 instances


In [4]:
# Feature extraction
# Fit the extractor on the training data and transform that data in one step.
# The fitted `feature_extractor` is reused later to transform the test data.
# NOTE(review): x_train is rebound to the transformed output, so this cell is not
# idempotent -- re-running it without reloading the data would call fit_transform()
# on features that were already transformed.
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)

Train data shape: 6352-by-16



In [5]:
# Model initialization and training
# `epsilon` is the threshold declared at the top of the notebook; the model is
# fitted on the transformed training features only (no labels are passed to fit()).
model = InvariantsMiner(epsilon=epsilon)
model.fit(x_train)

Invariant space dimension: 13
Mined 11 invariants: {(0, 1): [-3, 1], (0, 2): [-3, 1], (0, 3): [-3, 1], (0, 4): [-3, 1], (6, 14): [1, -24], (6, 15): [1, -24], (8, 9): [1, -1], (8, 10): [-2, 1], (8, 11): [1, -1], (8, 12): [1, -2], (8, 13): [-101, 1]}



In [6]:
# Predict anomalies on the training set offline, and manually check for correctness
# Here y_train holds the model's predictions for the transformed training features.
# NOTE(review): nothing is displayed by this cell; inspect y_train explicitly to review it.
y_train = model.predict(x_train)

In [7]:
# Predict anomalies on the test set to simulate the online mode
# x_test may be loaded from another log file
# The extractor fitted on the training data is reused (transform(), not fit_transform()).
# NOTE(review): x_test is rebound to its transformed version, so re-running this cell
# without reloading would pass already-transformed features to transform().
x_test = feature_extractor.transform(x_test)
y_test = model.predict(x_test)

Test data shape: 1588-by-16



In [8]:
# When labels are available, the trained model's accuracy can be measured too.
# Reload the same structured log, this time together with the anomaly label file.
train_test_tuple = dataloader.load_HDFS(
    struct_log,
    label_file=label_file,
    window='session',
    train_ratio=0.8,
    split_type='sequential',
)
# Raw data and labels for each split (these rebind x_train, y_train, x_test, y_test).
x_train, y_train = train_test_tuple[0]
x_test, y_test = train_test_tuple[1]
# Transform the freshly loaded test data with the already-fitted extractor,
# then score the model's predictions against the loaded test labels.
x_test = feature_extractor.transform(x_test)
precision, recall, f1 = model.evaluate(x_test, y_test)

Loading ../data/HDFS/HDFS_100k.log_structured.csv
222 91
Total: 7940 instances, 313 anomaly, 7627 normal
Train: 6352 instances, 222 anomaly, 6130 normal
Test: 1588 instances, 91 anomaly, 1497 normal

Test data shape: 1588-by-16

Precision: 0.980, recall: 0.538, F1-measure: 0.695

