In [None]:
# Install captum package
!pip install captum

In [2]:
import pandas as pd
import numpy as np
import random
from utils import preprocessing, SlidingWindow, NegativeSampling, utils, model, explain
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import time
import math
import os
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from captum.attr import LayerIntegratedGradients
import collections

In [3]:
# Experiment configuration shared by the cells below.
DATASET_NAME = 'BGL'   # dataset identifier handed to the parsing and sliding-window helpers
TRAIN_SIZE = 100_000   # forwarded to SlidingWindow.sliding (presumably the size of the training portion; TODO confirm unit)
WINDOW_SIZE = 100      # sliding-window size
STEP_SIZE = 20         # sliding-window step
RATIO = 0.1            # abnormal-data ratio used when selecting abnormal test sequences
SEED = 42              # seed applied to the random, NumPy and torch RNGs

In [None]:
# Download the dataset and parse it with Drain.
preprocessing.parsing(DATASET_NAME)

In [None]:
# Prepare the data splits from the parsed log:
#   * cut the log data into sliding windows,
#   * split it into training-normal, test-normal and test-abnormal datasets,
#   * collect the bigrams of the training-normal dataset,
#   * train a Word2Vec model on the training-normal data.
# The Python and NumPy RNGs are re-seeded right before the helper is called.
random.seed(SEED)
np.random.seed(SEED)
(
    train_normal,
    test_normal,
    test_abnormal,
    bigram,
    unique,
    weights,
    train_dict,
    w2v_dic,
) = SlidingWindow.sliding(DATASET_NAME, WINDOW_SIZE, STEP_SIZE, TRAIN_SIZE)

In [6]:
# Model and training hyperparameters used by the cells below.
# +1 for unknown
VOCAB_DIM = len(train_dict)+1  # one id per entry of train_dict plus one extra id for unknown keys
OUTPUT_DIM = 2  # passed to model.C_lstm; presumably the two classes (label 0 / label 1) - TODO confirm
EMB_DIM = 8  # passed to model.C_lstm; presumably the embedding width - TODO confirm against `weights`
HID_DIM = 128  # passed to model.C_lstm; presumably the LSTM hidden size - TODO confirm
N_LAYERS = 1  # passed to model.C_lstm
DROPOUT = 0.0  # passed to model.C_lstm
BATCH_SIZE = 32  # used for both the training and the validation iterators
TIMES = 20  # forwarded to NegativeSampling.negative_sampling; presumably a multiplier for the number of negative samples - TODO confirm

In [None]:
# Generate negative samples from the normal training windows, combine them with
# the normal windows, and split the result into training and validation data.
random.seed(SEED)
np.random.seed(SEED)
neg_samples = NegativeSampling.negative_sampling(train_normal, bigram, unique, TIMES, VOCAB_DIM)
# The second argument of get_dataframe is presumably the class label
# (1 for negative samples, 0 for normal windows) - confirm against utils.get_dataframe.
df_neg = utils.get_dataframe(neg_samples, 1, w2v_dic)
df_pos = utils.get_dataframe(list(train_normal['EventId']), 0, w2v_dic)
df_pos.columns = df_pos.columns.astype(str)
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df_train = pd.concat([df_pos, df_neg], ignore_index=True, axis=0)
y = list(df_train.loc[:, 'class_label'])
X = list(df_train['W2V_EventId'])
# Use the shared SEED so the split follows the notebook-wide configuration.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)
# Inputs and labels are converted to int64 tensors; labels become a column vector.
X_train = torch.tensor(X_train, requires_grad=False).long()
X_val = torch.tensor(X_val, requires_grad=False).long()
y_train = torch.tensor(y_train).reshape(-1, 1).long()
y_val = torch.tensor(y_val).reshape(-1, 1).long()
train_iter = utils.get_iter(X_train, y_train, BATCH_SIZE)
val_iter = utils.get_iter(X_val, y_val, BATCH_SIZE)

In [None]:
# Seed every RNG involved in model initialisation and training.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
interpretableSAD = model.C_lstm(weights, VOCAB_DIM, OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, device, BATCH_SIZE).to(device)
print(f'The model has {model.count_parameters(interpretableSAD):,} trainable parameters')
print()
optimizer = optim.Adam(interpretableSAD.parameters())
criterion = nn.CrossEntropyLoss()

# Create the checkpoint directory. exist_ok=True tolerates re-runs while still
# surfacing real failures (e.g. permission errors) instead of hiding them.
os.makedirs('Model', exist_ok=True)
# Checkpoint file is named after the configured dataset.
checkpoint_path = f'Model/interpretableSAD_{DATASET_NAME}.pt'

#Training interpretableSAD
N_EPOCHS = 10
CLIP = 1  # forwarded to model.train; presumably the gradient-clipping threshold - confirm

# Lowest validation loss seen so far (despite the name, it tracks val_loss).
best_test_loss = float('inf')

for epoch in tqdm(range(N_EPOCHS)):

    start_time = time.time()
    train_loss = model.train(interpretableSAD, train_iter, optimizer, criterion, CLIP, epoch, device)

    val_loss = model.evaluate(interpretableSAD, val_iter, criterion, device)

    end_time = time.time()

    epoch_mins, epoch_secs = model.epoch_time(start_time, end_time)

    # Keep only the checkpoint with the best validation loss.
    if val_loss < best_test_loss:
        best_test_loss = val_loss
        torch.save(interpretableSAD.state_dict(), checkpoint_path)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. PPL: {math.exp(val_loss):7.3f}')

# NOTE(review): after the loop the in-memory model holds the last-epoch weights;
# the best checkpoint only exists on disk and is not reloaded here. Confirm
# whether later cells are meant to use the best or the last weights.

In [None]:
# Evaluate the trained model on the test data.
# Abnormal sequences are filtered by model.ratio_abnormal_sequence
# (per the original note: less than or equal to 10% abnormal data).
test_abnormal_ratio = model.ratio_abnormal_sequence(test_abnormal, WINDOW_SIZE, RATIO)

test_ab_X = test_abnormal_ratio['W2V_EventId']
test_ab_X_key_label = test_abnormal_ratio['Key_label']
test_ab_y = test_abnormal_ratio['Label']

test_n_X = test_normal['W2V_EventId']
test_n_X_key_label = test_normal['Key_label']
test_n_y = test_normal['Label']

# Keep the same fraction of normal sequences as the fraction of abnormal
# sequences that survived the ratio filter.
normal_sequences = test_n_X.values.tolist()
kept_fraction = len(test_abnormal_ratio) / len(test_abnormal)
n_normal_kept = int(len(normal_sequences) * kept_fraction)

y, y_pre = model.model_precision(
    interpretableSAD,
    device,
    normal_sequences[:n_normal_kept],
    test_ab_X.values.tolist(),
)
f1_acc = metrics.classification_report(y, y_pre, digits=5)
print(f1_acc)

In [None]:
# Attribute predictions to the model's embedding layer with Layer Integrated Gradients.
lig = LayerIntegratedGradients(interpretableSAD, interpretableSAD.embedding)

# Count how often each key occurs across all training-normal sequences.
lst_train_keys = [key for sequence in train_normal.W2V_EventId.values for key in sequence]
dic_app = collections.Counter(lst_train_keys)

# Make sure the key mapped from index len(train_dict) has an explicit entry
# (presumably the "unknown" key, given VOCAB_DIM = len(train_dict) + 1 - confirm).
extra_key = w2v_dic[str(len(train_dict))]
dic_app.setdefault(extra_key, 0)

# Map every entry of `unique` through w2v_dic.
start = [w2v_dic[event] for event in unique]

lst_attr, lst_y, lst_dist, lst_keys, lst_baseline = explain.get_dataset(
    interpretableSAD,
    device,
    lig,
    test_ab_X,
    test_ab_X_key_label,
    dic_app,
    start,
    RATIO,
    WINDOW_SIZE,
)

In [None]:
# Collect the per-key attribution results in one frame.
exp_df = pd.DataFrame()
exp_df['key'] = lst_keys
exp_df['attr'] = lst_attr
exp_df['y'] = lst_y
exp_df['baseline'] = lst_baseline
best_inter = explain.get_mean_inter(exp_df)

# Score the predictions twice: first with explain.mean_inter's default
# interception ("zero inter" in the original notes), then with best_inter.
for headline, inter_kwargs in (
    ("Accuracy for zero inter:", {}),
    ("Accuracy for the best inter:", {"interception": best_inter}),
):
    mean_pred = explain.mean_inter(exp_df, **inter_kwargs)
    print(headline, metrics.accuracy_score(lst_y, mean_pred))
    print(metrics.classification_report(lst_y, mean_pred, digits=5))