In [None]:
# Install captum package
!pip install captum

In [2]:
import pandas as pd
import numpy as np
import random
from utils import preprocessing, SlidingWindow, NegativeSampling, utils, modelhdfs, explainhdfs
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import time
import math
import os
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from captum.attr import LayerIntegratedGradients
import collections

In [3]:
# Run-level configuration shared by the cells below.
DATASET_NAME = "HDFS"  # handed to preprocessing.parsing and SlidingWindow.sliding
TRAIN_SIZE = 100_000   # forwarded as train_size to SlidingWindow.sliding
RATIO = 0.1            # NOTE(review): not referenced in the visible cells - confirm it is still needed
SEED = 42              # seed applied to random / numpy / torch before stochastic steps

In [None]:
# Download the dataset and parse it with Drain.
preprocessing.parsing(DATASET_NAME)

In [None]:
# Window the parsed log data and derive everything the later cells consume:
#   - split into training-normal, test-normal and test-abnormal datasets
#   - collect the bigrams seen in the training-normal data
#   - train a Word2Vec model on the training-normal data
# Note: the number of keys includes 'pad'.
# Both RNGs are re-seeded first so this step is repeatable.
random.seed(SEED)
np.random.seed(SEED)
(
    train_normal,
    test_normal,
    test_abnormal,
    bigram,
    unique,
    weights,
    train_dict,
    w2v_dic,
) = SlidingWindow.sliding(DATASET_NAME, train_size=TRAIN_SIZE)

In [6]:
# Model and sampling hyperparameters.
VOCAB_DIM = 1 + len(train_dict)  # one extra slot for unknown keys
OUTPUT_DIM = 2    # two labels are used when building the training frame (0 and 1)
EMB_DIM = 4       # passed to modelhdfs.C_lstm; presumably the embedding width - TODO confirm
HID_DIM = 64      # passed to modelhdfs.C_lstm; presumably the LSTM hidden size - TODO confirm
N_LAYERS = 1      # passed to modelhdfs.C_lstm
DROPOUT = 0.0     # passed to modelhdfs.C_lstm
BATCH_SIZE = 32   # used for both the data iterators and the model constructor
TIMES = 20        # passed to negative_sampling_hdfs; presumably a sampling multiplier - TODO confirm

In [None]:
# Generate negative samples from the normal training sequences.
# Both RNGs are re-seeded first so the sampling is repeatable.
# (The train/validation split itself is performed later with train_test_split.)
random.seed(SEED)
np.random.seed(SEED)
neg_samples = NegativeSampling.negative_sampling_hdfs(train_normal, bigram, unique, TIMES, VOCAB_DIM)

In [8]:
# Build the labelled training frame: negative samples are tagged 1 and the
# normal training sequences are tagged 0 (read back below as 'class_label').
df_neg = utils.get_dataframe(neg_samples, 1, w2v_dic)
df_pos = utils.get_dataframe(list(train_normal['EventSequence']), 0, w2v_dic)
df_pos.columns = df_pos.columns.astype(str)

# ignore_index=True already produces a fresh 0..n-1 index. The former
# `df_train.reset_index(drop = True)` discarded its result and had no effect,
# so it has been removed.
df_train = pd.concat([df_pos, df_neg], ignore_index=True, axis=0)

y = list(df_train.loc[:, 'class_label'])
X = list(df_train['W2V_EventId'])

# Hold out 20% for validation; reuse the notebook-wide SEED rather than a
# second hard-coded copy of the same value.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)
train_iter = utils.get_iter_hdfs(X_train, y_train, w2v_dic, train_dict, BATCH_SIZE)
val_iter = utils.get_iter_hdfs(X_val, y_val, w2v_dic, train_dict, BATCH_SIZE)

In [None]:
# Seed every RNG involved so the training run is repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
interpretableSAD = modelhdfs.C_lstm(weights, VOCAB_DIM, OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, device, BATCH_SIZE).to(device)
print(f'The model has {modelhdfs.count_parameters(interpretableSAD):,} trainable parameters')
print()
optimizer = optim.Adam(interpretableSAD.parameters())
criterion = nn.CrossEntropyLoss()

# Checkpoint directory. exist_ok=True tolerates an already-existing directory
# while still surfacing genuine failures (e.g. permission errors) that the
# previous bare `except: pass` silently swallowed.
os.makedirs('Model', exist_ok=True)

# Training interpretableSAD
N_EPOCHS = 1
CLIP = 1  # forwarded to modelhdfs.train; presumably the gradient-clipping bound - TODO confirm

# Lowest validation loss seen so far (despite the "test" in the name).
best_test_loss = float('inf')

for epoch in tqdm(range(N_EPOCHS)):

    start_time = time.time()
    train_loss = modelhdfs.train(interpretableSAD, train_iter, optimizer, criterion, CLIP, epoch, device)

    val_loss = modelhdfs.evaluate(interpretableSAD, val_iter, criterion, device)

    end_time = time.time()

    epoch_mins, epoch_secs = modelhdfs.epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves.
    if val_loss < best_test_loss:
        best_test_loss = val_loss
        torch.save(interpretableSAD.state_dict(), 'Model/interpretableSAD_HDFS.pt')
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. PPL: {math.exp(val_loss):7.3f}')

In [None]:
# Run the trained model over the held-out normal and abnormal sequences and
# print a classification report with 5-digit precision.
test_ab_X = test_abnormal['W2V_EventId']
test_n_X = test_normal['W2V_EventId']
normal_inputs = test_n_X.values.tolist()
abnormal_inputs = test_ab_X.values.tolist()
y, y_pre = modelhdfs.model_precision(
    interpretableSAD,
    device,
    w2v_dic,
    train_dict,
    normal_inputs,
    abnormal_inputs,
)
f1_acc = metrics.classification_report(y, y_pre, digits=5)
print(f1_acc)

In [None]:
# Attribute predictions to the model's embedding layer with Layer Integrated Gradients.
lig = LayerIntegratedGradients(interpretableSAD, interpretableSAD.embedding)

# Flatten all training-normal sequences and count how often each key appears.
lst_train_keys = [key for seq in train_normal.W2V_EventId.values for key in seq]
dic_app = collections.Counter(lst_train_keys)

# Guarantee an entry (count 0) for the key mapped from str(len(train_dict)).
# presumably this is the "unknown" key - TODO confirm
extra_key = w2v_dic[str(len(train_dict))]
dic_app.setdefault(extra_key, 0)

start = [w2v_dic[key] for key in unique]
df_attr = explainhdfs.get_dataset(interpretableSAD, device, lig, test_abnormal, dic_app, start)

In [None]:
%%capture cap
for i in range(len(df_attr)):
    if len(df_attr['Event'].iloc[i]) >10 and len(df_attr['Event'].iloc[i]) < 30:
        display(explainhdfs.visualize_token_attrs(df_attr['Event'].iloc[i], np.array(df_attr['Attr'].iloc[i]), df_attr['Blk'].iloc[i]))

In [13]:
# Replay the output collected by the `%%capture cap` cell.
cap.show()