In [73]:
import pandas as pd
import numpy as np
from pathlib import Path
from qdrant_client import QdrantClient
from transformers import BertTokenizer, BertModel
import torch 
import hashlib
from qdrant_client.http.models import PointStruct
from qdrant_client.http.models import Distance, VectorParams
from tqdm.notebook import tqdm


In [3]:
bgl = pd.read_csv('../data/bgl/bgl_cleaned.csv', compression='gzip')
bgl.drop(columns=['Unnamed: 0'], inplace=True)

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased").to('cuda:0')

def get_sentence_embedding(text):
    encoded_input = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt').to('cuda:0')
    output = model(**encoded_input)
    embeddings = output.last_hidden_state
    # we must deallocate from gpu mem!
    del encoded_input 
    return embeddings.detach().cpu() # use first for [CLS]

In [4]:
def generate_64bit_hash(text):
    sha256_hash = hashlib.sha256(text.encode()).digest()
    hash_64bit = int.from_bytes(sha256_hash[:8], byteorder='big')
    return hash_64bit


In [5]:
client = QdrantClient("localhost", port=6333)

client.create_collection(collection_name="normal", vectors_config=VectorParams(size=model.config.hidden_size, distance=Distance.COSINE))
client.create_collection(collection_name="test", vectors_config=VectorParams(size=model.config.hidden_size, distance=Distance.COSINE))

In [8]:
unique_normal_logs = bgl[bgl['is_anomaly'] == False]['line'].unique()
for text in tqdm(unique_normal_logs):
    hash = generate_64bit_hash(text)
    embedding = get_sentence_embedding(text)
    cls_embedding = embedding[0, 0].numpy()
    token_embeddings = embedding[0, 1:].numpy()
    point = PointStruct(id=hash, vector=cls_embedding, payload={"text": text, "tokens": token_embeddings.tolist()})
    client.upsert(collection_name="normal", points=[point])

  0%|          | 0/703 [00:00<?, ?it/s]

100%|██████████| 703/703 [02:12<00:00,  5.32it/s]


In [9]:
normal_log = bgl[bgl['is_anomaly'] == False].iloc[0]['line']
abnormal_log = bgl[bgl['is_anomaly'] == True].iloc[0]['line']
normal_log, abnormal_log

In [56]:

def distance(query_emb, train_embedding):
    dot_products = query_emb @ train_embedding.T
    query_normalized = query_emb.square().sum(dim = -1).sqrt()
    train_normalized = train_embedding.square().sum(dim = -1).sqrt()
    normalized_pairs = query_normalized.unsqueeze(0).T * train_normalized.unsqueeze(0)
    max_similarity = (dot_products / normalized_pairs).max(dim=-1).values.sum(dim=-1)
    return 1 - max_similarity

def get_score(log: str):
    num_neighbors = max(2, round(0.01 * len(unique_normal_logs)))
    embedding = get_sentence_embedding(log)[0]
    core_set = client.search(
        collection_name="normal", query_vector=embedding[0].numpy(), 
        limit=num_neighbors, 
        with_payload=['text', 'tokens'], 
        with_vectors=False
    )[1:] # ignore self
    anomaly_score = min(distance(embedding[1:], torch.tensor(c.payload['tokens'])) for c in core_set)
    return anomaly_score

get_score(normal_log)

tensor(-473.2968)

In [58]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve, f1_score, roc_auc_score
def evaluate(test_y, final_pred):
    print('Done!')
    precision, recall, thresholds = precision_recall_curve(test_y, final_pred)
    f_score = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(f_score)

    print(f'Best Threshold = {thresholds[ix]}')
    print(classification_report(test_y, final_pred > thresholds[ix], target_names=['Normal', 'Anomaly']))

    f1 = f1_score(test_y, final_pred > thresholds[ix])
    print(f'F1 = {f1}')

    auc = roc_auc_score(test_y, final_pred)
    print(f'AUC = {auc}')

In [76]:
test_data = pd.concat([bgl[bgl['is_anomaly'] == False].sample(879_910), bgl[bgl['is_anomaly'] == True].sample(348_460)])
unique_test_logs = test_data['line'].unique()

In [77]:
scores = [get_score(log).item() for log in tqdm(unique_test_logs)]
unique_scores = pd.DataFrame({'line': unique_test_logs, 'score': scores})
test_data = test_data.merge(unique_scores, on='line', how='left')
evaluate(test_data['is_anomaly'], test_data['score'].values)

  0%|          | 0/661 [00:00<?, ?it/s]

  5%|▍         | 30/662 [13:35<4:46:23, 27.19s/it]
  3%|▎         | 17/662 [14:36<9:14:10, 51.55s/it]
  0%|          | 2/662 [11:54<65:30:52, 357.35s/it]


In [46]:
def log_exists_in_normaldb(log: str):
    hash = generate_64bit_hash(log)
    return client.retrieve(collection_name="normal", ids=[hash], with_payload=False, with_vectors=False) != []

def get_point_from_log(log: str):
    hash = generate_64bit_hash(log)
    return client.retrieve(collection_name="test", ids=[hash], with_payload=True, with_vectors=True)

In [54]:
def get_embedding(log: str):
    hash = generate_64bit_hash(log)
    point = get_point_from_log(log)
    if not point:
        print(f'Generating embedding for {log}')
        embedding = get_sentence_embedding(log)
        cls_embedding = embedding[0, 0].numpy()
        token_embeddings = embedding[0, 1:].numpy()
        point = PointStruct(id=hash, vector=cls_embedding, payload={"text": log, "tokens": token_embeddings.tolist()})
        client.upsert(collection_name="test", points=[point])
    else:
        print('Fetching embedding from cache!')
        point = point[0]
    return point

In [None]:
from kafka import KafkaProducer


producer = KafkaProducer(bootstrap_servers='localhost:9092')

In [57]:
test_log = abnormal_log
if log_exists_in_normaldb(test_log):
    print('Log exists in normal db!')
else:
    print('Log does not exist in normal db!')
    point = get_embedding(test_log)
    score = get_score(test_log)
    print(f'Score: {score}')
    threshold = -440 # hard coded for now
    is_anomaly = score < threshold
    print(f'Is anomaly: {is_anomaly}')

    if not is_anomaly:
        print('Normal log, adding to normal db')
        client.upsert(collection_name="normal", points=[point]) # NOTE: will exist in both test and normal collection

    # Send out result to kafka topic
    # Create a kafka producer for topic "predictions" and send out the result
    producer.send('predictions', value={'log': test_log, 'score': score, 'is_anomaly': is_anomaly})

Log does not exist in normal db!
Fetching embedding from cache!
Score: -449.79229736328125
