# Lesson 6 - Anomaly Detection

### Import the Required Packages

In [1]:
import warnings

# Suppress every Python warning from this point on (no category filter is
# given, so all warning classes are ignored) — presumably to keep the cell
# outputs below uncluttered.
warnings.filterwarnings("ignore")

In [2]:
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, InputExample, losses, models, util
from torch.utils.data import DataLoader
from torch import nn
from tqdm.auto import tqdm

import os
import time
import torch

In [3]:
from DLAIUtils import Utils

### Setup Pinecone

In [4]:
# Course helper object; it is reused below to derive the index name.
utils = Utils()

# Fetch the Pinecone API key through the helper instead of hardcoding it.
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [5]:
# Derive the index name via the course helper and open a Pinecone client.
INDEX_NAME = utils.create_dlai_index_name(index_name="dl-ai")
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Start from a clean slate: drop any index left over under the same name.
existing_index_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME in existing_index_names:
    pinecone.delete_index(name=INDEX_NAME)

# The dimension must match the 256 output features of the model's dense layer.
serverless_spec = ServerlessSpec(cloud="aws", region="us-west-2")
pinecone.create_index(
    name=INDEX_NAME,
    dimension=256,
    metric="cosine",
    spec=serverless_spec,
)

# Handle used by the later cells for upserts and queries.
index = pinecone.Index(name=INDEX_NAME)

### Load the Dataset

**Note:** To access the dataset outside of this course, copy the following two commands and run them (if they are commented out in your copy, uncomment them before executing):

In [7]:
# Create ./data if needed and download the lesson archive into it as training.tar.zip.
!mkdir -p ./data && wget -q --show-progress -O ./data/training.tar.zip "https://www.dropbox.com/scl/fi/rihfngx4ju5pzjzjj7u9z/lesson6.tar.zip?rlkey=rct9a9bo8euqgshrk8wiq2orh&dl=1"



In [8]:
# Extract the archive into ./data; despite the .zip suffix it is unpacked with tar's gzip mode (-z).
!tar -xzvf ./data/training.tar.zip -C ./data/

sample.log
._training.txt
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.metadata:kMDItemTextContentLanguage'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.metadata:kMDItemKeyphraseVersion'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.metadata:kMDItemKeyphraseLabels'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.metadata:kMDItemKeyphraseConfidences'
training.txt


Take a peek into the data

In [9]:
# Show the first five raw log lines that will later be embedded and indexed.
!head -5 ./data/sample.log

Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0]
Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 type 3, code 0, by access-group "acl_dmz" [0xe3aab522, 0x0]
Apr 15 2014 09:34:34 EDT: %ASA-session-5-106100: access-list acl_in permitted tcp inside/10.1.2.16(2241) -> outside/192.0.0.89(2000) hit-cnt 1 first hit [0x71a87d94, 0x0]
Apr 24 2013 16:00:28 INT-FW01 : %ASA-6-106100: access-list inside denied udp inside/172.29.2.101(1039) -> outside/192.0.2.10(53) hit-cnt 1 first hit [0xd820e56a, 0x0]
Apr 24 2013 16:00:27 INT-FW01 : %ASA-6-106100: access-list inside permitted udp inside/172.29.2.3(1065) -> outside/192.0.2.57(53) hit-cnt 144 300-second interval [0xe982c7a4, 0x0]


In [10]:
# Show the first five training records; each has the form "<log A> ^ <log B> ^ <similarity score>".
!head -5 ./data/training.txt

Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0] ^ Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0] ^ 1.0
Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 type 3, code 0, by access-group "acl_dmz" [0xe3aab522, 0x0] ^ Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0] ^ 0.9
Apr 15 2014 09:34:34 EDT: %ASA-session-5-106100: access-list acl_in permitted tcp inside/10.1.2.16(2241) -> outside/192.0.0.89(2000) hit-cnt 1 first hit [0x71a87d94, 0x0] ^ Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0] ^ 0.8
Apr 24 2013 16:00:28 INT-FW01 : %ASA-6-106100: access-list inside denied udp inside/172.29.2.1

### Check cuda and Setup the Model

We use the *bert-base-uncased* transformer as the base of a sentence-transformers model; a pooling layer and a dense layer on top of it map each sentence to a 256-dimensional dense vector space.

In [6]:
# Default to the CPU and switch to the GPU only when CUDA is available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device

'cuda'

In [7]:
# Token-level encoder. bert-base-uncased has 512 position embeddings, so the
# truncation limit must not exceed 512 (768 is its hidden size, not its maximum
# sequence length); a larger limit would let over-long inputs through untruncated.
word_embedding_model = models.Transformer(model_name_or_path="bert-base-uncased", max_seq_length=512)
# Collapse the per-token vectors into a single sentence vector.
pooling_model = models.Pooling(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension())
# Project the sentence vector down to 256 features, matching the Pinecone index dimension.
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())

# Chain the three stages into one model placed on the selected device.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model], device=device)

### Train the Model

In [8]:
# Build the training pairs. Every non-empty line of the training file has the
# form "<log line A> ^ <log line B> ^ <similarity score>".
train_examples = []

with open("./data/training.txt", mode="r") as training_file:
    for raw_record in training_file:
        record = raw_record.strip()
        if not record:
            continue  # skip blank lines
        first_log, second_log, score = record.split("^")
        pair = InputExample(texts=[first_log, second_log], label=float(score))
        train_examples.append(pair)

# Training setup: shuffled mini-batches of 16 pairs, a warmup step count, and
# a loss built by CosineSimilarityLoss against the float label.
warmup_steps = 100
train_dataloader = DataLoader(dataset=train_examples, batch_size=16, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model=model)

<p style="background-color:#fff1d7; padding:15px; "> <b>(Note: <code>load_pretrained_model = True</code>):</b> We've saved the trained model and are loading it here for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, you may set <code>load_pretrained_model</code> to <code>False</code> to train the model yourself. This can take some time to finish, depending on the value you set for <code>epochs</code>.</p>

In [9]:
# Switch read by the next cell: True loads the saved model file, False trains the model here.
# NOTE(review): the note above advertises True as the shipped setting, but this cell sets
# False — confirm which value is intended.
load_pretrained_model = False

In [10]:
import pickle

if load_pretrained_model:
    # SECURITY: pickle.load can execute arbitrary code; only load this file
    # when it comes from a trusted source.
    # The context manager closes the file even if unpickling raises.
    with open("./data/pretrained_model", mode="rb") as trained_model_file:
        db = pickle.load(trained_model_file)
    # NOTE(review): `db` is not referenced by any later cell visible here, while
    # the embedding cell calls `model.encode`. Confirm whether the loaded object
    # was meant to replace `model`; as written, this branch leaves `model` untrained.
else:
    # Fine-tune on the labelled pairs, reusing the warmup setting defined
    # alongside the dataloader rather than repeating the literal.
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=16, warmup_steps=warmup_steps)

Iteration: 100%|██████████| 2/2 [00:03<00:00,  1.91s/it]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.09it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.01it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.08it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.00it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.15it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.07it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.06it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.02it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.06it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.10it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.10it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  3.98it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.06it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.09it/s]
Iteration: 100%|██████████| 2/2 [00:00<00:00,  4.08it/s]
Epoch: 100%|██████████| 16/16 [00:11<00:00,  1.42it/s]


In [11]:
# Load the log file to analyse: one entry per line, surrounding whitespace
# removed, blank lines dropped.
with open("./data/sample.log", mode="r") as log_file:
    stripped_entries = (raw_entry.strip() for raw_entry in log_file)
    samples = [entry for entry in stripped_entries if entry]

### Create Embeddings and Upsert to Pinecone

In [12]:
# Embed every sample log line; emb[i] is the vector for samples[i] (the upsert cell pairs them by index).
emb = model.encode(sentences=samples)

In [13]:
# One record per log line: the position as a string id, the embedding as a
# plain list, and the original text kept as metadata so query results are readable.
prepped = [
    {
        "id": str(position),
        "values": emb[position].tolist(),
        "metadata": {"log": samples[position]},
    }
    for position in tqdm(range(len(samples)))
]

index.upsert(vectors=prepped)

100%|██████████| 90/90 [00:00<00:00, 96766.82it/s]


{'upserted_count': 90}

### Find the Anomaly

In [14]:
# Use the first sample as the reference log line; its embedding (emb[0]) is the query vector in the next cell.
good_log_line = samples[0]
print(good_log_line)

Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0]


In [15]:
# After the upserts it might take a few seconds for the index to be ready for
# queries, so poll until matches come back. The retry count is bounded so the
# cell fails with a clear error instead of hanging forever on an empty index.
MAX_QUERY_ATTEMPTS = 30
RETRY_DELAY_SECONDS = 2

results = []
for _attempt in range(MAX_QUERY_ATTEMPTS):
    time.sleep(RETRY_DELAY_SECONDS)  # Give the index time to absorb the upserts before (re)trying.
    queries = index.query(
        vector=emb[0].tolist(),
        top_k=100,
        include_metadata=True
    )

    results = queries["matches"]
    print(".:. ", end="")
    if results:
        break
else:
    # Every attempt returned an empty match list.
    raise TimeoutError(
        f"Index returned no matches after {MAX_QUERY_ATTEMPTS} attempts "
        f"({MAX_QUERY_ATTEMPTS * RETRY_DELAY_SECONDS} seconds)."
    )

.:. 

In [17]:
# Print the first ten matches (score, then log text). Slicing instead of
# indexing 0..9 avoids an IndexError when fewer than ten matches are returned.
for match in results[:10]:
    print(f"{round(match['score'], 4)}\t{match['metadata']['log']}")

1.0	Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0]
0.9733	Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 type 3, code 0, by access-group "acl_dmz" [0xe3aab522, 0x0]
0.9574	Apr 30 2013 09:23:41: %ASA-4-106023: Deny tcp src outside:192.0.2.126/53638 dst inside:10.0.0.132/8111 by access-group "acl_out" [0x71761f18, 0x0]
0.9538	Apr 30 2013 09:23:40: %ASA-4-106023: Deny tcp src outside:192.0.2.126/53638 dst inside:10.0.0.132/8111 by access-group "acl_out" [0x71761f18, 0x0]
0.9084	Sep 12 2014 06:53:01 GIFRCHN01 : %ASA-4-106023: Deny tcp src outside:192.0.2.95/24069 dst inside:10.32.112.125/25 by access-group "PERMIT_IN" [0x0, 0x0]"
0.877	Apr 29 2013 12:59:50: %ASA-6-305011: Built dynamic TCP translation from outside:10.123.3.42/4952 to outside:192.0.2.130/12834
0.8724	Apr 29 2013 12:59:50: %ASA-6-305011: Built dynamic TCP translation from outside:10.123.

Print the last matched element

In [18]:
# The final entry of the match list; in the output above the scores descend,
# so this is presumably the least similar log line, i.e. the anomaly candidate.
last_match = results[-1]
print(f"{round(last_match['score'], 4)}\t{last_match['metadata']['log']}")

0.2459	dec 31, 2021 09:18:59: %ASA-4-434005: seg fault detected in the matrix
