In [3]:
import pinecone 
import os 
import sys 
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tensorflow import keras
from tqdm import tqdm
import numpy as np
import itertools
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
import pdb
# Environment setup for the project-local import that follows.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them.
# sys.path.append("/Users/jamesmorrissey/Github/plutonium/src")
# Make modules under the `repos` directory importable.
sys.path.append("/Users/jamesmorrissey/Github/plutonium/repos")

# Switch the working directory to `src`; presumably this is what lets the
# `core.cloud.pc` import resolve (via the cwd entry on sys.path) -- TODO confirm.
os.chdir("/Users/jamesmorrissey/Github/plutonium/src")

from core.cloud.pc import init_pinecone_connection, get_pinecone_index

In [4]:
# Open the Pinecone session first, then bind the notebook-wide `index` handle
# that the upload and query cells below use.
init_pinecone_connection()
index = pinecone.Index(
    index_name="it-threats",
)

In [6]:
# Directory holding both the raw CICFlowMeter export and the cleaned CSVs.
data_dir = "/Users/jamesmorrissey/Github/plutonium/data/threat_model"

# Raw 23-02-2018 capture: only used here to show the label distribution.
raw_23_path = os.path.join(data_dir, "Friday-23-02-2018_TrafficForML_CICFlowMeter.csv")
data = pd.read_csv(raw_23_path)
print(data["Label"].value_counts())

# Cleaned version of the same day; this is the frame that gets embedded and
# uploaded to the index further down.
cleaned_23_path = os.path.join(data_dir, 'result23.csv')
data_23_cleaned = pd.read_csv(cleaned_23_path)
print(data_23_cleaned.head())
print(data_23_cleaned["Label"].value_counts())

Label
Benign              1048009
Brute Force -Web        362
Brute Force -XSS        151
SQL Injection            53
Name: count, dtype: int64
   Dst Port  Protocol     Timestamp  Flow Duration  Tot Fwd Pkts  \
0        22         6  1.519374e+09        1532698            11   
1       500        17  1.519374e+09      117573855             3   
2       500        17  1.519374e+09      117573848             3   
3        22         6  1.519374e+09        1745392            11   
4       500        17  1.519374e+09       89483474             6   

   Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  \
0            11             1179             1969              648   
1             0             1500                0              500   
2             0             1500                0              500   
3            11             1179             1969              648   
4             0             3000                0              500   

   Fwd Pkt Len Min  ...  F

In [8]:
# Restore the trained classifier from disk and print its architecture.
model = keras.models.load_model(
    "/Users/jamesmorrissey/Github/plutonium/models/it_threat_model.model"
)
model.summary()

# Build a truncated model that stops at the layer named 'dense' (the first
# layer in the summary), so `predict` yields that layer's activations, which
# the cells below use as embeddings.
layer_name = 'dense'
intermediate_layer_model = Model(
    inputs=model.input,
    outputs=model.get_layer(layer_name).output,
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               10240     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 18561 (72.50 KB)
Trainable params: 18561 (72.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
def chunks(iterable, batch_size=100):
    """Lazily split ``iterable`` into consecutive tuples of up to ``batch_size`` items.

    Args:
        iterable: Any iterable; it is consumed exactly once, in order.
        batch_size: Maximum number of items per yielded tuple.

    Yields:
        tuple: The next batch. Every batch is full except possibly the last;
        nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        batch = tuple(itertools.islice(source, batch_size))
        if not batch:
            # islice produced nothing: the source is exhausted.
            return
        yield batch

def upsert_batches(items_to_upload, batch_size=50):
    """Upsert ``items_to_upload`` into the notebook-level Pinecone ``index`` in batches.

    The items are streamed straight into ``chunks`` rather than being sliced
    first, so no copy of the whole collection is made and any iterable
    (not only a list) is accepted.

    Args:
        items_to_upload: Iterable of vectors to upsert; the notebook passes
            ``(id, values)`` tuples.
        batch_size: Number of vectors sent per ``index.upsert`` call.
            Defaults to 50.
    """
    for batch in chunks(items_to_upload, batch_size):
        index.upsert(vectors=batch)

def add_to_true_list(label):
    """Convert a ground-truth label into its binary class.

    Args:
        label: Label value of a sample.

    Returns:
        int: 0 when ``label`` equals ``'Benign'``, otherwise 1.
    """
    return 0 if label == 'Benign' else 1
    
def add_to_pred_list(counter):
    """Derive the binary prediction from the ID prefixes of the nearest neighbours.

    Args:
        counter: ``collections.Counter`` keyed by the ID prefix of each match
            (the part of the ID before the first underscore).

    Returns:
        int: 1 if at least one neighbour carries the ``'Bru'`` or ``'SQL'``
        prefix, otherwise 0.
    """
    attack_seen = bool(counter['Bru']) or bool(counter['SQL'])
    return int(attack_seen)
    
def batch_query_results(test_vector, index):
    """Query ``index`` once per embedding and collect the responses in order.

    Args:
        test_vector: Array-like exposing ``tolist()``; each element of the
            resulting list is sent as one query vector.
        index: Object exposing ``query(vector, top_k=...)``.

    Returns:
        list: One query response per embedding, each requested with
        ``top_k=50``.
    """
    return [index.query(embedding, top_k=50) for embedding in test_vector.tolist()]

def get_query_results(data_sample, model, batch_size=100):
    """Label every row of ``data_sample`` via nearest-neighbour lookup in the index.

    Rows are processed in slices of ``batch_size``. For each slice, every
    column except the last is passed through ``model.predict`` to obtain the
    query vectors, each vector is queried against the notebook-level ``index``,
    and the ID prefixes of the returned matches decide the prediction.

    Args:
        data_sample: DataFrame with a ``Label`` column; the last column is
            excluded from the model input.
        model: Object exposing ``predict``; its output rows are used as query
            vectors.
        batch_size: Number of rows embedded and queried per iteration.

    Returns:
        tuple[list[int], list[int]]: ``(y_true, y_pred)``, aligned with the
        rows of ``data_sample``; 0 means benign and 1 means attack.
    """
    y_true = []
    y_pred = []
    for start in tqdm(range(0, len(data_sample), batch_size)):
        test_data = data_sample.iloc[start:start + batch_size, :]
        test_vector = model.predict(K.constant(test_data.iloc[:, :-1]))
        query_results = batch_query_results(test_vector, index)

        for label, res in zip(test_data.Label.values, query_results):
            # Match IDs look like "<prefix>_<row>"; count neighbours per prefix.
            neighbour_kinds = Counter(match.id.split('_')[0] for match in res.matches)
            y_true.append(add_to_true_list(label))
            y_pred.append(add_to_pred_list(neighbour_kinds))

    # Hand the collected labels back to the caller instead of discarding them.
    return y_true, y_pred


def create_upload_items(model_res):
    """Pair each embedding in ``model_res`` with an ID built from ``data_23_cleaned``.

    The ID is ``"<first three characters of Label>_<row index>"``, which is
    the prefix that ``add_to_pred_list`` later inspects. Rows and embeddings
    are paired positionally, so ``model_res`` is expected to have been computed
    from ``data_23_cleaned`` in row order (as the upload cell does).

    Args:
        model_res: Sequence of embeddings; each must expose ``tolist()``.

    Returns:
        list[tuple[str, list]]: ``(id, values)`` tuples ready for upserting.
    """
    # Iterate the index and the Label column directly: `iterrows()` would build
    # a full Series per row just to read these two values.
    labelled_rows = zip(data_23_cleaned.index, data_23_cleaned['Label'], model_res)
    return [
        (label[:3] + '_' + str(row_id), embedding.tolist())
        for row_id, label, embedding in tqdm(labelled_rows, total=len(model_res))
    ]


def plot_confusion_matrix(y_true, y_pred):
    """Draw the benign/attack confusion matrix as an annotated heatmap.

    Args:
        y_true: Ground-truth binary labels (0 = benign, 1 = attack).
        y_pred: Predicted binary labels, aligned with ``y_true``.
    """
    # Pin both classes so the matrix is always 2x2 and matches the two tick
    # labels below, even when one class is absent from the inputs.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    _, ax = plt.subplots()
    sns.heatmap(conf_matrix, annot=True, ax=ax, cmap='Blues', fmt='g', cbar=False)

    # Add labels, title and ticks
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(['Benign', 'Attack'])
    ax.yaxis.set_ticklabels(['Benign', 'Attack'])

def print_metrics(y_true, y_pred):
    """Print accuracy, precision and recall, each to three decimal places.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels, aligned with ``y_true``.
    """
    # accuracy_score is called with its defaults (normalised, unweighted).
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(
        f"Accuracy: {acc:.3f}",
        f"Precision: {precision:.3f}",
        f"Recall: {recall:.3f}",
        sep="\n",
    )

def per_class_accuracy(y_true, y_pred):
    """Display a two-row table with the per-class accuracy for Benign and Attack.

    The values are the diagonal of the row-normalised confusion matrix, i.e.
    the fraction of each true class that was predicted correctly, rounded to
    two decimals.

    Args:
        y_true: Ground-truth binary labels (0 = benign, 1 = attack).
        y_pred: Predicted binary labels, aligned with ``y_true``.
    """
    class_names = ['Benign', 'Attack']
    # Pin both classes so the diagonal always has two entries in this order;
    # otherwise an input with a single class would be silently mislabelled.
    cmd = confusion_matrix(y_true, y_pred, labels=[0, 1], normalize="true").diagonal()
    per_class_accuracy_df = pd.DataFrame(
        {'type': class_names, 'accuracy': cmd}
    ).round(2)
    display(per_class_accuracy_df)

In [None]:
# Embed every row of the cleaned 23-02-2018 data (all columns except the last)
# with the truncated model.
model_res = intermediate_layer_model.predict(K.constant(data_23_cleaned.iloc[:,:-1]))

# Build the (id, vector) tuples and push them to the index using the helpers
# defined above instead of repeating their bodies inline.
items_to_upload = create_upload_items(model_res)
NUMBER_OF_ITEMS = len(items_to_upload)
upsert_batches(items_to_upload)

# Free the (large) list of vectors once it has been uploaded.
items_to_upload.clear()
index.describe_index_stats()

In [24]:
# Upload whatever is currently held in `items_to_upload`, reusing the
# `chunks`/`upsert_batches` helpers from the helper cell rather than
# re-declaring `chunks` here.
# NOTE(review): the upload cell above ends with `items_to_upload.clear()`, so
# when the notebook is run top to bottom this cell has nothing to upsert and
# only reports the index statistics.
NUMBER_OF_ITEMS = len(items_to_upload)
upsert_batches(items_to_upload)

items_to_upload.clear()

index.describe_index_stats()

In [None]:
# Run the DeepLearning-IDS cleanup script on the raw 22-02-2018 capture.
# The second argument is the output name; presumably the script writes
# `result22022018.csv`, which the next cell reads from `data_dir` -- TODO confirm
# where the script resolves its input and output paths.
!python DeepLearning-IDS/data_cleanup.py "Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv" "result22022018"

In [None]:
# Load the cleaned 22-02-2018 data and inspect its first rows and label mix.
cleaned_22_path = os.path.join(data_dir, 'result22022018.csv')
data_22_cleaned = pd.read_csv(cleaned_22_path)
print(data_22_cleaned.head())
print(data_22_cleaned["Label"].value_counts())

# Keep only the last 2000 rows as the evaluation sample.
data_sample = data_22_cleaned.iloc[-2000:]
print(data_sample["Label"].value_counts())

In [None]:
# Ground-truth and predicted binary labels (0 = benign, 1 = attack), filled in
# row order while walking through `data_sample` in batches.
y_true = []
y_pred = []
BATCH_SIZE = 100

for i in tqdm(range(0, len(data_sample), BATCH_SIZE)):
    test_data = data_sample.iloc[i:i+BATCH_SIZE, :]
    # Create vector embeddings using the truncated model (last column excluded)
    test_vector = intermediate_layer_model.predict(K.constant(test_data.iloc[:, :-1]))
    # Query the index once per embedding
    query_results = batch_query_results(test_vector, index)

    for label, res in zip(test_data.Label.values, query_results):
        # Match IDs look like "<prefix>_<row>"; count neighbours per prefix.
        counter = Counter(match.id.split('_')[0] for match in res.matches)
        y_true.append(add_to_true_list(label))
        y_pred.append(add_to_pred_list(counter))

In [None]:
# Create confusion matrix; both classes are pinned so the matrix is always 2x2
# and lines up with the two tick labels below.
conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Show confusion matrix
_, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, ax=ax, cmap='Blues', fmt='g', cbar=False)

# Add labels, title and ticks
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['Benign', 'Attack'])
ax.yaxis.set_ticklabels(['Benign', 'Attack']);

In [None]:
# Calculate accuracy, precision and recall for the binary benign/attack labels
acc = accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

print(f"Accuracy: {acc:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")

# Calculate per class accuracy: the diagonal of the row-normalised confusion
# matrix. Both classes are pinned so the diagonal always has two entries in
# Benign/Attack order, even if one class is absent from the sample.
cmd = confusion_matrix(y_true, y_pred, labels=[0, 1], normalize="true").diagonal()
per_class_accuracy_df = pd.DataFrame(
    # `class_name` rather than `index`, to avoid reusing the Pinecone handle's name.
    [(class_name, round(value, 4)) for class_name, value in zip(['Benign', 'Attack'], cmd)],
    columns=['type', 'accuracy'],
)
per_class_accuracy_df = per_class_accuracy_df.round(2)
display(per_class_accuracy_df)