In [None]:
# install dependencies
! pip install pandas
! pip3 install torch --index-url https://download.pytorch.org/whl/cu128
! pip install ipdb
! pip install tqdm
! pip install pyarrow
! pip install matplotlib
! pip install scikit-learn
! pip install scipy

In [None]:
# import libraries
import os
#os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import pandas as pd
import glob
import torch
from torch.utils.data import Dataset, DataLoader, random_split, IterableDataset
import gc
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import time
from tqdm import tqdm
import pyarrow.dataset as ds
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support
import matplotlib.pyplot as plt
import random
import json
import torch.nn.utils.prune as prune

# Seed every RNG this notebook draws from (`random` shuffles the batches,
# torch initialises the weights and drives dropout) so that a
# Restart & Run All starts from the same random state each time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# ! cp drive/MyDrive/normalized-ids2018-parquet.tar.gz /content/
# ! tar -xzvf normalized-ids2018-parquet.tar.gz normalized/

In [None]:
# glob returns paths in arbitrary (filesystem-dependent) order. The dataset
# below carves its splits out of the scanned batches by position, so the file
# order is fixed here to keep the splits the same from run to run.
PARQUET_FILES = sorted(glob.glob('normalized/*.parquet'))
PARQUET_FILES_BENIGN = sorted(glob.glob('normalized-benign/*.parquet'))
print(PARQUET_FILES)
print(PARQUET_FILES_BENIGN)

In [None]:
# canon columns
# Column names in alphabetical order; 'Label' is appended after sorting so it
# is always the last entry.
CANON_COLUMN_INDEX = sorted([
    'Fwd IAT Tot', 'Fwd Pkt Len Min', 'Down/Up Ratio', 'Dst Port', 'Fwd IAT Std',
    'Fwd Header Len', 'Fwd IAT Min', 'Flow IAT Std', 'Active Std', 'Bwd IAT Max',
    'Fwd Pkt Len Mean', 'Pkt Size Avg', 'PSH Flag Cnt', 'Flow IAT Mean', 'Fwd Act Data Pkts',
    'Bwd Pkt Len Max', 'Flow IAT Max', 'ACK Flag Cnt', 'Bwd IAT Tot', 'Flow IAT Min',
    'Bwd Pkts/b Avg', 'Fwd IAT Max', 'SYN Flag Cnt', 'Bwd Header Len', 'Fwd Seg Size Avg',
    'Bwd Byts/b Avg', 'Subflow Bwd Byts', 'Pkt Len Max', 'Bwd Pkts/s', 'Fwd IAT Mean',
    'Pkt Len Var', 'Fwd Pkt Len Std', 'Protocol', 'Init Bwd Win Byts', 'Active Min',
    'Src Port', 'RST Flag Cnt', 'Subflow Fwd Byts', 'Init Fwd Win Byts', 'Bwd Pkt Len Std',
    'Fwd PSH Flags', 'Fwd Pkts/s', 'Bwd Blk Rate Avg', 'Flow Byts/s', 'CWE Flag Count',
    'Pkt Len Std', 'Active Max', 'Fwd Byts/b Avg', 'Fwd Blk Rate Avg', 'URG Flag Cnt',
    'Timestamp', 'Fwd Pkts/b Avg', 'Idle Mean', 'Idle Std', 'Fwd Pkt Len Max',
    'Pkt Len Min', 'Flow Duration', 'Fwd Seg Size Min', 'Bwd IAT Min', 'TotLen Fwd Pkts',
    'Flow Pkts/s', 'Active Mean', 'ECE Flag Cnt', 'Idle Min', 'Subflow Bwd Pkts',
    'Bwd Pkt Len Mean', 'Pkt Len Mean', 'Tot Fwd Pkts', 'Bwd IAT Std', 'Bwd Seg Size Avg',
    'Bwd URG Flags', 'Bwd Pkt Len Min', 'Tot Bwd Pkts', 'Subflow Fwd Pkts', 'Bwd IAT Mean',
    'FIN Flag Cnt', 'Bwd PSH Flags', 'TotLen Bwd Pkts', 'Fwd URG Flags', 'Idle Max',
])
CANON_COLUMN_INDEX.append('Label')
print(CANON_COLUMN_INDEX)

# Columns that are never read for training.
TRAINING_UNWANTED_COLUMNS = ['Timestamp', 'Flow ID', 'Dst IP', "Src IP"]
TRAINING_WANTED_COLUMNS = [
    name for name in CANON_COLUMN_INDEX if name not in TRAINING_UNWANTED_COLUMNS
]
print(TRAINING_WANTED_COLUMNS)

# Everything except the trailing 'Label' entry is a model input feature.
TRAINING_FEATURES = TRAINING_WANTED_COLUMNS[:-1]

In [None]:
# pyarrow parquet dataset
class ArrowParquetDataset(IterableDataset):
    """Iterable dataset that yields pre-batched tensors from parquet files.

    The record batches produced by the scanner are divided, in scan order, into
    consecutive splits whose relative sizes are given by ``splits``; the last
    split takes whatever batches remain. ``mode(i)`` selects which split
    ``__iter__`` walks over.

    Every iteration step yields ``(x, y)``:
      * ``x`` - float32 tensor with the ``TRAINING_FEATURES`` columns followed
        by an equally wide 0/1 mask marking which feature values were NaN
        (the NaNs themselves are replaced by 0.0);
      * ``y`` - float32 tensor, 0.0 where the ``Label`` column equals
        ``"Benign"`` and 1.0 otherwise.

    In splits 0 and 1, rows that are not benign are dropped before yielding.

    Args:
        path: passed unchanged to ``pyarrow.dataset.dataset``.
        batch_size: batch size requested from the scanner.
        shuffle: when true, the order of the batches inside the selected split
            is shuffled on every iteration (rows inside a batch are not).
        splits: sequence of split fractions.
    """

    def __init__(self, path, batch_size=1024, shuffle=True, splits=(0.7, 0.1, 0.2)):
        self.path = path
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Copy so that neither the (immutable) default nor a caller's list is shared.
        self.splits = list(splits)
        # Start on the first split. The mode is used as an index into
        # ``self.splits``, so it has to be an integer, not a name.
        self.mode(0)

        self.dataset = ds.dataset(self.path, format="parquet")
        self.scanner = self.dataset.scanner(columns=TRAINING_WANTED_COLUMNS, batch_size=self.batch_size)

    def mode(self, m):
        """Select the split (0-based index into ``splits``) used by ``__iter__``.

        Raises:
            ValueError: if ``m`` is not an integer index of an existing split.
        """
        is_index = isinstance(m, int) and not isinstance(m, bool)
        if not is_index or not 0 <= m < len(self.splits):
            raise ValueError(
                f"mode must be an integer in [0, {len(self.splits) - 1}], got {m!r}"
            )
        self.mode_value = m

    # approximate iterations
    def num_iterations(self):
        """Estimate how many batches the selected split yields (for progress bars)."""
        return int((len(self) * self.splits[self.mode_value]) / self.batch_size)

    def __len__(self):
        """Total number of rows across all splits."""
        return self.scanner.count_rows()

    def _split_bounds(self, num_batches):
        """Return the cumulative batch offsets delimiting every split.

        Split ``i`` covers ``bounds[i]:bounds[i + 1]``. Each split except the
        last gets ``floor(num_batches * fraction)`` batches; the last one
        extends to ``num_batches``.
        """
        bounds = [0]
        for fraction in self.splits[:-1]:
            bounds.append(bounds[-1] + int(np.floor(num_batches * fraction)))
        bounds.append(num_batches)
        return bounds

    def __iter__(self):
        # All batches are materialised so the split can be taken by position.
        batches = list(self.scanner.to_batches())

        bounds = self._split_bounds(len(batches))
        batches = batches[bounds[self.mode_value]:bounds[self.mode_value + 1]]

        if self.shuffle:
            random.shuffle(batches)

        for batch in batches:
            x = batch.select(TRAINING_FEATURES).to_tensor(null_to_nan=True)

            labels = batch.column("Label").to_pylist()
            y = torch.tensor(
                [0.0 if label == "Benign" else 1.0 for label in labels],
                dtype=torch.float32,
            )
            x = torch.tensor(x, dtype=torch.float32)

            # in val and train modes ignore intrusions
            if self.mode_value in (0, 1):
                benign_mask = (y == 0.0)
                x = x[benign_mask]
                y = y[benign_mask]

            # mask and impute nans
            mask = torch.isnan(x).float()
            x = torch.nan_to_num(x, nan=0.0)
            x = torch.cat([x, mask], dim=1)

            yield x, y

# Four consecutive splits, later selected by index through DS_ARROW.mode(...).
DS_ARROW = ArrowParquetDataset(
    PARQUET_FILES,
    batch_size=512,
    splits=[0.6, 0.1, 0.1, 0.2],
)
print(len(DS_ARROW))

In [None]:
# dnn model
class DNN(nn.Module):
  """Fully connected network with ReLU activations and dropout.

  Layout: ``input`` linear layer -> ReLU -> one linear layer + ReLU per
  consecutive pair in ``hidden_sizes`` -> ``output`` linear layer. Dropout
  (p=0.6) is applied in front of every hidden layer and in front of the
  output layer. The output is returned without any activation.

  Args:
    input_size: width of the input vectors.
    hidden_sizes: non-empty sequence of hidden widths.
    output_size: width of the output vectors.
  """

  def __init__(self, input_size, hidden_sizes, output_size):
    super().__init__()

    # The layers are registered in the order input, output, dropout, hiddens.
    first_width, last_width = hidden_sizes[0], hidden_sizes[-1]
    self.input = nn.Linear(input_size, first_width)
    self.output = nn.Linear(last_width, output_size)
    self.dropout = nn.Dropout(0.6)
    self.hiddens = nn.ModuleList(
      [nn.Linear(n_in, n_out) for n_in, n_out in zip(hidden_sizes, hidden_sizes[1:])]
    )

  def forward(self, x):
    """Run ``x`` through the network and return the raw output-layer values."""
    activations = F.relu(self.input(x))
    for hidden in self.hiddens:
      activations = F.relu(hidden(self.dropout(activations)))
    return self.output(self.dropout(activations))

In [None]:
# create model
# Every feature column is paired with a NaN-indicator column, so the model
# input is twice as wide as the feature list.
input_shape = 2 * len(TRAINING_FEATURES)
half_width = int(input_shape / 2)

# autoencoder: two hidden layers at half the input width, output as wide as the input
model = DNN(input_shape, [half_width, half_width], input_shape)

model_desc = str(model)
print(model_desc)

In [None]:
# train
# Trains the autoencoder to reproduce its input (MSE between output and input)
# on split 0 and reports the same loss on split 1 after every epoch.
torch.cuda.empty_cache()

model.to(device)
loss_fn = nn.MSELoss()
# Per-element variant of the loss; the threshold-search and test cells use it
# to score individual rows.
loss_fn_no_reduction = nn.MSELoss(reduction='none')
lr = 0.00001
beta1 = 0.9
beta2 = 0.999
weight_decay = 0.001
optim = torch.optim.Adam(model.parameters(), lr=lr, betas=(beta1,beta2), weight_decay=weight_decay)
epochs = 20
train_loss = -1

reconstruction_l2 = 0.0

historical_loss = []
historical_val_loss = []
historical_reconstruction = []

train_start_time = time.time()
for epoch in range(epochs):
  epoch_loss = 0.0
  epoch_val_loss = 0.0

  # training pass (split 0)
  model.train()
  epoch_samples = 0
  DS_ARROW.mode(0)
  for x, _ in tqdm(DS_ARROW, total=DS_ARROW.num_iterations()):
    x = x.to(device)
    batch_size = x.size()[0]
    # A batch can be empty once the non-benign rows have been filtered out.
    if batch_size == 0: continue
    epoch_samples += batch_size

    optim.zero_grad()
    out = model(x)
    loss = loss_fn(out, x)
    loss.backward()
    optim.step()
    # Weight the batch mean by its row count so the epoch value is a per-row mean.
    epoch_loss += loss.item() * batch_size

  if epoch_samples == 0:
    raise RuntimeError("training split (mode 0) produced no rows")
  epoch_loss = epoch_loss / epoch_samples

  # validation pass (split 1)
  model.eval()
  epoch_val_samples = 0
  DS_ARROW.mode(1)
  with torch.no_grad():
    for x, _ in tqdm(DS_ARROW, total=DS_ARROW.num_iterations()):
      x = x.to(device)
      batch_size = x.size()[0]
      if batch_size == 0: continue
      epoch_val_samples += batch_size

      out = model(x)
      loss = loss_fn(out, x)
      epoch_val_loss += loss.item() * batch_size

  if epoch_val_samples == 0:
    raise RuntimeError("validation split (mode 1) produced no rows")
  epoch_val_loss = epoch_val_loss / epoch_val_samples
  historical_val_loss.append(epoch_val_loss)
  historical_loss.append(epoch_loss)

  print(f'epoch: {epoch + 1}/{epochs}, train loss: {epoch_loss:.5f}, val loss: {epoch_val_loss:.5f}')
  train_loss = epoch_loss

# Named so that the builtin `min` is not overwritten for the rest of the session.
elapsed_minutes, elapsed_seconds = divmod(int(time.time() - train_start_time), 60)
print(f'train time: {elapsed_minutes}:{elapsed_seconds:02d}')

In [None]:
# Prune the 20% smallest-magnitude (L1) weights of every linear layer and make
# the pruning permanent, printing the number of zero weights before and after.
linear_layers = [
    (layer_name, layer)
    for layer_name, layer in model.named_modules()
    if isinstance(layer, torch.nn.Linear)
]

for layer_name, layer in linear_layers:
    zeros_before = torch.sum(layer.weight == 0).item()
    print(f"{layer_name}: original weight: {zeros_before}")
    prune.l1_unstructured(layer, name='weight', amount=0.2)
    # Fold the mask into the weight tensor and drop the re-parametrisation.
    prune.remove(layer, 'weight')

for layer_name, layer in linear_layers:
    zeros_after = torch.sum(layer.weight == 0).item()
    print(f"{layer_name}: Pruned weights: {zeros_after} zeros")

In [None]:
# threshold search
# Coarse-to-fine grid search for the anomaly-score threshold: each depth level
# tries the offsets -9..9 at the current step size around the current centre,
# then the step shrinks by a factor of ten and the centre moves to the best
# threshold found so far.
search_start = 10
search_depth = 4
search_offsets = range(-9, 10)
search_length = search_depth * len(search_offsets)
search_counter = 0

best_fscore = np.array([-1,-1])
best_fscore_sum = -1
best_threshold = -1
best_percision = -1
best_recall = -1

historical_threshold = []
historical_fscore = []

threshold_search_start_time = time.time()

# The per-row scores do not depend on the threshold, so the split is scored
# once here and every candidate threshold is evaluated on the cached scores.
model.eval()
DS_ARROW.mode(2)
score_chunks = []
label_chunks = []
with torch.no_grad():
    for x, y in tqdm(DS_ARROW, total=DS_ARROW.num_iterations()):
        x = x.to(device)

        out = model(x)
        reconstruction_losses = loss_fn_no_reduction(out, x)
        l2_dists = torch.norm(reconstruction_losses, p=2, dim=1)
        score_chunks.append(l2_dists.cpu())
        label_chunks.append(y.bool())

anomaly_scores = torch.cat(score_chunks)
y_true = torch.cat(label_chunks).tolist()

for i in range(search_depth):
    search_step = 10 ** (i * -1)
    for j in search_offsets:
        search_counter += 1
        threshold = j * search_step + search_start
        y_pred = (anomaly_scores > threshold).tolist()

        # labels=[False, True] keeps both classes in the result even when one
        # of them is absent, so fscore[1] below is always defined.
        percision, recall, fscore, support = precision_recall_fscore_support(
            y_true, y_pred, labels=[False, True], zero_division=0
        )
        curr_weighted_sum = fscore[0] + 1 * fscore[1]
        historical_fscore.append(curr_weighted_sum)
        historical_threshold.append(threshold)
        print(f"search: {search_counter}/{search_length}, threshold: {threshold}, percision: {percision}, recall: {recall}, fscore: {fscore}, fscore weighted sum: {curr_weighted_sum}")
        if best_fscore_sum < curr_weighted_sum:
            best_threshold = threshold
            best_fscore_sum = curr_weighted_sum
            best_fscore = fscore
            best_percision = percision
            best_recall = recall

    search_start = best_threshold

print(f"best threshold: {best_threshold}, percision: {best_percision}, recall: {best_recall}, fscore: {best_fscore}, fscore weighted sum: {best_fscore_sum}")
# Named so that the builtin `min` is not overwritten for the rest of the session.
elapsed_minutes, elapsed_seconds = divmod(int(time.time() - threshold_search_start_time), 60)
print(f'threshold search time: {elapsed_minutes}:{elapsed_seconds:02d}')


In [None]:
# test
# Applies the chosen threshold to split 3 and reports accuracy plus per-class
# precision / recall / f-score.
model.eval()
test_loss = 0.0
test_start_time = time.time()

historical_label_tensor = []
historical_pred_tensor = []

DS_ARROW.mode(3)
epoch_samples = 0
with torch.no_grad():
  for x, y in tqdm(DS_ARROW, total=DS_ARROW.num_iterations()):
    x = x.to(device)
    y = y.to(device)
    batch_size = x.size()[0]
    epoch_samples += batch_size

    out = model(x)
    reconstruction_losses = loss_fn_no_reduction(out, x)
    l2_dists = torch.norm(reconstruction_losses, p=2, dim=1)
    thresholded = (l2_dists > best_threshold).float()
    # Counts the rows whose prediction matches the label.
    test_loss += torch.isclose(thresholded, y).float().sum().item()

    historical_label_tensor.append(y.to("cpu"))
    historical_pred_tensor.append(thresholded.to("cpu"))

if epoch_samples == 0:
  raise RuntimeError("test split (mode 3) produced no rows")
# Despite its name this is the fraction of correctly classified rows.
test_loss = test_loss / epoch_samples
print(f'test loss (accuracy): {test_loss:.5f}')

historical_pred = torch.cat(historical_pred_tensor).tolist()
historical_label = torch.cat(historical_label_tensor).tolist()
# Score the predictions gathered in this cell, not variables left over from
# the threshold search. labels=[0.0, 1.0] keeps both classes in the result
# even if one of them does not occur.
test_percision, test_recall, test_fscore, test_support = precision_recall_fscore_support(
  historical_label, historical_pred, labels=[0.0, 1.0], zero_division=0
)
print(f"percision: {test_percision}, recall: {test_recall}, fscore: {test_fscore}\nsupport: {test_support}")

# Named so that the builtin `min` is not overwritten for the rest of the session.
elapsed_minutes, elapsed_seconds = divmod(int(time.time() - test_start_time), 60)
print(f'test time: {elapsed_minutes}:{elapsed_seconds:02d}')

In [None]:
# analyze
# Builds the loss, threshold-search and confusion-matrix figures for this run
# and derives the rate metrics from the test predictions.

# The registry of earlier runs determines this run's name; a missing registry
# simply means this is the first run.
registry_path = "models/models.json"
if os.path.exists(registry_path):
    with open(registry_path, "r") as file:
        saved_models = json.load(file)
else:
    saved_models = []
model_name = f'model_{len(saved_models)}'
color_list = ['red', 'orange', 'yellow', 'green', 'blue', 'purple']

historical_pred = torch.cat(historical_pred_tensor).tolist()
historical_label = torch.cat(historical_label_tensor).tolist()

# epoch loss
loss_plt, ax = plt.subplots()
ax.plot(range(1, len(historical_loss) + 1), historical_loss, marker='o', color='blue', label="training loss")
ax.plot(range(1, len(historical_val_loss) + 1), historical_val_loss, marker='o', color='orange', label="validation loss")
ax.set_title(f'{model_name}: Loss by Epoch')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
# plt.show() has no figure argument; it is called without positional
# arguments so that it works on every backend.
plt.show()

# threshold search
threshold_plt, ax = plt.subplots()
thresh_sort, fscore_sort = zip(*sorted(zip(historical_threshold, historical_fscore)))
ax.plot(thresh_sort, fscore_sort, color='blue')
ax.plot(best_threshold, best_fscore_sum, color='red', marker='o', label=f'chosen threshold={best_threshold}')
ax.set_title(f'{model_name}: threshold search')
ax.set_xlabel('threshold')
ax.set_ylabel('fscore')
ax.legend()
plt.show()

# confusion matrix
tp = 0
fp = 0
fn = 0
tn = 0
for label, pred in zip(historical_label, historical_pred):
    if label:
        if pred:
            tp += 1
        else:
            fn += 1
    else:
        if pred:
            fp += 1
        else:
            tn += 1


def _rate(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# A class that does not occur in the test split yields NaN rates instead of
# aborting the cell with a ZeroDivisionError.
tpr = _rate(tp, tp + fn)
fpr = _rate(fp, tn + fp)
tnr = _rate(tn, tn + fp)
fnr = _rate(fn, tp + fn)

# Rows are predictions, columns are actual labels.
conf_matrix = np.array([[tp, fp],[fn, tn]])
classes = ['Intrusion (positive)','Benign (negative)']
conf_plt, ax = plt.subplots()
im = ax.imshow(conf_matrix, cmap='Blues')
for i in range(2):
    for j in range(2):
        ax.text(j, i, conf_matrix[i, j], ha='center', va='center', color='black', fontsize=14)
ax.set_xticks(np.arange(2))
ax.set_yticks(np.arange(2))
ax.set_xticklabels(['Actual ' + c for c in classes])
ax.set_yticklabels(['Pred ' + c for c in classes])
ax.set_title('Confusion Matrix', fontsize=16)
ax.set_xlabel('Actual Label', fontsize=12)
ax.set_ylabel('Predicted Label', fontsize=12)
conf_plt.tight_layout()
conf_plt.colorbar(im)
plt.show()
print(f"tpr: {tpr}, fpr: {fpr}, tnr: {tnr}, fnr: {fnr}")

# chart, name
charts = [(loss_plt, f"charts/{model_name}-loss.jpg"), 
          (conf_plt, f"charts/{model_name}-conf.jpg"),
          (threshold_plt, f"charts/{model_name}-threshold.jpg"),
         ]

In [None]:
# review
# Assemble the record describing this run (written to the registry by the
# save cell) and print it for inspection. Key order is kept stable because it
# determines the layout of the printed / stored JSON.
save_path = f"models/{model_name}.pth"

hyperparameter_fields = {
    "lr": float(lr),
    "beta1": float(beta1),
    "beta2": float(beta2),
    "weight_decay": float(weight_decay),
    "epochs": int(epochs),
    "loss_fn": str(loss_fn),
}
loss_fields = {
    "train_loss": float(train_loss),
    "val_loss": float(epoch_val_loss),
    "test_loss": float(test_loss),
}
per_class_fields = {
    "percision": list(test_percision),
    "recall": list(test_recall),
    "fscore": list(test_fscore),
}
rate_fields = {
    "tpr": float(tpr),
    "fpr": float(fpr),
    "tnr": float(tnr),
    "fnr": float(fnr),
}

model_object = {
    "path": save_path,
    **hyperparameter_fields,
    **loss_fields,
    "desc": str(model_desc),
    # test_loss holds the test accuracy, so "acc" repeats that value.
    "acc": float(test_loss),
    "thresh": float(best_threshold),
    **per_class_fields,
    **rate_fields,
    "charts": [chart_path for _figure, chart_path in charts],
}
json_str = json.dumps(model_object, indent=4)

print(json_str)

In [None]:
# save
# On confirmation, stores the model weights, the figures and the run record.
save = input("save (y/n): ")
if save == "y":
    # Create the output folder first so a long run is not lost to a missing
    # directory.
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
    torch.save(model.state_dict(), save_path)
    notes = input("notes: ")
    model_object['notes'] = notes

    for chart, name in charts:
        os.makedirs(os.path.dirname(name) or ".", exist_ok=True)
        chart.savefig(name)

    # Re-read the registry right before writing; start a new one if it does
    # not exist yet.
    registry_file = "models/models.json"
    if os.path.exists(registry_file):
        with open(registry_file, "r") as file:
            saved_models = json.load(file)
    else:
        saved_models = []
    saved_models.append(model_object)
    with open(registry_file, "w") as file:
        json.dump(saved_models, file, indent=4)