In [None]:
# install dependencies
! pip install pandas
! pip3 install torch --index-url https://download.pytorch.org/whl/cu128
! pip install ipdb
! pip install tqdm
! pip install pyarrow
! pip install matplotlib
! pip install scikit-learn
! pip install scipy

In [None]:
# import libraries
import os
#os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import pandas as pd
import glob
import torch
from torch.utils.data import Dataset, DataLoader, random_split, IterableDataset
import gc
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import time
from tqdm import tqdm
import pyarrow.dataset as ds
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import random
import json
from scipy.signal import find_peaks

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# ! cp drive/MyDrive/normalized-ids2018-parquet.tar.gz /content/
! tar -xzvf normalized-ids2018-parquet.tar.gz normalized/

In [None]:
PARQUET_FILES = glob.glob('normalized/*')

In [None]:
# canon columns
CANON_COLUMN_INDEX = ['Fwd IAT Tot', 'Fwd Pkt Len Min', 'Down/Up Ratio', 'Dst Port', 'Fwd IAT Std', 'Fwd Header Len', 'Fwd IAT Min', 'Flow IAT Std', 'Active Std', 'Bwd IAT Max', 'Fwd Pkt Len Mean', 'Pkt Size Avg', 'PSH Flag Cnt', 'Flow IAT Mean', 'Fwd Act Data Pkts', 'Bwd Pkt Len Max', 'Flow IAT Max', 'ACK Flag Cnt', 'Bwd IAT Tot', 'Flow IAT Min', 'Bwd Pkts/b Avg', 'Fwd IAT Max', 'SYN Flag Cnt', 'Bwd Header Len', 'Fwd Seg Size Avg', 'Bwd Byts/b Avg', 'Subflow Bwd Byts', 'Pkt Len Max', 'Bwd Pkts/s', 'Fwd IAT Mean', 'Pkt Len Var', 'Fwd Pkt Len Std', 'Protocol', 'Init Bwd Win Byts', 'Active Min', 'Src Port', 'RST Flag Cnt', 'Subflow Fwd Byts', 'Init Fwd Win Byts', 'Bwd Pkt Len Std', 'Fwd PSH Flags', 'Fwd Pkts/s', 'Bwd Blk Rate Avg', 'Flow Byts/s', 'CWE Flag Count', 'Pkt Len Std', 'Active Max', 'Fwd Byts/b Avg', 'Fwd Blk Rate Avg', 'URG Flag Cnt', 'Timestamp', 'Fwd Pkts/b Avg', 'Idle Mean', 'Idle Std', 'Fwd Pkt Len Max', 'Pkt Len Min', 'Flow Duration', 'Fwd Seg Size Min', 'Bwd IAT Min', 'TotLen Fwd Pkts', 'Flow Pkts/s', 'Active Mean', 'ECE Flag Cnt', 'Idle Min', 'Subflow Bwd Pkts', 'Bwd Pkt Len Mean', 'Pkt Len Mean', 'Tot Fwd Pkts', 'Bwd IAT Std', 'Bwd Seg Size Avg', 'Bwd URG Flags', 'Bwd Pkt Len Min', 'Tot Bwd Pkts', 'Subflow Fwd Pkts', 'Bwd IAT Mean', 'FIN Flag Cnt', 'Bwd PSH Flags', 'TotLen Bwd Pkts', 'Fwd URG Flags', 'Idle Max']
CANON_COLUMN_INDEX.sort()
CANON_COLUMN_INDEX.append('Label')
print(CANON_COLUMN_INDEX)
TRAINING_UNWANTED_COLUMNS = ['Timestamp', 'Flow ID', 'Dst IP', "Src IP"]
TRAINING_WANTED_COLUMNS = []
for col in CANON_COLUMN_INDEX:
  if col not in TRAINING_UNWANTED_COLUMNS:
    TRAINING_WANTED_COLUMNS.append(col)
print(TRAINING_WANTED_COLUMNS)
TRAINING_FEATURES = TRAINING_WANTED_COLUMNS[:-1]

LENGTH = 16233002 # precalculated from data wrangling, see info.json


In [None]:
# pyarrow parquet dataset
class ArrowParquetDataset(IterableDataset):
    def __init__(self, path, batch_size=1024, shuffle=True, splits=(0.7,0.1,0.2)):
        self.path = path
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.splits = splits
        self.mode("train")

        self.dataset = ds.dataset(self.path, format="parquet")
        self.scanner = self.dataset.scanner(columns=TRAINING_WANTED_COLUMNS, batch_size=self.batch_size)

    def mode(self, m):
        match m:
            case "train":
                self.mode_value = 0
            case "val":
                self.mode_value = 1
            case "test":
                self.mode_value = 2


    # approximate iterations
    def num_iterations(self):
        return int((len(self) * self.splits[self.mode_value]) / self.batch_size)

    def __len__(self):
        return self.scanner.count_rows()

    def __iter__(self):
        batches = list(self.scanner.to_batches())

        num_batches = len(batches)
        train_end = int(num_batches * self.splits[0])
        val_end = train_end + int(num_batches * self.splits[1])

        match self.mode_value:
            case 0:
                batches = batches[:train_end]
            case 1:
                batches = batches[train_end:val_end]
            case 2:
                batches = batches[val_end:]

        if self.shuffle:
            random.shuffle(batches)

        for batch in batches:
            x = batch.select(TRAINING_FEATURES).to_tensor(null_to_nan=True)
            y_string_array = batch.column("Label")

            # Optional: convert to torch.Tensor
            x = torch.tensor(x, dtype=torch.float32)
            y = torch.tensor([(0.0 if val.as_py() == "Benign" else 1.0) for val in y_string_array], dtype=torch.float32)

            # mask and impute nans
            mask = torch.isnan(x).float()
            x = torch.nan_to_num(x, nan=0.0)
            x = torch.cat([x, mask], dim=1)

            yield x, y

DS_ARROW = ArrowParquetDataset(PARQUET_FILES, batch_size=1024)
print(len(DS_ARROW))

In [None]:
# create model
class DNN(nn.Module):
  def __init__(self, input_size, hidden_sizes, output_size):
    super(DNN, self).__init__()

    #layers
    self.input = nn.Linear(input_size, hidden_sizes[0])
    self.output = nn.Linear(hidden_sizes[-1], output_size)
    self.dropout = nn.Dropout(0.2)
    self.hiddens = nn.ModuleList()
    for i in range(len(hidden_sizes) - 1):
      self.hiddens.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))

  def forward(self, x):
    x = F.relu(self.input(x))
    x = self.dropout(x)
    for layer in self.hiddens:
      x = F.relu(layer(x))
    x = self.dropout(x)
    x = self.output(x)
    return torch.sigmoid(x).view(-1)


In [None]:
# create model
input_shape = len(TRAINING_FEATURES) * 2 # because masking
model = DNN(input_shape, 
                [int(input_shape / 2), int(input_shape / 4)],
                1)
model_desc = model.__str__()
print(model_desc)

In [None]:
# train
torch.cuda.empty_cache()

model.to(device)
loss_fn = nn.BCELoss()
lr = 0.00001
beta1 = 0.9
beta2 = 0.999
weight_decay = 0.0001
optim = torch.optim.Adam(model.parameters(), lr=lr, betas=(beta1,beta2), weight_decay=weight_decay)
epochs = 10
train_loss = -1

historical_loss = []
historical_val_loss = []

train_start_time = time.time()
for epoch in range(epochs):
  epoch_loss = 0.0
  epoch_val_loss = 0.0

  model.train()
  epoch_samples = 0
  DS_ARROW.mode("train")
  for x, y in tqdm(DS_ARROW, total=DS_ARROW.num_iterations()):
    x = x.to(device)
    y = y.to(device)
    epoch_samples += y.size()[0]

    optim.zero_grad()
    out = model(x)
    loss = loss_fn(out, y)
    loss.backward()
    optim.step()
    epoch_loss += loss.item()

  epoch_loss = epoch_loss / epoch_samples

  model.eval()
  epoch_val_samples = 0
  DS_ARROW.mode("val")
  with torch.no_grad():
    for x, y in tqdm(DS_ARROW, total=DS_ARROW.num_iterations()):
      x = x.to(device)
      y = y.to(device)
      epoch_val_samples += y.size()[0]

      out = model(x)
      loss = loss_fn(out, y)

      epoch_val_loss += loss.item()

  epoch_val_loss = epoch_val_loss / epoch_val_samples
  historical_val_loss.append(epoch_val_loss)
  historical_loss.append(epoch_loss)

  print(f'epoch: {epoch + 1}/{epochs}, train loss: {epoch_loss:.5f}, val loss: {epoch_val_loss:.5f}')
  train_loss = epoch_loss
min, sec = divmod(int(time.time() - train_start_time), 60)
print(f'train time: {min}:{sec}')

In [None]:
# test
model.eval()
test_loss = 0.0
test_start_time = time.time()

historical_label_tensor = []
historical_pred_tensor = []

DS_ARROW.mode("test")
epoch_samples = 0
with torch.no_grad():
  for x, y in tqdm(DS_ARROW, total=DS_ARROW.num_iterations()):
    x = x.to(device)
    y = y.to(device)
    epoch_samples += y.size()[0]

    out = model(x)
    test_loss += loss_fn(out, y).item()

    if random.random() < 0.02:
      historical_label_tensor.append(y.to("cpu"))
      historical_pred_tensor.append(out.to("cpu"))

test_loss = test_loss / epoch_samples
print(f'test loss: {test_loss:.5f}')
min, sec = divmod(int(time.time() - test_start_time), 60)
print(f'test time: {min}:{sec}')
print(f"samples: {len(historical_label_tensor) * DS_ARROW.batch_size}/{epoch_samples} {(len(historical_label_tensor) * DS_ARROW.batch_size / epoch_samples) * 100:.2f}%")

In [None]:
# analyze

historical_pred = torch.cat(historical_pred_tensor).tolist()
historical_label = torch.cat(historical_label_tensor).tolist()

# epoch loss
plt.figure()
plt.plot(range(1, len(historical_loss) + 1), historical_loss, marker='o', color='blue', label="training loss")
plt.plot(range(1, len(historical_val_loss) + 1), historical_val_loss, marker='o', color='orange', label="validation loss")
plt.title('Loss by Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# accuracy
thresh_steps = 100
thresholds = torch.linspace(0, 1, steps=thresh_steps)
accuracies = []
for t in thresholds: 
    correct = 0
    for i in range(len(historical_label)):
        if (historical_pred[i] > t and historical_label[i] > 0.5) or \
        (historical_pred[i] < t and historical_label[i] < 0.5):
            correct += 1

    accuracies.append(correct / len(historical_label))

thresholds = np.array(thresholds)
accuracies = np.array(accuracies)

maxima_accuracies, _ = find_peaks(accuracies)

plt.figure(figsize=(8, 5))
plt.plot(thresholds[1:], accuracies[1:], label='Accuracy')
for i in maxima_accuracies:
    x = thresholds[i]
    y = accuracies[i]
    plt.plot(x, y, 'ro')  # plot the point as a red dot
    plt.text(x, y + 0.01, f"({x:.3f}, {y:.3f})")
plt.xlabel('Threshold')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Threshold')
plt.legend()
plt.tight_layout()
plt.show()

# roc and auc
fpr_roc, tpr_roc, thresholds_roc = roc_curve(historical_label, historical_pred)
roc_auc = auc(fpr_roc, tpr_roc)

for i in range(len(thresholds_roc)):
    if thresholds_roc[i] <= (best_thresh + (1/thresh_steps)) and \
    thresholds_roc[i] >= (best_thresh - (1/thresh_steps)):
        best_fpr = fpr_roc[i]
        best_tpr = tpr_roc[i]

plt.figure()
plt.plot(fpr_roc, tpr_roc, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line
plt.scatter([best_fpr], [best_tpr], color='red', zorder=5, label=f'Maxima')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.legend(loc="lower right")
plt.show()

# confusion matrix at best accuracy
tp = 0
fp = 0
fn = 0
tn = 0
for i in range(len(historical_label)):
    pred_benign = (historical_pred[i] <= best_thresh)
    truth_benign = (historical_label[i] < 0.5)
    if truth_benign:
        if pred_benign:
            tp += 1
        else:
            fn += 1
    else:
        if pred_benign:
            fp += 1
        else:
            tn += 1
tpr = tp / (tp + fn)
fnr = fn / (tp + fn)
tnr = tn / (tn + fp)
fpr = fp / (tn + fp)

conf_matrix = np.array([[tp, fp],[fn, tn]])
classes = ['Benign', 'Intrusion']
fig, ax = plt.subplots()
im = ax.imshow(conf_matrix, cmap='Blues')
for i in range(2):
    for j in range(2):
        ax.text(j, i, conf_matrix[i, j], ha='center', va='center', color='black', fontsize=14)
ax.set_xticks(np.arange(2))
ax.set_yticks(np.arange(2))
ax.set_xticklabels(['Actual ' + c for c in classes])
ax.set_yticklabels(['Pred ' + c for c in classes])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Actual Label', fontsize=12)
plt.ylabel('Predicted Label', fontsize=12)

plt.tight_layout()
plt.colorbar(im)
plt.show()

print(f"tpr: {tpr}, fnr: {fnr}, tnr: {tnr}, fpr: {fpr}")



In [57]:
# review
with open("models/models.json", "r") as file:
    saved_models = json.load(file)
save_path = f"models/model_{len(saved_models)}.pth"

model_object = {
    "path": save_path,
    "lr": float(lr),
    "beta1": float(beta1),
    "beta2": float(beta2),
    "weight_decay": float(weight_decay),
    "epochs": int(epochs),
    "train_loss": float(0.00006),
    "val_loss": float(0.00011),
    "test_loss": float(test_loss),
    "acc": float(best_acc),
    "thresh": float(best_thresh),
    "auc": float(roc_auc),
    "desc": str(model_desc),
    "confusion": {
        "tp": int(tp),
        "fp": int(fp),
        "fn": int(fn),
        "tn": int(tn),
        "tpr": float(tpr),
        "fnr": float(fnr),
        "tnr": float(tnr),
        "fpr": float(fpr),
    },
}
json_str = json.dumps(model_object, indent=4)

print(json_str)

{
    "path": "models/model_0.pth",
    "lr": 1e-05,
    "beta1": 0.9,
    "beta2": 0.999,
    "weight_decay": 0.0001,
    "epochs": 10,
    "train_loss": 6e-05,
    "val_loss": 0.00011,
    "test_loss": 0.0008948093706207522,
    "acc": 0.9029998779296875,
    "thresh": 0.9898989796638489,
    "auc": 0.7177023562327813,
    "desc": "DNN(\n  (input): Linear(in_features=158, out_features=79, bias=True)\n  (output): Linear(in_features=39, out_features=1, bias=True)\n  (dropout): Dropout(p=0.2, inplace=False)\n  (hiddens): ModuleList(\n    (0): Linear(in_features=79, out_features=39, bias=True)\n  )\n)",
    "confusion": {
        "tp": 59179,
        "fp": 6357,
        "fn": 0,
        "tn": 0,
        "tpr": 1.0,
        "fnr": 0.0,
        "tnr": 0.0,
        "fpr": 1.0
    }
}


In [2]:
# save
save = input("save (y/n): ")
if save == "y":
    torch.save(model.state_dict(), save_path)
    notes = input("notes: ")
    model_object['notes'] = notes
    with open("models/models.json", "r") as file:
        saved_models = json.load(file)
        saved_models.append(model_object)
    with open("models/models.json", "w") as file:
        json.dump(saved_models, file, indent=4)


NameError: name 'torch' is not defined