In [None]:
from google.colab import drive

# Path of the zipped Kitsune dataset inside the mounted Google Drive.
# NOTE: despite its name, DATA_DIR points at the .zip archive itself, not at a directory.
DATA_DIR = "/content/drive/MyDrive/kitsune+network+attack+dataset.zip"

# Make Google Drive visible under /content/drive so DATA_DIR can be read.
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
import zipfile
import os
import pandas as pd

# Target directory for the unzipped dataset (this cell repeats the first lines of the next cell).
extract_dir = '/content/kitsune'

In [None]:
import zipfile
import os
import pandas as pd

# Unpack the whole archive into a local working directory on the Colab VM.
extract_dir = "/content/kitsune"
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(DATA_DIR, mode="r") as archive:
    archive.extractall(path=extract_dir)

In [None]:
# Every sub-directory of the extraction folder is treated as one attack capture.
attack_dirs = sorted(
    entry.name for entry in os.scandir(extract_dir) if entry.is_dir()
)

print("Available attack directories:")
for attack_name in attack_dirs:
    print(f" - {attack_name}")

Available attack directories:
 - active_wiretap
 - arp_mitm
 - fuzzing
 - mirai
 - os_scan
 - ssdp_flood
 - ssl_renegotiation
 - syn_dos
 - video_injection


In [None]:
# Fraction of each attack capture that is kept by the stratified sampling step below.
k_sample = 0.1

# Extracted folder name → file-name prefix used inside that folder
# (the files are "<prefix>_dataset.csv.gz" and "<prefix>_labels.csv.gz").
attack_mapping = dict(
    os_scan='OS Scan',
    fuzzing='Fuzzing',
    video_injection='Video Injection',
    arp_mitm='ARP MitM',
    active_wiretap='Active Wiretap',
    ssdp_flood='SSDP Flood',
    syn_dos='SYN DoS',
    ssl_renegotiation='SSL Renegotiation',
    mirai='Mirai',
)

# Problems met while loading (missing mapping, row mismatches, exceptions).
errors = []

# Per-attack sampled features / labels, appended in the order the folders are processed.
feature_list = []
label_list = []

# ─── 4) Loop over each directory, build exact filenames, and load ────────────────

# Imported here because the notebook's own import of train_test_split lives in a later
# cell; on a fresh kernel the split below would otherwise raise NameError for every
# folder, and the broad `except` would silently turn that into nine "failed" entries.
from sklearn.model_selection import train_test_split

for folder in attack_dirs:
    if folder not in attack_mapping:
        errors.append(f"→ No mapping found for folder: '{folder}'")
        continue

    base_name   = attack_mapping[folder]          # e.g. "Active Wiretap"
    attack_path = os.path.join(extract_dir, folder)

    # Build the expected .csv.gz paths
    features_path = os.path.join(attack_path, f"{base_name}_dataset.csv.gz")
    labels_path   = os.path.join(attack_path, f"{base_name}_labels.csv.gz")

    try:
        # --- 4a) Load features (the feature files have no header row) ---
        print(f"Loading features for '{folder}' from:\n  {features_path}")
        df_feat = pd.read_csv(features_path, compression='gzip', header=None)
        print(f"  → Features shape: {df_feat.shape}")

        # --- 4b) Load labels ---
        if folder == 'mirai':
            # Mirai labels file has only one column (no index column). header=None means
            # every line, including the first, is read as data.
            print(f"Loading MIRAI labels (single‐column) from:\n  {labels_path}")
            df_lbl = pd.read_csv(
                labels_path,
                compression='gzip',
                header=None,
            )
            # Reduce the one-column frame to a Series named like the other attacks' labels,
            # so every entry of label_list has the same 1-D shape.
            df_lbl = df_lbl.iloc[:, 0].rename('label')
        else:
            print(f"Loading labels for '{folder}' from:\n  {labels_path}")
            df_lbl = pd.read_csv(
                labels_path,
                compression='gzip',
                sep=',',           # split "index,label"
                header=0,          # first line is a header row; replaced by `names`
                index_col=0,       # use the first column as index, leaving only 'label'
                names=['index', 'label']
            )
            print(f"  → Labels shape:   {df_lbl.shape}")
            df_lbl = df_lbl['label']

        # --- 4c) Verify row counts match ---
        if df_feat.shape[0] != df_lbl.shape[0]:
            # Record which folder is affected and skip it: a split on arrays of
            # different length cannot succeed and would only add a second error.
            errors.append(
                f"Row count mismatch for '{folder}': "
                f"features {df_feat.shape[0]} vs labels {df_lbl.shape[0]}"
            )
            continue
        print(f"Row counts match ({df_feat.shape[0]} packets)\n")

        # --- 4d) Keep a k_sample fraction, stratified on the label ---
        # Only the "test" part of the split is kept; the rest is discarded.
        _, df_feat_sample, _, df_lbl_sample = train_test_split(
            df_feat,
            df_lbl,
            test_size=k_sample,
            random_state=0,
            stratify=df_lbl
        )

        # Release the full-size frames before loading the next attack.
        del df_feat, df_lbl, _

        # Reset indices for consistency
        feature_list.append(df_feat_sample.reset_index(drop=True))
        label_list.append(df_lbl_sample.reset_index(drop=True))

        del df_feat_sample, df_lbl_sample

    except Exception as e:
        # Best effort: remember the failure and carry on with the remaining folders.
        errors.append(f"Failed to load '{folder}': {e}\n")

# ─── 5) Summary ─────────────────────────────────────────────────────────────────

print("────────────────────────────────────────")
print(f"Successfully loaded {len(feature_list)} attack datasets.")

# Report everything collected in `errors`, or confirm a clean run.
if not errors:
    print("No errors detected. All row counts match.\n")
else:
    print("\nSome folders could not be loaded or had mismatches:")
    print("\n".join(f"  {problem}" for problem in errors))

# The sampled data is now available by position, in the order the folders were loaded:
#   feature_list[i]   → pandas DataFrame with the sampled rows of the i-th attack
#                       (115 columns; the mirai file has 116)
#   label_list[i]     → labels for the same rows
#
# Example: display the first rows of the first loaded attack
# if feature_list:
#     print("\nExample: first 3 rows of features:")
#     display(feature_list[0].head(3))
#     print("\nExample: first 10 labels:")
#     display(label_list[0].head(10))

Loading features for 'active_wiretap' from:
  /content/kitsune/active_wiretap/Active Wiretap_dataset.csv.gz
  → Features shape: (2278689, 115)
Loading labels for 'active_wiretap' from:
  /content/kitsune/active_wiretap/Active Wiretap_labels.csv.gz
  → Labels shape:   (2278689, 1)
Row counts match (2278689 packets)

Loading features for 'arp_mitm' from:
  /content/kitsune/arp_mitm/ARP MitM_dataset.csv.gz
  → Features shape: (2504267, 115)
Loading labels for 'arp_mitm' from:
  /content/kitsune/arp_mitm/ARP MitM_labels.csv.gz
  → Labels shape:   (2504267, 1)
Row counts match (2504267 packets)

Loading features for 'fuzzing' from:
  /content/kitsune/fuzzing/Fuzzing_dataset.csv.gz
  → Features shape: (2244139, 115)
Loading labels for 'fuzzing' from:
  /content/kitsune/fuzzing/Fuzzing_labels.csv.gz
  → Labels shape:   (2244139, 1)
Row counts match (2244139 packets)

Loading features for 'mirai' from:
  /content/kitsune/mirai/Mirai_dataset.csv.gz
  → Features shape: (764137, 116)
Loading MIRA

In [None]:
# Quick sanity check: number of loaded attacks and the shape of the first sample.
print(len(feature_list), feature_list[0].shape, sep="\n")

In [None]:
# Bring every sampled feature frame to the same width and the same column labels.
#
# The load log shows 115 columns for the attacks and 116 for mirai. Dropping the extra
# column is not enough on its own: the remaining labels would be 1..115, and pd.concat
# aligns on column labels, so the combined frame would get 116 columns padded with NaN.
# Relabelling every frame 0..N-1 avoids that. Working from the frame width (instead of
# the hard-coded position 3) keeps the cell safe to re-run and independent of load order.
#
# label_list is deliberately left untouched: it already holds the labels that match
# feature_list row for row.
N_FEATURES = 115

for pos, frame in enumerate(feature_list):
    surplus = frame.shape[1] - N_FEATURES
    if surplus > 0:
        # assumes the surplus columns are leading ones, as the original iloc[:, 1:] did — TODO confirm
        frame = frame.iloc[:, surplus:]
    feature_list[pos] = frame.set_axis(range(frame.shape[1]), axis=1)

assert all(frame.shape[1] == N_FEATURES for frame in feature_list), "unexpected feature width"
assert len(feature_list) == len(label_list), "features and labels are out of sync"

In [None]:
# Stack the per-attack samples into one feature table and one label vector,
# renumbering the rows from 0.
X = pd.concat(objs=feature_list, axis=0, ignore_index=True)
y = pd.concat(objs=label_list, axis=0, ignore_index=True)

In [None]:
# free memory
import gc

# Drop only the large intermediates produced by the loading cells. Deleting every public
# global would also remove IPython's own names (In, Out, get_ipython, exit, quit) and all
# imported modules from the namespace.
for _name in (
    "feature_list", "label_list",
    "df_feat", "df_lbl",
    "df_feat_sample", "df_lbl_sample",
):
    globals().pop(_name, None)  # tolerate names that were never created or already removed

gc.collect()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the label.
# NOTE: X and y are rebound to the training part, so re-running this cell splits again.
X, X_val, y, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)

print(
    f"X_train shape: {X.shape}",
    f"X_val   shape: {X_val.shape}",
    f"y_train shape: {y.shape}",
    f"y_val   shape: {y_val.shape}",
    sep="\n",
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import torch
# from torch_rbm import RBM as TRBM
import datetime
#from softmax import Net, train
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import math

class Net(nn.Module):
    """Single fully connected layer that maps `input` features to `output` raw scores.

    No activation is applied in `forward`; the layer output is returned as is.
    """

    def __init__(self, input, output=2):
        super(Net, self).__init__()
        self.soft = nn.Linear(in_features=input, out_features=output)

    def forward(self, x):
        """Return the linear layer's output for the batch `x`."""
        return self.soft(x)


def training_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over `dataloader` and report accuracy and mean loss.

    Args:
        model: module called as `model(x)`; its output is compared with `y` via `max(1)`.
        dataloader: iterable yielding `(x, y)` batches.
        optimizer: optimiser stepped once per batch.
        criterion: loss callable `criterion(output, y)`. If it exposes a `reduction`
            attribute equal to "sum" its value is taken as a per-batch total; otherwise
            it is treated as a per-batch mean (the default of the torch loss modules).
        device: device the batches are moved to.

    Returns:
        `(accuracy, avg_loss)`: accuracy in percent and the loss averaged per sample.

    Raises:
        ZeroDivisionError: if `dataloader` yields no samples.
    """
    model.train()
    mean_reduced = getattr(criterion, "reduction", "mean") != "sum"
    epoch_loss = 0
    correct = 0
    n = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_n = y.size(0)
        # A mean-reduced loss must be weighted by the batch size before summing;
        # otherwise dividing by the sample count below shrinks it by about the batch size.
        epoch_loss += loss.item() * batch_n if mean_reduced else loss.item()
        _, predicted = output.max(1)
        correct += predicted.eq(y).sum().item()
        n += batch_n

    avg_loss = epoch_loss / n
    accuracy = 100 * correct / n
    return accuracy, avg_loss


def evaluate(model,dataloader, criterion, device):
    """Score `model` on `dataloader` without updating it.

    Args:
        model: module called as `model(x)`; switched to eval mode here.
        dataloader: iterable yielding `(x, y)` batches.
        criterion: loss callable `criterion(output, y)`. If it exposes a `reduction`
            attribute equal to "sum" its value is taken as a per-batch total; otherwise
            it is treated as a per-batch mean (the default of the torch loss modules).
        device: device the batches are moved to.

    Returns:
        `(accuracy, avg_loss)`: accuracy in percent and the loss averaged per sample.

    Raises:
        ZeroDivisionError: if `dataloader` yields no samples.
    """
    mean_reduced = getattr(criterion, "reduction", "mean") != "sum"
    total_loss = 0
    correct = 0
    n = 0
    model.eval()
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            _, predicted = torch.max(output, 1)

            loss = criterion(output, y)
            batch_n = y.size(0)
            # Weight a mean-reduced batch loss by its size so that the division by the
            # sample count below yields a true per-sample average.
            total_loss += loss.item() * batch_n if mean_reduced else loss.item()
            n += batch_n
            correct += predicted.eq(y).sum().item()

    avg_loss = total_loss / n
    accuracy = 100 * correct / n
    return accuracy, avg_loss


def train(epochs, model, dataloader_train, dataloader_val, optimizer, criterion, device, model_path,
        tolerance=math.inf):
    """Train for up to `epochs` epochs, checkpointing on validation-loss improvement.

    After each epoch the model is evaluated on `dataloader_val`. Whenever the validation
    loss is lower than the best one seen so far, `model.state_dict()` is saved to
    `model_path`. Training stops early once the number of consecutive epochs without
    improvement exceeds `tolerance`.

    Returns:
        dict with the per-epoch lists "loss_train", "accuracy_train", "loss_val",
        "accuracy_val" and "last_save", the 1-based epoch of the last checkpoint
        (0 if none was written).
    """
    history = {
        "loss_train": [],
        "accuracy_train": [],
        "loss_val": [],
        "accuracy_val": [],
        "last_save": 0
    }
    lowest_val_loss = float('inf')
    stale_epochs = 0

    for epoch_no in range(1, epochs + 1):
        acc_tr, loss_tr = training_epoch(model, dataloader_train, optimizer, criterion, device)
        history["accuracy_train"].append(acc_tr)
        history["loss_train"].append(loss_tr)
        print(f"epoch: {epoch_no}, training loss: {loss_tr}, training accuracy: {acc_tr}")

        acc_va, loss_va = evaluate(model, dataloader_val, criterion, device)
        history["accuracy_val"].append(acc_va)
        history["loss_val"].append(loss_va)
        print(f"epoch: {epoch_no}, validation loss: {loss_va}, validation accuracy: {acc_va}")

        if loss_va < lowest_val_loss:
            # New best validation loss: checkpoint and reset the patience counter.
            lowest_val_loss = loss_va
            torch.save(model.state_dict(), model_path)
            history["last_save"] = epoch_no
            stale_epochs = 0
            print("model saved")
        else:
            stale_epochs += 1
            if stale_epochs > tolerance:
                print(f"Training stopped. Tolerance {tolerance} exceeded")
                break
        print()

    return history

In [None]:
import torch

class RBM:
    """Restricted Boltzmann machine with Bernoulli-sampled units, trained by contrastive divergence.

    Parameters are plain float64 tensors (no autograd):
        a: visible bias, shape (n_visible,)
        b: hidden bias, shape (n_hidden,)
        W: weights, shape (n_visible, n_hidden)
    All three start at zero. Inputs must therefore be float64 tensors as well.
    """

    def __init__(self, n_visible, n_hidden, device='cpu'):
        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.device = torch.device(device)

        self.a = torch.zeros(self.n_visible, dtype=torch.double, device=self.device)
        self.b = torch.zeros(self.n_hidden, dtype=torch.double, device=self.device)
        self.W = torch.zeros((self.n_visible, self.n_hidden), dtype=torch.double, device=self.device)

    def state_dict(self):
        """Return sizes, device and the parameter tensors (by reference, not copied)."""
        return {
            "n_visible": self.n_visible,
            "n_hidden": self.n_hidden,
            "device": self.device,
            "a": self.a,
            "b": self.b,
            "W": self.W
        }

    def load_state_dict(self, state_dict):
        """Restore all fields from a dict produced by `state_dict`; tensors are taken as-is."""
        self.n_visible = state_dict["n_visible"]
        self.n_hidden = state_dict["n_hidden"]
        self.device = state_dict["device"]
        self.a = state_dict["a"]
        self.b = state_dict["b"]
        self.W = state_dict["W"]

    def to(self, device):
        """Move the parameters to `device` and return self."""
        # Normalise to torch.device, as __init__ does, so self.device always has the same
        # type whether a string or a torch.device was passed.
        self.device = torch.device(device)
        self.a = self.a.to(self.device)
        self.b = self.b.to(self.device)
        self.W = self.W.to(self.device)
        return self

    def h_probability(self, v):
        """Activation probability of each hidden unit for a batch `v` of shape (batch, n_visible)."""
        v = v.to(self.device)
        return torch.sigmoid(self.b + v @ self.W)

    def v_probability(self, h):
        """Activation probability of each visible unit for a batch `h` of shape (batch, n_hidden)."""
        h = h.to(self.device)
        return torch.sigmoid(self.a + (self.W @ h.T).T)

    def draw_hidden(self, v):
        """Sample binary hidden states given visible states `v`."""
        v = v.to(self.device)
        return torch.bernoulli(self.h_probability(v))

    def draw_visible(self, h):
        """Sample binary visible states given hidden states `h`."""
        h = h.to(self.device)
        return torch.bernoulli(self.v_probability(h))

    def gibbs_sampling(self, n, h):
        """Run `n` alternating visible/hidden sampling steps starting from hidden states `h`.

        Returns:
            `(v, h)`: the visible and hidden samples of the last step.

        Raises:
            ValueError: if `n` is smaller than 1 (there would be no visible sample to return).
        """
        if n < 1:
            raise ValueError(f"gibbs_sampling needs n >= 1, got {n}")
        h = h.to(self.device)
        for _ in range(n):
            v = self.draw_visible(h)
            h = self.draw_hidden(v)
        return v, h

    def fit(self, V, iterations, learning_rate, cd_n=1, batch_size=64, verbose=False):
        """Update the parameters in place from the rows of `V`.

        Args:
            V: tensor of training rows, shape (n_samples, n_visible).
            iterations: number of full passes over `V`.
            learning_rate: step size applied to every update.
            cd_n: number of Gibbs steps used for the negative phase (must be >= 1).
            batch_size: rows per shuffled mini-batch.
            verbose: print a progress line at the start of each pass.
        """
        dataset = torch.utils.data.TensorDataset(V)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for i in range(iterations):
            if verbose:
                print(f"Iteration: {i + 1} of {iterations}")
            for batch in dataloader:
                v = batch[0].to(self.device)
                # Positive phase uses a hidden sample drawn from the data batch ...
                h = self.draw_hidden(v)
                # ... negative phase uses the samples after cd_n Gibbs steps.
                v_cd, h_cd = self.gibbs_sampling(cd_n, h)

                self.W += learning_rate * ((v.T @ h - v_cd.T @ h_cd) / v.size(0))
                self.a += learning_rate * torch.mean(v - v_cd, dim=0)
                self.b += learning_rate * torch.mean(h - h_cd, dim=0)


In [None]:
# Alias so later cells can refer to the RBM class defined in this notebook as TRBM
# (the `from torch_rbm import RBM as TRBM` import is commented out in the imports cell).
TRBM = RBM

NameError: name 'RBM' is not defined

In [None]:
# Two-stage scaling, fitted on the training split only and then applied to validation:
# standardise each feature, then squeeze the result into [0, 1] (training range).
standard_scaler, minmax_scaler = StandardScaler(), MinMaxScaler(feature_range=(0, 1))

X_norm = minmax_scaler.fit_transform(standard_scaler.fit_transform(X))
X_val_norm = minmax_scaler.transform(standard_scaler.transform(X_val))

# The unscaled tables are no longer needed.
del X, X_val

In [None]:
batch_size = 64

# Features as float64, labels as int64.
X_train_tensor = torch.tensor(X_norm, dtype=torch.float64)
X_val_tensor = torch.tensor(X_val_norm, dtype=torch.float64)
Y_train_tensor = torch.tensor(np.asarray(y), dtype=torch.int64)
Y_val_tensor = torch.tensor(np.asarray(y_val), dtype=torch.int64)

# torch.tensor copied the data, so the numpy arrays can be released.
del X_norm, X_val_norm

In [None]:
# Use the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One visible unit per feature column, 50 hidden units.
input_size = X_train_tensor.size(1)
rbm = TRBM(input_size, 50, device).to(device)

# Timestamps before and after give the wall-clock duration of the fit.
print(datetime.datetime.now())
print("Training RBM using the original RBM code...")
rbm.fit(
    X_train_tensor,
    iterations=20,
    learning_rate=0.01,
    cd_n=1,
    batch_size=256,
    verbose=True,
)
print("RBM training complete.")
print(datetime.datetime.now())