In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# DATA_DIR = '/content/drive/MyDrive/kitsune+network+attack+dataset.zip'


In [2]:
# import zipfile
# import os
# 
# extract_dir = '/content/kitsune'
# os.makedirs(extract_dir, exist_ok=True)
# 
# with zipfile.ZipFile(DATA_DIR, 'r') as zip_ref:
#     zip_ref.extractall(extract_dir)

In [3]:
import zipfile
import os
import pandas as pd
import numpy as np
from torch_rbm import RBM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import datetime
import torch.nn as nn
import torch
import math
from softmax import Net, train
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [4]:
extract_dir = 'content/kitsune'
attack_dirs = sorted([d for d in os.listdir(extract_dir) if os.path.isdir(os.path.join(extract_dir, d))])
print("Available attack directories:")
for attack in attack_dirs:
    print(" -", attack)

Available attack directories:
 - active_wiretap
 - arp_mitm
 - fuzzing
 - mirai
 - os_scan
 - ssdp_flood
 - ssl_renegotiation
 - syn_dos
 - video_injection


In [5]:
k_sample = 0.1

attack_mapping = {
    'os_scan'            : 'OS Scan',
    'fuzzing'            : 'Fuzzing',
    'video_injection'    : 'Video Injection',
    'arp_mitm'           : 'ARP MitM',
    'active_wiretap'     : 'Active Wiretap',
    'ssdp_flood'         : 'SSDP Flood',
    'syn_dos'            : 'SYN DoS',
    'ssl_renegotiation'  : 'SSL Renegotiation',
    'mirai'              : 'Mirai'
}

# features_dfs = {}
# labels_dfs   = {}
errors       = []


feature_list = []
label_list   = []

# ─── 4) Loop over each directory, build exact filenames, and load ────────────────

for folder in attack_dirs:
    if folder not in attack_mapping:
        errors.append(f"→ No mapping found for folder: '{folder}'")
        continue

    base_name   = attack_mapping[folder]          # e.g. "Active Wiretap"
    attack_path = os.path.join(extract_dir, folder)

    # Build the expected .csv.gz paths
    features_path = os.path.join(attack_path, f"{base_name}_dataset.csv.gz")
    labels_path   = os.path.join(attack_path, f"{base_name}_labels.csv.gz")

    try:
        # --- 4a) Load features ---
        print(f"Loading features for '{folder}' from:\n  {features_path}")
        df_feat = pd.read_csv(features_path, compression='gzip', header=None)
        print(f"  → Features shape: {df_feat.shape}")


        if folder == 'mirai':
            # Mirai labels file has only one column (no index column)
            print(f"Loading MIRAI labels (single‐column) from:\n  {labels_path}")
            df_lbl = pd.read_csv(
                labels_path,
                compression='gzip',
                header=None,     # assume first line is a header like "label", drop it
            )
        else:
            # --- 4b) Load labels (parse index,col properly) ---
            print(f"Loading labels for '{folder}' from:\n  {labels_path}")
            df_lbl = pd.read_csv(
              labels_path,
              compression='gzip',
              sep=',',           # split “index,label”
              header=0,          # treat first line (e.g. "0,x") as header
              index_col=0,       # drop the index‐column
              names=['index','label']
            )
            print(f"  → Labels shape:   {df_lbl.shape}")
            df_lbl = df_lbl['label']

        # 4c) Verify row counts match
        if df_feat.shape[0] != df_lbl.shape[0]:
            msg = (
                f"features {df_feat.shape[0]} vs labels {df_lbl.shape[0]}"
            )
            errors.append(msg)
        else:
            print(f"Row counts match ({df_feat.shape[0]} packets)\n")


        # sample k_sample from entire dataset (stratified on y)
        # _, df_feat_sample, _, df_lbl_sample = train_test_split(
        #             df_feat,
        #             df_lbl,
        #             test_size=k_sample,
        #             random_state=0,
        #             stratify=df_lbl
        #         )
        
        # loading full dataset
        df_feat_sample = df_feat
        df_lbl_sample = df_lbl

        # del df_feat, df_lbl, _
        # Reset indices for consistency
        df_feat_sample = df_feat_sample.reset_index(drop=True)
        df_lbl_sample  = df_lbl_sample.reset_index(drop=True)

        feature_list.append(df_feat_sample)
        label_list.append(df_lbl_sample)

        # del df_feat_sample, df_lbl_sample


    except Exception as e:
        errors.append(f"Failed to load '{folder}': {e}\n")

# ─── 5) Summary ─────────────────────────────────────────────────────────────────

print("────────────────────────────────────────")
print(f"Successfully loaded {len(feature_list)} attack datasets.")
if errors:
    print("\nSome folders could not be loaded or had mismatches:")
    for e in errors:
        print(" ", e)
else:
    print("No errors detected. All row counts match.\n")

# Now you can access, for example:
#   features_dfs['active_wiretap']   → pandas DataFrame of shape (N, 115)
#   labels_dfs  ['active_wiretap']   → pandas Series of length N

# Example: display the first 3 rows of “active_wiretap” features & labels
# demo_folder = 'active_wiretap'
# if demo_folder in features_dfs:
#     print(f"\nExample: first 3 rows of {demo_folder} features:")
#     display(features_dfs[demo_folder].head(3))
#     print(f"\nExample: first 10 labels of {demo_folder}:")
#     display(labels_dfs[demo_folder].head(10))

Loading features for 'active_wiretap' from:
  content/kitsune\active_wiretap\Active Wiretap_dataset.csv.gz
  → Features shape: (2278689, 115)
Loading labels for 'active_wiretap' from:
  content/kitsune\active_wiretap\Active Wiretap_labels.csv.gz
  → Labels shape:   (2278689, 1)
Row counts match (2278689 packets)

Loading features for 'arp_mitm' from:
  content/kitsune\arp_mitm\ARP MitM_dataset.csv.gz
  → Features shape: (2504267, 115)
Loading labels for 'arp_mitm' from:
  content/kitsune\arp_mitm\ARP MitM_labels.csv.gz
  → Labels shape:   (2504267, 1)
Row counts match (2504267 packets)

Loading features for 'fuzzing' from:
  content/kitsune\fuzzing\Fuzzing_dataset.csv.gz
  → Features shape: (2244139, 115)
Loading labels for 'fuzzing' from:
  content/kitsune\fuzzing\Fuzzing_labels.csv.gz
  → Labels shape:   (2244139, 1)
Row counts match (2244139 packets)

Loading features for 'mirai' from:
  content/kitsune\mirai\Mirai_dataset.csv.gz
  → Features shape: (764137, 116)
Loading MIRAI label

In [6]:
print(len(feature_list))
print(len(label_list))
print(feature_list[0].shape)
for df in feature_list:
    print(df.shape)
    # print(df.head())
    
feature_list[3] = feature_list[3].iloc[:, 1:]

9
9
(2278689, 115)
(2278689, 115)
(2504267, 115)
(2244139, 115)
(764137, 116)
(1697851, 115)
(4077266, 115)
(2207571, 115)
(2771276, 115)
(2472401, 115)


In [7]:
# feature_list[3] = feature_list[3].iloc[:, 1:]
# label_list   = []
# 
# for folder in list(features_dfs.keys()):
#     df_feat = features_dfs.pop(folder)
#     df_lbl  = labels_dfs.pop(folder)
# 
#     if folder == 'mirai':
#         df_feat = df_feat.iloc[:, 1:]
# 
#     feature_list.append(df_feat)
#     label_list.append(df_lbl)

In [8]:
# merging data
X = pd.concat(feature_list, ignore_index=True)
y = pd.concat(label_list, ignore_index=True)

del feature_list, label_list

nan_counts = X.isna().sum(axis=1)
print(nan_counts.value_counts().sort_index())

X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.mean())
X = X.to_numpy()
y = y.to_numpy()

1    21017597
Name: count, dtype: int64


In [9]:
X.shape

(21017597, 116)

In [10]:
# # free memory
# for name in dir():
#     if name not in ['X', 'y'] and not name.startswith('_'):
#         del globals()[name]
# import gc
# gc.collect()

In [11]:
X, X_val, y, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y, shuffle=True
)

print(f"X_train shape: {X.shape}")
print(f"X_val   shape: {X_val.shape}")
print(f"y_train shape: {y.shape}")
print(f"y_val   shape: {y_val.shape}")

X_train shape: (16814077, 116)
X_val   shape: (4203520, 116)
y_train shape: (16814077, 1)
y_val   shape: (4203520, 1)


In [13]:
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler(feature_range=(0, 1))

X_stand = standard_scaler.fit_transform(X)
X_norm = minmax_scaler.fit_transform(X_stand)

X_val_stand = standard_scaler.transform(X_val)
X_val_norm = minmax_scaler.transform(X_val_stand)

del X, X_val, X_stand, X_val_stand

In [14]:
X_train_tensor = torch.tensor(X_norm, dtype=torch.double)
Y_train_tensor = torch.tensor(np.array(y), dtype=torch.long)
X_val_tensor = torch.tensor(X_val_norm, dtype=torch.double)
Y_val_tensor = torch.tensor(np.array(y_val), dtype=torch.long)

del X_norm, X_val_norm

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
input_size = X_train_tensor.shape[1] # 78 # 116
rbm = RBM(input_size, 50, device)
rbm = rbm.to(device)
print(datetime.datetime.now())
print("Training RBM using the original RBM code...")
rbm.fit(X_train_tensor, iterations=20, learning_rate=0.01, cd_n=1, batch_size=4096, verbose=True)
print("RBM training complete.")
print(datetime.datetime.now())

2025-06-06 22:11:18.981303
Training RBM using the original RBM code...
Iteration: 1 of 20
Iteration: 2 of 20
Iteration: 3 of 20
Iteration: 4 of 20
Iteration: 5 of 20
Iteration: 6 of 20
Iteration: 7 of 20
Iteration: 8 of 20
Iteration: 9 of 20
Iteration: 10 of 20
Iteration: 11 of 20
Iteration: 12 of 20
Iteration: 13 of 20
Iteration: 14 of 20
Iteration: 15 of 20
Iteration: 16 of 20
Iteration: 17 of 20
Iteration: 18 of 20
Iteration: 19 of 20
Iteration: 20 of 20
RBM training complete.
2025-06-06 22:29:25.290528


In [29]:
H_train_tensor = torch.tensor(rbm.draw_hidden(X_train_tensor), dtype=torch.float)
H_val_tensor = torch.tensor(rbm.draw_hidden(X_val_tensor), dtype=torch.float)

dataset_Train = TensorDataset(H_train_tensor.cpu(), Y_train_tensor.squeeze().cpu())
dataset_val = TensorDataset(H_val_tensor.cpu(), Y_val_tensor.squeeze().cpu())

batch_size = 4096
loader_train = DataLoader(dataset_Train, batch_size=batch_size, shuffle=True)
loader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

  H_train_tensor = torch.tensor(rbm.draw_hidden(X_train_tensor), dtype=torch.float)
  H_val_tensor = torch.tensor(rbm.draw_hidden(X_val_tensor), dtype=torch.float)


In [30]:
torch.cuda.empty_cache()

In [31]:
model = Net(50, 2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
# optimizer = optim.SGD(model.parameters())
# optimizer = optim.RMSprop(model.parameters())
history_Adam = train(10, model, loader_train, loader_val, optimizer, criterion, device, model_path='test')

epoch: 1, training loss: 7.83365356800507e-05, training accuracy: 86.18891777407704
epoch: 1, validation loss: 7.534809038354975e-05, validation accuracy: 86.2932256775274
model saved

epoch: 2, training loss: 7.523838525069918e-05, training accuracy: 86.29053500825529
epoch: 2, validation loss: 7.52192468469765e-05, validation accuracy: 86.29305915042632
model saved

epoch: 3, training loss: 7.520389745537175e-05, training accuracy: 86.28116190974978
epoch: 3, validation loss: 7.525875203903588e-05, validation accuracy: 86.30638131851401

epoch: 4, training loss: 7.520255787955125e-05, training accuracy: 86.27713552162274
epoch: 4, validation loss: 7.522480830892775e-05, validation accuracy: 86.28830123325213

epoch: 5, training loss: 7.520194098103831e-05, training accuracy: 86.27535724976161
epoch: 5, validation loss: 7.521662999786227e-05, validation accuracy: 86.28920523751522
model saved

epoch: 6, training loss: 7.520046325593899e-05, training accuracy: 86.27561893525288
epoch: 