Import the dataset

In [1]:
import os

# Walk two directory levels up from the current working directory. The result
# is the project root only if the kernel was started two folders below it.
working_dir = os.getcwd()
parent_dir = os.path.dirname(working_dir)
root_path = os.path.dirname(parent_dir)
print(f"Project root path: {root_path}")

Project root path: /home/jbct/Projects/thesis/db-ocsvm/notebooks/data


In [2]:
import sys
from dotenv import load_dotenv

# Pull variables defined in a local .env file into the process environment.
load_dotenv()

# The variable name is kept exactly as the notebook has always read it.
# NOTE(review): "ROOTH_PATH" looks like a typo for "ROOT_PATH" — confirm against
# the project's .env file before renaming it.
root_project_path = os.getenv("ROOTH_PATH")

if root_project_path is None:
    # os.getenv returns None for an unset variable. Appending None to sys.path
    # makes nothing importable, so report the problem here instead of letting
    # it surface as an unexplained import failure in a later cell.
    print(
        "WARNING: environment variable ROOTH_PATH is not set; "
        "project-local imports may fail.",
        file=sys.stderr,
    )
elif root_project_path not in sys.path:
    # Skip the append when the entry is already present so that re-running
    # this cell does not keep growing sys.path.
    sys.path.append(root_project_path)

In [3]:
from constants.dataset_paths import DATASET

# Look up the raw NSL-KDD training file in the imported DATASET mapping.
nsl_kdd_raw_paths = DATASET["raw"]["NSL-KDD"]
raw_dataset_path = nsl_kdd_raw_paths["train"]
print(raw_dataset_path)

/home/jbct/Projects/thesis/db-ocsvm/data/raw/NSL-KDD/KDDTrain+.csv


In [4]:
import pandas as pd

# Read the raw training CSV into a DataFrame and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer=raw_dataset_path)
dataset.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
1,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
2,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
3,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
4,0,tcp,private,REJ,0,0,0,0,0,0,...,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune


# Mapping attacks to their classes

In [5]:
# Category -> list of individual attack labels belonging to that category.
attack_classes = {
    "normal": ["normal"],
    "DoS": [
        "mailbomb",
        "udpstorm",
        "neptune",
        "back",
        "land",
        "pod",
        "smurf",
        "teardrop",
        "apache2",
        "processtable",
    ],
    "Probe": ["ipsweep", "nmap", "portsweep", "satan", "saint", "mscan"],
    "R2L": [
        "ftp_write",
        "sendmail",
        "xsnoop",
        "worm",
        "named",
        "xlock",
        "snmpguess",
        "guess_passwd",
        "snmpgetattack",
        "imap",
        "multihop",
        "phf",
        "spy",
        "warezclient",
        "warezmaster",
    ],
    "U2R": [
        "buffer_overflow",
        "sqlattack",
        "loadmodule",
        "perl",
        "rootkit",
        "httptunnel",
        "ps",
        "xterm",
    ],
}

# Reverse index (attack label -> category), built once so that each lookup is a
# single dictionary access instead of a scan over every category list.
# setdefault keeps the first category that lists a label, which matches the
# result a top-to-bottom scan of `attack_classes` would give.
_attack_to_class = {}
for _category, _attack_names in attack_classes.items():
    for _attack_name in _attack_names:
        _attack_to_class.setdefault(_attack_name, _category)


def get_attack_class(attack):
    """Map a single attack label to its category.

    Parameters
    ----------
    attack : str
        Attack label; matching is case-insensitive.

    Returns
    -------
    str
        One of the keys of ``attack_classes``, or ``"Unknown"`` when the label
        is not listed or is not a string (e.g. ``None`` or a NaN produced by a
        missing value).
    """
    # Non-string values have no .lower(); treat them as unclassifiable rather
    # than failing in the middle of a column-wide apply.
    if not isinstance(attack, str):
        return "Unknown"
    return _attack_to_class.get(attack.lower(), "Unknown")


# Derive the coarse category of every row from its fine-grained attack label.
attack_labels = dataset["attack"]
dataset["attack_class"] = attack_labels.apply(get_attack_class)

In [6]:
# Count how many rows fall into each attack category, then print the tally.
class_counts = dataset["attack_class"].value_counts()
print("Attack class distribution:")
print(class_counts)

Attack class distribution:
attack_class
normal    67342
DoS       45927
Probe     11656
R2L         995
U2R          52
Name: count, dtype: int64


In [7]:
# Show five randomly chosen rows (unseeded, so the selection differs per run).
dataset.sample(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,attack_class
122993,0,tcp,ftp,S0,0,0,0,0,0,0,...,0.02,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune,DoS
7604,0,tcp,http,REJ,0,0,0,0,0,0,...,1.0,0.0,0.33,0.06,0.0,0.0,1.0,0.4,normal,normal
14161,0,tcp,uucp,S0,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,1.0,1.0,0.0,0.0,neptune,DoS
38567,0,tcp,private,S0,0,0,0,0,0,0,...,0.09,0.04,0.0,0.0,1.0,1.0,0.0,0.0,neptune,DoS
108757,0,icmp,eco_i,SF,18,0,0,0,0,0,...,1.0,0.0,1.0,0.26,0.0,0.0,0.0,0.0,ipsweep,Probe


# 1. Removing Duplicates

In [8]:
# Report the row count, drop exact duplicate rows in place (keeping the first
# occurrence), then report the row count again so the effect is visible.
print(f"The number of rows before removing duplicates is: {dataset.shape[0]:,}")
dataset.drop_duplicates(keep="first", inplace=True)
print(f"The number of rows after removing duplicates is: {dataset.shape[0]:,}")

The number of rows before removing duplicates is: 125,972
The number of rows after removing duplicates is: 125,972


In [9]:
# Preview the first three rows after de-duplication.
dataset.head(n=3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,attack_class
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,normal
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,DoS
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,normal


# 2. Applying 1-n Encoding

Applying 1-n encoding to the categorical fields

In [10]:
# Columns stored with the generic object dtype are treated as categorical.
categorical_columns = dataset.select_dtypes(include="object").columns
print(f"The categorical fields in the dataset are: {list(categorical_columns)}")

The categorical fields in the dataset are: ['protocol_type', 'service', 'flag', 'attack', 'attack_class']


In [11]:
# Start from every categorical column, then take out the two label columns so
# that only input features remain to be encoded.
categorical_feature_columns = list(categorical_columns)
for label_column in ("attack", "attack_class"):
    categorical_feature_columns.remove(label_column)
categorical_feature_columns

['protocol_type', 'service', 'flag']

In [12]:
# Expand each categorical feature into one integer 0/1 indicator column per
# distinct value; all other columns are carried over unchanged.
train_dataset_encoded = pd.get_dummies(
    data=dataset,
    columns=categorical_feature_columns,
    dtype=int,
)
train_dataset_encoded.head(n=3)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


dimensions increase to 124 columns after 1-n encoding

In [13]:
# Replace the raw label columns with three label columns placed after all
# feature columns, in this order:
#   attack_binary      -> 1 for "normal" rows, -1 for every other label
#   attack_categorical -> the original fine-grained attack label
#   attack_class       -> the coarse attack category
#
# The "attack" column is consumed in place, so the work is guarded by its
# presence: re-running this cell is then a no-op instead of a KeyError.
if "attack" in train_dataset_encoded.columns:
    attack_labels = train_dataset_encoded.pop("attack")
    attack_class_labels = train_dataset_encoded.pop("attack_class")

    # Vectorised comparison instead of a per-row Python lambda.
    train_dataset_encoded["attack_binary"] = attack_labels.eq("normal").map(
        {True: 1, False: -1}
    )
    train_dataset_encoded["attack_categorical"] = attack_labels
    train_dataset_encoded["attack_class"] = attack_class_labels

In [14]:
# Show five randomly chosen encoded rows (unseeded, so the selection differs per run).
train_dataset_encoded.sample(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_binary,attack_categorical,attack_class
56823,6,1998,331,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,normal,normal
25122,0,1519,335,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,normal,normal
74828,2382,146,105,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,normal,normal
43574,0,54540,8314,0,0,0,2,0,1,1,...,0,0,0,0,0,1,0,-1,back,DoS
27344,2,2194619,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,normal,normal


Verify the values of the 'attack_binary' field

In [15]:
# List the distinct values present in the binary label column.
binary_label_values = train_dataset_encoded["attack_binary"].unique()
print(binary_label_values)

[ 1 -1]


In [16]:
# Preview the first three rows of the encoded frame with its label columns.
train_dataset_encoded.head(n=3)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_binary,attack_categorical,attack_class
0,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,normal,normal
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,-1,neptune,DoS
2,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,normal,normal


# 3. Scaling (min-max scaling by default)

In [17]:
# Selects which feature scaler the notebook instantiates; the supported values
# are listed in the trailing comment.
scaler_type = "minmax"  # minmax, standard, robust

In [18]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Supported `scaler_type` values and the scikit-learn class each one selects.
scaler_classes = {
    "minmax": MinMaxScaler,
    "standard": StandardScaler,
    "robust": RobustScaler,
}

# Fail loudly on an unsupported value. Without this check `scaler` would be
# left undefined, or would silently keep an object from an earlier run of
# this cell.
if scaler_type not in scaler_classes:
    raise ValueError(
        f"Unsupported scaler_type {scaler_type!r}; "
        f"expected one of {sorted(scaler_classes)}"
    )

scaler = scaler_classes[scaler_type]()

In [19]:
# Keep only the rows labelled as normal traffic (attack_binary == 1).
# NOTE(review): presumably because the downstream model is trained on normal
# samples only — confirm.
is_normal = train_dataset_encoded["attack_binary"] == 1
train_dataset_encoded = train_dataset_encoded[is_normal]

In [20]:
# Fit the scaler on the feature columns only (the three label columns are left
# out) and transform them in the same step.
label_columns = ["attack_binary", "attack_categorical", "attack_class"]
train_features = train_dataset_encoded.drop(columns=label_columns)
train_dataset_scaled = scaler.fit_transform(train_features)

In [21]:
# Export the fitted scaler, because the test dataset preprocessing will be done
# in a separate notebook.
import joblib

# NOTE(review): absolute, machine-specific path — consider deriving it from the
# project root. It is kept unchanged here so that anything else loading the
# scaler from this location keeps working.
scaler_path = "/home/jbct/Projects/thesis/db-ocsvm/models/NSL-KDD/scaler.joblib"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)

# Remove any previous export so the file on disk always comes from this run.
if os.path.exists(scaler_path):
    os.remove(scaler_path)

joblib.dump(scaler, scaler_path)

['/home/jbct/Projects/thesis/db-ocsvm/models/NSL-KDD/scaler.joblib']

In [22]:
# The scaler returns a plain array; wrap it in a DataFrame again, reusing the
# encoded frame's column names minus the three label columns.
feature_names = train_dataset_encoded.columns.drop(
    ["attack_binary", "attack_categorical", "attack_class"]
)
train_dataset_scaled = pd.DataFrame(train_dataset_scaled, columns=feature_names)

In [23]:
# Re-attach the three label columns to the scaled features. The raw values are
# copied (via .values) so rows are matched by position, not by index label.
for label_column in ("attack_binary", "attack_categorical", "attack_class"):
    train_dataset_scaled[label_column] = train_dataset_encoded[label_column].values
print(train_dataset_scaled.shape)
train_dataset_scaled.head(n=5)

(67342, 125)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_binary,attack_categorical,attack_class
0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,normal,normal
1,0.0,3e-06,0.00116,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,normal,normal
2,0.0,2e-06,6e-05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,normal,normal
3,0.0,3e-06,0.00032,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,normal,normal
4,0.0,3e-06,0.001962,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,normal,normal


In [24]:
# Tally the binary labels present in the scaled training frame.
binary_labels = train_dataset_scaled["attack_binary"]
binary_labels.value_counts()

attack_binary
1    67342
Name: count, dtype: int64

In [25]:
# Summary statistics of the encoded (not yet scaled) numeric columns.
encoded_summary = train_dataset_encoded.describe()
encoded_summary

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,attack_binary
count,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0,...,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0,67342.0
mean,168.589899,13133.47,4329.75,0.000104,0.0,0.000148,0.230658,0.001381,0.710656,0.507083,...,0.003252,0.0,0.002168,0.005257,0.005361,0.001767,0.000668,0.941344,3e-05,1.0
std,1304.461651,418116.2,65463.3,0.010195,0.0,0.017233,2.308353,0.04948,0.453461,32.743564,...,0.056934,0.0,0.046512,0.072313,0.073021,0.042,0.025842,0.234981,0.00545,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,129.0,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
50%,0.0,233.0,379.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,0.0,324.0,2056.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,40504.0,89581520.0,7028652.0,1.0,0.0,3.0,77.0,4.0,1.0,7479.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
# Drop the label columns once and reuse the feature-only frame for each statistic.
scaled_features = train_dataset_scaled.drop(
    columns=["attack_binary", "attack_categorical", "attack_class"]
)
min_values = scaled_features.min()
max_values = scaled_features.max()
median_values = scaled_features.median()

# Lift the row-display limit so every feature row of the summary is shown.
pd.set_option("display.max_rows", None)
result = pd.DataFrame(
    {
        "min": min_values,
        "median": median_values,
        "max": max_values,
    }
)
result

Unnamed: 0,min,median,max
duration,0.0,0.0,1.0
src_bytes,0.0,3e-06,1.0
dst_bytes,0.0,5.4e-05,1.0
land,0.0,0.0,1.0
wrong_fragment,0.0,0.0,0.0
urgent,0.0,0.0,1.0
hot,0.0,0.0,1.0
num_failed_logins,0.0,0.0,1.0
logged_in,0.0,1.0,1.0
num_compromised,0.0,0.0,1.0


# 4. Export Dataset to csv

In [27]:
# Restore pandas' default row-display limit, which an earlier cell lifted with
# pd.set_option("display.max_rows", None).
pd.reset_option("display.max_rows")

In [28]:
# Split the scaled frame into the feature matrix and the binary label vector.
X_train_full = train_dataset_scaled.drop(
    columns=["attack_binary", "attack_categorical", "attack_class"]
)
# Take the labels from the same frame as the features so both share one index.
# The encoded frame still carries the row labels left over from filtering,
# which do not line up with the scaled frame's fresh 0..n-1 index.
y_train_full = train_dataset_scaled["attack_binary"]
X_train_full.shape, y_train_full.shape

((67342, 122), (67342,))

In [29]:
# Confirm the dimensions of the feature matrix and preview its first rows.
feature_matrix_shape = X_train_full.shape
print(feature_matrix_shape)
X_train_full.head(n=5)

(67342, 122)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,3e-06,0.00116,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,2e-06,6e-05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,3e-06,0.00032,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,3e-06,0.001962,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [30]:
# Destination of the processed training features, taken from the imported
# DATASET mapping.
train_set_path = DATASET["processed"]["NSL-KDD"]["train"]

# to_csv does not create missing folders, so make sure the destination
# directory exists. The check skips a bare file name with no directory part.
train_set_dir = os.path.dirname(train_set_path)
if train_set_dir:
    os.makedirs(train_set_dir, exist_ok=True)

# Remove any previous export so the file on disk always comes from this run.
if os.path.exists(train_set_path):
    os.remove(train_set_path)

# Only the scaled feature columns are written; the row index is left out.
X_train_full.to_csv(train_set_path, index=False)