Import the dataset

In [1]:
import pandas as pd

# Read the raw CIDDS-001 OpenStack parquet file into a DataFrame and preview it.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where this exact location exists — consider a configurable data dir.
raw_dataset_path = r"D:\projects\School\db-ocsvm\data\raw\CIDDS-001\cidds-001-openstack.parquet"
dataset_pd_openstack = pd.read_parquet(raw_dataset_path)
dataset_pd_openstack.head(5)

Unnamed: 0,duration,proto,packets,bytes,flows,tcp_urg,tcp_ack,tcp_psh,tcp_rst,tcp_syn,tcp_fin,tos,label,attack_type,attack_id
0,0.018,TCP,2,338.0,1,0,1,1,0,0,0,0,normal,benign,0
1,0.0,TCP,1,212.0,1,0,1,1,0,0,0,32,normal,benign,0
2,0.0,TCP,1,108.0,1,0,1,1,0,0,0,0,normal,benign,0
3,0.006,TCP,2,174.0,1,0,1,1,0,0,0,0,normal,benign,0
4,0.019,TCP,2,338.0,1,0,1,1,0,0,0,0,normal,benign,0


# 1. Removing Duplicates

In [2]:
# Report the row count, drop fully duplicated rows, then report the row count again.
rows_before = len(dataset_pd_openstack)
print(f"The number of rows before removing duplicates is: {rows_before:,}")

dataset_pd_openstack = dataset_pd_openstack.drop_duplicates()

rows_after = len(dataset_pd_openstack)
print(f"The number of rows after removing duplicates is: {rows_after:,}")

The number of rows before removing duplicates is: 4,161,690
The number of rows after removing duplicates is: 4,161,690


In [3]:
# Show the first three rows of the de-duplicated frame.
dataset_pd_openstack.iloc[:3]

Unnamed: 0,duration,proto,packets,bytes,flows,tcp_urg,tcp_ack,tcp_psh,tcp_rst,tcp_syn,tcp_fin,tos,label,attack_type,attack_id
0,0.018,TCP,2,338.0,1,0,1,1,0,0,0,0,normal,benign,0
1,0.0,TCP,1,212.0,1,0,1,1,0,0,0,32,normal,benign,0
2,0.0,TCP,1,108.0,1,0,1,1,0,0,0,0,normal,benign,0


# 2. Applying 1-n Encoding

Applying one-hot (1-of-n) encoding to the categorical feature fields; the `label` and `attack_type` target columns are excluded from encoding.

In [4]:
# Columns stored with the pandas "category" dtype are treated as the categorical fields.
categorical_columns = dataset_pd_openstack.select_dtypes(include="category").columns
print(f"The categorical fields in the dataset are: {categorical_columns.tolist()}")

The categorical fields in the dataset are: ['proto', 'label', 'attack_type']


In [5]:
# Keep only the categorical *feature* columns: "label" and "attack_type" are
# targets, so they are taken out of the list before encoding.
categorical_feature_columns = list(categorical_columns)
for target_column in ("label", "attack_type"):
    categorical_feature_columns.remove(target_column)
categorical_feature_columns

['proto']

In [6]:
# Replace each categorical feature column with integer 0/1 indicator columns,
# then show the resulting shape and the first three rows.
dataset_encoded = pd.get_dummies(
    data=dataset_pd_openstack,
    columns=categorical_feature_columns,
    dtype=int,
)
print(dataset_encoded.shape)
dataset_encoded.iloc[:3]

(4161690, 18)


Unnamed: 0,duration,packets,bytes,flows,tcp_urg,tcp_ack,tcp_psh,tcp_rst,tcp_syn,tcp_fin,tos,label,attack_type,attack_id,proto_ICMP,proto_IGMP,proto_TCP,proto_UDP
0,0.018,2,338.0,1,0,1,1,0,0,0,0,normal,benign,0,0,0,1,0
1,0.0,1,212.0,1,0,1,1,0,0,0,32,normal,benign,0,0,0,1,0
2,0.0,1,108.0,1,0,1,1,0,0,0,0,normal,benign,0,0,0,1,0


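The column count increases from 15 to 18 after one-hot encoding: `proto` is replaced by four indicator columns (`proto_ICMP`, `proto_IGMP`, `proto_TCP`, `proto_UDP`).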

In [7]:
# Build the two target columns and drop the raw columns they replace.
#   attack_binary      -> 1 where label == "normal", -1 for every other label value
#   attack_categorical -> the attack_type values under a new column name
# The comparison is vectorized instead of calling a Python lambda once per row.
# NOTE(review): attack_id is left among the feature columns. The preview rows show
# 0 for benign traffic; if it is non-zero for attacks it leaks the label into the
# scaled features — confirm it is dropped before modelling.
is_normal = dataset_encoded["label"] == "normal"
dataset_encoded["attack_binary"] = is_normal.map({True: 1, False: -1})
dataset_encoded["attack_categorical"] = dataset_encoded["attack_type"]
dataset_encoded = dataset_encoded.drop(columns=["label", "attack_type"])

In [8]:
# Show the shape and a random five-row preview of the encoded frame.
print(dataset_encoded.shape)
# Fixed seed (same value used by the other sampling steps in this notebook) so the
# preview shows the same rows on every Restart & Run All.
dataset_encoded.sample(n=5, random_state=42)

(4161690, 18)


Unnamed: 0,duration,packets,bytes,flows,tcp_urg,tcp_ack,tcp_psh,tcp_rst,tcp_syn,tcp_fin,tos,attack_id,proto_ICMP,proto_IGMP,proto_TCP,proto_UDP,attack_binary,attack_categorical
250155,0.1,4,575.0,1,0,1,1,0,1,0,0,0,0,0,1,0,1,benign
579537,0.072,2,138.0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,benign
2966383,1.259,6,3912.0,1,0,1,1,0,1,0,32,0,0,0,1,0,1,benign
2459811,0.716,7,6665.0,1,0,1,1,0,1,0,32,0,0,0,1,0,1,benign
1361957,2.685,51,61973.0,1,0,1,1,0,0,0,32,0,0,0,1,0,1,benign


Verify the values of the `attack_binary` field

In [9]:
# Sanity-check the binary target: per-value counts and the set of distinct values.
attack_binary = dataset_encoded["attack_binary"]
print(attack_binary.value_counts())
print(attack_binary.unique())

attack_binary
 1    4152916
-1       8774
Name: count, dtype: int64
[ 1 -1]


In [None]:
# Show the first three rows of the encoded frame.
dataset_encoded.iloc[:3]

# 3. Splitting

Since there are roughly 4.15 million "normal" records and only 8,774 "attack" records, the split has to be done carefully.

In [None]:
# Class balance of the binary target before splitting.
dataset_encoded.loc[:, "attack_binary"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# 80% train / 20% test split, stratified on attack_binary so both partitions keep
# the same normal/attack ratio; the seed is fixed for reproducibility.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=dataset_encoded["attack_binary"],
)
train_dataset_encoded, test_dataset_encoded = train_test_split(
    dataset_encoded, **split_options
)
for split_frame in (train_dataset_encoded, test_dataset_encoded):
    print(split_frame.shape)

In [None]:
# Print the shape of each partition, then the attack_binary counts of each partition.
# The dict keys are the exact headings printed above each count table.
split_frames = {
    "Train dataset:": train_dataset_encoded,
    "\nTest dataset:": test_dataset_encoded,
}

print("Original dataset sizes:")
for split_frame in split_frames.values():
    print(split_frame.shape)

print("\nOriginal attack distribution:")
for heading, split_frame in split_frames.items():
    print(heading)
    print(split_frame["attack_binary"].value_counts())

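Separating the classes in the train dataset: the attack records are set aside (they are moved to the test set later) and the normal records are down-sampled to 180k.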

In [None]:
# Split the train partition by class, then down-sample the normal rows.
# train_attack is kept as-is here; only train_normal is reduced.
print("Separating normal from attack data")
train_labels = train_dataset_encoded["attack_binary"]
train_normal = train_dataset_encoded[train_labels == 1]
train_attack = train_dataset_encoded[train_labels == -1]
print(train_normal.shape)
print(train_attack.shape)

print("\nReducing normal records to only 180k records")
# Fixed seed keeps the 180,000-row sample reproducible; the index is reset so the
# sampled frame is numbered 0..n-1.
train_normal = train_normal.sample(n=180000, random_state=42).reset_index(drop=True)

# Output label spelling corrected ("datset" -> "dataset").
print("\nNew train dataset shape")
print(train_normal.shape)
print(train_attack.shape)

reducing test dataset and adding the attack records from the train dataset

In [None]:
# Split the test partition by class label.
test_labels = test_dataset_encoded["attack_binary"]
test_normal = test_dataset_encoded[test_labels == 1]
test_attack = test_dataset_encoded[test_labels == -1]
print("\n Test dataset attack distribution")
for class_frame in (test_normal, test_attack):
    print(class_frame.shape)

print(
    "\nReducing normal records to only 11226 records and concatenating the attack records from train set to test set"
)
# Down-sample the normal rows with a fixed seed, and append the attack rows that
# were separated out of the train partition to the test attack rows.
test_normal = test_normal.sample(n=11226, random_state=42)
test_attack = pd.concat([test_attack, train_attack], ignore_index=True)

print("\nNew test dataset shape")
for class_frame in (test_normal, test_attack):
    print(class_frame.shape)

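Recombining the pieces into one DataFrame each: the train set keeps only the sampled normal records, and the test set is the sampled normal records plus all attack records.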

In [None]:
# Train set: an independent copy of the sampled normal rows only.
train_dataset_encoded = train_normal.copy(deep=True)
# Test set: sampled normal rows followed by all attack rows, renumbered from 0.
test_dataset_encoded = pd.concat(
    (test_normal, test_attack), axis=0, ignore_index=True
)
for final_frame in (train_dataset_encoded, test_dataset_encoded):
    print(final_frame.shape)

inspecting distribution

In [None]:
# Class counts of the final train frame, followed by its first three rows.
print(train_dataset_encoded.loc[:, "attack_binary"].value_counts())
print("Train dataset encoded")
train_dataset_encoded.iloc[:3]

In [None]:
# Class counts of the final test frame, followed by its first three rows.
print(test_dataset_encoded.loc[:, "attack_binary"].value_counts())
print("Test dataset encoded")
test_dataset_encoded.iloc[:3]

# 4. Scaling (MinMax, Standard, or Robust — selected by `scaler_type`)

## Scaling the train set

In [None]:
# Selects which scikit-learn scaler the following cell instantiates.
scaler_type = "robust"  # one of: "minmax", "standard", "robust"

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Map every supported scaler_type value to the scaler class it selects.
scaler_classes = {
    "minmax": MinMaxScaler,
    "standard": StandardScaler,
    "robust": RobustScaler,
}

if scaler_type not in scaler_classes:
    # Fail here instead of leaving `scaler` undefined (or silently reusing a scaler
    # left over from an earlier run of this cell) when the setting is mistyped.
    raise ValueError(
        f"Unknown scaler_type {scaler_type!r}; expected one of {sorted(scaler_classes)}"
    )

scaler = scaler_classes[scaler_type]()
# Prints e.g. "Using RobustScaler" — the class name of the selected scaler.
print(f"Using {type(scaler).__name__}")

In [None]:
# Fit the scaler on the train features only (both target columns excluded) and
# transform them in the same call.
train_feature_frame = train_dataset_encoded.drop(
    columns=["attack_binary", "attack_categorical"]
)
train_dataset_scaled = scaler.fit_transform(train_feature_frame)

In [None]:
# Persist the fitted scaler (not the dataset) because the test dataset preprocessing
# will be done in a separate notebook, which needs the same fitted scaler.
# NOTE(review): absolute, machine-specific path; joblib.dump is given the path as-is,
# so the target directory presumably has to exist already — confirm.
import joblib

scaler_path = "/home/jbct/Projects/thesis/db-ocsvm/models/CIDDS-001/scaler.joblib"
joblib.dump(scaler, scaler_path)

In [None]:
# The scaler's output is wrapped back into a DataFrame, labelled with the
# feature column names (every train column except the two targets).
train_feature_names = train_dataset_encoded.columns.drop(
    ["attack_binary", "attack_categorical"]
)
train_dataset_scaled = pd.DataFrame(data=train_dataset_scaled, columns=train_feature_names)

In [None]:
# Re-attach both target columns to the scaled train frame. `.values` is used so the
# assignment is positional and does not depend on index alignment.
for target_column in ("attack_binary", "attack_categorical"):
    train_dataset_scaled[target_column] = train_dataset_encoded[target_column].values

print(train_dataset_scaled.shape)
train_dataset_scaled.head(5)

In [None]:
# Per-feature min / median / max of the scaled train set (target columns excluded).
# The feature frame is built once and reused for all three statistics.
train_scaled_features = train_dataset_scaled.drop(
    columns=["attack_binary", "attack_categorical"]
)
min_values = train_scaled_features.min()
max_values = train_scaled_features.max()
median_values = train_scaled_features.median()

result = pd.DataFrame({"min": min_values, "median": median_values, "max": max_values})

# Lift the row limit only while rendering this table, so the display setting does
# not stay changed for every later cell. `display` is the notebook's rich-display
# function, which IPython/Jupyter kernels make available without an import.
with pd.option_context("display.max_rows", None):
    display(result)

## Scaling the test set

In [None]:
# Apply the already-fitted scaler to the test features (transform only, no refit),
# with both target columns excluded.
test_feature_frame = test_dataset_encoded.drop(
    columns=["attack_binary", "attack_categorical"]
)
test_dataset_scaled = scaler.transform(test_feature_frame)

In [None]:
# Wrap the scaled test output in a DataFrame. Column names are taken from the train
# frame's columns (minus the two targets), as in the original cell.
scaled_feature_names = train_dataset_encoded.columns.drop(
    ["attack_binary", "attack_categorical"]
)
test_dataset_scaled = pd.DataFrame(data=test_dataset_scaled, columns=scaled_feature_names)

# Re-attach both target columns positionally (`.values` bypasses index alignment).
for target_column in ("attack_binary", "attack_categorical"):
    test_dataset_scaled[target_column] = test_dataset_encoded[target_column].values

print("\nTest dataset shape")
print(test_dataset_scaled.shape)
print("\nTest dataset attack distribution (binary)")
print(test_dataset_scaled["attack_binary"].value_counts())
print("\nTest dataset attack distribution (categorical)")
print(test_dataset_scaled["attack_categorical"].value_counts())
test_dataset_scaled.head(5)

In [None]:
# Per-feature min / median / max of the scaled test set (target columns excluded).
# The feature frame is built once and reused for all three statistics.
test_scaled_features = test_dataset_scaled.drop(
    columns=["attack_binary", "attack_categorical"]
)
min_values = test_scaled_features.min()
max_values = test_scaled_features.max()
median_values = test_scaled_features.median()

result = pd.DataFrame({"min": min_values, "median": median_values, "max": max_values})

# Lift the row limit only while rendering this table, so the display setting does
# not stay changed for every later cell. `display` is the notebook's rich-display
# function, which IPython/Jupyter kernels make available without an import.
with pd.option_context("display.max_rows", None):
    display(result)

# 5. Export Dataset to csv

full train dataset that only contains normal class

In [None]:
# Final look at the scaled train set before export: shape, class counts, first rows.
# Output label spelling corrected ("Tran" -> "Train").
print("\nTrain dataset shape")
print(train_dataset_scaled.shape)
print("\nTrain dataset attack distribution (binary)")
print(train_dataset_scaled["attack_binary"].value_counts())
train_dataset_scaled.head()

In [None]:
# Write the scaled train set to CSV without the index column.
# NOTE(review): DATASET is not defined in any cell visible in this notebook —
# confirm it is imported or defined before this cell runs on a fresh kernel.
train_full_csv_path = DATASET["processed"]["CIDDS-001"]["train_full"]
train_dataset_scaled.to_csv(train_full_csv_path, index=False)

test dataset

In [None]:
# Final look at the scaled test set before export: shape, class counts, first rows.
# The printed headings previously said "Tran"/"Train" although this cell reports the
# test set; they now name the frame that is actually printed.
print("\nTest dataset shape")
print(test_dataset_scaled.shape)
print("\nTest dataset attack distribution (binary)")
print(test_dataset_scaled["attack_binary"].value_counts())
test_dataset_scaled.head()

In [None]:
# Write the scaled test set to CSV without the index column.
# NOTE(review): DATASET is not defined in any cell visible in this notebook —
# confirm it is imported or defined before this cell runs on a fresh kernel.
test_csv_path = DATASET["processed"]["CIDDS-001"]["test"]
test_dataset_scaled.to_csv(test_csv_path, index=False)