In [10]:
import os
import pandas as pd
import tensorflow as tf
from utils.binary_processing import split_flag_columns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import math
from copy import copy
import h5py

In [11]:
# Base name of the dataset. It is interpolated into the input CSV path
# (anomaly-flow-datasets/<name>.csv.gz) and into the names of the HDF5
# files written at the end of the preprocessing cell.
DATASET_NAME = "NF-CSE-CIC-IDS2018-v2-DDoS-downsample"

In [12]:
# Column name -> dtype mapping handed to pd.read_csv(dtype=...) when the
# dataset CSV is loaded.
#   * address columns and the textual "Attack" label are read as "object";
#   * the three TCP flag bitfields are read as "int32";
#   * every other column, including the numeric "Label", is read as a float.
# NOTE(review): L7_PROTO and the *_SECOND_BYTES columns use float64 while the
# remaining numeric columns use float32; the reason is not visible here --
# presumably precision/range of the raw values, confirm before changing.
dtypes_netflow = {
    "IPV4_SRC_ADDR":                "object",
    "L4_SRC_PORT":                  "float32",
    "IPV4_DST_ADDR":                "object",
    "L4_DST_PORT":                  "float32",
    "PROTOCOL":                     "float32",
    "L7_PROTO":                     "float64",
    "IN_BYTES":                     "float32",
    "IN_PKTS":                      "float32",
    "OUT_BYTES":                    "float32",
    "OUT_PKTS":                     "float32",
    "TCP_FLAGS":                    "int32",
    "CLIENT_TCP_FLAGS":             "int32",
    "SERVER_TCP_FLAGS":             "int32",
    "FLOW_DURATION_MILLISECONDS":   "float32",
    "DURATION_IN":                  "float32",
    "DURATION_OUT":                 "float32",
    "MIN_TTL":                      "float32",
    "MAX_TTL":                      "float32",
    "LONGEST_FLOW_PKT":             "float32",
    "SHORTEST_FLOW_PKT":            "float32",
    "MIN_IP_PKT_LEN":               "float32",
    "MAX_IP_PKT_LEN":               "float32",
    "SRC_TO_DST_SECOND_BYTES":      "float64",
    "DST_TO_SRC_SECOND_BYTES":      "float64",
    "RETRANSMITTED_IN_BYTES":       "float32",
    "RETRANSMITTED_IN_PKTS":        "float32",
    "RETRANSMITTED_OUT_BYTES":      "float32",
    "RETRANSMITTED_OUT_PKTS":       "float32",
    "SRC_TO_DST_AVG_THROUGHPUT":    "float32",
    "DST_TO_SRC_AVG_THROUGHPUT":    "float32",
    "NUM_PKTS_UP_TO_128_BYTES":     "float32",
    "NUM_PKTS_128_TO_256_BYTES":    "float32",
    "NUM_PKTS_256_TO_512_BYTES":    "float32",
    "NUM_PKTS_512_TO_1024_BYTES":   "float32",
    "NUM_PKTS_1024_TO_1514_BYTES":  "float32",
    "TCP_WIN_MAX_IN":               "float32",
    "TCP_WIN_MAX_OUT":              "float32",
    "ICMP_TYPE":                    "float32",
    "ICMP_IPV4_TYPE":               "float32",
    "DNS_QUERY_ID":                 "float32",
    "DNS_QUERY_TYPE":               "float32",
    "DNS_TTL_ANSWER":               "float32",
    "FTP_COMMAND_RET_CODE":         "float32",
    "Attack":                       "object",
    "Label":                        "float32",
}

In [13]:
def save_dataset_to_h5(set_x, set_y, filename):
    """
    Write a feature set and its labels to an HDF5 file.

    The file is opened in 'w' mode, so an existing file with the same name
    is overwritten. Two datasets are created inside it:
        'set_x' -- holds ``set_x`` (the samples)
        'set_y' -- holds ``set_y`` (the labels)

    Args:
        set_x: array-like data stored under the key 'set_x'.
        set_y: array-like data stored under the key 'set_y'.
        filename: path of the HDF5 file to create.
    """
    # The context manager guarantees the file handle is closed even when
    # one of the create_dataset calls raises (the previous explicit
    # hf.close() was skipped on error, leaking the handle).
    with h5py.File(filename, 'w') as hf:
        hf.create_dataset('set_x', data=set_x)
        hf.create_dataset('set_y', data=set_y)


In [14]:
def get_features_to_drop() -> list:
    """
    Build the list of column names that are removed before training.

    According to the notebook author these attributes bias the model:
        IPV4_SRC_ADDR / IPV4_DST_ADDR -- source / destination address of the flow.
        L7_PROTO -- layer 7 application protocol, specific to each type of DDoS attack.
        L4_SRC_PORT / L4_DST_PORT -- source / destination port of the flow.
        FTP_COMMAND_RET_CODE -- return code of the FTP command.
        Attack -- descriptive (textual) label of the example class.

    'Unnamed: 0' is dropped as well (presumably a leftover index column
    from a previous CSV export -- confirm against the dataset).

    Returns:
        A freshly built list of column names, in a fixed order.
    """
    leftover_index = ['Unnamed: 0']
    endpoint_addresses = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR']
    protocol_and_ports = ['L7_PROTO', 'L4_SRC_PORT', 'L4_DST_PORT']
    remaining = ['FTP_COMMAND_RET_CODE', 'Attack']

    # Concatenation creates a new list on every call, so callers may
    # mutate the result freely.
    return leftover_index + endpoint_addresses + protocol_and_ports + remaining

In [15]:
# Seed passed as random_state to train_test_split so that the
# train/test partition is the same on every run.
SEED = 42

In [16]:
# Load Netflow Datasets
print(f"Loading dataset {DATASET_NAME}...")

df = pd.read_csv(
    f"{os.getcwd()}/anomaly-flow-datasets/{DATASET_NAME}.csv.gz",
    dtype=dtypes_netflow
)

print(f"Loaded dataset {DATASET_NAME} [OK]")

# Preprocessing to perform one-hot encoding of descriptive attributes.
print("Initialized Columns Split Preprocessing.")

df = split_flag_columns(df)

print("Finished Columns Split Preprocessing. [OK]")

# Remove unused features for training.
df = df.drop(columns=get_features_to_drop())

# Preprocessing to remove very large values.
# Infinite values become NaN; the boolean mask then turns every cell that is
# not strictly below the float32 maximum into NaN as well, and dropna()
# discards every row that contains at least one such cell.
df = df.replace([np.inf, -np.inf], np.nan)
threshold = np.finfo(np.float32).max
df = df[df < threshold].dropna()

X, y = df.drop(['Label'], axis=1), df['Label']
columns = X.columns

print('Experiments using the following columns: ')
print(*list(columns), sep='\n')

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20, random_state=SEED, shuffle=True)

# Normalize the data between 0 and 1 (FLAD output)
scaler = MinMaxScaler(feature_range=(0, 1))

# Optional cap on the number of training rows; None keeps the full split.
train_size = None

# The scaler is always fitted on the benign (Label == 0) rows of the full
# training split. The previous `else` branch called `self.scaler.fit(...)`,
# which raises NameError at module level as soon as train_size is set.
scaler.fit(X_train[y_train == 0])

if train_size is not None:
    # Positional slice: keep only the first `train_size` rows.
    X_train = X_train[:train_size]
    y_train = y_train[:train_size]

X_train = scaler.transform(X_train)

validation_benign_samples = math.floor(0.1 * np.count_nonzero(y_train == 0))
validation_attack_samples = math.floor(0.1 * np.count_nonzero(y_train == 1))

# transform() already returns a new array, so no extra copy is needed.
X_test = scaler.transform(X_test)

# Save the datasets
save_dataset_to_h5(X_train, y_train, f'{DATASET_NAME}-train.hdf5')
save_dataset_to_h5(X_test, y_test, f'{DATASET_NAME}-test.hdf5')
# NOTE(review): the validation file is the first `validation_benign_samples`
# rows of the mixed-class training split, so it overlaps the training file,
# and `validation_attack_samples` is not used in this cell. Behaviour is kept
# as it was -- confirm that this is the intended validation set.
save_dataset_to_h5(X_train[:validation_benign_samples], y_train[:validation_benign_samples], f'{DATASET_NAME}-val.hdf5')


Loading dataset NF-CSE-CIC-IDS2018-v2-DDoS-downsample...
Loaded dataset NF-CSE-CIC-IDS2018-v2-DDoS-downsample [OK]
Initialized Columns Split Preprocessing.
Using cached file: ec93b974b991f647501fb155031de7d4.
Finished Columns Split Preprocessing. [OK]
Experiments using the following columns: 
PROTOCOL
IN_BYTES
IN_PKTS
OUT_BYTES
OUT_PKTS
FLOW_DURATION_MILLISECONDS
DURATION_IN
DURATION_OUT
MIN_TTL
MAX_TTL
LONGEST_FLOW_PKT
SHORTEST_FLOW_PKT
MIN_IP_PKT_LEN
MAX_IP_PKT_LEN
SRC_TO_DST_SECOND_BYTES
DST_TO_SRC_SECOND_BYTES
RETRANSMITTED_IN_BYTES
RETRANSMITTED_IN_PKTS
RETRANSMITTED_OUT_BYTES
RETRANSMITTED_OUT_PKTS
SRC_TO_DST_AVG_THROUGHPUT
DST_TO_SRC_AVG_THROUGHPUT
NUM_PKTS_UP_TO_128_BYTES
NUM_PKTS_128_TO_256_BYTES
NUM_PKTS_256_TO_512_BYTES
NUM_PKTS_512_TO_1024_BYTES
NUM_PKTS_1024_TO_1514_BYTES
TCP_WIN_MAX_IN
TCP_WIN_MAX_OUT
ICMP_TYPE
ICMP_IPV4_TYPE
DNS_QUERY_ID
DNS_QUERY_TYPE
DNS_TTL_ANSWER
URGENT_POINTER
ACKNOWLEDGEMENT
PUSH
RESET
SYNCHRONISATION
FIN
CLIENT_URGENT_POINTER
CLIENT_ACKNOWLEDGEMEN