In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict

# Label value that identifies normal (non-attack) traffic
normal_label = "Benign"
# Base path where to find the dataset
base_path = "../data/IDS2018/original"
# Base path where processed dataset will be stored
export_path = "../data/IDS2018"
# Name of the file summarizing the preprocessing
info_fname = "ids2018_info.csv"
# File name of the cleaned/processed dataset
export_fname = "ids2018.csv"
# Used to track preprocessing steps.
# NOTE: `defaultdict()` without a default factory behaves like a plain dict
# (missing keys still raise KeyError), so counters are initialised explicitly.
stats = defaultdict()
stats["n_dropped_cols"] = 0
stats["n_dropped_rows"] = 0
# Columns to drop before any analysis.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python ('Dst Port' 'Protocol' -> 'Dst PortProtocol').
cols_to_drop = [
    'Flow ID',
    'Src IP',
    'Dst IP',
    'Src Port',
    'Dst Port',
    'Protocol',
    'Timestamp',
]
# Names of the numeric flow-feature columns, kept in their original order
# and laid out several per line, grouped by shared name prefix.
num_cols = [
    "Tot Fwd Pkts", "Tot Bwd Pkts", "TotLen Fwd Pkts", "TotLen Bwd Pkts",
    "Fwd Pkt Len Max", "Fwd Pkt Len Min", "Fwd Pkt Len Mean", "Fwd Pkt Len Std",
    "Bwd Pkt Len Max", "Bwd Pkt Len Min", "Bwd Pkt Len Mean", "Bwd Pkt Len Std",
    "Flow Byts/s", "Flow Pkts/s",
    "Flow IAT Mean", "Flow IAT Std", "Flow IAT Max", "Flow IAT Min",
    "Fwd IAT Tot", "Fwd IAT Mean", "Fwd IAT Std", "Fwd IAT Max", "Fwd IAT Min",
    "Bwd IAT Tot", "Bwd IAT Mean", "Bwd IAT Std", "Bwd IAT Max", "Bwd IAT Min",
    "Fwd PSH Flags", "Fwd URG Flags",
    "Fwd Header Len", "Bwd Header Len",
    "Fwd Pkts/s", "Bwd Pkts/s",
    "Pkt Len Min", "Pkt Len Max", "Pkt Len Mean", "Pkt Len Std", "Pkt Len Var",
    "FIN Flag Cnt", "SYN Flag Cnt", "RST Flag Cnt", "PSH Flag Cnt",
    "ACK Flag Cnt", "URG Flag Cnt", "CWE Flag Count", "ECE Flag Cnt",
    "Down/Up Ratio", "Pkt Size Avg",
    "Fwd Seg Size Avg", "Bwd Seg Size Avg",
    "Subflow Fwd Pkts", "Subflow Fwd Byts", "Subflow Bwd Pkts", "Subflow Bwd Byts",
    "Fwd Act Data Pkts", "Fwd Seg Size Min",
    "Active Mean", "Active Std", "Active Max", "Active Min",
    "Idle Mean", "Idle Std", "Idle Max", "Idle Min",
]

In [None]:
# Load every CSV file found in `base_path` and merge them into a single frame.
# Files are visited in sorted order so the row order of the merged frame does
# not depend on the order in which the OS happens to list the directory, and
# non-CSV entries (hidden files, sub-directories, ...) are skipped.
csv_files = sorted(f for f in os.listdir(base_path) if f.lower().endswith(".csv"))
if not csv_files:
    raise FileNotFoundError("No CSV files found in {}".format(base_path))

chunks = []
for f in csv_files:
    chunk = pd.read_csv(os.path.join(base_path, f))
    # Header names may carry stray whitespace.
    chunk.columns = chunk.columns.str.strip()
    # Coerce every column except the label to numeric; values that cannot be
    # parsed become NaN.
    is_feature = chunk.columns != "Label"
    chunk.loc[:, is_feature] = chunk.loc[:, is_feature].apply(pd.to_numeric, errors="coerce")
    # NOTE: the identifier columns are intentionally NOT dropped here. The
    # previous `chunk.drop(cols_to_drop, ...)` call discarded its result and
    # was therefore a no-op; the following cell removes those columns.
    chunks.append(chunk)
    print(f)

# Concatenate once at the end: calling `pd.concat` inside the loop re-copies
# all previously loaded rows on every iteration.
df = pd.concat(chunks, ignore_index=True)
df.to_csv(export_path + "/ids2018_merged.csv", index=False)
df.head(5)

In [None]:
# Identifier / metadata columns that are removed before the analysis.
id_cols = ["Flow ID", "Src IP", "Src Port", "Dst IP", "Dst Port", "Protocol", "Timestamp"]
# `errors="ignore"` drops only the columns that are still present, so this
# cell can be re-run without raising a KeyError on already-removed columns.
df = df.drop(columns=id_cols, errors="ignore")
# Overwrite the merged file so that it no longer contains the identifier columns.
df.to_csv(export_path + "/ids2018_merged.csv", index=False)
df.columns

In [None]:
# Reload the merged dataset from disk and (re)initialise the bookkeeping
# entries describing the raw data.
df = pd.read_csv(export_path + "/ids2018_merged.csv")
n_rows, n_cols = df.shape
n_anomalies = (df["Label"] != normal_label).sum()
stats.update({
    "dropped_cols": "",
    "n_dropped_cols": 0,
    "n_dropped_rows": 0,
    "n_instances": n_rows,
    # every column except "Label" counts as a feature
    "n_features": n_cols - 1,
    "anomaly_ratio": "{:2.4f}".format(n_anomalies / n_rows),
})
df.head(5)

In [None]:
# Display the dtype of every column of the current frame.
df.dtypes

## Inner class imbalance
Among the anomalies themselves, there is a strong class imbalance.

In [None]:
# Original class imbalance within attacks themselves
mask = df["Label"] != normal_label
# Number of rows per attack label (normal traffic excluded).
attack_counts = df.loc[mask, "Label"].value_counts()
original_ad_ratios = pd.DataFrame({
    "Count": attack_counts,
    # share of each attack label among all non-normal rows
    "Ratio": attack_counts / mask.sum(),
})
# The columns are named before exporting so that the CSV header reads
# "Count"/"Ratio"; previously the file was written before the rename.
original_ad_ratios.to_csv(export_path + "/ids2018_anomaly_labels_ratio.csv")
original_ad_ratios

In [None]:
# Collapse fine-grained attack labels into coarser families.
# Each entry is (label prefix, replacement label); entries are applied in order
# and the prefix match is case-sensitive.
prefix_groups = (
    ("DoS", "DoS"),                  # Group DoS attacks
    ("DDoS", "DDoS"),                # Group DDoS attacks
    ("DDOS", "DDoS"),
    ("Brute Force", "Web Attack"),   # Group Web attacks
    ("SQL", "Web Attack"),
)
for prefix, group in prefix_groups:
    mask = df["Label"].str.startswith(prefix)
    df.loc[mask, "Label"] = group

Found 49 rows that are duplicates of the header row

In [None]:
# Show the rows whose "Label" value is the literal string "Label".
df[df["Label"] == "Label"]

In [None]:
# Updated class imbalance
mask = df["Label"] != normal_label
# Rows per attack label once normal traffic is excluded.
attack_counts = df.loc[mask, "Label"].value_counts()
mod_ad_ratios = pd.DataFrame({
    "Count": attack_counts,
    "Ratio": attack_counts / mask.sum(),
})
mod_ad_ratios

In [None]:
# Second grouping pass over the labels. Each rule pairs a matcher, applied to
# the current "Label" column, with the label assigned to the matching rows.
label_rules = (
    # Group DoS attacks
    (lambda labels: labels.str.startswith("DoS"), "DoS"),
    # Group DDoS attacks (prefix compared in lower case)
    (lambda labels: labels.str.lower().str.startswith("ddos"), "DDoS"),
    # Group Web attacks
    (lambda labels: labels.str.startswith("Web Attack"), "Web Attack"),
)
for matches, group in label_rules:
    mask = matches(df["Label"])
    df.loc[mask, "Label"] = group

In [None]:
# Updated class imbalance
mask = df["Label"] != normal_label
# Rows per attack label once normal traffic is excluded.
attack_counts = df.loc[mask, "Label"].value_counts()
mod_ad_ratios = pd.DataFrame({
    "Count": attack_counts,
    "Ratio": attack_counts / mask.sum(),
})
mod_ad_ratios

## Check unique values
Drop columns that contain at most one distinct value (constant columns), since they carry no information.

In [None]:
# Columns holding at most one distinct value.
n_distinct = df.nunique()
uniq_cols = n_distinct[n_distinct <= 1].index.tolist()
stats["n_unique_cols"] = len(uniq_cols)
if len(uniq_cols) > 0:
    print("Found {} columns with unique values: {}".format(len(uniq_cols), uniq_cols))
    stats["unique_cols"] = ", ".join(map(str, uniq_cols))
    df = df.drop(columns=uniq_cols)
    stats["n_dropped_cols"] += len(uniq_cols)
    # Re-evaluate on the reduced frame so the assertion below checks the result.
    n_distinct = df.nunique()
    uniq_cols = n_distinct[n_distinct <= 1].index.tolist()
assert not uniq_cols, "Found {} columns with unique values: {}".format(len(uniq_cols), uniq_cols)
print("Columns are valid with more than one distinct value")

## Check for NaN/invalid values
First, find the columns with NaN values. Further processing will be required if we find any.

In [None]:
# Remove the rows whose `Flow Duration` is missing and record how many were removed.
missing_duration = df["Flow Duration"].isna()
n_dropped = int(missing_duration.sum())
stats["n_dropped_rows"] += n_dropped
df = df.drop(index=df.index[missing_duration])
print("Dropped {} rows".format(n_dropped))

In [None]:
# Replacing INF values with NaN
df = df.replace([-np.inf, np.inf], np.nan)
# Per-column NaN counts; keep the names of the columns with at least one NaN.
nan_counts = df.isna().sum()
nan_cols = nan_counts[nan_counts > 0].index.tolist()
stats["n_nan_cols"] = len(nan_cols)
if nan_cols:
    stats["nan_cols"] = ", ".join(str(col) for col in nan_cols)
print("Found NaN columns: {}".format(nan_cols))

Having found two columns with NaN values, we must investigate further before taking any decision.

Dropping them seems risky because we would also lose anomalies which are already scarce and important for evaluation.

In [None]:
# Label distribution of the rows that contain at least one NaN value.
df[df.isna().any(axis=1)]["Label"].value_counts()

Dropping them seems risky since we also drop anomalies which are already scarce

In [None]:
# Check different values in Flow Duration
has_nan = df.isna().any(axis=1)
print(df.loc[has_nan, "Flow Duration"].unique())
# Count number of nan instances when `Flow Duration` > 0
positive_duration = df["Flow Duration"] > 0
df.loc[positive_duration].isna().sum().sum()

`Flow Byts/s` and `Flow Pkts/s` must be computed from `Flow Duration`. When the latter column is zero, a division by zero occurs and the first two columns have NaN values. Zero values in `Flow Duration` are probably due to a lack of precision in the data type used: they must be associated with flows that lasted only nanoseconds. Hence, we can replace the NaN values with zeros.

In [None]:
# Number of rows that have a NaN in at least one of the NaN columns.
# (The previous `.sum()[0]` used a positional lookup on a label-indexed Series,
# counted only the first column and failed when `nan_cols` was empty.)
n_replaced = int(df[nan_cols].isna().any(axis=1).sum())
# Guard against an empty frame to avoid dividing by zero.
replaced_ratio = n_replaced / len(df) if len(df) else 0.0
df = df.fillna(0)
# The ratio is scaled by 100 so that the printed value really is a percentage.
print("Replaced {} rows or {:2.4f}% of original data".format(n_replaced, 100 * replaced_ratio))
remaining_nans = df.isna().sum().sum()
assert remaining_nans == 0, "There are still {} NaN values".format(remaining_nans)

## Check for negative values
Most of the features should be non-negative. For instance, a packet with a negative number of bytes makes no sense.

In [None]:
# Non-object columns, and those among them with at least one negative value.
num_cols = df.select_dtypes(exclude="object").columns
mask = (df[num_cols] < 0).any()
neg_cols = num_cols[mask.to_numpy()]
stats["n_negative_cols"] = len(neg_cols)
stats["negative_cols"] = ", ".join(neg_cols)
print("Found {} columns with negative values: {}".format(len(neg_cols), neg_cols))

In [None]:
# For each column with negative values: how many rows are negative and which
# share of the whole frame they represent, most affected columns first.
neg_counts = (df[neg_cols] < 0).sum()
neg_df = pd.DataFrame(
    {"Count": neg_counts, "Ratio": neg_counts / len(df)}
).sort_values("Count", ascending=False)
neg_df

In [None]:
# Distinct negative values present in the two `Init ... Win Byts` columns.
for col in ("Init Bwd Win Byts", "Init Fwd Win Byts"):
    print(df.loc[df[col] < 0, col].unique())

In [None]:
# Drop every column in which more than 1% of the rows are negative
# (presumably the two `Init ... Win Byts` columns inspected above - verify
# against the printed list).
too_negative = neg_df["Ratio"] > 0.01
to_drop = neg_df.index[too_negative].tolist()
df = df.drop(columns=to_drop)
neg_df = neg_df.drop(index=to_drop)
stats["n_dropped_cols"] += len(to_drop)
stats["dropped_cols"] += ", ".join(to_drop)
# Refresh the list of numeric columns now that some have been removed.
num_cols = df.select_dtypes(include=np.number).columns
print("Dropped {} columns: {}".format(len(to_drop), to_drop))

In [None]:
# Label distribution of the rows that still hold a negative value in at least
# one numeric column. `axis` is passed by keyword: pandas >= 2.0 no longer
# accepts it positionally in `DataFrame.any`.
df[(df[num_cols] < 0).any(axis=1)]["Label"].value_counts()

The remaining invalid values are associated with only 15 benign rows. Removing them is probably the safest solution here.

In [None]:
# Rows that still hold a negative value in at least one numeric column.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
df[(df[num_cols] < 0).any(axis=1)]

When Flow Duration < 0, multiple columns are negative. Since these rows are only associated with BENIGN flows, we can drop them.

In [None]:
# Remove the rows with a negative `Flow Duration` and record how many were removed.
negative_duration = df["Flow Duration"] < 0
n_dropped = negative_duration.sum()
stats["n_dropped_rows"] += n_dropped
df = df.loc[df["Flow Duration"] >= 0]
print("Dropped {} rows".format(n_dropped))

In [None]:
# Remove every remaining row with a negative value in a numeric column.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
has_negative = (df[num_cols] < 0).any(axis=1)
# Count the rows actually removed instead of assuming exactly one.
n_dropped = int(has_negative.sum())
df = df.loc[~has_negative]
stats["n_dropped_rows"] += n_dropped
assert not (df[num_cols] < 0).any(axis=1).any(), "there are still negative rows"

In [None]:
# Keep the textual class in `Category`; `Label` becomes a binary flag
# (0 for the normal label, 1 for everything else) stored as uint8.
df["Category"] = df["Label"]
df["Label"] = (df["Label"] != normal_label).astype(np.uint8)

## Normalize attributes

In [None]:
# Scale every numeric feature to the [0, 1] range.
# "Label" is numeric (uint8) at this point but it is the target, not a feature:
# it is excluded so that it is neither rescaled nor converted to float.
num_cols = df.select_dtypes(include=[np.number]).columns.drop("Label", errors="ignore")
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
# Sanity checks: after scaling each feature must span exactly [0, 1].
assert np.allclose(df[num_cols].max(axis=0).to_numpy(), 1.), "Found values different than 1."
assert np.allclose(df[num_cols].min(axis=0).to_numpy(), 0.), "Found values lesser than 0."
print("Data is scaled between 0 and 1")

In [None]:
# Summary of the processed frame.
n_rows, n_cols = df.shape
# "Label" and "Category" are not features, hence the "- 2".
stats["n_final_features"] = n_cols - 2
stats["n_final_rows"] = n_rows
stats["final_anomaly_ratio"] = (df["Label"] != 0).sum() / n_rows
stats

## Store processed dataset to CSV

In [None]:
# Write the processed frame (without the index) and report where it was saved.
out_path = export_path + "/" + export_fname
df.to_csv(out_path, index=False)
print("Processed data saved under: {}".format(out_path))