In [170]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict

# Base path where to find the dataset
base_path = "../data/IDS2017"
# Base path where processed dataset will be stored
export_path = "../data/IDS2017"
# Name of the file summarizing the preprocessing
info_fname = "ids2017_info.csv"
# File name of the original dataset
fname = ""
# File name of the cleaned/processed dataset
export_fname = "ids2017.npy"
# Used to track preprocessing steps
stats = defaultdict()

In [171]:
df = pd.DataFrame()
for f in os.listdir(base_path):
    chunk = pd.read_csv(os.path.join(base_path, f))
    chunk.columns = chunk.columns.str.strip()
    df = pd.concat((df, chunk))
    print(f)
stats["dropped_cols"] = ""
stats["n_dropped_cols"] = 0
stats["n_dropped_rows"] = 0
stats["n_instances"] = df.shape[0]
stats["n_features"] = df.shape[1] - 1
stats["anomaly_ratio"] = "{:2.4f}".format((df["Label"] != "BENIGN").sum() / len(df))
df.head(5)

Wednesday-workingHours.pcap_ISCX.csv
Tuesday-WorkingHours.pcap_ISCX.csv
Monday-WorkingHours.pcap_ISCX.csv
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Friday-WorkingHours-Morning.pcap_ISCX.csv
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,38308,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,88,1095,10,6,3150,3150,1575,0,315.0,632.561635,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,389,15206,17,12,3452,6660,1313,0,203.058823,425.778474,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,88,1092,9,6,3150,3152,1575,0,350.0,694.509719,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


## Inner class imbalance
Between anomalies, there is a strong class imbalance

In [172]:
# Original class imbalance within attacks themselves
mask = df["Label"] != "BENIGN"
original_ad_ratios = pd.DataFrame(
    pd.concat(
        (df[mask]["Label"].value_counts(),
        df[mask]["Label"].value_counts() / len(df[mask])), axis=1),
)
original_ad_ratios.columns = ["Count", "Ratio"]
original_ad_ratios

Unnamed: 0,Count,Ratio
DoS Hulk,231073,0.414372
PortScan,158930,0.285002
DDoS,128027,0.229585
DoS GoldenEye,10293,0.018458
FTP-Patator,7938,0.014235
SSH-Patator,5897,0.010575
DoS slowloris,5796,0.010394
DoS Slowhttptest,5499,0.009861
Bot,1966,0.003526
Web Attack � Brute Force,1507,0.002702


We regroup similar attacks under the same label

In [173]:
# Group DoS attacks
mask = df["Label"].str.startswith("DoS")
df.loc[mask, "Label"] = "DoS"

# Group Web attacks
mask = df["Label"].str.startswith("Web Attack")
df.loc[mask, "Label"] = "Web Attack"

In [174]:
# Updated class imbalance
mask = df["Label"] != "BENIGN"
mod_ad_ratios = pd.DataFrame(
    pd.concat(
        (df[mask]["Label"].value_counts(),
        df[mask]["Label"].value_counts() / len(df[mask])), axis=1),
)
mod_ad_ratios.columns = ["Count", "Ratio"]
mod_ad_ratios

Unnamed: 0,Count,Ratio
DoS,252661,0.453085
PortScan,158930,0.285002
DDoS,128027,0.229585
FTP-Patator,7938,0.014235
SSH-Patator,5897,0.010575
Web Attack,2180,0.003909
Bot,1966,0.003526
Infiltration,36,6.5e-05
Heartbleed,11,2e-05


## Check unique values
Drop columns with unique values

In [175]:
uniq_cols = df.columns[df.nunique() <= 1].tolist()
stats["n_unique_cols"] = len(uniq_cols)
if uniq_cols:
    print("Found {} columns with unique values: {}".format(len(uniq_cols), uniq_cols))
    stats["unique_cols"] = ", ".join([str(col) for col in uniq_cols])
    df.drop(uniq_cols, axis=1, inplace=True)
    stats["n_dropped_cols"] += len(uniq_cols)
    uniq_cols = df.columns[df.nunique() <= 1].tolist()
assert len(uniq_cols) == 0, "Found {} columns with unique values: {}".format(len(uniq_cols), uniq_cols)
print("Columns are valid with more than one distinct value")

Found 8 columns with unique values: ['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']
Columns are valid with more than one distinct value


## Check for NaN/invalid values
First, find the columns with NaN values. Further processing will be required if we find any.

In [176]:
# Replacing INF values with NaN
df = df.replace([-np.inf, np.inf], np.nan)
nan_cols = df.columns[df.isna().sum() > 0].tolist()
stats["n_nan_cols"] = len(nan_cols)
if nan_cols:
    stats["nan_cols"] = ", ".join([str(col) for col in nan_cols])
print("Found NaN columns: {}".format(nan_cols))

Found NaN columns: ['Flow Bytes/s', 'Flow Packets/s']


Having found two columns with NaN values, we must investigate further before taking any decision.

In [177]:
df[nan_cols].isna().sum()

Flow Bytes/s      2867
Flow Packets/s    2867
dtype: int64

Are the nan instances associated with anomalies?

In [178]:
df[df.isna().any(axis=1)]["Label"].value_counts()

BENIGN         1777
DoS             949
PortScan        126
Bot              10
FTP-Patator       3
DDoS              2
Name: Label, dtype: int64

Dropping them seems risky since we also drop anomalies which are already scarce

In [179]:
# Check different values in Flow Duration
print(df[df.isna().any(axis=1)]["Flow Duration"].unique())
# Count number of nan instances when `Flow Duration` > 0
df[df["Flow Duration"] > 0].isna().sum().sum()

[0]


0

`Flow Bytes/s` and `Flow Packets/s` must be computed from `Flow Duration`. When the latter columns is zero, a division by zero occurs and the first two columns have NaN values. Zero values in `Flow Duration` are probably due to a lack of precision in the data type used. They must be associated with flows that lasted nanoseconds. Hence, we can convert the NaN rows to zeros.

In [180]:
ratio = df[nan_cols].isna().sum()[0] / len(df)
df = df.fillna(0)
print("Replaced {:2.4f}% of original data".format(ratio))
remaining_nans = df.isna().sum().sum()
assert remaining_nans == 0, "There are still {} NaN values".format(remaining_nans)

Replaced 0.0010% of original data


## Check for negative values
Most of the features should be strictly positive. For instance, a packet with a negative number of bytes makes no sense.

In [181]:
num_cols = df.select_dtypes(exclude="object").columns
mask = (df[num_cols] < 0).sum() > 0
neg_cols = df[num_cols].columns[mask]
stats["n_negative_cols"] = len(neg_cols)
stats["negative_cols"] = ", ".join(neg_cols)
print("Found {} columns with negative values: {}".format(len(neg_cols), neg_cols))

Found 13 columns with negative values: Index(['Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean',
       'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Min', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Header Length.1', 'Init_Win_bytes_forward',
       'Init_Win_bytes_backward', 'min_seg_size_forward'],
      dtype='object')


In [182]:
neg_df = pd.DataFrame(
    pd.concat((
        (df[neg_cols] < 0).sum(),
        (df[neg_cols] < 0).sum() / len(df)
    ), axis=1)
)
neg_df.columns = ["Count", "Ratio"]
neg_df = neg_df.sort_values("Count", ascending=False)
neg_df

Unnamed: 0,Count,Ratio
Init_Win_bytes_backward,1441552,0.509249
Init_Win_bytes_forward,1001189,0.353684
Flow IAT Min,2891,0.001021
Flow Duration,115,4.1e-05
Flow Packets/s,115,4.1e-05
Flow IAT Mean,115,4.1e-05
Flow IAT Max,115,4.1e-05
Flow Bytes/s,85,3e-05
Fwd Header Length,35,1.2e-05
Fwd Header Length.1,35,1.2e-05


In [183]:
print((df["Init_Win_bytes_backward"][df["Init_Win_bytes_backward"] < 0]).unique())
print((df["Init_Win_bytes_forward"][df["Init_Win_bytes_forward"] < 0]).unique())
df[df["Init_Win_bytes_backward"] < 0]

[-1]
[-1]


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
8,49666,3,2,0,12,0,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
9,49413,4,3,0,18,0,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
14,443,4,2,0,91,0,85,6,45.500000,55.861436,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
23,443,76,3,0,97,0,85,6,32.333333,45.610671,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
34,465,384,3,0,49,0,37,6,16.333333,17.897858,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286453,53,203,2,2,90,122,45,45,45.000000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
286455,53,60066,1,1,52,116,52,52,52.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
286456,53,202,2,2,82,210,41,41,41.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
286458,53,60567,2,2,62,264,31,31,31.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [186]:
# Drop `Init_Win_bytes_forward` and `Init_Win_bytes_backward` because too many of their values are equal to -1 which makes no sense.
to_drop = neg_df[neg_df["Ratio"] > 0.01].index.tolist()
df = df.drop(to_drop, axis=1)
neg_df = neg_df.drop(to_drop)
stats["n_dropped_cols"] += len(to_drop)
stats["dropped_cols"] = stats["dropped_cols"] + ", ".join(to_drop)
num_cols = df.select_dtypes(include=np.number).columns
print("Dropped {} columns: {}".format(len(to_drop), to_drop))

In [197]:
df[(df[num_cols] < 0).any(1)]["Label"].value_counts()

BENIGN          2732
DoS              164
DDoS              19
FTP-Patator        4
Heartbleed         4
SSH-Patator        2
Infiltration       1
Name: Label, dtype: int64

In [None]:
# # For the other attributes we can drop the rows since they account for less than 1% of the original data.
# n_before = len(df)
# df = df[num_cols][df[num_cols] >= 0]
# n_after = len(df)
# n_dropped = n_before - n_after
# stats["n_dropped_rows"] += n_dropped
# print("Dropped {} rows ({:2.4f}% of original data)".format(n_dropped, n_dropped / n_before))

## Check if scaling is required

In [None]:
# num_cols = df.select_dtypes(include=[np.number])
# scaler = MinMaxScaler()
# df[num_cols] = scaler.fit_transform(df[num_cols])
# assert np.allclose(df.max(axis=0).to_numpy(), 1.), "Found values greater than 1."
# assert np.allclose(df.min(axis=0).to_numpy(), 0.), "Found values lesser than 0."
# print("Data is scaled between 0 and 1")

## Store processed dataset in a compressed file

In [None]:
# np.save(base_path + "/" + export_fname, df.to_numpy())
# print("Processed data saved under: {}".format(base_path + "/" + export_fname))

## Store basic information

In [None]:
stats_df = pd.DataFrame(stats, index=[0])
stats_df.to_csv(export_path + "/" + info_fname, index=False)
print("Processing summary saved under: {}".format(base_path + "/" + info_fname))
stats_df