In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict

# Base path where to find the dataset
base_path = "../data/IDS2017/original"
# Base path where processed dataset will be stored
export_path = "../data/IDS2017"
# Name of the file summarizing the preprocessing
info_fname = "ids2017_info.csv"
# File name of the cleaned/processed dataset
export_fname = "ids2017.csv"
# Used to track preprocessing steps
stats = defaultdict()

In [2]:
df = pd.DataFrame()
for f in os.listdir(base_path):
    chunk = pd.read_csv(os.path.join(base_path, f))
    chunk.columns = chunk.columns.str.strip()
    df = pd.concat((df, chunk))
    print(f)
stats["dropped_cols"] = ""
stats["n_dropped_cols"] = 0
stats["n_dropped_rows"] = 0
stats["n_instances"] = len(df)
stats["n_features"] = df.shape[1] - 1
stats["anomaly_ratio"] = "{:2.4f}".format((df["Label"] != "BENIGN").sum() / len(df))
df.head(5)

Wednesday-workingHours.pcap_ISCX.csv
Tuesday-WorkingHours.pcap_ISCX.csv
Monday-WorkingHours.pcap_ISCX.csv
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Friday-WorkingHours-Morning.pcap_ISCX.csv
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,38308,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,88,1095,10,6,3150,3150,1575,0,315.0,632.561635,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,389,15206,17,12,3452,6660,1313,0,203.058823,425.778474,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,88,1092,9,6,3150,3152,1575,0,350.0,694.509719,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


## Inner class imbalance
Between anomalies, there is a strong class imbalance

In [3]:
# Original class imbalance within attacks themselves
mask = df["Label"] != "BENIGN"
original_ad_ratios = pd.DataFrame(
    pd.concat(
        (df[mask]["Label"].value_counts(),
        df[mask]["Label"].value_counts() / len(df[mask])), axis=1),
)
original_ad_ratios.columns = ["Count", "Ratio"]
original_ad_ratios

Unnamed: 0,Count,Ratio
DoS Hulk,231073,0.414372
PortScan,158930,0.285002
DDoS,128027,0.229585
DoS GoldenEye,10293,0.018458
FTP-Patator,7938,0.014235
SSH-Patator,5897,0.010575
DoS slowloris,5796,0.010394
DoS Slowhttptest,5499,0.009861
Bot,1966,0.003526
Web Attack � Brute Force,1507,0.002702


We regroup similar attacks under the same label

In [4]:
# Group DoS attacks
mask = df["Label"].str.startswith("DoS")
df.loc[mask, "Label"] = "DoS"

# Group Web attacks
mask = df["Label"].str.startswith("Web Attack")
df.loc[mask, "Label"] = "Web Attack"

In [5]:
# Updated class imbalance
mask = df["Label"] != "BENIGN"
mod_ad_ratios = pd.DataFrame(
    pd.concat(
        (df[mask]["Label"].value_counts(),
        df[mask]["Label"].value_counts() / len(df[mask])), axis=1),
)
mod_ad_ratios.columns = ["Count", "Ratio"]
mod_ad_ratios

Unnamed: 0,Count,Ratio
DoS,252661,0.453085
PortScan,158930,0.285002
DDoS,128027,0.229585
FTP-Patator,7938,0.014235
SSH-Patator,5897,0.010575
Web Attack,2180,0.003909
Bot,1966,0.003526
Infiltration,36,6.5e-05
Heartbleed,11,2e-05


## Check unique values
Drop columns with unique values

In [6]:
uniq_cols = df.columns[df.nunique() <= 1].tolist()
stats["n_unique_cols"] = len(uniq_cols)
if uniq_cols:
    print("Found {} columns with unique values: {}".format(len(uniq_cols), uniq_cols))
    stats["unique_cols"] = ", ".join([str(col) for col in uniq_cols])
    df.drop(uniq_cols, axis=1, inplace=True)
    stats["n_dropped_cols"] += len(uniq_cols)
    uniq_cols = df.columns[df.nunique() <= 1].tolist()
assert len(uniq_cols) == 0, "Found {} columns with unique values: {}".format(len(uniq_cols), uniq_cols)
print("Columns are valid with more than one distinct value")

Found 8 columns with unique values: ['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']
Columns are valid with more than one distinct value


## Check for NaN/invalid values
First, find the columns with NaN values. Further processing will be required if we find any.

In [7]:
# Replacing INF values with NaN
df = df.replace([-np.inf, np.inf], np.nan)
nan_cols = df.columns[df.isna().sum() > 0].tolist()
stats["n_nan_cols"] = len(nan_cols)
if nan_cols:
    stats["nan_cols"] = ", ".join([str(col) for col in nan_cols])
print("Found NaN columns: {}".format(nan_cols))

Found NaN columns: ['Flow Bytes/s', 'Flow Packets/s']


Having found two columns with NaN values, we must investigate further before taking any decision.

In [8]:
df[nan_cols].isna().sum()

Flow Bytes/s      2867
Flow Packets/s    2867
dtype: int64

Are the nan instances associated with anomalies?

In [9]:
df[df.isna().any(axis=1)]["Label"].value_counts()

BENIGN         1777
DoS             949
PortScan        126
Bot              10
FTP-Patator       3
DDoS              2
Name: Label, dtype: int64

Dropping them seems risky since we also drop anomalies which are already scarce

In [10]:
# Check different values in Flow Duration
print(df[df.isna().any(axis=1)]["Flow Duration"].unique())
# Count number of nan instances when `Flow Duration` > 0
df[df["Flow Duration"] > 0].isna().sum().sum()

[0]


0

`Flow Bytes/s` and `Flow Packets/s` must be computed from `Flow Duration`. When the latter columns is zero, a division by zero occurs and the first two columns have NaN values. Zero values in `Flow Duration` are probably due to a lack of precision in the data type used. They must be associated with flows that lasted nanoseconds. Hence, we can convert the NaN rows to zeros.

In [11]:
ratio = df[nan_cols].isna().sum()[0] / len(df)
df = df.fillna(0)
print("Replaced {:2.4f}% of original data".format(ratio))
remaining_nans = df.isna().sum().sum()
assert remaining_nans == 0, "There are still {} NaN values".format(remaining_nans)

Replaced 0.0010% of original data


## Check for negative values
Most of the features should be strictly positive. For instance, a packet with a negative number of bytes makes no sense.

In [12]:
num_cols = df.select_dtypes(exclude="object").columns
mask = (df[num_cols] < 0).sum() > 0
neg_cols = df[num_cols].columns[mask]
stats["n_negative_cols"] = len(neg_cols)
stats["negative_cols"] = ", ".join(neg_cols)
print("Found {} columns with negative values: {}".format(len(neg_cols), neg_cols))

Found 13 columns with negative values: Index(['Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean',
       'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Min', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Header Length.1', 'Init_Win_bytes_forward',
       'Init_Win_bytes_backward', 'min_seg_size_forward'],
      dtype='object')


In [13]:
neg_df = pd.DataFrame(
    pd.concat((
        (df[neg_cols] < 0).sum(),
        (df[neg_cols] < 0).sum() / len(df)
    ), axis=1)
)
neg_df.columns = ["Count", "Ratio"]
neg_df = neg_df.sort_values("Count", ascending=False)
neg_df

Unnamed: 0,Count,Ratio
Init_Win_bytes_backward,1441552,0.509249
Init_Win_bytes_forward,1001189,0.353684
Flow IAT Min,2891,0.001021
Flow Duration,115,4.1e-05
Flow Packets/s,115,4.1e-05
Flow IAT Mean,115,4.1e-05
Flow IAT Max,115,4.1e-05
Flow Bytes/s,85,3e-05
Fwd Header Length,35,1.2e-05
Fwd Header Length.1,35,1.2e-05


In [14]:
print((df["Init_Win_bytes_backward"][df["Init_Win_bytes_backward"] < 0]).unique())
print((df["Init_Win_bytes_forward"][df["Init_Win_bytes_forward"] < 0]).unique())
df[df["Init_Win_bytes_backward"] < 0]

[-1]
[-1]


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
8,49666,3,2,0,12,0,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
9,49413,4,3,0,18,0,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
14,443,4,2,0,91,0,85,6,45.500000,55.861436,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
23,443,76,3,0,97,0,85,6,32.333333,45.610671,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
34,465,384,3,0,49,0,37,6,16.333333,17.897858,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286453,53,203,2,2,90,122,45,45,45.000000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
286455,53,60066,1,1,52,116,52,52,52.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
286456,53,202,2,2,82,210,41,41,41.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
286458,53,60567,2,2,62,264,31,31,31.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [15]:
# Drop `Init_Win_bytes_forward` and `Init_Win_bytes_backward` because too many of their values are equal to -1 which makes no sense.
to_drop = neg_df[neg_df["Ratio"] > 0.01].index.tolist()
df = df.drop(to_drop, axis=1)
neg_df = neg_df.drop(to_drop)
stats["n_dropped_cols"] += len(to_drop)
stats["dropped_cols"] = stats["dropped_cols"] + ", ".join(to_drop)
num_cols = df.select_dtypes(include=np.number).columns
print("Dropped {} columns: {}".format(len(to_drop), to_drop))

Dropped 2 columns: ['Init_Win_bytes_backward', 'Init_Win_bytes_forward']


In [16]:
df[(df[num_cols] < 0).any(1)]["Label"].value_counts()

BENIGN          2732
DoS              164
DDoS              19
Heartbleed         4
FTP-Patator        4
SSH-Patator        2
Infiltration       1
Name: Label, dtype: int64

In [17]:
df[(df[num_cols] < 0).any(1)]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
321,80,119843186,180892,230419,1080532,516000000,365,0,5.973354,8.769721,...,20,1.102950e+05,0.000000e+00,110295,110295,5953633.0,0.000000,5953633,5953633,BENIGN
387,88,512,7,5,476,376,229,0,68.000000,110.016665,...,20,0.000000e+00,0.000000e+00,0,0,0.0,0.000000,0,0,BENIGN
721,80,119999933,203943,263412,1224076,584000000,367,0,6.002050,8.603629,...,20,0.000000e+00,0.000000e+00,0,0,0.0,0.000000,0,0,BENIGN
728,80,117107216,3356,4277,15800,10700000,344,0,4.707986,6.382862,...,20,1.390695e+05,4.101874e+05,1375830,14956,10000000.0,7762.637596,10000000,10000000,BENIGN
1041,80,12221158,5682,4527,12887,10500000,763,0,2.268039,15.995691,...,20,0.000000e+00,0.000000e+00,0,0,0.0,0.000000,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284001,443,-1,1,1,6,6,6,6,6.000000,0.000000,...,20,0.000000e+00,0.000000e+00,0,0,0.0,0.000000,0,0,BENIGN
284149,443,43855854,151,188,16804,207299,613,0,111.284768,217.478194,...,20,1.430000e+07,0.000000e+00,14300000,14300000,29500000.0,0.000000,29500000,29500000,BENIGN
284151,443,44442302,476,710,17892,1134483,709,0,37.588235,129.848181,...,20,1.530000e+07,0.000000e+00,15300000,15300000,29100000.0,0.000000,29100000,29100000,BENIGN
284852,80,49152804,68,81,1125,111640,390,0,16.544118,62.173670,...,20,8.006892e+05,1.571138e+06,3157396,14989,10200000.0,65926.064590,10200000,10100000,BENIGN


When Flow Duration < 0, multiple columns are negative. Since these rows are only associated with BENIGN flows, we can drop them.

In [18]:
n_dropped = (df["Flow Duration"] < 0).sum()
stats["n_dropped_rows"] += n_dropped
df = df[df["Flow Duration"] >= 0]
print("Dropped {} rows".format(n_dropped))

Dropped 115 rows


In [19]:
neg_cols_when_anomalies = df[num_cols].columns[(df[num_cols][((df[num_cols]).any(1)) & (df["Label"] != "BENIGN")] < 0).sum() > 0]
neg_cols_when_anomalies

Index(['Flow IAT Min', 'Fwd IAT Min'], dtype='object')

In [20]:
t = neg_cols_when_anomalies
df[(df[t] < 0).any(1)]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
321,80,119843186,180892,230419,1080532,516000000,365,0,5.973354,8.769721,...,20,1.102950e+05,0.000000e+00,110295,110295,5953633.0,0.000000,5953633,5953633,BENIGN
387,88,512,7,5,476,376,229,0,68.000000,110.016665,...,20,0.000000e+00,0.000000e+00,0,0,0.0,0.000000,0,0,BENIGN
721,80,119999933,203943,263412,1224076,584000000,367,0,6.002050,8.603629,...,20,0.000000e+00,0.000000e+00,0,0,0.0,0.000000,0,0,BENIGN
728,80,117107216,3356,4277,15800,10700000,344,0,4.707986,6.382862,...,20,1.390695e+05,4.101874e+05,1375830,14956,10000000.0,7762.637596,10000000,10000000,BENIGN
1041,80,12221158,5682,4527,12887,10500000,763,0,2.268039,15.995691,...,20,0.000000e+00,0.000000e+00,0,0,0.0,0.000000,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282068,443,976619,62,61,822,87201,481,0,13.258065,66.672170,...,32,0.000000e+00,0.000000e+00,0,0,0.0,0.000000,0,0,BENIGN
284149,443,43855854,151,188,16804,207299,613,0,111.284768,217.478194,...,20,1.430000e+07,0.000000e+00,14300000,14300000,29500000.0,0.000000,29500000,29500000,BENIGN
284151,443,44442302,476,710,17892,1134483,709,0,37.588235,129.848181,...,20,1.530000e+07,0.000000e+00,15300000,15300000,29100000.0,0.000000,29100000,29100000,BENIGN
284852,80,49152804,68,81,1125,111640,390,0,16.544118,62.173670,...,20,8.006892e+05,1.571138e+06,3157396,14989,10200000.0,65926.064590,10200000,10100000,BENIGN


In [21]:
to_drop = list(neg_cols_when_anomalies)
stats["n_dropped_cols"] += len(neg_cols_when_anomalies)
stats["dropped_cols"] = stats["dropped_cols"] + ", ".join(to_drop)
df = df.drop(to_drop, axis=1)
print("Dropped {} columns {}".format(len(to_drop), to_drop))

Dropped 2 columns ['Flow IAT Min', 'Fwd IAT Min']


In [22]:
num_cols = df.select_dtypes(include=np.number).columns
neg_cols_labels = df[(df[num_cols] < 0).any(1)]["Label"].unique()
assert len(neg_cols_labels) == 1 and neg_cols_labels[0] == "BENIGN"
idx_to_drop = df[(df[num_cols] < 0).any(1)].index
n_dropped = len(idx_to_drop)
stats["n_dropped_rows"] += n_dropped
df = df.drop(idx_to_drop, axis=0)
print("Dropped {} rows".format(n_dropped))
assert (df[num_cols] < 0).any(1).sum() == 0, "There are still negative values"
print("There are no more negative values")

Dropped 35 rows
There are no more negative values


In [23]:
df = df.drop(["Destination Port"], axis=1)
df["Category"] = df["Label"]
df.loc[df["Label"] == "BENIGN", "Label"] = 0
df.loc[df["Label"] != 0, "Label"] = 1

## Normalize attributes

In [24]:
num_cols = df.select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
assert np.allclose(df[num_cols].max(axis=0).to_numpy(), 1.), "Found values different than 1."
assert np.allclose(df[num_cols].min(axis=0).to_numpy(), 0.), "Found values lesser than 0."
print("Data is scaled between 0 and 1")

Data is scaled between 0 and 1


In [25]:
stats["n_final_features"] = df.shape[1] - 2
stats["n_final_rows"] = df.shape[0]
stats["final_anomaly_ratio"] = (df["Label"] != 0).sum() / len(df)

## Store processed dataset to CSV

In [27]:
df.to_csv(export_path + "/" + export_fname, index=False)
print("Processed data saved under: {}".format(base_path + "/" + export_fname))

Processed data saved under: ../data/IDS2017/original/ids2017.csv
Processed data saved under: ../data/IDS2017/original/ids2017.csv


## Store basic information

In [28]:
stats_df = pd.DataFrame(stats, index=[0])
stats_df.to_csv(export_path + "/" + info_fname, index=False)
print("Processing summary saved under: {}".format(base_path + "/" + info_fname))
stats_df

Processing summary saved under: ../data/IDS2017/original/ids2017_info.csv


Unnamed: 0,dropped_cols,n_dropped_cols,n_dropped_rows,n_instances,n_features,anomaly_ratio,n_unique_cols,unique_cols,n_nan_cols,nan_cols,n_negative_cols,negative_cols,n_final_features,n_final_rows,final_anomaly_ratio
0,"Init_Win_bytes_backward, Init_Win_bytes_forwar...",12,150,2830743,78,0.197,8,"Bwd PSH Flags, Bwd URG Flags, Fwd Avg Bytes/Bu...",2,"Flow Bytes/s, Flow Packets/s",13,"Flow Duration, Flow Bytes/s, Flow Packets/s, F...",65,2830392,0.19702
