In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Input locations: every file found under these directories is processed.
TRAIN_DATAPATH = "path to train data"
TEST_DATAPATH = "path to test data"

# Output locations for the preprocessed CSV files.
TRAIN_OUTPUT_PATH = "output path for train data"
TEST_OUTPUT_PATH = "output path for test data"

# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test. It also fails
# immediately if one of the output paths already exists as a regular file.
os.makedirs(TRAIN_OUTPUT_PATH, exist_ok=True)
os.makedirs(TEST_OUTPUT_PATH, exist_ok=True)

In [None]:
def make_file_list(reader_list, path):
    """Append the path of every file found under ``path`` to ``reader_list``.

    The tree is traversed with ``os.walk``, so files inside nested
    sub-directories are included as well. ``reader_list`` is extended in
    place and nothing is returned.

    Args:
        reader_list: List that receives the discovered file paths.
        path: Root directory to search.
    """
    for dirpath, _subdirs, filenames in os.walk(path):
        reader_list.extend(os.path.join(dirpath, name) for name in filenames)

In [None]:
# Gather the path of every file under the training-data directory.
train_list = list()
make_file_list(reader_list=train_list, path=TRAIN_DATAPATH)

# Last expression of the cell: display the collected paths.
train_list

In [None]:
# Gather the path of every file under the test-data directory.
test_list = list()
make_file_list(reader_list=test_list, path=TEST_DATAPATH)

# Last expression of the cell: display the collected paths.
test_list

In [None]:
# Meaning of each feature:
# https://github.com/CanadianInstituteForCybersecurity/CICFlowMeter/blob/master/ReadMe.txt

# This list includes all spellings across CIC NIDS datasets, so a single file
# may contain only some of these columns.
drop_cols = [
    "Flow ID",
    " Fwd Header Length.1",
    " Source IP",
    " Source Port",
    " Destination IP",
    " Destination Port",
    " Timestamp",
    # CIC-DDoS other undocumented columns
    "Unnamed: 0",
    " Inbound",
    "SimillarHTTP",
]


def process_data(reader: pd.DataFrame, output_fn: str):
    """Clean one DataFrame in place and append it to a CSV file.

    Steps, all applied to ``reader`` itself:

    1. Drop the columns named in ``drop_cols``; names that are absent from
       this frame are skipped.
    2. Replace +/-inf with NaN, then drop every row that contains a NaN.
    3. Drop duplicate rows. Only duplicates inside this frame are detected;
       rows repeated across separate calls are not.
    4. Binarise the `` Label`` column: ``'BENIGN'`` (or a value that is
       already ``0``) becomes 0 and every other label becomes 1.

    The cleaned frame is appended to ``output_fn``. The header row is written
    only when ``output_fn`` does not exist yet, so repeated calls with the
    same ``output_fn`` build up a single CSV file.

    Args:
        reader: Frame to clean; it is modified in place.
        output_fn: Path of the CSV file to append to.

    Raises:
        KeyError: If ``reader`` has no `` Label`` column.
    """
    # errors="ignore": drop_cols is a union over several datasets, so a
    # missing column must not abort processing with a KeyError.
    reader.drop(columns=drop_cols, errors="ignore", inplace=True)

    # Treat infinities as missing values and remove incomplete rows.
    reader.replace([np.inf, -np.inf], np.nan, inplace=True)
    reader.dropna(inplace=True)

    # Deduplicate before the labels are collapsed, as the original did.
    reader.drop_duplicates(inplace=True)

    # Binary classification target. The column is assigned back explicitly:
    # a chained ``reader[" Label"].replace(..., inplace=True)`` does not
    # update the frame under pandas Copy-on-Write, which would silently turn
    # every label (including BENIGN) into 1.
    is_benign = reader[" Label"].isin(["BENIGN", 0])
    reader[" Label"] = (~is_benign).astype("int64")

    # Append; emit the header only for a brand-new file.
    reader.to_csv(
        output_fn,
        header=(not os.path.exists(output_fn)),
        index=False,
        mode="a",
    )

In [None]:
CHUNKSIZE = 50_000  # number of rows read from a CSV file per chunk


def read_file(filelist: list, output_path: str) -> None:
    """Preprocess every ``.csv`` file in ``filelist`` chunk by chunk.

    Each input file is read in chunks of ``CHUNKSIZE`` rows. For every chunk
    the `` Protocol`` column is cast to ``int32`` and the chunk is handed to
    ``process_data`` together with the output file name
    ``<output_path>/<name>_preprocessed.csv``, where ``<name>`` is the input
    file name up to its first dot.

    Entries whose extension is not exactly ``.csv`` are skipped.

    NOTE: ``process_data`` appends to the output file, so calling this
    function again without removing earlier outputs adds the same rows a
    second time.

    Args:
        filelist: Paths of the files to process.
        output_path: Directory that receives the preprocessed CSV files.
    """
    for file in filelist:
        # Work on the file name only, so that dots or separators in the
        # directory part cannot confuse the extension check or the output
        # name (the previous str.split-based parsing did).
        filename = os.path.basename(file)
        extension = os.path.splitext(filename)[1]

        # Skip the .~lock.UDPLag.csv# file in the test set (its extension is
        # ".csv#") and anything else that is not a CSV file.
        if extension != ".csv":
            continue

        print(f"File reading now: {file}")
        output_fn = os.path.join(
            output_path, filename.split(".")[0] + "_preprocessed.csv"
        )

        with pd.read_csv(file, chunksize=CHUNKSIZE, low_memory=False) as reader:
            for chunk in reader:
                # astype returns a new Series; assign it back so the cast
                # actually takes effect.
                chunk[" Protocol"] = chunk[" Protocol"].astype("int32")
                process_data(chunk, output_fn)

In [None]:
# Preprocess the training files and write the results to TRAIN_OUTPUT_PATH.
read_file(filelist=train_list, output_path=TRAIN_OUTPUT_PATH)

In [None]:
# Preprocess the test files and write the results to TEST_OUTPUT_PATH.
read_file(filelist=test_list, output_path=TEST_OUTPUT_PATH)