# Dataset Generation

Here we will generate 2 batches of datasets for our multi-class classification experiments. 

First we produce our baseline datasets, which contain most of the features present in CIC_DDoS2019. Then we produce our time-based feature datasets, each containing only the 25 time-based features plus a label.

We start by importing the relevant libraries, setting a seed for reproducibility, and printing the versions of the libraries we are using so that the results can be reproduced.

In [1]:
import os, platform, pprint, sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# value passed as random_state to the DataFrame.sample calls in this notebook
seed: int = 14


# the per-class CSV files live here, relative to the current directory
data_path: str = './prepared/single/'
data_classes: list = [
    'BENIGN', 'DNS', 'LDAP', 'MSSQL', 'NetBIOS', 'NTP', 'Portmap',
    'SNMP', 'SSDP', 'Syn', 'TFTP', 'UDP', 'UDPLag',
]
# one CSV per class, named after the class and listed in the same order
data_sets: list = [f'{class_name}.csv' for class_name in data_classes]


# every time-based feature, spelled as in the dataframes we are dealing with:
# Mean/Std/Max/Min for each of five measurement groups, five further columns,
# and finally Label, for a total of 26 entries
time_based_features: list = [
    f'{group} {stat}'
    for group in ('Fwd IAT', 'Bwd IAT', 'Flow IAT', 'Active', 'Idle')
    for stat in ('Mean', 'Std', 'Max', 'Min')
] + [
    'Flow Bytes/s', 'Flow Packets/s', 'Flow Duration',
    'Fwd IAT Total', 'Bwd IAT Total',
    'Label',
]


# the dataset sizes we are going to generate, keyed by a human readable name
sizes: dict = dict(
    Mini=10_000,
    Small=50_000,
    Medium=100_000,
    Large=200_000,
)


# set up pretty printer for easier data evaluation
# (`pretty` is the bound pprint method of a printer using a 4-space indent and a 30 column width)
pretty = pprint.PrettyPrinter(indent=4, width=30).pprint

# report the interpreter and library versions in use so that a run can be reproduced
print(
    f'''
    python:\t{platform.python_version()}

    \tmatplotlib:\t{mpl.__version__}
    \tnumpy:\t\t{np.__version__}
    \tpandas:\t\t{pd.__version__}
    '''
)


    python:	3.7.10

    	matplotlib:	3.3.4
    	numpy:		1.20.3
    	pandas:		1.2.5
    


## Preliminaries

Next, we do some preliminary setup. We list the data files we will be using, along with a list of new column names for the datasets that are more readable and understandable.

In [2]:
def get_file_path(directory: str):
    '''
        Builds a path-joining function bound to one directory

        directory: the directory every returned path should start from
        returns:   a function that takes a file name and returns
                   os.path.join(directory, file)
    '''

    # the lambda closes over `directory`, so callers only supply the file name
    return lambda file: os.path.join(directory, file)


# bind get_file_path to the data directory: file_path(name) returns the path to that file
file_path = get_file_path(data_path)


# complete filepaths relative to the current directory; position i corresponds to data_sets[i]
file_set: list = [file_path(csv_name) for csv_name in data_sets]

print('We grab the 13 classes of data we will be using from the ./prepared/single directory')
pretty(file_set)

We grab the 13 classes of data we will be using from the ./prepared/single directory
[   './prepared/single/BENIGN.csv',
    './prepared/single/DNS.csv',
    './prepared/single/LDAP.csv',
    './prepared/single/MSSQL.csv',
    './prepared/single/NetBIOS.csv',
    './prepared/single/NTP.csv',
    './prepared/single/Portmap.csv',
    './prepared/single/SNMP.csv',
    './prepared/single/SSDP.csv',
    './prepared/single/Syn.csv',
    './prepared/single/TFTP.csv',
    './prepared/single/UDP.csv',
    './prepared/single/UDPLag.csv']


In [3]:
def load_data(filePath: str) -> pd.DataFrame:
    '''
        Loads the dataset from the given filepath and caches it as a pickle under ./cache/
        for quick access in the future
        Function will only work when filepath is a .csv file

        The cache mirrors the source layout: './prepared/single/DNS.csv' is cached as
        './cache/single/DNS.csv.pickle'. A leading './prepared/' (or, failing that, './')
        is dropped from the path before it is placed under ./cache/

        filePath: path to the .csv file to load
        returns:  the dataset as a DataFrame, read from the cache when a cached copy exists
    '''

    # drop a known prefix by name rather than by a fixed character count, so that paths
    # outside ./prepared/ still map to a sensible cache location
    relativePath: str = filePath
    for prefix in ('./prepared/', './'):
        if relativePath.startswith(prefix):
            relativePath = relativePath[len(prefix):]
            break
    pickleDump: str = f'./cache/{relativePath}.pickle'

    print(f'Loading Dataset: {filePath}')
    print(f'\tTo Dataset Cache: {pickleDump}\n')

    # check if data already exists within cache
    # NOTE: unpickling can execute arbitrary code; only keep locally generated files in ./cache/
    if os.path.exists(pickleDump):
        return pd.read_pickle(pickleDump)

    # if not, load data and cache it, creating the cache directory on first use
    df = pd.read_csv(filePath, low_memory=True)
    os.makedirs(os.path.dirname(pickleDump), exist_ok=True)
    df.to_pickle(pickleDump)

    return df

In [4]:
# BENIGN is the first entry of both data_classes and file_set; load it on its own
benign_df: pd.DataFrame = load_data(file_set[0])

# every remaining class is an attack type; map class name -> DataFrame.
# Slicing past index 0 replaces the former `i is not 0` test, which compared an int
# to a literal by identity instead of by value
attack_samples: dict = {
    class_name: load_data(file)
    for class_name, file in zip(data_classes[1:], file_set[1:])
}


# report how many rows and columns were loaded for each attack type
for class_name, attack_df in attack_samples.items():
    print(f'{class_name}: {attack_df.shape}')


Loading Dataset: ./prepared/single/BENIGN.csv
	To Dataset Cache: ./cache/single/BENIGN.csv.pickle

Loading Dataset: ./prepared/single/DNS.csv
	To Dataset Cache: ./cache/single/DNS.csv.pickle

Loading Dataset: ./prepared/single/LDAP.csv
	To Dataset Cache: ./cache/single/LDAP.csv.pickle

Loading Dataset: ./prepared/single/MSSQL.csv
	To Dataset Cache: ./cache/single/MSSQL.csv.pickle

Loading Dataset: ./prepared/single/NetBIOS.csv
	To Dataset Cache: ./cache/single/NetBIOS.csv.pickle

Loading Dataset: ./prepared/single/NTP.csv
	To Dataset Cache: ./cache/single/NTP.csv.pickle

Loading Dataset: ./prepared/single/Portmap.csv
	To Dataset Cache: ./cache/single/Portmap.csv.pickle

Loading Dataset: ./prepared/single/SNMP.csv
	To Dataset Cache: ./cache/single/SNMP.csv.pickle

Loading Dataset: ./prepared/single/SSDP.csv
	To Dataset Cache: ./cache/single/SSDP.csv.pickle

Loading Dataset: ./prepared/single/Syn.csv
	To Dataset Cache: ./cache/single/Syn.csv.pickle

Loading Dataset: ./prepared/single/TFT

## Dataset Generation Functions

In [5]:
def generate_ddos_vs_benign(samples: int) -> tuple:
    '''
        Generates two datasets of equal number of benign and DDoS samples
        one for our baseline features and one of time-based features
        the total number of samples is given by the samples parameter

        Half of the samples are benign; the other half is split evenly between the
        attack types in attack_samples and relabelled "DDOS". Because of integer
        division the result can hold slightly fewer rows than requested

        samples: total number of rows to aim for
        returns: tuple of (baseline DataFrame, time-based DataFrame)
        raises:  ValueError if there are not enough benign or attack rows to sample from

        saves the generated datasets and returns a tuple containing both
    '''

    if samples > benign_df.shape[0]*2:
        raise ValueError(f'Number of samples {samples} cannot be greater than {benign_df.shape[0]*2}, twice the number of available benign samples')

    total_benign = samples // 2
    total_each_attack_type = total_benign // len(attack_samples)

    # fail early with a message that names the attack type that is too small
    for key, attack_df in attack_samples.items():
        if total_each_attack_type > attack_df.shape[0]:
            raise ValueError(f'Cannot draw {total_each_attack_type} {key} samples, only {attack_df.shape[0]} are available')

    print(f'\t\tGenerating {total_benign} Benign and {total_each_attack_type} DDoS Samples per attack type')

    benign = benign_df.sample(total_benign, random_state=seed)

    # draw the same number of rows from every attack type and stack them
    ddos_df: pd.DataFrame = pd.concat(
        [attack_df.sample(n=total_each_attack_type, random_state=seed) for attack_df in attack_samples.values()],
        ignore_index=True,
    )

    # collapse every individual attack name into the single class "DDOS"
    ddos_df.replace(to_replace=list(attack_samples.keys()), value="DDOS", inplace=True)

    DDoS_vs_Benign_Baseline  = pd.concat([benign, ddos_df], ignore_index=True)
    DDoS_vs_Benign_Timebased = DDoS_vs_Benign_Baseline[time_based_features]

    print(f'DDoS vs Benign - Baseline:  {DDoS_vs_Benign_Baseline.shape }')
    print(f'\tto CSV: ./downsampled/baseline/DDoS_vs_Benign_{samples}.csv')
    print(f'DDoS vs Benign - Timebased: {DDoS_vs_Benign_Timebased.shape}')
    print(f'\tto CSV: ./downsampled/timebased/DDoS_vs_Benign_{samples}.csv')

    DDoS_vs_Benign_Baseline .to_csv(f"./downsampled/baseline/DDoS_vs_Benign_{samples}.csv" , index=False)
    DDoS_vs_Benign_Timebased.to_csv(f"./downsampled/timebased/DDoS_vs_Benign_{samples}.csv", index=False)

    return (DDoS_vs_Benign_Baseline, DDoS_vs_Benign_Timebased)

In [6]:
def generate_attack_vs_benign(samples: int) -> list:
    '''
        Generates two datasets of equal number of a DDoS attack type
        and benign samples (2 classes total)
        one for our baseline features and one of time-based features
        the total number of samples is given by the samples parameter

        One pair of datasets is produced per attack type in attack_samples; the same
        benign sample is reused for every pair. All sampling uses the notebook seed so
        that repeated runs produce the same files

        samples: total number of rows per dataset (half attack, half benign)
        returns: list of (baseline DataFrame, time-based DataFrame) tuples, one per attack type
        raises:  ValueError if there are not enough benign or attack rows to sample from

        saves the generated datasets and returns a list of tuples containing both
    '''

    if samples > benign_df.shape[0]*2:
        raise ValueError(f'Number of samples {samples} cannot be greater than {benign_df.shape[0]*2}, twice the number of available benign samples')

    for key, full_attack_df in attack_samples.items():
        if samples > full_attack_df.shape[0]*2:
            raise ValueError(f'Number of samples {samples} cannot be greater than {full_attack_df.shape[0]*2}, twice the number of available {key} samples')

    result_list: list = []

    total_benign = samples // 2
    benign = benign_df.sample(total_benign, random_state=seed)

    # here we generate attack vs benign datasets. one for each attack type
    for key, full_attack_df in attack_samples.items():

        # seeded like every other sample in this notebook so the output is reproducible
        attack_df = full_attack_df.sample(n=total_benign, random_state=seed)

        Attack_vs_Benign_Baseline = pd.concat([attack_df, benign], ignore_index=True)
        Attack_vs_Benign_Timebased = Attack_vs_Benign_Baseline[time_based_features]

        print(f'{key} vs Benign - Baseline: {Attack_vs_Benign_Baseline.shape}')
        print(f'\tto CSV: ./downsampled/baseline/{key}_vs_Benign_{samples}.csv' )
        print(f'{key} vs Benign - Timebased: {Attack_vs_Benign_Timebased.shape}')
        print(f'\tto CSV: ./downsampled/timebased/{key}_vs_Benign_{samples}.csv' )

        Attack_vs_Benign_Baseline .to_csv(f'./downsampled/baseline/{key}_vs_Benign_{samples}.csv' , index=False)
        Attack_vs_Benign_Timebased.to_csv(f'./downsampled/timebased/{key}_vs_Benign_{samples}.csv', index=False)

        result_list.append((Attack_vs_Benign_Baseline, Attack_vs_Benign_Timebased))

    return result_list

In [7]:
def generate_attack_vs_ddos(samples: int) -> list:
    '''
        Generates two datasets of equal number of a DDoS attack type
        and a basket of all other DDoS attack types without benign samples

        one for our baseline features and one of time-based features
        the total number of samples is given by the samples parameter

        One pair of datasets is produced per attack type in attack_samples. The basket
        half is split evenly between the remaining attack types and relabelled "DDOS";
        because of integer division it can hold slightly fewer rows than the attack half

        samples: total number of rows to aim for per dataset
        returns: list of (baseline DataFrame, time-based DataFrame) tuples, one per attack type
        raises:  ValueError if fewer than two attack types are loaded or if an attack
                 type has too few rows to sample from

        saves the generated datasets and returns a list of tuples containing both
    '''

    if len(attack_samples) < 2:
        raise ValueError('At least two attack types are needed to build an attack vs DDoS dataset')

    for key, full_attack_df in attack_samples.items():
        if samples > full_attack_df.shape[0]*2:
            raise ValueError(f'Number of samples {samples} cannot be greater than {full_attack_df.shape[0]*2}, twice the number of available {key} samples')

    result_list: list = []

    # these do not depend on the attack type, so compute them once
    total_attacks = samples // 2
    total_each_other_type = total_attacks // (len(attack_samples) - 1)

    # sampling is seeded, so each attack type contributes the same rows to every basket
    # it appears in; draw them once and reuse them
    basket_parts: dict = {
        attack: full_attack_df.sample(n=total_each_other_type, random_state=seed)
        for attack, full_attack_df in attack_samples.items()
    }

    # labels that get collapsed into the single class "DDOS"
    to_replace = list(attack_samples.keys())
    to_replace.append('BENIGN')

    # here we generate attack vs basket datasets. one for each attack type without benign samples
    for key, full_attack_df in attack_samples.items():

        # pd.concat builds a new frame, so the in-place replace below leaves basket_parts untouched
        other_df = pd.concat(
            [part for attack, part in basket_parts.items() if attack != key],
            ignore_index=True,
        )
        other_df.replace(to_replace=to_replace, value='DDOS', inplace=True)

        attack_df = full_attack_df.sample(n=total_attacks, random_state=seed)

        Attack_vs_DDoS_Baseline = pd.concat([attack_df, other_df], ignore_index=True)
        Attack_vs_DDoS_Timebased = Attack_vs_DDoS_Baseline[time_based_features]

        print(f'{key} vs DDoS - Baseline: {Attack_vs_DDoS_Baseline.shape}')
        print(f'\tto CSV: ./downsampled/baseline/{key}_vs_DDoS_{samples}.csv' )
        print(f'{key} vs DDoS - Timebased: {Attack_vs_DDoS_Timebased.shape}')
        print(f'\tto CSV: ./downsampled/timebased/{key}_vs_DDoS_{samples}.csv' )

        Attack_vs_DDoS_Baseline .to_csv(f'./downsampled/baseline/{key}_vs_DDoS_{samples}.csv' , index=False)
        Attack_vs_DDoS_Timebased.to_csv(f'./downsampled/timebased/{key}_vs_DDoS_{samples}.csv', index=False)

        result_list.append((Attack_vs_DDoS_Baseline, Attack_vs_DDoS_Timebased))

    return result_list



In [8]:
def generate_attack_vs_all(samples: int) -> list:
    '''
        Generates two datasets of equal number of a specific DDoS attack type
        against a basket of all other DDoS attack types and Benign samples

        one for our baseline features and one of time-based features
        the total number of samples is given by the samples parameter

        One pair of datasets is produced per attack type in attack_samples. Every row in
        the basket, benign rows included, is relabelled NOT<attack> so that each dataset
        has exactly two classes. Because of integer division the basket half can hold
        slightly fewer rows than the attack half

        samples: total number of rows to aim for per dataset
        returns: list of (baseline DataFrame, time-based DataFrame) tuples, one per attack type
        raises:  ValueError if there are not enough benign or attack rows to sample from

        saves the generated datasets and returns a list of tuples containing both
    '''

    for key, full_attack_df in attack_samples.items():
        if samples > full_attack_df.shape[0]*2:
            raise ValueError(f'Number of samples {samples} cannot be greater than {full_attack_df.shape[0]*2}, twice the number of available {key} samples')

    result_list: list = []

    # these do not depend on the attack type, so compute them once.
    # the basket holds every other attack type plus benign: len(attack_samples) groups in total
    total_attacks = samples // 2
    total_each_other_type = total_attacks // len(attack_samples)

    if total_each_other_type > benign_df.shape[0]:
        raise ValueError(f'Cannot draw {total_each_other_type} benign samples, only {benign_df.shape[0]} are available')

    # sampling is seeded, so each group contributes the same rows to every basket
    # it appears in; draw them once and reuse them
    basket_parts: dict = {
        attack: full_attack_df.sample(n=total_each_other_type, random_state=seed)
        for attack, full_attack_df in attack_samples.items()
    }
    benign_part = benign_df.sample(n=total_each_other_type, random_state=seed)

    # BENIGN is included so the benign rows in the basket are relabelled as well;
    # previously they kept their own label and the dataset ended up with three classes
    to_replace = list(attack_samples.keys())
    to_replace.append('BENIGN')

    # here we generate attack vs basket datasets. one for each attack type
    for key, full_attack_df in attack_samples.items():

        other_type_list = [part for attack, part in basket_parts.items() if attack != key]
        other_type_list.append(benign_part)

        # pd.concat builds a new frame, so the in-place replace below leaves the shared parts untouched
        other_df = pd.concat(other_type_list, ignore_index=True)
        other_df.replace(to_replace=to_replace, value=f'NOT{key}', inplace=True)

        attack_df = full_attack_df.sample(n=total_attacks, random_state=seed)

        Attack_vs_all_Baseline  = pd.concat([attack_df, other_df], ignore_index=True)
        Attack_vs_all_Timebased = Attack_vs_all_Baseline[time_based_features]

        print(f'{key} vs all - Baseline: {Attack_vs_all_Baseline.shape}')
        print(f'\tto CSV: ./downsampled/baseline/{key}_vs_all_{samples}.csv' )
        print(f'{key} vs all - Timebased: {Attack_vs_all_Timebased.shape}')
        print(f'\tto CSV: ./downsampled/timebased/{key}_vs_all_{samples}.csv' )

        Attack_vs_all_Baseline .to_csv(f'./downsampled/baseline/{key}_vs_all_{samples}.csv' , index=False)
        Attack_vs_all_Timebased.to_csv(f'./downsampled/timebased/{key}_vs_all_{samples}.csv', index=False)

        result_list.append((Attack_vs_all_Baseline, Attack_vs_all_Timebased))

    return result_list

In [9]:
def generate_benign_many_vs_many(samples: int) -> tuple:
    '''
        Generates two multi-class datasets with an equal number of
        all DDoS attacks and Benign samples (13 classes total)
        one for our baseline features and one of time-based features
        the total number of samples is given by the samples parameter

        The requested total is split evenly between benign and every attack type in
        attack_samples; because of integer division the result can hold slightly fewer
        rows than requested. Attack rows come first in the output, benign rows last

        samples: total number of rows to aim for
        returns: tuple of (baseline DataFrame, time-based DataFrame)
        raises:  ValueError if there are not enough benign or attack rows to sample from

        saves the generated datasets and returns a tuple containing both
    '''

    # benign plus one class per attack type
    total_classes = len(attack_samples) + 1

    if samples > benign_df.shape[0]*total_classes:
        raise ValueError(f'Number of samples {samples} cannot be greater than {benign_df.shape[0]*total_classes}, {total_classes} times the number of available benign samples')

    total_each_class = samples // total_classes

    # fail early with a message that names the attack type that is too small
    for key, attack_df in attack_samples.items():
        if total_each_class > attack_df.shape[0]:
            raise ValueError(f'Cannot draw {total_each_class} {key} samples, only {attack_df.shape[0]} are available')

    # one equally sized, seeded sample per attack type, followed by the benign sample
    merge = [attack_df.sample(n=total_each_class, random_state=seed) for attack_df in attack_samples.values()]
    merge.append(benign_df.sample(n=total_each_class, random_state=seed))

    Benign_Many_vs_Many_Baseline = pd.concat(merge, ignore_index=True)
    Benign_Many_vs_Many_Timebased = Benign_Many_vs_Many_Baseline[time_based_features]

    print(f'Benign Many vs Many - Baseline:  {Benign_Many_vs_Many_Baseline.shape }')
    print(f'\tto CSV: ./downsampled/baseline/Benign_Many_vs_Many_{samples}.csv' )
    print(f'Benign Many vs Many - Timebased: {Benign_Many_vs_Many_Timebased.shape}')
    print(f'\tto CSV: ./downsampled/timebased/Benign_Many_vs_Many_{samples}.csv' )

    Benign_Many_vs_Many_Baseline .to_csv(f"./downsampled/baseline/Benign_Many_vs_Many_{samples}.csv", index=False)
    Benign_Many_vs_Many_Timebased.to_csv(f"./downsampled/timebased/Benign_Many_vs_Many_{samples}.csv", index=False)

    return (Benign_Many_vs_Many_Baseline, Benign_Many_vs_Many_Timebased)

In [10]:
def generate_attack_many_vs_many(samples: int) -> tuple:
    '''
        Builds the attack-only multi-class datasets (12 classes, no Benign rows)

        Every attack type in attack_samples contributes int(samples/12) seeded rows.
        The stacked result is written out twice: once with all columns (baseline)
        and once restricted to time_based_features

        samples: total number of rows to aim for
        returns: tuple of (baseline DataFrame, time-based DataFrame)
        raises:  ValueError if samples exceeds 12 times the row count of any attack type
    '''

    # refuse requests that the smallest attack type cannot satisfy
    for frame in attack_samples.values():
        limit = frame.shape[0]*12
        if samples > limit:
            print('in generate_attack_many_vs_many')
            raise ValueError(f'Number of samples {samples} cannot be greater than the number of attack samples {limit}')

    per_attack: int = int(samples/12)

    # one seeded sample per attack type, stacked in dictionary order
    drawn = [frame.sample(n=per_attack, random_state=seed) for frame in attack_samples.values()]
    baseline = pd.concat(drawn, ignore_index=True)
    timebased = baseline[time_based_features]

    baseline_csv = f'./downsampled/baseline/Attacks_Many_vs_Many_{samples}.csv'
    timebased_csv = f'./downsampled/timebased/Attacks_Many_vs_Many_{samples}.csv'

    print(f'Attacks Many vs Many - Baseline : {baseline.shape}')
    print(f'\tto CSV: {baseline_csv}')
    print(f'Attacks Many vs Many - Timebased: {timebased.shape}')
    print(f'\tto CSV: {timebased_csv}')

    baseline.to_csv(baseline_csv, index=False)
    timebased.to_csv(timebased_csv, index=False)

    return (baseline, timebased)

In [11]:
# every dataset generator defined above, in the order they are run;
# each one takes the total sample count as its only argument
generators: list = [
    generate_ddos_vs_benign,
    generate_attack_vs_benign,
    generate_attack_vs_ddos,
    generate_attack_vs_all,
    generate_benign_many_vs_many,
    generate_attack_many_vs_many,
]

## Dataset Generation

In [12]:
# run every generator once for each configured dataset size
for size_name, sample_count in sizes.items():
    print(f'Generating {size_name} datasets with {sample_count} samples')
    for build_dataset in generators:
        build_dataset(sample_count)

Generating Mini datasets with 10000 samples
		Generating 5000 Benign and 416 DDoS Samples
DDoS vs Benign - Baseline:  (9992, 70)
	to CSV: ./downsampled/baseline/DDoS_vs_Benign_10000.csv
DDoS vs Benign - Timebased: (9992, 26)
	to CSV: ./downsampled/timebased/DDoS_vs_Benign_10000.csv
DNS vs Benign - Baseline: (10000, 70)
	to CSV: ./downsampled/baseline/DNS_vs_Benign_10000.csv
DNS vs Benign - Timebased: (10000, 26)
	to CSV: ./downsampled/timebased/DNS_vs_Benign_10000.csv
LDAP vs Benign - Baseline: (10000, 70)
	to CSV: ./downsampled/baseline/LDAP_vs_Benign_10000.csv
LDAP vs Benign - Timebased: (10000, 26)
	to CSV: ./downsampled/timebased/LDAP_vs_Benign_10000.csv
MSSQL vs Benign - Baseline: (10000, 70)
	to CSV: ./downsampled/baseline/MSSQL_vs_Benign_10000.csv
MSSQL vs Benign - Timebased: (10000, 26)
	to CSV: ./downsampled/timebased/MSSQL_vs_Benign_10000.csv
NetBIOS vs Benign - Baseline: (10000, 70)
	to CSV: ./downsampled/baseline/NetBIOS_vs_Benign_10000.csv
NetBIOS vs Benign - Timebased: (1