## Setup

In [1]:
import pandas as pd
from numpy import dtype
import numpy as np

# Column schema for the flow CSV files: maps every column name to the numpy
# dtype it is read with. The insertion order is significant because later cells
# derive the column list (and therefore the output column order) from
# `dtypes.keys()`, so the groups below are listed in file-column order.
_COLUMN_GROUPS = [
    ('O', ['Timestamp']),
    ('int64', [
        'Dst Port',
        'Protocol',
        'Flow Duration',
        'Tot Fwd Pkts',
        'Tot Bwd Pkts',
    ]),
    ('float64', [
        'TotLen Fwd Pkts',
        'TotLen Bwd Pkts',
        'Fwd Pkt Len Max',
        'Fwd Pkt Len Min',
        'Fwd Pkt Len Mean',
        'Fwd Pkt Len Std',
        'Bwd Pkt Len Max',
        'Bwd Pkt Len Min',
        'Bwd Pkt Len Mean',
        'Bwd Pkt Len Std',
        'Flow Byts/s',
        'Flow Pkts/s',
        'Flow IAT Mean',
        'Flow IAT Std',
        'Flow IAT Max',
        'Flow IAT Min',
        'Fwd IAT Tot',
        'Fwd IAT Mean',
        'Fwd IAT Std',
        'Fwd IAT Max',
        'Fwd IAT Min',
        'Bwd IAT Tot',
        'Bwd IAT Mean',
        'Bwd IAT Std',
        'Bwd IAT Max',
        'Bwd IAT Min',
    ]),
    ('int64', [
        'Fwd PSH Flags',
        'Bwd PSH Flags',
        'Fwd URG Flags',
        'Bwd URG Flags',
        'Fwd Header Len',
        'Bwd Header Len',
    ]),
    ('float64', [
        'Fwd Pkts/s',
        'Bwd Pkts/s',
        'Pkt Len Min',
        'Pkt Len Max',
        'Pkt Len Mean',
        'Pkt Len Std',
        'Pkt Len Var',
    ]),
    ('int64', [
        'FIN Flag Cnt',
        'SYN Flag Cnt',
        'RST Flag Cnt',
        'PSH Flag Cnt',
        'ACK Flag Cnt',
        'URG Flag Cnt',
        'CWE Flag Count',
        'ECE Flag Cnt',
    ]),
    ('float64', [
        'Down/Up Ratio',
        'Pkt Size Avg',
        'Fwd Seg Size Avg',
        'Bwd Seg Size Avg',
        'Fwd Byts/b Avg',
        'Fwd Pkts/b Avg',
        'Fwd Blk Rate Avg',
        'Bwd Byts/b Avg',
        'Bwd Pkts/b Avg',
        'Bwd Blk Rate Avg',
    ]),
    ('int64', [
        'Subflow Fwd Pkts',
        'Subflow Fwd Byts',
        'Subflow Bwd Pkts',
        'Subflow Bwd Byts',
        'Init Fwd Win Byts',
        'Init Bwd Win Byts',
        'Fwd Act Data Pkts',
        'Fwd Seg Size Min',
    ]),
    ('float64', [
        'Active Mean',
        'Active Std',
        'Active Max',
        'Active Min',
        'Idle Mean',
        'Idle Std',
        'Idle Max',
        'Idle Min',
    ]),
    ('O', ['Label']),
]

# Flatten the groups into a single {column: dtype} mapping, keeping group order.
dtypes = {
    column: dtype(type_code)
    for type_code, columns in _COLUMN_GROUPS
    for column in columns
}



In [1]:
### Creating the CSV
#
# Streams every matched source file in 100000-row chunks, removes rows with
# missing values and header rows repeated inside the data, tallies how many
# rows each label has, and appends the cleaned rows to one output CSV.

import time
import pandas as pd
from glob import glob
import os

list_of_files = glob("/home/haktrak/Public/full_datasets/shuffled.csv")
classes_andcounts = {}
LIMIT_PER_CLASS = 100  # not used in this cell

output_csv = "csv/full_dataset.csv"
# `to_csv` does not create missing parent directories, so make sure the
# target folder exists before the first chunk is written.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# The first chunk creates the file and writes the header; later chunks append.
header = True
mode = 'w'
cols = list(dtypes.keys())

for csv_file in list_of_files:
    print(f"#### {os.path.basename(csv_file)} ####")
    chunks = pd.read_csv(csv_file, chunksize=100000, usecols=cols, low_memory=False)

    for num, chunk in enumerate(chunks):
        start = time.time()

        # Cleaning: drop incomplete rows, then drop header rows repeated in the data.
        chunk = chunk.dropna(axis=0)
        # Compare as text so the check behaves the same whether this chunk's
        # Protocol column was parsed as numbers or as strings.
        is_repeated_header = chunk['Protocol'].astype(str) == 'Protocol'
        chunk = chunk.loc[~is_repeated_header]

        # Accumulate the per-label row counts across all chunks.
        chunk_counts = chunk['Label'].value_counts().to_dict()
        for key, count in chunk_counts.items():
            classes_andcounts[key] = classes_andcounts.get(key, 0) + count

        # Writing it to csv (`columns=cols` fixes both the selection and the order).
        chunk.to_csv(output_csv, header=header, columns=cols, mode=mode, index=False)

        end = time.time()
        print(f"Processed chunk #{num}: ",(end-start),"sec")
        header = False
        mode = 'a'
    print(classes_andcounts)
    print()

KeyboardInterrupt: 

In [None]:
"""{'DDoS attacks-LOIC-HTTP': 576191, 'Benign': 13425831, 'Brute Force -Web': 611, 'Brute Force -XSS': 230, 'SQL Injection': 87, 'DDOS attack-HOIC': 686012, 'DDOS attack-LOIC-UDP': 1730, 'DoS attacks-SlowHTTPTest': 139890, 'DoS attacks-Hulk': 461912, 'Bot': 286191, 'FTP-BruteForce': 193354, 'SSH-Bruteforce': 187589, 'Infilteration': 161096, 'DoS attacks-GoldenEye': 41508, 'DoS attacks-Slowloris': 10990}
"""

In [78]:
### Removing certain classes and create a balanced dataset
#
# Streams the source file in 100000-row chunks and keeps, for every retained
# label, the first LIMIT_PER_CLASS rows encountered. The selected rows of each
# chunk are appended to a single output CSV. Reading stops as soon as every
# retained label has reached its quota.

import time
import pandas as pd
from glob import glob
import os

list_of_files = glob("/home/haktrak/Public/full_datasets/shuffled.csv")

LIMIT_PER_CLASS = 1000
TRAIN_TEST_SPLIT = 0.7
split_num = int(LIMIT_PER_CLASS*TRAIN_TEST_SPLIT)  # not used in this cell

all_labels = [
    'DDoS attacks-LOIC-HTTP',
    'Benign',
    'Brute Force -Web',
    'Brute Force -XSS',
    'SQL Injection',
    'DDOS attack-HOIC',
    'DDOS attack-LOIC-UDP',
    'DoS attacks-SlowHTTPTest',
    'DoS attacks-Hulk',
    'Bot',
    'FTP-BruteForce',
    'SSH-Bruteforce',
    'Infilteration',
    'DoS attacks-GoldenEye',
    'DoS attacks-Slowloris',
]
# Labels deliberately left out of the balanced dataset.
excluded_labels = {'Brute Force -Web', 'Brute Force -XSS', 'SQL Injection'}

# Rows collected so far for each label that still needs more rows. A label is
# removed from this dict the moment its quota is filled, so an empty dict
# means the dataset is complete.
classes = {label: 0 for label in all_labels if label not in excluded_labels}

# The first write creates the file and emits the header; later writes append.
header = True
mode = 'w'
cols = list(dtypes.keys())
output_path = f"/home/haktrak/Public/full_datasets/shuffled_{LIMIT_PER_CLASS}.csv"

for csv_file in list_of_files:
    if not classes:
        # Every quota is already filled; no need to open further files.
        break

    print(f"#### {os.path.basename(csv_file)} ####")
    chunks = pd.read_csv(csv_file, chunksize=100000, usecols=cols, dtype=dtypes, low_memory=False)
    empty_df = pd.DataFrame(columns=cols)

    for num, chunk in enumerate(chunks):
        # Per-label slices picked from this chunk; concatenated once below
        # instead of growing a frame row-group by row-group.
        selected = []

        chunk_counts = chunk['Label'].value_counts().to_dict()
        for label, count_in_chunk in chunk_counts.items():
            previous_count = classes.get(label)
            if previous_count is None:
                # Excluded label, or one whose quota is already filled.
                continue

            rows_to_grab = min(LIMIT_PER_CLASS - previous_count, count_in_chunk)
            selected.append(chunk[chunk['Label'] == label][:rows_to_grab])

            classes[label] = previous_count + rows_to_grab
            if classes[label] >= LIMIT_PER_CLASS:
                # Quota reached: stop tracking this label right away so the
                # termination check below can fire without waiting for the
                # label to show up in another chunk.
                classes.pop(label)

        # `pd.concat` rejects an empty list, so fall back to an empty frame
        # (which still writes the header if this is the first chunk).
        if selected:
            new_chunk = pd.concat(selected, ignore_index=True)
        else:
            new_chunk = empty_df.copy()

        # Writing it to csv
        new_chunk.to_csv(output_path, header=header, columns=cols, mode=mode, index=False)
        header = False
        mode = 'a'
        print(f"Processed chunk #{num}...")
        if not classes:
            break

#### shuffled.csv ####
Processed chunk #0...
Processed chunk #1...
Processed chunk #2...
Processed chunk #3...
Processed chunk #4...
Processed chunk #5...
Processed chunk #6...
Processed chunk #7...
Processed chunk #8...
Processed chunk #9...
Processed chunk #10...
Processed chunk #11...
Processed chunk #12...
Processed chunk #13...
Processed chunk #14...
Processed chunk #15...
Processed chunk #16...
Processed chunk #17...
Processed chunk #18...
Processed chunk #19...
Processed chunk #20...
Processed chunk #21...
Processed chunk #22...
Processed chunk #23...
Processed chunk #24...
Processed chunk #25...
Processed chunk #26...
Processed chunk #27...
Processed chunk #28...
Processed chunk #29...
Processed chunk #30...
Processed chunk #31...
Processed chunk #32...
Processed chunk #33...
Processed chunk #34...
Processed chunk #35...
Processed chunk #36...
Processed chunk #37...
Processed chunk #38...
Processed chunk #39...
Processed chunk #40...
Processed chunk #41...
Processed chunk #42..

In [12]:
# Read the balanced CSV back with the explicit column dtypes and display how
# many rows each label has.
path = "/home/haktrak/Public/full_datasets/shuffled_1000.csv"
df = pd.read_csv(filepath_or_buffer=path, dtype=dtypes)
df['Label'].value_counts()

DDOS attack-HOIC            1000
Bot                         1000
DoS attacks-SlowHTTPTest    1000
DDoS attacks-LOIC-HTTP      1000
DDOS attack-LOIC-UDP        1000
DoS attacks-GoldenEye       1000
Infilteration               1000
DoS attacks-Hulk            1000
SSH-Bruteforce              1000
FTP-BruteForce              1000
DoS attacks-Slowloris       1000
Benign                      1000
Name: Label, dtype: int64

In [15]:
from sklearn.model_selection import StratifiedShuffleSplit

# Target is the Label column; features are every other column.
y = df['Label']
x = df.drop(columns='Label')

# A single seeded stratified shuffle split with 70% of the rows for training.
sss = StratifiedShuffleSplit(
    n_splits=1,
    train_size=0.7,
    random_state=42,
)


In [24]:
# Walk the splitter's generator; `train` and `test` stay bound to the index
# arrays of the last split after the loop finishes, and a later cell reads them.
gen = sss.split(x, y)
for split_indices in gen:
    train, test = split_indices
    print(train, test)

[ 2295  7203 10428 ... 11670  5438  2700] [3251 9580 9734 ... 5140 5935 6741]


In [27]:
# Select the two partitions by positional index and save each one as its own
# CSV, with a header row and without the index column.
train_df, test_df = df.iloc[train], df.iloc[test]
train_df.to_csv(
    "/home/haktrak/Public/full_datasets/shuffled_1000_train.csv",
    index=False,
    header=True,
)
test_df.to_csv(
    "/home/haktrak/Public/full_datasets/shuffled_1000_test.csv",
    index=False,
    header=True,
)