In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
from scipy import stats
from scipy.stats import ks_2samp

%matplotlib inline

In [2]:
# Base directory that holds the raw per-day CSV files of the dataset.
# The conversion loop joins this path with each CSV file name, and the cleaned
# parquet output directory is created beneath it.
# 'DATASET PATH' is a placeholder: replace it with a real path before running.
dataset_base_path = r'DATASET PATH'

In [3]:
# Column name -> dtype string, passed as `dtype=` to pd.read_csv when the raw
# CSV files are loaded. Narrow integer/float widths are used to reduce memory;
# 'timestamp' is kept as a plain object column and 'label' is read as a
# categorical.
# NOTE(review): the keys are lower-case snake_case, while the columns dropped
# right after read_csv are named like 'Flow ID' / 'Src IP'. Presumably the CSV
# headers were normalised beforehand -- confirm the keys match the actual CSV
# header names, otherwise these dtypes are not applied.
types = {
    'dst_port': 'uint32',
    'protocol': 'uint8',
    'timestamp': 'object',
    'flow_duration': 'int64',
    'tot_fwd_pkts': 'uint32',
    'tot_bwd_pkts': 'uint32',
    'totlen_fwd_pkts': 'uint32',
    'totlen_bwd_pkts': 'uint32',
    'fwd_pkt_len_max': 'uint16',
    'fwd_pkt_len_min': 'uint16',
    'fwd_pkt_len_mean': 'float32',
    'fwd_pkt_len_std': 'float32',
    'bwd_pkt_len_max': 'uint16',
    'bwd_pkt_len_min': 'uint16',
    'bwd_pkt_len_mean': 'float32',
    'bwd_pkt_len_std': 'float32',
    'flow_byts_s': 'float64',
    'flow_pkts_s': 'float64',
    'flow_iat_mean': 'float32',
    'flow_iat_std': 'float32',
    'flow_iat_max': 'int64',
    'flow_iat_min': 'int64',
    'fwd_iat_tot': 'int64',
    'fwd_iat_mean': 'float32',
    'fwd_iat_std': 'float32',
    'fwd_iat_max': 'int64',
    'fwd_iat_min': 'int64',
    'bwd_iat_tot': 'uint32',
    'bwd_iat_mean': 'float32',
    'bwd_iat_std': 'float32',
    'bwd_iat_max': 'uint32',
    'bwd_iat_min': 'uint32',
    'fwd_psh_flags': 'uint8',
    'bwd_psh_flags': 'uint8',
    'fwd_urg_flags': 'uint8',
    'bwd_urg_flags': 'uint8',
    'fwd_header_len': 'uint32',
    'bwd_header_len': 'uint32',
    'fwd_pkts_s': 'float32',
    'bwd_pkts_s': 'float32',
    'pkt_len_min': 'uint16',
    'pkt_len_max': 'uint16',
    'pkt_len_mean': 'float32',
    'pkt_len_std': 'float32',
    'pkt_len_var': 'float32',
    'fin_flag_cnt': 'uint8',
    'syn_flag_cnt': 'uint8',
    'rst_flag_cnt': 'uint8',
    'psh_flag_cnt': 'uint8',
    'ack_flag_cnt': 'uint8',
    'urg_flag_cnt': 'uint8',
    'cwe_flag_count': 'uint8',
    'ece_flag_cnt': 'uint8',
    'down_up_ratio': 'uint16',
    'pkt_size_avg': 'float32',
    'fwd_seg_size_avg': 'float32',
    'bwd_seg_size_avg': 'float32',
    'fwd_byts_b_avg': 'uint8',
    'fwd_pkts_b_avg': 'uint8',
    'fwd_blk_rate_avg': 'uint8',
    'bwd_byts_b_avg': 'uint8',
    'bwd_pkts_b_avg': 'uint8',
    'bwd_blk_rate_avg': 'uint8',
    'subflow_fwd_pkts': 'uint32',
    'subflow_fwd_byts': 'uint32',
    'subflow_bwd_pkts': 'uint32',
    'subflow_bwd_byts': 'uint32',
    'init_fwd_win_byts': 'int32',
    'init_bwd_win_byts': 'int32',
    'fwd_act_data_pkts': 'uint32',
    'fwd_seg_size_min': 'uint8',
    'active_mean': 'float32',
    'active_std': 'float32',
    'active_max': 'uint32',
    'active_min': 'uint32',
    'idle_mean': 'float32',
    'idle_std': 'float32',
    'idle_max': 'uint64',
    'idle_min': 'uint64',
    'label': 'category'
}

In [5]:
import numpy as np
import pandas as pd
import os
import re

# Date stamps of the per-day capture files, in the order they are processed.
_capture_days = [
    '28-02-2018',
    '01-03-2018',
    '02-03-2018',
    '22-02-2018',
    '15-02-2018',
    '20-02-2018',
    '21-02-2018',
    '14-02-2018',
    '16-02-2018',
    '23-02-2018',
]

# Raw CSV file name -> file name of its cleaned parquet counterpart
# (same date stamp, prefixed with 'C').
csv_files = {
    day + '.csv': 'C' + day + '.parquet'
    for day in _capture_days
}

# Pattern matching any single non-word character.
# NOTE(review): this pattern is not referenced anywhere in the visible code --
# presumably left over from a column-renaming step; confirm whether it is still needed.
column_name_regex = re.compile(r"\W", re.IGNORECASE)
# Name of the sub-directory (under dataset_base_path) that receives the cleaned parquet files.
processed_dir = 'CLEANPARQUET'
# Full path of that output directory.
processed_path = os.path.join(dataset_base_path, processed_dir)

def remove_null_values(f):
    """Drop every row of ``f`` that contains at least one missing value.

    The frame is modified in place, so existing callers that ignore the
    return value keep working. The same frame object is also returned;
    previously the function returned the result of ``dropna(inplace=True)``,
    which is always ``None``.

    Parameters
    ----------
    f : pandas.DataFrame
        Frame to clean; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``f``, without the rows that held missing values.
    """
    f.dropna(inplace=True)
    return f

def remove_dupliactes(f):
    """Drop fully duplicated rows from ``f``, keeping the first occurrence.

    The (misspelled) function name is kept unchanged so existing calls keep
    working. The frame is modified in place and also returned; previously the
    function returned the result of ``drop_duplicates(inplace=True)``, which
    is always ``None``.

    Parameters
    ----------
    f : pandas.DataFrame
        Frame to de-duplicate; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``f``, without duplicated rows.
    """
    f.drop_duplicates(inplace=True)
    return f

def clean(f):
    """Replace infinite values with the mean of the column's finite values.

    Every floating-point column that contains ``+inf`` or ``-inf`` has those
    entries (and any NaN already present in that column) replaced by the mean
    of the remaining values. Columns without infinities are left untouched.

    Two defects of the previous version are fixed:

    * only ``+inf`` was used to detect affected columns, so a column holding
      only ``-inf`` was never cleaned;
    * the replacement used chained ``f[col].replace(..., inplace=True)``,
      which is not guaranteed to write back into ``f`` (it does not under
      pandas Copy-on-Write). The cleaned column is now assigned explicitly.

    Parameters
    ----------
    f : pandas.DataFrame
        Frame to clean; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``f``.
    """
    for col in f.columns:
        values = f[col]
        # Only floating-point columns can hold infinities.
        if not pd.api.types.is_float_dtype(values):
            continue
        inf_mask = np.isinf(values)  # True for both +inf and -inf
        if not inf_mask.any():
            continue
        # Turn infinities into NaN so they do not distort the mean,
        # then fill every NaN with the mean of the remaining values.
        finite = values.mask(inf_mask)
        f[col] = finite.fillna(finite.mean())
    return f
    
# Make sure the output directory for the cleaned parquet files exists.
# exist_ok=True replaces the separate exists()/mkdir() pair, so re-running the
# cell is safe and there is no window between the check and the creation.
# Unlike os.mkdir, makedirs also creates missing parent directories.
os.makedirs(processed_path, exist_ok=True)
    
# Convert each raw CSV file into a cleaned parquet file in the output directory.
for csv_name, parquet_name in csv_files.items():
    source_csv = os.path.join(dataset_base_path, csv_name)
    target_parquet = os.path.join(processed_path, parquet_name)

    # Load with the explicit dtype mapping and drop the identifying columns;
    # errors='ignore' tolerates files that do not contain them.
    df = pd.read_csv(
        source_csv,
        dtype=types,
    ).drop(
        columns=['Flow ID', 'Src IP', 'Dst IP', 'Src Port'],
        errors='ignore',
    )

    # The helpers below are called for their effect on df; their return
    # values are not used.
    remove_null_values(df)
    remove_dupliactes(df)
    clean(df)

    df.to_parquet(target_parquet)

In [2]:
# Directory containing the cleaned parquet files; it is searched for
# '*.parquet' when the dataset is loaded.
# 'CLEANED DATASET PATH' is a placeholder: replace it with a real path before running.
dataset_base_path_parquet = r'CLEANED DATASET PATH'

In [3]:
# glob returns matches in arbitrary, platform-dependent order; sorting makes the
# row order of the combined frame reproducible across runs and machines.
parquet_files = sorted(glob.glob(os.path.join(dataset_base_path_parquet, '*.parquet')))

# Stack all per-day frames into one. The index of each file is kept as-is,
# so index labels repeat across files in the combined frame.
df = pd.concat((pd.read_parquet(f) for f in parquet_files))

In [4]:
# Show (number of rows, number of columns) of the combined frame.
df.shape

(15738470, 80)

In [5]:
# Print a summary of the combined frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15738470 entries, 0 to 613070
Data columns (total 80 columns):
 #   Column             Dtype  
---  ------             -----  
 0   dst_port           uint32 
 1   protocol           uint8  
 2   timestamp          object 
 3   flow_duration      int64  
 4   tot_fwd_pkts       uint32 
 5   tot_bwd_pkts       uint32 
 6   totlen_fwd_pkts    uint32 
 7   totlen_bwd_pkts    uint32 
 8   fwd_pkt_len_max    uint16 
 9   fwd_pkt_len_min    uint16 
 10  fwd_pkt_len_mean   float32
 11  fwd_pkt_len_std    float32
 12  bwd_pkt_len_max    uint16 
 13  bwd_pkt_len_min    uint16 
 14  bwd_pkt_len_mean   float32
 15  bwd_pkt_len_std    float32
 16  flow_byts_s        float64
 17  flow_pkts_s        float64
 18  flow_iat_mean      float32
 19  flow_iat_std       float32
 20  flow_iat_max       int64  
 21  flow_iat_min       int64  
 22  fwd_iat_tot        int64  
 23  fwd_iat_mean       float32
 24  fwd_iat_std        float32
 25  fwd_iat_max       