In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [2]:
# Capture files to process, in the order they are read.
data_files_list = [
    "data/ids2018_processed/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv",
    "data/ids2018_processed/Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv",
    "data/ids2018_processed/Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv",
    "data/ids2018_processed/Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv",
    "data/ids2018_processed/Friday-23-02-2018_TrafficForML_CICFlowMeter.csv",
    "data/ids2018_processed/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv",
]

# Ordered (column name, dtype) pairs. The order matters: the names are passed
# positionally to pd.read_csv(names=...), so they must follow the CSV layout.
_column_spec = [
    ('dst_port', 'int'), ('protocol', 'int'), ('timestamp', 'str'), ('fl_dur', 'int'),
    ('tot_fw_pk', 'int'), ('tot_bw_pk', 'int'), ('tot_l_fw_pkt', 'int'), ('tot_l_bw_pkt', 'int'),
    ('fw_pkt_l_max', 'int'), ('fw_pkt_l_min', 'int'), ('fw_pkt_l_avg', 'float'), ('fw_pkt_l_std', 'float'),
    ('bw_pkt_l_max', 'int'), ('bw_pkt_l_min', 'int'), ('bw_pkt_l_avg', 'float'), ('bw_pkt_l_std', 'float'),
    ('fl_byt_s', 'float'), ('fl_pkt_s', 'float'),
    ('fl_iat_avg', 'float'), ('fl_iat_std', 'float'), ('fl_iat_max', 'int'), ('fl_iat_min', 'int'),
    ('fw_iat_tot', 'int'), ('fw_iat_avg', 'float'), ('fw_iat_std', 'float'),
    ('fw_iat_max', 'int'), ('fw_iat_min', 'int'),
    ('bw_iat_tot', 'int'), ('bw_iat_avg', 'float'), ('bw_iat_std', 'float'),
    ('bw_iat_max', 'int'), ('bw_iat_min', 'int'),
    ('fw_psh_flag', 'int'), ('bw_psh_flag', 'int'), ('fw_urg_flag', 'int'), ('bw_urg_flag', 'int'),
    ('fw_hdr_len', 'int'), ('bw_hdr_len', 'int'), ('fw_pkt_s', 'float'), ('bw_pkt_s', 'float'),
    ('pkt_len_min', 'int'), ('pkt_len_max', 'int'), ('pkt_len_avg', 'float'),
    ('pkt_len_std', 'float'), ('pkt_len_var', 'float'),
    ('fin_cnt', 'int'), ('syn_cnt', 'int'), ('rst_cnt', 'int'), ('pst_cnt', 'int'),
    ('ack_cnt', 'int'), ('urg_cnt', 'int'), ('cwe_cnt', 'int'), ('ece_cnt', 'int'),
    ('down_up_ratio', 'float'), ('pkt_size_avg', 'float'), ('fw_seg_avg', 'float'), ('bw_seg_avg', 'float'),
    ('fw_byt_blk_avg', 'float'), ('fw_pkt_blk_avg', 'float'), ('fw_blk_rate_avg', 'float'),
    ('bw_byt_blk_avg', 'float'), ('bw_pkt_blk_avg', 'float'), ('bw_blk_rate_avg', 'float'),
    ('subfl_fw_pk', 'int'), ('subfl_fw_byt', 'int'), ('subfl_bw_pkt', 'int'), ('subfl_bw_byt', 'int'),
    ('fw_win_byt', 'int'), ('bw_win_byt', 'int'), ('fw_act_pkt', 'int'), ('fw_seg_min', 'int'),
    ('atv_avg', 'float'), ('atv_std', 'float'), ('atv_max', 'int'), ('atv_min', 'int'),
    ('idl_avg', 'float'), ('idl_std', 'float'), ('idl_max', 'int'), ('idl_min', 'int'),
    ('label', 'str'),
]

# Column name -> dtype mapping handed to pd.read_csv(dtype=...).
dtypes = dict(_column_spec)

# Column names in file order.
features = [column_name for column_name, _ in _column_spec]

In [3]:
# Output directory for the generated .npy partitions. It is created up front so
# the np.save calls in the later cells do not fail with FileNotFoundError after
# the (slow) CSV loading has already been done.
save_dir = "data/ids2018_collaborative"
os.makedirs(save_dir, exist_ok=True)

In [4]:
# Number of pseudo-users the benign flows are randomly spread over
# (ids 0 .. benign_user_num-1); attacker ids start at benign_user_num.
benign_user_num = 10_000

In [5]:
# Seed the global NumPy RNG so that the random id assignment below (and any
# later np.random call in the notebook) is repeatable under "Restart & Run All".
random_seed = 42
np.random.seed(random_seed)

# The feature selection is identical for every file, so compute it once:
# drop every min/max statistic plus the columns listed in filtered_features.
filtered_features = ["dst_port", "protocol", "timestamp", "fl_dur", "idl_avg", "idl_std"]
selected_feature = [f for f in features
                    if "min" not in f and "max" not in f and f not in filtered_features]

malicious_flows_list = []   # cleaned malicious flows of each file (merged in a later cell)
benign_flow_nums = []       # number of benign flows per file
malicious_flow_nums = []    # number of malicious flows per file
for csv_path in data_files_list:
    # read the csv file
    basename = csv_path.split("/")[-1].replace('.csv', '')
    # on_bad_lines='warn' (pandas >= 1.3) replaces the removed
    # warn_bad_lines=True / error_bad_lines=False pair: malformed rows are
    # reported and skipped instead of aborting the load.
    flow_df = pd.read_csv(csv_path, skiprows=1, names=features, dtype=dtypes,
                          parse_dates=['timestamp'], dayfirst=True, on_bad_lines='warn')
    print("\nFile >> {}".format(csv_path))
    print("Raw file has {} rows with {} features".format(len(flow_df), len(flow_df.columns)))

    # preprocess (feature selection & cleaning)
    preprocessed_df = flow_df[selected_feature]

    # drop every row that contains a NaN or an infinite value in any column
    has_invalid_value = preprocessed_df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    clean_df = preprocessed_df[~has_invalid_value]
    print("Cleaned file has {} rows with {} features".format(len(clean_df), len(clean_df.columns)))

    # get the benign flows
    is_benign = clean_df['label'] == "Benign"
    benign_flow_df = clean_df[is_benign]
    benign_flow_nums.append(len(benign_flow_df))
    print("Number of benign data: {}".format(len(benign_flow_df)))

    # assign a uniformly random user id in [0, benign_user_num) to every benign flow
    benign_identifier = np.random.randint(0, benign_user_num, size=len(benign_flow_df))
    benign_flow_df.insert(0, "id", benign_identifier)

    # Column layout of benign_flow_np: [id, <features...>, label].
    # X keeps the features only (label dropped), y holds the user ids.
    # NOTE(review): the frame still contains the string label column, so .values
    # is an object array and the saved .npy files are object arrays too (loading
    # them needs np.load(..., allow_pickle=True)) - confirm consumers expect that.
    benign_flow_np = benign_flow_df.values
    X, y = benign_flow_np[:, 1:-1], benign_flow_np[:, 0]
    X_train, X_benign_test, y_train, y_benign_test = train_test_split(
        X, y, test_size=0.2, random_state=random_seed)

    # saved rows are [id, <features...>]
    np.save("{}/{}_train.npy".format(save_dir, basename), np.c_[y_train, X_train])
    np.save("{}/{}_benign_test.npy".format(save_dir, basename), np.c_[y_benign_test, X_benign_test])

    # get the malicious flows (everything that is not labelled "Benign")
    malicious_flow_df = clean_df[~is_benign]
    malicious_flows_list.append(malicious_flow_df)
    malicious_flow_nums.append(len(malicious_flow_df))
    print("Number of malicious data: {}".format(len(malicious_flow_df)))

total_benign = sum(benign_flow_nums)
total_malicious = sum(malicious_flow_nums)
total_flows = total_benign + total_malicious
print("\nTotal Statistics\n\tBenign Flows: {}({:.2f}%)\n\tMalicious Flows: {}({:.2f}%)"
      .format(total_benign, total_benign/total_flows*100,
              total_malicious, total_malicious/total_flows*100))


File >> data/ids2018_processed/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv
Raw file has 1048575 rows with 80 features
Cleaned file has 1044751 rows with 57 features
Number of benign data: 663808
Number of malicious data: 380943

File >> data/ids2018_processed/Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv
Raw file has 1048575 rows with 80 features
Cleaned file has 1040548 rows with 57 features
Number of benign data: 988050
Number of malicious data: 52498

File >> data/ids2018_processed/Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv
Raw file has 1048575 rows with 80 features
Cleaned file has 1048575 rows with 57 features
Number of benign data: 360833
Number of malicious data: 687742

File >> data/ids2018_processed/Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv
Raw file has 1048575 rows with 80 features
Cleaned file has 1042965 rows with 57 features
Number of benign data: 1042603
Number of malicious data: 362

File >> data/ids2018_processed/Friday-23-02-2018_TrafficForM

In [6]:
# Stack the malicious flows collected from every capture file into one frame
# and reserve a leading "id" column (placeholder value 0) for the attacker ids.
merged_malicious_df = pd.concat(malicious_flows_list, ignore_index=True)
merged_malicious_df.insert(loc=0, column="id", value=0)

# Number of flows per attack label, indexed by label.
attack_num = merged_malicious_df.groupby('label')['label'].count()

separator = "-"*50
print("Statistics of attack:\n\n{}".format(attack_num))
print(separator+"\nTotal types of attack: {}".format(len(attack_num)))

Statistics of attack:

label
Bot                      286191
Brute Force -Web            611
Brute Force -XSS            230
DDOS attack-HOIC         686012
DDOS attack-LOIC-UDP       1730
DoS attacks-GoldenEye     41508
DoS attacks-Slowloris     10990
FTP-BruteForce           193354
SQL Injection                87
SSH-Bruteforce           187589
Name: label, dtype: int64
--------------------------------------------------
Total types of attack: 10


In [7]:
# Target number of attacker pseudo-users to distribute over the attack types.
malicious_user_num = 2000

# Raising the flow counts to a power below 1 compresses the differences between
# attack types, so rare attacks receive a larger share of attackers than their
# raw flow share would give them.
temperature = 0.55
adjusted_attack_num = [n**temperature for n in attack_num]
adjusted_total = sum(adjusted_attack_num)   # hoisted: was re-summed for every element
weight = [n/adjusted_total for n in adjusted_attack_num]

# int() truncates, so the assigned total can end up slightly below
# malicious_user_num. max(1, ...) guarantees that every attack type gets at
# least one attacker: a count of 0 would leave that attack's flows without an
# id range of their own when the ids are handed out in the next cell.
assigned_malicious_user_num = [max(1, int(w*malicious_user_num)) for w in weight]

print("Statistics of attacker:\n")
for k, attacker_count in zip(attack_num.keys(), assigned_malicious_user_num):
    print("{:20s}\t{}".format(k, attacker_count))
print("-"*50+"\nTotal number of attackers: {}".format(sum(assigned_malicious_user_num)))

Statistics of attacker:

Bot                 	412
Brute Force -Web    	13
Brute Force -XSS    	8
DDOS attack-HOIC    	666
DDOS attack-LOIC-UDP	24
DoS attacks-GoldenEye	142
DoS attacks-Slowloris	68
FTP-BruteForce      	332
SQL Injection       	4
SSH-Bruteforce      	326
--------------------------------------------------
Total number of attackers: 1995


In [8]:
# Give every attack type its own contiguous block of user ids, placed right
# after the benign ids, and assign each malicious flow a random id from the
# block that belongs to its attack type.
user_id_start = benign_user_num
for (k, v), attack_user_num in zip(attack_num.items(), assigned_malicious_user_num):
    # k: attack label, v: number of flows carrying that label
    if attack_user_num < 1:
        # An empty block would make this attack share ids with the next one.
        raise ValueError("attack type {!r} has no attacker assigned".format(k))

    # Uniform draw over the attack_user_num ids of this block. randint's upper
    # bound is exclusive, so no fudge factor is needed to stay inside the block.
    attack_identifier = np.random.randint(0, attack_user_num, size=v) + user_id_start
    merged_malicious_df.loc[merged_malicious_df['label'] == k, "id"] = attack_identifier

    print("{} attackers are assigned to id {}~{}".format(k, user_id_start, user_id_start+attack_user_num-1))
    user_id_start += attack_user_num

Bot attackers are assigned to id 10000~10411
Brute Force -Web attackers are assigned to id 10412~10424
Brute Force -XSS attackers are assigned to id 10425~10432
DDOS attack-HOIC attackers are assigned to id 10433~11098
DDOS attack-LOIC-UDP attackers are assigned to id 11099~11122
DoS attacks-GoldenEye attackers are assigned to id 11123~11264
DoS attacks-Slowloris attackers are assigned to id 11265~11332
FTP-BruteForce attackers are assigned to id 11333~11664
SQL Injection attackers are assigned to id 11665~11668
SSH-Bruteforce attackers are assigned to id 11669~11994


In [9]:
# Column layout of malicious_flow_np: [id, <features...>, label].
malicious_flow_np = merged_malicious_df.values

# randomly shuffle the attacks (rows are shuffled in place)
np.random.shuffle(malicious_flow_np)

# Split the shuffled malicious flows across the capture files in proportion to
# each file's share of the benign flows.
benign_total = sum(benign_flow_nums)
benign_relative_size = [bfn/benign_total for bfn in benign_flow_nums]
last_idx = len(benign_relative_size) - 1
start_idx = 0
for i, brs in enumerate(benign_relative_size):
    basename = data_files_list[i].split("/")[-1].replace('.csv', '')
    if i == last_idx:
        # int() truncation on every partition would otherwise silently drop the
        # trailing flows; the last partition takes whatever is left.
        flow_num = len(malicious_flow_np) - start_idx
    else:
        flow_num = int(brs * len(malicious_flow_np))
    # drop the trailing label column; saved rows are [id, <features...>]
    partition_np = malicious_flow_np[start_idx:start_idx+flow_num, :-1]

    np.save("{}/{}_attack_test.npy".format(save_dir, basename), partition_np)
    start_idx += flow_num