In [None]:
from copy import deepcopy
import datetime as dt
from tqdm import tqdm
import sys
import os
import pandas as pd
import time
import datetime as datetime
import numpy as np
import ipaddress
from pathlib import Path
import json
import matplotlib
import math
from sklearn.model_selection import cross_val_predict, KFold, cross_val_score, train_test_split, learning_curve, \
    cross_validate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, \
    make_scorer, recall_score, precision_recall_fscore_support
from sklearn.preprocessing import MinMaxScaler, minmax_scale, scale
import matplotlib.pyplot as plt
from copy import deepcopy
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import *
from sklearn.neural_network import *

pd.options.mode.chained_assignment = None  # default='warn'

pd.options.mode.use_inf_as_na = False

import platform
if platform.system() == 'Windows':
    fs = "\\"
else:
    fs = "/"
print(platform.system())

In [2]:
root_folder = "L:{}Kingston_Omen{}Secondary{}NIDS-Datasets{}preprocessed{}CTU13-cd{}".format(fs, fs, fs, fs, fs, fs, fs)

background_folder = "{}benign{}background{}".format(root_folder, fs, fs, fs)
malicious_folder = "{}malicious{}original_ctu{}".format(root_folder, fs, fs, fs)
adversarial_folder = "{}malicious{}adversarial_ctu{}".format(root_folder, fs, fs, fs)



In [3]:
class FlowFile:
    def __init__(self, year, month, day, name, index, attack=None):
        self.year = year
        self.month = month
        self.day = day
        self.index = index
        self.attack = attack
        self.name = name
        self.date = "{}_{}_{}".format(self.year, self.month, self.day)
        self.create_filename()
        
    
    def create_filename(self):
        if self.attack == None:
            self.filename = "{}_{}{}.csv".format(self.date, self.name, self.index)
        else:
            self.filename = "{}_{}{}-{}.csv".format(self.date, self.name, self.index, self.attack)

        
        


In [4]:
class Result:
    import pandas as pd
    from sklearn.metrics import accuracy_score, f1_score, recall_score
    
    def __init__(self, true, pred, time, pos_label=1):
        self.time = time
        self.rec = 0
        self.f1 = 0
        self.fpr = 0
        self.acc = 0
        self.acc_multi = 0
        if len(pd.Series(true).unique())==2: #binary classification
            if len(pd.Series(pred).unique())==2:
                self.bin_results(true, pred, pos_label)
            else:
                self.multi_results(true, pred)
            
        else: #multi class or different
            self.multi_results(true, pred)
        
    def multi_results(self, true, pred):
        self.ctab = pd.crosstab(true, pred, rownames=['True'], colnames=['Pred'])           
        self.acc_multi = accuracy_score(true, pred, normalize=True, sample_weight=None)
    
    def bin_results(self, true, pred, pos_label = 1):
        self.ctab_bin = pd.crosstab(true, pred, rownames=['True'], colnames=['Pred'])
        self.acc = accuracy_score(true, pred, normalize=True, sample_weight=None)
        try:
            self.rec = recall_score(true, pred, zero_division=0, pos_label=pos_label)
            self.f1 = f1_score(true, pred, zero_division=0, pos_label=pos_label)
            self.tnr = recall_score(true, pred, zero_division=0, pos_label=0)
        except:
            self.rec = 0
            self.f1 = 0
            self.tnr = self.acc
        self.fpr = 1-self.tnr
        
        

                            

In [5]:
def plot_features(classifier, feature_names, classifier_type='RF', top_features=None, threshold=None, 
                        text_ranking=True, plot_ranking=True):
    import matplotlib.pyplot as plt
    import numpy as np
    
    if top_features==None:
        top_features = int(len(feature_names)/2)
    
    if classifier_type == 'RF':
        coef = classifier.feature_importances_
    elif classifier_type == 'SVM':
        coef = classifier.coef_.ravel()
    
    top_coefficients = np.argsort(coef)[-top_features:]
    bottom_coefficients = np.argsort(coef)[:top_features]
    ranked_coefficients = np.hstack([bottom_coefficients, top_coefficients])
    
    if text_ranking:
        print("FEATURE RANKING:")
        for tc in reversed(top_coefficients):
            print("{} {:.5f}".format(feature_names[tc], coef[tc]))
        for bc in reversed(bottom_coefficients):
            print("{} {:.5f}".format(feature_names[bc], coef[bc]))

    if plot_ranking:
        plt.figure(figsize=(15, 5))
        if threshold == None:
            colors = ['red' if (rc in bottom_coefficients) else 'blue' for rc in ranked_coefficients]
        else:
            colors = ['red' if rc < threshold else 'blue' for rc in coef[ranked_coefficients]]
        plt.bar(np.arange(2 * top_features), coef[ranked_coefficients], color=colors)
        feature_names = np.array(feature_names)
        plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[ranked_coefficients], rotation=60, ha='right')
        plt.show()
        
def develop_clf(train, test, features, clf_name='classifier', label='Nature', clf_type = 'rf'):
    # Function that trains and tests a classifier, returning the results
    import pandas as pd
    import time
    train_y = train[label]
    test_y = test[label]
    
    clf = choose_clf(clf_type)
    
    start_time = time.time()
    print("Training and testing {}...".format(clf_name), end="", flush=True)
    clf.fit(train[features], train_y)
    clf_time = time.time() - start_time
    clf_pred = clf.predict(test[features])
    clf_result = Result(test_y, clf_pred, clf_time)
    print("...done! Training time: {:3f}s".format(clf_time))
    return clf, clf_pred, clf_result

def evaluate_clf(clf, test, features, clf_name='classifier', label='Nature', time=None, verbose=False):
    import pandas as pd
    if verbose==True:
        print("Testing {}...".format(clf_name))
    clf_pred = clf.predict(test[features])
    clf_result = Result(test[label], clf_pred, time)
    return clf_pred, clf_result
    
def train_clf(train_data, features, clf_name='classifier', label='Nature'):
    import pandas as pd
    import time
    train_y = train_data[label]
    clf = choose_clf(clf_type)
    
    start_time = time.time()
    print("Training {}...".format(clf_name), end="", flush=True)
    clf.fit(train_data[features], train_y)
    clf_time = time.time() - start_time
    print("...done! Training time: {}".format(clf_time))
    return clf, clf_time

def choose_clf(clf_type):
    if clf_type == 'rf':
        clf = RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=None, min_samples_split=2, 
                                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', 
                                 max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, 
                                 n_jobs=-2, random_state=None, verbose=0, warm_start=False, class_weight=None, 
                                 ccp_alpha=0.0, max_samples=None)
    elif clf_type == 'lr':
        clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, 
                                intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, 
                                multi_class='auto', verbose=0, warm_start=False, n_jobs=-2, l1_ratio=None)
    elif clf_type == 'hgb':
        clf = HistGradientBoostingClassifier(loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, 
                                    min_samples_leaf=20, l2_regularization=0.0, max_bins=255,
                                    monotonic_cst=None, warm_start=False, early_stopping='auto', scoring='loss', 
                                    validation_fraction=0.1, n_iter_no_change=10, tol=1e-07, verbose=0, random_state=None)
    elif clf_type == 'mlp':
        clf = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', alpha=0.0001, 
                            batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, 
                            max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, 
                            warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, 
                            validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, 
                            n_iter_no_change=20, max_fun=15000)
    elif clf_type == 'knn':
        clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, 
                                   metric='minkowski', metric_params=None, n_jobs=-1)
    elif clf_type == 'dt':
        clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, 
                                     min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, 
                                     random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, 
                                     class_weight=None, ccp_alpha=0.0)
    return clf


def choose_reg(reg_type):
    if reg_type == 'rf':
        reg = RandomForestRegressor(n_estimators=200, criterion='friedman_mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, 
                                    min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, 
                                    bootstrap=True, oob_score=False, n_jobs=-2, random_state=None, verbose=0, warm_start=False, 
                                    ccp_alpha=0.0, max_samples=None)
    elif reg_type == 'lr':
        reg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, 
                                intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, 
                                multi_class='auto', verbose=0, warm_start=False, n_jobs=-2, l1_ratio=None)
    elif clf_type == 'hgb':
        reg = HistGradientBoostingRegressor(loss='squared_error', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, 
                                            min_samples_leaf=20, l2_regularization=0.0, max_bins=255,
                                            monotonic_cst=None, warm_start=False, early_stopping='auto', scoring='loss', 
                                            validation_fraction=0.1, n_iter_no_change=10, tol=1e-07, verbose=0, random_state=None)
    elif clf_type == 'mlp':
        reg = MLPRegressor(hidden_layer_sizes=(100), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', 
                           learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, 
                           random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                           nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, 
                           beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000)
    elif clf_type == 'knn':
        reg = KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, 
                                  p=2, metric='minkowski', metric_params=None, n_jobs=-2)
    elif clf_type == 'dt':
        reg = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, 
                                    min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, 
                                    random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0)
    return reg

In [6]:
def distillation(clf, train, test, features, label='Nature', reg_type = 'rf', threshold=0.5):
    # Function that returns a distilled variant of a classifier
    print("Training distillation...")
    start_time = time.time()
    train_probas = ((clf.predict_proba(train[features])).transpose())[1]
    reg = choose_reg(reg_type)
    reg.fit(train[features], train_probas)
    
    
    reg_time = time.time() - start_time
    print("...done! Training time: {:3f}s".format(reg_time))
    reg_pred = reg.predict(test[features])
    reg_pred[reg_pred>=threshold] = True
    reg_pred[reg_pred<threshold] = False
    reg_result = Result(test[label], reg_pred, reg_time)
    
    return reg, reg_pred, reg_result

In [7]:
def malicious_ip_selector(malware_name, malware_index):
    # function that given a malware name and index, returns a list of IP addresses
    
    win2 = '192.168.1.112' # dridex 9, wannacry 6, 7, 9, 11, 12, 13, 16
    win3 = '192.168.1.113' # artemis 3, trickbot 19
    win4 = '192.168.1.114' # pony 1, dridex 3, trickbot 12, trickbot 20, wannacry 4
    win5 = '192.168.1.115' # artemis 2, dridex 8, trickbot 5, 13
    win6 = '192.168.1.116' # dridex 10, trickster 2, trickbot 2, 14
    win7 = '192.168.1.117' # trickbot 3
    win8 = '192.168.1.118' # dridex 6, yakes 1, trickbot 1
    win9 = '192.168.1.119' # yakes 2, trickbot 17, wannacry 5
    win10 = '192.168.1.120' # artemis 4, dridex 11, trickbot 4, wannacry 1, 2
    win11 = '192.168.1.121' # dridex 7, trickbot 6, 18
    win12 = '192.168.1.122' # dridex 12, trickbot 10, 15
    win13 = '192.168.1.123' # dridex 2, trickbot 16, wannacry 3
    win14 = '192.168.1.124' # artemis 1, trickbot 7, 8
    win15 = '192.168.1.125' # artemis 5, trickbot 9
    win17 = '192.168.1.127' # trickster 3
    win18 = '192.168.1.128' # dridex 4, trickster 1
    win19 = '192.168.1.129' # dridex 1
    win20 = '192.168.1.130' # dridex 5, trickbot 11
    win22 = '192.168.1.106' # pony 2
    win24 = '192.168.1.135' # wannacry 8, 10, 14, 15, 17
    win20_ip6 = 'fd2d:ab8c:225:0:1d3:35e9:7d97:2325' # dridex 5, trickbot 11
    
    ip_list = list()
    code = malware_name + malware_index
    
    if code in ("dridex9", "wannacry6", "wannacry7", "wannacry9", "wannacry11", "wannacry12", "wannacry13", "wannacry16"):
        ip_list.append(win2)
    elif code in ("artemis3", "trickbot19"):
        ip_list.append(win3)
    elif code in ("pony1", "dridex3", "trickbot12", "trickbot20", "wannacry4"):
        ip_list.append(win4)
    elif code in ("artemis2", "dridex8", "trickbot5", "trickbot13"):
        ip_list.append(win5)
    elif code in ("dridex10", "trickster2", "trickbot2", "trickbot14"):
        ip_list.append(win6)
    elif code in ("trickbot3"):
        ip_list.append(win7)
    elif code in ("dridex6", "yakes1", "trickbot1"):
        ip_list.append(win8)
    elif code in ("yakes2", "trickbot17", "wannacry5"):
        ip_list.append(win9)
    elif code in ("artemis4", "dridex11", "trickbot4", "wannacry1", "wannacry2"):
        ip_list.append(win10)
    elif code in ("dridex7", "trickbot6", "trickbot18"):
        ip_list.append(win11)
    elif code in ("dridex12", "trickbot10", "trickbot15"):
        ip_list.append(win12)
    elif code in ("dridex2", "trickbot16", "wannacry3"):
        ip_list.append(win13)
    elif code in ("artemis1", "trickbot7", "trickbot8"):
        ip_list.append(win14)
    elif code in ("artemis5", "trickbot9"):
        ip_list.append(win15)
    elif code in ("trickster3"):
        ip_list.append(win17)
    elif code in ("dridex4", "trickster1"):
        ip_list.append(win18)
    elif code in ("dridex1"):
        ip_list.append(win19)
    elif code in ("dridex5", "trickbot11"):
        ip_list.append(win20)
        ip_list.append(win20_ip6)
    elif code in ("pony2"):
        ip_list.append(win22)
    elif code in ("wannacry8", "wannacry10", "wannacry14", "wannacry15", "wannacry17"):
        ip_list.append(win24)
    
    return ip_list    



def malicious_filter(source_df, malware, index=None, protocol=None, only_source = False, attack = None):
    ctu_families = ['neris', 'rbot', 'virut']
    df = source_df[source_df['Label']==malware]
    if len(df)==0:
        print("No sample with the {} label!".format(malware))

    if protocol is not None:
        df = df[df['Proto'].isin(protocol)]
    
    
    if only_source == True:
        # Filter only those flows whose SourceIP corresponds to the malicious machine
        if malware in ctu_families:
            # print("CTU family!")
            malware_addresses = ['147.32.84.165',
                                    '147.32.84.191',
                                    '147.32.84.192',
                                    '147.32.84.193',
                                    '147.32.84.204',
                                    '147.32.84.205',
                                    '147.32.84.206',
                                    '147.32.84.207',
                                    '147.32.84.208',
                                    '147.32.84.209']
            df = df[(df['SrcAddr'].isin(malware_addresses))]
        else:
            # print("MCP family!")
            new_df = pd.DataFrame()
            for i in index:
                
                malware_addresses = malicious_ip_selector(malware, str(i))
                temp_df = df[(df['index'] == str(i)) & (df['SrcAddr'].isin(malware_addresses))]
                new_df = pd.concat([new_df, temp_df])
            df = new_df
        
            
    
    if attack is not None:
        df = df[df['is_adversarial']==attack]
    
    return df  

def quick_results(clf, df, features, label='Nature', threshold = None, output = True):
    if len(df)==0:
        print("df of length 0!")
        return 0, 0, 0
    pred = clf.predict(df[features])
    
    if threshold!=None:
        pred[pred>=threshold] = True
        pred[pred<threshold] = False
    
    acc = accuracy_score(df[label], pred)
    miss = math.ceil((1-acc) * len(df))
    if output:
        print("Accuracy: {:5f}, Missclassifications: {}".format(acc, miss))
        display(pd.crosstab(df[label], pred, rownames=['True'], colnames=['Pred']))
    return pred, acc, miss


def test_ensemble(model_list, malware_list, dataset, features, threshold=None):
    df = pd.DataFrame()
    if len(dataset) > 0:
        for index, malware in enumerate(malware_list):
            exec(f"df['{malware}'], acc, miss =  quick_results(model_list[index], dataset, features, threshold=threshold, output=False)")

        # Convert aggregated results in LOR
        df["sum"] = df.sum(axis=1)
        df["LOR"] = (df["sum"]>0)
        temp = dataset['Nature'] #> 0)
        df['Truth'] = ((temp.reset_index(drop=True)) > 0)
        result = Result(df['Truth'], df['LOR'], 0)
        return df, result
    else:
        result = Result(pd.Series([0]), pd.Series([1]), 0)
        return df, result

In [8]:
def fixValues(df):
    # Function to fix NaNs and Infinite values
    # NaNs are replaced with the MEAN
    # Infinite are replaced with the MAX
    import numpy as np
    #x = df.copy(deep=True)
    for c in df.columns:
        if df[c].dtype == 'int' or df[c].dtype == 'float':
            temp = np.asarray(df[c], dtype=np.float64)
            # remove NaN & Infinity (if there are)
            # print(c)
            temp = temp[np.isfinite(temp)]
            mean_value = temp.mean()
            if (str(mean_value) == 'nan'):
                mean_value = -1
                max_value = -1
            else:
                max_value = temp.max()
            df[c].replace([np.inf, -np.inf], max_value, inplace=True)
            df[c].replace([np.nan], mean_value, inplace=True)
    return df

def fixPorts(df):
    # create new columns
    df['Sport_num'] = df['Sport']
    df['Dport_num'] = df['Dport']
    
    # remove nan values
    df['Sport_num'].replace([np.nan], '-1', inplace=True)
    df['Dport_num'].replace([np.nan], '-1', inplace=True)
    
    # remove hexadecimal values
    df['Dport_num'] = np.where(df['Dport_num'].str.contains("0x"), '-1', df['Dport_num'])
    df['Sport_num'] = np.where(df['Sport_num'].str.contains("0x"), '-1', df['Sport_num'])
    
    # conver ports to numeric
    df['Dport_num'] = pd.to_numeric(df['Dport_num'], errors='coerce')
    df['Sport_num'] = pd.to_numeric(df['Sport_num'], errors='coerce')
    
    # the "errors" are well-known ports, setting them to 0
    df['Dport_num'].replace([np.nan], 0, inplace=True)
    df['Sport_num'].replace([np.nan], 0, inplace=True)
    
    # setting the conditions
    srcPort_conditions = [
        (df['Sport_num'] == -1),
        (df['Sport_num'] >= 0) & (df['Sport_num'] <= 1023),
        (df['Sport_num'] >= 1024) & (df['Sport_num'] <= 49151),
        (df['Sport_num'] > 49151)
    ]
    dstPort_conditions = [
        (df['Dport_num'] == -1),
        (df['Dport_num'] >= 0) & (df['Dport_num'] <= 1023),
        (df['Dport_num'] >= 1024) & (df['Dport_num'] <= 49151),
        (df['Dport_num'] > 49151)
    ]    
    port_choices = ['none','well-known','registered','dynamic']
    port_choices = [-1,0,1,2] # -1 = none, 0 =well-known, 1=registered, 2=dynamic
    df['Sport_type'] = np.select(srcPort_conditions, port_choices)
    df['Dport_type'] = np.select(dstPort_conditions, port_choices)
    
    df.drop(["Dport_num", "Sport_num"], axis=1, inplace=True)
    return df


def preprocess(df, _filter, flow_file, test_size):# , adversarial=False):
    # remove excess
    df['seed'] = (np.random.uniform(0,1,len(df)))
    df = df[df['seed']<_filter]
    # assign stuff
    df['Date'] = flow_file.date
    df['Label_original'] = df['Label']
    if flow_file.name in ["background", "normal"]:
        df['Label'] = 'BENIGN'
    #split train_test
    df['seed'] = (np.random.uniform(0,1,len(df)))
    
    if flow_file.attack == None:
        df['is_adversarial'] = False
        df['is_test'] = np.where(df['seed'] <= test_size, True, False)
    else:
        df['is_adversarial'] = flow_file.attack
        df['is_test'] = True
    df = fixValues(df)
    df = fixPorts(df)
    return df


def handleCategorical(df):
    ## Handling categorical data and labels
    ## THIS MUST BE CALLED ON THE ENTIRE DF!!!!!
    df['Nature'] = np.where(df['Label'].str.contains('BENIGN'),0,1)
    
    for column_name in df.columns:
        if column_name in ['State', 'Flgs', 'Proto', 'Dir']:
            df[column_name+"-f"] = pd.factorize(df[column_name])[0]
        else:
            pass
    df['Label_cat'] = pd.factorize(df['Label'])[0]
    return df

In [9]:
cutoff_date = "2011_08_15"
scenario = 'scs'
malware_list = ['neris', 'rbot', 'virut']

# cutoff_date = "2017_07_01"
# scenario = 'lcs'
# malware_list = ['artemis', 'dridex', 'trickbot', 'trickster', 'wannacry']

perturb = 'small' # choose: 'small' or 'large' ## for this paper, we have always used "small"
base_clf = 'rf' # choose: 'hgb' or 'rf'
test_size = 0.3
threshold=0.5 # threshold for distillation

filter_malicious = 1
filter_normal = 1
filter_background = 0.8

trials = 1 # just for an example

In [10]:
output_file = "snippet/{}_{}_{}.txt".format(base_clf, perturb, scenario)
print("The output will be saved in: ", output_file)

The output will be saved in:  snippet/rf_small_scs.txt


In [11]:
background_list = [ # only choose three
    # FlowFile("2011", "08", "10", "background", "-42"), # choose two below this
    # FlowFile("2011", "08", "11", "background", "-43"),
    # FlowFile("2011", "08", "12", "background", "-44"),
    FlowFile("2011", "08", "15", "background", "-45"),
    FlowFile("2011", "08", "15", "background", "-46"),
    # FlowFile("2011", "08", "16", "background", "-54"), # always choose one below this
    # FlowFile("2011", "08", "17", "background", "-50"),
    FlowFile("2011", "08", "18", "background", "-52"), 
]

In [12]:
if scenario == 'scs':
    normal_list = [
        FlowFile("2011", "08", "10", "normal", "-42"), # these are the original CTU normal files
        FlowFile("2011", "08", "11", "normal", "-43"), # these are the original CTU normal files
        FlowFile("2011", "08", "12", "normal", "-44"), # these are the original CTU normal files
        FlowFile("2011", "08", "15", "normal", "-45"), # these are the original CTU normal files
        FlowFile("2011", "08", "15", "normal", "-46"), # these are the original CTU normal files
        FlowFile("2011", "08", "16", "normal", "-54"), # these are the original CTU normal files
        FlowFile("2011", "08", "17", "normal", "-50"), # these are the original CTU normal files
        FlowFile("2011", "08", "18", "normal", "-52"), # these are the original CTU normal files
    ]
    normal_folder = "{}benign{}normal{}original_ctu{}".format(root_folder, fs, fs, fs)
    malicious_list = [
        FlowFile("2011", "08", "10", "neris", "1"), # these are the original CTU malware
        FlowFile("2011", "08", "11", "neris", "2"), # these are the original CTU malware
        FlowFile("2011", "08", "17", "neris", "3"), # these are the original CTU malware
        
        FlowFile("2011", "08", "12", "rbot", "1"),  # these are the original CTU malware
        FlowFile("2011", "08", "15", "rbot", "2"),  # these are the original CTU malware
        FlowFile("2011", "08", "18", "rbot", "3"),  # these are the original CTU malware
        
        FlowFile("2011", "08", "15", "virut", "1"), # these are the original CTU malware
        FlowFile("2011", "08", "16", "virut", "2"), # these are the original CTU malware
    ]
    neris_indices = None
    rbot_indices = None
    virut_indices = None
elif scenario == 'lcs':
    normal_list = [
        FlowFile("2013", "12", "17", "normal", "1"),
        FlowFile("2013", "12", "17", "normal", "2"),
        # FlowFile("2015", "03", "24", "normal", "3"), # apparently this creates an error?
        FlowFile("2016", "09", "13", "normal", "4"),
        FlowFile("2016", "09", "13", "normal", "5"),
        FlowFile("2016", "09", "13", "normal", "6"),
        FlowFile("2017", "04", "18", "normal", "7"),
        FlowFile("2017", "04", "19", "normal", "8"),
        FlowFile("2017", "04", "25", "normal", "9"),
        FlowFile("2017", "04", "28", "normal", "10"),
        FlowFile("2017", "04", "30", "normal", "11"),
        FlowFile("2017", "04", "30", "normal", "12"),
        FlowFile("2017", "05", "01", "normal", "13"),
        FlowFile("2017", "05", "01", "normal", "14"),
        FlowFile("2017", "05", "01", "normal", "15"),
        FlowFile("2017", "05", "01", "normal", "16"),
        FlowFile("2017", "05", "02", "normal", "17"),
        FlowFile("2017", "05", "02", "normal", "18"),
        FlowFile("2017", "05", "02", "normal", "19"),
        FlowFile("2017", "07", "03", "normal", "20"),
        FlowFile("2017", "07", "23", "normal", "21"),
        FlowFile("2017", "09", "05", "normal", "22"),
        FlowFile("2018", "05", "07", "normal", "23"),   
    ]
    normal_folder = "{}benign{}normal{}lcs{}".format(root_folder, fs, fs, fs)
    malicious_list = [
        FlowFile("2017", "06", "24", "artemis", "1"),  # lcs
        FlowFile("2017", "08", "01", "artemis", "2"),  # lcs
        FlowFile("2017", "08", "14", "artemis", "3"),  # lcs
        FlowFile("2017", "08", "16", "artemis", "4"),  # lcs
        FlowFile("2017", "08", "16", "artemis", "5"),  # lcs
        FlowFile("2017", "02", "13", "dridex", "1"),  # lcs
        FlowFile("2017", "02", "27", "dridex", "2"),  # lcs
        FlowFile("2017", "04", "11", "dridex", "3"),  # lcs
        FlowFile("2017", "04", "18", "dridex", "4"),  # lcs
        FlowFile("2017", "04", "18", "dridex", "5"),  # lcs
        FlowFile("2017", "05", "15", "dridex", "6"),  # lcs
        FlowFile("2017", "05", "15", "dridex", "7"),  # lcs
        FlowFile("2017", "05", "16", "dridex", "8"),  # lcs
        FlowFile("2017", "06", "24", "dridex", "9"),  # lcs
        FlowFile("2018", "01", "29", "dridex", "10"),  # lcs
        FlowFile("2018", "01", "30", "dridex", "11"),  # lcs
        FlowFile("2018", "04", "03", "dridex", "12"),  # lcs
        
        FlowFile("2017", "03", "29", "trickbot", "1"),  # lcs
        FlowFile("2017", "03", "30", "trickbot", "2"),  # lcs
        FlowFile("2017", "03", "30", "trickbot", "3"),  # lcs
        FlowFile("2017", "03", "30", "trickbot", "4"),  # lcs
        FlowFile("2017", "04", "12", "trickbot", "5"),  # lcs
        FlowFile("2017", "04", "12", "trickbot", "6"),  # lcs
        FlowFile("2017", "04", "17", "trickbot", "7"),  # lcs
        FlowFile("2017", "05", "08", "trickbot", "8"),  # lcs
        FlowFile("2017", "05", "15", "trickbot", "9"),  # lcs
        FlowFile("2017", "06", "07", "trickbot", "10"),  # lcs
        FlowFile("2017", "06", "15", "trickbot", "11"),  # lcs
        FlowFile("2017", "06", "24", "trickbot", "12"),  # lcs
        FlowFile("2017", "06", "24", "trickbot", "13"),  # lcs
        FlowFile("2017", "06", "24", "trickbot", "14"),  # lcs
        FlowFile("2017", "06", "24", "trickbot", "15"),  # lcs
        FlowFile("2018", "01", "30", "trickbot", "16"),  # lcs
        FlowFile("2018", "01", "30", "trickbot", "17"),  # lcs
        FlowFile("2018", "02", "02", "trickbot", "18"),  # lcs
        FlowFile("2018", "03", "27", "trickbot", "19"),  # lcs
        FlowFile("2021", "07", "30", "trickbot", "20"),  # lcs
        
        FlowFile("2017", "06", "24", "trickster", "1"),  # lcs
        FlowFile("2017", "08", "03", "trickster", "2"),  # lcs
        FlowFile("2018", "01", "29", "trickster", "3"),  # lcs
        
        FlowFile("2017", "05", "14", "wannacry", "1"),  # lcs
        FlowFile("2017", "05", "14", "wannacry", "2"),  # lcs
        FlowFile("2017", "05", "15", "wannacry", "3"),  # lcs
        FlowFile("2017", "05", "15", "wannacry", "4"),  # lcs
        FlowFile("2017", "06", "24", "wannacry", "5"),  # lcs
        FlowFile("2017", "07", "11", "wannacry", "6"),  # lcs
        FlowFile("2017", "07", "11", "wannacry", "7"),  # lcs
        FlowFile("2017", "07", "11", "wannacry", "8"),  # lcs
        FlowFile("2017", "07", "11", "wannacry", "9"),  # lcs
        FlowFile("2017", "07", "11", "wannacry", "10"),  # lcs
        FlowFile("2017", "07", "11", "wannacry", "11"),  # lcs
        FlowFile("2017", "07", "11", "wannacry", "12"),  # lcs
        FlowFile("2017", "07", "11", "wannacry", "13"),  # lcs
        FlowFile("2017", "07", "12", "wannacry", "14"),  # lcs
        FlowFile("2017", "07", "13", "wannacry", "15"),  # lcs
        FlowFile("2017", "07", "13", "wannacry", "16"),  # lcs
        FlowFile("2017", "07", "13", "wannacry", "17"),  # lcs
    ]
    artemis_indices = [1,2,3,4,5]
    dridex_indices = [1,2,3,4,5,6,7,8,9,10,11,12]
    trickbot_indices = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
    trickster_indices = [1,2,3]
    wannacry_indices = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]

In [13]:
def createAdversarial_list(malicious_list, attack_list):
    adversarial_list = []
    for m in malicious_list:
        for a in attack_list:
            adversarial_list.append(FlowFile(m.year, m.month, m.day, m.name, m.index, attack=a))
    return adversarial_list

if perturb == 'small':
    attack_list = ['Ts', 'Us']
elif perturb == 'large':
    attack_list = ['Tb', 'Ub'] ## We did not use these
    
    
adversarial_list = createAdversarial_list(malicious_list, attack_list)



In [14]:
for trial in tqdm(range(trials)):
    all_df = pd.DataFrame()

    begin = datetime.datetime.now()

    starttime = time.time()

    print("Reading MALICIOUS (startime: {})...".format(begin))
    show_split = True
    for m in malicious_list:
        m_file = "{}{}{}{}".format(malicious_folder, m.name, fs, m.filename)
        m_df = pd.read_csv(m_file, dtype={'Sport':'object', 'Dport':'object'})
        print("{} entries for {}".format(len(m_df), m.filename), end=" ")
        m_df = preprocess(m_df, filter_malicious, m, test_size)
        print("(final entries: {})".format(len(m_df)))
        m_df['index'] = str(m.index)
        if show_split:
            if scenario == 'scs':
                ind = None
            else:
                ind = [m.index]
            exec(f"tcp_df = malicious_filter(m_df, m.name, index=ind, protocol=['tcp'], only_source = True, attack = None)")
            exec(f"udp_df = malicious_filter(m_df, m.name, index=ind, protocol=['udp'], only_source = True, attack = None)")
            print("\tUDP: {}\tTCP: {}".format(len(udp_df), len(tcp_df)))
        all_df = pd.concat([all_df, m_df])



    print("Reading ADVERSARIAL...")
    for a in adversarial_list:
        a_file = "{}{}{}{}".format(adversarial_folder, a.name, fs, a.filename)
        a_df = pd.read_csv(a_file, dtype={'Sport':'object', 'Dport':'object'})
        print("{} entries for {}".format(len(a_df), a.filename), end=" ")
        a_df = preprocess(a_df, filter_malicious, a, test_size)#, adversarial=True)
        print("(final entries: {})".format(len(a_df)))
        a_df['index'] = str(a.index)
        all_df = pd.concat([all_df, a_df])


    print("Reading NORMAL...")
    for n in normal_list:
        n_file = "{}{}".format(normal_folder, n.filename)
        n_df = pd.read_csv(n_file, dtype={'Sport':'object', 'Dport':'object'})
        print("{} entries for {}".format(len(n_df), n.filename), end=" ")
        n_df = preprocess(n_df, filter_normal, n, test_size)
        print("(final entries: {})".format(len(n_df)))
        n_df['index'] = str(n.index.replace("-", ""))
        all_df = pd.concat([all_df, n_df])

    print("Reading BACKGROUND...")
    for b in background_list:
        b_file = "{}{}".format(background_folder, b.filename)
        b_df = pd.read_csv(b_file, dtype={'Sport':'object', 'Dport':'object'})
        print("{} entries for {}".format(len(b_df), b.filename), end=" ")
        b_df = preprocess(b_df, filter_background, b, test_size)
        print("(final entries: {})".format(len(b_df)))
        b_df['index'] = str(b.index.replace("-", ""))
        all_df = pd.concat([all_df, b_df])

    runtime = time.time() - starttime
    print("...done! ({} \t TIME TAKEN: {:.5f}s)".format(datetime.datetime.now(), runtime))

    all_df = handleCategorical(all_df)

    print(all_df.isna().any().any())
    print(all_df.columns[all_df.isna().any()])

    past_df = all_df[all_df['Date']<=cutoff_date]
    future_df = all_df[all_df['Date']>cutoff_date]


    future_adversarial_df = future_df[(future_df['is_adversarial']!=False)]
    future_malicious_df = future_df[((future_df['is_adversarial']==False) & (future_df['Label']!='BENIGN'))]
    future_benign_df = future_df[((future_df['is_adversarial']==False) & (future_df['Label']=='BENIGN'))]

    past_adversarial_df = past_df[past_df['is_adversarial']!=False]

    all_train = past_df[past_df['is_test']==False]
    all_test = past_df[(past_df['is_test']==True) & (past_df['is_adversarial']==False)]

    benign_train = all_train[all_train['Label']=='BENIGN']
    benign_test = all_test[all_test['Label']=='BENIGN']

    malicious_train = all_train[all_train['Label']!='BENIGN']
    malicious_test = all_test[all_test['Label']!='BENIGN']
    past_malicious_df = pd.concat([malicious_train, malicious_test])

    for malware in malware_list:
        exec(f"{malware}_train = all_train[all_train['Label']=='{malware}']")
        exec(f"{malware}_test = all_test[all_test['Label']=='{malware}']")
        exec(f"past_{malware}_df = malicious_filter(past_malicious_df, '{malware}', index=None, protocol=None, only_source = False, attack = None)")
        exec(f"future_{malware}_df = malicious_filter(future_malicious_df, '{malware}', index=None, protocol=None, only_source = False, attack = None)")


    # CREATING all the datasets for testing the specific performance in adversarial/non-adversarial scenarios!

    starttime = time.time()

    past_tcp_malicious_df = pd.DataFrame()
    past_udp_malicious_df = pd.DataFrame()
    past_tcp_adversarial_df = pd.DataFrame()
    past_udp_adversarial_df = pd.DataFrame()

    future_tcp_malicious_df = pd.DataFrame()
    future_udp_malicious_df = pd.DataFrame()
    future_tcp_adversarial_df = pd.DataFrame()
    future_udp_adversarial_df = pd.DataFrame()

    for malware in malware_list:
        print("Generating sets for: {}...".format(malware))
        exec(f"past_tcp_{malware}_df = malicious_filter(past_malicious_df, '{malware}', index={malware}_indices, protocol=['tcp'], only_source = True, attack = None)")
        exec(f"past_udp_{malware}_df = malicious_filter(past_malicious_df, '{malware}', index={malware}_indices, protocol=['udp'], only_source = True, attack = None)")
        exec(f"future_tcp_{malware}_df = malicious_filter(future_malicious_df, '{malware}', index={malware}_indices, protocol=['tcp'], only_source = True, attack = None)")
        exec(f"future_udp_{malware}_df = malicious_filter(future_malicious_df, '{malware}', index={malware}_indices, protocol=['udp'], only_source = True, attack = None)")

        exec(f"past_tcp_adversarial_{malware}_df = malicious_filter(past_adversarial_df, '{malware}', index={malware}_indices, protocol=['tcp'], only_source = True, attack = '{attack_list[0]}')") 
        exec(f"past_udp_adversarial_{malware}_df = malicious_filter(past_adversarial_df, '{malware}', index={malware}_indices, protocol=['udp'], only_source = True, attack = '{attack_list[1]}')")
        exec(f"future_tcp_adversarial_{malware}_df = malicious_filter(future_adversarial_df, '{malware}', index={malware}_indices, protocol=['tcp'], only_source = True, attack = '{attack_list[0]}')")
        exec(f"future_udp_adversarial_{malware}_df = malicious_filter(future_adversarial_df, '{malware}', index={malware}_indices, protocol=['udp'], only_source = True, attack = '{attack_list[1]}')")

        exec(f"past_tcp_malicious_df    = pd.concat([past_tcp_malicious_df    , past_tcp_{malware}_df])")
        exec(f"past_udp_malicious_df    = pd.concat([past_udp_malicious_df    , past_udp_{malware}_df])")
        exec(f"past_tcp_adversarial_df  = pd.concat([past_tcp_adversarial_df  , past_tcp_adversarial_{malware}_df])")
        exec(f"past_udp_adversarial_df  = pd.concat([past_udp_adversarial_df  , past_udp_adversarial_{malware}_df])")
        exec(f"future_tcp_malicious_df  = pd.concat([future_tcp_malicious_df  , future_tcp_{malware}_df])")
        exec(f"future_udp_malicious_df  = pd.concat([future_udp_malicious_df  , future_udp_{malware}_df])")
        exec(f"future_tcp_adversarial_df= pd.concat([future_tcp_adversarial_df, future_tcp_adversarial_{malware}_df])")
        exec(f"future_udp_adversarial_df= pd.concat([future_udp_adversarial_df, future_udp_adversarial_{malware}_df])")

    runtime = time.time() - starttime
    print("Time taken: {:.5f}".format(runtime))



    features_full = ['Dur', 'SrcDur', 'DstDur', 'Dir-f',
           'sTos', 'dTos',  'dTtl',
           'dHops', 'TotPkts', 'SrcPkts', 'DstPkts', 'TotBytes',
           'SrcBytes', 'DstBytes', 'TotAppByte', 'SAppBytes', 'DAppBytes', 'Load',
           'SrcLoad', 'DstLoad', 'Rate', 'SrcRate', 'DstRate', 'Loss', 'SrcLoss',
           'DstLoss', 'pLoss', 'pSrcLoss', 'pDstLoss', 'SIntPkt', 'DIntPkt',
           'SIntPktAct', 'DIntPktAct', 'SIntPktIdl', 'DIntPktIdl', 'SIntPktMax',
           'SIntPktMin', 'DIntPktMax', 'DIntPktMin', 'SIPActMax', 'SIPActMin',
           'DIPActMax', 'DIPActMin', 'SIPIdlMax', 'SIPIdlMin', 'DIPIdlMax',
           'DIPIdlMin', 'SrcJitter', 'DstJitter', 'SrcJitAct', 'DstJitAct',
           'SrcJitIdl', 'DstJitIdl', 'DstWin', 'SrcTCPBase',
           'DstTCPBase', 'TcpRtt', 'SynAck', 'AckDat', 'sMaxPktSz', 'sMinPktSz',
           'dMaxPktSz', 'dMinPktSz',
           'Sport_type', 'Dport_type', 'Flgs-f', 'Dir-f', 'Proto-f',
           'State-f', 'sTtl', 'sHops', 'SrcWin']

    
    # we will use these
    features = ['Dur', 'SrcDur', 'DstDur', 'Dir-f',
           'sTos', 'dTos',  'dTtl',
           'TotPkts', 'SrcPkts', 'DstPkts', 'TotBytes',
           'SrcBytes', 'DstBytes', 'TotAppByte', 'SAppBytes', 'DAppBytes', 'Load',
           'SrcLoad', 'DstLoad', 'Rate', 'SrcRate', 'DstRate',
                  'DstTCPBase', 'SrcTCPBase', 
            'TcpRtt', 'SynAck', 'AckDat', 'sMaxPktSz', 'sMinPktSz',
           'dMaxPktSz', 'dMinPktSz', 
           'Sport_type', 'Dport_type', 'Flgs-f', 'Dir-f', 'Proto-f',
           'State-f',
               'Loss',]

    benign_results = dict()
    uniform_malicious_results = dict()

    for malware in malware_list:
        exec(f"{malware}_results_rec = dict()")
        exec(f"{malware}_results_miss = dict()")


    #####################################################
    #####################################################
    #################   TRAINING  #######################
    #####################################################
    #####################################################

    # FULL BINARY

    fbClf, fbPred_all, fbResult = develop_clf(all_train, all_test, features, clf_name='bin', label='Nature', clf_type=base_clf)

    # INDIVIDUAL CLASSIFIERS
    models = list([])

    for malware in malware_list:
        exec(f"temp_train = pd.concat([benign_train, {malware}_train])") 
        exec(f"temp_test = pd.concat([benign_test, {malware}_test])")
        exec(f"{malware}Clf, {malware}Pred, {malware}Result = develop_clf(temp_train, temp_test, features, clf_name='{malware}', label='Nature', clf_type=base_clf)")
        exec(f"models.append({malware}Clf)")
        # Printing "individual" results
        exec(f"print('{{}} Recall: {{:5f}}'.format('{malware}', {malware}Result.rec))")
        exec(f"display(pd.crosstab(temp_test['Nature'], {malware}Pred, rownames=['True'], colnames=['Pred']))")

    # FULL BINARY (distilled)


    dfbClf, dfb_pred, dfb_result = distillation(fbClf, all_train, all_test, features, threshold=0.5)

    # INDIVIDUAL CLASSIFIERS (distilled)

    d_models = list([])

    # train models and print results
    for malware in malware_list:
        exec(f"temp_train = pd.concat([benign_train, {malware}_train])") 
        exec(f"temp_test = pd.concat([benign_test, {malware}_test])")
        exec(f"{malware}Reg, {malware}RegPred, {malware}RegResult = distillation({malware}Clf, temp_train, temp_test, features, threshold=threshold)")
        exec(f"d_models.append({malware}Reg)")
        # Printing "individual" results
        exec(f"print('{{}} Recall: {{:5f}}'.format('{malware}', {malware}RegResult.rec))")
        exec(f"display(pd.crosstab(temp_test['Nature'], {malware}RegPred, rownames=['True'], colnames=['Pred']))")


    #####################################################
    #####################################################
    #################   TESTING  #######################
    #####################################################
    #####################################################
    # FULL BINARY: results on past (non adversarial)
    fb_ben_pred, fb_ben_acc, fb_ben_miss = quick_results(fbClf, benign_test, features)
    benign_results['fb_past'] = fb_ben_acc


    fb_past_acc = 0
    for malware in malware_list:
        exec(f"fb_{malware}_pred, fb_{malware}_acc, fb_{malware}_miss = quick_results(fbClf, {malware}_test, features)")
        exec(f"fb_past_acc += fb_{malware}_acc")
    fb_past_acc = fb_past_acc / len(malware_list)
    uniform_malicious_results['fb_past'] = fb_past_acc

    for malware in malware_list:
        exec(f"print(len(future_udp_adversarial_{malware}_df))")

    # FULL BINARY: results on past (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"fb_past_tcp_{malware}_pred, fb_past_tcp_{malware}_acc, fb_past_tcp_{malware}_miss = quick_results(fbClf, past_tcp_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['fb_past_tcp'] = fb_past_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['fb_past_tcp'] = fb_past_tcp_{malware}_miss")
        exec(f"fb_past_udp_{malware}_pred, fb_past_udp_{malware}_acc, fb_past_udp_{malware}_miss = quick_results(fbClf, past_udp_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['fb_past_udp'] = fb_past_udp_{malware}_acc")
        exec(f"{malware}_results_miss['fb_past_udp'] = fb_past_udp_{malware}_miss")

    # FULL BINARY: results on past (adversarial)
    for malware in malware_list:
        exec(f"fb_past_tcp_adversarial_{malware}_pred, fb_past_tcp_adversarial_{malware}_acc, fb_past_tcp_adversarial_{malware}_miss = quick_results(fbClf, past_tcp_adversarial_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['fb_past_tcp_adversarial'] = fb_past_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['fb_past_tcp_adversarial'] = fb_past_tcp_adversarial_{malware}_miss")
        exec(f"fb_past_udp_adversarial_{malware}_pred, fb_past_udp_adversarial_{malware}_acc, fb_past_udp_adversarial_{malware}_miss = quick_results(fbClf, past_udp_adversarial_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['fb_past_udp_adversarial'] = fb_past_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['fb_past_udp_adversarial'] = fb_past_udp_adversarial_{malware}_miss")

    # FULL BINARY: detailed results on FUTURE (no adversarial)
    fb_future_ben_pred, fb_future_ben_acc, fb_future_ben_miss = quick_results(fbClf, future_benign_df, features)
    benign_results['fb_future'] = fb_future_ben_acc

    fb_future_acc = 0
    for malware in malware_list:
        exec(f"fb_future_{malware}_pred, fb_future_{malware}_acc, fb_future_{malware}_miss = quick_results(fbClf, future_{malware}_df, features)")
        exec(f"fb_future_acc += fb_future_{malware}_acc")
    fb_future_acc = fb_future_acc / len(malware_list)
    uniform_malicious_results['fb_future'] = fb_future_acc


    # FULL BINARY: results on future (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"fb_future_tcp_{malware}_pred, fb_future_tcp_{malware}_acc, fb_future_tcp_{malware}_miss = quick_results(fbClf, future_tcp_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['fb_future_tcp'] = fb_future_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['fb_future_tcp'] = fb_future_tcp_{malware}_miss")
        exec(f"fb_future_udp_{malware}_pred, fb_future_udp_{malware}_acc, fb_future_udp_{malware}_miss = quick_results(fbClf, future_udp_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['fb_future_udp'] = fb_future_udp_{malware}_acc")
        exec(f"{malware}_results_miss['fb_future_udp'] = fb_future_udp_{malware}_miss")

    # FULL BINARY: results on future (adversarial)
    for malware in malware_list:
        exec(f"fb_future_tcp_adversarial_{malware}_pred, fb_future_tcp_adversarial_{malware}_acc, fb_future_tcp_adversarial_{malware}_miss = quick_results(fbClf, future_tcp_adversarial_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['fb_future_tcp_adversarial'] = fb_future_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['fb_future_tcp_adversarial'] = fb_future_tcp_adversarial_{malware}_miss")
        exec(f"fb_future_udp_adversarial_{malware}_pred, fb_future_udp_adversarial_{malware}_acc, fb_future_udp_adversarial_{malware}_miss = quick_results(fbClf, future_udp_adversarial_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['fb_future_udp_adversarial'] = fb_future_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['fb_future_udp_adversarial'] = fb_future_udp_adversarial_{malware}_miss")

    # INDIVIDUAL CLASSIFIERS: results on past
    show = False

    # Appending benign and uniform malicious results in dictionary
    for malware in malware_list:
        exec(f"benign_results['{malware}_past'] = accuracy_score({malware}Clf.predict(benign_test[features]), benign_test['Nature'])")
        exec(f"benign_results['{malware}_future'] = accuracy_score({malware}Clf.predict(future_benign_df[features]), future_benign_df['Nature'])")
        exec(f"uniform_malicious_results['{malware}_past'] = accuracy_score({malware}Clf.predict({malware}_test[features]), {malware}_test['Nature'])")
        exec(f"uniform_malicious_results['{malware}_future'] = accuracy_score({malware}Clf.predict(future_{malware}_df[features]), future_{malware}_df['Nature'])")


    for malware in malware_list:
        exec(f"{malware}_past_tcp_{malware}_pred, {malware}_past_tcp_{malware}_acc, {malware}_past_tcp_{malware}_miss = quick_results({malware}Clf, past_tcp_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['{malware}_past_tcp'] = {malware}_past_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['{malware}_past_tcp'] = {malware}_past_tcp_{malware}_miss")
        exec(f"{malware}_past_udp_{malware}_pred, {malware}_past_udp_{malware}_acc, {malware}_past_udp_{malware}_miss = quick_results({malware}Clf, past_udp_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['{malware}_past_udp'] = {malware}_past_udp_{malware}_acc")
        exec(f"{malware}_results_miss['{malware}_past_udp'] = {malware}_past_udp_{malware}_miss")

    # INDIVIDUAL CLASSIFIERS: results on past (adversarial)
    for malware in malware_list:
        exec(f"{malware}_past_tcp_adversarial_{malware}_pred, {malware}_past_tcp_adversarial_{malware}_acc, {malware}_past_tcp_adversarial_{malware}_miss = quick_results({malware}Clf, past_tcp_adversarial_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['{malware}_past_tcp_adversarial'] = {malware}_past_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['{malware}_past_tcp_adversarial'] = {malware}_past_tcp_adversarial_{malware}_miss")
        exec(f"{malware}_past_udp_adversarial_{malware}_pred, {malware}_past_udp_adversarial_{malware}_acc, {malware}_past_udp_adversarial_{malware}_miss = quick_results({malware}Clf, past_udp_adversarial_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['{malware}_past_udp_adversarial'] = {malware}_past_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['{malware}_past_udp_adversarial'] = {malware}_past_udp_adversarial_{malware}_miss")

    # INDIVIDUAL CLASSIFIERS - future data (no adversarial)

    for malware in malware_list:
        exec(f"temp_test = pd.concat([future_benign_df, future_{malware}_df])")
        exec(f"{malware}_future_pred, {malware}_future_acc, {malware}_future_miss = quick_results({malware}Clf, future_{malware}_df, features, output=False)")
        exec(f"print('Recall: {{}}'.format({malware}_future_acc))")
        exec(f"{malware}_future_pred, {malware}_future_acc, {malware}_future_miss = quick_results({malware}Clf, temp_test, features)")

    # INDIVIDUAL CLASSIFIERS: results on future (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"{malware}_future_tcp_{malware}_pred, {malware}_future_tcp_{malware}_acc, {malware}_future_tcp_{malware}_miss = quick_results({malware}Clf, future_tcp_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['{malware}_future_tcp'] = {malware}_future_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['{malware}_future_tcp'] = {malware}_future_tcp_{malware}_miss")
        exec(f"{malware}_future_udp_{malware}_pred, {malware}_future_udp_{malware}_acc, {malware}_future_udp_{malware}_miss = quick_results({malware}Clf, future_udp_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['{malware}_future_udp'] = {malware}_future_udp_{malware}_acc")
        exec(f"{malware}_results_miss['{malware}_future_udp'] = {malware}_future_udp_{malware}_miss")

    # INDIVIDUAL CLASSIFIERS: results on future (adversarial)
    for malware in malware_list:
        exec(f"{malware}_future_tcp_adversarial_{malware}_pred, {malware}_future_tcp_adversarial_{malware}_acc, {malware}_future_tcp_adversarial_{malware}_miss = quick_results({malware}Clf, future_tcp_adversarial_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['{malware}_future_tcp_adversarial'] = {malware}_future_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['{malware}_future_tcp_adversarial'] = {malware}_future_tcp_adversarial_{malware}_miss")
        exec(f"{malware}_future_udp_adversarial_{malware}_pred, {malware}_future_udp_adversarial_{malware}_acc, {malware}_future_udp_adversarial_{malware}_miss = quick_results({malware}Clf, future_udp_adversarial_{malware}_df, features, output=show)")
        exec(f"{malware}_results_rec['{malware}_future_udp_adversarial'] = {malware}_future_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['{malware}_future_udp_adversarial'] = {malware}_future_udp_adversarial_{malware}_miss")

    ############# ENSEMBLE ############

    #aggregate results
    past_benignEns_df, past_benignEns_result = test_ensemble(models, malware_list, benign_test, features, threshold=None)
    future_benignEns_df, future_benignEns_result = test_ensemble(models, malware_list, future_benign_df, features, threshold=None)
    benign_results['ens_past'] = past_benignEns_result.acc_multi
    benign_results['ens_future'] = future_benignEns_result.acc_multi


    ens_past_acc = 0
    ens_future_acc = 0
    for malware in malware_list:
        exec(f"past_{malware}Ens_df, past_{malware}Ens_result = test_ensemble(models, malware_list, {malware}_test, features, threshold=None)")
        exec(f"ens_past_acc += past_{malware}Ens_result.acc_multi")
        exec(f"future_{malware}Ens_df, future_{malware}Ens_result = test_ensemble(models, malware_list, future_{malware}_df, features, threshold=None)")
        exec(f"ens_future_acc += future_{malware}Ens_result.acc_multi")

    uniform_malicious_results['ens_past'] = ens_past_acc / len(malware_list)
    uniform_malicious_results['ens_future'] = ens_future_acc / len(malware_list)


    # ENSEMBLE: results on past (non adversarial)
    show = False
    for malware in malware_list:
        print(malware)
        exec(f"t_df, ens_past_tcp_{malware}_result= test_ensemble(models, malware_list, past_tcp_{malware}_df, features, threshold=None)")
        exec(f"ens_past_tcp_{malware}_acc, ens_past_tcp_{malware}_miss = ens_past_tcp_{malware}_result.acc_multi, int((len(past_tcp_{malware}_df) * (1-ens_past_tcp_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['ens_past_tcp'] = ens_past_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['ens_past_tcp'] = ens_past_tcp_{malware}_miss")
        exec(f"t_df, ens_past_udp_{malware}_result= test_ensemble(models, malware_list, past_udp_{malware}_df, features, threshold=None)")
        exec(f"ens_past_udp_{malware}_acc, ens_past_udp_{malware}_miss = ens_past_udp_{malware}_result.acc_multi, int((len(past_udp_{malware}_df) * (1-ens_past_udp_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['ens_past_udp'] = ens_past_udp_{malware}_acc")
        exec(f"{malware}_results_miss['ens_past_udp'] = ens_past_udp_{malware}_miss")

    # FULL BINARY: results on past (adversarial)
    for malware in malware_list:
        exec(f"t_df, ens_past_tcp_adversarial_{malware}_result= test_ensemble(models, malware_list, past_tcp_adversarial_{malware}_df, features, threshold=None)")
        exec(f"ens_past_tcp_adversarial_{malware}_acc, ens_past_tcp_adversarial_{malware}_miss = ens_past_tcp_adversarial_{malware}_result.acc_multi, int((len(past_tcp_adversarial_{malware}_df) * (1-ens_past_tcp_adversarial_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['ens_past_tcp_adversarial'] = ens_past_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['ens_past_tcp_adversarial'] = ens_past_tcp_adversarial_{malware}_miss")
        exec(f"t_df, ens_past_udp_adversarial_{malware}_result= test_ensemble(models, malware_list, past_udp_adversarial_{malware}_df, features, threshold=None)")
        exec(f"ens_past_udp_adversarial_{malware}_acc, ens_past_udp_adversarial_{malware}_miss = ens_past_udp_adversarial_{malware}_result.acc_multi, int((len(past_udp_adversarial_{malware}_df) * (1-ens_past_udp_adversarial_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['ens_past_udp_adversarial'] = ens_past_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['ens_past_udp_adversarial'] = ens_past_udp_adversarial_{malware}_miss")



    # ENSEMBLE: results on future (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"t_df, ens_future_tcp_{malware}_result= test_ensemble(models, malware_list, future_tcp_{malware}_df, features, threshold=None)")
        exec(f"ens_future_tcp_{malware}_acc, ens_future_tcp_{malware}_miss = ens_future_tcp_{malware}_result.acc_multi, int((len(future_tcp_{malware}_df) * (1-ens_future_tcp_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['ens_future_tcp'] = ens_future_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['ens_future_tcp'] = ens_future_tcp_{malware}_miss")
        exec(f"t_df, ens_future_udp_{malware}_result= test_ensemble(models, malware_list, future_udp_{malware}_df, features, threshold=None)")
        exec(f"ens_future_udp_{malware}_acc, ens_future_udp_{malware}_miss = ens_future_udp_{malware}_result.acc_multi, int((len(future_udp_{malware}_df) * (1-ens_future_udp_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['ens_future_udp'] = ens_future_udp_{malware}_acc")
        exec(f"{malware}_results_miss['ens_future_udp'] = ens_future_udp_{malware}_miss")

    # FULL BINARY: results on future (adversarial)
    for malware in malware_list:
        exec(f"t_df, ens_future_tcp_adversarial_{malware}_result= test_ensemble(models, malware_list, future_tcp_adversarial_{malware}_df, features, threshold=None)")
        exec(f"ens_future_tcp_adversarial_{malware}_acc, ens_future_tcp_adversarial_{malware}_miss = ens_future_tcp_adversarial_{malware}_result.acc_multi, int((len(future_tcp_adversarial_{malware}_df) * (1-ens_future_tcp_adversarial_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['ens_future_tcp_adversarial'] = ens_future_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['ens_future_tcp_adversarial'] = ens_future_tcp_adversarial_{malware}_miss")
        exec(f"t_df, ens_future_udp_adversarial_{malware}_result= test_ensemble(models, malware_list, future_udp_adversarial_{malware}_df, features, threshold=None)")
        exec(f"ens_future_udp_adversarial_{malware}_acc, ens_future_udp_adversarial_{malware}_miss = ens_future_udp_adversarial_{malware}_result.acc_multi, int((len(future_udp_adversarial_{malware}_df) * (1-ens_future_udp_adversarial_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['ens_future_udp_adversarial'] = ens_future_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['ens_future_udp_adversarial'] = ens_future_udp_adversarial_{malware}_miss")

    #####################################################
    #####################################################
    #################   DISTILLATION  #######################
    #####################################################
    #####################################################


    # FULL BINARY - DISTILLED: focus on TEST data
    dfb_all_pred, dfb_all_acc, dfb_ben_miss = quick_results(dfbClf, all_test, features, threshold=threshold)

    pred, benign_results['d_fb_past'], miss  = quick_results(dfbClf, benign_test, features, threshold=threshold, output=False)

    dfb_past_acc = 0
    for malware in malware_list:
        exec(f"dfb_{malware}_pred, dfb_{malware}_acc, dfb_{malware}_miss = quick_results(dfbClf, {malware}_test, features, threshold=threshold)")
        exec(f"dfb_past_acc += dfb_{malware}_acc")

    dfb_past_acc = dfb_past_acc / len(malware_list)
    uniform_malicious_results['d_fb_past'] = dfb_past_acc

    # FULL BINARY (distilled): results on past (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"d_fb_past_tcp_{malware}_pred, d_fb_past_tcp_{malware}_acc, d_fb_past_tcp_{malware}_miss = quick_results(dfbClf, past_tcp_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_fb_past_tcp'] = d_fb_past_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['d_fb_past_tcp'] = d_fb_past_tcp_{malware}_miss")
        exec(f"d_fb_past_udp_{malware}_pred, d_fb_past_udp_{malware}_acc, d_fb_past_udp_{malware}_miss = quick_results(dfbClf, past_udp_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_fb_past_udp'] = d_fb_past_udp_{malware}_acc")
        exec(f"{malware}_results_miss['d_fb_past_udp'] = d_fb_past_udp_{malware}_miss")

    # FULL BINARY: results on past (adversarial)
    for malware in malware_list:
        exec(f"d_fb_past_tcp_adversarial_{malware}_pred, d_fb_past_tcp_adversarial_{malware}_acc, d_fb_past_tcp_adversarial_{malware}_miss = quick_results(dfbClf, past_tcp_adversarial_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_fb_past_tcp_adversarial'] = d_fb_past_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['d_fb_past_tcp_adversarial'] = d_fb_past_tcp_adversarial_{malware}_miss")
        exec(f"d_fb_past_udp_adversarial_{malware}_pred, d_fb_past_udp_adversarial_{malware}_acc, d_fb_past_udp_adversarial_{malware}_miss = quick_results(dfbClf, past_udp_adversarial_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_fb_past_udp_adversarial'] = d_fb_past_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['d_fb_past_udp_adversarial'] = d_fb_past_udp_adversarial_{malware}_miss")


    # FULL BINARY - Distilled: detailed results on FUTURE (no adversarial)
    dfb_future_ben_pred, dfb_future_ben_acc, dfb_future_ben_miss = quick_results(dfbClf, future_benign_df, features, threshold=threshold)

    benign_results['d_fb_future']  = dfb_future_ben_acc


    dfb_future_acc = 0
    for malware in malware_list:
        exec(f"dfb_future_{malware}_pred, dfb_future_{malware}_acc, dfb_future_{malware}_miss = quick_results(dfbClf, future_{malware}_df, features, threshold=threshold)")
        exec(f"dfb_future_acc += dfb_future_{malware}_acc")

    dfb_future_acc = dfb_future_acc / len(malware_list)
    uniform_malicious_results['d_fb_future'] = dfb_future_acc

    # FULL BINARY (distilled): results on future (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"d_fb_future_tcp_{malware}_pred, d_fb_future_tcp_{malware}_acc, d_fb_future_tcp_{malware}_miss = quick_results(dfbClf, future_tcp_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_fb_future_tcp'] = d_fb_future_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['d_fb_future_tcp'] = d_fb_future_tcp_{malware}_miss")
        exec(f"d_fb_future_udp_{malware}_pred, d_fb_future_udp_{malware}_acc, d_fb_future_udp_{malware}_miss = quick_results(dfbClf, future_udp_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_fb_future_udp'] = d_fb_future_udp_{malware}_acc")
        exec(f"{malware}_results_miss['d_fb_future_udp'] = d_fb_future_udp_{malware}_miss")

    # FULL BINARY: results on future (adversarial)
    for malware in malware_list:
        exec(f"d_fb_future_tcp_adversarial_{malware}_pred, d_fb_future_tcp_adversarial_{malware}_acc, d_fb_future_tcp_adversarial_{malware}_miss = quick_results(dfbClf, future_tcp_adversarial_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_fb_future_tcp_adversarial'] = d_fb_future_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['d_fb_future_tcp_adversarial'] = d_fb_future_tcp_adversarial_{malware}_miss")
        exec(f"d_fb_future_udp_adversarial_{malware}_pred, d_fb_future_udp_adversarial_{malware}_acc, d_fb_future_udp_adversarial_{malware}_miss = quick_results(dfbClf, future_udp_adversarial_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_fb_future_udp_adversarial'] = d_fb_future_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['d_fb_future_udp_adversarial'] = d_fb_future_udp_adversarial_{malware}_miss")

    #####################################################
    #####################################################
    #################   ENSEMBLE  #######################
    #################   DISTILLATION  #######################
    #####################################################
    #####################################################

    # Individual classifiers (distilled) on past data

    # generating results
    for malware in malware_list:
        exec(f"pred, benign_results['d_{malware}_past'], miss = quick_results({malware}Reg, benign_test, features, threshold=threshold, output=False)")
        exec(f"pred, benign_results['d_{malware}_future'], miss = quick_results({malware}Reg, future_benign_df, features, threshold=threshold, output=False)")
        exec(f"pred, uniform_malicious_results['d_{malware}_past'], miss = quick_results({malware}Reg, {malware}_test, features, threshold=threshold, output=False)")
        exec(f"pred, uniform_malicious_results['d_{malware}_future'], miss = quick_results({malware}Reg, future_{malware}_df, features, threshold=threshold, output=False)")


    # INDIVIDUAL CLASSIFIERS (distilled): results on past (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"d_{malware}_past_tcp_{malware}_pred, d_{malware}_past_tcp_{malware}_acc, d_{malware}_past_tcp_{malware}_miss = quick_results({malware}Reg, past_tcp_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_{malware}_past_tcp'] = d_{malware}_past_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['d_{malware}_past_tcp'] = d_{malware}_past_tcp_{malware}_miss")
        exec(f"d_{malware}_past_udp_{malware}_pred, d_{malware}_past_udp_{malware}_acc, d_{malware}_past_udp_{malware}_miss = quick_results({malware}Reg, past_udp_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_{malware}_past_udp'] = d_{malware}_past_udp_{malware}_acc")
        exec(f"{malware}_results_miss['d_{malware}_past_udp'] = d_{malware}_past_udp_{malware}_miss")

    # INDIVIDUAL CLASSIFIERS (distilled): results on past (adversarial)
    for malware in malware_list:
        exec(f"d_{malware}_past_tcp_adversarial_{malware}_pred, d_{malware}_past_tcp_adversarial_{malware}_acc, d_{malware}_past_tcp_adversarial_{malware}_miss = quick_results({malware}Reg, past_tcp_adversarial_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_{malware}_past_tcp_adversarial'] = d_{malware}_past_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['d_{malware}_past_tcp_adversarial'] = d_{malware}_past_tcp_adversarial_{malware}_miss")
        exec(f"d_{malware}_past_udp_adversarial_{malware}_pred, d_{malware}_past_udp_adversarial_{malware}_acc, d_{malware}_past_udp_adversarial_{malware}_miss = quick_results({malware}Reg, past_udp_adversarial_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_{malware}_past_udp_adversarial'] = d_{malware}_past_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['d_{malware}_past_udp_adversarial'] = d_{malware}_past_udp_adversarial_{malware}_miss")

    # INDIVIDUAL CLASSIFIERS (distilled) - future data (no adversarial)
    for malware in malware_list:
        exec(f"temp_test = pd.concat([future_benign_df, future_{malware}_df])")
        exec(f"d_{malware}_future_pred, d_{malware}_future_acc, d_{malware}_future_miss = quick_results({malware}Reg, future_{malware}_df, features, threshold=threshold, output=False)")
        exec(f"print('Recall: {{}}'.format(d_{malware}_future_acc))")
        exec(f"d_{malware}_future_pred, d_{malware}_future_acc, d_{malware}_future_miss = quick_results({malware}Reg, temp_test, features, threshold=threshold, output=True)")

    # INDIVIDUAL CLASSIFIERS: results on future (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"d_{malware}_future_tcp_{malware}_pred, d_{malware}_future_tcp_{malware}_acc, d_{malware}_future_tcp_{malware}_miss = quick_results({malware}Reg, future_tcp_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_{malware}_future_tcp'] = d_{malware}_future_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['d_{malware}_future_tcp'] = d_{malware}_future_tcp_{malware}_miss")
        exec(f"d_{malware}_future_udp_{malware}_pred, d_{malware}_future_udp_{malware}_acc, d_{malware}_future_udp_{malware}_miss = quick_results({malware}Reg, future_udp_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_{malware}_future_udp'] = d_{malware}_future_udp_{malware}_acc")
        exec(f"{malware}_results_miss['d_{malware}_future_udp'] = d_{malware}_future_udp_{malware}_miss")

    # INDIVIDUAL CLASSIFIERS: results on future (adversarial)
    for malware in malware_list:
        exec(f"d_{malware}_future_tcp_adversarial_{malware}_pred, d_{malware}_future_tcp_adversarial_{malware}_acc, d_{malware}_future_tcp_adversarial_{malware}_miss = quick_results({malware}Reg, future_tcp_adversarial_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_{malware}_future_tcp_adversarial'] = d_{malware}_future_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['d_{malware}_future_tcp_adversarial'] = d_{malware}_future_tcp_adversarial_{malware}_miss")
        exec(f"d_{malware}_future_udp_adversarial_{malware}_pred, d_{malware}_future_udp_adversarial_{malware}_acc, d_{malware}_future_udp_adversarial_{malware}_miss = quick_results({malware}Reg, future_udp_adversarial_{malware}_df, features, output=show, threshold=threshold)")
        exec(f"{malware}_results_rec['d_{malware}_future_udp_adversarial'] = d_{malware}_future_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['d_{malware}_future_udp_adversarial'] = d_{malware}_future_udp_adversarial_{malware}_miss")

    #### ENSEMBLE (distilled) ######


    #aggregate results
    d_past_benignEns_df, d_past_benignEns_result = test_ensemble(d_models, malware_list, benign_test, features, threshold=threshold)
    d_future_benignEns_df, d_future_benignEns_result = test_ensemble(d_models, malware_list, future_benign_df, features, threshold=threshold)
    benign_results['d_ens_past'] = d_past_benignEns_result.acc_multi
    benign_results['d_ens_future'] = d_future_benignEns_result.acc_multi


    d_ens_past_acc = 0
    d_ens_future_acc = 0
    for malware in malware_list:
        exec(f"d_{malware}Ens_df, d_{malware}Ens_result = test_ensemble(d_models, malware_list, {malware}_test, features, threshold=threshold)")
        exec(f"d_ens_past_acc += d_{malware}Ens_result.acc_multi")
        exec(f"d_future_{malware}Ens_df, d_future_{malware}Ens_result = test_ensemble(d_models, malware_list, future_{malware}_df, features, threshold=threshold)")
        exec(f"d_ens_future_acc += d_future_{malware}Ens_result.acc_multi")

    uniform_malicious_results['d_ens_past'] = d_ens_past_acc / len(malware_list)
    uniform_malicious_results['d_ens_future'] = d_ens_future_acc / len(malware_list)

    # ENSEMBLE (distilled): results on past (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"t_df, d_ens_past_tcp_{malware}_result= test_ensemble(d_models, malware_list, past_tcp_{malware}_df, features, threshold=threshold)")
        exec(f"d_ens_past_tcp_{malware}_acc, d_ens_past_tcp_{malware}_miss = d_ens_past_tcp_{malware}_result.acc_multi, int((len(past_tcp_{malware}_df) * (1-d_ens_past_tcp_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['d_ens_past_tcp'] = d_ens_past_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['d_ens_past_tcp'] = d_ens_past_tcp_{malware}_miss")
        exec(f"t_df, d_ens_past_udp_{malware}_result= test_ensemble(d_models, malware_list, past_udp_{malware}_df, features, threshold=threshold)")
        exec(f"d_ens_past_udp_{malware}_acc, d_ens_past_udp_{malware}_miss = d_ens_past_udp_{malware}_result.acc_multi, int((len(past_udp_{malware}_df) * (1-d_ens_past_udp_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['d_ens_past_udp'] = d_ens_past_udp_{malware}_acc")
        exec(f"{malware}_results_miss['d_ens_past_udp'] = d_ens_past_udp_{malware}_miss")

    # ENSEMBLE (distilled): results on past (adversarial)
    for malware in malware_list:
        exec(f"t_df, d_ens_past_tcp_adversarial_{malware}_result= test_ensemble(d_models, malware_list, past_tcp_adversarial_{malware}_df, features, threshold=threshold)")
        exec(f"d_ens_past_tcp_adversarial_{malware}_acc, d_ens_past_tcp_adversarial_{malware}_miss = d_ens_past_tcp_adversarial_{malware}_result.acc_multi, int((len(past_tcp_adversarial_{malware}_df) * (1-d_ens_past_tcp_adversarial_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['d_ens_past_tcp_adversarial'] = d_ens_past_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['d_ens_past_tcp_adversarial'] = d_ens_past_tcp_adversarial_{malware}_miss")
        exec(f"t_df, d_ens_past_udp_adversarial_{malware}_result= test_ensemble(d_models, malware_list, past_udp_adversarial_{malware}_df, features, threshold=threshold)")
        exec(f"d_ens_past_udp_adversarial_{malware}_acc, d_ens_past_udp_adversarial_{malware}_miss = d_ens_past_udp_adversarial_{malware}_result.acc_multi, int((len(past_udp_adversarial_{malware}_df) * (1-d_ens_past_udp_adversarial_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['d_ens_past_udp_adversarial'] = d_ens_past_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['d_ens_past_udp_adversarial'] = d_ens_past_udp_adversarial_{malware}_miss")


    # ENSEMBLE (distilled) - future data (no adversarial)
    print("Benign Accuracy: {:5f}".format(d_future_benignEns_result.acc_multi))
    display(d_future_benignEns_result.ctab)

    for malware in malware_list:
        exec(f"print('{{}} Recall: {{:5f}}'.format('{malware}', d_future_{malware}Ens_result.acc_multi))")
        exec(f"display(d_future_{malware}Ens_result.ctab)")


    # ENSEMBLE (distilled): results on future (non adversarial)
    show = False
    for malware in malware_list:
        exec(f"t_df, d_ens_future_tcp_{malware}_result= test_ensemble(d_models, malware_list, future_tcp_{malware}_df, features, threshold=threshold)")
        exec(f"d_ens_future_tcp_{malware}_acc, d_ens_future_tcp_{malware}_miss = d_ens_future_tcp_{malware}_result.acc_multi, int((len(future_tcp_{malware}_df) * (1-d_ens_future_tcp_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['d_ens_future_tcp'] = d_ens_future_tcp_{malware}_acc")
        exec(f"{malware}_results_miss['d_ens_future_tcp'] = d_ens_future_tcp_{malware}_miss")
        exec(f"t_df, d_ens_future_udp_{malware}_result= test_ensemble(d_models, malware_list, future_udp_{malware}_df, features, threshold=threshold)")
        exec(f"d_ens_future_udp_{malware}_acc, d_ens_future_udp_{malware}_miss = d_ens_future_udp_{malware}_result.acc_multi, int((len(future_udp_{malware}_df) * (1-d_ens_future_udp_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['d_ens_future_udp'] = d_ens_future_udp_{malware}_acc")
        exec(f"{malware}_results_miss['d_ens_future_udp'] = d_ens_future_udp_{malware}_miss")

    # ENSEMBLE (distilled): results on future (adversarial)
    for malware in malware_list:
        exec(f"t_df, d_ens_future_tcp_adversarial_{malware}_result= test_ensemble(d_models, malware_list, future_tcp_adversarial_{malware}_df, features, threshold=threshold)")
        exec(f"d_ens_future_tcp_adversarial_{malware}_acc, d_ens_future_tcp_adversarial_{malware}_miss = d_ens_future_tcp_adversarial_{malware}_result.acc_multi, int((len(future_tcp_adversarial_{malware}_df) * (1-d_ens_future_tcp_adversarial_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['d_ens_future_tcp_adversarial'] = d_ens_future_tcp_adversarial_{malware}_acc")    
        exec(f"{malware}_results_miss['d_ens_future_tcp_adversarial'] = d_ens_future_tcp_adversarial_{malware}_miss")
        exec(f"t_df, d_ens_future_udp_adversarial_{malware}_result= test_ensemble(d_models, malware_list, future_udp_adversarial_{malware}_df, features, threshold=threshold)")
        exec(f"d_ens_future_udp_adversarial_{malware}_acc, d_ens_future_udp_adversarial_{malware}_miss = d_ens_future_udp_adversarial_{malware}_result.acc_multi, int((len(future_udp_adversarial_{malware}_df) * (1-d_ens_future_udp_adversarial_{malware}_result.acc_multi))) ")
        exec(f"{malware}_results_rec['d_ens_future_udp_adversarial'] = d_ens_future_udp_adversarial_{malware}_acc")
        exec(f"{malware}_results_miss['d_ens_future_udp_adversarial'] = d_ens_future_udp_adversarial_{malware}_miss")


    all_results = dict()
    all_results['benign'] = benign_results
    all_results['malicious_uniform'] = uniform_malicious_results
    for malware in malware_list:
        exec(f"all_results['{malware}']=dict()")
        exec(f"all_results['{malware}']['recall']={malware}_results_rec")
        exec(f"all_results['{malware}']['miss']={malware}_results_miss")

    def save_results(results_dict_input, output_file, malware_list):
        import copy
        import os
        import json

        # converting to list
        results_dict = copy.deepcopy(results_dict_input)
        for key in results_dict['benign']:
            results_dict['benign'][key] = [results_dict['benign'][key]]
        for key in results_dict['malicious_uniform']:
            results_dict['malicious_uniform'][key] = [results_dict['malicious_uniform'][key]]
        for malware in malware_list:
            for key in results_dict[malware]['recall']:
                results_dict[malware]['recall'][key] = [results_dict[malware]['recall'][key]]
            for key in results_dict[malware]['miss']:
                results_dict[malware]['miss'][key] = [results_dict[malware]['miss'][key]]
        if os.path.exists(output_file):
            # if file exists, read content and append new values
            with open(output_file, 'r') as f:    
                data = json.load(f)
            for key in data['benign']:
                data['benign'][key] = data['benign'][key] + results_dict['benign'][key]
            for key in data['malicious_uniform']:
                data['malicious_uniform'][key] = data['malicious_uniform'][key] + results_dict['malicious_uniform'][key]

            for malware in malware_list:
                for key in data[malware]['recall']:
                    data[malware]['recall'][key] = data[malware]['recall'][key] + results_dict[malware]['recall'][key]
                for key in results_dict[malware]['miss']:
                    data[malware]['miss'][key] = data[malware]['miss'][key] + results_dict[malware]['miss'][key]


            with open(output_file, 'w') as f:    
                f.write(json.dumps(data))
            return "Overwritten File: {}".format(output_file)
        else:
            # otherwise, create the file
            with open(output_file, 'w') as f:    
                f.write(json.dumps(results_dict))
            return "Created new File: {}".format(output_file)
        return True
    
    end = datetime.datetime.now()
    print("Start:\t{}\nEnd:\t{}".format(begin, end))
    save_results(all_results, output_file, malware_list)
    

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Reading MALICIOUS (startime: 2024-02-21 02:16:23.673600)...
42591 entries for 2011_08_10_neris1.csv (final entries: 42591)
	UDP: 29339	TCP: 12516
22949 entries for 2011_08_11_neris2.csv (final entries: 22949)
	UDP: 913	TCP: 21628
188714 entries for 2011_08_17_neris3.csv (final entries: 188714)
	UDP: 109085	TCP: 77623
39063 entries for 2011_08_12_rbot1.csv (final entries: 39063)
	UDP: 625	TCP: 29838
884 entries for 2011_08_15_rbot2.csv (final entries: 884)
	UDP: 676	TCP: 10
8737 entries for 2011_08_18_rbot3.csv (final entries: 8737)
	UDP: 32	TCP: 13
944 entries for 2011_08_15_virut1.csv (final entries: 944)
	UDP: 61	TCP: 857
41424 entries for 2011_08_16_virut2.csv (final entries: 41424)
	UDP: 7661	TCP: 32805
Reading ADVERSARIAL...
42596 entries for 2011_08_10_neris1-Ts.csv (final entries: 42596)
42591 entries for 2011_08_10_neris1-Us.csv (final entries: 42591)
22981 entries for 2011_08_11_neris2-Ts.csv (final entries: 22981)
22949 entries for 2011_08_11_neris2-Us.csv (final entries: 229

Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,431780,6
1,124,19364


Training and testing rbot......done! Training time: 43.654594s
rbot Recall: 0.998645


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,431786,0
1,16,11790


Training and testing virut......done! Training time: 30.646892s
virut Recall: 0.968379


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,431786,0
1,8,245


Training distillation...
...done! Training time: 552.158006s
Training distillation...
...done! Training time: 515.719126s
neris Recall: 0.992457


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,431741,45
1,147,19341


Training distillation...
...done! Training time: 637.999433s
rbot Recall: 0.997798


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,431780,6
1,26,11780


Training distillation...
...done! Training time: 507.095199s
virut Recall: 0.932806


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,431783,3
1,17,236


Accuracy: 0.999981, Missclassifications: 8


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,431778,8


Accuracy: 0.993637, Missclassifications: 124


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,124,19364


Accuracy: 0.999238, Missclassifications: 10


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,9,11797


Accuracy: 0.996047, Missclassifications: 1


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,252


109085
32
7661
Accuracy: 0.988276, Missclassifications: 1838


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,154847,1837


Accuracy: 0.980410, Missclassifications: 3697


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3697,185017


Accuracy: 0.032734, Missclassifications: 8451


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,8451,286


Accuracy: 0.996982, Missclassifications: 126


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,125,41299


Recall: 0.7217376559237788
Accuracy: 0.847955, Missclassifications: 52516


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,156680,4
1,52512,136202


Recall: 0.0293006752890008
Accuracy: 0.948731, Missclassifications: 8482


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,156684,0
1,8481,256


Recall: 0.8544080726149093
Accuracy: 0.969557, Missclassifications: 6031


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,156684,0
1,6031,35393


neris
rbot
virut
Accuracy: 0.999536, Missclassifications: 216


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,431742,44
1,171,31376


Accuracy: 0.991841, Missclassifications: 159


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,159,19329


Accuracy: 0.998984, Missclassifications: 13


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,12,11794


Accuracy: 1.000000, Missclassifications: 0


Pred,1.0
True,Unnamed: 1_level_1
1,253


Accuracy: 0.982927, Missclassifications: 2675


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,154009,2675


Accuracy: 0.847075, Missclassifications: 28860


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,28859,159855


Accuracy: 0.029301, Missclassifications: 8481


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,8481,256


Accuracy: 0.995631, Missclassifications: 182


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
1,181,41243


Recall: 0.16414256493953813
Accuracy: 0.541335, Missclassifications: 158422


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,156000,684
1,157738,30976


Recall: 0.024836900537942085
Accuracy: 0.948453, Missclassifications: 8528


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,156677,7
1,8520,217


Recall: 0.7709298957126304
Accuracy: 0.952062, Missclassifications: 9497


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,156676,8
1,9489,31935


Benign Accuracy: 0.995571


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,155990,694


neris Recall: 0.314630


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
True,129339,59375


rbot Recall: 0.087559


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
True,7972,765


virut Recall: 0.933300


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
True,2763,38661


100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [41:10<00:00, 2470.38s/it]

Start:	2024-02-21 02:16:23.673600
End:	2024-02-21 02:57:34.050569



