In [1]:
import os
import sys
from supportFunctions import *

root_folder = "..\\data\\CTU13\\flows\\"

In [2]:
#### PARAMETERS #####

## Generic parameters
temporal = False # used to determine if this evaluation assumes a "temporal" dependency among its samples
base_clf = 'dt' # name of the classifier used for this "run". Available names: ['dt', 'rf', 'hgb', 'lr']. You can add more by editing the supportFunctions file
test_size = 0.2 # proportion of the dataset used for testing. We always kept it fixed to 0.2 for our paper
train_size = 100 # proportion of the REMAINING data that are used for training (if >1, then it will take that exact amount). To reproduce the results of the paper, use: 100 (for "limited" training data) or 0.2 or 0.5 or 0.99 (for scarce, moderate, abundant training data, respectively) 
agreement = 0.5 # from 0 to 1. Proportion of classifiers that must agree on an attack (for the ensemble). This is fixed in our paper.
max_size = 500000 ## maximum amount of samples to include when creating the initial dataframes. This is fixed in our paper
max_size_atk = int(max_size / 3) # maximum amount of malicious samples per class. This is fixed in our paper

## Adversarial Attacks parameters
atk_intensity = 1 # 10 packets of 100B each over 10 seconds. The response is of one pkt of 1 byte
byt_intensity = atk_intensity * 102 # 
pkt_intensity = atk_intensity * 10 # 
dur_intensity = atk_intensity * 10 # consider duration in SECONDS

In [3]:
### Reading input data

malicious_folder = root_folder + "malicious/"

benign_file = root_folder + "benign.csv"
benign_df = pd.read_csv(benign_file, header='infer', index_col=0)
benign_df = benign_df.sample(min(max_size, len(benign_df)))
#sort by timestamp
if temporal == True:
    pass # it should be already sorted
    
benign_df.reset_index(inplace=True, drop=True)

attack_names = ["neris", "rbot", "nsis", "virut", "donbot", "murlo"]

neris_file = malicious_folder + "neris.csv"
rbot_file = malicious_folder + "rbot.csv"
nsis_file = malicious_folder + "nsis.csv"
virut_file = malicious_folder + "virut.csv"
donbot_file = malicious_folder + "donbot.csv"
sogou_file = malicious_folder + "sogou.csv" #only 63 samples: discard
murlo_file = malicious_folder + "murlo.csv"



for a in attack_names:
    exec(f"{a}_df = pd.read_csv({a}_file, header='infer', index_col=0)")
    exec(f"{a}_df = {a}_df.sample(min(max_size_atk, len({a}_df)))")
    # sort by timestamp
    if temporal == True:
        pass # it should be already sorted
    exec(f"{a}_df.reset_index(inplace=True, drop=True)")
    exec(f"{a}_df['Label'] = a")

In [4]:
# Determining Train and Test sets for each class

df_list = [benign_df]
for a in attack_names:
    exec(f"df_list.append({a}_df)")

if temporal == True:
    for dummy_df in df_list:
        if train_size <=1:
            train_threshold = int(((1-test_size) * train_size) * len(dummy_df))
        else:
            train_threshold = int(100)
        test_threshold = len(dummy_df) - int(test_size * len(dummy_df))
        dummy_df['index'] = dummy_df.index
        dummy_df['is_test'] = np.where(dummy_df['index'] >= test_threshold , True, False)
        dummy_df['is_train'] = np.where(dummy_df['index'] <= train_threshold , True, False)
else:
    for dummy_df in df_list:
        if train_size <= 1:
            train_threshold = test_size + (1-test_size)*train_size
        else:
            train_threshold = test_size + ((train_size * 100) / (len(dummy_df)) / 100)       
        dummy_df['seed'] = (np.random.uniform(0,1,len(dummy_df)))
        dummy_df['is_test'] = np.where(dummy_df['seed'] <= test_size, True, False)
        dummy_df['is_train'] = np.where((dummy_df['seed'] <= train_threshold) & (dummy_df['is_test']==False), True, False)

# get all together
all_df = pd.concat(df_list)

In [6]:
def handle_categorical(df):
    ## Handling categorical data
    df_dummy = df.copy(deep=True)
    df_dummy['Nature'] = np.where(df_dummy['Label'].str.contains('BENIGN'),0,1)
    
    for column_name in df_dummy.columns:
        if column_name == ('SrcPort_type'):
            df_dummy[column_name+"-f"] = pd.factorize(df_dummy[column_name])[0]
        elif column_name == ('DstPort_type'):
            df_dummy[column_name+"-f"] = pd.factorize(df_dummy[column_name])[0]
        elif column_name == ('Proto'):
            df_dummy[column_name+"-f"] = pd.factorize(df_dummy[column_name])[0]
        elif column_name == ('State'):
            df_dummy[column_name+"-f"] = pd.factorize(df_dummy[column_name])[0]
        else:
            pass
    return df_dummy

all_df = handle_categorical(all_df)
all_df['Label_cat'] = pd.factorize(all_df['Label'])[0]

all_df['int2int'] = np.where( ((all_df['SrcIP_internal']==True) & (all_df['DstIP_internal']==True)), True, False)

all_train, all_test = all_df[all_df['is_train']==True], all_df[all_df['is_test']==True]

### SPLITTING ALL BACK ####
benign_df = all_df[all_df['Label']=='BENIGN']
benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]

for a in attack_names:
    exec(f"{a}_df = all_df[all_df['Label']=='{a}']")
    
malicious_df = all_df[all_df['Label']!='BENIGN']
malicious_train, malicious_test = malicious_df[malicious_df['is_train']==True], malicious_df[malicious_df['is_test']==True]

print("& 0 & \\textit{{Benign}} & {} & \\\\ \\cline{{2-4}}".format(len(benign_df)))


for i,a in enumerate(attack_names):
    exec(f"print('& {i+1} & \\\\textit{{{{{a}}}}} & {{}} \\\\\\\\ \\\\cline{{{{2-4}}}}'.format(len({a}_df)))")

& 0 & \textit{Benign} & 500000 & \\ \cline{2-4}
& 1 & \textit{neris} & 166666 \\ \cline{2-4}
& 2 & \textit{rbot} & 143918 \\ \cline{2-4}
& 3 & \textit{nsis} & 2168 \\ \cline{2-4}
& 4 & \textit{virut} & 40904 \\ \cline{2-4}
& 5 & \textit{donbot} & 4630 \\ \cline{2-4}
& 6 & \textit{murlo} & 6127 \\ \cline{2-4}


In [7]:
## Feature sets

# the following is the "complete" feature set

features = ['sTos', 'dTos', 'SrcWin', 'DstWin',
       'sHops', 'dHops', 'sTtl', 'dTtl', 'TcpRtt',
       'SynAck', 'AckDat', 'SrcPkts', 'DstPkts', 'SrcBytes', 'DstBytes',
       'SAppBytes', 'DAppBytes', 'Dur', 'TotPkts', 'TotBytes', 'TotAppByte',
       'Rate', 'SrcRate', 'DstRate', 'DstIP_internal',
       'SrcIP_internal', 'Proto-f',
       'State-f', 'SrcPort_type-f', 'DstPort_type-f'
       ]

# this is for the "essential" feature set
small_features = ['sTos', 'dTos',       
       'SrcPkts', 'DstPkts', 'SrcBytes', 'DstBytes',
       'Dur', 'TotPkts', 'TotBytes',
        'Proto-f', 
       'State-f', 'SrcPort_type-f', 'DstPort_type-f']

In [8]:
# creating adversarial dataset
mal_base = malicious_df[((malicious_df['Proto']=='udp')) & (malicious_df['is_test']==True)]
print(len(mal_base))
mal_adv = mal_base.copy(deep=True)
# support

# attacking
max_dur = mal_adv['Dur'].max()
min_dur = mal_adv['Dur'].min()
mal_adv['Dur'] = mal_adv['Dur'] + dur_intensity
mal_adv['Dur'] = np.where(mal_adv['Dur'] > max_dur, max_dur, mal_adv['Dur'])
mal_adv['Dur'] = np.where(mal_adv['Dur'] < min_dur, min_dur, mal_adv['Dur'])

mal_adv['SrcBytes'] = mal_adv['SrcBytes'] + byt_intensity
mal_adv['SrcPkts'] = mal_adv['SrcPkts'] + pkt_intensity
mal_adv['DstBytes'] = mal_adv['DstBytes'] + byt_intensity / 1024
mal_adv['DstPkts'] = mal_adv['DstPkts'] + pkt_intensity

mal_adv['TotPkts'] = mal_adv['SrcPkts'] + mal_adv['DstPkts']
mal_adv['TotBytes'] = mal_adv['SrcBytes'] + mal_adv['DstBytes']

19694


# FROM NOW ON, THE CODE IS ALWAYS THE SAME FOR EVERY DATASET!!!!!!

# Baseline: Assessment on "Complete" feature set

## BINARY CLASSIFIER (Complete features)

In [9]:
bClf, bPred, bResult = develop_clf(all_train, all_test, features, clf_name='bin', label='Nature', clf_type=base_clf, verbose=1)

if (bResult.acc == 0):
    bErr = int(len(all_test) * (1-bResult.acc_multi))
else:
    bErr = int(len(all_test) * (1-bResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(bErr, len(all_test), bResult.rec, bResult.fpr))
pd.crosstab(all_test['Nature'], bPred, rownames=['True'], colnames=['Pred'])

Training and testing bin......done! Training time: 0.006977s	Inference time: 0.034920s
Total Misclassifications: 7046 out of 173220 (Recall: 0.991456	FPR: 0.064165)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93649,6421
1,625,72525


## MULTI-CLASS CLASSIFIER - cascade

In [10]:
# This is the classifier that analyzes ONLY the malicious samples that "receives" from the initial binary classifier
# It is trained on the same training set---but without using the benign samples
# It is tested on the malicious samples in the test set that are flagged as malicious by the binary classifier


mcClf, mcPred, mcResult = develop_clf(malicious_train, malicious_test, features, clf_name='mc', label='Label_cat', clf_type=base_clf, verbose=1)
mcErr = int(len(malicious_test) * (1-mcResult.acc_multi))
print("Total Misclassifications: {} out of {}".format(mcErr, len(malicious_test)))
pd.crosstab(malicious_test['Label_cat'], mcPred, rownames=['True'], colnames=['Pred'])

Training and testing mc......done! Training time: 0.005978s	Inference time: 0.019933s
Total Misclassifications: 22153 out of 73150


Pred,1,2,3,4,5,6
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,17739,1051,3058,6433,3258,2058
2,944,25485,68,1356,806,191
3,23,0,353,16,0,18
4,1583,225,266,5710,289,71
5,52,39,2,55,679,114
6,25,0,39,9,105,1030


In [11]:
# We select the samples flagged as malicious by the initial classifier.
# Of course, samples flagged as malicious that are NOT actually malicious will always be misclassified

all_test['bPred'] = bPred
mc_test = all_test[(all_test['bPred']==1) & (all_test['Nature']==1)]
if (len(mc_test)==0):
    # in this case, this classifier receives nothing
    print("There is no malicious sample flagged as malicious to analyze!")

mcPred_m = mcClf.predict(mc_test[features])
mcResult.acc_multic = accuracy_score(mc_test['Label_cat'], mcPred_m, normalize=True, sample_weight=None)
mcErr_m = int((1-mcResult.acc_multic) * len(mc_test))
print("Total Misclassifications (among the malicious samples): {} out of {}".format(mcErr_m, len(mc_test)))




pd.crosstab(mc_test['Label_cat'], mcPred_m, rownames=['True'], colnames=['Pred'])

Total Misclassifications (among the malicious samples): 21856 out of 72525


Pred,1,2,3,4,5,6
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,17514,1051,2918,6398,3249,2023
2,944,25416,59,1351,804,191
3,18,0,337,12,0,15
4,1560,225,241,5694,288,69
5,52,39,2,55,678,114
6,25,0,39,9,105,1030


In [12]:
## Note: We also accounted for the false positives of the first binary classifier (all of which have been considered as misclassifications)
bin_falsePositives = int(bResult.fpr * len(benign_test))
print("This classifier also analyzed {} benign samples that were incorrectly labelled as 'malicious' by the binary classifier".format(bin_falsePositives))

print("Hence, this classifier was tested on {} samples, of which {} have been misclassified".format(len(mc_test)+bin_falsePositives, bin_falsePositives+mcErr_m))

This classifier also analyzed 6421 benign samples that were incorrectly labelled as 'malicious' by the binary classifier
Hence, this classifier was tested on 78946 samples, of which 28277 have been misclassified


## MULTI-CLASS CLASSIFIER - stand-alone

In [13]:
# We first assess its multiclassification performance, and then its binary classification performance

mClf, mPred, mResult = develop_clf(all_train, all_test, features, clf_name='m', label='Label_cat', clf_type=base_clf, verbose=1)
mErr = int(len(all_test) * (1-mResult.acc_multi))
print("Total Misclassifications: {} out of {}".format(mErr, len(all_test)))
mResult.ctab

Training and testing m......done! Training time: 0.007973s	Inference time: 0.040874s
Total Misclassifications: 29816 out of 173220


Pred,0,1,2,3,4,5,6
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,92819,936,450,2993,1067,1274,531
1,418,17212,1073,2761,6522,3560,2051
2,53,917,25466,141,1266,870,137
3,23,30,0,321,20,0,16
4,28,1370,224,299,5851,322,50
5,0,52,31,2,48,700,108
6,3,25,0,32,8,105,1035


In [14]:
# For the binary classification performance, we use the previous predictions
mPred_bin = np.copy(mPred)
mPred_bin[mPred_bin > 0] = 1
mResult.bin_results(all_test['Nature'], mPred_bin)
mErr_bin = int(len(all_test) * (1-mResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(mErr_bin, len(all_test), mResult.rec, mResult.fpr))
mResult.ctab_bin

Total Misclassifications: 7775 out of 173220 (Recall: 0.992823	FPR: 0.072459)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,92819,7251
1,525,72625


## ENSEMBLE CLASSIFIERS

### Training "individual" binary classifiers

In [15]:
ensemble_df = pd.DataFrame()

benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]


ens_time = 0
ens_avgFPR = 0
tot_TP = 0
tot_P = 0
ens_infer_time = 0
fakeEns_infer_time = 0

for a in attack_names:
    exec(f"{a}_train = {a}_df[{a}_df['is_train']==True]")
    exec(f"{a}_test = {a}_df[{a}_df['is_test']==True]")

    exec(f"train = pd.concat([benign_train, {a}_train])")
    exec(f"test = pd.concat([benign_test, {a}_test])")
    
    # We first train a classifier only on "benign" or on malicious samples of a specific attack. 
    # Afterwards, we immediately test it on a test-set having ONLY malicious samples of this specific attack
    # Note: such "testing" is redundant, because it assumes that the classifier only receives the samples of the attack it is trained on!
    exec(f"{a}Clf, {a}Pred, {a}Result = develop_clf(train, test, features, clf_name='{a}', clf_type=base_clf, verbose=1)")
    exec(f"fakeEns_infer_time += {a}Result.infer_time")
    # We now test the specific classifier on the ENTIRE test-set, thereby allowing to assess its performance also against malicious samples of different attacks
    exec(f"{a}_allPred, {a}_allResults, {a}Result.infer_time = evaluate_clf({a}Clf, all_test, features, clf_name='{a}', time={a}Result.time, verbose=1)")

    exec(f"ensemble_df['{a}'] = {a}_allPred")

    exec(f"tot_TP += ({a}Result.rec * len({a}_test))")
    exec(f"tot_P += len({a}_test)")


    exec(f"ens_avgFPR += {a}Result.fpr")
    exec(f"ens_time += {a}Result.time")
    exec(f"ens_infer_time +={a}Result.infer_time")

ens_avgFPR = ens_avgFPR / len(attack_names)
ens_avgREC = tot_TP/tot_P

print("Total training time: {:5f}s\t AvgFPR: {:5f}\t AvgTPR: {:5f}\tTotal inference time: {:5f}s (fake: {:5f}s)".
      format(ens_time, ens_avgFPR, ens_avgREC, ens_infer_time, fakeEns_infer_time))

Training and testing neris......done! Training time: 0.003979s	Inference time: 0.024917s
Testing neris...
...done! 	Inference time: 0.031893s
Training and testing rbot......done! Training time: 0.003990s	Inference time: 0.024913s
Testing rbot...
...done! 	Inference time: 0.029908s
Training and testing nsis......done! Training time: 0.005980s	Inference time: 0.022838s
Testing nsis...
...done! 	Inference time: 0.029900s
Training and testing virut......done! Training time: 0.004981s	Inference time: 0.020931s
Testing virut...
...done! 	Inference time: 0.029897s
Training and testing donbot......done! Training time: 0.004154s	Inference time: 0.016979s
Testing donbot...
...done! 	Inference time: 0.030894s
Training and testing murlo......done! Training time: 0.004014s	Inference time: 0.019442s
Testing murlo...
...done! 	Inference time: 0.029903s
Total training time: 0.027098s	 AvgFPR: 0.030910	 AvgTPR: 0.983882	Total inference time: 0.182396s (fake: 0.130020s)


### Ensemble (real assessment)

In [16]:
# Here we measure the combined performance of the entire ensemble
# This is done with a logical or, or for majority voting (regulated by the "agreement" variable)

ensemble_df["sum"] = ensemble_df.sum(axis=1)
#calculating 
ensemble_df["LOR"] = (ensemble_df["sum"]>0)

#Appending Ground Truth
temp = all_test['Nature'] #> 0)
ensemble_df['True'] = ((temp.reset_index(drop=True)) > 0)

### Ensemble: Logical OR

In [17]:
enslorResult = Result(ensemble_df['True'], ensemble_df['LOR'], ens_time, ens_infer_time)
enslorErr= int(len(all_test) * (1-enslorResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(enslorErr, len(all_test), enslorResult.rec, enslorResult.fpr))
enslorResult.ctab_bin # you can also try with enslorResult.ctab

Total Misclassifications: 11754 out of 173220 (Recall: 0.998988	FPR: 0.116718)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,88390,11680
True,74,73076


### Ensemble: Majority Voting

In [18]:
min_agree = math.ceil(agreement * len(attack_names))
print("Voting: at least {} out of {} classifiers must agree that a sample is malicious.".format(min_agree, len(attack_names)))
ensemble_df["MAJV"] = (ensemble_df["sum"]>=min_agree)
ensvotResult = Result(ensemble_df['True'], ensemble_df['MAJV'], ens_time, ens_infer_time)
ensvotErr = int(len(all_test) * (1-ensvotResult.acc))
print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(ensvotErr, len(all_test), ensvotResult.rec, ensvotResult.fpr))
ensvotResult.ctab_bin # you can also try with ensvotResult.ctab

Voting: at least 3 out of 6 classifiers must agree that a sample is malicious.
Total Misclassifications: 3673 out of 173220 (Recall: 0.974532	FPR: 0.018097)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,98259,1811
True,1863,71287


### Ensemble: Stacked Classifier

In [19]:
from mlxtend.classifier import StackingClassifier
clf_list = []
for a in attack_names:
    exec(f"clf_list.append({a}Clf)")

meta = choose_clf(clf_type=base_clf)
sClf = StackingClassifier(classifiers=clf_list, meta_classifier=meta, fit_base_estimators=False, use_probas = False)
s_timeStart = time.time()
sClf.fit(all_train[features], all_train['Nature'])
s_time = time.time() - s_timeStart + ens_time
s_timeStart = time.time()
sPred = sClf.predict(all_test[features])
s_infer_time = time.time()-s_timeStart
sResult = Result(all_test['Nature'], sPred, s_time, s_infer_time)
if sResult.acc < sResult.acc_multi:
    sResult.acc = sResult.acc_multi
sErr = int(len(all_test) * (1-sResult.acc))
print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sErr, len(all_test), sResult.rec, sResult.fpr))

sResult.ctab_bin # you can also try with sResult.ctab

Total Misclassifications: 10226 out of 173220 (Recall: 0.996364	FPR: 0.099540)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,90109,9961
1,266,72884


## Open World Assessment: One attack against all classifiers

In [20]:
### The following code is a mixture of everything described insofar.
### We are only focused on TPR and FPR here. We do not care about accuracy, adversarial robustness, or runtime.
### These experiments are also done only on the "Complete" feature set


oaac_bin_rec = 0
oaac_bin_fpr = 0
oaac_multi_rec = 0
oaac_multi_fpr = 0
oaac_enslor_rec = 0
oaac_enslor_fpr = 0
oaac_ensvot_rec = 0
oaac_ensvot_fpr = 0
oaac_ensstk_rec = 0
oaac_ensstk_fpr = 0

for u in attack_names: # u is the unknown attack
    #print(u) # this is the unknown attack
    #exec(f"{u}_test = {u}_df[{u}_df['is_train']==False]") # create test set by putting the "test" samples of u
    exec(f"{u}_test = pd.concat([benign_test, {u}_test])") # add to the test set the "benign" test samples
    exec(f"{u}_train = benign_df[benign_df['is_train']==True]") # compose the "training" set: start by putting the benign "training" samples
    for a in attack_names: 
        # for every attack that is not u, add its training samples to the training set of u
        if a==u:
            continue
        exec(f"{u}_train = pd.concat([{u}_train, {a}_df[{a}_df['is_train']==True]])")


    # We have created the training and testing set. Now we must train and test a binary classifier by following the standard procedure
    ########## BINARY CLASSIFIER ##########
    exec(f"{u}_oaac_bClf, {u}_oaac_bPred, {u}_oaac_bResult = develop_clf({u}_train, {u}_test, features, clf_name='{u}_oaac_bin', label='Nature', clf_type=base_clf)")


    ########## Multiclass CLASSIFIER ########## --> Train, then test only on binary
    exec(f"{u}_oaac_mClf, {u}_oaac_mPred, {u}_oaac_mResult = develop_clf({u}_train, {u}_test, features, clf_name='{u}_oaac_multi', label='Label_cat', clf_type=base_clf)")
    exec(f"{u}_oaac_mPred_bin = np.copy({u}_oaac_mPred)")
    exec(f"{u}_oaac_mPred_bin[{u}_oaac_mPred_bin > 0] = 1")
    exec(f"{u}_oaac_mResult.bin_results({u}_test['Nature'], {u}_oaac_mPred_bin)")


    ######### Ensemble ##############
    # send the samples in TEST to all the classifiers of the ensemble (which are already trained), aside from the one focusing on u
    exec(f"{u}_oaac_ens_df = pd.DataFrame()")
    for a in attack_names:    
        if a==u:
                continue
        exec(f"{a}_{u}Pred, {a}_{u}Results, {a}_{u}_infer_time = evaluate_clf({a}Clf, {u}_test, features, clf_name='{a}_{u}', time={a}Result.time)")
        exec(f"{u}_oaac_ens_df['{a}'] = {a}_{u}Pred")

    # now we have the dataframe with all the predictions, let's see the aggregate results
    exec(f"{u}_oaac_ens_df['sum'] = {u}_oaac_ens_df.sum(axis=1)")
    exec(f"{u}_oaac_ens_df['LOR'] = ({u}_oaac_ens_df['sum']>0)")
    exec(f"temp = {u}_test['Nature'] #> 0)")
    exec(f"{u}_oaac_ens_df['True'] = ((temp.reset_index(drop=True)) > 0)")
    exec(f"{u}_oaac_enslorResult = Result({u}_oaac_ens_df['True'], {u}_oaac_ens_df['LOR'], (ens_time-{u}Result.time), (ens_infer_time-{u}Result.infer_time))")

    # now we consider the majority voting of the ensemble
    exec(f"{u}_oaac_ens_df['MAJV'] = ({u}_oaac_ens_df['sum']>=min_agree)")
    exec(f"{u}_oaac_ensvotResult = Result({u}_oaac_ens_df['True'], {u}_oaac_ens_df['MAJV'], (ens_time-{u}Result.time), (ens_infer_time-{u}Result.infer_time))")

    # finally, let's use the stacking ensemble
    exec(f"{u}_clf_list = []")
    for a in attack_names:
        if a==u:
                continue
        exec(f"{u}_clf_list.append({a}Clf)")
    exec(f"{u}_oaac_sClf = StackingClassifier(classifiers={u}_clf_list, meta_classifier=meta, fit_base_estimators=False, use_probas = False)")
    exec(f"{u}_oaac_sClf.fit({u}_train[features], {u}_train['Nature'])")
    exec(f"{u}_oaac_sPred = {u}_oaac_sClf.predict({u}_test[features])")
    exec(f"{u}_oaac_sResult = Result({u}_test['Nature'], {u}_oaac_sPred, (ens_time-{u}Result.time), (ens_infer_time-{u}Result.infer_time))")

    # Updating results
    exec(f"oaac_bin_rec += {u}_oaac_bResult.rec")
    exec(f"oaac_bin_fpr += {u}_oaac_bResult.fpr")
    exec(f"oaac_multi_rec += {u}_oaac_mResult.rec")
    exec(f"oaac_multi_fpr += {u}_oaac_mResult.fpr")
    exec(f"oaac_enslor_rec += {u}_oaac_enslorResult.rec")
    exec(f"oaac_enslor_fpr += {u}_oaac_enslorResult.fpr")
    exec(f"oaac_ensvot_rec += {u}_oaac_ensvotResult.rec")
    exec(f"oaac_ensvot_fpr += {u}_oaac_ensvotResult.fpr")
    exec(f"oaac_ensstk_rec += {u}_oaac_sResult.rec")
    exec(f"oaac_ensstk_fpr += {u}_oaac_sResult.fpr")

# Finalizing averages
oaac_bin_rec /= len(attack_names)
oaac_bin_fpr /= len(attack_names)
oaac_multi_rec /= len(attack_names)
oaac_multi_fpr /= len(attack_names)
oaac_enslor_rec /= len(attack_names)
oaac_enslor_fpr /= len(attack_names)
oaac_ensvot_rec /= len(attack_names)
oaac_ensvot_fpr /= len(attack_names)
oaac_ensstk_rec /= len(attack_names)
oaac_ensstk_fpr /= len(attack_names)


print('''Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      Binary CLF: TPR={:5f}\tFPR={:5f}
      Multiclass (binarized) CLF: TPR={:5f}\tFPR={:5f}
      EnsLOR CLF: TPR={:5f}\tFPR={:5f}
      EnsVOT CLF: TPR={:5f}\tFPR={:5f}
      EnsSTK CLF: TPR={:5f}\tFPR={:5f}
      '''.format(oaac_bin_rec, oaac_bin_fpr,
                 oaac_multi_rec, oaac_multi_fpr,
                 oaac_enslor_rec, oaac_enslor_fpr,
                 oaac_ensvot_rec, oaac_ensvot_fpr,
                 oaac_ensstk_rec, oaac_ensstk_fpr))

Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      Binary CLF: TPR=0.937699	FPR=0.058364
      Multiclass (binarized) CLF: TPR=0.900330	FPR=0.069526
      EnsLOR CLF: TPR=0.972680	FPR=0.104795
      EnsVOT CLF: TPR=0.614395	FPR=0.011307
      EnsSTK CLF: TPR=0.967791	FPR=0.092237
      


## Assessment on Essential feature set (and Adversarial attacks)

### BINARY CLASSIFIER (Essential features)

In [21]:
sma_bClf, sma_bPred, sma_bResult = develop_clf(all_train, all_test, small_features, clf_name='adv_bin', label='Nature', clf_type=base_clf, verbose=1)
sma_bErr = int(len(all_test) * (1-sma_bResult.acc))
if (sma_bResult.acc == 0):
    sma_bErr = int(len(all_test) * (1-sma_bResult.acc_multi))
else:
    sma_bErr = int(len(all_test) * (1-sma_bResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_bErr, len(all_test), sma_bResult.rec, sma_bResult.fpr))
pd.crosstab(all_test['Nature'], sma_bPred, rownames=['True'], colnames=['Pred'])

Training and testing adv_bin......done! Training time: 0.004975s	Inference time: 0.024429s
Total Misclassifications: 23222 out of 173220 (Recall: 0.966658	FPR: 0.207685)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,79287,20783
1,2439,70711


#### Adversarial Attack against Binary Classifier

In [22]:
# Note that the adversarial attacks only affect a subset of the initial set of malicious samples
# Hence, we compute the classification performance also on this subset for a far comparison



adv_bPred_base = sma_bClf.predict(mal_base[small_features])
adv_bPred_adv = sma_bClf.predict(mal_adv[small_features])
adv_bin_base_rec =  recall_score(mal_base['Nature'], adv_bPred_base, pos_label=1)
adv_bin_adv_rec = recall_score(mal_adv['Nature'], adv_bPred_adv, pos_label=1)

print("Adversarial Recall (baseline): {:.3f}".format(adv_bin_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_bin_adv_rec))

Adversarial Recall (baseline): 0.958
Adversarial Recall (attack): 0.972


### Multiclass Classifier - cascade (essential feature set)

In [23]:
sma_mcClf, sma_mcPred, sma_mcResult = develop_clf(malicious_train, malicious_test, small_features, clf_name='sma_mc', label='Label_cat', clf_type=base_clf, verbose=1)
sma_mcErr = int(len(malicious_test) * (1-sma_mcResult.acc_multi))

print("Total Misclassifications: {} out of {}".format(sma_mcErr, len(malicious_test)))
pd.crosstab(malicious_test['Label_cat'], sma_mcPred, rownames=['True'], colnames=['Pred'])

Training and testing sma_mc......done! Training time: 0.003987s	Inference time: 0.013962s
Total Misclassifications: 22714 out of 73150


Pred,1,2,3,4,5,6
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,17571,976,3587,5799,1413,4251
2,988,25368,58,1363,833,240
3,45,0,338,22,1,4
4,1571,233,474,5560,270,36
5,32,30,10,43,470,356
6,5,0,32,8,34,1129


In [24]:
# We select the samples flagged as malicious by the initial classifier.
# Of course, samples flagged as malicious that are NOT actually malicious will always be misclassified

all_test['sma_bPred'] = sma_bPred
sma_mc_test = all_test[(all_test['sma_bPred']==1) & (all_test['Nature']==1)]
if (len(sma_mc_test)==0):
    # in this case, this classifier receives nothing
    print("There is no malicious sample flagged as malicious to analyze!")

sma_mcPred_m = sma_mcClf.predict(sma_mc_test[small_features])
sma_mcResult.acc_multic = accuracy_score(sma_mc_test['Label_cat'], sma_mcPred_m, normalize=True, sample_weight=None)
sma_mcErr_m = int((1-sma_mcResult.acc_multic) * len(sma_mc_test))
print("Total Misclassifications (among the malicious samples): {} out of {}".format(sma_mcErr_m, len(sma_mc_test)))




pd.crosstab(sma_mc_test['Label_cat'], sma_mcPred_m, rownames=['True'], colnames=['Pred'])

Total Misclassifications (among the malicious samples): 21209 out of 70711


Pred,1,2,3,4,5,6
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,16866,976,3146,5207,1405,4167
2,988,25290,58,1359,831,125
3,29,0,281,16,0,4
4,1484,233,387,5468,270,24
5,32,30,10,43,469,356
6,2,0,23,3,1,1128


In [25]:
## Note: We also accounted for the false positives of the first binary classifier (all of which have been considered as misclassifications)
sma_bin_falsePositives = int(sma_bResult.fpr * len(benign_test))
print("This classifier also analyzed {} benign samples that were incorrectly labelled as 'malicious' by the (small) binary classifier".format(sma_bin_falsePositives))

print("Hence, this (small) classifier was tested on {} samples, of which {} have been misclassified".format(len(sma_mc_test)+sma_bin_falsePositives, sma_bin_falsePositives+sma_mcErr_m))

This classifier also analyzed 20782 benign samples that were incorrectly labelled as 'malicious' by the (small) binary classifier
Hence, this (small) classifier was tested on 91493 samples, of which 41991 have been misclassified


### Multiclass Classifier - stand-alone (essential feature set)

In [26]:
sma_mClf, sma_mPred, sma_mResult = develop_clf(all_train, all_test, small_features, clf_name='sma_m', label='Label_cat', clf_type=base_clf, verbose=1)
sma_mErr = int(len(all_test) * (1-sma_mResult.acc_multi))

print("Total Misclassifications: {} out of {}".format(sma_mErr, len(all_test)))
sma_mResult.ctab

Training and testing sma_m......done! Training time: 0.004984s	Inference time: 0.027906s
Total Misclassifications: 46326 out of 173220


Pred,0,1,2,3,4,5,6
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,76927,7428,449,9010,3665,1174,1417
1,1700,17167,975,2840,5562,1260,4093
2,169,991,25368,58,1339,822,103
3,39,26,0,326,15,1,3
4,182,1491,233,344,5513,304,77
5,10,32,30,4,56,453,356
6,1,6,0,24,3,34,1140


#### Multiclass Classifier: Binary

In [27]:
########## MULTI-CLASS CLASSIFIER - BINARY ##########
sma_mPred_bin = np.copy(sma_mPred)
sma_mPred_bin[sma_mPred_bin > 0] = 1
sma_mResult.bin_results(all_test['Nature'], sma_mPred_bin)
sma_mErr_bin = int(len(all_test) * (1-sma_mResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_mErr_bin, len(all_test), sma_mResult.rec, sma_mResult.fpr))
sma_mResult.ctab_bin


Total Misclassifications: 25243 out of 173220 (Recall: 0.971278	FPR: 0.231268)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,76927,23143
1,2101,71049


#### Adversarial Attack against the Multiclass Classifier (binary)

In [28]:
adv_mPred_base = sma_mClf.predict(mal_base[small_features])
adv_mPred_adv = sma_mClf.predict(mal_adv[small_features])

adv_mPred_base_bin = np.copy(adv_mPred_base)
adv_mPred_base_bin[adv_mPred_base_bin > 0] = 1

adv_mPred_adv_bin = np.copy(adv_mPred_adv)
adv_mPred_adv_bin[adv_mPred_adv_bin > 0] = 1


adv_multi_base_rec =  recall_score(mal_base['Nature'], adv_mPred_base_bin)
adv_multi_adv_rec = recall_score(mal_adv['Nature'], adv_mPred_adv_bin)

print("Adversarial Recall (baseline): {:.3f}".format(adv_multi_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_multi_adv_rec))

Adversarial Recall (baseline): 0.969
Adversarial Recall (attack): 0.998


## Ensemble Classifiers (essential feature set)

In [29]:
sma_ensemble_df = pd.DataFrame()
adv_ensemble_df_base = pd.DataFrame()
adv_ensemble_df_adv = pd.DataFrame()

benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]


sma_ens_time = 0
sma_ens_avgFPR = 0
sma_tot_TP = 0
sma_tot_P = 0
sma_ens_infer_time = 0
sma_fakeEns_infer_time = 0

for a in attack_names:
    exec(f"{a}_train = {a}_df[{a}_df['is_train']==True]")
    exec(f"{a}_test = {a}_df[{a}_df['is_test']==True]")

    exec(f"train = pd.concat([benign_train, {a}_train])")
    exec(f"test = pd.concat([benign_test, {a}_test])")

    exec(f"sma_{a}Clf, sma_{a}Pred, sma_{a}Result = develop_clf(train, test, small_features, clf_name='sma_{a}', clf_type=base_clf, verbose=1)")
    exec(f"sma_fakeEns_infer_time += sma_{a}Result.infer_time")
    
    exec(f"sma_{a}_allPred, sma_{a}_allResults, sma_{a}Result.infer_time = evaluate_clf(sma_{a}Clf, all_test, small_features, clf_name='sma_{a}', time=sma_{a}Result.time, verbose=1)")
    exec(f"sma_ensemble_df['{a}'] = sma_{a}_allPred")

    exec(f"adv_{a}Pred_base = sma_{a}Clf.predict(mal_base[small_features])")
    exec(f"adv_{a}Pred_adv = sma_{a}Clf.predict(mal_adv[small_features])")
    exec(f"adv_ensemble_df_base['{a}'] = adv_{a}Pred_base")
    exec(f"adv_ensemble_df_adv['{a}'] = adv_{a}Pred_adv")



    exec(f"sma_tot_TP += (sma_{a}Result.rec * len({a}_test))")
    exec(f"sma_tot_P += len({a}_test)")


    exec(f"sma_ens_avgFPR += sma_{a}Result.fpr")
    exec(f"sma_ens_time += sma_{a}Result.time")
    exec(f"sma_ens_infer_time +=sma_{a}Result.infer_time")



sma_ens_avgFPR = sma_ens_avgFPR / len(attack_names)
sma_ens_avgREC = sma_tot_TP/sma_tot_P


print("Total training time: {:5f}s\t AvgFPR: {:5f}\t AvgTPR: {:5f}\tTotal inference time: {:5f}s (fake: {:5f}s)".
      format(sma_ens_time, sma_ens_avgFPR, sma_ens_avgREC, sma_ens_infer_time, sma_fakeEns_infer_time))

Training and testing sma_neris......done! Training time: 0.004002s	Inference time: 0.016961s
Testing sma_neris...
...done! 	Inference time: 0.020947s
Training and testing sma_rbot......done! Training time: 0.003003s	Inference time: 0.014978s
Testing sma_rbot...
...done! 	Inference time: 0.018503s
Training and testing sma_nsis......done! Training time: 0.003630s	Inference time: 0.013991s
Testing sma_nsis...
...done! 	Inference time: 0.021855s
Training and testing sma_virut......done! Training time: 0.003965s	Inference time: 0.015960s
Testing sma_virut...
...done! 	Inference time: 0.020929s
Training and testing sma_donbot......done! Training time: 0.002984s	Inference time: 0.013000s
Testing sma_donbot...
...done! 	Inference time: 0.018898s
Training and testing sma_murlo......done! Training time: 0.003992s	Inference time: 0.011962s
Testing sma_murlo...
...done! 	Inference time: 0.018937s
Total training time: 0.021575s	 AvgFPR: 0.081058	 AvgTPR: 0.943117	Total inference time: 0.120069s (fa

### Computing real Ensemble

In [30]:
sma_ensemble_df["sum"] = sma_ensemble_df.sum(axis=1)
sma_ensemble_df["LOR"] = (sma_ensemble_df["sum"]>0)
temp = all_test['Nature'] 
sma_ensemble_df['True'] = ((temp.reset_index(drop=True)) > 0)

adv_ensemble_df_base["sum"] = adv_ensemble_df_base.sum(axis=1)
adv_ensemble_df_base["LOR"] = (adv_ensemble_df_base["sum"]>0)
temp = mal_base['Nature']
adv_ensemble_df_base['True'] = ((temp.reset_index(drop=True)) > 0)


adv_ensemble_df_adv["sum"] = adv_ensemble_df_adv.sum(axis=1)
adv_ensemble_df_adv["LOR"] = (adv_ensemble_df_adv["sum"]>0)
temp = mal_adv['Nature']
adv_ensemble_df_adv['True'] = ((temp.reset_index(drop=True)) > 0)

### Ensemble: Logical OR (essential feature set)

In [31]:
sma_enslorResult = Result(sma_ensemble_df['True'], sma_ensemble_df['LOR'], sma_ens_time, sma_ens_infer_time)
sma_enslorErr= int(len(all_test) * (1-sma_enslorResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_enslorErr, len(all_test), sma_enslorResult.rec, sma_enslorResult.fpr))
sma_enslorResult.ctab_bin # you can also try with sma_enslorResult.ctab

Total Misclassifications: 31898 out of 173220 (Recall: 0.982775	FPR: 0.306166)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,69432,30638
True,1260,71890


#### Logical OR: Adversarial Attack

In [32]:
adv_enslor_base_rec = recall_score(mal_base['Nature'], adv_ensemble_df_base["LOR"])
adv_enslor_adv_rec = recall_score(mal_adv['Nature'], adv_ensemble_df_adv["LOR"])

print("Adversarial Recall (baseline): {:.3f}".format(adv_enslor_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_enslor_adv_rec))

Adversarial Recall (baseline): 0.978
Adversarial Recall (attack): 0.014


### Ensemble: Majority Voting (essential feature set)

In [33]:
sma_ensemble_df["MAJV"] = (sma_ensemble_df["sum"]>=min_agree)
sma_ensvotResult = Result(sma_ensemble_df['True'], sma_ensemble_df['MAJV'], sma_ens_time, sma_ens_infer_time)
sma_ensvotErr = int(len(all_test) * (1-sma_ensvotResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_ensvotErr, len(all_test), sma_ensvotResult.rec, sma_ensvotResult.fpr))
sma_ensvotResult.ctab_bin # you can also try with sma_ensvotResult.ctab

Total Misclassifications: 44282 out of 173220 (Recall: 0.449364	FPR: 0.040012)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,96066,4004
True,40279,32871


#### Majority Voting: Adversarial Attack

In [34]:
adv_ensemble_df_base["MAJV"] = (adv_ensemble_df_base["sum"]>=min_agree)
adv_ensemble_df_adv["MAJV"] = (adv_ensemble_df_adv["sum"]>=min_agree)

adv_ensvot_base_rec = recall_score(mal_base['Nature'], adv_ensemble_df_base["MAJV"])
adv_ensvot_adv_rec = recall_score(mal_adv['Nature'], adv_ensemble_df_adv["MAJV"])

print("Adversarial Recall (baseline): {:.3f}".format(adv_ensvot_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_ensvot_adv_rec))

Adversarial Recall (baseline): 0.329
Adversarial Recall (attack): 0.000


### Ensemble: Stacked Classifier (essential feature set)

In [35]:
clf_list = []
for a in attack_names:
    exec(f"clf_list.append(sma_{a}Clf)")

sma_sClf = StackingClassifier(classifiers=clf_list, meta_classifier=meta, fit_base_estimators=False, use_probas = False)
s_timeStart = time.time()
sma_sClf.fit(all_train[small_features], all_train['Nature'])
sma_s_time = time.time() - s_timeStart + sma_ens_time
s_timeStart = time.time()
sma_sPred = sma_sClf.predict(all_test[small_features])
sma_s_infer_time = time.time()-s_timeStart
sma_sResult = Result(all_test['Nature'], sma_sPred, sma_s_time, sma_s_infer_time)
if sma_sResult.acc < sma_sResult.acc_multi:
    sma_sResult.acc = sma_sResult.acc_multi
sma_sErr = int(len(all_test) * (1-sma_sResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_sErr, len(all_test), sma_sResult.rec, sma_sResult.fpr))

sma_sResult.ctab_bin # you can also try with sma_sResult.ctab

Total Misclassifications: 30091 out of 173220 (Recall: 0.980383	FPR: 0.286360)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,71414,28656
1,1435,71715


#### Stacked Classifier: Adversarial Attack

In [36]:
adv_sPred_base = sma_sClf.predict(mal_base[small_features])
adv_sPred_adv = sma_sClf.predict(mal_adv[small_features])

adv_ensstk_base_rec =  recall_score(mal_base['Nature'], adv_sPred_base)
adv_ensstk_adv_rec = recall_score(mal_adv['Nature'], adv_sPred_adv)

print("Adversarial Recall (baseline): {:.3f}".format(adv_ensstk_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_ensstk_adv_rec))

Adversarial Recall (baseline): 0.977
Adversarial Recall (attack): 0.012


# DONE!

In [37]:
# You can now inspect the results by referring to the "Result" variables. 

In [38]:
## BASELINE RESULTS (on Complete Feature Set)
print(
    "TPR",
    bResult.rec,
    mResult.rec,
    ens_avgREC,
    enslorResult.rec,
    ensvotResult.rec,
    sResult.rec,
    "\nFPR",
    bResult.fpr,
    mResult.fpr,
    ens_avgFPR,
    enslorResult.fpr,
    ensvotResult.fpr,
    sResult.fpr,
    "\nTraining Time",
    bResult.time,
    mResult.time,
    ens_time,
    enslorResult.time,
    ensvotResult.time,
    sResult.time,
    "\nInference Time",
    bResult.infer_time,
    mResult.infer_time,
    fakeEns_infer_time,
    enslorResult.infer_time,
    ensvotResult.infer_time,
    sResult.infer_time, 
    "\nAccuracy",
    mResult.acc_multi,   # This is the accuracy on the multiclassification
    mResult.acc,         # This is the accuracy on the binary classification
    mcResult.acc_multic, # This is the accuracy on the multiclassification AFTER the output of the binary classifier (it does not account for benign samples, which are false positives)
    mcResult.acc_multi   # This is the accuracy on the multiclassification on the whole test portion of the malicious dataset
)

TPR 0.991455912508544 0.992822966507177 0.9838824333561176 0.9989883800410116 0.9745317840054682 0.9963636363636363 
FPR 0.06416508444089142 0.07245927850504652 0.03091002964591451 0.11671829719196558 0.01809733186769258 0.09954032177475769 
Training Time 0.006976604461669922 0.007972955703735352 0.02709794044494629 0.02709794044494629 0.02709794044494629 0.03905892372131348 
Inference Time 0.03491997718811035 0.04087400436401367 0.1300203800201416 0.18239569664001465 0.18239569664001465 0.1435544490814209 
Accuracy 0.827872070199746 0.9551091098025632 0.6986418476387453 0.6971428571428572


In [39]:
## BASELINE RESULTS (on Essential Feature Set)
print(
    "TPR",
    sma_bResult.rec,
    sma_mResult.rec,
    sma_ens_avgREC,
    sma_enslorResult.rec,
    sma_ensvotResult.rec,
    sma_sResult.rec,
    "\nFPR",
    sma_bResult.fpr,
    sma_mResult.fpr,
    sma_ens_avgFPR,
    sma_enslorResult.fpr,
    sma_ensvotResult.fpr,
    sma_sResult.fpr,
    "\nTraining Time",
    sma_bResult.time,
    sma_mResult.time,
    sma_ens_time,
    sma_enslorResult.time,
    sma_ensvotResult.time,
    sma_sResult.time,
    "\nInference Time",
    sma_bResult.infer_time,
    sma_mResult.infer_time,
    sma_fakeEns_infer_time,
    sma_enslorResult.infer_time,
    sma_ensvotResult.infer_time,
    sma_sResult.infer_time, 
    "\nAccuracy",
    sma_mResult.acc_multi,   # This is the accuracy on the multiclassification
    sma_mResult.acc,         # This is the accuracy on the binary classification
    sma_mcResult.acc_multic, # This is the accuracy on the multiclassification AFTER the output of the binary classifier (it does not account for benign samples, which are false positives)
    sma_mcResult.acc_multi   # This is the accuracy on the multiclassification on the whole test portion of the malicious dataset
)

TPR 0.9666575529733424 0.9712781954887219 0.9431168831168831 0.9827751196172249 0.4493643198906357 0.9803827751196172 
FPR 0.20768462076546412 0.231268112321375 0.08105825921854702 0.30616568402118516 0.040011991605875874 0.2863595483161787 
Training Time 0.004975318908691406 0.004984140396118164 0.021575212478637695 0.021575212478637695 0.021575212478637695 0.03154921531677246 
Inference Time 0.024428844451904297 0.027906179428100586 0.08685183525085449 0.12006878852844238 0.12006878852844238 0.11062240600585938 
Accuracy 0.7325597506061655 0.854266251010276 0.7000608109063653 0.6894873547505126


In [40]:
## Open World: One attack against all (averaged results)

print('''Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      BD: TPR={:5f}\tFPR={:5f}
      MD (binarized) CLF: TPR={:5f}\tFPR={:5f}
      ED-o: TPR={:5f}\tFPR={:5f}
      ED-v: TPR={:5f}\tFPR={:5f}
      ED-s: TPR={:5f}\tFPR={:5f}
      '''.format(oaac_bin_rec, oaac_bin_fpr,
                 oaac_multi_rec, oaac_multi_fpr,
                 oaac_enslor_rec, oaac_enslor_fpr,
                 oaac_ensvot_rec, oaac_ensvot_fpr,
                 oaac_ensstk_rec, oaac_ensstk_fpr))

Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      BD: TPR=0.937699	FPR=0.058364
      MD (binarized) CLF: TPR=0.900330	FPR=0.069526
      ED-o: TPR=0.972680	FPR=0.104795
      ED-v: TPR=0.614395	FPR=0.011307
      ED-s: TPR=0.967791	FPR=0.092237
      


In [41]:
## Adversarial Attacks 

print(
    "BD (before, after):",
    adv_bin_base_rec,
    adv_bin_adv_rec,
    "\nMD  (before, after):",
    adv_multi_base_rec,
    adv_multi_adv_rec,
    "\nED-o  (before, after):",
    adv_enslor_base_rec,
    adv_enslor_adv_rec, 
    "\nED-v  (before, after):",
    adv_ensvot_base_rec,
    adv_ensvot_adv_rec, 
    "\nED-s  (before, after):",
    adv_ensstk_base_rec, 
    adv_ensstk_adv_rec,
)

BD (before, after): 0.9578044074337362 0.9723265969330761 
MD  (before, after): 0.9686706611150604 0.9977658170001016 
ED-o  (before, after): 0.9781151619782675 0.01365898243119732 
ED-v  (before, after): 0.32862800853051694 0.00035543820452929825 
ED-s  (before, after): 0.9768457398192343 0.012237229613080125
