In [1]:
import os
import sys
from supportFunctions import *

root_folder = "..\\data\\UF-NB15\\flows\\"

In [2]:
#### PARAMETERS #####

## Generic parameters
temporal = True # used to determine if this evaluation assumes a "temporal" dependency among its samples
base_clf = 'dt' # name of the classifier used for this "run". Available names: ['dt', 'rf', 'hgb', 'lr']. You can add more by editing the supportFunctions file
test_size = 0.2 # proportion of the dataset used for testing. We always kept it fixed to 0.2 for our paper
train_size = 100 # proportion of the REMAINING data that are used for training (if >1, then it will take that exact amount). To reproduce the results of the paper, use: 100 (for "limited" training data) or 0.2 or 0.5 or 0.99 (for scarce, moderate, abundant training data, respectively) 
agreement = 0.5 # from 0 to 1. Proportion of classifiers that must agree on an attack (for the ensemble). This is fixed in our paper.
max_size = 500000 ## maximum amount of samples to include when creating the initial dataframes. This is fixed in our paper
max_size_atk = int(max_size / 3) # maximum amount of malicious samples per class. This is fixed in our paper

## Adversarial Attacks parameters
atk_intensity = 1
pkt_intensity = atk_intensity * 10 # 
byt_intensity = pkt_intensity * 100 # 
dur_intensity = atk_intensity * 100 # consider seconds

In [3]:
### Reading input data

malicious_folder = root_folder + "malicious/"

benign_file = root_folder + "benign.csv"
benign_df = pd.read_csv(benign_file, header='infer', index_col=0)
benign_df = benign_df.sample(min(max_size, len(benign_df)))
#sort by timestamp
if temporal == True:
    pass # already sorted
benign_df.reset_index(inplace=True, drop=True)

# attack_names = ['expl', 'recon', 'dos', 'shell', 'fuzz', 'worm', 'bdoor', 'ana', 'other']
attack_names = ['expl', 'recon', 'dos', 'shell', 'fuzz', 'bdoor', 'ana']

expl_file = malicious_folder + "expl.csv"
recon_file = malicious_folder + "recon.csv"
dos_file = malicious_folder + "dos.csv"
shell_file = malicious_folder + "shell.csv"
fuzz_file = malicious_folder + "fuzz.csv"
worm_file = malicious_folder + "worm.csv" # excluded, only 164 samples
bdoor_file = malicious_folder + "bdoor.csv"
ana_file = malicious_folder + "ana.csv"
other_file = malicious_folder + "other.csv" # excluded due to mismatch



for a in attack_names:
    exec(f"{a}_df = pd.read_csv({a}_file, header='infer', index_col=0)")
    exec(f"{a}_df = {a}_df.sample(min(max_size_atk, len({a}_df)))")
    # sort by timestamp
    if temporal == True:
        pass
    exec(f"{a}_df.reset_index(inplace=True, drop=True)")
    exec(f"{a}_df['Label'] = a")

In [4]:
# Determining Train and Test sets for each class

df_list = [benign_df]
for a in attack_names:
    exec(f"df_list.append({a}_df)")

if temporal == True:
    for dummy_df in df_list:
        if train_size <=1:
            train_threshold = int(((1-test_size) * train_size) * len(dummy_df))
        else:
            train_threshold = int(100)
        test_threshold = len(dummy_df) - int(test_size * len(dummy_df))
        dummy_df['index'] = dummy_df.index
        dummy_df['is_test'] = np.where(dummy_df['index'] >= test_threshold , True, False)
        dummy_df['is_train'] = np.where(dummy_df['index'] <= train_threshold , True, False)
else:
    for dummy_df in df_list:
        if train_size <= 1:
            train_threshold = test_size + (1-test_size)*train_size
        else:
            train_threshold = test_size + ((train_size * 100) / (len(dummy_df)) / 100)       
        dummy_df['seed'] = (np.random.uniform(0,1,len(dummy_df)))
        dummy_df['is_test'] = np.where(dummy_df['seed'] <= test_size, True, False)
        dummy_df['is_train'] = np.where((dummy_df['seed'] <= train_threshold) & (dummy_df['is_test']==False), True, False)

# get all together
all_df = pd.concat(df_list)

In [5]:
def handle_categorical(df):
    ## Handling categorical data
    df_dummy = df.copy(deep=True)
    df_dummy['Nature'] = np.where(df_dummy['Label'].str.contains('BENIGN'),0,1)
    
    for column_name in df_dummy.columns:
        if column_name == ('SrcPort_type'):
            df_dummy[column_name+'-f'] = pd.factorize(df_dummy[column_name])[0]
        elif column_name == ('DstPort_type'):
            df_dummy[column_name+'-f'] = pd.factorize(df_dummy[column_name])[0]
        elif column_name == ('Proto'):
            df_dummy[column_name+"-f"] = pd.factorize(df_dummy[column_name])[0]
        else:
            pass
    return df_dummy

all_df = handle_categorical(all_df)
all_df['Label_cat'] = pd.factorize(all_df['Label'])[0]
all_df['int2int'] = np.where( ((all_df['SrcIP_internal']==True) & (all_df['DstIP_internal']==True)), True, False)

all_df['totPkt'] = all_df['Pkts_in'] + all_df['Pkts_out']
all_df['totByt'] = all_df['Bytes_in'] + all_df['Bytes_out']

all_train, all_test = all_df[all_df['is_train']==True], all_df[all_df['is_test']==True]

### SPLITTING ALL BACK ####
benign_df = all_df[all_df['Label']=='BENIGN']
benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]

for a in attack_names:
    exec(f"{a}_df = all_df[all_df['Label']=='{a}']")
    
malicious_df = all_df[all_df['Label']!='BENIGN']
malicious_train, malicious_test = malicious_df[malicious_df['is_train']==True], malicious_df[malicious_df['is_test']==True]



print("& 0 & \\textit{{Benign}} & {} & \\\\ \\cline{{2-4}}".format(len(benign_df)))


for i,a in enumerate(attack_names):
    exec(f"print('& {i+1} & \\\\textit{{{{{a}}}}} & {{}} \\\\\\\\ \\\\cline{{{{2-4}}}}'.format(len({a}_df)))")

& 0 & \textit{Benign} & 500000 & \\ \cline{2-4}
& 1 & \textit{expl} & 31551 \\ \cline{2-4}
& 2 & \textit{recon} & 12779 \\ \cline{2-4}
& 3 & \textit{dos} & 5794 \\ \cline{2-4}
& 4 & \textit{shell} & 1427 \\ \cline{2-4}
& 5 & \textit{fuzz} & 22310 \\ \cline{2-4}
& 6 & \textit{bdoor} & 2169 \\ \cline{2-4}
& 7 & \textit{ana} & 2299 \\ \cline{2-4}


In [6]:
## Feature sets

# the following is the "complete" feature set

features = ['Proto_l7', 'Bytes_in', 'Pkts_in',
       'Bytes_out', 'Pkts_out', 'TCP_Flag', 'Client_TCP_Flag',
       'Server_TCP_Flag', 'Duration(ms)', 'Duration_in', 'Duration_out',
       'TTL_min', 'TTL_max', 'MaxPkts', 'MinPkts', 'MinLen', 'MaxLen',
        'RetrBytes_in', 'RetrPkts_in',
       'RetrBytes_out', 'RetrPkts_out', 'Throughput_SrcDst',
       'Throughput_DstSrc', 'Pkt<128Byt', 'Pkt128<Byt<256', 'Pkt256<Byt<512',
       'Pkt512<Byt<1024', 'Pkt1024<Byt<1514', 'MaxTCPWin_in', 'MaxTCPWin_out',
       'ICMP', 'ICMP_IPV4', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE', 'DNS_TTL_ANSWER',
       'FTP_COMMAND_RET_CODE',
       'SrcPort_type-f', 'DstPort_type-f',
       'Proto-f', 'int2int'
       ]


# this is for the "essential" feature set
small_features = ['Proto-f', 'Bytes_in',
       'Pkts_in', 'Bytes_out', 'Pkts_out', 'TCP_Flag', 'Client_TCP_Flag',
        'totPkt', 'totByt',
        'Duration(ms)',
       'ICMP', 'ICMP_IPV4', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE', 'DNS_TTL_ANSWER',
       'FTP_COMMAND_RET_CODE',
        'DstIP_internal', 'int2int',
       'DstPort_type-f']

In [7]:
# creating adversarial dataset
mal_base = malicious_df[((malicious_df['Proto']==17)) & (malicious_df['is_test']==True)]
print(len(mal_base))
mal_adv = mal_base.copy(deep=True)

# attacking
max_dur = mal_adv['Duration(ms)'].max()
min_dur = mal_adv['Duration(ms)'].min()
mal_adv['Duration(ms)'] = mal_adv['Duration(ms)'] + (dur_intensity * 1000)

mal_adv['Duration(ms)'] = np.where(mal_adv['Duration(ms)'] > max_dur, max_dur, mal_adv['Duration(ms)'])
mal_adv['Duration(ms)'] = np.where(mal_adv['Duration(ms)'] < min_dur, min_dur, mal_adv['Duration(ms)'])


#mal_adv['Pkts_in'] = mal_adv['Pkts_in'] + pkt_intensity
mal_adv['Pkts_out'] = mal_adv['Pkts_out'] + pkt_intensity
#mal_adv['Bytes_in'] = mal_adv['Bytes_in'] + byt_intensity
mal_adv['Bytes_out'] = mal_adv['Bytes_out'] + byt_intensity 


mal_adv['totPkt'] = mal_adv['Pkts_in'] + mal_adv['Pkts_out']
mal_adv['totByt'] = mal_adv['Bytes_in'] + mal_adv['Bytes_out']

2310


# FROM NOW ON, THE CODE IS ALWAYS THE SAME FOR EVERY DATASET!!!!!!

# Baseline: Assessment on "Complete" feature set

## BINARY CLASSIFIER (Complete features)

In [8]:
bClf, bPred, bResult = develop_clf(all_train, all_test, features, clf_name='bin', label='Nature', clf_type=base_clf, verbose=1)

if (bResult.acc == 0):
    bErr = int(len(all_test) * (1-bResult.acc_multi))
else:
    bErr = int(len(all_test) * (1-bResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(bErr, len(all_test), bResult.rec, bResult.fpr))
pd.crosstab(all_test['Nature'], bPred, rownames=['True'], colnames=['Pred'])

Training and testing bin......done! Training time: 0.005981s	Inference time: 0.031964s
Total Misclassifications: 913 out of 115662 (Recall: 0.998340	FPR: 0.008880)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,99112,888
1,26,15636


## MULTI-CLASS CLASSIFIER - cascade

In [9]:
# This is the classifier that analyzes ONLY the malicious samples that "receives" from the initial binary classifier
# It is trained on the same training set---but without using the benign samples
# It is tested on the malicious samples in the test set that are flagged as malicious by the binary classifier


mcClf, mcPred, mcResult = develop_clf(malicious_train, malicious_test, features, clf_name='mc', label='Label_cat', clf_type=base_clf, verbose=1)
mcErr = int(len(malicious_test) * (1-mcResult.acc_multi))
print("Total Misclassifications: {} out of {}".format(mcErr, len(malicious_test)))
pd.crosstab(malicious_test['Label_cat'], mcPred, rownames=['True'], colnames=['Pred'])

Training and testing mc......done! Training time: 0.007957s	Inference time: 0.007967s
Total Misclassifications: 5763 out of 15662


Pred,1,2,3,4,5,6,7
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3362,288,1462,193,375,368,262
2,78,2026,131,0,47,148,125
3,242,61,344,38,114,174,185
4,8,0,10,264,3,0,0
5,251,78,136,31,3561,246,159
6,29,33,63,9,19,151,129
7,10,40,94,0,9,115,191


In [10]:
# We select the samples flagged as malicious by the initial classifier.
# Of course, samples flagged as malicious that are NOT actually malicious will always be misclassified

all_test['bPred'] = bPred
mc_test = all_test[(all_test['bPred']==1) & (all_test['Nature']==1)]
if (len(mc_test)==0):
    # in this case, this classifier receives nothing
    print("There is no malicious sample flagged as malicious to analyze!")

mcPred_m = mcClf.predict(mc_test[features])
mcResult.acc_multic = accuracy_score(mc_test['Label_cat'], mcPred_m, normalize=True, sample_weight=None)
mcErr_m = int((1-mcResult.acc_multic) * len(mc_test))
print("Total Misclassifications (among the malicious samples): {} out of {}".format(mcErr_m, len(mc_test)))




pd.crosstab(mc_test['Label_cat'], mcPred_m, rownames=['True'], colnames=['Pred'])

Total Misclassifications (among the malicious samples): 5754 out of 15636


Pred,1,2,3,4,5,6,7
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3347,288,1459,192,375,368,262
2,78,2026,130,0,46,148,125
3,241,61,342,38,112,174,185
4,8,0,10,264,3,0,0
5,251,78,136,31,3561,246,159
6,29,33,63,9,19,151,129
7,10,40,94,0,9,115,191


In [11]:
## Note: We also accounted for the false positives of the first binary classifier (all of which have been considered as misclassifications)
bin_falsePositives = int(bResult.fpr * len(benign_test))
print("This classifier also analyzed {} benign samples that were incorrectly labelled as 'malicious' by the binary classifier".format(bin_falsePositives))

print("Hence, this classifier was tested on {} samples, of which {} have been misclassified".format(len(mc_test)+bin_falsePositives, bin_falsePositives+mcErr_m))

This classifier also analyzed 887 benign samples that were incorrectly labelled as 'malicious' by the binary classifier
Hence, this classifier was tested on 16523 samples, of which 6641 have been misclassified


## MULTI-CLASS CLASSIFIER - stand-alone

In [12]:
# We first assess its multiclassification performance, and then its binary classification performance

mClf, mPred, mResult = develop_clf(all_train, all_test, features, clf_name='m', label='Label_cat', clf_type=base_clf, verbose=1)
mErr = int(len(all_test) * (1-mResult.acc_multi))
print("Total Misclassifications: {} out of {}".format(mErr, len(all_test)))
mResult.ctab

Training and testing m......done! Training time: 0.008982s	Inference time: 0.035876s
Total Misclassifications: 7591 out of 115662


Pred,0,1,2,3,4,5,6,7
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,98121,385,4,1146,40,206,23,75
1,54,3398,215,1492,189,355,356,251
2,1,71,1978,161,0,33,188,123
3,9,262,55,354,38,97,164,179
4,0,8,0,10,264,3,0,0
5,0,260,83,169,31,3613,161,145
6,0,28,33,65,9,19,157,122
7,0,15,40,90,0,9,120,185


In [13]:
# For the binary classification performance, we use the previous predictions
mPred_bin = np.copy(mPred)
mPred_bin[mPred_bin > 0] = 1
mResult.bin_results(all_test['Nature'], mPred_bin)
mErr_bin = int(len(all_test) * (1-mResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(mErr_bin, len(all_test), mResult.rec, mResult.fpr))
mResult.ctab_bin

Total Misclassifications: 1943 out of 115662 (Recall: 0.995914	FPR: 0.018790)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,98121,1879
1,64,15598


## ENSEMBLE CLASSIFIERS

### Training "individual" binary classifiers

In [14]:
ensemble_df = pd.DataFrame()

benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]


ens_time = 0
ens_avgFPR = 0
tot_TP = 0
tot_P = 0
ens_infer_time = 0
fakeEns_infer_time = 0

for a in attack_names:
    exec(f"{a}_train = {a}_df[{a}_df['is_train']==True]")
    exec(f"{a}_test = {a}_df[{a}_df['is_test']==True]")

    exec(f"train = pd.concat([benign_train, {a}_train])")
    exec(f"test = pd.concat([benign_test, {a}_test])")
    
    # We first train a classifier only on "benign" or on malicious samples of a specific attack. 
    # Afterwards, we immediately test it on a test-set having ONLY malicious samples of this specific attack
    # Note: such "testing" is redundant, because it assumes that the classifier only receives the samples of the attack it is trained on!
    exec(f"{a}Clf, {a}Pred, {a}Result = develop_clf(train, test, features, clf_name='{a}', clf_type=base_clf, verbose=1)")
    exec(f"fakeEns_infer_time += {a}Result.infer_time")
    # We now test the specific classifier on the ENTIRE test-set, thereby allowing to assess its performance also against malicious samples of different attacks
    exec(f"{a}_allPred, {a}_allResults, {a}Result.infer_time = evaluate_clf({a}Clf, all_test, features, clf_name='{a}', time={a}Result.time, verbose=1)")

    exec(f"ensemble_df['{a}'] = {a}_allPred")

    exec(f"tot_TP += ({a}Result.rec * len({a}_test))")
    exec(f"tot_P += len({a}_test)")


    exec(f"ens_avgFPR += {a}Result.fpr")
    exec(f"ens_time += {a}Result.time")
    exec(f"ens_infer_time +={a}Result.infer_time")

ens_avgFPR = ens_avgFPR / len(attack_names)
ens_avgREC = tot_TP/tot_P

print("Total training time: {:5f}s\t AvgFPR: {:5f}\t AvgTPR: {:5f}\tTotal inference time: {:5f}s (fake: {:5f}s)".
      format(ens_time, ens_avgFPR, ens_avgREC, ens_infer_time, fakeEns_infer_time))

Training and testing expl......done! Training time: 0.003998s	Inference time: 0.029844s
Testing expl...
...done! 	Inference time: 0.031895s
Training and testing recon......done! Training time: 0.002996s	Inference time: 0.032922s
Testing recon...
...done! 	Inference time: 0.029931s
Training and testing dos......done! Training time: 0.003985s	Inference time: 0.027873s
Testing dos...
...done! 	Inference time: 0.029886s
Training and testing shell......done! Training time: 0.002990s	Inference time: 0.028909s
Testing shell...
...done! 	Inference time: 0.030430s
Training and testing fuzz......done! Training time: 0.003983s	Inference time: 0.028914s
Testing fuzz...
...done! 	Inference time: 0.031921s
Training and testing bdoor......done! Training time: 0.003988s	Inference time: 0.027884s
Testing bdoor...
...done! 	Inference time: 0.029894s
Training and testing ana......done! Training time: 0.004064s	Inference time: 0.027867s
Testing ana...
...done! 	Inference time: 0.029869s
Total training tim

### Ensemble (real assessment)

In [15]:
# Here we measure the combined performance of the entire ensemble
# This is done with a logical or, or for majority voting (regulated by the "agreement" variable)

ensemble_df["sum"] = ensemble_df.sum(axis=1)
#calculating 
ensemble_df["LOR"] = (ensemble_df["sum"]>0)

#Appending Ground Truth
temp = all_test['Nature'] #> 0)
ensemble_df['True'] = ((temp.reset_index(drop=True)) > 0)

### Ensemble: Logical OR

In [16]:
enslorResult = Result(ensemble_df['True'], ensemble_df['LOR'], ens_time, ens_infer_time)
enslorErr= int(len(all_test) * (1-enslorResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(enslorErr, len(all_test), enslorResult.rec, enslorResult.fpr))
enslorResult.ctab_bin # you can also try with enslorResult.ctab

Total Misclassifications: 921 out of 115662 (Recall: 0.999808	FPR: 0.009180)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,99082,918
True,3,15659


### Ensemble: Majority Voting

In [17]:
min_agree = math.ceil(agreement * len(attack_names))
print("Voting: at least {} out of {} classifiers must agree that a sample is malicious.".format(min_agree, len(attack_names)))
ensemble_df["MAJV"] = (ensemble_df["sum"]>=min_agree)
ensvotResult = Result(ensemble_df['True'], ensemble_df['MAJV'], ens_time, ens_infer_time)
ensvotErr = int(len(all_test) * (1-ensvotResult.acc))
print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(ensvotErr, len(all_test), ensvotResult.rec, ensvotResult.fpr))
ensvotResult.ctab_bin # you can also try with ensvotResult.ctab

Voting: at least 4 out of 7 classifiers must agree that a sample is malicious.
Total Misclassifications: 1558 out of 115662 (Recall: 0.952496	FPR: 0.008150)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,99185,815
True,744,14918


### Ensemble: Stacked Classifier

In [18]:
from mlxtend.classifier import StackingClassifier
clf_list = []
for a in attack_names:
    exec(f"clf_list.append({a}Clf)")

meta = choose_clf(clf_type=base_clf)
sClf = StackingClassifier(classifiers=clf_list, meta_classifier=meta, fit_base_estimators=False, use_probas = False)
s_timeStart = time.time()
sClf.fit(all_train[features], all_train['Nature'])
s_time = time.time() - s_timeStart + ens_time
s_timeStart = time.time()
sPred = sClf.predict(all_test[features])
s_infer_time = time.time()-s_timeStart
sResult = Result(all_test['Nature'], sPred, s_time, s_infer_time)
if sResult.acc < sResult.acc_multi:
    sResult.acc = sResult.acc_multi
sErr = int(len(all_test) * (1-sResult.acc))
print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sErr, len(all_test), sResult.rec, sResult.fpr))

sResult.ctab_bin # you can also try with sResult.ctab

Total Misclassifications: 887 out of 115662 (Recall: 0.999808	FPR: 0.008850)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,99115,885
1,3,15659


## Open World Assessment: One attack against all classifiers

In [19]:
### The following code is a mixture of everything described insofar.
### We are only focused on TPR and FPR here. We do not care about accuracy, adversarial robustness, or runtime.
### These experiments are also done only on the "Complete" feature set


oaac_bin_rec = 0
oaac_bin_fpr = 0
oaac_multi_rec = 0
oaac_multi_fpr = 0
oaac_enslor_rec = 0
oaac_enslor_fpr = 0
oaac_ensvot_rec = 0
oaac_ensvot_fpr = 0
oaac_ensstk_rec = 0
oaac_ensstk_fpr = 0

for u in attack_names: # u is the unknown attack
    #print(u) # this is the unknown attack
    #exec(f"{u}_test = {u}_df[{u}_df['is_train']==False]") # create test set by putting the "test" samples of u
    exec(f"{u}_test = pd.concat([benign_test, {u}_test])") # add to the test set the "benign" test samples
    exec(f"{u}_train = benign_df[benign_df['is_train']==True]") # compose the "training" set: start by putting the benign "training" samples
    for a in attack_names: 
        # for every attack that is not u, add its training samples to the training set of u
        if a==u:
            continue
        exec(f"{u}_train = pd.concat([{u}_train, {a}_df[{a}_df['is_train']==True]])")


    # We have created the training and testing set. Now we must train and test a binary classifier by following the standard procedure
    ########## BINARY CLASSIFIER ##########
    exec(f"{u}_oaac_bClf, {u}_oaac_bPred, {u}_oaac_bResult = develop_clf({u}_train, {u}_test, features, clf_name='{u}_oaac_bin', label='Nature', clf_type=base_clf)")


    ########## Multiclass CLASSIFIER ########## --> Train, then test only on binary
    exec(f"{u}_oaac_mClf, {u}_oaac_mPred, {u}_oaac_mResult = develop_clf({u}_train, {u}_test, features, clf_name='{u}_oaac_multi', label='Label_cat', clf_type=base_clf)")
    exec(f"{u}_oaac_mPred_bin = np.copy({u}_oaac_mPred)")
    exec(f"{u}_oaac_mPred_bin[{u}_oaac_mPred_bin > 0] = 1")
    exec(f"{u}_oaac_mResult.bin_results({u}_test['Nature'], {u}_oaac_mPred_bin)")


    ######### Ensemble ##############
    # send the samples in TEST to all the classifiers of the ensemble (which are already trained), aside from the one focusing on u
    exec(f"{u}_oaac_ens_df = pd.DataFrame()")
    for a in attack_names:    
        if a==u:
                continue
        exec(f"{a}_{u}Pred, {a}_{u}Results, {a}_{u}_infer_time = evaluate_clf({a}Clf, {u}_test, features, clf_name='{a}_{u}', time={a}Result.time)")
        exec(f"{u}_oaac_ens_df['{a}'] = {a}_{u}Pred")

    # now we have the dataframe with all the predictions, let's see the aggregate results
    exec(f"{u}_oaac_ens_df['sum'] = {u}_oaac_ens_df.sum(axis=1)")
    exec(f"{u}_oaac_ens_df['LOR'] = ({u}_oaac_ens_df['sum']>0)")
    exec(f"temp = {u}_test['Nature'] #> 0)")
    exec(f"{u}_oaac_ens_df['True'] = ((temp.reset_index(drop=True)) > 0)")
    exec(f"{u}_oaac_enslorResult = Result({u}_oaac_ens_df['True'], {u}_oaac_ens_df['LOR'], (ens_time-{u}Result.time), (ens_infer_time-{u}Result.infer_time))")

    # now we consider the majority voting of the ensemble
    exec(f"{u}_oaac_ens_df['MAJV'] = ({u}_oaac_ens_df['sum']>=min_agree)")
    exec(f"{u}_oaac_ensvotResult = Result({u}_oaac_ens_df['True'], {u}_oaac_ens_df['MAJV'], (ens_time-{u}Result.time), (ens_infer_time-{u}Result.infer_time))")

    # finally, let's use the stacking ensemble
    exec(f"{u}_clf_list = []")
    for a in attack_names:
        if a==u:
                continue
        exec(f"{u}_clf_list.append({a}Clf)")
    exec(f"{u}_oaac_sClf = StackingClassifier(classifiers={u}_clf_list, meta_classifier=meta, fit_base_estimators=False, use_probas = False)")
    exec(f"{u}_oaac_sClf.fit({u}_train[features], {u}_train['Nature'])")
    exec(f"{u}_oaac_sPred = {u}_oaac_sClf.predict({u}_test[features])")
    exec(f"{u}_oaac_sResult = Result({u}_test['Nature'], {u}_oaac_sPred, (ens_time-{u}Result.time), (ens_infer_time-{u}Result.infer_time))")

    # Updating results
    exec(f"oaac_bin_rec += {u}_oaac_bResult.rec")
    exec(f"oaac_bin_fpr += {u}_oaac_bResult.fpr")
    exec(f"oaac_multi_rec += {u}_oaac_mResult.rec")
    exec(f"oaac_multi_fpr += {u}_oaac_mResult.fpr")
    exec(f"oaac_enslor_rec += {u}_oaac_enslorResult.rec")
    exec(f"oaac_enslor_fpr += {u}_oaac_enslorResult.fpr")
    exec(f"oaac_ensvot_rec += {u}_oaac_ensvotResult.rec")
    exec(f"oaac_ensvot_fpr += {u}_oaac_ensvotResult.fpr")
    exec(f"oaac_ensstk_rec += {u}_oaac_sResult.rec")
    exec(f"oaac_ensstk_fpr += {u}_oaac_sResult.fpr")

# Finalizing averages
oaac_bin_rec /= len(attack_names)
oaac_bin_fpr /= len(attack_names)
oaac_multi_rec /= len(attack_names)
oaac_multi_fpr /= len(attack_names)
oaac_enslor_rec /= len(attack_names)
oaac_enslor_fpr /= len(attack_names)
oaac_ensvot_rec /= len(attack_names)
oaac_ensvot_fpr /= len(attack_names)
oaac_ensstk_rec /= len(attack_names)
oaac_ensstk_fpr /= len(attack_names)


print('''Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      Binary CLF: TPR={:5f}\tFPR={:5f}
      Multiclass (binarized) CLF: TPR={:5f}\tFPR={:5f}
      EnsLOR CLF: TPR={:5f}\tFPR={:5f}
      EnsVOT CLF: TPR={:5f}\tFPR={:5f}
      EnsSTK CLF: TPR={:5f}\tFPR={:5f}
      '''.format(oaac_bin_rec, oaac_bin_fpr,
                 oaac_multi_rec, oaac_multi_fpr,
                 oaac_enslor_rec, oaac_enslor_fpr,
                 oaac_ensvot_rec, oaac_ensvot_fpr,
                 oaac_ensstk_rec, oaac_ensstk_fpr))

Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      Binary CLF: TPR=0.996689	FPR=0.009289
      Multiclass (binarized) CLF: TPR=0.996520	FPR=0.009903
      EnsLOR CLF: TPR=0.999932	FPR=0.009093
      EnsVOT CLF: TPR=0.837641	FPR=0.006321
      EnsSTK CLF: TPR=0.999580	FPR=0.008903
      


## Assessment on Essential feature set (and Adversarial attacks)

### BINARY CLASSIFIER (Essential features)

In [20]:
sma_bClf, sma_bPred, sma_bResult = develop_clf(all_train, all_test, small_features, clf_name='adv_bin', label='Nature', clf_type=base_clf, verbose=1)
sma_bErr = int(len(all_test) * (1-sma_bResult.acc))
if (sma_bResult.acc == 0):
    sma_bErr = int(len(all_test) * (1-sma_bResult.acc_multi))
else:
    sma_bErr = int(len(all_test) * (1-sma_bResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_bErr, len(all_test), sma_bResult.rec, sma_bResult.fpr))
pd.crosstab(all_test['Nature'], sma_bPred, rownames=['True'], colnames=['Pred'])

Training and testing adv_bin......done! Training time: 0.005980s	Inference time: 0.019049s
Total Misclassifications: 7949 out of 115662 (Recall: 0.985379	FPR: 0.077210)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,92279,7721
1,229,15433


#### Adversarial Attack against Binary Classifier

In [21]:
# Note that the adversarial attacks only affect a subset of the initial set of malicious samples
# Hence, we compute the classification performance also on this subset for a far comparison



adv_bPred_base = sma_bClf.predict(mal_base[small_features])
adv_bPred_adv = sma_bClf.predict(mal_adv[small_features])
adv_bin_base_rec =  recall_score(mal_base['Nature'], adv_bPred_base, pos_label=1)
adv_bin_adv_rec = recall_score(mal_adv['Nature'], adv_bPred_adv, pos_label=1)

print("Adversarial Recall (baseline): {:.3f}".format(adv_bin_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_bin_adv_rec))

Adversarial Recall (baseline): 0.990
Adversarial Recall (attack): 0.994


### Multiclass Classifier - cascade (essential feature set)

In [22]:
sma_mcClf, sma_mcPred, sma_mcResult = develop_clf(malicious_train, malicious_test, small_features, clf_name='sma_mc', label='Label_cat', clf_type=base_clf, verbose=1)
sma_mcErr = int(len(malicious_test) * (1-sma_mcResult.acc_multi))

print("Total Misclassifications: {} out of {}".format(sma_mcErr, len(malicious_test)))
pd.crosstab(malicious_test['Label_cat'], sma_mcPred, rownames=['True'], colnames=['Pred'])

Training and testing sma_mc......done! Training time: 0.006969s	Inference time: 0.004983s
Total Misclassifications: 5633 out of 15662


Pred,1,2,3,4,5,6,7
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3374,285,1334,181,528,306,302
2,75,2030,130,0,55,142,123
3,246,55,301,43,161,161,191
4,9,0,7,265,3,1,0
5,198,78,139,20,3720,155,152
6,21,33,68,19,19,149,124
7,13,39,89,0,10,119,189


In [23]:
# We select the samples flagged as malicious by the initial classifier.
# Of course, samples flagged as malicious that are NOT actually malicious will always be misclassified

all_test['sma_bPred'] = sma_bPred
sma_mc_test = all_test[(all_test['sma_bPred']==1) & (all_test['Nature']==1)]
if (len(sma_mc_test)==0):
    # in this case, this classifier receives nothing
    print("There is no malicious sample flagged as malicious to analyze!")

sma_mcPred_m = sma_mcClf.predict(sma_mc_test[small_features])
sma_mcResult.acc_multic = accuracy_score(sma_mc_test['Label_cat'], sma_mcPred_m, normalize=True, sample_weight=None)
sma_mcErr_m = int((1-sma_mcResult.acc_multic) * len(sma_mc_test))
print("Total Misclassifications (among the malicious samples): {} out of {}".format(sma_mcErr_m, len(sma_mc_test)))




pd.crosstab(sma_mc_test['Label_cat'], sma_mcPred_m, rownames=['True'], colnames=['Pred'])

Total Misclassifications (among the malicious samples): 5529 out of 15433


Pred,1,2,3,4,5,6,7
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3276,285,1282,181,521,306,301
2,75,2030,130,0,55,142,123
3,225,55,290,43,156,161,190
4,9,0,7,265,3,1,0
5,195,78,131,20,3704,155,150
6,20,33,65,19,19,149,124
7,13,39,89,0,10,119,189


In [24]:
## Note: We also accounted for the false positives of the first binary classifier (all of which have been considered as misclassifications)
sma_bin_falsePositives = int(sma_bResult.fpr * len(benign_test))
print("This classifier also analyzed {} benign samples that were incorrectly labelled as 'malicious' by the (small) binary classifier".format(sma_bin_falsePositives))

print("Hence, this (small) classifier was tested on {} samples, of which {} have been misclassified".format(len(sma_mc_test)+sma_bin_falsePositives, sma_bin_falsePositives+sma_mcErr_m))

This classifier also analyzed 7721 benign samples that were incorrectly labelled as 'malicious' by the (small) binary classifier
Hence, this (small) classifier was tested on 23154 samples, of which 13250 have been misclassified


### Multiclass Classifier - stand-alone (essential feature set)

In [25]:
sma_mClf, sma_mPred, sma_mResult = develop_clf(all_train, all_test, small_features, clf_name='sma_m', label='Label_cat', clf_type=base_clf, verbose=1)
sma_mErr = int(len(all_test) * (1-sma_mResult.acc_multi))

print("Total Misclassifications: {} out of {}".format(sma_mErr, len(all_test)))
sma_mResult.ctab

Training and testing sma_m......done! Training time: 0.006987s	Inference time: 0.021924s
Total Misclassifications: 15038 out of 115662


Pred,0,1,2,3,4,5,6,7
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,90688,3199,21,4618,203,1082,106,83
1,313,3337,315,988,183,554,307,313
2,0,73,2032,127,0,56,143,124
3,46,235,55,267,47,159,158,191
4,0,13,0,7,262,3,0,0
5,0,209,86,135,24,3697,159,152
6,2,27,33,63,12,22,149,125
7,0,13,39,87,0,10,119,191


#### Multiclass Classifier: Binary

In [26]:
########## MULTI-CLASS CLASSIFIER - BINARY ##########
sma_mPred_bin = np.copy(sma_mPred)
sma_mPred_bin[sma_mPred_bin > 0] = 1
sma_mResult.bin_results(all_test['Nature'], sma_mPred_bin)
sma_mErr_bin = int(len(all_test) * (1-sma_mResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_mErr_bin, len(all_test), sma_mResult.rec, sma_mResult.fpr))
sma_mResult.ctab_bin


Total Misclassifications: 9672 out of 115662 (Recall: 0.976951	FPR: 0.093120)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,90688,9312
1,361,15301


#### Adversarial Attack against the Multiclass Classifier (binary)

In [27]:
adv_mPred_base = sma_mClf.predict(mal_base[small_features])
adv_mPred_adv = sma_mClf.predict(mal_adv[small_features])

adv_mPred_base_bin = np.copy(adv_mPred_base)
adv_mPred_base_bin[adv_mPred_base_bin > 0] = 1

adv_mPred_adv_bin = np.copy(adv_mPred_adv)
adv_mPred_adv_bin[adv_mPred_adv_bin > 0] = 1


adv_multi_base_rec =  recall_score(mal_base['Nature'], adv_mPred_base_bin)
adv_multi_adv_rec = recall_score(mal_adv['Nature'], adv_mPred_adv_bin)

print("Adversarial Recall (baseline): {:.3f}".format(adv_multi_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_multi_adv_rec))

Adversarial Recall (baseline): 0.993
Adversarial Recall (attack): 0.993


## Ensemble Classifiers (essential feature set)

In [28]:
sma_ensemble_df = pd.DataFrame()
adv_ensemble_df_base = pd.DataFrame()
adv_ensemble_df_adv = pd.DataFrame()

benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]


sma_ens_time = 0
sma_ens_avgFPR = 0
sma_tot_TP = 0
sma_tot_P = 0
sma_ens_infer_time = 0
sma_fakeEns_infer_time = 0

for a in attack_names:
    exec(f"{a}_train = {a}_df[{a}_df['is_train']==True]")
    exec(f"{a}_test = {a}_df[{a}_df['is_test']==True]")

    exec(f"train = pd.concat([benign_train, {a}_train])")
    exec(f"test = pd.concat([benign_test, {a}_test])")

    exec(f"sma_{a}Clf, sma_{a}Pred, sma_{a}Result = develop_clf(train, test, small_features, clf_name='sma_{a}', clf_type=base_clf, verbose=1)")
    exec(f"sma_fakeEns_infer_time += sma_{a}Result.infer_time")
    
    exec(f"sma_{a}_allPred, sma_{a}_allResults, sma_{a}Result.infer_time = evaluate_clf(sma_{a}Clf, all_test, small_features, clf_name='sma_{a}', time=sma_{a}Result.time, verbose=1)")
    exec(f"sma_ensemble_df['{a}'] = sma_{a}_allPred")

    exec(f"adv_{a}Pred_base = sma_{a}Clf.predict(mal_base[small_features])")
    exec(f"adv_{a}Pred_adv = sma_{a}Clf.predict(mal_adv[small_features])")
    exec(f"adv_ensemble_df_base['{a}'] = adv_{a}Pred_base")
    exec(f"adv_ensemble_df_adv['{a}'] = adv_{a}Pred_adv")



    exec(f"sma_tot_TP += (sma_{a}Result.rec * len({a}_test))")
    exec(f"sma_tot_P += len({a}_test)")


    exec(f"sma_ens_avgFPR += sma_{a}Result.fpr")
    exec(f"sma_ens_time += sma_{a}Result.time")
    exec(f"sma_ens_infer_time +=sma_{a}Result.infer_time")



sma_ens_avgFPR = sma_ens_avgFPR / len(attack_names)
sma_ens_avgREC = sma_tot_TP/sma_tot_P


print("Total training time: {:5f}s\t AvgFPR: {:5f}\t AvgTPR: {:5f}\tTotal inference time: {:5f}s (fake: {:5f}s)".
      format(sma_ens_time, sma_ens_avgFPR, sma_ens_avgREC, sma_ens_infer_time, sma_fakeEns_infer_time))

Training and testing sma_expl......done! Training time: 0.003992s	Inference time: 0.017864s
Testing sma_expl...
...done! 	Inference time: 0.019932s
Training and testing sma_recon......done! Training time: 0.003988s	Inference time: 0.013952s
Testing sma_recon...
...done! 	Inference time: 0.018929s
Training and testing sma_dos......done! Training time: 0.003987s	Inference time: 0.016934s
Testing sma_dos...
...done! 	Inference time: 0.017939s
Training and testing sma_shell......done! Training time: 0.003979s	Inference time: 0.014958s
Testing sma_shell...
...done! 	Inference time: 0.017938s
Training and testing sma_fuzz......done! Training time: 0.003990s	Inference time: 0.016016s
Testing sma_fuzz...
...done! 	Inference time: 0.016943s
Training and testing sma_bdoor......done! Training time: 0.003903s	Inference time: 0.013960s
Testing sma_bdoor...
...done! 	Inference time: 0.016943s
Training and testing sma_ana......done! Training time: 0.004001s	Inference time: 0.013935s
Testing sma_ana..

### Computing real Ensemble

In [29]:
sma_ensemble_df["sum"] = sma_ensemble_df.sum(axis=1)
sma_ensemble_df["LOR"] = (sma_ensemble_df["sum"]>0)
temp = all_test['Nature'] 
sma_ensemble_df['True'] = ((temp.reset_index(drop=True)) > 0)

adv_ensemble_df_base["sum"] = adv_ensemble_df_base.sum(axis=1)
adv_ensemble_df_base["LOR"] = (adv_ensemble_df_base["sum"]>0)
temp = mal_base['Nature']
adv_ensemble_df_base['True'] = ((temp.reset_index(drop=True)) > 0)


adv_ensemble_df_adv["sum"] = adv_ensemble_df_adv.sum(axis=1)
adv_ensemble_df_adv["LOR"] = (adv_ensemble_df_adv["sum"]>0)
temp = mal_adv['Nature']
adv_ensemble_df_adv['True'] = ((temp.reset_index(drop=True)) > 0)

### Ensemble: Logical OR (essential feature set)

In [30]:
sma_enslorResult = Result(sma_ensemble_df['True'], sma_ensemble_df['LOR'], sma_ens_time, sma_ens_infer_time)
sma_enslorErr= int(len(all_test) * (1-sma_enslorResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_enslorErr, len(all_test), sma_enslorResult.rec, sma_enslorResult.fpr))
sma_enslorResult.ctab_bin # you can also try with sma_enslorResult.ctab

Total Misclassifications: 6246 out of 115662 (Recall: 0.988124	FPR: 0.060600)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,93940,6060
True,186,15476


#### Logical OR: Adversarial Attack

In [31]:
adv_enslor_base_rec = recall_score(mal_base['Nature'], adv_ensemble_df_base["LOR"])
adv_enslor_adv_rec = recall_score(mal_adv['Nature'], adv_ensemble_df_adv["LOR"])

print("Adversarial Recall (baseline): {:.3f}".format(adv_enslor_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_enslor_adv_rec))

Adversarial Recall (baseline): 0.993
Adversarial Recall (attack): 0.992


### Ensemble: Majority Voting (essential feature set)

In [32]:
sma_ensemble_df["MAJV"] = (sma_ensemble_df["sum"]>=min_agree)
sma_ensvotResult = Result(sma_ensemble_df['True'], sma_ensemble_df['MAJV'], sma_ens_time, sma_ens_infer_time)
sma_ensvotErr = int(len(all_test) * (1-sma_ensvotResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_ensvotErr, len(all_test), sma_ensvotResult.rec, sma_ensvotResult.fpr))
sma_ensvotResult.ctab_bin # you can also try with sma_ensvotResult.ctab

Total Misclassifications: 5241 out of 115662 (Recall: 0.708530	FPR: 0.006760)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,99324,676
True,4565,11097


#### Majority Voting: Adversarial Attack

In [33]:
adv_ensemble_df_base["MAJV"] = (adv_ensemble_df_base["sum"]>=min_agree)
adv_ensemble_df_adv["MAJV"] = (adv_ensemble_df_adv["sum"]>=min_agree)

adv_ensvot_base_rec = recall_score(mal_base['Nature'], adv_ensemble_df_base["MAJV"])
adv_ensvot_adv_rec = recall_score(mal_adv['Nature'], adv_ensemble_df_adv["MAJV"])

print("Adversarial Recall (baseline): {:.3f}".format(adv_ensvot_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_ensvot_adv_rec))

Adversarial Recall (baseline): 0.957
Adversarial Recall (attack): 0.002


### Ensemble: Stacked Classifier (essential feature set)

In [34]:
clf_list = []
for a in attack_names:
    exec(f"clf_list.append(sma_{a}Clf)")

sma_sClf = StackingClassifier(classifiers=clf_list, meta_classifier=meta, fit_base_estimators=False, use_probas = False)
s_timeStart = time.time()
sma_sClf.fit(all_train[small_features], all_train['Nature'])
sma_s_time = time.time() - s_timeStart + sma_ens_time
s_timeStart = time.time()
sma_sPred = sma_sClf.predict(all_test[small_features])
sma_s_infer_time = time.time()-s_timeStart
sma_sResult = Result(all_test['Nature'], sma_sPred, sma_s_time, sma_s_infer_time)
if sma_sResult.acc < sma_sResult.acc_multi:
    sma_sResult.acc = sma_sResult.acc_multi
sma_sErr = int(len(all_test) * (1-sma_sResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_sErr, len(all_test), sma_sResult.rec, sma_sResult.fpr))

sma_sResult.ctab_bin # you can also try with sma_sResult.ctab

Total Misclassifications: 6187 out of 115662 (Recall: 0.985570	FPR: 0.059620)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,94038,5962
1,226,15436


#### Stacked Classifier: Adversarial Attack

In [35]:
adv_sPred_base = sma_sClf.predict(mal_base[small_features])
adv_sPred_adv = sma_sClf.predict(mal_adv[small_features])

adv_ensstk_base_rec =  recall_score(mal_base['Nature'], adv_sPred_base)
adv_ensstk_adv_rec = recall_score(mal_adv['Nature'], adv_sPred_adv)

print("Adversarial Recall (baseline): {:.3f}".format(adv_ensstk_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_ensstk_adv_rec))

Adversarial Recall (baseline): 0.987
Adversarial Recall (attack): 0.866


# DONE!

In [36]:
# You can now inspect the results by referring to the "Result" variables. 

In [37]:
## BASELINE RESULTS (on Complete Feature Set)
print(
    "TPR",
    bResult.rec,
    mResult.rec,
    ens_avgREC,
    enslorResult.rec,
    ensvotResult.rec,
    sResult.rec,
    "\nFPR",
    bResult.fpr,
    mResult.fpr,
    ens_avgFPR,
    enslorResult.fpr,
    ensvotResult.fpr,
    sResult.fpr,
    "\nTraining Time",
    bResult.time,
    mResult.time,
    ens_time,
    enslorResult.time,
    ensvotResult.time,
    sResult.time,
    "\nInference Time",
    bResult.infer_time,
    mResult.infer_time,
    fakeEns_infer_time,
    enslorResult.infer_time,
    ensvotResult.infer_time,
    sResult.infer_time, 
    "\nAccuracy",
    mResult.acc_multi,   # This is the accuracy on the multiclassification
    mResult.acc,         # This is the accuracy on the binary classification
    mcResult.acc_multic, # This is the accuracy on the multiclassification AFTER the output of the binary classifier (it does not account for benign samples, which are false positives)
    mcResult.acc_multi   # This is the accuracy on the multiclassification on the whole test portion of the malicious dataset
)

TPR 0.9983399310432894 0.9959136764142511 0.9980845358191802 0.999808453581918 0.9524964883156685 0.999808453581918 
FPR 0.008879999999999999 0.018789999999999973 0.006981428571428551 0.009179999999999966 0.00814999999999999 0.008850000000000025 
Training Time 0.0059814453125 0.008982419967651367 0.026004314422607422 0.026004314422607422 0.026004314422607422 0.038960933685302734 
Inference Time 0.03196406364440918 0.035875797271728516 0.20421147346496582 0.21382498741149902 0.21382498741149902 0.1534864902496338 
Accuracy 0.934360464110944 0.9832010513392471 0.6320030698388335 0.6320393308645128


In [38]:
## BASELINE RESULTS (on Essential Feature Set)
print(
    "TPR",
    sma_bResult.rec,
    sma_mResult.rec,
    sma_ens_avgREC,
    sma_enslorResult.rec,
    sma_ensvotResult.rec,
    sma_sResult.rec,
    "\nFPR",
    sma_bResult.fpr,
    sma_mResult.fpr,
    sma_ens_avgFPR,
    sma_enslorResult.fpr,
    sma_ensvotResult.fpr,
    sma_sResult.fpr,
    "\nTraining Time",
    sma_bResult.time,
    sma_mResult.time,
    sma_ens_time,
    sma_enslorResult.time,
    sma_ensvotResult.time,
    sma_sResult.time,
    "\nInference Time",
    sma_bResult.infer_time,
    sma_mResult.infer_time,
    sma_fakeEns_infer_time,
    sma_enslorResult.infer_time,
    sma_ensvotResult.infer_time,
    sma_sResult.infer_time, 
    "\nAccuracy",
    sma_mResult.acc_multi,   # This is the accuracy on the multiclassification
    sma_mResult.acc,         # This is the accuracy on the binary classification
    sma_mcResult.acc_multic, # This is the accuracy on the multiclassification AFTER the output of the binary classifier (it does not account for benign samples, which are false positives)
    sma_mcResult.acc_multi   # This is the accuracy on the multiclassification on the whole test portion of the malicious dataset
)

TPR 0.985378623419742 0.9769505810241349 0.9665432256416805 0.9881241220789171 0.708530200485251 0.9855701698378241 
FPR 0.07721 0.09311999999999998 0.015475714285714306 0.06059999999999999 0.006759999999999988 0.059620000000000006 
Training Time 0.005980491638183594 0.006987333297729492 0.027839183807373047 0.027839183807373047 0.027839183807373047 0.04079079627990723 
Inference Time 0.01904892921447754 0.021924257278442383 0.10761857032775879 0.12649106979370117 0.12649106979370117 0.09367752075195312 
Accuracy 0.8699745811070188 0.9163683837388252 0.6416769260675177 0.6402758268420381


In [39]:
## Open World: One attack against all (averaged results)

print('''Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      BD: TPR={:5f}\tFPR={:5f}
      MD (binarized) CLF: TPR={:5f}\tFPR={:5f}
      ED-o: TPR={:5f}\tFPR={:5f}
      ED-v: TPR={:5f}\tFPR={:5f}
      ED-s: TPR={:5f}\tFPR={:5f}
      '''.format(oaac_bin_rec, oaac_bin_fpr,
                 oaac_multi_rec, oaac_multi_fpr,
                 oaac_enslor_rec, oaac_enslor_fpr,
                 oaac_ensvot_rec, oaac_ensvot_fpr,
                 oaac_ensstk_rec, oaac_ensstk_fpr))

Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      BD: TPR=0.996689	FPR=0.009289
      MD (binarized) CLF: TPR=0.996520	FPR=0.009903
      ED-o: TPR=0.999932	FPR=0.009093
      ED-v: TPR=0.837641	FPR=0.006321
      ED-s: TPR=0.999580	FPR=0.008903
      


In [40]:
## Adversarial Attacks 

print(
    "BD (before, after):",
    adv_bin_base_rec,
    adv_bin_adv_rec,
    "\nMD  (before, after):",
    adv_multi_base_rec,
    adv_multi_adv_rec,
    "\nED-o  (before, after):",
    adv_enslor_base_rec,
    adv_enslor_adv_rec, 
    "\nED-v  (before, after):",
    adv_ensvot_base_rec,
    adv_ensvot_adv_rec, 
    "\nED-s  (before, after):",
    adv_ensstk_base_rec, 
    adv_ensstk_adv_rec,
)

BD (before, after): 0.9900432900432901 0.9939393939393939 
MD  (before, after): 0.9926406926406927 0.9926406926406927 
ED-o  (before, after): 0.9926406926406927 0.9917748917748918 
ED-v  (before, after): 0.9571428571428572 0.0021645021645021645 
ED-s  (before, after): 0.9874458874458875 0.8662337662337662
