In [1]:
import os
import sys
from supportFunctions import *

root_folder = "..\\data\\IDS17\\flows\\"

In [2]:
#### PARAMETERS #####

## Generic parameters
temporal = True # used to determine if this evaluation assumes a "temporal" dependency among its samples
base_clf = 'dt' # name of the classifier used for this "run". Available names: ['dt', 'rf', 'hgb', 'lr']. You can add more by editing the supportFunctions file
test_size = 0.2 # proportion of the dataset used for testing. We always kept it fixed to 0.2 for our paper
train_size = 100 # proportion of the REMAINING data that are used for training (if >1, then it will take that exact amount). To reproduce the results of the paper, use: 100 (for "limited" training data) or 0.2 or 0.5 or 0.99 (for scarce, moderate, abundant training data, respectively) 
agreement = 0.5 # from 0 to 1. Proportion of classifiers that must agree on an attack (for the ensemble). This is fixed in our paper.
max_size = 500000 ## maximum amount of samples to include when creating the initial dataframes. This is fixed in our paper
max_size_atk = int(max_size / 3) # maximum amount of malicious samples per class. This is fixed in our paper

## Adversarial Attacks parameters
# This attack involves TCP because no malicious UDP flows exist
atk_intensity = 1 # send 100 packets of 10 bytes over 100 seconds
pkt_intensity = atk_intensity * 10 # 
byt_intensity = pkt_intensity * 100 # 
dur_intensity = atk_intensity * 100 # consider seconds

In [3]:
### Reading input data

malicious_folder = root_folder + "malicious/"

benign_file = root_folder + "benign.csv"
benign_df = pd.read_csv(benign_file, header='infer', index_col=0)
benign_df = benign_df.sample(min(max_size, len(benign_df)))
#sort by timestamp
if temporal == True:
    benign_df = benign_df.sort_values(by=['Timestamp'])
benign_df.reset_index(inplace=True, drop=True)

attack_names = ["ddos", "geye", "hulk", "http", "loris", "ftp", "pscan", "ssh", "other"]

ddos_file = malicious_folder + "dos-ddos.csv"
geye_file = malicious_folder + "dos-goldeneye.csv"
hulk_file = malicious_folder + "dos-hulk.csv"
http_file = malicious_folder + "dos-slowhttp.csv"
loris_file = malicious_folder + "dos-slowloris.csv"
ftp_file = malicious_folder + "ftp-patator.csv"
pscan_file = malicious_folder + "portscan.csv"
ssh_file = malicious_folder + "ssh-patator.csv"
other_file = malicious_folder + "other.csv"



for a in attack_names:
    exec(f"{a}_df = pd.read_csv({a}_file, header='infer', index_col=0)")
    exec(f"{a}_df = {a}_df.sample(min(max_size_atk, len({a}_df)))")
    # sort by timestamp
    if temporal == True:
        exec(f"{a}_df = {a}_df.sort_values(by=['Timestamp'])")
    exec(f"{a}_df.reset_index(inplace=True, drop=True)")
    exec(f"{a}_df['Label'] = a")

In [4]:
# Determining Train and Test sets for each class

df_list = [benign_df]
for a in attack_names:
    exec(f"df_list.append({a}_df)")

if temporal == True:
    for dummy_df in df_list:
        if train_size <=1:
            train_threshold = int(((1-test_size) * train_size) * len(dummy_df))
        else:
            train_threshold = int(100)
        test_threshold = len(dummy_df) - int(test_size * len(dummy_df))
        dummy_df['index'] = dummy_df.index
        dummy_df['is_test'] = np.where(dummy_df['index'] >= test_threshold , True, False)
        dummy_df['is_train'] = np.where(dummy_df['index'] <= train_threshold , True, False)
else:
    for dummy_df in df_list:
        if train_size <= 1:
            train_threshold = test_size + (1-test_size)*train_size
        else:
            train_threshold = test_size + ((train_size * 100) / (len(dummy_df)) / 100)       
        dummy_df['seed'] = (np.random.uniform(0,1,len(dummy_df)))
        dummy_df['is_test'] = np.where(dummy_df['seed'] <= test_size, True, False)
        dummy_df['is_train'] = np.where((dummy_df['seed'] <= train_threshold) & (dummy_df['is_test']==False), True, False)

# get all together
all_df = pd.concat(df_list)

In [5]:
def handle_categorical(df):
    ## Handling categorical data
    df_dummy = df.copy(deep=True)
    df_dummy['Nature'] = np.where(df_dummy['Label'].str.contains('BENIGN'),0,1)
    
    for column_name in df_dummy.columns:
        if column_name == ('SrcPort_type'):
            df_dummy[column_name] = pd.factorize(df_dummy[column_name])[0]
        elif column_name == ('DstPort_type'):
            df_dummy[column_name] = pd.factorize(df_dummy[column_name])[0]
        elif column_name == ('Protocol'):
            df_dummy[column_name+"-f"] = pd.factorize(df_dummy[column_name])[0]
        else:
            pass
    return df_dummy

all_df = handle_categorical(all_df)
all_df['Label_cat'] = pd.factorize(all_df['Label'])[0]

all_df['int2int'] = np.where( ((all_df['SrcIP_internal']==True) & (all_df['DstIP_internal']==True)), True, False)

all_df = all_df[~all_df['Label_original'].str.contains("Attempted")] # removing "attempted"

all_df['Duration(s)'] = all_df['FlowDuration'] / 1000000
# all_df['totPkt'] = all_df['FlowPackets/s'] * all_df['Duration(s)']
# all_df['totByt'] = all_df['FlowBytes/s'] * all_df['Duration(s)']

all_df['DstPkt'] = all_df['BwdPackets/s'] * all_df['Duration(s)']
all_df['SrcPkt'] = all_df['FwdPackets/s'] * all_df['Duration(s)']

all_df['DstByt'] = all_df['DstPkt'] * all_df['BwdSegmentSizeAvg']
all_df['SrcByt'] = all_df['SrcPkt'] * all_df['FwdSegmentSizeAvg']

all_df['totPkt'] = all_df['SrcPkt'] + all_df['DstPkt']
all_df['totByt'] = all_df['SrcByt'] + all_df['DstByt']



all_train, all_test = all_df[all_df['is_train']==True], all_df[all_df['is_test']==True]

### SPLITTING ALL BACK ####
benign_df = all_df[all_df['Label']=='BENIGN']
benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]
for a in attack_names:
    exec(f"{a}_df = all_df[all_df['Label']=='{a}']")
    
malicious_df = all_df[all_df['Label']!='BENIGN']
malicious_train, malicious_test = malicious_df[malicious_df['is_train']==True], malicious_df[malicious_df['is_test']==True]


print("& 0 & \\textit{{Benign}} & {} & \\\\ \\cline{{2-4}}".format(len(benign_df)))


for i,a in enumerate(attack_names):
    exec(f"print('& {i+1} & \\\\textit{{{{{a}}}}} & {{}} \\\\\\\\ \\\\cline{{{{2-4}}}}'.format(len({a}_df)))")

& 0 & \textit{Benign} & 497220 & \\ \cline{2-4}
& 1 & \textit{ddos} & 95123 \\ \cline{2-4}
& 2 & \textit{geye} & 7567 \\ \cline{2-4}
& 3 & \textit{hulk} & 158469 \\ \cline{2-4}
& 4 & \textit{http} & 1742 \\ \cline{2-4}
& 5 & \textit{loris} & 4001 \\ \cline{2-4}
& 6 & \textit{ftp} & 3973 \\ \cline{2-4}
& 7 & \textit{pscan} & 159151 \\ \cline{2-4}
& 8 & \textit{ssh} & 2980 \\ \cline{2-4}
& 9 & \textit{other} & 971 \\ \cline{2-4}


In [6]:
## Feature sets

# the following is the "complete" feature set

features = ['Protocol-f',
       'FlowDuration', 
       'FwdPacketLengthMax', 'FwdPacketLengthMin', 'FwdPacketLengthMean',
       'FwdPacketLengthStd', 'BwdPacketLengthMax', 'BwdPacketLengthMin',
       'BwdPacketLengthMean', 'BwdPacketLengthStd', 'FlowBytes/s',
       'FlowPackets/s', 'FlowIATMean', 'FlowIATStd', 'FlowIATMax',
       'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax',
       'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax',
       'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
       'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s', 'BwdPackets/s',
       'PacketLengthMin', 'PacketLengthMax', 'PacketLengthMean',
       'PacketLengthStd', 'PacketLengthVariance', 'FINFlagCount',
       'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount', 'ACKFlagCount',
       'URGFlagCount', 'CWRFlagCount', 'ECEFlagCount', 'Down/UpRatio',
       'AveragePacketSize', 'FwdSegmentSizeAvg', 'BwdSegmentSizeAvg',
       'FwdBytes/BulkAvg', 'FwdPacket/BulkAvg', 'FwdBulkRateAvg',
       'BwdBytes/BulkAvg', 'BwdPacket/BulkAvg', 'BwdBulkRateAvg',
       'SubflowFwdPackets', 'SubflowFwdBytes', 'SubflowBwdPackets',
       'SubflowBwdBytes', 'FWDInitWinBytes', 'BwdInitWinBytes',
       'FwdActDataPkts', 'FwdSegSizeMin', 'ActiveMean', 'ActiveStd',
       'ActiveMax', 'ActiveMin', 'IdleMean', 'IdleStd', 'IdleMax', 'IdleMin',
       'SrcPort_type', 'DstPort_type',
       'int2int'
       
       ]

# this is for the "essential" feature set
small_features = ['Protocol-f', 'Duration(s)', 'totPkt', 'totByt',
                'DstPkt', 'SrcPkt', 'DstByt', 'SrcByt', 'SrcPort_type', 
                  'DstPort_type', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
                  'FINFlagCount',
       'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount', 'ACKFlagCount',
       'URGFlagCount', 'CWRFlagCount', 'ECEFlagCount',
                  'int2int'
                 ]

In [7]:
# creating adversarial dataset
mal_base = malicious_df[((malicious_df['Protocol']==6)) & (malicious_df['is_test']==True)]
mal_adv = mal_base.copy(deep=True)
print(len(mal_base))



# attacking
max_dur = mal_adv['Duration(s)'].max()
min_dur = mal_adv['Duration(s)'].min()
mal_adv['Duration(s)'] = mal_adv['Duration(s)'] + dur_intensity # we increase the duration in seconds
mal_adv['Duration(s)'] = np.where(mal_adv['Duration(s)'] > max_dur, max_dur, mal_adv['Duration(s)'])
mal_adv['Duration(s)'] = np.where(mal_adv['Duration(s)'] < min_dur, min_dur, mal_adv['Duration(s)'])

mal_adv['DstPkt'] = mal_adv['DstPkt'] + pkt_intensity
mal_adv['SrcPkt'] = mal_adv['SrcPkt'] + (pkt_intensity * 2)
mal_adv['DstByt'] = mal_adv['DstByt'] + (pkt_intensity)
mal_adv['SrcByt'] = mal_adv['SrcByt'] + byt_intensity 

mal_adv['SYNFlagCount'] = mal_adv['SYNFlagCount'] + pkt_intensity
mal_adv['ACKFlagCount'] = mal_adv['SYNFlagCount'] + pkt_intensity
mal_adv['totPkt'] = mal_adv['SrcPkt'] + mal_adv['DstPkt']
mal_adv['totByt'] = mal_adv['SrcByt'] + mal_adv['DstByt']

86789


# FROM NOW ON, THE CODE IS ALWAYS THE SAME FOR EVERY DATASET!!!!!!

# Baseline: Assessment on "Complete" feature set

## BINARY CLASSIFIER (Complete features)

In [8]:
bClf, bPred, bResult = develop_clf(all_train, all_test, features, clf_name='bin', label='Nature', clf_type=base_clf, verbose=1)

if (bResult.acc == 0):
    bErr = int(len(all_test) * (1-bResult.acc_multi))
else:
    bErr = int(len(all_test) * (1-bResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(bErr, len(all_test), bResult.rec, bResult.fpr))
pd.crosstab(all_test['Nature'], bPred, rownames=['True'], colnames=['Pred'])

Training and testing bin......done! Training time: 0.014950s	Inference time: 0.090706s
Total Misclassifications: 29505 out of 186349 (Recall: 0.999965	FPR: 0.296333)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,70055,29502
1,3,86789


## MULTI-CLASS CLASSIFIER - cascade

In [9]:
# This is the classifier that analyzes ONLY the malicious samples that "receives" from the initial binary classifier
# It is trained on the same training set---but without using the benign samples
# It is tested on the malicious samples in the test set that are flagged as malicious by the binary classifier


mcClf, mcPred, mcResult = develop_clf(malicious_train, malicious_test, features, clf_name='mc', label='Label_cat', clf_type=base_clf, verbose=1)
mcErr = int(len(malicious_test) * (1-mcResult.acc_multi))
print("Total Misclassifications: {} out of {}".format(mcErr, len(malicious_test)))
pd.crosstab(malicious_test['Label_cat'], mcPred, rownames=['True'], colnames=['Pred'])

Training and testing mc......done! Training time: 0.015484s	Inference time: 0.048845s
Total Misclassifications: 1884 out of 86792


Pred,1,2,3,4,5,6,7,8,9
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,19003,0,0,0,0,21,0,0,0
2,0,1512,0,0,0,0,0,0,1
3,0,531,31137,0,7,0,4,0,14
4,0,127,1,203,16,0,0,0,1
5,0,59,0,591,1,41,92,0,16
6,0,0,0,0,0,794,0,0,0
7,36,19,18,0,18,59,31680,0,0
8,0,2,0,0,0,17,0,577,0
9,0,1,0,0,178,0,0,15,0


In [10]:
# We select the samples flagged as malicious by the initial classifier.
# Of course, samples flagged as malicious that are NOT actually malicious will always be misclassified

all_test['bPred'] = bPred
mc_test = all_test[(all_test['bPred']==1) & (all_test['Nature']==1)]
if (len(mc_test)==0):
    # in this case, this classifier receives nothing
    print("There is no malicious sample flagged as malicious to analyze!")

mcPred_m = mcClf.predict(mc_test[features])
mcResult.acc_multic = accuracy_score(mc_test['Label_cat'], mcPred_m, normalize=True, sample_weight=None)
mcErr_m = int((1-mcResult.acc_multic) * len(mc_test))
print("Total Misclassifications (among the malicious samples): {} out of {}".format(mcErr_m, len(mc_test)))




pd.crosstab(mc_test['Label_cat'], mcPred_m, rownames=['True'], colnames=['Pred'])

Total Misclassifications (among the malicious samples): 1884 out of 86789


Pred,1,2,3,4,5,6,7,8,9
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,19003,0,0,0,0,21,0,0,0
2,0,1512,0,0,0,0,0,0,1
3,0,531,31137,0,7,0,4,0,14
4,0,127,1,203,16,0,0,0,1
5,0,59,0,591,1,41,92,0,16
6,0,0,0,0,0,794,0,0,0
7,36,19,18,0,18,59,31677,0,0
8,0,2,0,0,0,17,0,577,0
9,0,1,0,0,178,0,0,15,0


In [11]:
## Note: We also accounted for the false positives of the first binary classifier (all of which have been considered as misclassifications)
bin_falsePositives = int(bResult.fpr * len(benign_test))
print("This classifier also analyzed {} benign samples that were incorrectly labelled as 'malicious' by the binary classifier".format(bin_falsePositives))

print("Hence, this classifier was tested on {} samples, of which {} have been misclassified".format(len(mc_test)+bin_falsePositives, bin_falsePositives+mcErr_m))

This classifier also analyzed 29502 benign samples that were incorrectly labelled as 'malicious' by the binary classifier
Hence, this classifier was tested on 116291 samples, of which 31386 have been misclassified


## MULTI-CLASS CLASSIFIER - stand-alone

In [12]:
# We first assess its multiclassification performance, and then its binary classification performance

mClf, mPred, mResult = develop_clf(all_train, all_test, features, clf_name='m', label='Label_cat', clf_type=base_clf, verbose=1)
mErr = int(len(all_test) * (1-mResult.acc_multi))
print("Total Misclassifications: {} out of {}".format(mErr, len(all_test)))
mResult.ctab

Training and testing m......done! Training time: 0.017932s	Inference time: 0.094651s
Total Misclassifications: 17414 out of 186349


Pred,0,1,2,3,4,5,6,7,8,9
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,83531,1,3885,139,3193,1018,229,5032,535,1994
1,0,19003,0,0,0,0,0,21,0,0
2,8,0,1504,0,0,0,0,0,0,1
3,40,0,6,31646,0,0,0,0,0,1
4,142,0,1,0,203,1,0,0,0,1
5,105,0,0,1,591,0,0,90,0,13
6,0,0,0,0,0,0,794,0,0,0
7,38,36,0,18,0,0,20,31718,0,0
8,3,0,0,0,0,0,0,16,536,41
9,192,0,0,0,2,0,0,0,0,0


In [13]:
# For the binary classification performance, we use the previous predictions
mPred_bin = np.copy(mPred)
mPred_bin[mPred_bin > 0] = 1
mResult.bin_results(all_test['Nature'], mPred_bin)
mErr_bin = int(len(all_test) * (1-mResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(mErr_bin, len(all_test), mResult.rec, mResult.fpr))
mResult.ctab_bin

Total Misclassifications: 16553 out of 186349 (Recall: 0.993916	FPR: 0.160973)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,83531,16026
1,528,86264


## ENSEMBLE CLASSIFIERS

### Training "individual" binary classifiers

In [14]:
ensemble_df = pd.DataFrame()

benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]


ens_time = 0
ens_avgFPR = 0
tot_TP = 0
tot_P = 0
ens_infer_time = 0
fakeEns_infer_time = 0

for a in attack_names:
    exec(f"{a}_train = {a}_df[{a}_df['is_train']==True]")
    exec(f"{a}_test = {a}_df[{a}_df['is_test']==True]")

    exec(f"train = pd.concat([benign_train, {a}_train])")
    exec(f"test = pd.concat([benign_test, {a}_test])")
    
    # We first train a classifier only on "benign" or on malicious samples of a specific attack. 
    # Afterwards, we immediately test it on a test-set having ONLY malicious samples of this specific attack
    # Note: such "testing" is redundant, because it assumes that the classifier only receives the samples of the attack it is trained on!
    exec(f"{a}Clf, {a}Pred, {a}Result = develop_clf(train, test, features, clf_name='{a}', clf_type=base_clf, verbose=1)")
    exec(f"fakeEns_infer_time += {a}Result.infer_time")
    # We now test the specific classifier on the ENTIRE test-set, thereby allowing to assess its performance also against malicious samples of different attacks
    exec(f"{a}_allPred, {a}_allResults, {a}Result.infer_time = evaluate_clf({a}Clf, all_test, features, clf_name='{a}', time={a}Result.time, verbose=1)")

    exec(f"ensemble_df['{a}'] = {a}_allPred")

    exec(f"tot_TP += ({a}Result.rec * len({a}_test))")
    exec(f"tot_P += len({a}_test)")


    exec(f"ens_avgFPR += {a}Result.fpr")
    exec(f"ens_time += {a}Result.time")
    exec(f"ens_infer_time +={a}Result.infer_time")

ens_avgFPR = ens_avgFPR / len(attack_names)
ens_avgREC = tot_TP/tot_P

print("Total training time: {:5f}s\t AvgFPR: {:5f}\t AvgTPR: {:5f}\tTotal inference time: {:5f}s (fake: {:5f}s)".
      format(ens_time, ens_avgFPR, ens_avgREC, ens_infer_time, fakeEns_infer_time))

Training and testing ddos......done! Training time: 0.004982s	Inference time: 0.061793s
Testing ddos...
...done! 	Inference time: 0.091693s
Training and testing geye......done! Training time: 0.004984s	Inference time: 0.054816s
Testing geye...
...done! 	Inference time: 0.084718s
Training and testing hulk......done! Training time: 0.004989s	Inference time: 0.082687s
Testing hulk...
...done! 	Inference time: 0.084795s
Training and testing http......done! Training time: 0.004946s	Inference time: 0.047841s
Testing http...
...done! 	Inference time: 0.083690s
Training and testing loris......done! Training time: 0.006015s	Inference time: 0.063755s
Testing loris...
...done! 	Inference time: 0.090018s
Training and testing ftp......done! Training time: 0.004983s	Inference time: 0.048836s
Testing ftp...
...done! 	Inference time: 0.083720s
Training and testing pscan......done! Training time: 0.004987s	Inference time: 0.062785s
Testing pscan...
...done! 	Inference time: 0.083724s
Training and testi

### Ensemble (real assessment)

In [15]:
# Here we measure the combined performance of the entire ensemble
# This is done with a logical or, or for majority voting (regulated by the "agreement" variable)

ensemble_df["sum"] = ensemble_df.sum(axis=1)
#calculating 
ensemble_df["LOR"] = (ensemble_df["sum"]>0)

#Appending Ground Truth
temp = all_test['Nature'] #> 0)
ensemble_df['True'] = ((temp.reset_index(drop=True)) > 0)

### Ensemble: Logical OR

In [16]:
enslorResult = Result(ensemble_df['True'], ensemble_df['LOR'], ens_time, ens_infer_time)
enslorErr= int(len(all_test) * (1-enslorResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(enslorErr, len(all_test), enslorResult.rec, enslorResult.fpr))
enslorResult.ctab_bin # you can also try with enslorResult.ctab

Total Misclassifications: 27528 out of 186349 (Recall: 0.997627	FPR: 0.274436)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,72235,27322
True,206,86586


### Ensemble: Majority Voting

In [17]:
min_agree = math.ceil(agreement * len(attack_names))
print("Voting: at least {} out of {} classifiers must agree that a sample is malicious.".format(min_agree, len(attack_names)))
ensemble_df["MAJV"] = (ensemble_df["sum"]>=min_agree)
ensvotResult = Result(ensemble_df['True'], ensemble_df['MAJV'], ens_time, ens_infer_time)
ensvotErr = int(len(all_test) * (1-ensvotResult.acc))
print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(ensvotErr, len(all_test), ensvotResult.rec, ensvotResult.fpr))
ensvotResult.ctab_bin # you can also try with ensvotResult.ctab

Voting: at least 5 out of 9 classifiers must agree that a sample is malicious.
Total Misclassifications: 36486 out of 186349 (Recall: 0.597256	FPR: 0.015378)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,98026,1531
True,34955,51837


### Ensemble: Stacked Classifier

In [18]:
from mlxtend.classifier import StackingClassifier
clf_list = []
for a in attack_names:
    exec(f"clf_list.append({a}Clf)")

meta = choose_clf(clf_type=base_clf)
sClf = StackingClassifier(classifiers=clf_list, meta_classifier=meta, fit_base_estimators=False, use_probas = False)
s_timeStart = time.time()
sClf.fit(all_train[features], all_train['Nature'])
s_time = time.time() - s_timeStart + ens_time
s_timeStart = time.time()
sPred = sClf.predict(all_test[features])
s_infer_time = time.time()-s_timeStart
sResult = Result(all_test['Nature'], sPred, s_time, s_infer_time)
if sResult.acc < sResult.acc_multi:
    sResult.acc = sResult.acc_multi
sErr = int(len(all_test) * (1-sResult.acc))
print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sErr, len(all_test), sResult.rec, sResult.fpr))

sResult.ctab_bin # you can also try with sResult.ctab

Total Misclassifications: 18845 out of 186349 (Recall: 0.995207	FPR: 0.185120)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81127,18430
1,416,86376


## Open World Assessment: One attack against all classifiers

In [19]:
### The following code is a mixture of everything described insofar.
### We are only focused on TPR and FPR here. We do not care about accuracy, adversarial robustness, or runtime.
### These experiments are also done only on the "Complete" feature set


oaac_bin_rec = 0
oaac_bin_fpr = 0
oaac_multi_rec = 0
oaac_multi_fpr = 0
oaac_enslor_rec = 0
oaac_enslor_fpr = 0
oaac_ensvot_rec = 0
oaac_ensvot_fpr = 0
oaac_ensstk_rec = 0
oaac_ensstk_fpr = 0

for u in attack_names: # u is the unknown attack
    #print(u) # this is the unknown attack
    #exec(f"{u}_test = {u}_df[{u}_df['is_train']==False]") # create test set by putting the "test" samples of u
    exec(f"{u}_test = pd.concat([benign_test, {u}_test])") # add to the test set the "benign" test samples
    exec(f"{u}_train = benign_df[benign_df['is_train']==True]") # compose the "training" set: start by putting the benign "training" samples
    for a in attack_names: 
        # for every attack that is not u, add its training samples to the training set of u
        if a==u:
            continue
        exec(f"{u}_train = pd.concat([{u}_train, {a}_df[{a}_df['is_train']==True]])")


    # We have created the training and testing set. Now we must train and test a binary classifier by following the standard procedure
    ########## BINARY CLASSIFIER ##########
    exec(f"{u}_oaac_bClf, {u}_oaac_bPred, {u}_oaac_bResult = develop_clf({u}_train, {u}_test, features, clf_name='{u}_oaac_bin', label='Nature', clf_type=base_clf)")


    ########## Multiclass CLASSIFIER ########## --> Train, then test only on binary
    exec(f"{u}_oaac_mClf, {u}_oaac_mPred, {u}_oaac_mResult = develop_clf({u}_train, {u}_test, features, clf_name='{u}_oaac_multi', label='Label_cat', clf_type=base_clf)")
    exec(f"{u}_oaac_mPred_bin = np.copy({u}_oaac_mPred)")
    exec(f"{u}_oaac_mPred_bin[{u}_oaac_mPred_bin > 0] = 1")
    exec(f"{u}_oaac_mResult.bin_results({u}_test['Nature'], {u}_oaac_mPred_bin)")


    ######### Ensemble ##############
    # send the samples in TEST to all the classifiers of the ensemble (which are already trained), aside from the one focusing on u
    exec(f"{u}_oaac_ens_df = pd.DataFrame()")
    for a in attack_names:    
        if a==u:
                continue
        exec(f"{a}_{u}Pred, {a}_{u}Results, {a}_{u}_infer_time = evaluate_clf({a}Clf, {u}_test, features, clf_name='{a}_{u}', time={a}Result.time)")
        exec(f"{u}_oaac_ens_df['{a}'] = {a}_{u}Pred")

    # now we have the dataframe with all the predictions, let's see the aggregate results
    exec(f"{u}_oaac_ens_df['sum'] = {u}_oaac_ens_df.sum(axis=1)")
    exec(f"{u}_oaac_ens_df['LOR'] = ({u}_oaac_ens_df['sum']>0)")
    exec(f"temp = {u}_test['Nature'] #> 0)")
    exec(f"{u}_oaac_ens_df['True'] = ((temp.reset_index(drop=True)) > 0)")
    exec(f"{u}_oaac_enslorResult = Result({u}_oaac_ens_df['True'], {u}_oaac_ens_df['LOR'], (ens_time-{u}Result.time), (ens_infer_time-{u}Result.infer_time))")

    # now we consider the majority voting of the ensemble
    exec(f"{u}_oaac_ens_df['MAJV'] = ({u}_oaac_ens_df['sum']>=min_agree)")
    exec(f"{u}_oaac_ensvotResult = Result({u}_oaac_ens_df['True'], {u}_oaac_ens_df['MAJV'], (ens_time-{u}Result.time), (ens_infer_time-{u}Result.infer_time))")

    # finally, let's use the stacking ensemble
    exec(f"{u}_clf_list = []")
    for a in attack_names:
        if a==u:
                continue
        exec(f"{u}_clf_list.append({a}Clf)")
    exec(f"{u}_oaac_sClf = StackingClassifier(classifiers={u}_clf_list, meta_classifier=meta, fit_base_estimators=False, use_probas = False)")
    exec(f"{u}_oaac_sClf.fit({u}_train[features], {u}_train['Nature'])")
    exec(f"{u}_oaac_sPred = {u}_oaac_sClf.predict({u}_test[features])")
    exec(f"{u}_oaac_sResult = Result({u}_test['Nature'], {u}_oaac_sPred, (ens_time-{u}Result.time), (ens_infer_time-{u}Result.infer_time))")

    # Updating results
    exec(f"oaac_bin_rec += {u}_oaac_bResult.rec")
    exec(f"oaac_bin_fpr += {u}_oaac_bResult.fpr")
    exec(f"oaac_multi_rec += {u}_oaac_mResult.rec")
    exec(f"oaac_multi_fpr += {u}_oaac_mResult.fpr")
    exec(f"oaac_enslor_rec += {u}_oaac_enslorResult.rec")
    exec(f"oaac_enslor_fpr += {u}_oaac_enslorResult.fpr")
    exec(f"oaac_ensvot_rec += {u}_oaac_ensvotResult.rec")
    exec(f"oaac_ensvot_fpr += {u}_oaac_ensvotResult.fpr")
    exec(f"oaac_ensstk_rec += {u}_oaac_sResult.rec")
    exec(f"oaac_ensstk_fpr += {u}_oaac_sResult.fpr")

# Finalizing averages
oaac_bin_rec /= len(attack_names)
oaac_bin_fpr /= len(attack_names)
oaac_multi_rec /= len(attack_names)
oaac_multi_fpr /= len(attack_names)
oaac_enslor_rec /= len(attack_names)
oaac_enslor_fpr /= len(attack_names)
oaac_ensvot_rec /= len(attack_names)
oaac_ensvot_fpr /= len(attack_names)
oaac_ensstk_rec /= len(attack_names)
oaac_ensstk_fpr /= len(attack_names)


print('''Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      Binary CLF: TPR={:5f}\tFPR={:5f}
      Multiclass (binarized) CLF: TPR={:5f}\tFPR={:5f}
      EnsLOR CLF: TPR={:5f}\tFPR={:5f}
      EnsVOT CLF: TPR={:5f}\tFPR={:5f}
      EnsSTK CLF: TPR={:5f}\tFPR={:5f}
      '''.format(oaac_bin_rec, oaac_bin_fpr,
                 oaac_multi_rec, oaac_multi_fpr,
                 oaac_enslor_rec, oaac_enslor_fpr,
                 oaac_ensvot_rec, oaac_ensvot_fpr,
                 oaac_ensstk_rec, oaac_ensstk_fpr))

Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      Binary CLF: TPR=0.731778	FPR=0.289414
      Multiclass (binarized) CLF: TPR=0.668178	FPR=0.171463
      EnsLOR CLF: TPR=0.816140	FPR=0.258750
      EnsVOT CLF: TPR=0.175645	FPR=0.010300
      EnsSTK CLF: TPR=0.815693	FPR=0.181915
      


## Assessment on Essential feature set (and Adversarial attacks)

### BINARY CLASSIFIER (Essential features)

In [20]:
sma_bClf, sma_bPred, sma_bResult = develop_clf(all_train, all_test, small_features, clf_name='adv_bin', label='Nature', clf_type=base_clf, verbose=1)
sma_bErr = int(len(all_test) * (1-sma_bResult.acc))
if (sma_bResult.acc == 0):
    sma_bErr = int(len(all_test) * (1-sma_bResult.acc_multi))
else:
    sma_bErr = int(len(all_test) * (1-sma_bResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_bErr, len(all_test), sma_bResult.rec, sma_bResult.fpr))
pd.crosstab(all_test['Nature'], sma_bPred, rownames=['True'], colnames=['Pred'])

Training and testing adv_bin......done! Training time: 0.006977s	Inference time: 0.041860s
Total Misclassifications: 33682 out of 186349 (Recall: 0.993698	FPR: 0.332824)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,66422,33135
1,547,86245


#### Adversarial Attack against Binary Classifier

In [21]:
# Note that the adversarial attacks only affect a subset of the initial set of malicious samples
# Hence, we compute the classification performance also on this subset for a far comparison



adv_bPred_base = sma_bClf.predict(mal_base[small_features])
adv_bPred_adv = sma_bClf.predict(mal_adv[small_features])
adv_bin_base_rec =  recall_score(mal_base['Nature'], adv_bPred_base, pos_label=1)
adv_bin_adv_rec = recall_score(mal_adv['Nature'], adv_bPred_adv, pos_label=1)

print("Adversarial Recall (baseline): {:.3f}".format(adv_bin_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_bin_adv_rec))

Adversarial Recall (baseline): 0.994
Adversarial Recall (attack): 1.000


### Multiclass Classifier - cascade (essential feature set)

In [22]:
sma_mcClf, sma_mcPred, sma_mcResult = develop_clf(malicious_train, malicious_test, small_features, clf_name='sma_mc', label='Label_cat', clf_type=base_clf, verbose=1)
sma_mcErr = int(len(malicious_test) * (1-sma_mcResult.acc_multi))

print("Total Misclassifications: {} out of {}".format(sma_mcErr, len(malicious_test)))
pd.crosstab(malicious_test['Label_cat'], sma_mcPred, rownames=['True'], colnames=['Pred'])

Training and testing sma_mc......done! Training time: 0.005980s	Inference time: 0.016951s
Total Misclassifications: 1800 out of 86792


Pred,1,2,3,4,5,6,7,9
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,19002,0,0,0,0,0,22,0
2,0,1513,0,0,0,0,0,0
3,0,1,31614,10,0,0,0,68
4,0,0,0,176,161,0,0,11
5,208,0,0,297,75,0,218,2
6,0,0,0,0,0,794,0,0
7,131,0,0,0,0,0,31699,0
8,0,11,0,0,0,1,16,568
9,0,0,0,0,0,0,75,119


In [23]:
# We select the samples flagged as malicious by the initial classifier.
# Of course, samples flagged as malicious that are NOT actually malicious will always be misclassified

all_test['sma_bPred'] = sma_bPred
sma_mc_test = all_test[(all_test['sma_bPred']==1) & (all_test['Nature']==1)]
if (len(sma_mc_test)==0):
    # in this case, this classifier receives nothing
    print("There is no malicious sample flagged as malicious to analyze!")

sma_mcPred_m = sma_mcClf.predict(sma_mc_test[small_features])
sma_mcResult.acc_multic = accuracy_score(sma_mc_test['Label_cat'], sma_mcPred_m, normalize=True, sample_weight=None)
sma_mcErr_m = int((1-sma_mcResult.acc_multic) * len(sma_mc_test))
print("Total Misclassifications (among the malicious samples): {} out of {}".format(sma_mcErr_m, len(sma_mc_test)))




pd.crosstab(sma_mc_test['Label_cat'], sma_mcPred_m, rownames=['True'], colnames=['Pred'])

Total Misclassifications (among the malicious samples): 1366 out of 86245


Pred,1,2,3,4,5,6,7,9
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,19002,0,0,0,0,0,22,0
2,0,1513,0,0,0,0,0,0
3,0,1,31614,10,0,0,0,68
4,0,0,0,169,158,0,0,11
5,0,0,0,297,75,0,128,2
6,0,0,0,0,0,794,0,0
7,73,0,0,0,0,0,31696,0
8,0,11,0,0,0,1,16,568
9,0,0,0,0,0,0,0,16


In [24]:
## Note: We also accounted for the false positives of the first binary classifier (all of which have been considered as misclassifications)
sma_bin_falsePositives = int(sma_bResult.fpr * len(benign_test))
print("This classifier also analyzed {} benign samples that were incorrectly labelled as 'malicious' by the (small) binary classifier".format(sma_bin_falsePositives))

print("Hence, this (small) classifier was tested on {} samples, of which {} have been misclassified".format(len(sma_mc_test)+sma_bin_falsePositives, sma_bin_falsePositives+sma_mcErr_m))

This classifier also analyzed 33135 benign samples that were incorrectly labelled as 'malicious' by the (small) binary classifier
Hence, this (small) classifier was tested on 119380 samples, of which 34501 have been misclassified


### Multiclass Classifier - stand-alone (essential feature set)

In [25]:
sma_mClf, sma_mPred, sma_mResult = develop_clf(all_train, all_test, small_features, clf_name='sma_m', label='Label_cat', clf_type=base_clf, verbose=1)
sma_mErr = int(len(all_test) * (1-sma_mResult.acc_multi))

print("Total Misclassifications: {} out of {}".format(sma_mErr, len(all_test)))
sma_mResult.ctab

Training and testing sma_m......done! Training time: 0.005989s	Inference time: 0.044887s
Total Misclassifications: 29965 out of 186349


Pred,0,1,2,3,4,5,6,7,8,9
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,72237,44,814,4071,2532,3254,4307,4275,15,8008
1,0,19002,0,0,0,0,0,22,0,0
2,0,0,1513,0,0,0,0,0,0,0
3,12,0,327,30940,3,0,0,0,0,411
4,151,0,9,18,160,1,0,0,0,9
5,127,336,0,0,296,2,0,1,0,38
6,0,0,0,0,0,1,793,0,0,0
7,21,36,0,18,0,0,19,31736,0,0
8,3,0,2,0,0,14,0,15,0,562
9,194,0,0,0,0,0,0,0,0,0


#### Multiclass Classifier: Binary

In [26]:
########## MULTI-CLASS CLASSIFIER - BINARY ##########
sma_mPred_bin = np.copy(sma_mPred)
sma_mPred_bin[sma_mPred_bin > 0] = 1
sma_mResult.bin_results(all_test['Nature'], sma_mPred_bin)
sma_mErr_bin = int(len(all_test) * (1-sma_mResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_mErr_bin, len(all_test), sma_mResult.rec, sma_mResult.fpr))
sma_mResult.ctab_bin


Total Misclassifications: 27828 out of 186349 (Recall: 0.994147	FPR: 0.274416)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,72237,27320
1,508,86284


#### Adversarial Attack against the Multiclass Classifier (binary)

In [27]:
adv_mPred_base = sma_mClf.predict(mal_base[small_features])
adv_mPred_adv = sma_mClf.predict(mal_adv[small_features])

adv_mPred_base_bin = np.copy(adv_mPred_base)
adv_mPred_base_bin[adv_mPred_base_bin > 0] = 1

adv_mPred_adv_bin = np.copy(adv_mPred_adv)
adv_mPred_adv_bin[adv_mPred_adv_bin > 0] = 1


adv_multi_base_rec =  recall_score(mal_base['Nature'], adv_mPred_base_bin)
adv_multi_adv_rec = recall_score(mal_adv['Nature'], adv_mPred_adv_bin)

print("Adversarial Recall (baseline): {:.3f}".format(adv_multi_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_multi_adv_rec))

Adversarial Recall (baseline): 0.994
Adversarial Recall (attack): 0.904


## Ensemble Classifiers (essential feature set)

In [28]:
sma_ensemble_df = pd.DataFrame()
adv_ensemble_df_base = pd.DataFrame()
adv_ensemble_df_adv = pd.DataFrame()

benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]


sma_ens_time = 0
sma_ens_avgFPR = 0
sma_tot_TP = 0
sma_tot_P = 0
sma_ens_infer_time = 0
sma_fakeEns_infer_time = 0

for a in attack_names:
    exec(f"{a}_train = {a}_df[{a}_df['is_train']==True]")
    exec(f"{a}_test = {a}_df[{a}_df['is_test']==True]")

    exec(f"train = pd.concat([benign_train, {a}_train])")
    exec(f"test = pd.concat([benign_test, {a}_test])")

    exec(f"sma_{a}Clf, sma_{a}Pred, sma_{a}Result = develop_clf(train, test, small_features, clf_name='sma_{a}', clf_type=base_clf, verbose=1)")
    exec(f"sma_fakeEns_infer_time += sma_{a}Result.infer_time")
    
    exec(f"sma_{a}_allPred, sma_{a}_allResults, sma_{a}Result.infer_time = evaluate_clf(sma_{a}Clf, all_test, small_features, clf_name='sma_{a}', time=sma_{a}Result.time, verbose=1)")
    exec(f"sma_ensemble_df['{a}'] = sma_{a}_allPred")

    exec(f"adv_{a}Pred_base = sma_{a}Clf.predict(mal_base[small_features])")
    exec(f"adv_{a}Pred_adv = sma_{a}Clf.predict(mal_adv[small_features])")
    exec(f"adv_ensemble_df_base['{a}'] = adv_{a}Pred_base")
    exec(f"adv_ensemble_df_adv['{a}'] = adv_{a}Pred_adv")



    exec(f"sma_tot_TP += (sma_{a}Result.rec * len({a}_test))")
    exec(f"sma_tot_P += len({a}_test)")


    exec(f"sma_ens_avgFPR += sma_{a}Result.fpr")
    exec(f"sma_ens_time += sma_{a}Result.time")
    exec(f"sma_ens_infer_time +=sma_{a}Result.infer_time")



sma_ens_avgFPR = sma_ens_avgFPR / len(attack_names)
sma_ens_avgREC = sma_tot_TP/sma_tot_P


print("Total training time: {:5f}s\t AvgFPR: {:5f}\t AvgTPR: {:5f}\tTotal inference time: {:5f}s (fake: {:5f}s)".
      format(sma_ens_time, sma_ens_avgFPR, sma_ens_avgREC, sma_ens_infer_time, sma_fakeEns_infer_time))

Training and testing sma_ddos......done! Training time: 0.004899s	Inference time: 0.025922s
Testing sma_ddos...
...done! 	Inference time: 0.033895s
Training and testing sma_geye......done! Training time: 0.003986s	Inference time: 0.020939s
Testing sma_geye...
...done! 	Inference time: 0.032804s
Training and testing sma_hulk......done! Training time: 0.003982s	Inference time: 0.026902s
Testing sma_hulk...
...done! 	Inference time: 0.032968s
Training and testing sma_http......done! Training time: 0.003978s	Inference time: 0.019943s
Testing sma_http...
...done! 	Inference time: 0.033958s
Training and testing sma_loris......done! Training time: 0.003978s	Inference time: 0.017950s
Testing sma_loris...
...done! 	Inference time: 0.032884s
Training and testing sma_ftp......done! Training time: 0.003986s	Inference time: 0.041939s
Testing sma_ftp...
...done! 	Inference time: 0.031901s
Training and testing sma_pscan......done! Training time: 0.003990s	Inference time: 0.025906s
Testing sma_pscan..

### Computing real Ensemble

In [29]:
sma_ensemble_df["sum"] = sma_ensemble_df.sum(axis=1)
sma_ensemble_df["LOR"] = (sma_ensemble_df["sum"]>0)
temp = all_test['Nature'] 
sma_ensemble_df['True'] = ((temp.reset_index(drop=True)) > 0)

adv_ensemble_df_base["sum"] = adv_ensemble_df_base.sum(axis=1)
adv_ensemble_df_base["LOR"] = (adv_ensemble_df_base["sum"]>0)
temp = mal_base['Nature']
adv_ensemble_df_base['True'] = ((temp.reset_index(drop=True)) > 0)


adv_ensemble_df_adv["sum"] = adv_ensemble_df_adv.sum(axis=1)
adv_ensemble_df_adv["LOR"] = (adv_ensemble_df_adv["sum"]>0)
temp = mal_adv['Nature']
adv_ensemble_df_adv['True'] = ((temp.reset_index(drop=True)) > 0)

### Ensemble: Logical OR (essential feature set)

In [30]:
sma_enslorResult = Result(sma_ensemble_df['True'], sma_ensemble_df['LOR'], sma_ens_time, sma_ens_infer_time)
sma_enslorErr= int(len(all_test) * (1-sma_enslorResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_enslorErr, len(all_test), sma_enslorResult.rec, sma_enslorResult.fpr))
sma_enslorResult.ctab_bin # you can also try with sma_enslorResult.ctab

Total Misclassifications: 32788 out of 186349 (Recall: 0.997166	FPR: 0.326868)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,67015,32542
True,246,86546


#### Logical OR: Adversarial Attack

In [31]:
adv_enslor_base_rec = recall_score(mal_base['Nature'], adv_ensemble_df_base["LOR"])
adv_enslor_adv_rec = recall_score(mal_adv['Nature'], adv_ensemble_df_adv["LOR"])

print("Adversarial Recall (baseline): {:.3f}".format(adv_enslor_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_enslor_adv_rec))

Adversarial Recall (baseline): 0.997
Adversarial Recall (attack): 0.625


### Ensemble: Majority Voting (essential feature set)

In [32]:
sma_ensemble_df["MAJV"] = (sma_ensemble_df["sum"]>=min_agree)
sma_ensvotResult = Result(sma_ensemble_df['True'], sma_ensemble_df['MAJV'], sma_ens_time, sma_ens_infer_time)
sma_ensvotErr = int(len(all_test) * (1-sma_ensvotResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_ensvotErr, len(all_test), sma_ensvotResult.rec, sma_ensvotResult.fpr))
sma_ensvotResult.ctab_bin # you can also try with sma_ensvotResult.ctab

Total Misclassifications: 68525 out of 186349 (Recall: 0.238686	FPR: 0.024599)


Pred,False,True
True,Unnamed: 1_level_1,Unnamed: 2_level_1
False,97108,2449
True,66076,20716


#### Majority Voting: Adversarial Attack

In [33]:
adv_ensemble_df_base["MAJV"] = (adv_ensemble_df_base["sum"]>=min_agree)
adv_ensemble_df_adv["MAJV"] = (adv_ensemble_df_adv["sum"]>=min_agree)

adv_ensvot_base_rec = recall_score(mal_base['Nature'], adv_ensemble_df_base["MAJV"])
adv_ensvot_adv_rec = recall_score(mal_adv['Nature'], adv_ensemble_df_adv["MAJV"])

print("Adversarial Recall (baseline): {:.3f}".format(adv_ensvot_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_ensvot_adv_rec))

Adversarial Recall (baseline): 0.239
Adversarial Recall (attack): 0.000


### Ensemble: Stacked Classifier (essential feature set)

In [34]:
clf_list = []
for a in attack_names:
    exec(f"clf_list.append(sma_{a}Clf)")

sma_sClf = StackingClassifier(classifiers=clf_list, meta_classifier=meta, fit_base_estimators=False, use_probas = False)
s_timeStart = time.time()
sma_sClf.fit(all_train[small_features], all_train['Nature'])
sma_s_time = time.time() - s_timeStart + sma_ens_time
s_timeStart = time.time()
sma_sPred = sma_sClf.predict(all_test[small_features])
sma_s_infer_time = time.time()-s_timeStart
sma_sResult = Result(all_test['Nature'], sma_sPred, sma_s_time, sma_s_infer_time)
if sma_sResult.acc < sma_sResult.acc_multi:
    sma_sResult.acc = sma_sResult.acc_multi
sma_sErr = int(len(all_test) * (1-sma_sResult.acc))

print("Total Misclassifications: {} out of {} (Recall: {:5f}\tFPR: {:5f})".format(sma_sErr, len(all_test), sma_sResult.rec, sma_sResult.fpr))

sma_sResult.ctab_bin # you can also try with sma_sResult.ctab

Total Misclassifications: 32464 out of 186349 (Recall: 0.997166	FPR: 0.323624)


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,67338,32219
1,246,86546


#### Stacked Classifier: Adversarial Attack

In [35]:
adv_sPred_base = sma_sClf.predict(mal_base[small_features])
adv_sPred_adv = sma_sClf.predict(mal_adv[small_features])

adv_ensstk_base_rec =  recall_score(mal_base['Nature'], adv_sPred_base)
adv_ensstk_adv_rec = recall_score(mal_adv['Nature'], adv_sPred_adv)

print("Adversarial Recall (baseline): {:.3f}".format(adv_ensstk_base_rec))
print("Adversarial Recall (attack): {:.3f}".format(adv_ensstk_adv_rec))

Adversarial Recall (baseline): 0.997
Adversarial Recall (attack): 0.596


# DONE!

In [36]:
# You can now inspect the results by referring to the "Result" variables. 

In [37]:
## BASELINE RESULTS (on Complete Feature Set)
print(
    "TPR",
    bResult.rec,
    mResult.rec,
    ens_avgREC,
    enslorResult.rec,
    ensvotResult.rec,
    sResult.rec,
    "\nFPR",
    bResult.fpr,
    mResult.fpr,
    ens_avgFPR,
    enslorResult.fpr,
    ensvotResult.fpr,
    sResult.fpr,
    "\nTraining Time",
    bResult.time,
    mResult.time,
    ens_time,
    enslorResult.time,
    ensvotResult.time,
    sResult.time,
    "\nInference Time",
    bResult.infer_time,
    mResult.infer_time,
    fakeEns_infer_time,
    enslorResult.infer_time,
    ensvotResult.infer_time,
    sResult.infer_time, 
    "\nAccuracy",
    mResult.acc_multi,   # This is the accuracy on the multiclassification
    mResult.acc,         # This is the accuracy on the binary classification
    mcResult.acc_multic, # This is the accuracy on the multiclassification AFTER the output of the binary classifier (it does not account for benign samples, which are false positives)
    mcResult.acc_multi   # This is the accuracy on the multiclassification on the whole test portion of the malicious dataset
)

TPR 0.9999654346022675 0.9939164899990782 0.9867038436722279 0.997626509355701 0.5972555074200387 0.9952069315144253 
FPR 0.29633275410066595 0.1609731108812037 0.05773576945870207 0.27443575037415757 0.015378125094167139 0.18512008196309648 
Training Time 0.014950037002563477 0.017932415008544922 0.04498577117919922 0.04498577117919922 0.04498577117919922 0.06286764144897461 
Inference Time 0.09070611000061035 0.0946507453918457 0.5190906524658203 0.7697935104370117 0.7697935104370117 0.5110363960266113 
Accuracy 0.9065516852786974 0.9111666818711128 0.9782806576870341 0.9782814084247397


In [38]:
## BASELINE RESULTS (on Essential Feature Set)
print(
    "TPR",
    sma_bResult.rec,
    sma_mResult.rec,
    sma_ens_avgREC,
    sma_enslorResult.rec,
    sma_ensvotResult.rec,
    sma_sResult.rec,
    "\nFPR",
    sma_bResult.fpr,
    sma_mResult.fpr,
    sma_ens_avgFPR,
    sma_enslorResult.fpr,
    sma_ensvotResult.fpr,
    sma_sResult.fpr,
    "\nTraining Time",
    sma_bResult.time,
    sma_mResult.time,
    sma_ens_time,
    sma_enslorResult.time,
    sma_ensvotResult.time,
    sma_sResult.time,
    "\nInference Time",
    sma_bResult.infer_time,
    sma_mResult.infer_time,
    sma_fakeEns_infer_time,
    sma_enslorResult.infer_time,
    sma_ensvotResult.infer_time,
    sma_sResult.infer_time, 
    "\nAccuracy",
    sma_mResult.acc_multi,   # This is the accuracy on the multiclassification
    sma_mResult.acc,         # This is the accuracy on the binary classification
    sma_mcResult.acc_multic, # This is the accuracy on the multiclassification AFTER the output of the binary classifier (it does not account for benign samples, which are false positives)
    sma_mcResult.acc_multi   # This is the accuracy on the multiclassification on the whole test portion of the malicious dataset
)

TPR 0.993697575813439 0.9941469259839617 0.9895382062862936 0.9971656373859342 0.2386855931422251 0.9971656373859342 
FPR 0.33282441214580594 0.274415661379913 0.08122761611717687 0.32686802535231074 0.024598973452394146 0.3236236527818235 
Training Time 0.006976604461669922 0.0059888362884521484 0.0387723445892334 0.0387723445892334 0.0387723445892334 0.05571413040161133 
Inference Time 0.041860103607177734 0.04488706588745117 0.21473908424377441 0.29760026931762695 0.29760026931762695 0.20990777015686035 
Accuracy 0.8391942001298639 0.8506672963096126 0.9841614006609078 0.979260761360494


In [39]:
## Open World: One attack against all (averaged results)

print('''Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      BD: TPR={:5f}\tFPR={:5f}
      MD (binarized) CLF: TPR={:5f}\tFPR={:5f}
      ED-o: TPR={:5f}\tFPR={:5f}
      ED-v: TPR={:5f}\tFPR={:5f}
      ED-s: TPR={:5f}\tFPR={:5f}
      '''.format(oaac_bin_rec, oaac_bin_fpr,
                 oaac_multi_rec, oaac_multi_fpr,
                 oaac_enslor_rec, oaac_enslor_fpr,
                 oaac_ensvot_rec, oaac_ensvot_fpr,
                 oaac_ensstk_rec, oaac_ensstk_fpr))

Open World assessment: performance against one unknown attack (averaged for all attacks in the dataset)
      BD: TPR=0.731778	FPR=0.289414
      MD (binarized) CLF: TPR=0.668178	FPR=0.171463
      ED-o: TPR=0.816140	FPR=0.258750
      ED-v: TPR=0.175645	FPR=0.010300
      ED-s: TPR=0.815693	FPR=0.181915
      


In [40]:
## Adversarial Attacks 

print(
    "BD (before, after):",
    adv_bin_base_rec,
    adv_bin_adv_rec,
    "\nMD  (before, after):",
    adv_multi_base_rec,
    adv_multi_adv_rec,
    "\nED-o  (before, after):",
    adv_enslor_base_rec,
    adv_enslor_adv_rec, 
    "\nED-v  (before, after):",
    adv_ensvot_base_rec,
    adv_ensvot_adv_rec, 
    "\nED-s  (before, after):",
    adv_ensstk_base_rec, 
    adv_ensstk_adv_rec,
)

BD (before, after): 0.9937319245526507 0.9998847780248649 
MD  (before, after): 0.9941812902556776 0.9035476846144097 
ED-o  (before, after): 0.9972001060042172 0.6251368260954729 
ED-v  (before, after): 0.23869384368986854 0.0 
ED-s  (before, after): 0.9972001060042172 0.595801311226077
