In [14]:
import glob
import os
from os.path import join
from numpy import genfromtxt
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def get_immediate_subdirs(a_dir, only=''):
    """List the direct sub-directories of `a_dir` whose names end with '_l'.

    Parameters
    ----------
    a_dir : str
        Directory whose immediate children are inspected.
    only : str, optional
        When non-empty, keep only the sub-directories whose name contains
        `only` or contains 'whole'. The empty string (default) disables
        this extra filter.

    Returns
    -------
    list of str
        `a_dir` joined with each matching child name, in `os.listdir` order.
    """
    selected = []
    for entry in os.listdir(a_dir):
        candidate = os.path.join(a_dir, entry)
        if not (os.path.isdir(candidate) and entry.endswith('_l')):
            continue
        # With no `only` filter every '_l' directory qualifies; otherwise the
        # name must mention `only` or be a 'whole' directory.
        if only == '' or only in entry or 'whole' in entry:
            selected.append(candidate)
    return selected

In [3]:
def ensure_dir(dir_path):
    """Make sure the directory `dir_path` exists, creating parents as needed.

    Uses `exist_ok=True` instead of a separate existence check, so there is
    no window between "check" and "create" in which another process can
    create the directory and make `makedirs` fail.

    Raises
    ------
    FileExistsError
        If `dir_path` already exists but is not a directory (previously this
        case was silently accepted and only failed later, when a file was
        opened underneath it).
    """
    os.makedirs(dir_path, exist_ok=True)


In [4]:
def get_pkt_count(dirs,ending):
    """Sum, per directory, the integer found on the first line of each matching file.

    Parameters
    ----------
    dirs : iterable of str
        Directories to scan.
    ending : str
        Glob pattern, relative to each directory, selecting the count files
        (e.g. '*.spc').

    Returns
    -------
    list of int
        One total per entry of `dirs`, in the same order. A directory with
        no matching file contributes 0.

    Raises
    ------
    ValueError
        If the first line of a matching file is not an integer.
    """
    counts = []
    for d in dirs:
        num_sampled_pkts = 0
        for path in glob.glob(join(d, ending)):
            # Context manager so the handle is closed right away instead of
            # being left to the garbage collector.
            with open(path) as count_file:
                num_sampled_pkts += int(count_file.readline())
        counts.append(num_sampled_pkts)
    return counts

In [5]:
def get_cms(dirs,local_path):
    """Read the CSV at `local_path` inside every directory of `dirs`.

    Returns a list of pandas DataFrames, one per directory, in the order of
    `dirs`.
    """
    return [pd.read_csv(join(directory, local_path)) for directory in dirs]

In [6]:
def get_count_dir_names(dirs,ending,local_path):
    """Load per-directory packet counts and CSVs, ordered by ascending packet count.

    Parameters
    ----------
    dirs : list of str
        Directories to process.
    ending : str
        Glob pattern forwarded to `get_pkt_count`.
    local_path : str
        Relative CSV path forwarded to `get_cms`.

    Returns
    -------
    tuple
        `(sorted_cms, pkt_counts, dir_names)`: the list of DataFrames, the
        numpy array of packet counts and the numpy array of directory names
        (basename with its last two characters removed), all reordered by
        `np.argsort` of the packet counts.
    """
    counts = np.array(get_pkt_count(dirs, ending))
    matrices = get_cms(dirs, local_path)
    # Drop the two trailing characters of each basename.
    # presumably the '_l' suffix that get_immediate_subdirs filters on — verify against caller
    names = np.array([os.path.basename(path)[:-2] for path in dirs])

    order = np.argsort(counts)
    return [matrices[position] for position in order], counts[order], names[order]

In [17]:
def extract_sampler_name(d):
    """Map a result directory (or directory name) to a human-readable sampler name.

    The sampler code is the part of the basename before the first
    underscore. Known codes are translated to their display names; any
    other code is returned unchanged.

    A basename without an underscore is used as-is. The previous slicing
    with `str.find` chopped off its last character in that case (because
    `find` returns -1), so 'whole' became 'whol' and was never recognised.

    Parameters
    ----------
    d : str
        Directory path or bare directory name, e.g. 'sk_10' or 'whole'.

    Returns
    -------
    str
        Display name of the sampler, or the raw code if it is unknown.
    """
    display_names = {
        'ffs': 'Fast Filtered',
        'sel': 'Selective Flow',
        'sf': 'Random Packet',
        'sgs': 'Sketch Guided',
        'sk': 'SketchFlow',
        'whole': "Without Sampling",
    }
    # partition() yields the whole name when there is no '_' at all.
    sampler_code = os.path.basename(d).partition('_')[0]
    return display_names.get(sampler_code, sampler_code)

def extract_nd_nfa(cm,attack_name):
    """Read two cells of the `attack_name` column of confusion matrix `cm`.

    `cm` is a DataFrame whose 'Unnamed: 0' column holds the row labels.

    Returns
    -------
    tuple
        `(nd, nfa)` where `nd` is the value at row `attack_name` and `nfa`
        the value at row 'Benign', both taken from column `attack_name`.
        `(-1, -1)` when `cm` has no `attack_name` column.
    """
    if attack_name not in cm:
        return -1,-1

    row_labels = cm['Unnamed: 0']
    nd = cm.loc[row_labels == attack_name, attack_name].values[0]
    nfa = cm.loc[row_labels == 'Benign', attack_name].values[0]
    return nd,nfa

def get_count4label(cm,attack_name):
    """Sum the row of confusion matrix `cm` labelled `attack_name`.

    The row is located through the 'Unnamed: 0' column; its first cell is
    skipped and the remaining cells are summed.

    Returns -1 when `cm` has no column named `attack_name`.
    """
    if attack_name not in cm:
        return -1

    label_mask = cm['Unnamed: 0'] == attack_name
    matching_row = cm.loc[label_mask].values[0]
    # Position 0 is left out of the total.
    # presumably it always holds the 'Unnamed: 0' label — verify column order
    return np.sum(matching_row[1:])

def get_cm4whole(whole_dir='/home/juma/data/net_intrusion/CIC-IDS-2018/whole_l/tree_k_5/',
                 file_name='cm_nonnorm_fold_avg.csv'):
    """Load the confusion matrix CSV of the un-sampled ("whole") data set.

    Parameters
    ----------
    whole_dir : str, optional
        Directory containing the CSV. Defaults to the location that used to
        be hard-coded, so existing zero-argument calls behave as before.
    file_name : str, optional
        Name of the CSV file inside `whole_dir`.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as read by `pd.read_csv`.

    Notes
    -----
    NOTE(review): the default directory is the 'tree_k_5' result folder, so
    zero-argument callers get that matrix regardless of which classifier
    they are processing — confirm this is intended.
    """
    return pd.read_csv(join(whole_dir, file_name))

def round_up(n,decimals=0):
    """Round `n` up (towards +infinity) to `decimals` decimal places.

    Parameters
    ----------
    n : float
        Value to round.
    decimals : int, optional
        Number of decimal places to keep (default 0).

    Returns
    -------
    float
        The smallest multiple of 10**-decimals that is >= `n`.
    """
    multiplier = 10**decimals
    # Scaling can introduce binary floating-point noise, e.g.
    # 1.1*100 == 110.00000000000001, which would make ceil() overshoot to
    # 111 and return 1.11 for a value that is already exactly 1.10.
    # Rounding the scaled value to 9 places strips that noise first.
    scaled = round(n*multiplier, 9)
    return math.ceil(scaled)/multiplier

In [18]:
# When True, detection / false-alarm rates are computed relative to the label
# counts of the un-sampled data; when False, relative to each sampled matrix.
AGAINST_WHOLE = True


def _rates_for_cm(cm, attack_name, whole_flow_count):
    """Return (DR, FAR) in percent, rounded up to 2 decimals, for one confusion matrix."""
    # (detection count, false-alarm count) for the given attack and matrix
    nd, nfa = extract_nd_nfa(cm, attack_name)

    if AGAINST_WHOLE:
        flow_count = whole_flow_count
        benign_flow_count = whole_benign_count
    else:
        flow_count = get_count4label(cm, attack_name)
        benign_flow_count = get_count4label(cm, 'Benign')

    dr = round_up(100*nd/flow_count, 2)
    far = round_up(100*nfa/benign_flow_count, 2)
    return dr, far


def compile_results(attack_name,whole_cm,results_file,open_mode, classifier_name):
    """Write the DR / FAR table of one attack type to `results_file` as CSV.

    The table has two header rows, one 'without sampling' row and one row
    per sampling method (sorted by name). Each sampler row holds DR and FAR
    for the sampling rates 10%, 1% and 0.1%, in that order.

    The sampling rate of a directory is derived from its packet count:
    round(log10(largest packet count / its packet count)) equal to 1, 2, 3
    selects the 10%, 1%, 0.1% columns and 0 selects the un-sampled data.

    Reads the module-level variables `pkt_counts`, `dir_names`, `cms` and
    `whole_benign_count`, which must be set before calling.

    Parameters
    ----------
    attack_name : str
        Label (confusion matrix column) to report on.
    whole_cm : pandas.DataFrame
        Confusion matrix of the un-sampled data; its `attack_name` row sum
        is the denominator of the detection rate when AGAINST_WHOLE is set.
    results_file : str
        Path of the CSV file to write.
    open_mode : str
        Mode passed to `open` ('w' to overwrite, 'a' to append).
    classifier_name : str
        Currently unused; kept for call compatibility.

    Raises
    ------
    ValueError
        From `np.concatenate` if a sampler does not have results for all
        three sampling rates (its row width differs from the header).
    """
    # '\\ ' spells the same runtime text as the former '\ ' without relying
    # on an invalid escape sequence.
    header_rates = np.array(['method \\ SR (%)','10','10','1','1','0.1','0.1']).reshape((1,-1))
    header_metrics = np.array(['','DR','FAR','DR','FAR','DR','FAR']).reshape((1,-1))
    table_rows = [header_rates, header_metrics]

    whole_flow_count = get_count4label(whole_cm, attack_name)

    # log10 of the sampling ratio of every directory, computed once.
    log_srs = [round(math.log10(pkt_counts[-1]/pkt_count)) for pkt_count in pkt_counts]

    # sampler display name -> [dr, far] appended for SR 10%, then 1%, then 0.1%
    prows = {}
    for target_log_sr in (1, 2, 3):
        for i, dir_name in enumerate(dir_names):
            if log_srs[i] != target_log_sr:
                continue
            dr, far = _rates_for_cm(cms[i], attack_name, whole_flow_count)
            prows.setdefault(extract_sampler_name(dir_name), []).extend([dr, far])

    # Row for the un-sampled data: first directory whose ratio rounds to 0.
    for i, log_sr in enumerate(log_srs):
        if log_sr == 0:
            nd, nfa = extract_nd_nfa(cms[i], attack_name)
            dr = round_up(100*nd/whole_flow_count, 2)
            far = round_up(100*nfa/whole_benign_count, 2)
            whole_row = np.array(['without sampling'] + [dr,far,dr,far,dr,far]).reshape((1,-1))
            table_rows.append(whole_row)
            break

    # One row per sampling method. This used to iterate a dict that was never
    # filled, so no sampler row was ever written; iterate the collected rates.
    for sampling_method in sorted(prows):
        # Negative rates come from the -1 "label missing" sentinel.
        cells = [value if value >= 0 else '0' for value in prows[sampling_method]]
        table_rows.append(np.array([sampling_method] + cells).reshape((1,-1)))

    presults = np.concatenate(table_rows, axis=0)
    with open(results_file, open_mode) as f:
        np.savetxt(f, presults, delimiter=',', fmt='%s')
    # Report the path that was actually written.
    print("Done, please check {} for resutls".format(results_file))

In [19]:
import math
import pandas as df
from collections import defaultdict
# Driver: for every classifier, dump the confusion matrix of the un-sampled
# data and then write one DR/FAR table per attack type.
num_samplers = 5
num_sampling_rate = 3

# Alternative data set root: '/home/juma/data/net_intrusion/ISCX-Bot-2014/CSVs'
root = '/home/juma/data/net_intrusion/CIC-IDS-2018/CSVs_mem_SF_r_0.8/SR_1'
ending = '*.spc'
classifier_names = ['tree','forest','softmax','cnn']

dirs = get_immediate_subdirs(root)

for classifier_name in classifier_names:
    local_path = '{}_k_5/cm_nonnorm_fold_avg.csv'.format(classifier_name)
    cms, pkt_counts, dir_names = get_count_dir_names(dirs, ending, local_path)

    # compile_results reads whole_benign_count (the 'Benign' row total of the
    # un-sampled matrix) as the false-alarm-rate denominator.
    whole_cm = get_cm4whole()
    whole_benign_count = get_count4label(whole_cm, 'Benign')

    # <root>/cm_<classifier>_k_5.csv: title lines, column names, matrix body.
    results_file = join(root, 'cm_{}_k_5.csv'.format(classifier_name))
    title_cells = np.array(['','','Confusion Matrix','',''])
    column_row = whole_cm.columns.values.reshape((1,-1))
    with open(results_file, 'w') as f:
        for section in (title_cells, column_row, whole_cm):
            np.savetxt(f, section, delimiter=',', fmt='%s')

    # Every column except the row-label column is a label to report on.
    ds = whole_cm.columns
    attack_types = ds.drop(labels=['Unnamed: 0']).values
    print(attack_types)

    open_mode = 'w'
    for attack_name in attack_types:
        results_root = join(root, 'absolute_recall', attack_name)
        ensure_dir(results_root)
        results_file = join(results_root, classifier_name + '.csv')
        compile_results(attack_name, whole_cm, results_file=results_file,
                        open_mode=open_mode, classifier_name=classifier_name)


['Benign' 'Brute Force-Web' 'Brute Force-XSS' 'DDoS attacks-LOIC-HTTP'
 'DDoS-HOIC' 'DDoS-LOIC-UDP' 'DoS-GoldenEye' 'DoS-Hulk' 'DoS-SlowHTTPTest'
 'DoS-Slowloris' 'FTP-BruteForce' 'Infiltration' 'SQL Injection'
 'SSH-BruteForce']
Done, please check /home/juma/data/net_intrusion/CIC-IDS-2018/CSVs_mem_SF_r_0.8/SR_1/absolute_recall/Benign/tree.csv for resutls
Done, please check /home/juma/data/net_intrusion/CIC-IDS-2018/CSVs_mem_SF_r_0.8/SR_1/absolute_recall/Brute Force-Web/tree.csv for resutls
Done, please check /home/juma/data/net_intrusion/CIC-IDS-2018/CSVs_mem_SF_r_0.8/SR_1/absolute_recall/Brute Force-XSS/tree.csv for resutls
Done, please check /home/juma/data/net_intrusion/CIC-IDS-2018/CSVs_mem_SF_r_0.8/SR_1/absolute_recall/DDoS attacks-LOIC-HTTP/tree.csv for resutls
Done, please check /home/juma/data/net_intrusion/CIC-IDS-2018/CSVs_mem_SF_r_0.8/SR_1/absolute_recall/DDoS-HOIC/tree.csv for resutls
Done, please check /home/juma/data/net_intrusion/CIC-IDS-2018/CSVs_mem_SF_r_0.8/SR_1/abs

## Previous code, kept for legacy purposes

In [11]:
import math
import pandas as df
from collections import defaultdict
# Legacy driver: a single classifier, with every attack table appended to one
# CSV file after the confusion matrix of the un-sampled data.
num_samplers = 5
num_sampling_rate = 3

# Alternative data set root: '/home/juma/data/net_intrusion/ISCX-Bot-2014/CSVs'
root = '/home/juma/data/net_intrusion/CIC-IDS-2018/CSVs'
ending = '*.spc'
# This was assigned to `classifier_names` while the lines below read
# `classifier_name`, so the cell only ran on a value leaked from another cell.
classifier_name = 'adaboost'
local_path = '{}_k_5/cm_nonnorm_fold_avg.csv'.format(classifier_name)
results_file = join(root,'abs_{}_k_5.csv'.format(classifier_name))


dirs = get_immediate_subdirs(root)
cms, pkt_counts,dir_names = get_count_dir_names(dirs,ending,local_path)
ds = cms[0].columns
attack_types = ds.drop(labels=['Unnamed: 0', 'Benign']).values
print('Attack types',attack_types)

# we will use benign_flow_count for NFA
# Raises IndexError when no directory is named 'whole'.
index = np.where(dir_names=='whole')[0][0]
print('index',index)
whole_cm = cms[index]
whole_benign_count = get_count4label(whole_cm,'Benign')
print('benign_flow_count',whole_benign_count)

with open(results_file, 'w') as f:
    np.savetxt(f,np.array(['','','Confusion Matrix','','']),delimiter=',',fmt='%s')
    np.savetxt(f,whole_cm.columns.values.reshape((1,-1)),delimiter=',',fmt='%s')
    np.savetxt(f,whole_cm,delimiter=',',fmt='%s')

for attack_name in attack_types:
    # compile_results requires classifier_name; omitting it raised TypeError.
    compile_results(attack_name,whole_cm,results_file,open_mode='a',classifier_name=classifier_name)


FileNotFoundError: [Errno 2] No such file or directory: '/home/juma/data/net_intrusion/CIC-IDS-2018/CSVs'