In [1]:
#imports
import os
from os.path import join as os_join
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
from utils import make_value2index
from CICIDS2017 import read_data
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn import tree

#from sklearn.linear_model import LogisticRegression
#from sklearn.svm import LinearSVC
from classifiers.linear_classifier import LinearSVM

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
import operator
import argparse
from utils import read_csv
import csv
from sklearn.utils import resample
import pandas as pd
import glob
import time
from multiprocessing import Pool
from classifiers.shallows import LogisticRegression
SEED = 234
fingerprint = None



In [2]:
def plot_confusion_matrix(filename,y_true, y_pred, classes,cm=None,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    np.set_printoptions(precision=2)
    if not title:
        if normalize:
            title = 'Normalized0.5486923076923077 confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    if cm is None:
        cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    fig, ax = plt.subplots(figsize=(20,16))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=a0.5486923076923077x)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    txt_filename = filename.replace('.jpg','.csv')
    f = open(txt_filename,'w')
    f.write('{}'.format(''))
    for cname in classes:
        f.write(',{}'.format(cname))
    f.write('\n')
    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else '.0f'
    thresh = (cm.max()+cm.min()) / 2.
    for i in range(cm.shape[0]):
        f.write('{}'.format(classes[i]))
        for j in range(cm.shape[1]):
            f.write(',{}'.format(cm[i,j]))
            ax.text(j, i, format(cm[i, j], fmt), horizontalalignment='center',
                    verticalalignment='center',
                    color="white" if cm[i, j] > thresh else "black")
        f.write('\n')
    f.close()

    fig.tight_layout()
    plt.savefig(os_join(fingerprint,filename),dpi=200)
    plt.close(fig)
    return ax


In [1]:
def get_labels(Y_str):
    labels = sorted(np.unique(Y_str))
    labels_d = make_value2index(labels)
    return labels,labels_d


def correct_data(df, K): # It will prepare the data into classifer usable format
    # please look at the code

    # we need to obtain only the estimation features
    df_label = df.loc[:,'Label']
    column_names = ["Flowid","Source IP","Source Port","Destination IP","Timestamp","Label"]
    df_data = df.drop(columns=column_names) # now we have 52 features 
    
    return df_data.values,df_label.values


def encode_label(Y_str,labels_d):
    Y = [labels_d[y_str] for y_str  in Y_str]
    Y = np.array(Y)
    return np.array(Y)


In [4]:

def print_evaluation(cm,label_names,filename):
    eps = 0.000000000000005
    with open(filename,'w') as f:
        w = csv.writer(f)
        w.writerow(['Label','Pr','Rc','F1-score'])
        for i,label in enumerate(label_names):
            tp = cm[i,i]
            fp = np.sum(cm[:,i]) - cm[i,i]
            fn = np.sum(cm[i,:]) - cm[i,i]
            pr = tp/(fp+tp+eps)
            rc = tp/(fn+tp+eps)
            f1 = 2*pr*rc/(pr+rc+eps)
            w.writerow(["{:30}".format(label),'{:8.3f}'.format(pr),'{:8.3f}'.format(rc),'{:8.3f}'.format(f1)])

def read_data(dataroot,file_ending='*_TrafficForML_CICFlowMeter.csv'):
    filenames = [i for i in glob.glob(os_join(dataroot,file_ending))]
    data = pd.concat([pd.read_csv(f) for f in filenames],sort=False)
    return data


In [2]:
def normilize_data(data):
        eps = 1e-15
        if len(data[data>=0])<=0: # make sure we actually have at least one none -1 feature
            return

        mask = data==-1
        data[mask]=0
        mean_i = np.mean(data,axis=0)
        min_i = np.min(data,axis=0) #  to leave -1 features as is and exclude in normilizing
        max_i = np.max(data,axis=0)

        r = max_i-min_i+eps
        data = (data-mean_i)/r  # zero centered norm [-0.5,0.5]
        #deal with edge case -1
        data[mask] = 0
        return data


def balance_data(X,y):
    unique,counts = np.unique(y,return_counts=True)
    mean_samples_per_class = int(round(np.mean(counts)))
    
    new_X = np.empty((0,X.shape[1]))
    new_y = np.empty((0),dtype=int)
    for i,c in enumerate(unique):
        temp_x = X[y==c]
        indices = np.random.choice(temp_x.shape[0],mean_samples_per_class)
        new_X = np.concatenate((new_X,temp_x[indices]),axis=0)
        temp_y = np.ones(mean_samples_per_class,dtype=int)*c
        new_y = np.concatenate((new_y,temp_y),axis=0)
    return (new_X,new_y)




In [6]:
dataroot = '/home/juma/data/net_intrusion/CIC-IDS-2018/CSVs/sf_sr_100_l'
classifier_name = 'svm'
file_ending = '*Meter.csv'


data = read_data(dataroot,file_ending)
X,Y = correct_data(data,K)

labels,labels_d = get_labels(Y)
print('data loaded with ',labels,labels_d)

X = normilize_data(X)
Y = encode_label(Y,labels_d)
        
unique,counts = np.unique(Y,return_counts=True)
num_class = len(np.unique(Y))
confusion_matrix_sum = np.zeros((num_class, num_class),dtype=int)
inputs = []

indices = np.random.choice(X.shape[0],X.shape[0]//10)
np.random.shuffle(indices)
print(indices.shape)
X = X[indices]
Y = Y[indices]
dev_len = Y.shape[0]//10
print('dev_len = ',dev_len)
X_val = X[:dev_len,:]
y_val = Y[:dev_len]
X_train = X[dev_len:,:]
y_train = Y[dev_len:]
X_train,y_train = balance_data(X_train,y_train)
X_val,y_val = balance_data(X_val,y_val)
print("X_train.shape is ",X_train.shape)
print("y_train.shape is ",y_train.shape)

data loaded with  ['Benign', 'Brute Force-Web', 'Brute Force-XSS', 'DDoS attacks-LOIC-HTTP', 'DDoS-HOIC', 'DDoS-LOIC-UDP', 'DoS-GoldenEye', 'DoS-Hulk', 'DoS-SlowHTTPTest', 'DoS-Slowloris', 'FTP-BruteForce', 'Infiltration', 'SSH-BruteForce'] {'Benign': 0, 'Brute Force-Web': 1, 'Brute Force-XSS': 2, 'DDoS attacks-LOIC-HTTP': 3, 'DDoS-HOIC': 4, 'DDoS-LOIC-UDP': 5, 'DoS-GoldenEye': 6, 'DoS-Hulk': 7, 'DoS-SlowHTTPTest': 8, 'DoS-Slowloris': 9, 'FTP-BruteForce': 10, 'Infiltration': 11, 'SSH-BruteForce': 12}
(25864,)
dev_len =  2586
X_train.shape is  (23283, 52)
y_train.shape is  (23283,)


In [None]:

learning_rates = [5e-3, 5e-2,1e-1,5e-1,5e-0]
regularization_strengths = [0,2.5e-5,2.5e-4,2.5e-3]

results = {}
best_val = -1   # The highest validation accuracy that we have seen so far.
best_svm = None # The LinearSVM object that achieved the highest validation rate.
best_loss_hist = None

num_epoch=20
for lr in learning_rates:
    for reg in regularization_strengths:
        svm = LinearSVM()
        loss_hist = svm.train(X_train,y_train,learning_rate=lr,reg=reg,num_iters=X_train.shape[0]*num_epoch)
        train_acc = np.mean(svm.predict(X_train)==y_train)
        val_acc = np.mean(svm.predict(X_val)==y_val)
        results[(lr,reg)]=(train_acc,val_acc)
        if best_val<val_acc:
            best_svm = svm
            best_val = val_acc
            best_loss_hist = loss_hist
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    
# Print out results.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print('lr %e reg %e train accuracy: %f val accuracy: %f' % (
                lr, reg, train_accuracy, val_accuracy))
    
print('best validation accuracy achieved during cross-validation: %f' % best_val)

In [None]:
# Visualize the cross-validation results
import math
x_scatter = [math.log10(x[0]) for x in results]
y_scatter = [math.log10(x[1]) for x in results]

# plot training accuracy
marker_size = 100
colors = [results[x][0] for x in results]
plt.subplot(2, 1, 1)
plt.scatter(x_scatter, y_scatter, marker_size, c=colors, cmap=plt.cm.coolwarm)
plt.colorbar()
plt.xlabel('log learning rate')
plt.ylabel('log regularization strength')
plt.title('CIFAR-10 training accuracy')

# plot validation accuracy
colors = [results[x][1] for x in results] # default size of markers is 20
plt.subplot(2, 1, 2)
plt.scatter(x_scatter, y_scatter, marker_size, c=colors, cmap=plt.cm.coolwarm)
plt.colorbar()
plt.xlabel('log learning rate')
plt.ylabel('log regularization strength')
plt.title('CIFAR-10 validation accuracy')
plt.show()

In [None]:
# A useful debugging strategy is to plot the loss as a function of
# iteration number:
plt.plot(best_loss_hist)
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.show()

In [9]:
for fold_index, (train_index,test_index) in enumerate(skf.split(X,Y)):
            X_train = X[train_index]
            y_train = Y[train_index]
            unique, counts = np.unique(y_train,return_counts=True)
            print("Imbalanced class distribution")
            print(np.asarray((unique, counts)).T)

            X_train,y_train = balance_data(X_train,y_train)
            unique, counts = np.unique(y_train,return_counts=True)
            print("Balanced class distribution")
            print(np.asarray((unique, counts)).T)


            X_test = X[test_index]
            y_test = Y[test_index]
            unique, counts = np.unique(y_test,return_counts=True)
            print("Balanced class distribution for Test case")
            print(np.asarray((unique, counts)).T)

            svm = LinearSVM()
            loss_hist = svm.train(X_train, y_train, learning_rate=1e-3, reg=2.5e0,
                                          num_iters=1500, verbose=True)
            pred = svm.predict(X_test)

            plot_confusion_matrix(os_join(fingerprint,'cm_norm_fold_{}.jpg'.format(fold_index)), y_test, pred, classes=labels,normalize=True, title='Confusion matrix, with normalization')
            cm_i = confusion_matrix(y_test,pred)
            print(cm_i.astype(np.int))
            confusion_matrix_sum+=cm_i
            break


NameError: name 'skf' is not defined