In [14]:
import os, platform, pprint, sys
import fastai
import keras
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import sklearn

# from fastai.tabular.data import TabularDataLoaders
# from fastai.tabular.all import FillMissing, Categorify, Normalize, tabular_learner, accuracy, ClassificationInterpretation, ShowGraphCallback

from itertools import cycle

from keras.layers import Dense
from keras.metrics import CategoricalAccuracy, Recall, Precision, AUC
from keras.models import Sequential
from keras.utils import to_categorical, normalize

from math import sqrt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier


seed: int = 14


# set up pretty printer for easier data evaluation
pretty = pprint.PrettyPrinter(indent=4, width=30).pprint


# declare file paths for the data we will be working on
file_path_1: str = '../data/prepared/baseline/Benign_vs_DDoS.csv'
file_path_2: str = '../data/prepared/timebased/Benign_vs_DDoS.csv'
modelPath  : str = './models'


# print library and python versions for reproducibility
print(
    f'''
    python:\t{platform.python_version()}

    \tfastai:\t\t{fastai.__version__}
    \tkeras:\t\t{keras.__version__}
    \tmatplotlib:\t{mpl.__version__}
    \tnumpy:\t\t{np.__version__}
    \tpandas:\t\t{pd.__version__}
    \tseaborn:\t{sn.__version__}
    \tsklearn:\t{sklearn.__version__}
    '''
)


    python:	3.7.10

    	fastai:		2.4.1
    	keras:		2.3.1
    	matplotlib:	3.3.4
    	numpy:		1.20.3
    	pandas:		1.2.5
    	seaborn:	0.11.1
    	sklearn:	0.24.2
    


In [15]:
def load_data(filePath: str) -> pd.DataFrame:
    '''
        Loads the Dataset from the given filepath and caches it for quick access in the future
        Function will only work when filepath is a .csv file
    '''

    # slice off the ./CSV/ from the filePath
    if filePath[0] == '.' and filePath[1] == '.':
        filePathClean: str = filePath[17::]
        pickleDump: str = f'../data/cache/{filePathClean}.pickle'
    else:
        pickleDump: str = f'../data/cache/{filePath}.pickle'
    
    print(f'Loading Dataset: {filePath}')
    print(f'\tTo Dataset Cache: {pickleDump}\n')
    
    # check if data already exists within cache
    if os.path.exists(pickleDump):
        df = pd.read_pickle(pickleDump)
        
    # if not, load data and cache it
    else:
        df = pd.read_csv(filePath, low_memory=True)
        df.to_pickle(pickleDump)

    
    return df

In [16]:
def show_conf_matrix(model=None, X_test=None, y_test=None, classes=[], file=''):
    # Techniques from https://stackoverflow.com/questions/29647749/seaborn-showing-scientific-notation-in-heatmap-for-3-digit-numbers
    # and https://stackoverflow.com/questions/35572000/how-can-i-plot-a-confusion-matrix#51163585
    
    predictions = model.predict(X_test)
    matrix = [ [ 0 for j in range(len(predictions[0])) ]  for i in range(len(predictions[0])) ]
    for i in range(len(predictions)):
        pred = predictions[i]
        test = y_test[i]

        guess = np.argmax(pred)
        actual = np.argmax(test)

        matrix[actual][guess] += 1
        
    df_cm = pd.DataFrame(matrix, range(len(matrix)), range(len(matrix)))
    int_cols = df_cm.columns
    df_cm.columns = classes
    df_cm.index = classes

    fig = plt.figure(figsize=(10,7))
    sn.set(font_scale=1.5) # for label size
    ax = sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='g', cmap=sn.color_palette("Blues")) # font size
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.tight_layout()
    
    fig.savefig('conf_matrix_{}.png'.format(file))

    plt.show()
    
def show_roc_curve(model=None, X_test=None, y_test=None, classes=[], file=''):
    y_score = model.predict(X_test)
    
    n_classes = len(classes)
    
    # Produce ROC curve from https://hackernoon.com/simple-guide-on-how-to-generate-roc-plot-for-keras-classifier-2ecc6c73115a
    # Note that I am working through this code and I'm going to clean it up as I learn more about how it works
    import numpy as np
    from numpy import interp
    import matplotlib.pyplot as plt
    from itertools import cycle
    from sklearn.metrics import roc_curve, auc

    # Plot linewidth.
    lw = 2

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves of all the classes
    fig = plt.figure(figsize=(12,12))

    colors = cycle(['red', 'blue', 'orange', 'green', 'violet', 'teal', 'turquoise', 'pink'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of {0} (area = {1:0.2f})'.format(classes[i], roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('True Positive Rate (Sensativity)')
    plt.xlabel('False Positive Rate (1-Specificity)')
    plt.title('Receiver Operating Characteristic of the Classes')
    plt.legend(loc="lower right")
    
    fig.savefig('roc_curve_classes_{}.png'.format(file))
    
    plt.show()
    
     # Plot all ROC curves with micro and macro averages
    fig = plt.figure(figsize=(12,12))
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('True Positive Rate (Sensativity)')
    plt.xlabel('False Positive Rate (1-Specificity)')
    plt.title('Receiver Operating Characteristic of the Micro and Macro Averages')
    plt.legend(loc="lower right")
    
    fig.savefig('roc_curve_micromacro_{}.png'.format(file))
    
    plt.show()

In [17]:
def get_std(x=[], xbar=0):
    o2=0
    for xi in x:
        o2 += (xi - xbar)**2
    o2 /= len(x)-1
    return sqrt(o2)

In [18]:
baseline_df : pd.DataFrame = load_data(file_path_1)
timebased_df: pd.DataFrame = load_data(file_path_2)

Loading Dataset: ../data/prepared/baseline/Benign_vs_DDoS.csv
	To Dataset Cache: ../data/cache/baseline/Benign_vs_DDoS.csv.pickle

Loading Dataset: ../data/prepared/timebased/Benign_vs_DDoS.csv
	To Dataset Cache: ../data/cache/timebased/Benign_vs_DDoS.csv.pickle



In [19]:
dep_var = 'Label'

ind_vars_baseline = (baseline_df.columns.difference([dep_var])).tolist()
ind_vars_timebased = (timebased_df.columns.difference([dep_var])).tolist()

baseline_Xy = (baseline_df[ind_vars_baseline], baseline_df[dep_var])
timebased_Xy = (timebased_df[ind_vars_timebased], timebased_df[dep_var])

In [20]:
names: list = ['Benign', 'DDoS']

In [21]:
X = baseline_Xy[0]
x = baseline_Xy[0]
Y = baseline_Xy[1]

num_classes = Y.nunique()

encoder = LabelEncoder()
y = encoder.fit_transform(Y)

In [22]:
# Lists for accuracies collected from models
list_rf = []
list_dt = []
list_knn = []
list_dnn = []

std_rf = []
std_dt = [] 
std_knn = []
std_dnn = []


# Mean accuracies for each model
mean_rf = 0
mean_dt = 0
mean_knn = 0
mean_dnn = 0

# Keep to calculate std
results_rf = []
results_dt = []
results_knn = []  
results_dnn = []

# 10-fold Stratified Cross-Validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
for train_idxs, test_idxs in skf.split(X, y):
    # Define the training and testing sets
    X_train, X_test = X.iloc[train_idxs], X.iloc[test_idxs]
    y_train, y_test = y[train_idxs], y[test_idxs]
    
    # Create a different version of the y_train and y_test for the Deep Neural Network
    # y_train_dnn = to_categorical(y_train, num_classes=num_classes)
    # y_test_dnn = to_categorical(y_test, num_classes=num_classes)
    
    # Initialize the sklearn models
    rf = RandomForestClassifier(random_state=seed)
    dt = DecisionTreeClassifier(random_state=seed)
    knn = KNeighborsClassifier()
    
    # # Deep Neural Network
    # dnn = Sequential([
    #     Dense(256, input_shape=(69,)),
    #     Dense(128, activation='relu'),
    #     Dense(64, activation='relu'),
    #     Dense(32, activation='relu'),
    #     Dense(2, activation='softmax')
    # ])
    # dnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    
    # Train the models
    rf.fit(X_train, y_train)
    dt.fit(X_train, y_train)
    knn.fit(X_train, y_train)
    # dnn.fit(x=X_train, y=y_train_dnn, batch_size=25, epochs=100, verbose=0, validation_data=(X_test, y_test_dnn))
    
    # Evaluate the models
    results_rf.append(rf.score(X_test, y_test))
    results_dt.append(dt.score(X_test, y_test))
    results_knn.append(knn.score(X_test, y_test))  
    # results_dnn.append( (dnn.evaluate(X_test, y_test_dnn, verbose=0) )[1] )
    
    # print('Random Forest')
    # show_roc_curve(model=rf, X_test=X_test, y_test=y_test, classes=names)
    # print('Decision Tree')
    # show_roc_curve(model=dt, X_test=X_test, y_test=y_test, classes=names)
    # print('k-Nearest Neighbor')
    # show_roc_curve(model=knn, X_test=X_test, y_test=y_test, classes=names)
    # # print('Deep Learning')
    # show_roc_curve(model=dnn, X_test=X_test, y_test=y_test_dnn, classes=names)

    print('Random Forest')
    show_conf_matrix(model=rf, X_test=X_test, y_test=y_test, classes=names)
    print('Decision Tree')
    show_conf_matrix(model=dt, X_test=X_test, y_test=y_test, classes=names)
    print('k-Nearest Neighbor')
    show_conf_matrix(model=knn, X_test=X_test, y_test=y_test, classes=names)
    # print('Deep Learning')
    # show_conf_matrix(model=dnn, X_test=X_test, y_test=y_test_dnn, classes=names)        
    
    #print('Results from DNN: {}'.format(results_dnn))
    
    # Add the results to the running mean
    mean_rf += results_rf[-1] / (n_splits * 1.0)
    mean_dt += results_dt[-1] / (n_splits * 1.0)
    mean_knn += results_knn[-1] / (n_splits * 1.0)
    # mean_dnn += results_dnn[-1] / (n_splits * 1.0)
    
# Push the mean results from all of the splits to the lists
list_rf.append(mean_rf)
list_dt.append(mean_dt)
list_knn.append(mean_knn)
# list_dnn.append(mean_dnn)

std_rf.append(get_std(results_rf, mean_rf))
std_dt.append(get_std(results_dt, mean_dt))
std_knn.append(get_std(results_knn, mean_knn))
# std_dnn.append(get_std(results_dnn, mean_dnn))

print('done')

print('All trainings complete!')

Random Forest


TypeError: object of type 'numpy.int32' has no len()