In [1]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence warnings
# raised while fitting the MLP below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
import os
import random
import sys

# Must run before `genetic_selection` is imported: it puts the parent
# directory on the module search path (presumably where that package lives).
sys.path.insert(1, '../')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report, make_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.utils.validation import check_consistent_length

from genetic_selection import GeneticSelectionCV

In [3]:
def eval(X_train, y_train, X_test, y_test, random_state=None):
    """Train an MLP on the training split and score it on the test split.

    NOTE: this name shadows the built-in ``eval``. It is kept unchanged
    because other cells call the function by this name.

    Side effect: prints the classification report and the confusion matrix
    of the test-set predictions.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to ``MLPClassifier.fit``.
    X_test, y_test
        Features used for prediction and the labels they are scored against.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``MLPClassifier``. Pass an integer to make the run
        repeatable; the default ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1)``, each multiplied by 100.
        Precision, recall and F1 use ``average="weighted"``.
    """
    model = MLPClassifier(hidden_layer_sizes=(128, 128), activation="relu", solver="adam",
                          learning_rate="adaptive", learning_rate_init=0.0003,
                          batch_size=64, max_iter=300,
                          early_stopping=True, n_iter_no_change=30,
                          random_state=random_state)
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)

    print("\nClassification report:\n")
    print(classification_report(y_test, y_preds))
    print("\nConfusion matrix:\n")
    print(confusion_matrix(y_test, y_preds))

    return (
        accuracy_score(y_test, y_preds)*100,
        precision_score(y_test, y_preds, average="weighted")*100,
        recall_score(y_test, y_preds, average="weighted")*100,
        f1_score(y_test, y_preds, average="weighted")*100
    )

In [4]:
# Read the three cleaned UNSW-NB15 splits from disk.
df_train = pd.read_csv('../UNSW-NB15/cleaned_data/UNSW_NB15_training-set_cleaned.csv')
df_validation = pd.read_csv('../UNSW-NB15/cleaned_data/UNSW_NB15_validation-set_cleaned.csv')
df_test = pd.read_csv('../UNSW-NB15/cleaned_data/UNSW_NB15_testing-set_cleaned.csv')

# Features are every column except the last two; the target is the
# second-to-last column. The final column is not used here.
# NOTE(review): presumably the last two columns are two label variants —
# confirm against the cleaned CSV schema.
X_train = df_train.iloc[:, :-2]
y_train = df_train.iloc[:, -2]

X_validation = df_validation.iloc[:, :-2]
y_validation = df_validation.iloc[:, -2]

X_test = df_test.iloc[:, :-2]
y_test = df_test.iloc[:, -2]

In [5]:
def genetic_selector(n_population, X_train, y_train, use_validation_set=False, X_validation=None, y_validation=None, verbose=0):
    """Fit a ``GeneticSelectionCV`` feature selector and count the kept features.

    The selector wraps an ``ExtraTreeClassifier`` scored with weighted F1 and
    runs for at most 150 generations, with ``n_gen_no_change=30`` and
    ``n_jobs=-1``.

    Parameters
    ----------
    n_population
        Population size handed to ``GeneticSelectionCV``.
    X_train, y_train
        Data the selector is fitted on.
    use_validation_set, X_validation, y_validation
        Forwarded to ``selector.fit`` as ``use_validation_set``, ``valid_X``
        and ``valid_y``.
        NOTE(review): presumably these make the fitness be computed on the
        validation split — confirm in the ``genetic_selection`` package.
    verbose
        Verbosity level passed through to the selector.

    Returns
    -------
    tuple
        ``(selector, sel_features)``: the fitted selector and the number of
        truthy entries in ``selector.support_``.
    """
    weighted_f1 = make_scorer(f1_score, average="weighted")
    selector = GeneticSelectionCV(
        estimator=ExtraTreeClassifier(),
        scoring=weighted_f1,
        n_population=n_population,
        n_generations=150,
        n_gen_no_change=30,
        verbose=verbose,
        n_jobs=-1,
    )
    selector.fit(
        X_train,
        y_train,
        use_validation_set=use_validation_set,
        valid_X=X_validation,
        valid_y=y_validation,
    )

    # Count the features whose support flag is set.
    sel_features = sum(1 for is_kept in selector.support_ if is_kept)
    return selector, sel_features

In [6]:
# Empty table that collects one row per experiment.
result_df = pd.DataFrame(
    columns=[
        "experiment",
        "pop_size",
        "features",
        "accuracy",
        "precision",
        "recall",
        "f1-score",
    ]
)

# Apply seaborn's default theme to the plots drawn later.
sb.set_theme()

In [7]:
# Run 30 experiments. Each one draws a population size from [80, 120]
# (inclusive), runs genetic feature selection using the validation split,
# then trains and scores the MLP on the selected features.
# NOTE: rows are appended to `result_df`, so re-running this cell without
# re-creating `result_df` adds 30 more rows.
for i in range(30):
    n_population = random.randint(80, 120)
    validation_selector, sel_features = genetic_selector(n_population, X_train, y_train, use_validation_set=True, X_validation=X_validation, y_validation=y_validation, verbose=1)

    # Keep only the columns the fitted selector chose.
    X_train_valid_selected = validation_selector.transform(X_train)
    X_test_valid_selected = validation_selector.transform(X_test)

    # One result row: experiment id, population size, feature count, then
    # the four metrics returned by eval().
    result_df.loc[len(result_df.index)] = [i, n_population, sel_features, *eval(X_train_valid_selected, y_train, X_test_valid_selected, y_test)]

# A bare expression is only displayed when it is the cell's last top-level
# statement; inside the loop body it had no effect.
result_df

Selecting features with genetic algorithm.
gen	nevals	avg                               	std                               	min                            	max                                  
0  	80    	[  0.715403  97.7625     0.      ]	[  0.111685  54.516796   0.      ]	[ 0.288971  3.        0.      ]	[   0.780943  191.          0.      ]
1  	49    	[   0.76559  128.4375     0.     ]	[  0.010233  35.895976   0.      ]	[ 0.72995  8.       0.     ]   	[   0.780943  190.          0.      ]
2  	51    	[   0.770703  143.8375      0.      ]	[  0.007703  27.578725   0.      ]	[  0.73246  75.        0.     ]	[   0.783542  183.          0.      ]
3  	50    	[   0.772709  151.375       0.      ]	[  0.00745   22.499653   0.      ]	[  0.740476  79.         0.      ]	[   0.785806  181.          0.      ]
4  	46    	[   0.77355  151.0875     0.     ]   	[  0.009276  16.692209   0.      ]	[   0.731291  107.          0.      ]	[   0.785897  181.          0.      ]
5  	52    	[   0.777144  150.2125

In [None]:
# Long-format view of the four score columns (one row per experiment/metric).
result_dfm = result_df.melt(
    id_vars="experiment",
    value_vars=["accuracy", "precision", "recall", "f1-score"],
    var_name="metric",
    value_name="percentage",
)
# Long-format view of the population size and selected-feature count.
result_dfs = result_df.melt(
    id_vars="experiment",
    value_vars=["pop_size", "features"],
    var_name="size",
    value_name="sizes",
)

fig, ax1 = plt.subplots(figsize=(24, 4))
fig.suptitle("Comparison of different feature selections")

# Bars: scores per experiment. Lines: sizes per experiment, on the same axes.
sb.barplot(
    data=result_dfm,
    x="experiment",
    y="percentage",
    hue="metric",
    palette=["tab:red", "tab:green", "tab:orange", "tab:blue"],
    ax=ax1,
)
sb.lineplot(
    data=result_dfs,
    x="experiment",
    y="sizes",
    hue="size",
    palette=["purple", "blue"],
    marker='o',
    ax=ax1,
)

# Anchor the legend at x=1.04 in axes coordinates, i.e. right of the plot area.
ax1.legend(bbox_to_anchor=(1.04, 0.75), borderaxespad=0, title="Metrics")
plt.show()

In [None]:
# Display the accumulated per-experiment results table.
result_df

In [None]:
# Summary statistics of the population size and the four test-set metrics
# across all recorded experiments.
result_df[["pop_size", "accuracy", "recall", "precision", "f1-score"]].describe()

In [None]:
# Persist the results. `DataFrame.to_csv` does not create missing parent
# directories, so make sure the output folder exists first; otherwise the
# write would fail only after all the experiments have already run.
os.makedirs('../output', exist_ok=True)
result_df.to_csv('../output/test-3-result.csv', float_format='%f', index=False)