In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif, chi2, SelectKBest
from ITMO_FS import filters
from numpy import inf
from geneticAlg import GeneticSelection
from itertools import permutations
import time
import pickle
import matplotlib.pyplot as plt

# Import datasets

In [None]:
# NSL-KDD ships without a header row: 41 feature names followed by the class label.
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "labels",
]

# Only the first 42 fields of each row are read and named.
nslkddTrain = pd.read_csv(
    './NSL-KDD/KDDTrain+_20Percent.txt',
    sep=",",
    header=None,
    usecols=list(range(42)),
    names=columns,
)

# Cast the flag-like NSL-KDD columns to object dtype so that they are selected
# together with the string columns by the later select_dtypes(include=['object']).
nslkddTrain = nslkddTrain.astype({
    flagColumn: 'object'
    for flagColumn in ('land', 'logged_in', 'urgent', 'is_host_login', 'is_guest_login')
})

# UNSW-NB15 train set: a single blank (' ') is read as missing, the two flag
# columns become objects, the target becomes int32, and the multi-class
# category and the row id columns are dropped.
unswTrain = (
    pd.read_csv('./UNSW-NB15/UNSW_NB15_training-set.csv', sep=",", na_values=[' '])
    .astype({'is_ftp_login': 'object', 'is_sm_ips_ports': 'object', 'label': 'int32'})
    .drop(columns=["attack_cat", "id"])
)



In [None]:
# Report (rows, columns) for both training frames.
for caption, loadedFrame in (
    ("shape of NSL-KDD train set", nslkddTrain),
    ("shape of UNSW train set: ", unswTrain),
):
    print(caption, loadedFrame.shape)

In [None]:
# Columns removed by the authors of GA-LR; print how the values of each one
# are distributed in the NSL-KDD training frame.
removedCols = [
    'land',
    'urgent',
    'num_failed_logins',
    'root_shell',
    'su_attempted',
    'num_shells',
    'num_outbound_cmds',
    'is_host_login',
]
for removedName in removedCols:
    print(removedName, nslkddTrain[removedName].value_counts(), sep="\n")

# Feature encoding

In [None]:

# Binarize the NSL-KDD target: 'normal' -> 0, any other class -> 1.
# The guard keeps the cell re-runnable: once the column already holds 0/1,
# comparing against 'normal' is always False and every row would become 1.
if not pd.api.types.is_numeric_dtype(nslkddTrain['labels']):
    nslkddTrain['labels'] = nslkddTrain['labels'].map(lambda x: 0 if (x == 'normal') else 1)

# Ordinal-encode every column that is still of object dtype
# (string features plus the flag columns cast to object at load time).
toEncodeKdd = list(nslkddTrain.select_dtypes(include=['object']).columns)
toEncodeUnsw = list(unswTrain.select_dtypes(include=['object']).columns)

# Class-level patch: make OrdinalEncoder report its input column names as its
# output names, so ColumnTransformer.get_feature_names_out() below can label
# the encoded frame. NOTE(review): presumably needed because the installed
# scikit-learn release lacks this method on OrdinalEncoder - confirm.
OrdinalEncoder.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

cat_encoding = Pipeline([
    ('cat_encoder', OrdinalEncoder())
    ])

# Encoded columns come first in the output; all remaining columns are passed
# through unchanged and keep their plain names (verbose_feature_names_out=False).
ctEncodingKdd = ColumnTransformer([
    ('cat', cat_encoding, toEncodeKdd)
    ], remainder="passthrough", verbose_feature_names_out=False)

ctEncodingUnsw = ColumnTransformer([
    ('cat', cat_encoding, toEncodeUnsw)
    ], remainder="passthrough", verbose_feature_names_out=False)

# fit_transform is evaluated before get_feature_names_out (argument order),
# so the names always describe the transformer that has just been fitted.
nslkddTrainEncoded = pd.DataFrame.from_records(ctEncodingKdd.fit_transform(nslkddTrain), columns=ctEncodingKdd.get_feature_names_out())
unswTrainEncoded = pd.DataFrame.from_records(ctEncodingUnsw.fit_transform(unswTrain), columns=ctEncodingUnsw.get_feature_names_out())


# Preprocessing

In [None]:
# Columns with large value ranges that are compressed with log base 10.
largevalueskdd = ['duration', 'src_bytes', 'dst_bytes', 'num_compromised', 'num_root', 'count', 'srv_count', 'dst_host_count', 'dst_host_srv_count']
largevaluesunsw = ['spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'smean', 'dmean', 'trans_depth', 'response_body_len']


def _log10_zero_floor(values):
    """Return log10 of a pandas Series with -inf (the result for 0) replaced by 0.

    The replacement is done on the returned Series rather than in place on a
    column selection: a chained `frame[col].replace(..., inplace=True)` does
    not reliably write back to the frame (never under pandas Copy-on-Write),
    which would leave -inf values in the data handed to MinMaxScaler.
    """
    return np.log10(values).replace([-np.inf], 0)


# The logarithm is always computed from the raw (un-logged) frames. These hold
# the same values for the listed columns, because the encoding step passes
# them through unchanged, and share the same default row index. Reading from
# the raw frames keeps the cell idempotent: re-running it does not stack a
# second logarithm on already transformed data.
for colname in largevalueskdd:
    nslkddTrainEncoded[colname] = _log10_zero_floor(nslkddTrain[colname])

for colname in largevaluesunsw:
    unswTrainEncoded[colname] = _log10_zero_floor(unswTrain[colname])

# Apply minmax scaler on numerical values.
# Class-level patch so the scaler reports its input column names as output names.
MinMaxScaler.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

# Every float64/int64 column is scaled; select_dtypes makes no exception for
# the target column, so it is included whenever it has one of these dtypes.
numColskdd = list(nslkddTrainEncoded.select_dtypes(include=['float64', 'int64']).columns)
numColsunsw = list(unswTrainEncoded.select_dtypes(include=['float64', 'int64']).columns)

num_minmaxscaling = Pipeline([
    ('num_minmaxscaling', MinMaxScaler()),
])


ctMinMaxkdd = ColumnTransformer([
    ('minmax', num_minmaxscaling, numColskdd)
], remainder="passthrough", verbose_feature_names_out=False)

ctMinMaxunsw = ColumnTransformer([
    ('minmax', num_minmaxscaling, numColsunsw)
], remainder="passthrough", verbose_feature_names_out=False)


nslkddTrainNormalized = pd.DataFrame.from_records(ctMinMaxkdd.fit_transform(nslkddTrainEncoded), columns=ctMinMaxkdd.get_feature_names_out())
unswTrainNormalized  = pd.DataFrame.from_records(ctMinMaxunsw.fit_transform(unswTrainEncoded), columns=ctMinMaxunsw.get_feature_names_out())



In [None]:
# Put the target at the end: all feature columns in their current order, then 'label'.
unswFeatureNames = unswTrainNormalized.columns.drop('label').tolist()
reorderedColumns = unswFeatureNames + ['label']
unswTrainNormalized = unswTrainNormalized.loc[:, reorderedColumns]

In [None]:
# Delete the same columns as Khamassi et al.
# drop(..., errors='ignore') instead of `del` in a loop: re-running this cell
# is then a no-op instead of raising KeyError on the already removed columns.
nslkddTrainNormalized = nslkddTrainNormalized.drop(columns=removedCols, errors='ignore')

# Sampling of the datasets

In [None]:
# Draw three fixed-size random subsets per dataset. The seed is constant per
# dataset (86 for NSL-KDD, 73 for UNSW-NB15) so the draws are repeatable.
sampleSizes = (10000, 15000, 20000)

nslkdd10000, nslkdd15000, nslkdd20000 = [
    nslkddTrainNormalized.sample(sampleSize, random_state=86) for sampleSize in sampleSizes
]

unsw10000, unsw15000, unsw20000 = [
    unswTrainNormalized.sample(sampleSize, random_state=73) for sampleSize in sampleSizes
]

In [None]:
def _split_into_folds(sample, labelColumn, n_folds=10):
    """Cut `sample` into `n_folds` consecutive, equally sized row blocks.

    Each fold is a tuple (features, target): the features are every column
    except the last one, the target is the `labelColumn` Series taken from
    the last column of the block.
    """
    foldSize = len(sample) // n_folds
    folds = []
    for firstRow in range(0, foldSize * n_folds, foldSize):
        rows = sample.iloc[firstRow:firstRow + foldSize]
        folds.append((rows.iloc[:, :-1], rows.iloc[:, -1:][labelColumn]))
    return folds


# split in 10 folds
nslkdd10000folds = _split_into_folds(nslkdd10000, 'labels')
nslkdd15000folds = _split_into_folds(nslkdd15000, 'labels')
nslkdd20000folds = _split_into_folds(nslkdd20000, 'labels')

unsw10000folds = _split_into_folds(unsw10000, 'label')
unsw15000folds = _split_into_folds(unsw15000, 'label')
unsw20000folds = _split_into_folds(unsw20000, 'label')


In [None]:
# split in 10 folds
# NOTE(review): this cell rebuilds exactly the same fold lists as the cell
# directly before it; one of the two cells is redundant and can be deleted.
foldSpecs = [
    (nslkdd10000, 1000, 'labels'),
    (nslkdd15000, 1500, 'labels'),
    (nslkdd20000, 2000, 'labels'),
    (unsw10000, 1000, 'label'),
    (unsw15000, 1500, 'label'),
    (unsw20000, 2000, 'label'),
]

# One list of ten (features, target) tuples per sample: features are all
# columns but the last, the target is read from the last column.
builtFolds = [
    [
        (
            sample.iloc[firstRow:firstRow + foldSize, :-1],
            sample.iloc[firstRow:firstRow + foldSize, -1:][targetName],
        )
        for firstRow in range(0, 10 * foldSize, foldSize)
    ]
    for sample, foldSize, targetName in foldSpecs
]

(nslkdd10000folds, nslkdd15000folds, nslkdd20000folds,
 unsw10000folds, unsw15000folds, unsw20000folds) = builtFolds


# Identify best folds for GA-LR

In [None]:
# NOTE(review): penalty='none' is deprecated in newer scikit-learn releases in
# favour of penalty=None - confirm against the installed version.
lr = LogisticRegression(solver='newton-cg', random_state=21, penalty='none', n_jobs=-1)

datasetskdd = [nslkdd10000folds, nslkdd15000folds, nslkdd20000folds]
datasetsunsw = [unsw10000folds, unsw15000folds, unsw20000folds]
bestfoldskdd = []
bestfoldsunsw = []


def _best_fold_pair(model, folds):
    """Try every ordered (train fold, test fold) pair and keep the best one.

    The model is fitted on the first fold of the pair and scored on the
    second. Returns (best score, best pair); the pair stays None when no
    score exceeds 0, and the first pair wins on ties.
    """
    topScore = 0
    topPair = None
    for foldPair in permutations(folds, 2):
        trainFold, testFold = foldPair
        model.fit(trainFold[0], trainFold[1])
        accuracy = model.score(testFold[0], testFold[1])
        if accuracy > topScore:
            topScore, topPair = accuracy, foldPair
    return topScore, topPair


for datasetwithfolds in datasetskdd:
    bestscore, bestpermutation = _best_fold_pair(lr, datasetwithfolds)
    print(bestscore)
    bestfoldskdd.append(bestpermutation)

print("Search for best folds in UNSW-NB15")
for datasetwithfolds in datasetsunsw:
    bestscore, bestpermutation = _best_fold_pair(lr, datasetwithfolds)
    print(bestscore)
    bestfoldsunsw.append(bestpermutation)

# GA-LR

In [None]:
estimator = LogisticRegression(solver="newton-cg", random_state=21, penalty='none', n_jobs=-1)
geneticalg = GeneticSelection(
    estimator,
    verbose=1,
    n_population=30,
    n_generations=1000,
    estimator_weight=0.8,
    crossover_proba=0.9,
    mutation_proba=0.03,
)
selectedFeatures = []
# Run counter kept at module level; it is incremented once per fitted fold pair.
i = 0

# Banner printed for the first three runs, keyed by the value of the counter.
foldBanners = {
    0: "Selecting features based on 1000 instances fold",
    1: "Selecting features based on 1500 instances fold",
    2: "Selecting features based on 2000 instances fold",
}

print("Selecting features for NSL-KDD")
for bestfolds in bestfoldskdd:
    start = time.time()
    if i in foldBanners:
        print(foldBanners[i])

    # Each entry is a (train fold, test fold) pair of (features, target) tuples.
    trainset, testset = bestfolds[0], bestfolds[1]
    geneticalg = geneticalg.fit(
        X=trainset[0].values,
        y=trainset[1],
        X_test=testset[0].values,
        y_test=testset[1],
    )
    selectedFeatures.append(geneticalg.support_)
    print(geneticalg.n_features_)
    i += 1
    end = time.time()
    print("Execution time: ", end-start)

In [None]:
# Fold-size banners, by position of the fold pair inside bestfoldsunsw.
# A local enumerate() index is used instead of the module-level counter `i`:
# that counter is left at 3 by the NSL-KDD cell, so none of the banners was
# ever printed for UNSW-NB15.
unswFoldBanners = [
    "Selecting features based on 1000 instances fold",
    "Selecting features based on 1500 instances fold",
    "Selecting features based on 2000 instances fold",
]

print("Selecting features for UNSW-NB15")
for foldPosition, bestfolds in enumerate(bestfoldsunsw):
    start = time.time()
    if foldPosition < len(unswFoldBanners):
        print(unswFoldBanners[foldPosition])

    # Each entry is a (train fold, test fold) pair of (features, target) tuples.
    trainset = bestfolds[0]
    testset = bestfolds[1]
    geneticalg = geneticalg.fit(X=trainset[0].values, y=trainset[1], X_test=testset[0].values, y_test=testset[1])
    # Results are appended after the NSL-KDD entries already in the list.
    selectedFeatures.append(geneticalg.support_)
    print(geneticalg.n_features_)
    end = time.time()
    print("Execution time: ", end-start)

In [None]:
# Show the `support_` values collected from the genetic selection runs
# (one entry appended per fitted fold pair).
print(selectedFeatures)

# Data discretization

In [None]:
# Class-level patch so the discretizer reports its input column names as output names.
KBinsDiscretizer.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

# Every float64/int64 column except the target is binned.
toDiscretizeNsl = list(nslkdd10000.select_dtypes(include=['float64', 'int64']).columns)
toDiscretizeNsl.remove('labels')

toDiscretizeUNSW = list(unsw10000.select_dtypes(include=['float64', 'int64']).columns)
toDiscretizeUNSW.remove('label')

# Ten equal-width bins per column, encoded as the ordinal bin index.
num_discretization = Pipeline([
    ('num_discretizer', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform'))
])

ctDiscretizationNSL = ColumnTransformer(
    [('num', num_discretization, toDiscretizeNsl)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

ctDiscretizationUNSW = ColumnTransformer(
    [('num', num_discretization, toDiscretizeUNSW)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)


def _discretize(transformer, sample):
    """Refit `transformer` on `sample` and return the binned data as a frame.

    The column names are read after fitting, so they describe this fit.
    """
    binned = transformer.fit_transform(sample)
    return pd.DataFrame.from_records(binned, columns=transformer.get_feature_names_out())


# The bins are fitted separately on every sample.
df_norm_NSL10000 = _discretize(ctDiscretizationNSL, nslkdd10000)
df_norm_UNSW10000 = _discretize(ctDiscretizationUNSW, unsw10000)
df_norm_NSL15000 = _discretize(ctDiscretizationNSL, nslkdd15000)
df_norm_UNSW15000 = _discretize(ctDiscretizationUNSW, unsw15000)
df_norm_NSL20000 = _discretize(ctDiscretizationNSL, nslkdd20000)
df_norm_UNSW20000 = _discretize(ctDiscretizationUNSW, unsw20000)

# Filter-based feature selection

In [None]:
# Detach the target from each discretized frame: pop() returns the column and
# removes it from the frame in place, leaving only the features behind.
labelsNSLKDD10000 = df_norm_NSL10000.pop('labels')
labelsNSLKDD15000 = df_norm_NSL15000.pop('labels')
labelsNSLKDD20000 = df_norm_NSL20000.pop('labels')

labelsUNSW10000 = df_norm_UNSW10000.pop('label')
labelsUNSW15000 = df_norm_UNSW15000.pop('label')
labelsUNSW20000 = df_norm_UNSW20000.pop('label')


In [None]:
# Seed for the mutual information estimate, which is randomized; without a
# seed every call returns a slightly different ranking.
MI_RANDOM_STATE = 21


def _seeded_mutual_info(X, y):
    """Score function for SelectKBest: mutual information with a fixed seed."""
    return mutual_info_classif(X, y, random_state=MI_RANDOM_STATE)


def _top_k_feature_subsets(features, target, score_func):
    """Return the names of the k best features for every k = 1..n_features.

    The scores are computed once (SelectKBest fitted with k='all'); only `k`
    is changed afterwards, because the support mask is derived from the stored
    scores and the current `k`. Fitting once keeps the subsets of one score
    function nested and avoids recomputing the scores n_features times.
    Entry i of the returned list holds the i+1 selected column names.
    """
    selector = SelectKBest(score_func, k='all').fit(features, target)
    subsets = []
    for k in range(1, features.shape[1] + 1):
        selector.set_params(k=k)
        subsets.append(selector.get_feature_names_out(input_features=None))
    return subsets


def _filter_feature_subsets(features, target):
    """Build the {'chi2', 'MI', 'SU'} dictionary of top-k subsets for one dataset."""
    return {
        'chi2': _top_k_feature_subsets(features, target, chi2),
        'MI': _top_k_feature_subsets(features, target, _seeded_mutual_info),
        'SU': _top_k_feature_subsets(features, target, filters.univariate.su_measure),
    }


NSLKDD10000Features = _filter_feature_subsets(df_norm_NSL10000, labelsNSLKDD10000)
NSLKDD15000Features = _filter_feature_subsets(df_norm_NSL15000, labelsNSLKDD15000)
NSLKDD20000Features = _filter_feature_subsets(df_norm_NSL20000, labelsNSLKDD20000)

UNSW10000Features = _filter_feature_subsets(df_norm_UNSW10000, labelsUNSW10000)
UNSW15000Features = _filter_feature_subsets(df_norm_UNSW15000, labelsUNSW15000)
UNSW20000Features = _filter_feature_subsets(df_norm_UNSW20000, labelsUNSW20000)



In [None]:
# Persist each feature-subset dictionary to its own pickle file in the
# current working directory (target file name -> object to store).
featureDumps = {
    'NSLKDD10000Features.pickle': NSLKDD10000Features,
    'NSLKDD15000Features.pickle': NSLKDD15000Features,
    'NSLKDD20000Features.pickle': NSLKDD20000Features,
    'UNSWNB15_10000Features.pickle': UNSW10000Features,
    'UNSWNB15_15000Features.pickle': UNSW15000Features,
    'UNSWNB15_20000Features.pickle': UNSW20000Features,
}

for dumpPath, featureSets in featureDumps.items():
    with open(dumpPath, 'wb') as f:
        pickle.dump(featureSets, f)
