In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
import time
import pickle
import matplotlib.pyplot as plt

# Import datasets


In [None]:
# Column names applied to the NSL-KDD file: 41 feature columns followed by the class label
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "labels",
]

# Only the first 42 fields of each record are read; any further field is ignored
nslkddTrain = pd.read_csv(
    './NSL-KDD/KDDTrain+.txt',
    sep=",",
    header=None,
    usecols=list(range(42)),
    names=columns,
)

# Flag-like NSL-KDD features are stored as objects so that the encoding step
# treats them as categories instead of numbers
for flagColumn in ('land', 'logged_in', 'urgent', 'is_host_login', 'is_guest_login'):
    nslkddTrain[flagColumn] = nslkddTrain[flagColumn].astype('object', copy=False)

# UNSW-NB15 train set; a single blank is read as a missing value
unswTrain = pd.read_csv('./UNSW-NB15/UNSW_NB15_training-set.csv', sep=",", na_values=[' '])

# Same treatment for the flag-like UNSW-NB15 features
for flagColumn in ('is_ftp_login', 'is_sm_ips_ports'):
    unswTrain[flagColumn] = unswTrain[flagColumn].astype('object', copy=False)

# 'attack_cat' is the target used further down, so 'label' and the row
# identifier are dropped from the frame
unswTrain = unswTrain.drop(columns=["label", "id"])

# Make class categories in NSL-KDD

In [None]:
# Maps every individual NSL-KDD label to one of the five categories used for
# classification: normal, DoS, Probe, R2L and U2R.
clustersMapping = {
    'normal': 'normal',

    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',

    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'sqlattack': 'R2L',

    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'ps': 'U2R',
    'xterm': 'U2R',
    'httptunnel': 'U2R',
    'worm': 'U2R',
}

mappedLabels = nslkddTrain['labels'].map(clustersMapping)

# Series.map() yields NaN for any value missing from the dictionary, and the
# groupby used for sampling would then drop those rows without notice. This
# also happens when the cell is run a second time, because the labels have
# already been replaced by category names. Fail loudly instead.
unmappedLabels = nslkddTrain.loc[mappedLabels.isna(), 'labels'].unique()
if len(unmappedLabels) > 0:
    raise ValueError(
        'Labels without a category in clustersMapping: '
        + ', '.join(str(label) for label in unmappedLabels)
        + ' (reload nslkddTrain if this cell was already executed)'
    )

nslkddTrain['labels'] = mappedLabels


# Sampling of NSL-KDD while keeping the same distribution

In [None]:
# Number of records in each attack category
labelCounts = nslkddTrain['labels'].value_counts()
print(labelCounts)

# Share of the training set held by each category, printed in the same order
totalRecords = nslkddTrain.shape[0]
for value in labelCounts:
    print(value/totalRecords)

In [None]:
# Number of rows in the stratified NSL-KDD sample, and the seed that makes the
# draw repeatable after a kernel restart (an unseeded sample() returns a
# different subset on every run, so no reported score could be reproduced).
NSLKDD_SAMPLE_SIZE = 50000
NSLKDD_SAMPLE_SEED = 42

# Class shares printed by the distribution cell, converted to row counts
normalCount = round(0.5345828074269883 * NSLKDD_SAMPLE_SIZE)
DoSCount = round(0.3645781238836894 * NSLKDD_SAMPLE_SIZE)
ProbeCount = round(0.09252776388591206 * NSLKDD_SAMPLE_SIZE)
R2LCount = round(0.007898517936383194 * NSLKDD_SAMPLE_SIZE)
U2RCount = round(0.00041278686702706137 * NSLKDD_SAMPLE_SIZE)

nslkddSampleSizes = {
    'normal': normalCount,
    'DoS': DoSCount,
    'Probe': ProbeCount,
    'R2L': R2LCount,
    'U2R': U2RCount,
}

# Draw the requested number of rows from every category
grouped = nslkddTrain.groupby("labels")
nslkddSamples = {
    label: rows.sample(nslkddSampleSizes[label], random_state=NSLKDD_SAMPLE_SEED)
    for label, rows in grouped
    if label in nslkddSampleSizes
}

normaldataset = nslkddSamples['normal']
dosdataset = nslkddSamples['DoS']
probedataset = nslkddSamples['Probe']
r2ldataset = nslkddSamples['R2L']
u2rdataset = nslkddSamples['U2R']

# Concat all datasets (the order is kept: it determines the row order seen by
# the cross-validation splitter)
nslkddTrainSampled = pd.concat([normaldataset, dosdataset, probedataset, r2ldataset, u2rdataset])

# Sampling of UNSW-NB15

In [None]:
# Number of records in each UNSW-NB15 attack category
attackCounts = unswTrain['attack_cat'].value_counts()
print(attackCounts)

# Share of the training set held by each category, printed in the same order
totalRecords = unswTrain.shape[0]
for value in attackCounts:
    print(value/totalRecords)

In [None]:
# Number of rows in the stratified UNSW-NB15 sample, and the seed that makes
# the draw repeatable after a kernel restart (an unseeded sample() returns a
# different subset on every run).
UNSW_SAMPLE_SIZE = 50000
UNSW_SAMPLE_SEED = 42

# Class shares printed by the distribution cell, converted to row counts.
# NOTE: DoSCount is the same name the NSL-KDD sampling cell assigns; after this
# cell it holds the UNSW-NB15 value.
NormalCount = round(0.44939999028324346 * UNSW_SAMPLE_SIZE)
GenericCount = round(0.2292061409901375 * UNSW_SAMPLE_SIZE)
ExploitsCount = round(0.13520866734683962 * UNSW_SAMPLE_SIZE)
FuzzersCount = round(0.07362872273235194 * UNSW_SAMPLE_SIZE)
DoSCount = round(0.049664771899140064 * UNSW_SAMPLE_SIZE)
ReconnaissanceCount = round(0.04246222610892484 * UNSW_SAMPLE_SIZE)
AnalysisCount = round(0.008222805227615022 * UNSW_SAMPLE_SIZE)
BackdoorCount = round(0.007081086333381917 * UNSW_SAMPLE_SIZE)
ShellcodeCount = round(0.004591167468299082 * UNSW_SAMPLE_SIZE)
WormsCount = round(0.0005344216100665598 * UNSW_SAMPLE_SIZE)

unswSampleSizes = {
    'Normal': NormalCount,
    'Generic': GenericCount,
    'Exploits': ExploitsCount,
    'Fuzzers': FuzzersCount,
    'DoS': DoSCount,
    'Reconnaissance': ReconnaissanceCount,
    'Analysis': AnalysisCount,
    'Backdoor': BackdoorCount,
    'Shellcode': ShellcodeCount,
    # The ten rounded counts add up to 49,999; one extra Worms row brings the
    # sample to exactly 50,000 rows.
    'Worms': WormsCount + 1,
}

# Draw the requested number of rows from every category
grouped = unswTrain.groupby("attack_cat")
unswSamples = {
    category: rows.sample(unswSampleSizes[category], random_state=UNSW_SAMPLE_SEED)
    for category, rows in grouped
    if category in unswSampleSizes
}

normaldataset = unswSamples['Normal']
genericdataset = unswSamples['Generic']
exploitsdataset = unswSamples['Exploits']
fuzzersdataset = unswSamples['Fuzzers']
dosdataset = unswSamples['DoS']
reconnaissancedataset = unswSamples['Reconnaissance']
analysisdataset = unswSamples['Analysis']
backdoordataset = unswSamples['Backdoor']
shellcodedataset = unswSamples['Shellcode']
wormsdataset = unswSamples['Worms']

# Concat all datasets (the order is kept: it determines the row order seen by
# the cross-validation splitter)
unswTrainSampled = pd.concat([normaldataset, genericdataset, exploitsdataset, fuzzersdataset, dosdataset, reconnaissancedataset, analysisdataset, backdoordataset, shellcodedataset, wormsdataset])


# Features encoding

In [None]:
# Detach the NSL-KDD target column from the sampled frame and encode the
# category names as integers
lekdd = LabelEncoder()
nslkddTrainLabels = lekdd.fit_transform(nslkddTrainSampled.pop('labels'))

# Same for the UNSW-NB15 target column
leunsw = LabelEncoder()
unswTrainLabels = leunsw.fit_transform(unswTrainSampled.pop('attack_cat'))

# Perform ordinal encoding on remaining string values
toEncodeKdd = nslkddTrainSampled.select_dtypes(include=['object']).columns.tolist()
toEncodeUnsw = unswTrainSampled.select_dtypes(include=['object']).columns.tolist()


def _input_feature_names(self, names=None):
    """Report the fitted input column names as the output feature names."""
    return self.feature_names_in_


# Installed on the class so that ColumnTransformer.get_feature_names_out() can
# name the encoded columns
OrdinalEncoder.get_feature_names_out = _input_feature_names

cat_encoding = Pipeline(steps=[('cat_encoder', OrdinalEncoder())])

# Encoded columns come first in the output; every other column is passed through
ctEncodingKdd = ColumnTransformer(
    [('cat', cat_encoding, toEncodeKdd)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)
ctEncodingUnsw = ColumnTransformer(
    [('cat', cat_encoding, toEncodeUnsw)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

encodedKdd = ctEncodingKdd.fit_transform(nslkddTrainSampled)
nslkddTrainEncoded = pd.DataFrame.from_records(encodedKdd, columns=ctEncodingKdd.get_feature_names_out())

encodedUnsw = ctEncodingUnsw.fit_transform(unswTrainSampled)
unswTrainEncoded = pd.DataFrame.from_records(encodedUnsw, columns=ctEncodingUnsw.get_feature_names_out())

# Preprocessing

In [None]:
def _log10_zero_safe(values):
    """Return log10 of a column, with the -inf produced by zeros replaced by 0."""
    # log10(0) is -inf and raises a divide-by-zero warning that is expected here
    with np.errstate(divide='ignore'):
        logged = np.log10(values)
    return logged.replace(-np.inf, 0)


# Apply log base 10 to columns containing large values
largevalueskdd = ['duration', 'src_bytes', 'dst_bytes', 'num_compromised', 'num_root', 'count', 'srv_count', 'dst_host_count', 'dst_host_srv_count']
largevaluesunsw = ['spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'stcpb', 'dtcpb', 'smean', 'dmean']

# NOTE: both loops overwrite the encoded frames in place, so running this cell
# twice applies the logarithm twice; re-run the encoding cell first.
for colname in largevalueskdd:
    nslkddTrainEncoded[colname] = _log10_zero_safe(nslkddTrainEncoded[colname])

for colname in largevaluesunsw:
    # The logarithm must be taken from the sampled, encoded frame itself.
    # Reading the column from the unsampled unswTrain frame would align on the
    # row index and pull in values from rows that are not part of the sample,
    # which no longer correspond to unswTrainLabels.
    unswTrainEncoded[colname] = _log10_zero_safe(unswTrainEncoded[colname])

# Apply minmax scaler on numerical values
# Installed on the class so that ColumnTransformer.get_feature_names_out() can
# name the scaled columns
MinMaxScaler.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

numColskdd = list(nslkddTrainEncoded.select_dtypes(include=['float64', 'int64']).columns)
numColsunsw = list(unswTrainEncoded.select_dtypes(include=['float64', 'int64']).columns)

num_minmaxscaling = Pipeline([
    ('num_minmaxscaling', MinMaxScaler()),
])

ctMinMaxkdd = ColumnTransformer([
    ('minmax', num_minmaxscaling, numColskdd)
], remainder="passthrough", verbose_feature_names_out=False)

ctMinMaxunsw = ColumnTransformer([
    ('minmax', num_minmaxscaling, numColsunsw)
], remainder="passthrough", verbose_feature_names_out=False)

nslkddTrainNormalized = pd.DataFrame.from_records(ctMinMaxkdd.fit_transform(nslkddTrainEncoded), columns=ctMinMaxkdd.get_feature_names_out())
unswTrainNormalized  = pd.DataFrame.from_records(ctMinMaxunsw.fit_transform(unswTrainEncoded), columns=ctMinMaxunsw.get_feature_names_out())

# Classification
## Classification on full train set

In [None]:
def detection_rate(y, y_predicted):
    """Mean per-class detection rate, tp / (tp + fn), of a set of predictions.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_predicted : array-like
        Predicted class labels.

    Returns
    -------
    float
        Average of the per-class rates over the classes that occur in ``y``.
        A class that only shows up in ``y_predicted`` has no true samples, so
        its rate is undefined; it is left out of the average instead of
        turning the whole score into NaN. NaN is returned only when no class
        has any true sample.

    The per-class rates are printed on every call (NaN marks an undefined one).
    """
    matrix = confusion_matrix(y_true=y, y_pred=y_predicted)
    tp = np.diag(matrix).astype(float)
    # Each row of the confusion matrix sums to tp + fn for that class
    support = matrix.sum(axis=1).astype(float)
    has_support = support > 0

    dr = np.full(tp.shape, np.nan)
    dr[has_support] = tp[has_support] / support[has_support]
    print(dr)

    if not has_support.any():
        return np.nan
    return dr[has_support].mean()


def false_alarm_rate(y, y_predicted):
    """Mean per-class false alarm rate, fp / (fp + tn), of a set of predictions.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_predicted : array-like
        Predicted class labels.

    Returns
    -------
    float
        Average of the per-class rates over the classes that have at least one
        negative sample (a sample whose true class is a different one). A
        class without negatives has an undefined rate; it is left out of the
        average instead of turning the whole score into NaN. NaN is returned
        only when no class has any negative sample.
    """
    matrix = confusion_matrix(y_true=y, y_pred=y_predicted)
    tp = np.diag(matrix)
    # Column sums count every prediction of a class; removing the hits leaves fp
    fp = (matrix.sum(axis=0) - tp).astype(float)
    # fp + tn for a class is every sample that truly belongs to another class,
    # i.e. the grand total minus that class's row sum
    negatives = (matrix.sum() - matrix.sum(axis=1)).astype(float)
    has_negatives = negatives > 0

    if not has_negatives.any():
        return np.nan
    far = fp[has_negatives] / negatives[has_negatives]
    return far.mean()


def F1Score(y_true,y_pred):
    """Micro-averaged F1 score of ``y_pred`` against ``y_true``.

    Thin wrapper that fixes ``average="micro"`` so the metric can be handed to
    ``make_scorer`` without extra arguments.
    NOTE(review): for single-label multiclass targets the micro average
    generally coincides with accuracy - confirm this is the intended average.
    """
    micro_f1 = f1_score(y_true, y_pred, average="micro")
    return micro_f1

# Estimators compared on the full NSL-KDD feature set.
# NOTE(review): penalty='none' is the string spelling of "no regularisation";
# recent scikit-learn releases expect penalty=None instead - check the
# installed version before upgrading.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=64)
svm = LinearSVC(dual=False, random_state=34)
lr = LogisticRegression(solver='newton-cg', random_state=21, penalty='none', n_jobs=-1)

# Metrics collected for every fold. The false alarm rate is a "lower is better"
# quantity, hence greater_is_better=False.
cvScoring = {
    'accuracy': make_scorer(accuracy_score, greater_is_better=True),
    'matthews': make_scorer(matthews_corrcoef, greater_is_better=True),
    'f1': make_scorer(F1Score, greater_is_better=True),
    'detection_rate': make_scorer(detection_rate, greater_is_better=True),
    'false_alarm_rate': make_scorer(false_alarm_rate, greater_is_better=False),
}

# 10-fold cross-validation of each estimator, timed separately
start = time.time()
crossValidationRf = cross_validate(rf, nslkddTrainNormalized, nslkddTrainLabels, scoring=cvScoring, cv=10)
end = time.time()
print('Time taken for Cross validation RF: ', end-start)

start = time.time()
crossValidationSvm = cross_validate(svm, nslkddTrainNormalized, nslkddTrainLabels, scoring=cvScoring, cv=10)
end = time.time()
print('Time taken for Cross validation SVM: ', end-start)

start = time.time()
crossValidationLr = cross_validate(lr, nslkddTrainNormalized, nslkddTrainLabels, scoring=cvScoring, cv=10, n_jobs=-1)
end = time.time()
print('Time taken for Cross validation LR: ', end-start)


In [None]:
# Fold-averaged metrics of the random forest
accuracies = crossValidationRf['test_accuracy'].mean()
matthews = crossValidationRf['test_matthews'].mean()
f1 = crossValidationRf['test_f1'].mean()
# The false alarm rate scorer is built with greater_is_better=False, so
# cross_validate stores the negated rate; flip the sign back for reporting.
far = -crossValidationRf['test_false_alarm_rate'].mean()
dr = crossValidationRf['test_detection_rate'].mean()
print(accuracies, matthews, f1, far, dr)

In [None]:
# Fold-averaged metrics of the linear SVM
accuracies = crossValidationSvm['test_accuracy'].mean()
matthews = crossValidationSvm['test_matthews'].mean()
f1 = crossValidationSvm['test_f1'].mean()
# The false alarm rate scorer is built with greater_is_better=False, so
# cross_validate stores the negated rate; flip the sign back for reporting.
far = -crossValidationSvm['test_false_alarm_rate'].mean()
dr = crossValidationSvm['test_detection_rate'].mean()
print(accuracies, matthews, f1, far, dr)

In [None]:
# Fold-averaged metrics of the logistic regression
accuracies = crossValidationLr['test_accuracy'].mean()
matthews = crossValidationLr['test_matthews'].mean()
f1 = crossValidationLr['test_f1'].mean()
# The false alarm rate scorer is built with greater_is_better=False, so
# cross_validate stores the negated rate; flip the sign back for reporting.
far = -crossValidationLr['test_false_alarm_rate'].mean()
dr = crossValidationLr['test_detection_rate'].mean()
print(accuracies, matthews, f1, far, dr)

In [None]:
# Fresh estimators for the full UNSW-NB15 feature set.
# NOTE(review): penalty='none' is the string spelling of "no regularisation";
# recent scikit-learn releases expect penalty=None instead - check the
# installed version before upgrading.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=64)
svm = LinearSVC(dual=False, random_state=34)
lr = LogisticRegression(solver='newton-cg', random_state=21, penalty='none', n_jobs=-1)

# Metrics collected for every fold. The false alarm rate is a "lower is better"
# quantity, hence greater_is_better=False.
cvScoring = {
    'accuracy': make_scorer(accuracy_score, greater_is_better=True),
    'matthews': make_scorer(matthews_corrcoef, greater_is_better=True),
    'f1': make_scorer(F1Score, greater_is_better=True),
    'detection_rate': make_scorer(detection_rate, greater_is_better=True),
    'false_alarm_rate': make_scorer(false_alarm_rate, greater_is_better=False),
}

# 10-fold cross-validation of each estimator, timed separately
start = time.time()
crossValidationRf = cross_validate(rf, unswTrainNormalized, unswTrainLabels, scoring=cvScoring, cv=10)
end = time.time()
print('Time taken for Cross validation RF: ', end-start)

start = time.time()
crossValidationSvm = cross_validate(svm, unswTrainNormalized, unswTrainLabels, scoring=cvScoring, cv=10)
end = time.time()
print('Time taken for Cross validation SVM: ', end-start)

start = time.time()
crossValidationLr = cross_validate(lr, unswTrainNormalized, unswTrainLabels, scoring=cvScoring, cv=10, n_jobs=-1)
end = time.time()
print('Time taken for Cross validation LR: ', end-start)

In [None]:
# Fold-averaged metrics of the random forest
accuracies = crossValidationRf['test_accuracy'].mean()
matthews = crossValidationRf['test_matthews'].mean()
f1 = crossValidationRf['test_f1'].mean()
# The false alarm rate scorer is built with greater_is_better=False, so
# cross_validate stores the negated rate; flip the sign back for reporting.
far = -crossValidationRf['test_false_alarm_rate'].mean()
dr = crossValidationRf['test_detection_rate'].mean()
print(accuracies, matthews, f1, far, dr)

In [None]:
# Fold-averaged metrics of the linear SVM
accuracies = crossValidationSvm['test_accuracy'].mean()
matthews = crossValidationSvm['test_matthews'].mean()
f1 = crossValidationSvm['test_f1'].mean()
# The false alarm rate scorer is built with greater_is_better=False, so
# cross_validate stores the negated rate; flip the sign back for reporting.
far = -crossValidationSvm['test_false_alarm_rate'].mean()
dr = crossValidationSvm['test_detection_rate'].mean()
print(accuracies, matthews, f1, far, dr)

In [None]:
# Fold-averaged metrics of the logistic regression
accuracies = crossValidationLr['test_accuracy'].mean()
matthews = crossValidationLr['test_matthews'].mean()
f1 = crossValidationLr['test_f1'].mean()
# The false alarm rate scorer is built with greater_is_better=False, so
# cross_validate stores the negated rate; flip the sign back for reporting.
far = -crossValidationLr['test_false_alarm_rate'].mean()
dr = crossValidationLr['test_detection_rate'].mean()
print(accuracies, matthews, f1, far, dr)

## Classification with reduced feature sets from GA-LR

In [None]:
# Reduced feature sets, hard-coded as lists of column names.
# A*, B* and C* are used to select columns of nslkddTrainNormalized;
# D*, E* and F* are used to select columns of unswTrainNormalized.
# NOTE(review): the meaning of the letters and digits (presumably separate
# GA-LR configurations/runs) is not visible here - confirm against the
# feature selection notebook.
A1 = ['service', 'is_guest_login', 'src_bytes', 'dst_bytes', 'hot', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate']
A2 = ["service", "is_guest_login", "src_bytes", "dst_bytes", "hot", "num_file_creations", "count", "dst_host_srv_count", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_srv_serror_rate"]
A3 = ['service', 'is_guest_login', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'hot', 'num_file_creations', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate']

# B1 and B2 are defined but no reduced dataset is built from them in this cell
B1 = ['protocol_type', 'flag', 'is_guest_login', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'hot', 'num_access_files', 'count', 'srv_count', 'serror_rate', 'rerror_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_srv_rerror_rate']
B2 = ['flag', 'is_guest_login', 'dst_bytes', 'hot', 'count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate', 'dst_host_srv_rerror_rate']
B3 = ['protocol_type', 'service', 'flag', 'logged_in', 'is_guest_login', 'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'hot', 'num_compromised', 'num_access_files', 'count', 'srv_count', 'rerror_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate']

# C1 and C2 are defined but no reduced dataset is built from them in this cell
C1 = ['protocol_type', 'service', 'logged_in', 'is_guest_login', 'src_bytes', 'wrong_fragment', 'hot', 'num_compromised', 'num_access_files', 'count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate', 'dst_host_srv_rerror_rate']
C2 = ['protocol_type', 'flag', 'is_guest_login', 'src_bytes', 'wrong_fragment', 'hot', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'serror_rate', 'rerror_rate', 'same_srv_rate', 'dst_host_same_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate']
C3 = ['protocol_type', 'is_guest_login', 'src_bytes', 'wrong_fragment', 'hot', 'count', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate']

# D3 is left empty and no reduced dataset is built from it
D1 = ["proto", "service", "state", "is_sm_ips_ports", "dur", "dbytes", "dttl", "sload", "dload", "dloss", "sinpkt", "swin", "tcprtt", "ackdat","dmean", "response_body_len", "ct_srv_src", "ct_state_ttl", "ct_dst_ltm", "ct_dst_sport_ltm", "ct_dst_src_ltm", "ct_flw_http_mthd", "ct_src_ltm", "ct_srv_dst"]
D2 = ["proto", "service", "state", "is_sm_ips_ports", "rate", "dttl", "sload", "dloss", "sinpkt", "dinpkt", "stcpb", "dwin", "synack", "dmean", "response_body_len", "ct_srv_src", "ct_state_ttl", "ct_dst_sport_ltm", "ct_dst_src_ltm", "ct_flw_http_mthd", "ct_srv_dst"]
D3 = []

# E2 is left empty and no reduced dataset is built from it
E1 = ["service", "state", "spkts", "dttl", "dload", "sloss", "sinpkt", "dinpkt", "swin", "tcprtt", "smean", "response_body_len", "ct_state_ttl", "ct_dst_ltm", "ct_dst_sport_ltm", "ct_dst_src_ltm", "ct_flw_http_mthd", "ct_srv_dst"]
E2 = []
E3 = ["service", "state", "is_sm_ips_ports", "sbytes", "dttl", "sload", "sjit", "swin", "synack", "ackdat", "dmean", "response_body_len", "ct_srv_src", "ct_state_ttl", "ct_dst_ltm", "ct_dst_sport_ltm", "ct_dst_src_ltm", "ct_flw_http_mthd", "ct_src_ltm", "ct_srv_dst"]

F1 = ["proto", "service", "state", "spkts", "dpkts", "dbytes", "rate", "dttl", "sload", "swin", "dtcpb","synack", "smean", "dmean", "response_body_len", "ct_srv_src", "ct_state_ttl", "ct_dst_ltm","ct_dst_sport_ltm", "ct_dst_src_ltm", "ct_ftp_cmd", "ct_flw_http_mthd", "ct_srv_dst"]
F2 = ["service", "state", "is_sm_ips_ports", "spkts", "dttl", "sloss", "dloss", "swin", "dwin", "synack", "dmean","response_body_len", "ct_srv_src", "ct_state_ttl", "ct_dst_ltm", "ct_dst_sport_ltm","ct_dst_src_ltm", "ct_flw_http_mthd", "ct_srv_dst"]
F3 = ["proto", "service", "state", "dur", "dpkts", "dbytes", "sttl", "dttl", "dloss", "djit", "swin", "dwin", "tcprtt", "response_body_len", "ct_state_ttl", "ct_src_dport_ltm", "ct_dst_sport_ltm","ct_dst_src_ltm", "ct_flw_http_mthd", "ct_srv_dst"]

# NSL-KDD frames restricted to the columns of each reduced feature set
reducedDatasetNslA1 = nslkddTrainNormalized[A1]
reducedDatasetNslA2 = nslkddTrainNormalized[A2]
reducedDatasetNslA3 = nslkddTrainNormalized[A3]
reducedDatasetNslB3 = nslkddTrainNormalized[B3]
reducedDatasetNslC3 = nslkddTrainNormalized[C3]

# UNSW-NB15 frames restricted to the columns of each reduced feature set
reducedDatasetUnswD1 = unswTrainNormalized[D1]
reducedDatasetUnswD2 = unswTrainNormalized[D2]
reducedDatasetUnswE1 = unswTrainNormalized[E1]
reducedDatasetUnswE3 = unswTrainNormalized[E3]
reducedDatasetUnswF1 = unswTrainNormalized[F1]
reducedDatasetUnswF2 = unswTrainNormalized[F2]
reducedDatasetUnswF3 = unswTrainNormalized[F3]

# To evaluate another reduced dataset, change the data (and, for UNSW-NB15, the
# labels) passed to the three calls below and run the cell again.
# The estimators rf, svm and lr are the ones created by an earlier cell.
cvScoring = {
    'accuracy': make_scorer(accuracy_score, greater_is_better=True),
    'matthews': make_scorer(matthews_corrcoef, greater_is_better=True),
    'f1': make_scorer(F1Score, greater_is_better=True),
    'detection_rate': make_scorer(detection_rate, greater_is_better=True),
    'false_alarm_rate': make_scorer(false_alarm_rate, greater_is_better=False),
}

# 10-fold cross-validation of each estimator, timed separately
start = time.time()
crossValidationRf = cross_validate(rf, reducedDatasetNslA3, nslkddTrainLabels, scoring=cvScoring, cv=10)
end = time.time()
print("Time taken for RF: ", end-start)

start = time.time()
crossValidationSvm = cross_validate(svm, reducedDatasetNslA3, nslkddTrainLabels, scoring=cvScoring, cv=10)
end = time.time()
print("Time taken for SVM: ", end-start)

start = time.time()
crossValidationLr = cross_validate(lr, reducedDatasetNslA3, nslkddTrainLabels, scoring=cvScoring, cv=10)
end = time.time()
print("Time taken for LR: ", end-start)

In [None]:
# Fold-averaged metrics of the random forest
accuracies = crossValidationRf['test_accuracy'].mean()
matthews = crossValidationRf['test_matthews'].mean()
f1 = crossValidationRf['test_f1'].mean()
# The false alarm rate scorer is built with greater_is_better=False, so
# cross_validate stores the negated rate; flip the sign back for reporting.
far = -crossValidationRf['test_false_alarm_rate'].mean()
dr = crossValidationRf['test_detection_rate'].mean()
print(accuracies, matthews, f1, far, dr)

In [None]:
# Fold-averaged metrics of the linear SVM
accuracies = crossValidationSvm['test_accuracy'].mean()
matthews = crossValidationSvm['test_matthews'].mean()
f1 = crossValidationSvm['test_f1'].mean()
# The false alarm rate scorer is built with greater_is_better=False, so
# cross_validate stores the negated rate; flip the sign back for reporting.
far = -crossValidationSvm['test_false_alarm_rate'].mean()
dr = crossValidationSvm['test_detection_rate'].mean()
print(accuracies, matthews, f1, far, dr)

In [None]:
# Fold-averaged metrics of the logistic regression
accuracies = crossValidationLr['test_accuracy'].mean()
matthews = crossValidationLr['test_matthews'].mean()
f1 = crossValidationLr['test_f1'].mean()
# The false alarm rate scorer is built with greater_is_better=False, so
# cross_validate stores the negated rate; flip the sign back for reporting.
far = -crossValidationLr['test_false_alarm_rate'].mean()
dr = crossValidationLr['test_detection_rate'].mean()
print(accuracies, matthews, f1, far, dr)

# Import scores from Filter-based feature selection

In [None]:
# Placeholders, so every name exists even if one of the files cannot be read
NSLKDD10000Features = NSLKDD15000Features = NSLKDD20000Features = None
UNSWNB15_10000Features = UNSWNB15_15000Features = UNSWNB15_20000Features = None


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle.load can execute arbitrary code: only use it on files produced by
    # a trusted run of the feature selection step
    with open(path, 'rb') as f:
        return pickle.load(f)


# Feature selection results, one file per dataset and sample size
NSLKDD10000Features = _read_pickle('NSLKDD10000Features.pickle')
NSLKDD15000Features = _read_pickle('NSLKDD15000Features.pickle')
NSLKDD20000Features = _read_pickle('NSLKDD20000Features.pickle')

UNSWNB15_10000Features = _read_pickle('UNSWNB15_10000Features.pickle')
UNSWNB15_15000Features = _read_pickle('UNSWNB15_15000Features.pickle')
UNSWNB15_20000Features = _read_pickle('UNSWNB15_20000Features.pickle')

In [None]:
featureSetsKDD = [NSLKDD10000Features, NSLKDD15000Features, NSLKDD20000Features]
featureSetsUNSW = [UNSWNB15_10000Features, UNSWNB15_15000Features, UNSWNB15_20000Features]

rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=64)
svm = LinearSVC(dual=False, random_state=34)
lr = LogisticRegression(solver='newton-cg', random_state=21, penalty='none')

# Mean cross-validated accuracy per candidate feature set, one list per file
accuracies10000 = []
accuracies15000 = []
accuracies20000 = []

# Feature selection criterion to evaluate: 'MI', 'SU' or 'chi2'.
# A single name drives both the loop bound and the lookup; taking the bound
# from one criterion and the columns from another skips entries or raises an
# IndexError as soon as the two hold a different number of candidates.
selectionMethod = 'MI'

cvScoring = {
    'accuracy': make_scorer(accuracy_score, greater_is_better=True),
    'matthews': make_scorer(matthews_corrcoef, greater_is_better=True),
    'f1': make_scorer(F1Score, greater_is_better=True),
    'detection_rate': make_scorer(detection_rate, greater_is_better=True),
    'false_alarm_rate': make_scorer(false_alarm_rate, greater_is_better=False),
}

searchRuns = [
    ('Finding right amount of features for NSL-KDD 10000', accuracies10000),
    ('Finding right amount of features for NSL-KDD 15000', accuracies15000),
    ('Finding right amount of features for NSL-KDD 20000', accuracies20000),
]

for featureset, (banner, accuracyLog) in zip(featureSetsKDD, searchRuns):
    print(banner)

    # presumably entry j lists the j+1 best-ranked columns - verify against
    # the notebook that wrote the pickle files
    candidateSets = featureset[selectionMethod]

    start = time.time()
    # Entry 0 is skipped, as before
    for j in range(1, len(candidateSets)):
        print('Nb of features: ', j+1)
        reducedSet = nslkddTrainNormalized.loc[:, candidateSets[j]]
        crossValidationRf = cross_validate(rf, reducedSet, nslkddTrainLabels, scoring=cvScoring, cv=10)
        accuracyLog.append(crossValidationRf['test_accuracy'].mean())

    end = time.time()
    print('Time to compute: ', end-start)


In [None]:
# Pick the most promising feature set according to the accuracies collected by
# the search cell, and compute its remaining metrics. The file, criterion and
# index below have to be edited by hand to match that choice.
reducedSet = nslkddTrainNormalized.loc[:, NSLKDD20000Features['MI'][23]]

cvScoring = {
    'accuracy': make_scorer(accuracy_score, greater_is_better=True),
    'matthews': make_scorer(matthews_corrcoef, greater_is_better=True),
    'f1': make_scorer(F1Score, greater_is_better=True),
    'detection_rate': make_scorer(detection_rate, greater_is_better=True),
    'false_alarm_rate': make_scorer(false_alarm_rate, greater_is_better=False),
}
crossValidationRf = cross_validate(rf, reducedSet, nslkddTrainLabels, scoring=cvScoring, cv=10)

print(crossValidationRf['test_matthews'].mean())
f1 = crossValidationRf['test_f1'].mean()
# greater_is_better=False makes cross_validate store the negated false alarm
# rate; flip the sign back for reporting.
far = -crossValidationRf['test_false_alarm_rate'].mean()
dr = crossValidationRf['test_detection_rate'].mean()
print(f1,far,dr)