In [2]:
#import necessary libraries
import warnings
import os
warnings.filterwarnings('ignore')
import time

#libraries for data analysis
import pandas as pd
from datetime import date, timedelta
import math
import numpy as np
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler

#libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#libraries for model prediction
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score,f1_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
# Display configuration:
# - show every row and every column when a DataFrame is rendered (no truncation)
# - print numpy arrays with 4 digits of precision
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
np.set_printoptions(precision=4)

## Dataset
Experiments are done with the NF-CSE-CIC-IDS2018-v2.csv dataset:
* Netflow format computer network dataset
* Target encoder is used on the features (similar to label encoding, except labels are correlated)
* LabelEncoder is used on the 'Attack' column (6 attack types): it is fitted on X_train and then used for both train and test
* y = (["Attack", "Label"])
* Data is split into train and test, then graphs are created separately

In [3]:
# Read the train and test splits from their parquet files in the working directory.
train_path = "train_embedded.parquet"
test_path = "test_embedded.parquet"

df_train = pd.read_parquet(train_path)
df_test = pd.read_parquet(test_path)

In [4]:
# Columns that hold targets and therefore must not be part of the feature matrix.
target_columns = ["Label", "Attack"]

# Features: every column except the target columns.
train_samples = df_train.drop(columns=target_columns)
test_samples = df_test.drop(columns=target_columns)

# Target: only the "Label" column is kept; "Attack" is dropped and not used here.
train_labels = df_train["Label"]
test_labels = df_test["Label"]

In [5]:
# Sanity check: display the shape of the training label vector.
train_labels.shape

(2641554,)

In [6]:
# Expose features and labels under the X_*/Y_* names that the model cells use.
X_train, Y_train = train_samples, train_labels
X_test, Y_test = test_samples, test_labels

# One metrics record (a dict) is appended here per trained model; the list is
# wrapped in pd.DataFrame(...) whenever the results table is displayed.
# Record keys: dataset, model, acc, f1_macro, precision, recall, training_time.
df_results = []

In [7]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
#
# The inputs are the untouched `train_samples` / `test_samples` frames (which
# X_train / X_test alias before this cell runs) rather than X_train / X_test
# themselves, so re-running this cell never re-fits on already-scaled data.
X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(train_samples)
X_test = X_scaler.transform(test_samples)

In [8]:
# LGBMClassifier: binary objective, DART boosting, 50 rounds of 10-leaf trees.
# The number of rounds is passed as `n_estimators` (the scikit-learn API name)
# instead of the `num_iteration` alias, so the estimator's own parameter
# matches the number of rounds that are actually trained.
model = LGBMClassifier(objective='binary',
                       boosting_type='dart',
                       num_leaves=10,
                       learning_rate=0.1,
                       n_estimators=50,
                       random_state=42)

# Only fit() is timed; prediction on the test split is not part of training_time.
start = time.time()
model.fit(X_train, Y_train)
stop = time.time()
Y_predicted = model.predict(X_test)

# precision/recall are called with scikit-learn's default averaging
# (binary, i.e. scored for the positive class); F1 is macro-averaged.
df_results.append({
    'dataset': 'NF-CSE-CIC-IDS2018-v2',
    'model': 'LGBMClassifier',
    'acc': accuracy_score(Y_test, Y_predicted),
    'f1_macro': f1_score(Y_test, Y_predicted, average='macro'),
    'precision': precision_score(Y_test, Y_predicted),
    'recall': recall_score(Y_test, Y_predicted),
    'training_time': stop - start,
})

[LightGBM] [Info] Number of positive: 316140, number of negative: 2325414
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152934 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 65276
[LightGBM] [Info] Number of data points in the train set: 2641554, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119680 -> initscore=-1.995468
[LightGBM] [Info] Start training from score -1.995468


In [9]:
# CatBoostClassifier: 300 iterations of depth-3 trees, trained silently (verbose=0).
model = CatBoostClassifier(iterations=300, learning_rate=0.1, depth=3, verbose=0)

# Time the fit() call only; prediction happens after the clock is stopped.
start = time.time()
model.fit(X_train, Y_train)
stop = time.time()

Y_predicted = model.predict(X_test)

# Append this model's test-set metrics and training time to the shared results list.
df_results.append(dict(
    dataset='NF-CSE-CIC-IDS2018-v2',
    model='CatBoostClassifier',
    acc=accuracy_score(Y_test, Y_predicted),
    f1_macro=f1_score(Y_test, Y_predicted, average='macro'),
    precision=precision_score(Y_test, Y_predicted),
    recall=recall_score(Y_test, Y_predicted),
    training_time=stop - start,
))

In [10]:
# Show the metrics collected so far as a table (one row per appended record).
pd.DataFrame(df_results)

Unnamed: 0,dataset,model,acc,f1_macro,precision,recall,training_time
0,NF-CSE-CIC-IDS2018-v2,LGBMClassifier,0.994086,0.985677,0.998591,0.951929,9.854602
1,NF-CSE-CIC-IDS2018-v2,CatBoostClassifier,0.994784,0.987396,0.998953,0.957421,63.425196


In [11]:
# XGBClassifier: 50 estimators of depth-3 trees with the binary logistic objective.
xgb = XGBClassifier(
    objective='binary:logistic',
    n_estimators=50,
    max_depth=3,
    learning_rate=0.1,
)

# Time the fit() call only; prediction happens after the clock is stopped.
start = time.time()
xgb.fit(X_train, Y_train)
stop = time.time()

Y_predicted = xgb.predict(X_test)

# Append this model's test-set metrics and training time to the shared results list.
df_results.append(dict(
    dataset='NF-CSE-CIC-IDS2018-v2',
    model='XGBClassifier',
    acc=accuracy_score(Y_test, Y_predicted),
    f1_macro=f1_score(Y_test, Y_predicted, average='macro'),
    precision=precision_score(Y_test, Y_predicted),
    recall=recall_score(Y_test, Y_predicted),
    training_time=stop - start,
))

In [12]:
# Show the metrics collected so far as a table (one row per appended record).
pd.DataFrame(df_results)

Unnamed: 0,dataset,model,acc,f1_macro,precision,recall,training_time
0,NF-CSE-CIC-IDS2018-v2,LGBMClassifier,0.994086,0.985677,0.998591,0.951929,9.854602
1,NF-CSE-CIC-IDS2018-v2,CatBoostClassifier,0.994784,0.987396,0.998953,0.957421,63.425196
2,NF-CSE-CIC-IDS2018-v2,XGBClassifier,0.993631,0.984554,0.998175,0.948519,23.325624


In [13]:
# HistGradientBoostingClassifier (scikit-learn's histogram-based gradient boosting):
# up to 50 boosting iterations of depth-3 trees.
# random_state is fixed so repeated runs produce the same model; the value 42
# matches the seed used for the LGBMClassifier cell.
gbc = HistGradientBoostingClassifier(max_iter=50, learning_rate=0.1, max_depth=3,
                                     random_state=42)

# Only fit() is timed; prediction on the test split is not part of training_time.
start = time.time()
gbc.fit(X_train, Y_train)
stop = time.time()
Y_predicted = gbc.predict(X_test)

# precision/recall are called with scikit-learn's default averaging
# (binary, i.e. scored for the positive class); F1 is macro-averaged.
df_results.append({
    'dataset': 'NF-CSE-CIC-IDS2018-v2',
    'model': 'HistGradientBoostingClassifier',
    'acc': accuracy_score(Y_test, Y_predicted),
    'f1_macro': f1_score(Y_test, Y_predicted, average='macro'),
    'precision': precision_score(Y_test, Y_predicted),
    'recall': recall_score(Y_test, Y_predicted),
    'training_time': stop - start,
})

In [14]:
# Show the metrics collected so far as a table (one row per appended record).
pd.DataFrame(df_results)

Unnamed: 0,dataset,model,acc,f1_macro,precision,recall,training_time
0,NF-CSE-CIC-IDS2018-v2,LGBMClassifier,0.994086,0.985677,0.998591,0.951929,9.854602
1,NF-CSE-CIC-IDS2018-v2,CatBoostClassifier,0.994784,0.987396,0.998953,0.957421,63.425196
2,NF-CSE-CIC-IDS2018-v2,XGBClassifier,0.993631,0.984554,0.998175,0.948519,23.325624
3,NF-CSE-CIC-IDS2018-v2,HistGradientBoostingClassifier,0.994026,0.98553,0.998467,0.951546,28.232386


In [15]:
# RandomForestClassifier: shallow (depth-3) trees with minimum split/leaf sizes,
# trained on all available cores (n_jobs=-1) with progress logging (verbose=1).
# random_state is fixed because the forest is built from random bootstrap
# samples and random feature subsets; without a seed every run yields
# different metrics. The value 42 matches the LGBMClassifier cell.
rf = RandomForestClassifier(verbose=1, max_depth=3, min_samples_split=200,
                            min_samples_leaf=10, n_jobs=-1, random_state=42)

# Only fit() is timed; prediction on the test split is not part of training_time.
start = time.time()
rf.fit(X_train, Y_train)
stop = time.time()
Y_predicted = rf.predict(X_test)

# precision/recall are called with scikit-learn's default averaging
# (binary, i.e. scored for the positive class); F1 is macro-averaged.
df_results.append({
    'dataset': 'NF-CSE-CIC-IDS2018-v2',
    'model': 'RFC',
    'acc': accuracy_score(Y_test, Y_predicted),
    'f1_macro': f1_score(Y_test, Y_predicted, average='macro'),
    'precision': precision_score(Y_test, Y_predicted),
    'recall': recall_score(Y_test, Y_predicted),
    'training_time': stop - start,
})

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.1min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.8s finished


In [16]:
# Show the metrics collected so far as a table (one row per appended record).
pd.DataFrame(df_results)

Unnamed: 0,dataset,model,acc,f1_macro,precision,recall,training_time
0,NF-CSE-CIC-IDS2018-v2,LGBMClassifier,0.994086,0.985677,0.998591,0.951929,9.854602
1,NF-CSE-CIC-IDS2018-v2,CatBoostClassifier,0.994784,0.987396,0.998953,0.957421,63.425196
2,NF-CSE-CIC-IDS2018-v2,XGBClassifier,0.993631,0.984554,0.998175,0.948519,23.325624
3,NF-CSE-CIC-IDS2018-v2,HistGradientBoostingClassifier,0.994026,0.98553,0.998467,0.951546,28.232386
4,NF-CSE-CIC-IDS2018-v2,RFC,0.986131,0.965369,0.998959,0.885038,63.5354


In [17]:
# ExtraTreesClassifier: shallow (depth-3) extremely randomized trees with minimum
# split/leaf sizes, trained on all available cores (n_jobs=-1) with progress
# logging (verbose=1).
# random_state is fixed because split selection is randomized; without a seed
# every run yields different metrics. The value 42 matches the LGBMClassifier cell.
etc = ExtraTreesClassifier(verbose=1, min_samples_split=800, min_samples_leaf=20,
                           max_depth=3, n_jobs=-1, random_state=42)

# Only fit() is timed; prediction on the test split is not part of training_time.
start = time.time()
etc.fit(X_train, Y_train)
stop = time.time()
Y_predicted = etc.predict(X_test)

# precision/recall are called with scikit-learn's default averaging
# (binary, i.e. scored for the positive class); F1 is macro-averaged.
df_results.append({
    'dataset': 'NF-CSE-CIC-IDS2018-v2',
    'model': 'ETC',
    'acc': accuracy_score(Y_test, Y_predicted),
    'f1_macro': f1_score(Y_test, Y_predicted, average='macro'),
    'precision': precision_score(Y_test, Y_predicted),
    'recall': recall_score(Y_test, Y_predicted),
    'training_time': stop - start,
})

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   22.0s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.8s finished


In [18]:
# Show the final metrics table (one row per appended record).
pd.DataFrame(df_results)

Unnamed: 0,dataset,model,acc,f1_macro,precision,recall,training_time
0,NF-CSE-CIC-IDS2018-v2,LGBMClassifier,0.994086,0.985677,0.998591,0.951929,9.854602
1,NF-CSE-CIC-IDS2018-v2,CatBoostClassifier,0.994784,0.987396,0.998953,0.957421,63.425196
2,NF-CSE-CIC-IDS2018-v2,XGBClassifier,0.993631,0.984554,0.998175,0.948519,23.325624
3,NF-CSE-CIC-IDS2018-v2,HistGradientBoostingClassifier,0.994026,0.98553,0.998467,0.951546,28.232386
4,NF-CSE-CIC-IDS2018-v2,RFC,0.986131,0.965369,0.998959,0.885038,63.5354
5,NF-CSE-CIC-IDS2018-v2,ETC,0.986096,0.965285,0.998626,0.885038,22.251003
