In [2]:
# Evaluate adaboost algorithm for classification
import pandas as pd
from numpy import mean
from numpy import std
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, balanced_accuracy_score, roc_auc_score, confusion_matrix

In [3]:
from joblib import dump, load
from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
dataset = pd.read_csv("/media/kmdr7/Seagate/TA/DATASETS/newDatasetSampledEncoded.csv")

In [4]:
dataset.shape

(2000000, 78)

In [5]:
X = dataset.drop(columns=["Label"])
y = dataset["Label"]

In [6]:
# cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)
# model = DecisionTreeClassifier()
# cross_val_score(model, X, y, scoring='precision', cv=cv, n_jobs=6)

In [7]:
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [8]:
lv = VarianceThreshold(threshold=0.003)
lv.fit_transform(X)
filtered_cols = X.drop(
    columns=X.columns[lv.get_support()].array
).columns.array
newds = X.drop(columns=filtered_cols)
newds["Label"] = dataset["Label"]
cor = newds.corr()
cor_target = abs(cor["Label"])
relevant_features = cor_target[cor_target > 0.3]
all_feature = set(X.columns)
relevant_features_idx = set(relevant_features.index)
removed_feature = list(all_feature - relevant_features_idx)
X.drop(columns=removed_feature, inplace=True)

In [9]:
cor_target[cor_target > 0.3]

Protocol                  0.367517
Flow Duration             0.336105
Fwd Packet Length Max     0.328274
Fwd Packet Length Mean    0.328239
Bwd Packet Length Max     0.302911
Bwd Packet Length Mean    0.327593
Fwd IAT Total             0.328912
Bwd IAT Total             0.322424
Packet Length Max         0.373814
Packet Length Mean        0.392885
Packet Length Std         0.327140
FIN Flag Count            0.434530
SYN Flag Count            0.985020
Average Packet Size       0.394235
Fwd Segment Size Avg      0.328239
Bwd Segment Size Avg      0.327593
FWD Init Win Bytes        0.576701
Fwd Seg Size Min          0.704290
Idle Mean                 0.355265
Idle Max                  0.360631
Idle Min                  0.342896
Label                     1.000000
Name: Label, dtype: float64

In [10]:
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 7.0, random_state=1
)

In [11]:
x_train.columns

Index(['Protocol', 'Flow Duration', 'Fwd Packet Length Max',
       'Fwd Packet Length Mean', 'Bwd Packet Length Max',
       'Bwd Packet Length Mean', 'Fwd IAT Total', 'Bwd IAT Total',
       'Packet Length Max', 'Packet Length Mean', 'Packet Length Std',
       'FIN Flag Count', 'SYN Flag Count', 'Average Packet Size',
       'Fwd Segment Size Avg', 'Bwd Segment Size Avg', 'FWD Init Win Bytes',
       'Fwd Seg Size Min', 'Idle Mean', 'Idle Max', 'Idle Min'],
      dtype='object')

In [12]:
clf = LogisticRegression(solver='newton-cg')
clf.fit(x_train, y_train)

LogisticRegression(solver='newton-cg')

In [13]:
x_test

Unnamed: 0,Protocol,Flow Duration,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Fwd IAT Total,Bwd IAT Total,Packet Length Max,Packet Length Mean,...,FIN Flag Count,SYN Flag Count,Average Packet Size,Fwd Segment Size Avg,Bwd Segment Size Avg,FWD Init Win Bytes,Fwd Seg Size Min,Idle Mean,Idle Max,Idle Min
1806398,1.000000,0.984720,0.040761,0.040761,0.000000,0.000000,0.984720,0.000000,0.020718,0.040761,...,0.0,0.000000,0.033967,0.040761,0.000000,0.000000,0.181818,0.328240,0.328252,0.328223
422849,0.352941,0.060089,0.000000,0.000000,0.000000,0.000000,0.060089,0.000000,0.000000,0.000000,...,0.0,0.571429,0.000000,0.000000,0.000000,0.445563,0.909091,0.000000,0.000000,0.000000
1772714,1.000000,0.984721,0.040761,0.040761,0.000000,0.000000,0.984721,0.000000,0.020718,0.040761,...,0.0,0.000000,0.033967,0.040761,0.000000,0.000000,0.181818,0.328240,0.328246,0.328233
828325,0.352941,0.060037,0.000000,0.000000,0.000000,0.000000,0.060037,0.000000,0.000000,0.000000,...,0.0,0.571429,0.000000,0.000000,0.000000,0.445563,0.909091,0.000000,0.000000,0.000000
659831,0.352941,0.059981,0.000000,0.000000,0.000000,0.000000,0.059981,0.000000,0.000000,0.000000,...,0.0,0.571429,0.000000,0.000000,0.000000,0.445563,0.909091,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1061856,0.352941,0.015646,0.351223,0.049932,0.489641,0.420700,0.015646,0.013514,0.489641,0.212738,...,0.0,0.142857,0.149290,0.049932,0.420700,0.066835,0.727273,0.000000,0.000000,0.000000
1577687,0.352941,0.995976,0.220109,0.077085,0.025552,0.001034,0.994172,0.994173,0.111878,0.039112,...,0.0,0.000000,0.026141,0.077085,0.001034,0.038895,0.727273,0.000000,0.000000,0.000000
1545258,1.000000,0.987139,0.040761,0.040761,0.020718,0.041096,0.984740,0.984589,0.020718,0.040761,...,0.0,0.000000,0.031056,0.040761,0.041096,0.000000,0.181818,0.326596,0.328246,0.325694
1444245,0.352941,0.000227,0.983696,0.351029,0.063191,0.062671,0.000227,0.000124,0.500000,0.258152,...,0.5,0.142857,0.191224,0.351029,0.062671,0.089113,0.727273,0.000000,0.000000,0.000000


In [14]:
y_pred = clf.predict(x_test)

In [15]:
accuracy_score(y_test, y_pred)

0.9980015049962375

In [16]:
confusion_matrix(y_test, y_pred).ravel()

array([142863,     93,    478, 142281])

In [17]:
recall_score(y_test, y_pred)

0.9966516997177061

In [18]:
f1_score(y_test, y_pred)

0.9979974257627143

In [19]:
precision_score(y_test, y_pred)

0.9993467908466434

In [20]:
roc_auc_score(y_test, y_pred)

0.9980005749490906

In [22]:
dump(clf, "/media/kmdr7/Seagate/TA/MODELS/LogisticRegression.joblib")

['/media/kmdr7/Seagate/TA/MODELS/LogisticRegression.joblib']

In [23]:
print()




In [None]:
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=6, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
