In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics

## Dataset lowvariance + correlation filter

In [2]:
lc_dataset = pd.read_csv("/media/kmdr7/Seagate/DATASETS/IOT-23/Final Dataset/lowvariance_correlationfilter_dataset.csv")
lc_clean_dataset = lc_dataset.drop(columns=["Label"])
lc_label = lc_dataset["Label"]
lc_dataset

Unnamed: 0,Protocol,Flow Duration,Flow IAT Max,Fwd IAT Total,Fwd IAT Std,Fwd IAT Max,Bwd IAT Std,Bwd IAT Max,Subflow Fwd Packets,Fwd Seg Size Min,Idle Mean,Idle Std,Idle Min,Label
0,0.352941,0.014050,0.001390,0.013998,0.000385,0.001396,0.000382,0.001558,0.0,1.0,0.999995,0.000000,9.999956e-01,Malware
1,0.352941,0.279085,0.219288,0.278991,0.053902,0.219290,0.000045,0.000415,0.0,1.0,0.238094,0.816493,2.092964e-09,Malware
2,0.352941,0.003321,0.005216,0.003323,0.008035,0.005222,0.000000,0.000000,0.0,1.0,0.999995,0.000000,9.999956e-01,Malware
3,0.352941,0.012501,0.004975,0.012324,0.001179,0.004982,0.001108,0.005559,0.0,1.0,0.999995,0.000000,9.999956e-01,Malware
4,0.352941,0.280068,0.219285,0.279974,0.054308,0.219287,0.000062,0.000412,0.0,1.0,0.238094,0.816493,2.299829e-09,Malware
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,0.000000,0.763603,0.208353,0.763603,0.084022,0.208356,0.000000,0.000000,1.0,0.0,0.000000,0.500000,0.000000e+00,Benign
269,0.000000,0.729049,0.515448,0.729050,0.374050,0.515450,0.000000,0.000000,1.0,0.0,0.085714,0.632455,8.187013e-11,Benign
270,0.000000,0.886008,0.556653,0.886009,0.558948,0.556654,0.000000,0.000000,1.0,0.0,0.142857,0.707107,1.499154e-08,Benign
271,0.000000,0.500698,0.789751,0.500699,0.705603,0.789751,0.000000,0.000000,1.0,0.0,0.428571,1.000000,3.550135e-08,Benign


## Dataset PCA

In [3]:
pca_dataset = pd.read_csv("/media/kmdr7/Seagate/DATASETS/IOT-23/Merged/04. Feature Selection/pca_dataset.csv")
pca_clean_dataset = pca_dataset.drop(columns=["Label"])
pca_label = pca_dataset["Label"]
pca_dataset

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Label
0,0.147808,-0.514771,0.286697,-0.137553,0.913494,0.855905,0.273484,-0.479864,0.440577,Malware
1,-1.022914,-0.885687,0.002199,-0.213661,0.404288,-0.050455,-0.015976,-0.223345,0.395839,Malware
2,-0.906647,-1.049279,-0.090859,-0.119511,0.527799,0.251826,0.079525,-0.229274,0.820235,Malware
3,-0.961825,-1.072027,-0.092086,-0.123172,0.547419,0.261675,0.097636,-0.246404,0.766521,Malware
4,-0.932441,-1.071461,-0.095981,-0.121096,0.543819,0.264623,0.091478,-0.242332,0.799116,Malware
...,...,...,...,...,...,...,...,...,...,...
17909,-1.272618,-0.516196,0.256069,-0.001591,0.236656,-0.174635,-0.357812,-0.020789,-0.113106,Benign
17910,-1.272900,-0.516462,0.256063,-0.000838,0.237219,-0.173486,-0.357441,-0.020440,-0.111875,Benign
17911,-1.213357,-0.449642,0.418320,-0.699023,-0.120048,-0.258811,-0.109428,-0.053669,0.016398,Benign
17912,-1.272611,-0.516280,0.256100,-0.001601,0.236615,-0.174460,-0.357927,-0.020759,-0.113045,Benign


## Logistic Regression [lowvariance + correlation filter]

In [4]:
## Bagi dataset menjadi train dan test
x_train, x_test, y_train, y_test = train_test_split(
    lc_clean_dataset, lc_label, test_size=1/7.0, random_state=0
)

In [5]:
lr_classifier = LogisticRegression(random_state=0)
lr_classifier.fit(x_train, y_train)
scores = cross_val_score(estimator=lr_classifier, X=lc_clean_dataset, y=lc_label, cv=10, n_jobs=4)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std()), end="\n\n" )

mean: 0.986 (std: 0.033)



In [6]:
y_pred = lr_classifier.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 1.0


## Logistic Regression [pca]

In [7]:
## Bagi dataset menjadi train dan test
x_train, x_test, y_train, y_test = train_test_split(
    pca_clean_dataset, pca_label, test_size=1/7.0, random_state=0
)

In [8]:
lr_classifier = LogisticRegression(random_state=0)
lr_classifier.fit(x_train, y_train)
scores = cross_val_score(estimator=lr_classifier, X=pca_clean_dataset, y=pca_label, cv=10, n_jobs=4)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std()), end="\n\n" )

mean: 0.944 (std: 0.033)



In [9]:
y_pred = lr_classifier.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.96015625
