# ETL

## Extract

The `extract` helper loads the CSV, JSON and Parquet files found in the data folder and combines them into a single DataFrame.

In [1]:
from Extract import extract

# Directory that holds the raw input files.
folder = "data/"

# Read the raw files under `folder` through the project's extract() helper.
data = extract(folder=folder)

loaded ids_0.csv of shape : (1001, 79)
loaded ids_1.csv of shape : (1001, 79)
loaded ids_2.csv of shape : (1001, 79)
loaded ids_3.json of shape : (1001, 79)
loaded ids_4.json of shape : (1001, 79)
loaded ids_5.parquet of shape : (15001, 79)
loaded ids_6.parquet of shape : (5001, 79)
loaded ids_7.json of shape : (9000, 79)
loaded ids_8.parquet of shape : (10293, 79)
loaded ids_9.json of shape : (10293, 79)


Optionally save the combined data to `completeData.csv` so the extraction step does not have to be repeated (the code below is currently commented out).

In [2]:
# Optional cache: save the extracted frame to disk once, then reload it in later
# sessions instead of repeating the extraction step. Both lines are left disabled.
# Note: the reload line uses `pd`, which is not imported at this point in the notebook.
#data.to_csv('completeData.csv', index=False) 
#data = pd.read_csv('completeData.csv')

## Transform

In [3]:
# Quick look at the combined frame: its (rows, columns) shape and the first five rows.
frame_shape = data.shape
first_rows = data.head()
frame_shape, first_rows

((54593, 79),
     Destination Port   Flow Duration   Total Fwd Packets  \
 0              36102              23                   2   
 1                 53           30805                   2   
 2                443         5535509                   8   
 3                 53           49235                   4   
 4                 53             181                   2   
 
     Total Backward Packets  Total Length of Fwd Packets  \
 0                        1                           31   
 1                        2                          108   
 2                        8                          372   
 3                        2                          152   
 4                        2                           70   
 
     Total Length of Bwd Packets   Fwd Packet Length Max  \
 0                             6                      31   
 1                           230                      54   
 2                          3876                     191   
 3              

In [4]:
# Show each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54593 entries, 0 to 54592
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0    Destination Port             54593 non-null  int64  
 1    Flow Duration                54593 non-null  int64  
 2    Total Fwd Packets            54593 non-null  int64  
 3    Total Backward Packets       54593 non-null  int64  
 4   Total Length of Fwd Packets   54593 non-null  int64  
 5    Total Length of Bwd Packets  54593 non-null  int64  
 6    Fwd Packet Length Max        54593 non-null  int64  
 7    Fwd Packet Length Min        54593 non-null  int64  
 8    Fwd Packet Length Mean       54593 non-null  float64
 9    Fwd Packet Length Std        54593 non-null  float64
 10  Bwd Packet Length Max         54593 non-null  int64  
 11   Bwd Packet Length Min        54593 non-null  int64  
 12   Bwd Packet Length Mean       54593 non-null  float64
 13   

In [5]:
# Total number of missing cells across the whole frame.
missing_per_column = data.isna().sum()
missing_per_column.sum()

np.int64(150)

In [6]:
# Drop every row that has at least one missing value, then confirm none are left.
data = data.dropna(axis="index", how="any")
remaining_missing = data.isna().sum()
remaining_missing.sum()

np.int64(0)

In [7]:
# Give the label column a descriptive name: " Label" (note the leading space) becomes "attack".
label_rename = {" Label": "attack"}
data = data.rename(columns=label_rename)

# Display the last column name to confirm the rename.
data.columns[-1]

'attack'

In [8]:
# List the distinct label values present in the data.
attack_labels = data['attack']
attack_labels.unique()

array(['BENIGN', 'DoS Hulk', 'DoS GoldenEye'], dtype=object)

In [9]:
# Collapse all DoS variants into one binary target: 1 when the label text
# contains "dos" (case-insensitive), 0 otherwise.
label_text = data['attack'].astype(str).str.lower()
data['attack'] = label_text.str.contains("dos", regex=False).astype("int64")
data['attack'].tail()

54588    1
54589    1
54590    1
54591    1
54592    1
Name: attack, dtype: int64

In [10]:
# Variance of every column in the frame.
column_variances = data.var()

# Keep only the columns whose variance is non-zero.
filtered_data = data.loc[:, column_variances.ne(0)].copy()

# Print position, name, variance and mean of every retained column.
for position, column_name in enumerate(filtered_data.columns):
    kept_column = filtered_data[column_name]
    print(f"{position:<2} {column_name:<30} var : {kept_column.var():>25.5f} mean : {kept_column.mean():>20.5f}")

0   Destination Port              var :            41914517.35844 mean :            935.35778
1   Flow Duration                 var :    1830912789519646.50000 mean :       40320983.68206
2   Total Fwd Packets             var :                 795.60425 mean :              5.80492
3   Total Backward Packets        var :                1983.40707 mean :              4.48395
4  Total Length of Fwd Packets    var :             1064269.43731 mean :            368.40567
5   Total Length of Bwd Packets   var :          5878887504.61399 mean :           7608.74043
6   Fwd Packet Length Max         var :               92521.46433 mean :            265.69136
7   Fwd Packet Length Min         var :                 226.87373 mean :              2.11589
8   Fwd Packet Length Mean        var :                6115.42847 mean :             52.74535
9   Fwd Packet Length Std         var :               15095.76915 mean :            107.49852
10 Bwd Packet Length Max          var :            10902763.

In [11]:
# Show the zero-variance columns that were filtered out, to check that nothing
# worth reconsidering was discarded.
removed_data = data.loc[:, column_variances.eq(0)]
removed_data

Unnamed: 0,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,CWE Flag Count,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
54588,0,0,0,0,0,0,0,0,0,0
54589,0,0,0,0,0,0,0,0,0,0
54590,0,0,0,0,0,0,0,0,0,0
54591,0,0,0,0,0,0,0,0,0,0


In [12]:
# Summary statistics per column, to check whether the data is already standardized (mean = 0, std = 1).
filtered_data.describe()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,attack
count,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0,...,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0,54475.0
mean,935.357779,40320980.0,5.80492,4.483947,368.405672,7608.74,265.691363,2.115888,52.745352,107.498522,...,29.047306,86197.42,13772.69,100824.5,77057.09,37432010.0,463173.0,37768270.0,37096370.0,0.908123
std,6474.14221,42789170.0,28.206458,44.535459,1031.634353,76673.9,304.173412,15.062328,78.201205,122.864841,...,5.217411,580596.1,233265.2,678594.4,549253.3,43059430.0,5066017.0,43342490.0,43078490.0,0.288855
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,147203.5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,80.0,11809120.0,6.0,5.0,340.0,11595.0,329.0,0.0,47.571429,126.019707,...,32.0,6.0,0.0,6.0,6.0,6801986.0,0.0,6801986.0,6759334.0,1.0
75%,80.0,85821390.0,8.0,6.0,396.0,11595.0,374.0,0.0,66.5,156.624152,...,32.0,1040.0,0.0,1040.0,1033.0,85300000.0,0.0,85400000.0,85300000.0,1.0
max,63849.0,119999000.0,5117.0,8350.0,88632.0,14500000.0,11595.0,1983.0,2319.0,5185.44164,...,48.0,13700000.0,6905038.0,19200000.0,13700000.0,120000000.0,60700000.0,120000000.0,120000000.0,1.0


In [None]:
import pandas as pd

# Drop the two flow-rate columns from the feature frame.
# NOTE(review): the name `inf_cols` suggests these columns contain infinite values — confirm.
# The leading space in ' Flow Packets/s' is intentional; other column names in this frame carry one too.
inf_cols = [' Flow Packets/s', 'Flow Bytes/s']
filtered_data = filtered_data.drop(labels=inf_cols, axis="columns")
filtered_data

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,attack
0,36102,23,2,1,31,6,31,0,15.500000,21.920310,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
1,53,30805,2,2,108,230,54,54,54.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,443,5535509,8,8,372,3876,191,0,46.500000,71.876879,...,20,346981.0,0.0,346981,346981,5188524.0,0.0,5188524,5188524,0
3,53,49235,4,2,152,202,38,38,38.000000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
4,53,181,2,2,70,218,35,35,35.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54588,80,11512204,8,5,326,11632,326,0,40.750000,115.258405,...,32,892.0,0.0,892,892,6507197.0,0.0,6507197,6507197,1
54589,80,11513325,5,5,471,3525,471,0,94.200000,210.637604,...,32,918.0,0.0,918,918,6508582.0,0.0,6508582,6508582,1
54590,80,11509201,7,6,314,11632,314,0,44.857143,118.680845,...,32,899.0,0.0,899,899,6503248.0,0.0,6503248,6503248,1
54591,80,11509095,8,5,369,11632,369,0,46.125000,130.461201,...,32,914.0,0.0,914,914,6504954.0,0.0,6504954,6504954,1


In [14]:
# List the remaining column names; note that several of them start with a space.
filtered_data.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', ' Flow IAT Mean', ' Flow IAT Std',
       ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean',
       ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total',
       ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
       'Fwd PSH Flags', ' Fwd Header Length', ' Bwd Header Length',
       'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length',
       ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std',
       ' Packet Length Variance', 'FIN Flag Count', ' SYN Flag Count',
       ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Coun

In [15]:
# Remove samples that contain infinite values.
# The earlier replace(["inf"], "nan") only matched the literal *string* "inf" and
# substituted the *string* "nan", so it changed nothing on numeric columns and
# dropna() had nothing to drop. Replace real +/-inf floats with a real NaN instead.
positive_inf = float("inf")
df = filtered_data.replace([positive_inf, -positive_inf], float("nan"))

# Rows that held an infinite value now hold NaN and are dropped here.
df = df.dropna()

# Confirm that no missing values remain.
df.isna().sum().sum()

np.int64(0)

In [16]:
# destination port converted to categories
from sklearn.preprocessing import LabelEncoder

# Map each distinct destination port to an integer code.
# NOTE(review): the encoder is fit on the full frame, before the train/test split below.
encoder = LabelEncoder()
port_column = df[' Destination Port']
encoder.fit(port_column)
df[' Destination Port'] = encoder.transform(port_column)

In [17]:
# x holds the feature columns: every column except the 'attack' label.
x = df.drop(columns='attack')

In [18]:
# y holds only the binary label (0 or 1), where 1 marks a sample labelled as an attack.
y = df.loc[:, 'attack'].copy()

In [19]:
from sklearn.model_selection import train_test_split
# 70/30 split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [20]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same scaling to both splits.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Summary statistics of the scaled training features; expect mean ~ 0 and std ~ 1.
train_summary = pd.DataFrame(X_train).describe()
train_summary

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
count,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0,...,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0,38132.0
mean,4.472104e-18,-8.273392000000001e-17,-4.844779e-18,-8.198857e-18,1.490701e-18,7.453505999999999e-19,-7.565309e-17,1.2670960000000001e-17,2.2546860000000002e-17,-8.944207e-18,...,-4.099428e-18,-2.724256e-16,-1.621138e-17,-1.2670960000000001e-17,2.608727e-18,1.3788990000000001e-17,-8.21749e-17,1.8633770000000002e-17,-1.155293e-17,-5.981439e-17
std,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,...,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013
min,-0.1723232,-0.9448165,-0.1512844,-0.08716127,-0.3331995,-0.08658796,-0.8604339,-0.1385699,-0.6618441,-0.8603899,...,-0.05225008,-5.580236,-0.1489493,-0.05959565,-0.1486489,-0.1404612,-0.8708927,-0.09221932,-0.8730392,-0.8625685
25%,-0.118355,-0.9413399,-0.1200244,-0.08716127,-0.3331995,-0.08658796,-0.8604339,-0.1385699,-0.6618441,-0.8603899,...,-0.05225008,0.5642415,-0.1489493,-0.05959565,-0.1486489,-0.1404612,-0.8708927,-0.09221932,-0.8730392,-0.8625685
50%,-0.118355,-0.6683573,0.005015433,0.008218268,-0.02803391,0.0431169,0.2028614,-0.1385699,-0.06622808,0.1469908,...,-0.02003978,0.5642415,-0.148939,-0.05959565,-0.1486402,-0.1404504,-0.7126335,-0.09221932,-0.7158321,-0.7050774
75%,-0.118355,1.062092,0.06753536,0.02729418,0.02222867,0.0431169,0.348297,-0.1385699,0.1708123,0.3926971,...,0.01217051,0.5642415,-0.1471727,-0.05959565,-0.1471395,-0.1385917,1.109256,-0.09221932,1.098562,1.116685
max,12.92844,1.86134,159.7747,159.1967,79.21809,162.1144,36.61346,130.2651,28.34043,40.56512,...,161.9011,3.63648,23.23187,28.60383,27.68902,24.62948,1.91805,11.79286,1.89736,1.925113


In [23]:
# Summary statistics of the scaled test features. The scaler was fit on the
# training split, so mean and std here are only approximately 0 and 1.
test_summary = pd.DataFrame(X_test).describe()
test_summary

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
count,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0,...,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0,16343.0
mean,0.004595,-0.00639,-0.003609,-0.00542,-0.008461,-0.004915,-0.005819,0.001909,-0.007303,-0.005348,...,-0.005264,-0.009066,-0.006141,-0.00808,-0.008218,-0.0038,-0.003094,-0.0051,-0.003653,-0.002649
std,1.026192,1.002042,0.508093,0.269153,0.724216,0.34464,0.942331,0.968007,0.924659,0.937056,...,0.228429,1.006028,0.969191,0.889943,0.945171,0.976682,1.002502,0.972821,1.002105,1.00247
min,-0.172323,-0.944816,-0.151284,-0.087161,-0.3332,-0.086588,-0.860434,-0.13857,-0.661844,-0.86039,...,-0.05225,-5.580236,-0.148949,-0.059596,-0.148649,-0.140461,-0.870893,-0.092219,-0.873039,-0.862569
25%,-0.118355,-0.941517,-0.120024,-0.087161,-0.3332,-0.086588,-0.860434,-0.13857,-0.661844,-0.86039,...,-0.05225,-1.739938,-0.148949,-0.059596,-0.148649,-0.140461,-0.870893,-0.092219,-0.873039,-0.862569
50%,-0.118355,-0.66979,0.005015,0.008218,-0.028034,0.043117,0.202861,-0.13857,-0.066898,0.146162,...,-0.02004,0.564241,-0.148939,-0.059596,-0.14864,-0.14045,-0.71367,-0.092219,-0.716862,-0.70664
75%,-0.118355,1.062271,0.067535,0.027294,0.021331,0.043117,0.351529,-0.13857,0.16715,0.390286,...,-0.02004,0.564241,-0.147184,-0.059596,-0.147149,-0.138599,1.111581,-0.092219,1.098562,1.119008
max,12.914953,1.858674,59.117611,28.774687,41.255489,39.946127,27.218321,95.083106,23.506721,27.148556,...,17.437941,2.868421,18.62397,27.573338,22.034493,19.747812,1.91805,11.792856,1.89736,1.925113


In [24]:
# Pairwise correlation of every retained column (label included), colour-coded from -1 to 1.
correlations = pd.DataFrame(filtered_data).corr()
correlations.style.background_gradient(cmap='PRGn', vmin=-1, vmax=1)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,attack
Destination Port,1.0,-0.106448,-0.016138,-0.009253,0.079789,-0.012437,0.082552,0.053986,0.150601,0.088189,-0.149435,-0.01171,-0.159799,-0.145269,-0.05641,-0.082127,-0.100051,-0.033474,-0.100519,-0.068084,-0.081616,-0.099373,-0.036886,-0.036536,-0.036309,-0.030177,-0.030445,-0.006219,0.348569,-0.025438,-0.015682,0.038189,0.199585,0.01793,-0.133253,-0.137524,-0.137254,-0.106951,-0.037695,0.348569,-0.000326,-0.088676,0.119137,0.466047,-0.000326,0.055531,-0.134923,0.150601,-0.159799,-0.025438,-0.016138,0.079789,-0.009253,-0.012448,-0.068831,0.169803,-0.00306,-0.079979,-0.017713,-0.005825,-0.017121,-0.017189,-0.098345,-0.008447,-0.098388,-0.097587,-0.415373
Flow Duration,-0.106448,1.0,0.054965,0.046384,0.096871,0.049202,0.22609,-0.122551,0.146729,0.226769,0.345293,-0.150817,0.389194,0.275648,0.412171,0.852133,0.982626,0.177666,0.998802,0.577491,0.863248,0.982543,0.248615,0.402092,0.32045,0.39693,0.408405,0.034532,-0.059462,0.067807,0.067086,-0.278838,-0.05494,-0.161338,0.350877,0.433108,0.350416,0.230524,0.372044,-0.059462,-0.003638,-0.457319,0.241922,-0.10666,-0.003638,0.114139,0.42705,0.146729,0.389194,0.067807,0.054965,0.096871,0.046384,0.049236,-0.462161,-0.022639,0.034167,-0.388463,-0.043121,0.033067,-0.018596,-0.05415,0.975594,0.15485,0.982222,0.962073,0.208825
Total Fwd Packets,-0.016138,0.054965,1.0,0.980671,0.555802,0.987946,0.072699,-0.015322,0.02821,0.046537,0.081797,-0.01442,0.081862,0.061191,-0.032076,0.020137,0.024064,-0.034404,0.051871,-0.028226,0.030026,0.023693,-0.037835,0.055736,0.011601,0.01035,0.014795,-0.003001,0.061608,0.98472,0.979701,-0.039125,-0.008734,-0.02259,0.082155,0.081553,0.067284,0.046393,-0.006773,0.061608,0.001245,0.051493,-0.033516,-0.018128,0.001245,-0.004031,0.07776,0.02821,0.081862,0.98472,1.0,0.555802,0.980671,0.987972,0.042897,0.00463,0.971577,-0.026753,0.050443,0.050295,0.066826,0.038713,0.019824,0.010599,0.020662,0.018801,-0.028093
Total Backward Packets,-0.009253,0.046384,0.980671,1.0,0.546601,0.996089,0.055311,-0.007785,0.021122,0.03587,0.054844,0.000167,0.054584,0.033826,-0.022321,0.019625,0.020942,-0.025677,0.04471,-0.015456,0.029183,0.02065,-0.02657,0.061671,0.017803,0.021659,0.026404,-0.002341,0.066732,0.939263,0.997975,-0.029721,-0.003937,-0.011876,0.055117,0.066628,0.044463,0.026405,0.01367,0.066732,0.000531,0.028951,-0.031564,-0.009247,0.000531,0.036131,0.064027,0.021122,0.054584,0.939263,0.980671,0.546601,1.0,0.996093,0.022525,0.004691,0.990361,-0.034776,0.018328,0.023549,0.028793,0.01371,0.018719,0.004348,0.018948,0.018279,-0.033022
Total Length of Fwd Packets,0.079789,0.096871,0.555802,0.546601,1.0,0.525851,0.546494,0.058952,0.594986,0.507964,0.139248,-0.037884,0.13843,0.121164,-0.071687,0.064213,0.045135,-0.09106,0.088467,-0.049264,0.064948,0.044283,-0.072722,0.155937,0.10229,0.090829,0.091517,0.00396,0.03772,0.54955,0.560682,-0.103861,-0.018963,-0.033276,0.170176,0.190179,0.154499,0.094396,0.000282,0.03772,0.015714,0.16943,-0.137785,-0.036539,0.015714,0.021283,0.184974,0.594986,0.13843,0.54955,0.555802,1.0,0.546601,0.525968,0.15365,0.021577,0.553471,-0.02611,0.215722,0.208428,0.264171,0.169686,0.039368,0.026208,0.041969,0.03666,-0.088654
Total Length of Bwd Packets,-0.012437,0.049202,0.987946,0.996089,0.525851,1.0,0.048755,-0.013593,0.012761,0.030474,0.074546,-0.006823,0.075325,0.053853,-0.021347,0.024931,0.028535,-0.025309,0.047409,-0.01449,0.036442,0.028259,-0.02783,0.052177,0.014129,0.019587,0.023674,-0.004519,0.062847,0.955306,0.993706,-0.02936,-0.005612,-0.017991,0.07433,0.083493,0.063723,0.048329,0.012111,0.062847,-0.000167,0.025989,-0.024627,-0.011353,-0.000167,0.027239,0.080874,0.012761,0.075325,0.955306,0.987946,0.525851,0.996089,1.0,0.021104,6.8e-05,0.984798,-0.030044,0.015137,0.015769,0.021296,0.012162,0.025972,0.003956,0.026114,0.0256,-0.012615
Fwd Packet Length Max,0.082552,0.22609,0.072699,0.055311,0.546494,0.048755,1.0,-0.068975,0.847931,0.964356,0.428002,-0.124533,0.445521,0.422277,-0.142334,0.254426,0.18144,-0.222742,0.205228,-0.057264,0.24906,0.178848,-0.152598,0.249157,0.207503,0.208767,0.196015,-0.02832,-0.012531,0.099977,0.079037,-0.254463,-0.047509,-0.125464,0.480973,0.507987,0.494549,0.375004,0.04188,-0.012531,0.010076,0.357199,-0.302513,-0.091248,0.010076,0.086679,0.506786,0.847931,0.445521,0.099977,0.072699,0.546494,0.055311,0.048794,0.337064,0.006834,0.038373,-0.030865,0.080831,0.049271,0.092585,0.073195,0.175504,0.026372,0.176569,0.172967,0.016446
Fwd Packet Length Min,0.053986,-0.122551,-0.015322,-0.007785,0.058952,-0.013593,-0.068975,1.0,0.10873,-0.118462,-0.155902,0.500156,-0.156379,-0.155492,-0.057246,-0.089451,-0.115258,-0.035112,-0.11571,-0.063769,-0.101535,-0.114336,-0.025451,-0.053846,-0.011629,-0.056538,-0.04685,0.088085,0.063854,-0.024216,-0.014187,-0.033531,0.03852,0.687949,-0.153789,-0.138354,-0.157774,-0.125443,-0.03378,0.063854,-0.000602,-0.097723,-0.114091,0.01737,-0.000602,0.168785,-0.129212,0.10873,-0.156379,-0.024216,-0.015322,0.058952,-0.007785,-0.013604,-0.094648,-0.01098,0.000831,-0.071132,-0.018468,-0.007454,-0.01856,-0.017426,-0.113646,-0.010368,-0.113715,-0.112737,-0.41719
Fwd Packet Length Mean,0.150601,0.146729,0.02821,0.021122,0.594986,0.012761,0.847931,0.10873,1.0,0.907073,0.20457,-0.00912,0.20705,0.194818,-0.059692,0.247878,0.124432,-0.171885,0.135252,0.045363,0.131744,0.123272,-0.008012,0.200083,0.191847,0.186308,0.178534,0.006773,0.039252,0.042853,0.034566,-0.191107,-0.028832,0.015099,0.259572,0.322243,0.270979,0.179968,0.053037,0.039252,0.013359,0.239912,-0.251784,-0.064041,0.013359,0.167899,0.327689,1.0,0.20705,0.042853,0.02821,0.594986,0.021122,0.012775,0.227096,-0.004182,0.021414,-0.031852,0.147293,0.114671,0.160027,0.122386,0.120588,0.016137,0.121616,0.118958,-0.077153
Fwd Packet Length Std,0.088189,0.226769,0.046537,0.03587,0.507964,0.030474,0.964356,-0.118462,0.907073,1.0,0.401718,-0.147645,0.414974,0.394438,-0.105388,0.309551,0.194991,-0.223146,0.206938,0.009154,0.238748,0.192574,-0.084785,0.243854,0.215163,0.222481,0.209065,-0.031156,0.001478,0.070104,0.057667,-0.252673,-0.049482,-0.160441,0.454333,0.50495,0.474957,0.351446,0.066204,0.001478,0.011012,0.355671,-0.305997,-0.093824,0.011012,0.139629,0.507492,0.907073,0.414974,0.070104,0.046537,0.507964,0.03587,0.030502,0.339575,-0.002822,0.021686,-0.024122,0.088724,0.050207,0.095757,0.080406,0.189908,0.020317,0.190427,0.187956,0.061913


In [27]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay

In [28]:
import numpy as np
# Class balance of the test split: the distinct labels and how many samples each has.
test_labels, test_counts = np.unique(y_test, return_counts=True)
test_labels, test_counts

(array([0, 1]), array([ 1502, 14841]))

In [29]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with default settings, trained on the scaled features.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out test split.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print("model accuracy : ", accuracy)

model accuracy :  0.9967570213547085


In [30]:
from xgboost import XGBClassifier

# XGBoost classifier with 150 estimators, trained on the scaled features.
xg_model = XGBClassifier(n_estimators=150)
xg_model.fit(X_train, y_train, verbose=True)

# Predict labels for the held-out test split.
xg_pred = xg_model.predict(X_test)

# Display the fitted estimator and its parameters.
xg_model

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [31]:
# Show the labels XGBoost predicted for the test split.
xg_pred

array([1, 1, 1, ..., 1, 1, 1], shape=(16343,))

In [32]:
# Score the XGBoost predictions against the held-out labels.
acc_xgboost1 = accuracy_score(y_test, xg_pred)
f1score_xgboost1 = f1_score(y_test, xg_pred)
precision_xgboost1 = precision_score(y_test, xg_pred)
recall_xgboost1 = recall_score(y_test, xg_pred)

# Take the feature count from the data the model was evaluated on instead of a
# hard-coded number, so the header stays correct if the retained columns change.
n_features = X_test.shape[1]

# Display the measures.
print('-'*25)
print('XGBoost Model: {} features'.format(n_features))
print('-'*25)
print('Accuracy: {:.3f}'.format(acc_xgboost1))
print('F1-score: {:.3f}'.format(f1score_xgboost1))
print('Precision: {:.3f}'.format(precision_xgboost1))
print('Recall: {:.3f}'.format(recall_xgboost1))

-------------------------
XGBoost Model: 66 features
-------------------------
Accuracy: 1.000
F1-score: 1.000
Precision: 1.000
Recall: 1.000


In [33]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the XGBoost predictions on the test split.
conf_matrix = confusion_matrix(y_test, xg_pred)

rule = "-"*18
print("Confusion Matrix:")
print(rule)
print(conf_matrix)
print(rule)
# Legend for the 2x2 layout printed above.
print("tn,fp")
print("fn,tp")

Confusion Matrix:
------------------
[[ 1500     2]
 [    0 14841]]
------------------
tn,fp
fn,tp
