# ETL

## Extract

In [1]:
import pandas as pd
import numpy as np

script to extract data from csv, json and parquet files

In [2]:
#import os
#import json
#import pyarrow as pa
#from pathlib import Path
# this script lets you loop through the data folder to read each csv and pass in information into a dataframe named data

#folder = "data/"
#dataframes = []
#for file in os.listdir(folder):
#    path = os.path.join(folder,file)
#    
#    if file.endswith('.csv'):
#        df = pd.read_csv(path)
#    elif file.endswith('parquet'):
#        df = pd.read_parquet(path)
#    else:
#        df = pd.read_json(path, lines=True)
#    
#    print(f"loaded {file} of shape : {df.shape}")
#    dataframes.append(df)
#    
#stores all dataframes into one dataframe no repeat index,
#data = pd.concat(dataframes, ignore_index=True) 


saving data into completeData.csv to prevent repeat extraction

In [3]:
# saving the df as a file so we dont need to repeatedly perform the extraction step
#data.to_csv('completeData.csv', index=False) 
data = pd.read_csv('completeData.csv')

## Transform

In [4]:
#displaying the shape of our data as well as the first rows
data.shape , data.head()

((54593, 79),
     Destination Port   Flow Duration   Total Fwd Packets  \
 0              36102              23                   2   
 1                 53           30805                   2   
 2                443         5535509                   8   
 3                 53           49235                   4   
 4                 53             181                   2   
 
     Total Backward Packets  Total Length of Fwd Packets  \
 0                        1                           31   
 1                        2                          108   
 2                        8                          372   
 3                        2                          152   
 4                        2                           70   
 
     Total Length of Bwd Packets   Fwd Packet Length Max  \
 0                             6                      31   
 1                           230                      54   
 2                          3876                     191   
 3              

In [5]:
# display datas type and non empty values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54593 entries, 0 to 54592
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0    Destination Port             54593 non-null  int64  
 1    Flow Duration                54593 non-null  int64  
 2    Total Fwd Packets            54593 non-null  int64  
 3    Total Backward Packets       54593 non-null  int64  
 4   Total Length of Fwd Packets   54593 non-null  int64  
 5    Total Length of Bwd Packets  54593 non-null  int64  
 6    Fwd Packet Length Max        54593 non-null  int64  
 7    Fwd Packet Length Min        54593 non-null  int64  
 8    Fwd Packet Length Mean       54593 non-null  float64
 9    Fwd Packet Length Std        54593 non-null  float64
 10  Bwd Packet Length Max         54593 non-null  int64  
 11   Bwd Packet Length Min        54593 non-null  int64  
 12   Bwd Packet Length Mean       54593 non-null  float64
 13   

In [6]:
# display our datas empty values
data.isna().sum()

 Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
 Label                         0
Length: 79, dtype: int64

In [7]:
#import seaborn as sns
#import matplotlib as plt
#sns.pairplot(data, kind="scatter")

#using sns pairplot we can visualize data corelation and decide what to 

In [8]:

# we also want to make clear what our labels are so we will assign attack as the new name
data = data.rename(columns={" Label": "attack"})
data.columns[-1]

'attack'

the dataset includes different kinds of attacks

- Benign
- Dos GoldenEye
- Dos Hulk
- Dos Slowhttptest
- Dos slowloris
- Heartbleed

we know Heartbleed is not a denial of service attack and can be removed from our attack data

In [9]:
# viewing the different type of attacks
data['attack'].unique()

array(['BENIGN', 'DoS Hulk', 'DoS GoldenEye'], dtype=object)

In [10]:
# we apply 1 for labels which contain dos as part of the label
data['attack'] = data['attack'].apply(lambda x: 1 if "dos" in str(x).lower() else 0 )
data['attack'].tail()

54588    1
54589    1
54590    1
54591    1
54592    1
Name: attack, dtype: int64

In [11]:
# showing the values in our data which have missing values
data[data.isna().any(axis=1)]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,attack
5245,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
5365,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
5744,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
5797,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
5867,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32829,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
33339,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
33415,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
33692,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1


In [12]:
data.max()

 Destination Port                  63849.0
 Flow Duration                 119998998.0
 Total Fwd Packets                  5117.0
 Total Backward Packets             8350.0
Total Length of Fwd Packets        88632.0
                                  ...     
Idle Mean                      120000000.0
 Idle Std                       60700000.0
 Idle Max                      120000000.0
 Idle Min                      120000000.0
attack                                 1.0
Length: 79, dtype: float64

In [13]:
# 
column_variances = data.var()

filtered_data = data.loc[:, column_variances != 0]
filtered_data = filtered_data.dropna(axis=1)

for x ,col in enumerate(filtered_data.columns):
    print(x," ",col," : var ",filtered_data[col].var()," : mean " ,filtered_data[col].mean())


0    Destination Port  : var  41825497.60296314  : mean  933.5089663509974
1    Flow Duration  : var  1830461805440085.5  : mean  40233831.92130859
2    Total Fwd Packets  : var  793.9157848016172  : mean  5.796695547048157
3    Total Backward Packets  : var  1979.1633218286488  : mean  4.474254941109666
4   Total Length of Fwd Packets  : var  1062261.7598755558  : mean  367.60938215522134
5    Total Length of Bwd Packets  : var  5866305219.182368  : mean  7592.294524939095
6    Fwd Packet Length Max  : var  92473.73401956048  : mean  265.11708460791675
7    Fwd Packet Length Min  : var  226.39299789179387  : mean  2.1113146374077263
8    Fwd Packet Length Mean  : var  6108.210461470073  : mean  52.63134568792843
9    Fwd Packet Length Std  : var  15088.063840332297  : mean  107.26616948427852
10   Bwd Packet Length Max  : var  10909845.074229602  : mean  3761.430513069441
11    Bwd Packet Length Min  : var  736.5414894958412  : mean  4.855494294140274
12    Bwd Packet Length Mean  : v

In [14]:
# displaying some information about the data which has been removed to check if there is any information we should reconsider
removed_data = data.loc[:, column_variances == 0 ]
removed_data

Unnamed: 0,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,CWE Flag Count,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
54588,0,0,0,0,0,0,0,0,0,0
54589,0,0,0,0,0,0,0,0,0,0
54590,0,0,0,0,0,0,0,0,0,0
54591,0,0,0,0,0,0,0,0,0,0


In [15]:
# viewing if our data is standardized (mean = 0, std =1 )
filtered_data.describe()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,attack
count,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0,...,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0,54593.0
mean,933.508966,40233830.0,5.796696,4.474255,367.609382,7592.295,265.117085,2.111315,52.631346,107.266169,...,29.053688,86011.11,13742.92,100606.6,76890.53,37351110.0,462171.9,37686630.0,37016180.0,0.908322
std,6467.263533,42783900.0,28.176511,44.487788,1030.660836,76591.81,304.094942,15.046362,78.155041,122.83348,...,5.213573,579982.1,233013.9,677876.8,548671.0,43047980.0,5060585.0,43331130.0,43066380.0,0.288574
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,145482.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,80.0,11803760.0,6.0,5.0,340.0,11595.0,329.0,0.0,47.5,125.933018,...,32.0,6.0,0.0,6.0,6.0,6792433.0,0.0,6792784.0,6745733.0,1.0
75%,80.0,85814240.0,8.0,6.0,395.0,11595.0,374.0,0.0,66.4,156.553505,...,32.0,1038.0,0.0,1038.0,1031.0,85200000.0,0.0,85300000.0,85200000.0,1.0
max,63849.0,119999000.0,5117.0,8350.0,88632.0,14500000.0,11595.0,1983.0,2319.0,5185.44164,...,48.0,13700000.0,6905038.0,19200000.0,13700000.0,120000000.0,60700000.0,120000000.0,120000000.0,1.0


In [22]:
# x will contain the information outside of the labels for each sample
x = filtered_data.loc[:,filtered_data.columns != 'attack']
x

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,36102,23,2,1,31,6,31,0,15.500000,21.920310,...,0,32,0.0,0.0,0,0,0.0,0.0,0,0
1,53,30805,2,2,108,230,54,54,54.000000,0.000000,...,1,20,0.0,0.0,0,0,0.0,0.0,0,0
2,443,5535509,8,8,372,3876,191,0,46.500000,71.876879,...,7,20,346981.0,0.0,346981,346981,5188524.0,0.0,5188524,5188524
3,53,49235,4,2,152,202,38,38,38.000000,0.000000,...,3,32,0.0,0.0,0,0,0.0,0.0,0,0
4,53,181,2,2,70,218,35,35,35.000000,0.000000,...,1,20,0.0,0.0,0,0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54588,80,11512204,8,5,326,11632,326,0,40.750000,115.258405,...,1,32,892.0,0.0,892,892,6507197.0,0.0,6507197,6507197
54589,80,11513325,5,5,471,3525,471,0,94.200000,210.637604,...,1,32,918.0,0.0,918,918,6508582.0,0.0,6508582,6508582
54590,80,11509201,7,6,314,11632,314,0,44.857143,118.680845,...,1,32,899.0,0.0,899,899,6503248.0,0.0,6503248,6503248
54591,80,11509095,8,5,369,11632,369,0,46.125000,130.461201,...,1,32,914.0,0.0,914,914,6504954.0,0.0,6504954,6504954


In [17]:
col_std_data = filtered_data.loc[:,filtered_data.columns.str.contains("Std")]

col_std_data

Unnamed: 0,Fwd Packet Length Std,Bwd Packet Length Std,Flow IAT Std,Fwd IAT Std,Bwd IAT Std,Packet Length Std,Active Std,Idle Std
0,21.920310,0.000000,1.202082e+01,0.000000e+00,0.000000e+00,16.350331,0.0,0.0
1,0.000000,0.000000,1.778008e+04,0.000000e+00,0.000000e+00,33.411076,0.0,0.0
2,71.876879,674.522053,1.334058e+06,1.940029e+06,5.592656e+04,503.409858,0.0,0.0
3,0.000000,0.000000,1.370579e+04,1.623855e+04,0.000000e+00,30.740852,0.0,0.0
4,0.000000,0.000000,9.843949e+01,0.000000e+00,0.000000e+00,40.531469,0.0,0.0
...,...,...,...,...,...,...,...,...
54588,115.258405,4437.056321,2.262616e+06,2.460477e+06,3.378874e+06,2713.257986,0.0,0.0
54589,210.637604,990.644740,2.565153e+06,3.255639e+06,3.379306e+06,719.780257,0.0,0.0
54590,118.680845,2544.918912,2.261842e+06,2.655701e+06,3.195689e+06,1857.187221,0.0,0.0
54591,130.461201,4437.056321,2.262122e+06,2.458891e+06,3.377879e+06,2712.638403,0.0,0.0


In [18]:
np.unique(col_std_data,return_counts=True)

(array([0.00000000e+00, 5.00000000e-01, 5.77350269e-01, ...,
        8.27000000e+07, 8.29000000e+07, 8.33000000e+07], shape=(76133,)),
 array([208926,     11,     10, ...,      2,      1,      1],
       shape=(76133,)))

In [None]:
for i in filtered_data[' Active Std']:
    print(i)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
94295.17009
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1066385.039
0.0
0.0
0.0
6486086.013
0.0
1218384.589
0.0
0.0
0.0
0.0
0.0
0.0
32620.731
0.0
0.0
0.0
0.0
0.0
4108719.251
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
33410.57621
0.0
0.0
37365.82665
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
20679.78026
0.0
0.0
62863.91417
0.0
0.0
899124.456
0.0
0.0
0.0
0.0
711489.9835
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
11964.82889
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
161367.5383
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
119637.8508
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0

In [20]:
# y will hold only the label values which are stored as true false or 0,1 where the label is considered an attack
y = filtered_data['attack'].copy()
y

0        0
1        0
2        0
3        0
4        0
        ..
54588    1
54589    1
54590    1
54591    1
54592    1
Name: attack, Length: 54593, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=.3,random_state=42,stratify=y)

In [25]:
from sklearn.preprocessing import StandardScaler
#creating a standard scaler to standardize the filtered data
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
# displaying if x train is standardized (mean = 0, std = 1)
pd.DataFrame(X_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
count,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0,...,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0,38215.0
mean,1.0412240000000001e-17,3.421166e-17,-1.115598e-18,-7.437318e-19,1.63621e-17,1.859329e-18,7.511691e-17,2.826181e-17,-9.296647e-18,6.32172e-17,...,-3.718659e-19,-2.115917e-16,-4.090525e-18,0.0,2.6030610000000002e-18,0.0,-1.740332e-16,-9.296647e-18,-1.264344e-17,5.4292420000000005e-17
std,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,...,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013
min,-0.1439839,-0.9386471,-0.1508634,-0.08689194,-0.3326649,-0.08631688,-0.8570591,-0.1383242,-0.6615212,-0.8576214,...,-0.05212674,-5.578625,-0.1478583,-0.057629,-0.1479825,-0.140173,-0.8655176,-0.09258336,-0.867548,-0.8571453
25%,-0.1315545,-0.9352448,-0.1195697,-0.08689194,-0.3326649,-0.08631688,-0.8570591,-0.1383242,-0.6615212,-0.8576214,...,-0.05212674,0.5645217,-0.1478583,-0.057629,-0.1479825,-0.140173,-0.8655176,-0.09258336,-0.867548,-0.8571453
50%,-0.1315545,-0.662973,0.005605277,0.008590002,-0.02616215,0.04352738,0.2058798,-0.1383242,-0.06418446,0.1488015,...,-0.01988135,0.5645217,-0.147848,-0.057629,-0.1479737,-0.140162,-0.7078379,-0.09258336,-0.7109448,-0.7006311
75%,-0.1315545,1.066774,0.06819276,0.02768639,0.02522213,0.04352738,0.3544974,-0.1383242,0.1746457,0.3964186,...,0.01236404,0.5645217,-0.1460629,-0.057629,-0.1464407,-0.138294,1.114439,-0.09258336,1.101044,1.122082
max,9.776129,1.865817,159.9479,159.368,79.56719,162.289,36.60427,130.4052,28.42455,40.56101,...,162.0777,3.636095,23.41175,30.733965,28.2069,24.669839,1.923153,11.63873,1.901866,1.930498


In [28]:
# displayed if x test is standardized (mean = 0, std = 1)
pd.DataFrame(X_test).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
count,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0,...,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0,16378.0
mean,0.003513,0.00549,-0.002523,-0.004833,-0.004243,-0.00432,-0.001714,0.002222,-0.004641,-0.002782,...,-0.005034,-0.003639,0.000177,0.012183,0.001983,-0.003095,0.008273,-0.010869,0.007337,0.00918
std,1.015934,0.999626,0.508169,0.269137,0.737683,0.344635,0.940312,0.968044,0.932566,0.935617,...,0.228496,1.002882,0.99125,1.12495,1.00366,0.978553,1.001272,0.924744,1.000033,1.001462
min,-0.143984,-0.938647,-0.150863,-0.086892,-0.332665,-0.086317,-0.857059,-0.138324,-0.661521,-0.857621,...,-0.052127,-5.578625,-0.147858,-0.057629,-0.147983,-0.140173,-0.865518,-0.092583,-0.867548,-0.857145
25%,-0.131554,-0.935268,-0.11957,-0.086892,-0.332665,-0.086317,-0.857059,-0.138324,-0.661521,-0.857621,...,-0.052127,0.564522,-0.147858,-0.057629,-0.147983,-0.140173,-0.865518,-0.092583,-0.867548,-0.857145
50%,-0.131554,-0.662517,0.005605,0.00859,-0.027064,0.043527,0.20588,-0.138324,-0.06732,0.146142,...,-0.019881,0.564522,-0.14785,-0.057629,-0.147975,-0.140164,-0.707278,-0.092583,-0.710401,-0.699947
75%,-0.131554,1.067663,0.068193,0.027686,0.021616,0.043527,0.351267,-0.138324,0.164194,0.384432,...,-0.019881,0.564522,-0.146096,-0.057629,-0.146469,-0.138324,1.116763,-0.092583,1.103352,1.124405
max,9.720352,1.863153,59.182069,28.805944,41.43825,39.98942,27.212283,95.185464,23.576868,27.146669,...,17.45712,2.868202,18.768613,29.62696,22.447313,19.780274,1.923153,11.638733,1.901866,1.930498


In [29]:
(pd.DataFrame(filtered_data).corr().style.background_gradient(cmap='PRGn',vmin=-1,vmax=1))

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,attack
Destination Port,1.0,-0.106075,-0.016099,-0.009224,0.079878,-0.012409,0.082732,0.054024,0.150717,0.088364,-0.148897,-0.011658,-0.159192,-0.144758,-0.056265,-0.081868,-0.099711,-0.033398,-0.100177,-0.067898,-0.081353,-0.099038,-0.036802,-0.036391,-0.036168,-0.030047,-0.030318,-0.006205,0.348582,-0.025382,-0.015636,0.038269,0.199597,0.017982,-0.132733,-0.136925,-0.136699,-0.106595,-0.037573,0.348582,-0.000325,-0.08843,0.118716,0.466065,-0.000325,0.055675,-0.134328,0.150717,-0.159192,-0.025382,-0.016099,0.079878,-0.009224,-0.012419,-0.0686,0.169834,-0.003043,-0.080111,-0.017669,-0.005808,-0.017078,-0.017149,-0.098015,-0.008421,-0.098057,-0.097261,-0.415411
Flow Duration,-0.106075,1.0,0.055185,0.046544,0.097492,0.049356,0.227464,-0.122145,0.147888,0.228143,0.346797,-0.150303,0.390676,0.277271,0.412612,0.852314,0.982654,0.178002,0.998802,0.577888,0.863418,0.98257,0.248927,0.402571,0.321011,0.397348,0.408794,0.034596,-0.059245,0.068126,0.067341,-0.277942,-0.054769,-0.160798,0.352377,0.434514,0.351969,0.231944,0.372438,-0.059245,-0.003625,-0.455227,0.239374,-0.106322,-0.003625,0.115168,0.428474,0.147888,0.390676,0.068126,0.055185,0.097492,0.046544,0.049391,-0.460094,-0.022352,0.034256,-0.389107,-0.042777,0.033156,-0.018276,-0.053812,0.975631,0.154886,0.982248,0.962132,0.207955
Total Fwd Packets,-0.016099,0.055185,1.0,0.98067,0.555818,0.987945,0.072892,-0.01528,0.028392,0.046753,0.082013,-0.014367,0.082085,0.061432,-0.031935,0.020339,0.024301,-0.034326,0.052085,-0.028051,0.030225,0.023929,-0.03775,0.055859,0.011733,0.010474,0.014915,-0.002987,0.06163,0.984717,0.979702,-0.039034,-0.008717,-0.022535,0.082371,0.081779,0.067527,0.046615,-0.006655,0.06163,0.001246,0.051667,-0.033772,-0.018093,0.001246,-0.003861,0.077993,0.028392,0.082085,0.984717,1.0,0.555818,0.98067,0.987971,0.043074,0.004668,0.971571,-0.026908,0.050484,0.050311,0.066866,0.038752,0.020061,0.010626,0.020899,0.019037,-0.028182
Total Backward Packets,-0.009224,0.046544,0.98067,1.0,0.546598,0.996089,0.055454,-0.007754,0.021258,0.036031,0.055015,0.000206,0.054761,0.034022,-0.022216,0.019773,0.021117,-0.025619,0.044864,-0.015327,0.029327,0.020823,-0.026506,0.061758,0.017899,0.021748,0.02649,-0.00233,0.066748,0.939258,0.997972,-0.029653,-0.003924,-0.011835,0.055288,0.066787,0.044654,0.026579,0.013755,0.066748,0.000532,0.029086,-0.031748,-0.009222,0.000532,0.036244,0.064191,0.021258,0.054761,0.939258,0.98067,0.546598,1.0,0.996093,0.022662,0.004719,0.990359,-0.034887,0.01836,0.023561,0.028824,0.01374,0.018892,0.004368,0.019122,0.018452,-0.033087
Total Length of Fwd Packets,0.079878,0.097492,0.555818,0.546598,1.0,0.52585,0.546642,0.059051,0.595132,0.508149,0.139913,-0.037739,0.139129,0.121841,-0.071308,0.064733,0.045773,-0.090844,0.089066,-0.048804,0.065474,0.044917,-0.072489,0.156243,0.10261,0.091131,0.091812,0.003996,0.037776,0.549599,0.560711,-0.103609,-0.018915,-0.033126,0.170796,0.190799,0.155161,0.094998,0.000591,0.037776,0.015715,0.169855,-0.138393,-0.036445,0.015715,0.02172,0.185604,0.595132,0.139129,0.549599,0.555818,1.0,0.546598,0.525966,0.154079,0.021674,0.553439,-0.026534,0.215801,0.208445,0.264243,0.169767,0.040001,0.026274,0.042601,0.03729,-0.088877
Total Length of Bwd Packets,-0.012409,0.049356,0.987945,0.996089,0.52585,1.0,0.048901,-0.013562,0.012899,0.030636,0.074685,-0.006784,0.075465,0.054018,-0.021244,0.025074,0.0287,-0.025253,0.047558,-0.014364,0.036579,0.028424,-0.027767,0.052265,0.014225,0.019675,0.023759,-0.004509,0.062863,0.9553,0.993703,-0.029294,-0.005599,-0.01795,0.074469,0.083619,0.06388,0.04848,0.012194,0.062863,-0.000166,0.026124,-0.024816,-0.011328,-0.000166,0.027353,0.081005,0.012899,0.075465,0.9553,0.987945,0.52585,0.996089,1.0,0.02124,9.6e-05,0.984797,-0.030154,0.015169,0.015782,0.021327,0.012192,0.026137,0.003976,0.026279,0.025764,-0.012682
Fwd Packet Length Max,0.082732,0.227464,0.072892,0.055454,0.546642,0.048901,1.0,-0.068651,0.848089,0.964415,0.429199,-0.124089,0.446738,0.423459,-0.141323,0.255454,0.182806,-0.222062,0.206563,-0.056132,0.250109,0.180207,-0.15193,0.249783,0.208157,0.209368,0.196613,-0.028206,-0.012372,0.10025,0.079268,-0.253671,-0.047359,-0.125005,0.482057,0.509074,0.495632,0.376073,0.042593,-0.012372,0.010076,0.35803,-0.303833,-0.090955,0.010076,0.08767,0.507877,0.848089,0.446738,0.10025,0.072892,0.546642,0.055454,0.048941,0.337905,0.007074,0.038454,-0.031896,0.081042,0.049342,0.092787,0.073397,0.176855,0.026522,0.177921,0.174309,0.015831
Fwd Packet Length Min,0.054024,-0.122145,-0.01528,-0.007754,0.059051,-0.013562,-0.068651,1.0,0.108879,-0.118096,-0.155334,0.500182,-0.155755,-0.154946,-0.057094,-0.089174,-0.11489,-0.035032,-0.115338,-0.063574,-0.101246,-0.113972,-0.025364,-0.053688,-0.011485,-0.056394,-0.046712,0.088097,0.063876,-0.024157,-0.014138,-0.033437,0.038536,0.687965,-0.153218,-0.13773,-0.157165,-0.125054,-0.033652,0.063876,-0.000601,-0.097458,-0.114269,0.017404,-0.000601,0.168896,-0.128603,0.108879,-0.155755,-0.024157,-0.01528,0.059051,-0.007754,-0.013574,-0.094391,-0.01094,0.000849,-0.071278,-0.018422,-0.007436,-0.018514,-0.017383,-0.113287,-0.010339,-0.113355,-0.112383,-0.417232
Fwd Packet Length Mean,0.150717,0.147888,0.028392,0.021258,0.595132,0.012899,0.848089,0.108879,1.0,0.907152,0.205843,-0.008855,0.208391,0.196077,-0.058984,0.248682,0.125555,-0.171417,0.136367,0.046148,0.132684,0.124387,-0.007599,0.200624,0.192383,0.186808,0.179028,0.00684,0.039347,0.043107,0.034778,-0.190564,-0.028733,0.015362,0.260747,0.323367,0.272168,0.18104,0.053585,0.039347,0.013359,0.240682,-0.252852,-0.063841,0.013359,0.168601,0.328802,1.0,0.208391,0.043107,0.028392,0.595132,0.021258,0.012913,0.227865,-0.00399,0.021491,-0.03265,0.147433,0.114701,0.160161,0.122528,0.121696,0.016262,0.122726,0.120058,-0.077571
Fwd Packet Length Std,0.088364,0.228143,0.046753,0.036031,0.508149,0.030636,0.964415,-0.118096,0.907152,1.0,0.402976,-0.14718,0.416268,0.395681,-0.104414,0.310503,0.196337,-0.222465,0.208272,0.01021,0.239813,0.193912,-0.084177,0.244486,0.21581,0.223069,0.209651,-0.031039,0.001626,0.070403,0.057915,-0.251881,-0.049331,-0.15995,0.45548,0.506048,0.476089,0.352557,0.066895,0.001626,0.011011,0.356505,-0.307313,-0.093528,0.011011,0.140559,0.508584,0.907152,0.416268,0.070403,0.046753,0.508149,0.036031,0.030665,0.340415,-0.002573,0.021781,-0.025163,0.088929,0.050277,0.095956,0.080603,0.191238,0.020473,0.191759,0.189276,0.061254


In [30]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay

In [31]:
np.unique(y_test,return_counts=True)

(array([0, 1]), array([ 1502, 14876]))

In [32]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train,y_train)

predictions = model.predict(X_test)

accuracy = accuracy_score(y_test,predictions)

print("model accuracy : ", accuracy)

model accuracy :  0.9971302967395287


In [33]:
from xgboost import XGBClassifier

xg_model = XGBClassifier(n_estimators=150).fit(X_train, y_train, verbose = True)
xg_pred = xg_model.predict(X_test)

xg_model

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [34]:
xg_pred

array([1, 1, 1, ..., 1, 1, 1], shape=(16378,))

In [35]:
# compute the measures
acc_xgboost1 = accuracy_score(y_test, xg_pred)
f1score_xgboost1 = f1_score(y_test, xg_pred)
precision_xgboost1 = precision_score(y_test, xg_pred)
recall_xgboost1 = recall_score(y_test, xg_pred)

# display the measures

print('-'*25)
print('XGBoost Model: 66 features')
print('-'*25)
print('Accuracy: {:.3f}'.format(acc_xgboost1))
print('F1-score: {:.3f}'.format(f1score_xgboost1))
print('Precision: {:.3f}'.format(precision_xgboost1))
print('Recall: {:.3f}'.format(recall_xgboost1))

-------------------------
XGBoost Model: 66 features
-------------------------
Accuracy: 1.000
F1-score: 1.000
Precision: 1.000
Recall: 1.000


In [40]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, xg_pred)

print("Confusion Matrix:")
print("-"*18)
print(conf_matrix)
print("-"*18)
print("tn,fp")
print("fn,tp")

Confusion Matrix:
------------------
[[ 1499     3]
 [    0 14876]]
------------------
tn,fp
fn,tp
