In [223]:
#import libs

import numpy as np
import pandas as pd

#encoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif

from collections import Counter

#score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [224]:
#feature sets and types

field_names = ['ID','timestamp','duration','protocol','srcIP','srcPort','direction','dstIP','dstPort','state','srcToS','dstToS','totalPackets','bytesBothDir','bytesSrcToDst']
dtypes = {'ID':object, 'timestamp':object, 'duration':float, 'protocol':object, 'srcIP':object, 'srcPort':object, 'direction':object, 'dstIP':object, 'dstPort':object, 'state':object, 'srcToS':object, 'dstToS':object, 'totalPackets':int, 'bytesBothDir':int, 'bytesSrcToDst':int}

field_names_label = ['ID','timestamp','duration','protocol','srcIP','srcPort','direction','dstIP','dstPort','state','srcToS','dstToS','totalPackets','bytesBothDir','bytesSrcToDst','label']
dtypes_label = {'ID':object, 'timestamp':object, 'duration':float, 'protocol':object, 'srcIP':object, 'srcPort':object, 'direction':object, 'dstIP':object, 'dstPort':object, 'state':object, 'srcToS':object, 'dstToS':object, 'totalPackets':int, 'bytesBothDir':int, 'bytesSrcToDst':int, 'label':object}

#useless_fields = ['ID','timestamp','label','direction','srcIP','srcPort','dstIP','dstPort']
#useless_fields = ['ID','timestamp','label','direction']
#useless_fields = ['ID','timestamp','label','direction','protocol','srcToS','dstToS']

selected_fields = ['duration','totalPackets','bytesBothDir','bytesSrcToDst','srcPort','state']#
#selected_fields = ['duration','totalPackets','bytesBothDir','bytesSrcToDst','srcPort','bytesPerSecSrcToDst','bytesPerSecBoth','packetsPerSec','packetsSize']

In [225]:
#read csv files

df_train = pd.read_csv (r'training_data_with_label.csv', names = field_names_label, dtype = dtypes_label)
df_test = pd.read_csv (r'test_data_with_label.csv', names = field_names_label, dtype = dtypes_label)
df_valid = pd.read_csv (r'validation_data_with_label.csv', names = field_names_label, dtype = dtypes_label)


In [226]:
#strip categorical features

df_obj = df_train.select_dtypes(['object'])
#print(df_obj)

df_train[df_obj.columns] = df_train[df_obj.columns].apply(lambda x: x.str.strip())
df_test[df_obj.columns] = df_test[df_obj.columns].apply(lambda x: x.str.strip())
df_valid[df_obj.columns] = df_valid[df_obj.columns].apply(lambda x: x.str.strip())


In [227]:
#replace missing values to string 'None' for the three categorical fields: state, srcToS, dstToS

#print(df_train.isna().any())
#print(df_test.isna().any())
#print(df_valid.isna().any())

df_train['state'] = df_train['state'].fillna('None')
df_train['srcToS'] = df_train['state'].fillna('None')
df_train['dstToS'] = df_train['state'].fillna('None')

df_test['state'] = df_test['state'].fillna('None')
df_test['srcToS'] = df_test['srcToS'].fillna('None')
df_test['dstToS'] = df_test['dstToS'].fillna('None')

df_valid['state'] = df_valid['state'].fillna('None')
df_valid['srcToS'] = df_valid['srcToS'].fillna('None')
df_valid['dstToS'] = df_valid['dstToS'].fillna('None')

'''
df_train['state'] = df_train['state'].fillna('None')
df_train['srcToS'] = df_train['state'].fillna('None')
df_train['dstToS'] = df_train['state'].fillna('None')

df_test['state'] = df_test['state'].fillna('None')
df_test['srcToS'] = df_test['srcToS'].fillna('None')
df_test['dstToS'] = df_test['dstToS'].fillna('None')

df_valid['state'] = df_valid['state'].fillna('None')
df_valid['srcToS'] = df_valid['srcToS'].fillna('None')
df_valid['dstToS'] = df_valid['dstToS'].fillna('None')
'''


"\ndf_train['state'] = df_train['state'].fillna('None')\ndf_train['srcToS'] = df_train['state'].fillna('None')\ndf_train['dstToS'] = df_train['state'].fillna('None')\n\ndf_test['state'] = df_test['state'].fillna('None')\ndf_test['srcToS'] = df_test['srcToS'].fillna('None')\ndf_test['dstToS'] = df_test['dstToS'].fillna('None')\n\ndf_valid['state'] = df_valid['state'].fillna('None')\ndf_valid['srcToS'] = df_valid['srcToS'].fillna('None')\ndf_valid['dstToS'] = df_valid['dstToS'].fillna('None')\n"

In [228]:
#creating new field for combinations: srcIP/srcPort and dstIP/dstPort
'''
df_train['srcIP-Port'] = df_train['srcIP'] + '-' + df_train['srcPort']
df_train['dstIP-Port'] = df_train['dstIP'] + '-' + df_train['dstPort']

df_test['srcIP-Port'] = df_test['srcIP'] + '-' + df_test['srcPort']
df_test['dstIP-Port'] = df_test['dstIP'] + '-' + df_test['dstPort']

df_valid['srcIP-Port'] = df_valid['srcIP'] + '-' + df_valid['srcPort']
df_valid['dstIP-Port'] = df_valid['dstIP'] + '-' + df_valid['dstPort']
'''
'''
df_train['bytesPerSecSrcToDst'] = (df_train['bytesSrcToDst']/df_train['duration']).replace([np.inf, -np.inf, np.nan], 0)
df_train['bytesPerSecBoth'] = (df_train['totalPackets']/df_train['duration']).replace([np.inf, -np.inf, np.nan], 0)
df_train['packetsPerSec'] = (df_train['totalPackets']/df_train['duration']).replace([np.inf, -np.inf, np.nan], 0)
df_train['packetsSize'] = (df_train['bytesBothDir']/df_train['totalPackets']).replace([np.inf, -np.inf, np.nan], 0)

df_test['bytesPerSecSrcToDst'] = (df_test['bytesSrcToDst']/df_test['duration']).replace([np.inf, -np.inf, np.nan], 0)
df_test['bytesPerSecBoth'] = (df_test['totalPackets']/df_test['duration']).replace([np.inf, -np.inf, np.nan], 0)
df_test['packetsPerSec'] = (df_test['totalPackets']/df_test['duration']).replace([np.inf, -np.inf, np.nan], 0)
df_test['packetsSize'] = (df_test['bytesBothDir']/df_test['totalPackets']).replace([np.inf, -np.inf, np.nan], 0)

df_valid['bytesPerSecSrcToDst'] = (df_valid['bytesSrcToDst']/df_valid['duration']).replace([np.inf, -np.inf, np.nan], 0)
df_valid['bytesPerSecBoth'] = (df_valid['totalPackets']/df_valid['duration']).replace([np.inf, -np.inf, np.nan], 0)
df_valid['packetsPerSec'] = (df_valid['totalPackets']/df_valid['duration']).replace([np.inf, -np.inf, np.nan], 0)
df_valid['packetsSize'] = (df_valid['bytesBothDir']/df_valid['totalPackets']).replace([np.inf, -np.inf, np.nan], 0)

is_NaN = df_test.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df_test[row_has_NaN]

print(rows_with_NaN)
'''

"\ndf_train['bytesPerSecSrcToDst'] = (df_train['bytesSrcToDst']/df_train['duration']).replace([np.inf, -np.inf, np.nan], 0)\ndf_train['bytesPerSecBoth'] = (df_train['totalPackets']/df_train['duration']).replace([np.inf, -np.inf, np.nan], 0)\ndf_train['packetsPerSec'] = (df_train['totalPackets']/df_train['duration']).replace([np.inf, -np.inf, np.nan], 0)\ndf_train['packetsSize'] = (df_train['bytesBothDir']/df_train['totalPackets']).replace([np.inf, -np.inf, np.nan], 0)\n\ndf_test['bytesPerSecSrcToDst'] = (df_test['bytesSrcToDst']/df_test['duration']).replace([np.inf, -np.inf, np.nan], 0)\ndf_test['bytesPerSecBoth'] = (df_test['totalPackets']/df_test['duration']).replace([np.inf, -np.inf, np.nan], 0)\ndf_test['packetsPerSec'] = (df_test['totalPackets']/df_test['duration']).replace([np.inf, -np.inf, np.nan], 0)\ndf_test['packetsSize'] = (df_test['bytesBothDir']/df_test['totalPackets']).replace([np.inf, -np.inf, np.nan], 0)\n\ndf_valid['bytesPerSecSrcToDst'] = (df_valid['bytesSrcToDst']/df

In [229]:
#label to 1 or -1, 1 is botnet

df_train['label'] = np.where(df_train['label'].str.contains('botnet', case = False), 1, 0)
df_test['label'] = np.where(df_test['label'].str.contains('botnet', case = False), 1, 0)
df_valid['label'] = np.where(df_valid['label'].str.contains('botnet', case = False), 1, 0)


In [230]:
#for encoding combine all

df_all = pd.concat([df_train, df_test, df_valid]) 
df_all.select_dtypes(['object'])


Unnamed: 0,ID,timestamp,protocol,srcIP,srcPort,direction,dstIP,dstPort,state,srcToS,dstToS
0,1,2021-08-14 19:39:49.465881,udp,49.199.46.19,6882,<->,150.35.87.121,6881,CON,CON,CON
1,2,2021-08-14 20:06:01.809520,udp,191.78.136.101,16199,<->,150.35.89.128,35248,CON,CON,CON
2,3,2021-08-14 20:06:01.811109,tcp,122.2.175.95,59066,<?>,150.35.87.17,80,RA_PA,RA_PA,RA_PA
3,4,2021-08-14 20:06:01.812588,tcp,220.172.180.85,56948,<?>,150.35.88.29,54147,RPA_PA,RPA_PA,RPA_PA
4,5,2021-08-14 20:06:01.813672,tcp,122.2.175.95,59064,<?>,150.35.87.17,80,RA_PA,RA_PA,RA_PA
...,...,...,...,...,...,...,...,...,...,...,...
417481,417482,2021-08-15 01:17:13.747768,tcp,150.35.87.62,51152,->,215.99.164.241,80,FSPA_FSPA,0.0,0.0
417482,417483,2021-08-15 01:17:13.759001,udp,150.35.87.62,53151,<->,150.35.83.12,53,CON,0.0,0.0
417483,417484,2021-08-15 01:17:13.799008,tcp,150.35.87.62,51152,->,198.116.235.92,80,FSPA_FSPA,0.0,0.0
417484,417485,2021-08-15 01:17:13.823686,udp,150.35.87.62,57095,<->,150.35.83.12,53,CON,0.0,0.0


In [231]:
#label encoder for categorical features one hot encoder
'''
onehot = OneHotEncoder()

onehot.fit(df_all[['protocol','srcToS','dstToS']].to_numpy())

train_cat_data = onehot.transform(df_train[['protocol','srcToS','dstToS']].to_numpy()).toarray()
test_cat_data = onehot.transform(df_test[['protocol','srcToS','dstToS']].to_numpy()).toarray()
valid_cat_data = onehot.transform(df_valid[['protocol','srcToS','dstToS']].to_numpy()).toarray()

df_train = pd.concat([df_train, pd.DataFrame(data=train_cat_data)], axis=1)
df_test = pd.concat([df_test, pd.DataFrame(data=test_cat_data)], axis=1)
df_valid = pd.concat([df_valid, pd.DataFrame(data=valid_cat_data)], axis=1)
'''


"\nonehot = OneHotEncoder()\n\nonehot.fit(df_all[['protocol','srcToS','dstToS']].to_numpy())\n\ntrain_cat_data = onehot.transform(df_train[['protocol','srcToS','dstToS']].to_numpy()).toarray()\ntest_cat_data = onehot.transform(df_test[['protocol','srcToS','dstToS']].to_numpy()).toarray()\nvalid_cat_data = onehot.transform(df_valid[['protocol','srcToS','dstToS']].to_numpy()).toarray()\n\ndf_train = pd.concat([df_train, pd.DataFrame(data=train_cat_data)], axis=1)\ndf_test = pd.concat([df_test, pd.DataFrame(data=test_cat_data)], axis=1)\ndf_valid = pd.concat([df_valid, pd.DataFrame(data=valid_cat_data)], axis=1)\n"

In [232]:
#label encoder for categorical features label encoder

protocol_le = LabelEncoder()
state_le = LabelEncoder()
srcToS_le = LabelEncoder()
dstToS_le = LabelEncoder()
srcIP_le = LabelEncoder()
dstIP_le = LabelEncoder()
srcPort_le = LabelEncoder()
dstPort_le = LabelEncoder()
#srcIPPort_le = LabelEncoder()
#dstIPPort_le = LabelEncoder()

protocol_le.fit(df_all['protocol'])
state_le.fit(df_all['state'])
srcToS_le.fit(df_all['srcToS'])
dstToS_le.fit(df_all['dstToS'])
srcIP_le.fit(df_all['srcIP'])
dstIP_le.fit(df_all['dstIP'])
srcPort_le.fit(df_all['srcPort'])
dstPort_le.fit(df_all['dstPort'])
#srcIPPort_le.fit(df_all['srcIP-Port'])
#dstIPPort_le.fit(df_all['dstIP-Port'])

df_train['protocol'] = protocol_le.transform(df_train['protocol'])
df_train['state'] = state_le.transform(df_train['state'])
df_train['srcToS'] = srcToS_le.transform(df_train['srcToS'])
df_train['dstToS'] = dstToS_le.transform(df_train['dstToS'])
df_train['srcIP'] = srcIP_le.transform(df_train['srcIP'])
df_train['dstIP'] = dstIP_le.transform(df_train['dstIP'])
df_train['srcPort'] = srcPort_le.transform(df_train['srcPort'])
df_train['dstPort'] = dstPort_le.transform(df_train['dstPort'])
#df_train['srcIP-Port'] = srcIPPort_le.transform(df_train['srcIP-Port'])
#df_train['dstIP-Port'] = dstIPPort_le.transform(df_train['dstIP-Port'])

df_test['protocol'] = protocol_le.transform(df_test['protocol'])
df_test['state'] = state_le.transform(df_test['state'])
df_test['srcToS'] = srcToS_le.transform(df_test['srcToS'])
df_test['dstToS'] = dstToS_le.transform(df_test['dstToS'])
df_test['srcIP'] = srcIP_le.transform(df_test['srcIP'])
df_test['dstIP'] = dstIP_le.transform(df_test['dstIP'])
df_test['srcPort'] = srcPort_le.transform(df_test['srcPort'])
df_test['dstPort'] = dstPort_le.transform(df_test['dstPort'])
#df_test['srcIP-Port'] = srcIPPort_le.transform(df_test['srcIP-Port'])
#df_test['dstIP-Port'] = dstIPPort_le.transform(df_test['dstIP-Port'])

df_valid['protocol'] = protocol_le.transform(df_valid['protocol'])
df_valid['state'] = state_le.transform(df_valid['state'])
df_valid['srcToS'] = srcToS_le.transform(df_valid['srcToS'])
df_valid['dstToS'] = dstToS_le.transform(df_valid['dstToS'])
df_valid['srcIP'] = srcIP_le.transform(df_valid['srcIP'])
df_valid['dstIP'] = dstIP_le.transform(df_valid['dstIP'])
df_valid['srcPort'] = srcPort_le.transform(df_valid['srcPort'])
df_valid['dstPort'] = dstPort_le.transform(df_valid['dstPort'])
#df_valid['srcIP-Port'] = srcIPPort_le.transform(df_valid['srcIP-Port'])
#df_valid['dstIP-Port'] = dstIPPort_le.transform(df_valid['dstIP-Port'])


In [233]:
#feature selection kbest

kbest_sel = SelectKBest(f_classif, k=3)
kbest_sel.fit(df_train[selected_fields], df_train['label'])
kbest_cols = df_train[selected_fields].iloc[:,kbest_sel.get_support(indices=True)].columns
print(kbest_cols)


Index(['duration', 'srcPort', 'state'], dtype='object')


In [234]:
#prediction using logistic regression

clf = LogisticRegression(max_iter=10000)
clf.fit(df_train[selected_fields][kbest_cols], df_train['label'])
predicted_label = clf.predict(df_test[selected_fields][kbest_cols])
predicted_prob = clf.predict_proba(df_test[selected_fields][kbest_cols])

#clf.fit(df_train[selected_fields], df_train['label'])
#predicted_label = clf.predict(df_test[selected_fields])

Counter(predicted_label)

Counter({0: 393502, 1: 23981})

In [235]:
#score 

accu = accuracy_score(df_test['label'], predicted_label)
f1 = f1_score(df_test['label'], predicted_label, average="macro")
pre = precision_score(df_test['label'], predicted_label, average="macro")
rec = recall_score(df_test['label'], predicted_label, average="macro")

print('accuracy : ' + str(accu))
print('f1 : ' + str(f1))
print('precision : ' + str(pre))
print('recall : ' + str(rec))


accuracy : 0.8471458718079539
f1 : 0.6166546967059533
precision : 0.7425654433622939
recall : 0.5943406329047185


In [236]:
#data insight

df_test_origin = pd.read_csv (r'test_data_with_label.csv', names = field_names_label, dtype = dtypes_label)
df_test_origin['label'] = np.where(df_test_origin['label'].str.contains('botnet', case = False), 1, 0)

pd_predicted_label = pd.DataFrame(data=np.array(predicted_label), columns=['predicted_label'])
pd_predicted_prob = pd.DataFrame(data=np.array(predicted_prob.T[1]).T, columns=['predicted_prob'])

df_test = pd.concat([df_test, pd_predicted_label, pd_predicted_prob], axis=1)
df_test_predicted_label = pd.concat([df_test_origin,pd_predicted_label, pd_predicted_prob], axis=1)
print(df_test_predicted_label)


            ID                   timestamp     duration protocol  \
0            1  2021-08-14 19:44:01.514380  1998.730056      udp   
1            2  2021-08-14 19:45:19.524285  1959.888200      udp   
2            3  2021-08-14 20:06:01.812305    14.908785      tcp   
3            4  2021-08-14 20:06:01.813676     0.001743      tcp   
4            5  2021-08-14 20:06:01.816415   196.090732      tcp   
...        ...                         ...          ...      ...   
417478  417479  2021-08-15 01:17:13.598386     0.000795      udp   
417479  417480  2021-08-15 01:17:13.650746     0.000000      udp   
417480  417481  2021-08-15 01:17:13.791659     0.054273      udp   
417481  417482  2021-08-15 01:17:13.872411     0.000000      udp   
417482  417483  2021-08-15 01:17:13.896435     0.000000      tcp   

                 srcIP srcPort direction          dstIP dstPort    state  \
0         85.4.198.159    6883       <->  150.35.87.121    6878      CON   
1       154.31.224.125    6880 

In [237]:
#data insight
print(df_test_predicted_label['label'].value_counts())
print(df_test_predicted_label['predicted_label'].value_counts())
print(df_test_predicted_label[df_test_predicted_label['predicted_label']==1]['srcIP'].value_counts())

0    347702
1     69781
Name: label, dtype: int64
0    393502
1     23981
Name: predicted_label, dtype: int64
150.35.87.196     4250
150.35.87.211     4043
150.35.87.212     3508
150.35.87.208     3173
112.188.5.39         3
                  ... 
96.187.241.12        1
190.153.88.225       1
83.3.54.7            1
89.141.100.177       1
191.223.96.242       1
Name: srcIP, Length: 8323, dtype: int64


In [238]:
# chosen IP 150.35.87.196

print(df_test_predicted_label[df_test_predicted_label['srcIP'] == '150.35.87.196']['predicted_label'].value_counts())
print(df_test_predicted_label[df_test_predicted_label['srcIP'] == '150.35.87.196']['label'].value_counts())


0    13711
1     4250
Name: predicted_label, dtype: int64
1    17961
Name: label, dtype: int64


In [239]:
# check what value associated with 150.35.87.196 

dict(zip(srcIP_le.classes_, srcIP_le.transform(srcIP_le.classes_)))['150.35.87.196']

67657

In [240]:
#prediction with only 150.35.87.196 

clf = LogisticRegression(max_iter=10000)
clf.fit(df_train[selected_fields][kbest_cols], df_train['label'])
predicted_label = clf.predict(df_test[df_test['srcIP']==67657][selected_fields][kbest_cols])

Counter(predicted_label)


Counter({1: 4250, 0: 13711})

In [243]:
# adversial samples using FGSM
def adversial_gen(df,lr,cols,ep):
    dlt = np.matrix(df['predicted_prob'].to_numpy()).T - np.matrix(df['label'].to_numpy()).T
    direction = np.sign(np.matmul(dlt, lr.coef_))
    
    return df[cols].to_numpy() + ep * direction

newInput = adversial_gen(df_test[df_test['srcIP']==67657],clf,kbest_cols,0.8) 
#newInput = adversial_gen(df_test,clf,kbest_cols,0.5) 

print(df_test[df_test['srcIP']==67657][selected_fields][kbest_cols].to_numpy())
print(newInput)

[[2.56386170e-04 2.57000000e+02 7.00000000e+00]
 [1.08181766e-01 2.57000000e+02 7.00000000e+00]
 [3.63367319e-02 2.79000000e+02 2.09000000e+02]
 ...
 [4.81985812e-04 1.92200000e+04 7.00000000e+00]
 [4.04379163e-04 1.92090000e+04 7.00000000e+00]
 [0.00000000e+00 1.90550000e+04 2.19000000e+02]]
[[8.00256386e-01 2.57800000e+02 6.20000000e+00]
 [9.08181766e-01 2.57800000e+02 6.20000000e+00]
 [8.36336732e-01 2.79800000e+02 2.08200000e+02]
 ...
 [8.00481986e-01 1.92208000e+04 6.20000000e+00]
 [8.00404379e-01 1.92098000e+04 6.20000000e+00]
 [8.00000000e-01 1.90558000e+04 2.18200000e+02]]


In [244]:
#use adversial samples for clf with only 150.35.87.196 

predicted_label_ad = clf.predict(newInput)

Counter(predicted_label_ad)


Counter({1: 4224, 0: 13737})