In [90]:
#import libs

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

#for machine learning
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

#feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

#split validation data
from sklearn.model_selection import train_test_split

#count frequency in list
import collections

In [91]:
#field sets

#useless_fields = ['ID','timestamp','direction','srcIP','srcPort','dstIP','dstPort']
useless_fields = ['ID','timestamp','direction']
splunk_sel_columns = ['duration','protocol','state','srcToS','dstToS','totalPackets','bytesBothDir','bytesSrcToDst','srcIP-Port','dstIP-Port']

#useless_fields_label = ['ID','timestamp','srcIP','srcPort','direction','dstIP','dstPort','label','protocol','state','srcToS','dstToS']


In [92]:
#read csv files

df_train = pd.read_csv (r'encoded-train.csv',low_memory=False)
df_test = pd.read_csv (r'encoded-test.csv',low_memory=False)
df_valid = pd.read_csv (r'encoded-valid.csv',low_memory=False)


In [93]:
#feature selection

#first feature selection method 
var_sel = VarianceThreshold(threshold=0.5)
var_sel.fit(df_train.drop(columns=useless_fields))
var_sel_columns = df_train.drop(columns=useless_fields).iloc[:,var_sel.get_support(indices=True)].columns
print(var_sel_columns)

#second feature selection method
corr_features = set()
train_corr = df_train.drop(columns=useless_fields).corr()
for i in range(len(train_corr.columns)):
    for j in range(i):
        if abs(train_corr.iloc[i, j]) > 0.8:
            col = train_corr.columns[i]
            corr_features.add(col)
corr_sel_columns = df_train.drop(columns=useless_fields).drop(labels=corr_features, axis=1).columns
print(corr_sel_columns)


Index(['duration', 'protocol', 'srcIP', 'srcPort', 'dstIP', 'dstPort', 'state',
       'dstToS', 'totalPackets', 'bytesBothDir', 'bytesSrcToDst', 'srcIP-Port',
       'dstIP-Port'],
      dtype='object')
Index(['duration', 'protocol', 'srcIP', 'srcPort', 'dstIP', 'dstPort', 'state',
       'srcToS', 'dstToS', 'totalPackets', 'bytesSrcToDst', 'srcIP-Port'],
      dtype='object')


In [94]:
#isolation forest label predictions for each feature set

def iForestlabelPred(param):
    #splunk selected feature set
    clf = IsolationForest(contamination=param)
    clf.fit(df_train[splunk_sel_columns])
    iforest_splunk_predicted_label = clf.predict(df_test[splunk_sel_columns])

    #variance threshold selected feature set
    clf = IsolationForest(contamination=param)
    clf.fit(df_train[var_sel_columns])
    iforest_var_predicted_label = clf.predict(df_test[var_sel_columns])

    #removed correlated feature set
    clf = IsolationForest(contamination=param)
    clf.fit(df_train[corr_sel_columns])
    iforest_corr_predicted_label = clf.predict(df_test[corr_sel_columns])

    return iforest_splunk_predicted_label, iforest_var_predicted_label, iforest_corr_predicted_label


In [95]:
#Local OutLier Factor perdictions for each feature set

def LOFlabelPred(param):
    #splunk selected feature set
    clf = LocalOutlierFactor(n_neighbors=param)
    LOF_splunk_predicted_label = clf.fit_predict(df_test[splunk_sel_columns])

    #variance threshold selected feature set
    clf = LocalOutlierFactor(n_neighbors=param)
    LOF_splunk_var_label = clf.fit_predict(df_test[var_sel_columns])

    #removed correlated feature set
    clf = LocalOutlierFactor(n_neighbors=param)
    LOF_splunk_corr_label = clf.fit_predict(df_test[corr_sel_columns])

    return LOF_splunk_predicted_label, LOF_splunk_var_label, LOF_splunk_corr_label
    

In [96]:
# model param tunings isolation forest
'''
for i in [0.01,0.02,0.1,0.2,0.3]:
    result = iForestlabelPred(i)
    print('iforest splunk : ' + str(i) + ' : ') 
    print(collections.Counter(result[0]))
    print('iforest var : ' + str(i) + ' : ') 
    print(collections.Counter(result[1]))
    print('iforest corr : ' + str(i) + ' : ') 
    print(collections.Counter(result[2]))
'''

    
clf = IsolationForest()
param_grid = {'contamination': [0.01, 0.02, 0.1, 0.2, 0.3]}

#f1sc = make_scorer(f1_score(average='micro'))
grid_estimator = GridSearchCV(clf, 
                                              param_grid,
                                              scoring='f1_micro', 
                                              refit=True,
                                              cv=10, 
                                              return_train_score=True)

grid_estimator.fit(df_valid.drop(columns=useless_fields), df_valid['label'])

print(grid_estimator.best_params_)


{'contamination': 0.01}


In [97]:
# model param tunings LOF
'''
for i in [5,10,20,30,50]:
    result = LOFlabelPred(i)
    print('LOF splunk ' + str(i) + ' : ') 
    print(collections.Counter(result[0]))
    print('LOF var : ' + str(i) + ' : ') 
    print(collections.Counter(result[1]))
    print('LOF corr : ' + str(i) + ' : ') 
    print(collections.Counter(result[2]))
'''
    
for i in [5,10,20,30,50]:
    clf = LocalOutlierFactor(n_neighbors=i)
    plabels = clf.fit_predict(df_valid.drop(columns=useless_fields))
    print(i)
    print(f1_score(df_valid['label'].to_numpy(), np.array(plabels), average='macro'))
    
    
    

5
0.49354807299637116
10
0.49404807707663967
20
0.4931163936739185
30
0.4929969258344839
50
0.49475594939498413


In [98]:
#predicted labels insight

lof_result = LOFlabelPred(10)

lof_T_list = np.array(lof_result).T

df_predicted_labels_LOF = pd.DataFrame(lof_T_list,columns=['LOF_splunk_predicted_label','LOF_var_predicted_label','LOF_corr_predicted_label'])
df_test_labels = pd.concat([df_test,df_predicted_labels_LOF], axis=1)

df_test_labels


Unnamed: 0,ID,timestamp,duration,protocol,srcIP,srcPort,direction,dstIP,dstPort,state,srcToS,dstToS,totalPackets,bytesBothDir,bytesSrcToDst,srcIP-Port,dstIP-Port,LOF_splunk_predicted_label,LOF_var_predicted_label,LOF_corr_predicted_label
0,339247,2021-08-16 19:40:10.577701,285.799705,8,49802,22886,->,23098,3432,245,0,0,106,24523,17810,583597,45134,1,1,1
1,339246,2021-08-16 19:40:10.525304,595.340778,9,16073,41874,<->,23098,3434,16,0,0,7,1320,1081,21977,45136,1,-1,-1
2,339245,2021-08-16 19:40:10.352642,631.204874,8,51884,33235,<?>,23098,3432,171,0,0,352,34192,21457,587421,45134,1,1,-1
3,339244,2021-08-16 19:40:10.216544,0.000730,9,39751,231,<->,23098,3429,16,0,0,1,132,73,56017,45131,1,1,1
4,339243,2021-08-16 19:40:10.135266,0.000834,9,20276,33616,<->,23098,3433,16,0,0,2,131,71,28004,45135,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348471,9266,2021-08-15 18:31:59.449012,3306.207446,9,76208,31047,<->,23098,3433,16,0,0,7,941,293,636118,45135,1,1,1
348472,9265,2021-08-15 18:31:59.399620,0.000529,9,38972,41531,<->,23098,34470,16,0,0,1,127,59,54936,46610,1,1,1
348473,9264,2021-08-15 18:31:59.306528,0.000968,9,153555,13166,<->,23098,3429,16,0,0,1,133,73,787015,45131,1,1,1
348474,9263,2021-08-15 18:31:59.241627,3263.106004,9,191647,14814,<->,23098,3433,16,0,0,7,860,612,861189,45135,1,1,1


In [99]:

iforest_result = iForestlabelPred(0.01)

iforest_T_list =np.array(iforest_result).T

df_predicted_labels_splunk = pd.DataFrame(iforest_T_list,columns=['iforest_splunk_predicted_label','iforest_var_predicted_label','iforest_corr_predicted_label'])
df_test_labels = pd.concat([df_test_labels,df_predicted_labels_splunk], axis=1)

df_test_labels


Unnamed: 0,ID,timestamp,duration,protocol,srcIP,srcPort,direction,dstIP,dstPort,state,...,bytesBothDir,bytesSrcToDst,srcIP-Port,dstIP-Port,LOF_splunk_predicted_label,LOF_var_predicted_label,LOF_corr_predicted_label,iforest_splunk_predicted_label,iforest_var_predicted_label,iforest_corr_predicted_label
0,339247,2021-08-16 19:40:10.577701,285.799705,8,49802,22886,->,23098,3432,245,...,24523,17810,583597,45134,1,1,1,1,1,1
1,339246,2021-08-16 19:40:10.525304,595.340778,9,16073,41874,<->,23098,3434,16,...,1320,1081,21977,45136,1,-1,-1,1,1,1
2,339245,2021-08-16 19:40:10.352642,631.204874,8,51884,33235,<?>,23098,3432,171,...,34192,21457,587421,45134,1,1,-1,1,1,1
3,339244,2021-08-16 19:40:10.216544,0.000730,9,39751,231,<->,23098,3429,16,...,132,73,56017,45131,1,1,1,1,1,1
4,339243,2021-08-16 19:40:10.135266,0.000834,9,20276,33616,<->,23098,3433,16,...,131,71,28004,45135,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348471,9266,2021-08-15 18:31:59.449012,3306.207446,9,76208,31047,<->,23098,3433,16,...,941,293,636118,45135,1,1,1,1,1,1
348472,9265,2021-08-15 18:31:59.399620,0.000529,9,38972,41531,<->,23098,34470,16,...,127,59,54936,46610,1,1,1,1,1,1
348473,9264,2021-08-15 18:31:59.306528,0.000968,9,153555,13166,<->,23098,3429,16,...,133,73,787015,45131,1,1,1,1,1,1
348474,9263,2021-08-15 18:31:59.241627,3263.106004,9,191647,14814,<->,23098,3433,16,...,860,612,861189,45135,1,1,1,1,1,1


In [100]:

df_test_original = pd.read_csv (r'splunk_test_data.csv')
df_test_original_labels = pd.concat([df_test_original,df_predicted_labels_LOF,df_predicted_labels_splunk], axis=1)
print(df_test_original_labels['dstToS'].value_counts()[:10])

#timestamp to bin per 5 mins
bin5min=1*60*1000000000
df_test_labels['timestamp_converted'] = pd.to_datetime(((pd.to_datetime(df_test_labels['timestamp']).astype(np.int64) // bin5min + 1 ) * bin5min))


0.0     291219
None     57246
3.0         10
2.0          1
Name: dstToS, dtype: int64


In [103]:
#df_test_labels[df_test_labels['iforest_corr_predicted_label']==-1]
#df_test_labels[df_test_labels['LOF_splunk_corr_label']==-1]


for i in ['iforest_splunk_predicted_label','iforest_var_predicted_label','iforest_corr_predicted_label', 'LOF_splunk_predicted_label','LOF_var_predicted_label','LOF_corr_predicted_label']:
    
    print(i)
    #print(df_test_labels[df_test_labels[i]==-1]['srcIP-Port'].value_counts()[:10])
    print(df_test_original_labels[df_test_original_labels[i]==-1].count())
    print(df_test_labels[df_test_labels[i]==-1]['timestamp_converted'].value_counts()[:10])
    print(df_test_labels[df_test_labels[i]==-1]['timestamp'].min())
    print(df_test_labels[df_test_labels[i]==-1]['timestamp'].max())
    print(df_test_original_labels[df_test_original_labels[i]==-1]['srcIP-Port'].value_counts()[:10])
    print(df_test_original_labels[df_test_original_labels[i]==-1]['dstIP-Port'].value_counts()[:10])
    print(df_test_original_labels[df_test_original_labels[i]==-1]['protocol'].value_counts()[:10])
    print(df_test_original_labels[df_test_original_labels[i]==-1]['dstToS'].value_counts()[:10])
    
    #print(df_test_original_labels[df_test_original_labels[i]==-1]['timestamp'].nunique())


iforest_splunk_predicted_label
ID                                2909
timestamp                         2909
duration                          2909
protocol                          2909
srcIP                             2909
srcPort                           2909
direction                         2909
dstIP                             2909
dstPort                           2909
state                             2909
srcToS                            2909
dstToS                            2909
totalPackets                      2909
bytesBothDir                      2909
bytesSrcToDst                     2909
srcIP-Port                        2909
dstIP-Port                        2909
LOF_splunk_predicted_label        2909
LOF_var_predicted_label           2909
LOF_corr_predicted_label          2909
iforest_splunk_predicted_label    2909
iforest_var_predicted_label       2909
iforest_corr_predicted_label      2909
dtype: int64
2021-08-15 20:24:00    451
2021-08-15 20:25:00    194
2021-

0.0     27240
None     6643
2.0         1
Name: dstToS, dtype: int64
LOF_var_predicted_label
ID                                20244
timestamp                         20244
duration                          20244
protocol                          20244
srcIP                             20244
srcPort                           20244
direction                         20244
dstIP                             20244
dstPort                           20244
state                             20244
srcToS                            20244
dstToS                            20244
totalPackets                      20244
bytesBothDir                      20244
bytesSrcToDst                     20244
srcIP-Port                        20244
dstIP-Port                        20244
LOF_splunk_predicted_label        20244
LOF_var_predicted_label           20244
LOF_corr_predicted_label          20244
iforest_splunk_predicted_label    20244
iforest_var_predicted_label       20244
iforest_corr_predicted_labe

In [102]:
#output txt file that contains only steamID of attacking

df_test_labels[df_test_labels['iforest_corr_predicted_label']==-1]['ID'].to_csv('1132300_task1.txt', header=None, index=None, sep=' ')
