In [1]:
import numpy as np
import os
import sys
import pandas as pd
from sklearn.utils import class_weight
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.metrics import accuracy_score

In [2]:
def file_len(fname):
    """Return the number of lines in the text file ``fname``.

    Parameters
    ----------
    fname : str
        Path of the file to count; opened in text mode.

    Returns
    -------
    int
        Number of lines yielded when iterating over the file. An empty
        file yields 0 (the previous version raised ``UnboundLocalError``
        because the loop variable was never assigned).
    """
    count = 0
    with open(fname) as f:
        # enumerate from 1 so the last index seen is already the line count
        for count, _ in enumerate(f, start=1):
            pass
    return count

In [3]:
def split_file(fname, ratio, trainname, testname):
    """Randomly split the lines of ``fname`` into a train file and a test file.

    ``int(Nobs * ratio)`` line indices are drawn without replacement from
    NumPy's global random generator; those lines go to ``testname`` and all
    other lines go to ``trainname``. Call ``np.random.seed`` beforehand if a
    reproducible split is needed.

    In addition, the first line seen for every distinct value of the last
    comma-separated field is always written to ``testname``. That line then
    still takes part in the regular split, so it ends up either twice in the
    test file or in both output files.

    Parameters
    ----------
    fname : str
        Path of the comma-separated input file.
    ratio : float
        Fraction of the input lines to sample for the test file.
    trainname : str
        Path of the train file to create (overwritten if it exists).
    testname : str
        Path of the test file to create (overwritten if it exists).

    Returns
    -------
    tuple
        ``(Nobs, n_sampled)``: the number of input lines and the number of
        sampled indices. The second value does not count the extra
        first-of-each-label lines written to the test file.
    """
    Nobs = file_len(fname)
    tfp = int(Nobs * ratio)
    if tfp > 0:
        choices_test = np.sort(np.random.choice(Nobs, tfp, replace=False))
    else:
        # Nothing to sample (empty input or very small ratio).
        choices_test = np.empty(0, dtype=int)
    n_sampled = choices_test.shape[0]

    k = 0                # position of the next sampled index to match
    seen_labels = set()  # last-field values already written to the test file

    # Context managers close all three files even if a read/write fails.
    with open(fname) as src, \
            open(trainname, 'w') as file_tr, \
            open(testname, 'w') as file_ts:
        for i, line in enumerate(src):
            # The last field keeps its trailing newline, as before.
            label = line.split(',')[-1]
            if label not in seen_labels:
                file_ts.write(line)
                seen_labels.add(label)

            # choices_test is sorted, so a single forward cursor is enough.
            if k < n_sampled and choices_test[k] == i:
                file_ts.write(line)
                k += 1
            else:
                file_tr.write(line)

    return Nobs, n_sampled

In [1]:
# The train_val/test split has already been written to disk; flip the flag
# below only to regenerate it.
test_split = False
if test_split:
    Nobs, test_size = split_file(
        'kddcup.data.corrected', 0.25, 'train_val', 'test'
    )

In [5]:
# Second-stage split of 'train_val' into 'train' and 'val'; disabled by
# default, enable the flag only to regenerate the files.
val_split = False
if val_split:
    Nobs, test_size = split_file(
        'train_val', 0.3333, 'train', 'val'
    )

# load training data

In [6]:
# Names of the split files used in the rest of the notebook
trainfile, testfile, valfile = 'train', 'test', 'val'

In [7]:
# Count the lines of the training split and report the total
train_size = file_len(trainfile)
print(f'Training data has {train_size} observations')

Training data has 2449339 observations


In [9]:
"""remove duplicates and save as npy"""

# One-off preprocessing, left commented out: for each split file it drops
# duplicate rows and stores the remaining values as '<name>_reduced.npy'.
# NOTE(review): pd.read_csv is called without header=None, so the first line
# of each file would be consumed as the column header — confirm whether the
# split files really start with a header line before re-enabling this.

# --- train ---
# df = pd.read_csv('train')
# df2 = df.drop_duplicates()
# df = None
# batcharr = df2.values
# print(batcharr.shape)
# np.save('train_reduced', df2.values)

# --- test ---
# df = pd.read_csv('test')
# df2 = df.drop_duplicates()
# df = None
# batcharr = df2.values
# print(batcharr.shape)
# np.save('test_reduced', df2.values)

# --- val ---
# df = pd.read_csv('val')
# df2 = df.drop_duplicates()
# df = None
# batcharr = df2.values
# print(batcharr.shape)
# np.save('val_reduced', df2.values)

'remove duplicates and save as npy'

In [10]:
'''To save entire test data as npy file'''
# One-off export, left commented out: stores the complete test split
# (duplicates included) as 'test_npy.npy'.
# NOTE(review): read_csv without header=None treats the first line as a
# header — confirm this is intended before re-enabling.
# df = pd.read_csv('test')
# batcharr = df.values
# print(batcharr.shape)
# np.save('test_npy', df.values)

'To save entire test data as npy file'

In [8]:
# Load the de-duplicated splits. The arrays mix string and numeric columns
# (presumably stored with object dtype), which NumPy >= 1.16.3 refuses to
# load unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code; this is acceptable only
# because these files are generated locally — never enable it for untrusted
# .npy files.
train_data = np.load('train_reduced.npy', allow_pickle=True)
print(train_data.shape)
val_data = np.load('val_reduced.npy', allow_pickle=True)
print(val_data.shape)
test_data = np.load('test_reduced.npy', allow_pickle=True)
print(test_data.shape)


(582967, 42)
(318396, 42)
(318203, 42)


# Assign class labels: Normal = 0, DOS = 1, Probe = 2, R2L = 3, U2R = 4


In [9]:
def get_labels(Y):
    """Map raw attack-name labels to five numeric category codes.

    Codes: 0 = normal, 1 = DOS, 2 = Probe, 3 = R2L, 4 = U2R.

    Parameters
    ----------
    Y : array-like of str
        Raw labels including the trailing dot, e.g. ``'smurf.'``. Lists are
        accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``Y`` holding the category codes.

    Notes
    -----
    The result is initialised with ones, so a label that matches none of the
    names below keeps the value 1 (the DOS code). This mirrors the previous
    behaviour; validate the input beforehand if unknown labels are possible.
    """
    Y = np.asarray(Y)
    categories = {
        0: ['normal.'],
        1: ['back.', 'land.', 'neptune.', 'pod.', 'smurf.', 'teardrop.'],
        2: ['ipsweep.', 'nmap.', 'portsweep.', 'satan.'],
        3: ['ftp_write.', 'guess_passwd.', 'imap.', 'multihop.', 'phf.',
            'spy.', 'warezclient.', 'warezmaster.'],
        4: ['buffer_overflow.', 'loadmodule.', 'perl.', 'rootkit.'],
    }

    Y_cat = np.ones(Y.shape)
    # One vectorised membership test per category replaces the per-element
    # Python loop; the name lists are disjoint, so order does not matter.
    for code, names in categories.items():
        Y_cat[np.isin(Y, names)] = code
    return Y_cat

In [10]:
# Features are every column except the last; the last column is the raw label.
X, Y = train_data[:, :-1], train_data[:, -1]
print(f"Training data has {X.shape[0]} observations.")
# Collapse the attack names into the five numeric categories.
Y_new = get_labels(Y)
# Drop feature columns 1-3 (presumably the non-numeric fields — verify).
X = np.delete(X, [1, 2, 3], axis=1)

Training data has 582967 observations.


In [11]:
# Sanity check: list the distinct category codes present in the training labels.
np.unique(Y_new)

array([0., 1., 2., 3., 4.])

#  Decision tree classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn import tree
from sklearn.metrics import confusion_matrix as cmat


In [15]:
# class_weight='balanced' re-weights classes inversely to their frequency.
# random_state is pinned (matching the random-forest cells) because the tree
# builder permutes features at each split, so equally good splits could
# otherwise be resolved differently from run to run.
model = tree.DecisionTreeClassifier(class_weight='balanced', random_state=0)

In [16]:
# Fit the decision tree on the training features and category codes.
model.fit(X,Y_new)

# Score on the same data the model was fitted on (training score only).
print(model.score(X, Y_new))

0.9999931385481511


In [20]:
# ---------------------------------------------------------------
# Validation: evaluate the decision tree on the validation split
# ---------------------------------------------------------------
X_val, Y_val = val_data[:, :-1], val_data[:, -1]
print(f"Validation has {X_val.shape[0]} observations.")

# Same preprocessing as for the training data.
Y_val = get_labels(Y_val)
X_val = np.delete(X_val, [1, 2, 3], axis=1)
Y_pr = model.predict(X_val)
labels = np.unique(Y_val)

# One row per metric (precision, recall, f-score, support); transposed below
# so that each class becomes a row.
prfsarr = np.array(prfs(Y_val, Y_pr, labels=labels))
df = pd.DataFrame(
    prfsarr.T,
    columns=['precison', 'recall', 'f_score', 'support'],
    index=labels,
)
df = df.sort_values(by='support', ascending=False)
print()
print(df)
# Unweighted mean of every column across the classes.
print(df.mean())

Validation has 318396 observations.

     precison    recall   f_score   support
0.0  0.999734  0.999822  0.999778  213983.0
1.0  0.999900  0.999910  0.999905   99676.0
2.0  0.996384  0.996159  0.996272    4426.0
3.0  0.954861  0.935374  0.945017     294.0
4.0  0.500000  0.117647  0.190476      17.0
precison        0.890176
recall          0.809782
f_score         0.826290
support     63679.200000
dtype: float64


# Check on test data

In [21]:
# Evaluate on the full test split (duplicates included). The file holds mixed
# string/numeric data (presumably object dtype), which NumPy >= 1.16.3 only
# loads with allow_pickle=True.
# NOTE: unpickling can execute arbitrary code; fine for this locally
# generated file, never for untrusted input.
test_data = np.load('test_npy.npy', allow_pickle=True)
print(f'Testing data has {test_data.shape[0]} observations')

t_X, t_Y = test_data[:, :-1], test_data[:, -1]

# Drop the same feature columns that were removed from the training data.
t_X_new = np.delete(t_X, [1, 2, 3], axis=1)

t_Y_predicted = model.predict(t_X_new)

# Predicted class distribution ...
print(np.unique(t_Y_predicted, return_counts=True))

# ... versus the true class distribution.
t_Y = get_labels(t_Y)
labels = np.unique(t_Y)
print(np.unique(t_Y, return_counts=True))

Testing data has 1224629 observations
(array([0., 1., 2., 3., 4.]), array([243257, 970703,  10380,    283,      6]))
(array([0., 1., 2., 3., 4.]), array([243250, 970708,  10361,    297,     13]))


In [22]:
# Per-class precision / recall / f-score / support on the test split.
prfsarr = np.array(prfs(t_Y, t_Y_predicted, labels=labels))
df = pd.DataFrame(
    prfsarr.T,
    columns=['precison', 'recall', 'f_score', 'support'],
    index=labels,
)
print(df)
print("********************************************")
# Unweighted mean of every column across the classes.
print(df.mean())
print("********************************************")
accuracy = accuracy_score(t_Y, t_Y_predicted)
print("Accuracy is " + str(accuracy))

     precison    recall   f_score   support
0.0  0.999712  0.999741  0.999727  243250.0
1.0  0.999985  0.999979  0.999982  970708.0
2.0  0.996243  0.998070  0.997155   10361.0
3.0  0.961131  0.915825  0.937931     297.0
4.0  0.833333  0.384615  0.526316      13.0
********************************************
precison         0.958081
recall           0.859646
f_score          0.892222
support     244925.800000
dtype: float64
********************************************
Accuracy is 0.999888945958327


# Random forest classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Rebuild the training features/labels from the untouched train_data array.
X, Y = train_data[:, :-1], train_data[:, -1]
print(f"Training data has {X.shape[0]} observations.")
# Collapse the attack names into the five numeric categories.
Y_new = get_labels(Y)
# Drop feature columns 1-3 (presumably the non-numeric fields — verify).
X = np.delete(X, [1, 2, 3], axis=1)


Training data has 582967 observations.


In [25]:
# 300-tree random forest with balanced class weights and a fixed seed.
clf = RandomForestClassifier(
    n_estimators=300,
    random_state=0,
    class_weight='balanced',
)
clf.fit(X, Y_new)
# Score on the same data the forest was fitted on (training score only).
print(clf.score(X, Y_new))

0.9999931385481511


In [27]:
# ---------------------------------------------------------------
# Validation: evaluate the random forest on the validation split
# ---------------------------------------------------------------
X_val, Y_val = val_data[:, :-1], val_data[:, -1]
print(f"Validation data has {X_val.shape[0]} observations.")

# Same preprocessing as for the training data.
Y_val = get_labels(Y_val)
X_val = np.delete(X_val, [1, 2, 3], axis=1)

Y_pr = clf.predict(X_val)
labels = np.unique(Y_val)

accuracy = accuracy_score(Y_val, Y_pr)
# One row per metric (precision, recall, f-score, support); transposed below
# so that each class becomes a row.
prfsarr = np.array(prfs(Y_val, Y_pr, labels=labels))
df = pd.DataFrame(
    prfsarr.T,
    columns=['precison', 'recall', 'f_score', 'support'],
    index=labels,
)
df = df.sort_values(by='support', ascending=False)
print()
print(df)
print("********************************************")
print(df.mean())
print("********************************************")
print("Accuracy is " + str(accuracy))

Validation data has 318396 observations.

     precison    recall   f_score   support
0.0  0.999668  0.999953  0.999811  213983.0
1.0  0.999960  0.999920  0.999940   99676.0
2.0  0.999091  0.993448  0.996261    4426.0
3.0  0.989130  0.928571  0.957895     294.0
4.0  0.666667  0.117647  0.200000      17.0
********************************************
precison        0.930903
recall          0.807908
f_score         0.830781
support     63679.200000
dtype: float64
********************************************
Accuracy is 0.9997393183331449


In [28]:
#################################################################
#                    Testing                                    #
#################################################################
# The file holds mixed string/numeric data (presumably object dtype), which
# NumPy >= 1.16.3 only loads with allow_pickle=True.
# NOTE: unpickling can execute arbitrary code; fine for this locally
# generated file, never for untrusted input.
test_data = np.load('test_npy.npy', allow_pickle=True)
print(f'Testing data has {test_data.shape[0]} observations')

t_X, t_Y = test_data[:, :-1], test_data[:, -1]

# Drop the same feature columns that were removed from the training data.
t_X_new = np.delete(t_X, [1, 2, 3], axis=1)

t_Y_predicted = clf.predict(t_X_new)

# Predicted class distribution ...
print(np.unique(t_Y_predicted, return_counts=True))

# ... versus the true class distribution.
t_Y = get_labels(t_Y)
labels = np.unique(t_Y)
print(np.unique(t_Y, return_counts=True))

Testing data has 1224629 observations
(array([0., 1., 2., 3., 4.]), array([243283, 970702,  10363,    276,      5]))
(array([0., 1., 2., 3., 4.]), array([243250, 970708,  10361,    297,     13]))


In [29]:
# Per-class precision / recall / f-score / support on the test split.
prfsarr = np.array(prfs(t_Y, t_Y_predicted, labels=labels))
df = pd.DataFrame(
    prfsarr.T,
    columns=['precison', 'recall', 'f_score', 'support'],
    index=labels,
)
print(df)
print("********************************************")
# Unweighted mean of every column across the classes.
print(df.mean())
print("********************************************")
accuracy = accuracy_score(t_Y, t_Y_predicted)
print("Accuracy is " + str(accuracy))

     precison    recall   f_score   support
0.0  0.999745  0.999881  0.999813  243250.0
1.0  0.999997  0.999991  0.999994  970708.0
2.0  0.997491  0.997684  0.997587   10361.0
3.0  1.000000  0.929293  0.963351     297.0
4.0  1.000000  0.384615  0.555556      13.0
********************************************
precison         0.999447
recall           0.862293
f_score          0.903260
support     244925.800000
dtype: float64
********************************************
Accuracy is 0.9999256917809394


# PCA + Random Forest

In [62]:
# Rebuild the training features/labels from the untouched train_data array.
X, Y = train_data[:, :-1], train_data[:, -1]
print(f"Training data has {X.shape[0]} observations.")
# Collapse the attack names into the five numeric categories.
Y_new = get_labels(Y)
# Drop feature columns 1-3 (presumably the non-numeric fields — verify).
X = np.delete(X, [1, 2, 3], axis=1)


Training data has 582967 observations.


In [63]:
from sklearn.decomposition import PCA

# Project the training features onto 10 principal components.
# random_state is pinned so that, if PCA picks a randomized SVD solver, the
# projection (and everything trained on it) is the same on every run.
pca = PCA(n_components = 10, random_state = 0)
pca.fit(X)
# X is overwritten with its projection: re-run the preceding cell before
# re-running this one, otherwise PCA would be refitted on projected data.
X = pca.transform(X)

In [64]:
# Same forest configuration as before, trained on the PCA-projected features.
clf2 = RandomForestClassifier(
    n_estimators=300,
    random_state=0,
    class_weight='balanced',
)
clf2.fit(X, Y_new)
# Score on the same data the forest was fitted on (training score only).
print(clf2.score(X, Y_new))

0.9999931385481511


In [65]:
#################################################################
#                    validation                                 #
#################################################################
# The file holds mixed string/numeric data (presumably object dtype), which
# NumPy >= 1.16.3 only loads with allow_pickle=True.
# NOTE: unpickling can execute arbitrary code; fine for this locally
# generated file, never for untrusted input.
val_data = np.load('val_reduced.npy', allow_pickle=True)
X_val, Y_val = val_data[:, :-1], val_data[:, -1]
print(f"Validation data has {X_val.shape[0]} observations.")

# Same preprocessing as for the training data.
Y_val = get_labels(Y_val)
X_val = np.delete(X_val, [1, 2, 3], axis=1)

# Apply the projection fitted on the training data (transform only, no refit).
X_val = pca.transform(X_val)
Y_pr = clf2.predict(X_val)
labels = np.unique(Y_val)

accuracy = accuracy_score(Y_val, Y_pr)
# One row per metric (precision, recall, f-score, support); transposed below
# so that each class becomes a row.
prfsarr = np.array(prfs(Y_val, Y_pr, labels=labels))
df = pd.DataFrame(
    prfsarr.T,
    columns=['precison', 'recall', 'f_score', 'support'],
    index=labels,
).sort_values(by='support', ascending=False)
print()
print(df)
print("********************************************")
print(df.mean())
print("********************************************")
print("Accuracy is " + str(accuracy))

Validation data has 318396 observations.

     precison    recall   f_score   support
0.0  0.999710  0.999916  0.999813  213983.0
1.0  0.999709  0.999940  0.999824   99676.0
2.0  0.996355  0.988251  0.992287    4426.0
3.0  0.996377  0.935374  0.964912     294.0
4.0  0.750000  0.176471  0.285714      17.0
********************************************
precison        0.94843
recall          0.81999
f_score         0.84851
support     63679.20000
dtype: float64
********************************************
Accuracy is 0.9996576590158168


In [66]:
#################################################################
#                    Testing                                    #
#################################################################
# The file holds mixed string/numeric data (presumably object dtype), which
# NumPy >= 1.16.3 only loads with allow_pickle=True.
# NOTE: unpickling can execute arbitrary code; fine for this locally
# generated file, never for untrusted input.
test_data = np.load('test_npy.npy', allow_pickle=True)
print(f'Testing data has {test_data.shape[0]} observations')

t_X, t_Y = test_data[:, :-1], test_data[:, -1]

# Drop the same feature columns that were removed from the training data.
t_X_new = np.delete(t_X, [1, 2, 3], axis=1)

# Apply the projection fitted on the training data (transform only, no refit).
t_X_new = pca.transform(t_X_new)
t_Y_predicted = clf2.predict(t_X_new)

# Predicted class distribution ...
print(np.unique(t_Y_predicted, return_counts=True))

# ... versus the true class distribution.
t_Y = get_labels(t_Y)
labels = np.unique(t_Y)
print(np.unique(t_Y, return_counts=True))

Testing data has 1224629 observations
(array([0., 1., 2., 3., 4.]), array([243289, 970726,  10337,    271,      6]))
(array([0., 1., 2., 3., 4.]), array([243250, 970708,  10361,    297,     13]))


In [67]:
# Per-class precision / recall / f-score / support on the test split.
prfsarr = np.array(prfs(t_Y, t_Y_predicted, labels=labels))
df = pd.DataFrame(
    prfsarr.T,
    columns=['precison', 'recall', 'f_score', 'support'],
    index=labels,
)
print(df)
print("********************************************")
# Unweighted mean of every column across the classes.
print(df.mean())
print("********************************************")
accuracy = accuracy_score(t_Y, t_Y_predicted)
print("Accuracy is " + str(accuracy))

     precison    recall   f_score   support
0.0  0.999679  0.999840  0.999760  243250.0
1.0  0.999968  0.999987  0.999977  970708.0
2.0  0.996034  0.993726  0.994879   10361.0
3.0  1.000000  0.912458  0.954225     297.0
4.0  1.000000  0.461538  0.631579      13.0
********************************************
precison         0.999136
recall           0.873510
f_score          0.916084
support     244925.800000
dtype: float64
********************************************
Accuracy is 0.9998775139246253


# XGBoost Classification

In [12]:
import xgboost 

In [16]:
# Rebuild the training features/labels from the untouched train_data array.
X, Y = train_data[:, :-1], train_data[:, -1]
print(f"Training data has {X.shape[0]} observations.")
Y_new = get_labels(Y)
print(np.unique(Y_new))
# Drop feature columns 1-3 (presumably the non-numeric fields — verify).
X = np.delete(X, [1, 2, 3], axis=1)


# Earlier configuration, kept for reference:
# xgb = xgboost.XGBClassifier(objective ='reg:logistic',n_estimators=150, random_state=1,learning_rate=0.01)
xgb = xgboost.XGBClassifier(
    objective='multi:softmax',
    num_class=5,
    n_estimators=150,
    max_depth=8,
    random_state=1,
    learning_rate=0.1,
)

# Left as the final expression so the fitted estimator is shown as cell output.
xgb.fit(X, Y_new)



Training data has 582967 observations.
[0. 1. 2. 3. 4.]


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=8, min_child_weight=1, missing=None,
       n_estimators=150, n_jobs=1, nthread=None, num_class=5,
       objective='multi:softprob', random_state=1, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [17]:
#################################################################
#                    validation                                 #
#################################################################
# The file holds mixed string/numeric data (presumably object dtype), which
# NumPy >= 1.16.3 only loads with allow_pickle=True.
# NOTE: unpickling can execute arbitrary code; fine for this locally
# generated file, never for untrusted input.
val_data = np.load('val_reduced.npy', allow_pickle=True)
X_val, Y_val = val_data[:, :-1], val_data[:, -1]
print(f"Validation data has {X_val.shape[0]} observations.")

# Same preprocessing as for the training data.
Y_val = get_labels(Y_val)
X_val = np.delete(X_val, [1, 2, 3], axis=1)

Y_pr = xgb.predict(X_val)
labels = np.unique(Y_val)

accuracy = accuracy_score(Y_val, Y_pr)
# One row per metric (precision, recall, f-score, support); transposed below
# so that each class becomes a row.
prfsarr = np.array(prfs(Y_val, Y_pr, labels=labels))
df = pd.DataFrame(
    prfsarr.T,
    columns=['precison', 'recall', 'f_score', 'support'],
    index=labels,
).sort_values(by='support', ascending=False)
print()
print(df)
print("********************************************")
print(df.mean())
print("********************************************")
print("Accuracy is " + str(accuracy))

Validation data has 318396 observations.

     precison    recall   f_score   support
0.0  0.999780  0.999953  0.999867  213983.0
1.0  0.999970  0.999940  0.999955   99676.0
2.0  0.998416  0.996611  0.997512    4426.0
3.0  0.992883  0.948980  0.970435     294.0
4.0  0.750000  0.176471  0.285714      17.0
********************************************
precison        0.948210
recall          0.824391
f_score         0.850697
support     63679.200000
dtype: float64
********************************************
Accuracy is 0.9998115554215505


In [18]:
#################################################################
#                    Testing                                    #
#################################################################
# The file holds mixed string/numeric data (presumably object dtype), which
# NumPy >= 1.16.3 only loads with allow_pickle=True.
# NOTE: unpickling can execute arbitrary code; fine for this locally
# generated file, never for untrusted input.
test_data = np.load('test_npy.npy', allow_pickle=True)
print(f'Testing data has {test_data.shape[0]} observations')

t_X, t_Y = test_data[:, :-1], test_data[:, -1]

# Drop the same feature columns that were removed from the training data.
t_X_new = np.delete(t_X, [1, 2, 3], axis=1)

t_Y_predicted = xgb.predict(t_X_new)

# Predicted class distribution ...
print(np.unique(t_Y_predicted, return_counts=True))

# ... versus the true class distribution.
t_Y = get_labels(t_Y)
labels = np.unique(t_Y)

print(np.unique(t_Y, return_counts=True))
# Per-class precision / recall / f-score / support on the test split.
prfsarr = np.array(prfs(t_Y, t_Y_predicted, labels=labels))
df = pd.DataFrame(
    prfsarr.T,
    columns=['precison', 'recall', 'f_score', 'support'],
    index=labels,
)
print(df)
print("********************************************")
print(df.mean())
print("********************************************")
accuracy = accuracy_score(t_Y, t_Y_predicted)
print("Accuracy is " + str(accuracy))

Testing data has 1224629 observations
(array([0., 1., 2., 3., 4.]), array([243282, 970703,  10351,    287,      6]))
(array([0., 1., 2., 3., 4.]), array([243250, 970708,  10361,    297,     13]))
     precison    recall   f_score   support
0.0  0.999803  0.999934  0.999868  243250.0
1.0  0.999995  0.999990  0.999992  970708.0
2.0  0.998841  0.997877  0.998358   10361.0
3.0  0.996516  0.962963  0.979452     297.0
4.0  1.000000  0.461538  0.631579      13.0
********************************************
precison         0.999031
recall           0.884460
f_score          0.921850
support     244925.800000
dtype: float64
********************************************
Accuracy is 0.9999461061268351
