In [1]:
import numpy as np
import os
import sys
import pandas as pd
from sklearn.utils import class_weight

# Loading dataset

In [2]:
def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

In [3]:
def readlines(fname, start, end):
    file = open(fname)
    batch = []
    r = end - start
    for i in range(start):
        line = file.readline()
    for i in range(r):
        line = file.readline()
        batch.append(line.split(','))
    file.close()
    return batch

In [4]:
def rmreadlines(fname, size):
    Nobs = file_len(fname)
    choices = np.sort(np.random.choice(Nobs,size, replace=False))
    
    k = 0
    
    file = open(fname)
    batch = []
    
    
    for i in range(Nobs):

        line = file.readline()
        if(k<choices.shape[0]):
            if(choices[k] == i):
                batch.append(line.split(','))
                k = k+1
            else:
                pass
        else:
            break;
    
    file.close()
    return batch

In [5]:
def split_file(fname, ratio, trainname, testname):
    Nobs = file_len(fname)
    tfp = int(Nobs * ratio)
    choices_test = np.sort(np.random.choice(Nobs,tfp, replace=False))
    k = 0
    file = open(fname)
    file_tr = open(trainname, 'w')
    file_ts = open(testname, 'w')
    uniq = []
    
    for i in range(Nobs):
        
        line = file.readline()
        
        var = line.split(',')
        size = len(var)
        val = var[size-1]
        
        if(val in uniq):
            pass
        else:
            file_ts.write(line)
            uniq.append(val)
            
        if(k<choices_test.shape[0]):
            if(choices_test[k] == i):
                file_ts.write(line)
                k = k+1
            else:
                file_tr.write(line)
        else:
            file_tr.write(line)
            
    file_ts.close()
    file_tr.close()
    file.close()
    return Nobs, choices_test.shape[0]

In [1]:
# Already splited dont repeat...
test_split = False
if(test_split):
    Nobs, test_size = split_file('kddcup.data.corrected', 0.25, 'train_val','test')

In [7]:
val_split = False
if(val_split):
    Nobs, test_size = split_file('train_val', 0.3333, 'train','val')

# load training data

In [51]:
# file names
trainfile = 'train'
testfile = 'test'
valfile = 'val'

In [52]:
train_size = file_len(trainfile)
print('Training data has',train_size, 'observations')
test_size = file_len(testfile)
print('Test data has',test_size, 'observations')
val_size = file_len(valfile)
print('Val data has',val_size, 'observations')

Training data has 2449339 observations
Test data has 1224630 observations
Val data has 1224508 observations


In [10]:
def get_labels(Y):
    Y_cat = np.ones(Y.shape)
    Y_cat[Y == 'normal.\n'] = 0
    DOS_types = ['back.\n', 'land.\n', 'neptune.\n', 'pod.\n', 'smurf.\n', 'teardrop.\n']
    Probe_types = ['ipsweep.\n', 'nmap.\n', 'portsweep.\n', 'satan.\n']
    R2L_types = ['ftp_write.\n', 'guess_passwd.\n', 'imap.\n', 'multihop.\n', 'phf.\n', 'spy.\n', 'warezclient.\n', 'warezmaster.\n']
    U2R_types = ['buffer_overflow.\n', 'loadmodule.\n', 'perl.\n', 'rootkit.\n']
    for index, i in enumerate(Y):
        if i in DOS_types:
            Y_cat[index] = 1 
        if i in Probe_types:
            Y_cat[index] = 2 
        if i in R2L_types:
            Y_cat[index] = 3
        if i in U2R_types:
            Y_cat[index] = 4 
    return Y_cat
    

# Pytorch NN model

In [56]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [57]:

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(38, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64,16)
        self.fc5 = nn.Linear(16,5)
        

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x


In [58]:
import torch.optim as optim
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.003)

In [59]:
net = Net()

epochs = 500
running_loss = 0
train_losses, test_losses = [], []

for epoch in range(epochs):  # loop over the dataset multiple times
    
    running_loss = 0.0
    batch_size_ = 100000
    test_loss = 0.0
    accuracy = 0
    correct = 0
    total = 0
    batch = rmreadlines(trainfile, batch_size_)
    batchdf = pd.DataFrame(np.array(batch)).astype(float, errors='ignore')
    for col in batchdf.columns:
        batchdf[col] = batchdf[col].astype('float64', errors = 'ignore') 

    batcharr = batchdf.values
    
    batch_size = 10000
    for itr in range(batcharr.shape[0]//batch_size ):  #iter over whole batch
        
#         print(batcharr.shape)
        X = batcharr[itr*batch_size:(itr+1)*batch_size,:-1]
        Y = batcharr[itr*batch_size:(itr+1)*batch_size,-1]


        Y_cat = get_labels(Y)
        X_new = np.delete(X,[1,2,3],1)
#         print(np.unique(Y_cat))
        cw_cat = class_weight.compute_class_weight('balanced', np.unique(Y_cat), Y_cat)
        cws_cat = np.arange(5)*0.0
        cws_cat[np.unique(Y_cat).astype(int)] = cw_cat
        cws_cat = torch.FloatTensor(cws_cat)
        
        # get the inputs; data is a list of [inputs, labels]
        X_new = X_new.astype(np.float32)
        inputs = Variable(torch.from_numpy((X_new)))
        labels = Variable(torch.from_numpy((Y_cat)).long())
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        criterion = nn.CrossEntropyLoss(weight = cws_cat) 
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if(itr%batch_size==0):
            test_loss = 0.0
            accuracy = 0
            correct = 0
            total = 0
            net.eval()

            with torch.no_grad():
                batch_val = rmreadlines(valfile, int(batch_size))
                batchdf_val = pd.DataFrame(np.array(batch_val)).astype(float, errors='ignore')
                for col in batchdf_val.columns:
                    batchdf_val[col] = batchdf_val[col].astype('float64', errors = 'ignore') 

                batcharr_val = batchdf_val.values
                X_val = batcharr_val[:,:-1]
                Y_val = batcharr_val[:,-1]

                Y_cat_val = get_labels(Y_val)
                X_new_val = np.delete(X_val,[1,2,3],1)

                X_new_val = X_new_val.astype(np.float32)

                inputs_val = Variable(torch.from_numpy((X_new_val)))
                labels_val = Variable(torch.from_numpy((Y_cat_val)).long())
                outputs = net.forward(inputs_val)
                criterion = nn.CrossEntropyLoss()
                batch_loss = criterion(outputs, labels_val)
                test_loss += batch_loss.item()

                _, predicted = torch.max(outputs.data, 1)
                total += labels_val.size(0)
                correct += (predicted == labels_val).sum().item()

            train_losses.append(running_loss/batch_size)
            test_losses.append(test_loss/batch_size) 
            print(f"Epoch {epoch+1}/{epochs}.. "
                  f"Train loss: {running_loss/batch_size:.3f}.. "
                  f"Val loss: {test_loss/total:.3f}.. "
                  f"Val accuracy: {correct/total:.3f}")

            net.train()

print('Finished Training')
PATH = 'pytorch_model_2.pth'
torch.save(net.state_dict(), PATH)

Exception ignored in: <bound method Booster.__del__ of <xgboost.core.Booster object at 0x7f1fef7aa550>>
Traceback (most recent call last):
  File "/home/parul/anaconda3/lib/python3.6/site-packages/xgboost/core.py", line 957, in __del__
    if self.handle is not None:
AttributeError: 'Booster' object has no attribute 'handle'


Epoch 1/500.. Train loss: 0.001.. Val loss: 0.001.. Val accuracy: 0.575
Epoch 2/500.. Train loss: 0.002.. Val loss: 0.001.. Val accuracy: 0.580
Epoch 3/500.. Train loss: 0.001.. Val loss: 0.001.. Val accuracy: 0.577
Epoch 4/500.. Train loss: 0.001.. Val loss: 0.001.. Val accuracy: 0.573
Epoch 5/500.. Train loss: 0.002.. Val loss: 0.001.. Val accuracy: 0.579
Epoch 6/500.. Train loss: 0.004.. Val loss: 0.001.. Val accuracy: 0.583
Epoch 7/500.. Train loss: 0.001.. Val loss: 0.001.. Val accuracy: 0.575
Epoch 8/500.. Train loss: 0.001.. Val loss: 0.001.. Val accuracy: 0.576
Epoch 9/500.. Train loss: 0.002.. Val loss: 0.001.. Val accuracy: 0.577
Epoch 10/500.. Train loss: 0.001.. Val loss: 0.001.. Val accuracy: 0.591
Epoch 11/500.. Train loss: 0.001.. Val loss: 0.001.. Val accuracy: 0.572
Epoch 12/500.. Train loss: 0.001.. Val loss: 0.001.. Val accuracy: 0.567
Epoch 13/500.. Train loss: 0.002.. Val loss: 0.001.. Val accuracy: 0.578
Epoch 14/500.. Train loss: 0.002.. Val loss: 0.001.. Val acc

KeyboardInterrupt: 

# Test

In [16]:
from sklearn.metrics import precision_recall_fscore_support as prfs

In [17]:
test_size = file_len(testfile)
print('Testing data has',test_size, 'observations')

Testing data has 1224630 observations


In [28]:
PATH = 'pytorch_model2.pth'  # load pytorch saved model
batch_size = 600000
currpos = 0
test_loss=0
test_accuracy=0
all_prf = []
total = 0
correct = 0
while(True):
    
    nextpos = currpos + batch_size
    if(nextpos > test_size-1):
        nextpos = test_size-1
        
    print("curr_test_data : " ,currpos,':', nextpos)
    batch = readlines(testfile, currpos, nextpos)
    batchdf = pd.DataFrame(np.array(batch)).astype(float, errors='ignore')

    for col in batchdf.columns:
        batchdf[col] = batchdf[col].astype('float64', errors = 'ignore') 

    batcharr = batchdf.values

    X = batcharr[:,:-1]
    Y = batcharr[:,-1]

    Y_cat = get_labels(Y)
    batch_labels = np.unique(Y_cat)
    
    X_new = np.delete(X,[1,2,3],1)
    X_new = X_new.astype(np.float32)
    
    inputs_val = Variable(torch.from_numpy((X_new)))
    labels_val = Variable(torch.from_numpy((Y_cat)).long())
    net = Net()
    net.load_state_dict(torch.load(PATH))
    outputs = net(inputs_val)
    
    criterion = nn.CrossEntropyLoss()
    batch_loss = criterion(outputs, labels_val)
    test_loss += batch_loss.item()
    
    _, predicted = torch.max(outputs.data, 1)
    total += labels_val.size(0)
    correct += (predicted == labels_val).sum().item()
    
    prf_array = (prfs(labels_val, predicted, labels = batch_labels))
    prf_array = np.array(prf_array)
    df = pd.DataFrame(prf_array.T, columns = ['precison', 'recall', 'f_score', 'support'], index = batch_labels).sort_values(by = 'support', ascending = False)
    print(df.mean())
    all_prf.append(df.mean().values)
    if(nextpos == test_size - 1):
        break;
    else:
        currpos += batch_size
    print("------------------------------------------------")

print("Total loss :::: %s ,Accuracy ::: %s "%(str(test_loss/total),str(correct/total)))

curr_test_data :  0 : 600000
precison         0.599885
recall           0.592795
f_score          0.594209
support     120000.000000
dtype: float64
------------------------------------------------
curr_test_data :  600000 : 1200000
precison         0.533413
recall           0.598570
f_score          0.559575
support     120000.000000
dtype: float64
------------------------------------------------
curr_test_data :  1200000 : 1224629
precison       0.918909
recall         0.997300
f_score        0.952533
support     8209.666667
dtype: float64
Total loss :::: 4.440615642229138e-07 ,Accuracy ::: 0.9976188706947166 


In [30]:
precisions=[]
recalls=[]
f_scores=[]
supports=[]
for i in range(len(all_prf)):
    precisions.append(all_prf[i][0])
    recalls.append(all_prf[i][1])
    f_scores.append(all_prf[i][2])
    supports.append(all_prf[i][3])
print("Avg Precision :: "+str(np.mean(np.array(precisions))))
print("Avg Recall :: "+str(np.mean(np.array(recalls))))
print("Avg F score :: "+str(np.mean(np.array(f_scores))))
print("Avg Accuracy :: "+str(correct/total))

Avg Precision :: 0.6840690847250445
Avg Recall :: 0.7295547959682698
Avg F score :: 0.7021057084462706
Avg Accuracy :: 0.9976188706947166
