In [15]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import os
from scipy.spatial import distance

In [16]:
# Data read and shuffle
data_path = './data/kddcup.data_10_percent_shuffled'
if os.path.exists(data_path) == False:
    print("Creating new shuffled data....")
    data = pd.read_csv('./data/kddcup.data_10_percent')
    data = data.sample(frac = 1)
    data.to_csv(data_path, header=False, index=False)
else:
    data = pd.read_csv(data_path)
    
data = data.to_numpy()
print("Data shape :" , data.shape)

Data shape : (494019, 42)


In [17]:
# visualize the labels
labels = []
for i in range(0, data.shape[0], 1):
    if data[i, 41] not in labels:
        labels.append(data[i, 41])
print(labels)

['smurf.', 'normal.', 'neptune.', 'nmap.', 'warezclient.', 'teardrop.', 'satan.', 'back.', 'pod.', 'portsweep.', 'ipsweep.', 'guess_passwd.', 'perl.', 'land.', 'rootkit.', 'warezmaster.', 'buffer_overflow.', 'imap.', 'ftp_write.', 'multihop.', 'loadmodule.', 'spy.', 'phf.']


In [18]:
# Count the amount of normal(benign) traffic
def count(string):
    count = 0
    for i in range(0, data.shape[0], 1):
        if data[i, 41] == string:
            count = count + 1
    return count

print('Amount of normal (benign) data :', count("normal."))
# amount of data is diffetent from original paper. Why??

Amount of normal (benign) data : 97277


In [19]:
# Pick the normal (benign) traffic
def pick(string):
    data_target = data[0:count(string), :].copy()
    index = 0
    for i in range(0, data.shape[0], 1):
        if data[i, 41] == string:
            data_target[index, :] = data[i, :].copy()
            index = index + 1
    return data_target

data_normal = pick('normal.')

print("Data shape (normal only) :", data_normal.shape)

Data shape (normal only) : (97277, 42)


In [20]:
# Pick the continuous features
def reduct(data):
    data_target = np.delete(data, [1, 2, 3, 6, 7, 11, 19, 20, 21, 41], axis = 1) 
    #7 and 19 are the features which var = 0, if we include this features normalization would be impossible.
    data_target = data_target.astype('float32')
    return data_target

data_normal_reducted = reduct(data_normal)
print("Pick the continuous features : ", data_normal_reducted.shape)

# Room to improve and fix...
# Cannot understand which 32 features should I select
# In the paper "the TAMs of the different types of traffic records are generated using 32 continuous features.""
# Refer this https://kdd.ics.uci.edu/databases/kddcup99/kddcup.names

Pick the continuous features :  (97277, 32)


In [21]:
# Normalize
def normalization(df):
    df_norm = (df - df.mean(axis= 0)) / df.std(ddof = 0, axis = 0)
    return df_norm, df.mean(axis= 0), df.std(ddof = 0, axis = 0) # normalized data, mean, std

In [22]:
# Cartesian coordinate
area_length = data_normal_reducted.shape[1]
amount = count('normal.')
print("amount of samples :", amount)

cov_length = int(1/2 * (1 + (area_length - 1)) * (area_length - 1))
coordinate = np.zeros((cov_length, 2), "int32")
print("coordinate :", coordinate.shape)
index = 0
for j in range(1, area_length, 1):
    for k in range(j, area_length, 1):
        coordinate[index] = np.array([k, j-1])
        index = index + 1

amount of samples : 97277
coordinate : (496, 2)


In [23]:
# TAM lower
def tam_lower(data):
    tam = np.zeros((data.shape[0], cov_length),  "float32")
    for i in range(0, data.shape[0], 1):
        for j in range(0, cov_length, 1):
            tam[i, j] = data[i, coordinate[j, 0]] * data[i, coordinate[j, 1]] / 2
    return tam

# tam = tam_lower(data_normal_reducted)
# print("TAM shape:", tam.shape)

In [24]:
# Calc Covariance of TAM using lower TAM.
def covariance(tam, save_name):
    ave_tam = np.average(tam, axis = 0)
    #print('average of TAM :', ave_tam.shape)
    sum_tam = np.zeros((cov_length, cov_length), "float32")
    flag_tam = np.zeros((cov_length, cov_length), "int8")
    num_path = "./out_num/" + save_name

    if os.path.exists(num_path) == False:
        """
        for jk in range(0, cov_length, 1):
            for lv in range(0, cov_length, 1):
                if flag_tam[jk, lv] == 0:
                    for n in range(0, tam.shape[0], 1):
                        sum_tam[jk, lv] = sum_tam[jk, lv] + ((tam[n, jk] - ave_tam[jk]) * (tam[n, lv] - ave_tam[lv]))
                    flag_tam[lv, jk] = 1
                else:
                    sum_tam[jk, lv] = flag_tam[lv, jk]

        cov = sum_tam * (1 / (tam.shape[0] - 1))
        """
        cov_sum = np.zeros((tam.shape[1], tam.shape[1]), 'float32')
        for i in range(0, 10, 1):
            n = int(tam.shape[0] / 10)
            cov_sum = cov_sum + np.cov(tam[i*n:(i+1)*n, :].T)
        cov = cov_sum / 10
        #print('Cov shape :', cov.shape)
        np.save(num_path, cov)

    else :
        cov = np.load(num_path)

    return cov
    

In [30]:
# Evaluate the system
def evaluation(labels, alpha, ave_normal, std_normal, ave_tam, start, end, mean, std):
    evaluate = np.zeros((len(labels), len(alpha)))
    
    # Evaluate all the sample belonging to labels
    for i in range(0, len(labels), 1):
        data = pick(labels[i])
        data = data[:100, :] # cut
        data_reducted = reduct(data)
        if labels[i] == "normal.":
            data_reducted = data_reducted[start:end]
        
        # each labels
        for j in range(0, data_reducted.shape[0], 1):
            observed = data_reducted[j]
            observed = (observed - mean) / std
            tam_observed = np.zeros((cov_length,))
            
            # each samples
            for k in range(0, cov_length, 1):
                tam_observed[k] = observed[coordinate[k, 0]] * observed[coordinate[k, 1]]  / 2
            
            # print(tam_observed)
            md_observed = distance.mahalanobis(tam_observed, ave_tam, cov_inv)
            # md_observed = np.dot(np.dot((tam_observed - ave_tam).T, cov_inv), (tam_observed - ave_tam)) ** (1/2)
            # print(md_observed)
            
            # each alpha
            for l in range(0, len(alpha), 1):
                if labels[i] == 'normal.':
                    if ((ave_normal - (std_normal * alpha[l])) <= md_observed) and (md_observed <= (ave_normal + (std_normal * alpha[l]))):
                        evaluate[i, l] = evaluate[i, l] + 0.
                    else :
                        evaluate[i, l] = evaluate[i, l] + 1. # 0.       
                else:
                    if ((ave_normal - (std_normal * alpha[l])) <= md_observed) and (md_observed <= (ave_normal + (std_normal * alpha[l]))):
                        evaluate[i, l] = evaluate[i, l] + 0. # 0.
                    else :
                        evaluate[i, l] = evaluate[i, l] + 1.
                        
        evaluate[i, :] = evaluate[i, :] / float(data_reducted.shape[0])
        
    return evaluate

In [32]:
# Evaluate by using 10 Cross Validation
k = 10
eva_labels = ['normal.', 'teardrop.', 'smurf.', 'pod.', 'neptune.', 'land.', 'back.']
alpha = [1.0, 1.5, 2.0, 2.5, 3.0]
evaluate = np.zeros((10, len(eva_labels), len(alpha)))

for i in range(0, k, 1):
    # Divide train data & validation data
    start = i * int(amount / k)
    end = (i + 1) * int(amount / k)
    train_data = np.concatenate([data_normal_reducted[:start, :], data_normal_reducted[end:, :]])
    train_data_normalized, mean, std = normalization(train_data)
    train_tam = tam_lower(train_data_normalized)
    ave_tam = np.average(train_tam, axis = 0)
    #val_tam = tam[start:end, :]
    
    # Calc cov from train data
    # print(i, ": Calc COV. takes time...")
    cov = covariance(train_tam, str(i) + '_norm.npy')
    
    # Clac MD distance
    cov_inv = np.linalg.pinv(cov)
    md = np.zeros((train_tam.shape[0], ))
    for j in range(0, train_tam.shape[0], 1):
        md[j] = distance.mahalanobis(train_tam[j], ave_tam, cov_inv)
        # md[j] = np.dot(np.dot((train_tam[j] - ave_tam).T, cov_inv), (train_tam[j] - ave_tam)) ** (1/2)
    ave_normal = np.average(md, axis = 0)
    std_normal = np.std(md, axis = 0, ddof=1)
    #print(var_normal)
    
    # Evaluate
    print("Evaluating ...")
    result = evaluation(eva_labels, alpha, ave_normal, std_normal, ave_tam, start, end, mean, std)
    evaluate[i] =  result
    break

out = np.average(evaluate, axis = 0)

np.save('./results/evaluate.npy', evaluate)
np.save('./results/out.npy', out)

Evaluating ...


In [33]:
#np.set_printoptions(precision=3, floatmode='fixed', suppress=True)
print(out*10*100)

[[  3.   2.   1.   1.   1.]
 [ 63.  57.  47.  39.  35.]
 [100. 100. 100. 100. 100.]
 [ 55.  55.  46.  24.  16.]
 [100. 100. 100. 100. 100.]
 [100. 100. 100. 100. 100.]
 [100. 100. 100. 100. 100.]]
