In [1]:
import urllib.parse as urlparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import mixture
from scipy.cluster.vq import whiten
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [2]:
normal_file_raw = 'normalTrafficTraining.txt'
normalTest_file_raw = 'normalTrafficTest.txt'
anomaly_file_raw = 'anomalousTrafficTest.txt'

In [3]:
#load dataset and process features

In [4]:
def load_dataset(file_name):
    file = open(file_name)
    doc = file.readlines()

    parsed_requests = []
    for i in range(len(doc)):
        line = doc[i].strip()
        content_length = 0
        if line.startswith("GET"):
            parsed_requests.append("GET: " + line.split(" ")[1] + " " + str(content_length))
        elif line.startswith("POST") or line.startswith("PUT"):
            url = line.split(' ')[0] + " " + line.split(' ')[1]
            j = 1
            while True:
                if doc[i + j].startswith("Content-Length"):
                    string = doc[i + j].split(' ')
                    content_length = string[1]
                    break
                j += 1
            j += 1
            data = doc[i + j + 1].strip()
            url += '?' + data
            parsed_requests.append(url + " " + content_length)

    file.close()

    data_set = np.zeros((len(parsed_requests), 12))
    counter = 0

    for request in parsed_requests:

        url = request.split(" ")
        content_length = int(url[2])
        parsed = urlparse.urlparse(url[1])
        path = parsed.path
        request_arguments = list(urlparse.parse_qs(parsed.query).values())

        length_of_arguments = 0
        number_of_arguments = len(request_arguments)
        length_of_query = len(parsed.query)
        number_of_letter_chars_in_path = sum(c.isalpha() for c in set(list(parsed.query)))

        number_of_digits_in_arguments = 0
        number_of_letters_in_arguments = 0
        number_of_special_chars_in_args = 0
        arg_lens = [0]
        equal_char = 0
        for a in request_arguments:
            numbers = sum(c.isdigit() for c in a[0])
            letters = sum(c.isalpha() for c in a[0])
            number_of_digits_in_arguments += numbers
            number_of_letters_in_arguments += letters
            number_of_special_chars_in_args += sum(not c.isdigit() and not c.isalpha() for c in a[0])
            length_of_arguments += len(a[0])
            arg_lens.append(len(a[0]))
            equal_char += sum(c == "=" for c in a[0])

        max_arg_len = max(arg_lens)
        if not number_of_arguments == 0:
            avg_arg_len = length_of_arguments / number_of_arguments
        else:
            avg_arg_len = 0

        length_of_path = len(path)
        path = path.replace("/", "")
        length_of_request = length_of_query + length_of_path
        number_of_special_chars_in_path = sum(not c.isdigit() and not c.isalpha() for c in set(list(path)))

        data_set[counter] = np.array(
            [max_arg_len, number_of_arguments, content_length,  number_of_digits_in_arguments,
             number_of_special_chars_in_args, length_of_arguments,  number_of_letters_in_arguments,
             length_of_request,  number_of_special_chars_in_path, avg_arg_len, number_of_letter_chars_in_path,
             length_of_path]
        )

        counter += 1
    return data_set

In [28]:
def get_acc_and_f1(y_test, y_pred):
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred).ravel()
    Precision = (TP * 1.0) / (TP + FP)
    Recall = (TP * 1.0) / (TP + FN)
    ACC = (TP + TN) * 1.0 / (TP + TN + FP + FN)
    F1 = 2.0 * Precision * Recall / (Precision + Recall)
    return [ACC, F1]

In [5]:
train_set = load_dataset(normal_file_raw)
testset_good = load_dataset(normalTest_file_raw)
testset_bad = load_dataset(anomaly_file_raw)

In [6]:
#data whitening

In [7]:
train_set = whiten(train_set)
testset_good = whiten((testset_good))
testset_bad = whiten(testset_bad)



In [None]:
#Do parameter tuning:

In [88]:
X_train, X_val, y_train, y_test = train_test_split(train_set, [0]*train_set.shape[0], test_size=0.3) 

In [None]:
F1_prev = 0
n_components = 1
for number_of_components in range(1, 10):
    gmm = mixture.GaussianMixture(n_components=number_of_components, covariance_type='full').fit(X_train)
    pred = gmm.score_samples(X_train)
    treshold = np.min(pred)
    misclasses = 0
    FP = 1
    FN = 0
    TP = 1
    TN = 0
    for sample in X_val:
        pred3 = gmm.score_samples(sample.reshape(1, -1))
        if pred3 < treshold:
            misclasses += 1
            FN += 1
        else:
            TN += 1
    Precision = (TP * 1.0) / (TP + FP)
    Recall = (TP * 1.0) / (TP + FN)
    F1 = (2*Precision*Recall)/(Precision+Recall)
    if F1<=F1_prev:
        n_components = number_of_components
        break
    else:
        F1_prev = F1

In [78]:
print("number of mixture components: ", n_components)

number of mixture components:  3


In [79]:
#Learn distribution of normal requests (EM algorithm is used)

In [80]:
gmm = mixture.GaussianMixture(n_components=n_components, covariance_type='full').fit(train_set)

In [81]:
#select treshold : the 

In [82]:
pred = gmm.score_samples(train_set)
treshold = np.min(pred)

In [83]:
#Compute TP, TN, FP, FN, accuracy

In [84]:
misclasses = 0
FP = 0
FN = 0
TP =0
TN = 0
max_pred = treshold
for sample in testset_bad:
    pred2 = gmm.score_samples(sample.reshape(1, -1))
    if pred2 > treshold:
        misclasses += 1
        FP += 1
    else:
        TP += 1
    if pred2 > max_pred:
        max_pred = pred2

for sample in testset_good:
    pred3 = gmm.score_samples(sample.reshape(1, -1))
    if pred3 < treshold:
        misclasses += 1
        FN += 1
    else:
        TN += 1

In [85]:
#print information about the classifier performance:

In [86]:
print("number of misclassifications ", misclasses)
print("accuracy: ", 1 - misclasses / (testset_good.shape[0] + testset_good.shape[0]))
Precision = (TP * 1.0) / (TP + FP)
Recall = (TP * 1.0) / (TP + FN)
print("F1 ", (2*Precision*Recall)/(Precision+Recall))

number of misclassifications  0
accuracy:  1.0
F1  1.0


In [9]:
#One should get accuracy and F1 measure in range [0.999,1.0], where 1.0 indicates perfect classifier

In [10]:
#Other option is to use a surpevised classifier, as suggested:

In [11]:
#Concatenate all three datasets:

In [12]:
all_data = np.concatenate((testset_bad, train_set, testset_good))

In [13]:
#Make target labels:

In [14]:
yBad = [1] * testset_bad.shape[0]
yGood = [0] * (train_set.shape[0] + testset_good.shape[0])
y = yBad + yGood

In [15]:
#divide data onto training set and test set. 60% in traing set, 40% in test set
#There are 25065 samples of anomalous http requests, all datasets contain together 97065 samples,
#40% of all data means 38826.0 samples. So in both datasets, training and test set will be samples from both classes

In [16]:
 X_train, X_test, y_train, y_test = train_test_split(all_data, y, test_size=0.4)

In [17]:
#Make a surpevised logistic regression classifier:
#Logistic regression is a simple classifier, but it can provide us a baseline and also give us an information
#whether the classification problem is linearly separable or not

In [40]:
lgs = LogisticRegression()
lgs.fit(X_train, y_train)
y_pred = lgs.predict(X_test)

In [41]:
#evaluate classifier:

In [42]:
[ACC, F1] = get_acc_and_f1(y_test, y_pred)

In [43]:
#show accuracy and F1 measure

In [44]:
print("accuracy: ", ACC, " F1:", F1)

accuracy:  0.9961195054945055  F1: 0.9923550504025438


In [None]:
#As the accuracy and F1 are over 0.99, it indicates, that the two classes are linearly separable

In [33]:
#With SVM using a linear kernel, it is possible to get 100% accuracy:

In [34]:
svc = svm.LinearSVC()
parameters = {'C': [0.001, 0.1, 1,  100, 1000]}
clf = GridSearchCV(svc, parameters, cv=10)

In [35]:
#Tune the hyperparameter C with 10-fold crossvalidation using grid search algorithm
#and classify the samples in test set

In [25]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [39]:
[ACC, F1] = get_acc_and_f1(y_test, y_pred)

In [38]:
print("accuracy: ", ACC, " F1:", F1)

accuracy:  1.0  F1: 1.0


In [None]:
#SVM: The primal classification problem is transformed onto a dual problem (Lagrangian duality).
#The dual problem is a convex optimization problem and leads to Quadratic Programming.
#For a QP problems, it is always possible to find a global optima.