Extracting raw data from *_dump.tx files.

In [1]:
from uuid import UUID
from ast import literal_eval

from parse_tree_node import ParseTreeNode      

with open("datasets/xss_dump.tx", 'r') as xss:
    raw_xss_train = eval(xss.read())
    xss_trees_train = []
    for entry in raw_xss_train:
        xss_trees_train.append(ParseTreeNode.from_dict(entry['req_tree']['tree']))

with open("datasets/sqli_dump.tx", 'r') as sqli:
    raw_sqli_train = eval(sqli.read())
    sqli_trees_train = []
    for entry in raw_sqli_train:
        sqli_trees_train.append(ParseTreeNode.from_dict(entry['req_tree']['tree']))

with open("datasets/cominj_dump.tx", 'r') as cominj:
    raw_cominj_train = eval(cominj.read())
    cominj_trees_train = []
    for entry in raw_cominj_train:
        cominj_trees_train.append(ParseTreeNode.from_dict(entry['req_tree']['tree']))
        
with open("test_datasets/xss_dump.tx", 'r') as xss:
    raw_xss_test = eval(xss.read())
    xss_trees_test = []
    for entry in raw_xss_test:
        xss_trees_test.append(ParseTreeNode.from_dict(entry['req_tree']['tree']))
        
with open("test_datasets/sqli_dump.tx", 'r') as sqli:
    raw_sqli_test = eval(sqli.read())
    sqli_trees_test = []
    for entry in raw_sqli_test:
        sqli_trees_test.append(ParseTreeNode.from_dict(entry['req_tree']['tree']))
        
with open("test_datasets/cominj_dump.tx", 'r') as cominj:
    raw_cominj_test = eval(cominj.read())
    cominj_trees_test = []
    for entry in raw_cominj_test:
        cominj_trees_test.append(ParseTreeNode.from_dict(entry['req_tree']['tree']))

Prepocessing request trees.

In [2]:
from random import shuffle
from urllib import unquote_plus

# size of a special class
CLASS_SIZE_TRAIN = 1124
XSS_CLASS_SIZE_TEST = len(xss_trees_test)
SQLI_CLASS_SIZE_TEST = len(sqli_trees_test)
COMINJ_CLASS_SIZE_TEST = len(cominj_trees_test)

# checking the path begins with a subpath from the list_of_subpaths
def contains_subpath(path, list_of_subpaths):
    for subpath in list_of_subpaths:
        if path[:len(subpath)] == subpath:
            return True
    return False

# extracting values from fields specified by useful_paths
def extracting_useful_fields(tree, useful_paths):
    feature_string = ""
    for path, node in tree.walk():
        if node.is_leaf() and contains_subpath(path, useful_paths):
            feature_string += node.value + ' '
    return feature_string[:-1]

# building a corpus of useful strings from trees
def building_corpus(trees, useful_paths):
    corpus = [extracting_useful_fields(entry, useful_paths) for entry in trees]
    return corpus
   
all_trees_train = xss_trees_train + sqli_trees_train + cominj_trees_train
all_trees_test = xss_trees_test + sqli_trees_test + cominj_trees_test
    
# extracting all potentially useful fields from trees    
useful_paths = [['url', 'query'], ['headers', 'user-agent'], ['headers', 'referer'], ['headers', 'cookie'], ['body']]
full_corpus_train = building_corpus(all_trees_train, useful_paths)
full_corpus_test = building_corpus(all_trees_test, useful_paths)

# extracting ['url', 'query'] fields from trees
url_query_corpus_train = building_corpus(all_trees_train, [['url', 'query']])
url_query_corpus_test = building_corpus(all_trees_test, [['url', 'query']])

# extracting ['headers', 'user-agent'] fields from trees
user_agent_corpus_train = building_corpus(all_trees_train, [['headers', 'user-agent']])
user_agent_corpus_test = building_corpus(all_trees_test, [['headers', 'user-agent']])

# extracting ['headers', 'referer'] fields from trees
referer_corpus_train = building_corpus(all_trees_train, [['headers', 'referer']])
referer_corpus_test = building_corpus(all_trees_test, [['headers', 'referer']])

# extracting ['headers', 'cookie'] fields from trees
cookie_corpus_train = building_corpus(all_trees_train, [['headers', 'cookie']])
cookie_corpus_test = building_corpus(all_trees_test, [['headers', 'cookie']])

# extracting ['body'] fields from trees
body_corpus_train = building_corpus(all_trees_train, [['body']])
body_corpus_test = building_corpus(all_trees_test, [['body']])

# class labels list
labels_corpus_train = [0 for i in range(CLASS_SIZE_TRAIN)] + [1 for i in range(CLASS_SIZE_TRAIN)] + [2 for i in range(CLASS_SIZE_TRAIN)]
labels_corpus_test = [0 for i in range(XSS_CLASS_SIZE_TEST)] + [1 for i in range(SQLI_CLASS_SIZE_TEST)] + [2 for i in range(COMINJ_CLASS_SIZE_TEST)]
labels_names = ["XSS", "SQL Injection", "Command Injection"]

# mixing entries in corpuses
zip_corpus_train = zip(full_corpus_train, url_query_corpus_train, user_agent_corpus_train, referer_corpus_train, 
                       cookie_corpus_train, body_corpus_train, labels_corpus_train)
shuffle(zip_corpus_train)
full_corpus_train, url_query_corpus_train, user_agent_corpus_train, referer_corpus_train, cookie_corpus_train, body_corpus_train, labels_corpus_train = zip(*zip_corpus_train)

zip_corpus_test = zip(full_corpus_test, url_query_corpus_test, user_agent_corpus_test, referer_corpus_test, 
                       cookie_corpus_test, body_corpus_test, labels_corpus_test)
shuffle(zip_corpus_test)
full_corpus_test, url_query_corpus_test, user_agent_corpus_test, referer_corpus_test, cookie_corpus_test, body_corpus_test, labels_corpus_test = zip(*zip_corpus_test)

160
275
71


Extracting features from train corpuses.

In [33]:
import re
from sklearn.feature_extraction.text import CountVectorizer

# crawler specific words - delete from resulting tokens
stop_words = ['gq435t372gnkq2inkmh2j5p444', '3fi7p4bkf6ug0qtrd2erlon453', 'cominj', 'sqli', 'xss']

# preprocessing includes lowercasing and decoding urlencode for feature_string - fix this comment
def preprocessing_feature_string(feature_string):
    feature_string = feature_string.lower()
    while True:
        unquote_feature_string = unquote_plus(feature_string)
        if unquote_feature_string != feature_string:
            feature_string = unquote_feature_string
        else:
            break
            
    # replacing all numbers by 'NUM'
    # feature_string = re.sub(r'(?u)\b\d+\b', u'NUM', feature_string)
    
    feature_string = re.sub(r'(?u)sqlmap/1\.2\.4\.10#dev \(http://sqlmap\.org\)', u' ', feature_string)
    feature_string = re.sub(r'(?u)commix/v2\.4-dev#27 \(http://commixproject\.com\)', u' ', feature_string)
    feature_string = re.sub(r'(?u)http://10\.0\.0\.2:80/bwapp/xss_get\.php', u' ', feature_string)
    feature_string = re.sub(r'(?u)http://10\.0\.0\.2:80/bwapp/xss_post\.php', u' ', feature_string)
    
    feature_string = re.sub(r'(?u)\b\d+\b', u' ', feature_string)
    
    feature_string = re.sub(r'(?u)http://www\.w3\.org/ /xml-events', u' ', feature_string)
    feature_string = re.sub(r'(?u)http://www\.w3\.org/ /svg', u' ', feature_string)
    
    # print feature_string
    
    return feature_string

# maybe useful: analyzer - char n-grams from raw string
# extracting features from full_corpus: words...
vectorizer_words = CountVectorizer(decode_error='ignore', preprocessor=preprocessing_feature_string, ngram_range=(1,2), stop_words=stop_words, token_pattern=u'(?u)\\b\\w+\\b', binary=True)
full_words_matrix_train = vectorizer_words.fit_transform(full_corpus_train)
# ...and nonalphabetic simbols
vectorizer_nonalphabetic = CountVectorizer(decode_error='ignore', preprocessor=preprocessing_feature_string, token_pattern=u'(?u)[^\\w\\s]', binary=True)
full_nonalphabetic_matrix_train = vectorizer_nonalphabetic.fit_transform(full_corpus_train)

# extracting features from url_query_corpus - corpus that contains only parameter values from the url:query fields
vectorizer_url_query = CountVectorizer(decode_error='ignore', preprocessor=preprocessing_feature_string, stop_words=stop_words, token_pattern=u'(?u)\\b\\w+\\b', binary=True)
url_query_matrix_train = vectorizer_url_query.fit_transform(url_query_corpus_train)

# extracting features from user_agent_corpus - corpus that contains only parameter values from the headers:user-agent fields
vectorizer_user_agent = CountVectorizer(decode_error='ignore', preprocessor=preprocessing_feature_string, stop_words=stop_words, token_pattern=u'(?u)\\b\\w+\\b', binary=True)
user_agent_matrix_train = vectorizer_user_agent.fit_transform(user_agent_corpus_train)

# extracting features from referer_corpus - corpus that contains only parameter values from the headers:referer fields
vectorizer_referer = CountVectorizer(decode_error='ignore', preprocessor=preprocessing_feature_string, stop_words=stop_words, token_pattern=u'(?u)\\b\\w+\\b', binary=True)
referer_matrix_train = vectorizer_referer.fit_transform(referer_corpus_train)

# extracting features from cookie_corpus - corpus that contains only parameter values from the headers:cookie fields
vectorizer_cookie = CountVectorizer(decode_error='ignore', preprocessor=preprocessing_feature_string, stop_words=stop_words, token_pattern=u'(?u)\\b\\w+\\b', binary=True)
cookie_matrix_train = vectorizer_cookie.fit_transform(cookie_corpus_train)

# extracting features from body_corpus - corpus that contains only parameter values from the body fields
vectorizer_body = CountVectorizer(decode_error='ignore', preprocessor=preprocessing_feature_string, stop_words=stop_words, token_pattern=u'(?u)\\b\\w+\\b', binary=True)
body_matrix_train = vectorizer_body.fit_transform(body_corpus_train)

Extracting features from test corpuses.

In [34]:
# extracting features from full_corpus: words...
full_words_matrix_test = vectorizer_words.transform(full_corpus_test)
# ...and nonalphabetic simbols
full_nonalphabetic_matrix_test = vectorizer_nonalphabetic.transform(full_corpus_test)

# extracting features from url_query_corpus - corpus that contains only parameter values from the url:query fields
url_query_matrix_test = vectorizer_url_query.transform(url_query_corpus_test)

# extracting features from user_agent_corpus - corpus that contains only parameter values from the headers:user-agent fields
user_agent_matrix_test = vectorizer_user_agent.transform(user_agent_corpus_test)

# extracting features from referer_corpus - corpus that contains only parameter values from the headers:referer fields
referer_matrix_test = vectorizer_referer.transform(referer_corpus_test)

# extracting features from cookie_corpus - corpus that contains only parameter values from the headers:cookie fields
cookie_matrix_test = vectorizer_cookie.transform(cookie_corpus_test)

# extracting features from body_corpus - corpus that contains only parameter values from the body fields
body_matrix_test = vectorizer_body.transform(body_corpus_test)

Combining the feature matrices.

In [35]:
from scipy.sparse import csr_matrix, hstack

def get_all_feature_names():
    all_feature_names = []
    all_feature_names += vectorizer_words.get_feature_names() + vectorizer_nonalphabetic.get_feature_names()
    all_feature_names += ['url:query:' + feature_name for feature_name in vectorizer_url_query.get_feature_names()]
    all_feature_names += ['headers:user-agent:' + feature_name for feature_name in vectorizer_user_agent.get_feature_names()]
    all_feature_names += ['headers:referer:' + feature_name for feature_name in vectorizer_referer.get_feature_names()]
    all_feature_names += ['headers:cookie:' + feature_name for feature_name in vectorizer_cookie.get_feature_names()]
    all_feature_names += ['body:' + feature_name for feature_name in vectorizer_body.get_feature_names()]
    return all_feature_names

features_matrix_train = hstack([full_words_matrix_train, full_nonalphabetic_matrix_train, url_query_matrix_train, 
                                user_agent_matrix_train, referer_matrix_train, cookie_matrix_train, body_matrix_train], format='csr')

features_matrix_test = hstack([full_words_matrix_test, full_nonalphabetic_matrix_test, url_query_matrix_test, 
                                user_agent_matrix_test, referer_matrix_test, cookie_matrix_test, body_matrix_test], format='csr')
'''

features_matrix_train = hstack([full_words_matrix_train, url_query_matrix_train, 
                                user_agent_matrix_train, referer_matrix_train, cookie_matrix_train, body_matrix_train], format='csr')

features_matrix_test = hstack([full_words_matrix_test, url_query_matrix_test, 
                                user_agent_matrix_test, referer_matrix_test, cookie_matrix_test, body_matrix_test], format='csr')
''' 

# labels column in Compressed Sparse Column format
labels_train = csr_matrix(list(labels_corpus_train)).T
labels_test = csr_matrix(list(labels_corpus_test)).T

# printing all feature names
''' features = get_all_feature_names()
for feature in features:
    print feature '''

' features = get_all_feature_names()\nfor feature in features:\n    print feature '

Classification and testing.

In [36]:
from numpy import argsort
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

# printng features with the highest coefficient values, per class
def top20_features(classifier, labels_names):
    feature_names = get_all_feature_names()
    for i, label_name in enumerate(labels_names):
        top20 = argsort(classifier.coef_[i])[-20:]
        print("%s: %s" % (label_name, " | ".join(feature_names[j] for j in top20)))

def train_and_test(classifier, classifier_name):
    if isinstance(classifier, MLPClassifier):
        classifier.fit(features_matrix_train, labels_train.toarray().ravel())
    else:
        classifier.fit(features_matrix_train, labels_train.toarray())
    
    labels_prediction = classifier.predict(features_matrix_test)
    positive = 0
    test_len = len(labels_corpus_test)
    for i in range(test_len):
        if labels_corpus_test[i] == labels_prediction[i]:
            positive += 1
    
    result = classifier_name + ':\n'
    result +=  'ratio: ' + str(positive) + ':' + str(test_len) + '\n'
    result +=  'score: ' + str(classifier.score(features_matrix_test, labels_test.toarray())) + '\n'
        
    print result
        
    top20_features(classifier, labels_names)
        
    print
        
train_and_test(MultinomialNB(), 'Multinomial Naive Bayes')
train_and_test(LogisticRegression(dual=True), 'Logistic Regression (liblinear)')
train_and_test(LogisticRegression(solver='lbfgs', multi_class='multinomial'), 'Logistic Regression (lbfgs)')
train_and_test(LogisticRegression(solver='newton-cg', multi_class='multinomial'), 'Logistic Regression (newton-cg)')
train_and_test(LogisticRegression(solver='sag', multi_class='multinomial'), 'Logistic Regression (sag)')
train_and_test(LogisticRegression(solver='saga', multi_class='multinomial'), 'Logistic Regression (saga)')
train_and_test(LinearSVC(dual=True), 'Linear SVC (ovr)')
train_and_test(LinearSVC(multi_class='crammer_singer'), 'Linear SVC (multi_class)')
''' train_and_test(MLPClassifier(), 'MLPClassifier') '''

Multinomial Naive Bayes:
ratio: 83:506
score: 0.16403162055335968

XSS: headers:referer:script | ; | ( | ) | url:query:javascript | headers:cookie:src | headers:referer:src | headers:user-agent:src | body:javascript | src | headers:user-agent:javascript | headers:referer:javascript | headers:cookie:javascript | javascript | : | " | / | = | < | >
SQL Injection: headers:cookie:script | ( | url:query:javascript | ) | script | - | src | headers:referer:src | headers:user-agent:src | ; | headers:cookie:javascript | headers:user-agent:javascript | headers:referer:javascript | javascript | : | " | / | = | > | <
Command Injection: headers:cookie:script | headers:user-agent:style | headers:referer:style | style | headers:cookie:style | url:query:javascript | - | ; | javascript | headers:cookie:javascript | headers:referer:javascript | headers:user-agent:javascript | ( | ) | " | : | / | = | > | <

Logistic Regression (liblinear):
ratio: 72:506
score: 0.1422924901185771

XSS: svg script | 8de9f5e

" train_and_test(MLPClassifier(), 'MLPClassifier') "