In [128]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from collections import defaultdict

def data_loader(f_name, l_name):
    with open(f_name, mode='r', encoding='utf-8') as f:
        data = list(set(f.readlines()))
        label = [l_name for i in range(len(data))]
        return data, label

XSS_TRAIN_FILE = 'dataset/train_level_1.csv'
XSS_TEST_FILE = 'dataset/test_level_2.csv'
XSS2_TRAIN_FILE = 'dataset/train_level_2.csv'
XSS2_TEST_FILE = 'dataset/test_level_1.csv'
NORMAL_TRAIN_FILE = 'dataset/normal.csv'
NORMAL_TEST_FILE = 'dataset/normal.csv'

NON_LABEL_FILE = 'dataset/level_3_non_label.csv'
LABEL_FILE = 'dataset/level_3_label.csv'
DOC2VEC_MODEL_FILE = 'dataset/doc2vec'

STOP_WORDS = []

ll = ["xss", "normal"]

def data_label_loader(f_name):
    data, _ = data_loader(f_name, "")
    label = [ll[int(d[-2])] for d in data]
    data = [d[:-2] for d in data]
    return data, label
    

In [129]:
FMT_URL = "https?://"
FMT_TAG = "</*[a-zA-Z0-9]+|>"
FMT_HTML_ESCAPE = "&[a-zA-Z0-9]+;"
FMT_SYMBOL = "=|:|;|\"|\\\\\\\\|\\\\|\(|\)|`|&|#"

FORMAT = "(%s|%s|%s|%s)" %(FMT_URL, FMT_TAG, FMT_HTML_ESCAPE, FMT_SYMBOL)


ZEN = "".join(chr(0xff01 + i) for i in range(94))
HAN = "".join(chr(0x21 + i) for i in range(94))

ZEN2HAN = str.maketrans(ZEN, HAN)

def filter_not_script(w):
    return (w[0] != "<") or (w == "<script")

def preprocess_text(text):
    text = text.lower()
    text = text.rstrip("\n")
    text = text.translate(ZEN2HAN)
    return text

def parse_text(text):
    text = preprocess_text(text)
    parsed = re.split(FORMAT, text)
    # remove white space in head and tail
    parsed = map(lambda x : x.strip(), parsed)
    # remove empty string
    parsed = filter(None, parsed)
    # filter not <script tag
    parsed = filter(filter_not_script, parsed)
    # remove ">"
    parsed = filter(lambda x : x != ">", parsed)
    return list(parsed)

In [130]:
data, _ = data_loader(NON_LABEL_FILE, "none")
data2, label2 = data_label_loader(LABEL_FILE)

xss_train_data, xss_train_label = data_loader(XSS_TRAIN_FILE, 'xss')
xss_test_data, xss_test_label = data_loader(XSS_TEST_FILE, 'xss')
xss2_train_data, xss2_train_label = data_loader(XSS2_TRAIN_FILE, 'xss')
xss2_test_data, xss2_test_label = data_loader(XSS2_TEST_FILE, 'xss')
normal_train_data, normal_train_label = data_loader(NORMAL_TRAIN_FILE, 'normal')
normal_test_data, normal_test_label = data_loader(NORMAL_TEST_FILE, 'normal')

X_train = xss_train_data + normal_train_data + xss2_train_data
y_train = xss_train_label + normal_train_label + xss2_train_label
X_test = xss_test_data + normal_test_data + xss2_test_data
y_test = xss_test_label + normal_test_label + xss2_test_label

X = X_train + X_test
Y = y_train + y_test

X = data2
Y = label2

In [111]:
# https://qiita.com/Ikeda_yu/items/94247d819e6a0808d0b7

from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

trainings = [TaggedDocument(words = parse_text(d), tags = [i]) for i,d in enumerate(X)]

# トレーニング（パラメータについては後日）
m = Doc2Vec(documents= trainings, dm = 1, vector_size=300, window=8, min_count=5, workers=4)

# モデルのセーブ
m.save(DOC2VEC_MODEL_FILE)

In [112]:
# m = Doc2Vec.load(DOC2VEC_MODEL_FILE)

In [113]:
#ベクトルをリストに格納
vectors_list=[m.docvecs[n] for n in range(len(m.docvecs))]

#ドキュメント番号のリスト
doc_nums=range(200,200+len(m.docvecs))

#クラスタリング設定
#クラスター数を変えたい場合はn_clustersを変えてください
n_clusters = 2
kmeans = KMeans(n_clusters=2).fit_predict(vectors_list)

In [114]:
#ラベルとドキュメント番号の辞書づくり
cluster_to_docs = defaultdict(list)
for cluster_id, doc_num in zip(labels, doc_nums):
    cluster_to_docs[cluster_id].append(doc_num)



In [115]:
vv = ["normal", "xss"]

pred = [vv[i] for i in kmeans]

acc_score = accuracy_score(Y, pred)
conf_mat = confusion_matrix(
    pred, Y, labels=['xss', 'normal']
)
print("=====================================")
print(" RESULT")
print("=====================================")
print("acc: \n", acc_score)
print("confusion matrix: \n", conf_mat)
print()

 RESULT
acc: 
 0.5255102040816326
confusion matrix: 
 [[ 31  18]
 [168 175]]



In [116]:
# https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

tfidf_vectorizer = TfidfVectorizer(tokenizer=parse_text)



tfidf = tfidf_vectorizer.fit_transform(X)

kmeans = KMeans(n_clusters=2).fit_predict(tfidf)

In [117]:
vv = ["normal", "xss"]

pred = [vv[i] for i in kmeans]

acc_score = accuracy_score(Y, pred)
conf_mat = confusion_matrix(
    pred, Y, labels=['xss', 'normal']
)
print("=====================================")
print(" RESULT")
print("=====================================")
print("acc: \n", acc_score)
print("confusion matrix: \n", conf_mat)
print()

for x, y, result in zip(X, Y, pred):
        if y != result:
            print('[actual: \"%s\" == predict: \"%s\"] : %r' % (y, result, x))
            print("parsed: ", parse_text(x))
            print()

 RESULT
acc: 
 0.9413265306122449
confusion matrix: 
 [[176   0]
 [ 23 193]]

[actual: "xss" == predict: "normal"] : '<object/data=evil.html #</a>\t'
parsed:  ['/data', '=', 'evil.html', '#']

[actual: "xss" == predict: "normal"] : '"onKeyDown="document.write(\'<x73cript>x61lert(x22XSSx22)</x73cript>\')"\t'
parsed:  ['"', 'onkeydown', '=', '"', 'document.write', '(', "'", 'x61lert', '(', 'x22xssx22', ')', "'", ')', '"']

[actual: "xss" == predict: "normal"] : '"ondblclick="&#00119indow[\'&#0097lert\'](\'&#0088SS\')"\t'
parsed:  ['"', 'ondblclick', '=', '"', '&', '#', "00119indow['", '&', '#', "0097lert']", '(', "'", '&', '#', "0088ss'", ')', '"']

[actual: "xss" == predict: "normal"] : '"onSelectStart="&#x00077indow[\'&#x00061lert\'](\'&#x00058SS\')"\t'
parsed:  ['"', 'onselectstart', '=', '"', '&', '#', "x00077indow['", '&', '#', "x00061lert']", '(', "'", '&', '#', "x00058ss'", ')', '"']

[actual: "xss" == predict: "normal"] : '"onKeyPress="u0077indow[\'u0061lert\'](\'u0058SS\')" \t'


In [138]:
from sklearn.cluster import SpectralClustering

clustering = SpectralClustering(n_clusters=2,random_state=0).fit_predict(tfidf)

In [141]:
vv = ["xss", "normal"]

maxx = 0
agrs = []
current_arg = []


for affinity in ["rbf"]: # , "precomputed"
    for assign_labels in ["discretize"]:
        for gamma in np.arange(-0.1, 1.0, 0.1):
            for n_neighbors in range(1, 5, 2):
                for degree in range(-5, 5, 3):
                    for coef0 in np.arange(0.01, 0.1, 0.05):
                        current_arg = [affinity, assign_labels, gamma, n_neighbors, degree, coef0]
                        clustering = SpectralClustering(n_clusters=2, random_state=1, affinity=affinity, assign_labels=assign_labels,
                                                       gamma=gamma, n_neighbors=n_neighbors, degree=degree, coef0=coef0).fit_predict(tfidf)
                        pred = [vv[i] for i in clustering]
                        acc_score = accuracy_score(Y, pred)
                        if acc_score < 0.5:
                            acc_score = 1 - acc_score
                        print("score :", acc_score)
                        if acc_score > maxx:
                            print("update : ", acc_score)
                            maxx = acc_score
                            args = current_arg

                if affinity == "rbf":
                    break
            if affinity == "nearest_neighbors":
                break



score : 0.5357142857142857
update :  0.5357142857142857
score : 0.5357142857142857
score : 0.5357142857142857
score : 0.5357142857142857
score : 0.5357142857142857
score : 0.5357142857142857
score : 0.5357142857142857
score : 0.5357142857142857
score : 0.5153061224489796
score : 0.5153061224489796
score : 0.5153061224489796
score : 0.5153061224489796
score : 0.5153061224489796
score : 0.5153061224489796
score : 0.5153061224489796
score : 0.5153061224489796
score : 0.8673469387755102
update :  0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.8673469387755102
score : 0.86734693877551

In [142]:
maxx

0.8673469387755102

In [143]:
args

['rbf', 'discretize', 0.1, 1, -5, 0.01]

In [139]:
vv = ["xss", "normal"]

pred = [vv[i] for i in clustering]

acc_score = accuracy_score(Y, pred)
conf_mat = confusion_matrix(
    pred, Y, labels=['xss', 'normal']
)
print("=====================================")
print(" RESULT")
print("=====================================")
print("acc: \n", acc_score)
print("confusion matrix: \n", conf_mat)
print()

for x, y, result in zip(X, Y, pred):
        if y != result:
            print('[actual: \"%s\" == predict: \"%s\"] : %r' % (y, result, x))
            print("parsed: ", parse_text(x))
            print()

 RESULT
acc: 
 0.17346938775510204
confusion matrix: 
 [[ 68 193]
 [131   0]]

[actual: "normal" == predict: "xss"] : '<code>Redirect <font color="red">/urlpathto/twiki/index.html</font> http://<font color="red">yourdomain.com/urlpathto/twiki/bin/</font>view</code> <br>\t'
parsed:  ['redirect', 'color', '=', '"', 'red', '"', '/urlpathto/twiki/index.html', 'http://', 'color', '=', '"', 'red', '"', 'yourdomain.com/urlpathto/twiki/bin/', 'view']

[actual: "normal" == predict: "xss"] : '<code>apache -k restart -n apache</code> for Apache running as a Win2000 service (-n gives name of service)\t'
parsed:  ['apache -k restart -n apache', 'for apache running as a win2000 service', '(', '-n gives name of service', ')']

[actual: "xss" == predict: "normal"] : 'icon=<iframe/<body onload=?u0061lert();>\t'
parsed:  ['icon', '=', '/', 'onload', '=', '?u0061lert', '(', ')', ';']

[actual: "normal" == predict: "xss"] : '<h2><a name="Client_Requirements"> Client Requirements </a></h2>\t'
parsed:  ['na

In [120]:
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(n_clusters=2).fit_predict(tfidf.toarray())

In [122]:
vv = ["xss", "normal"]

pred = [vv[i] for i in clustering]

acc_score = accuracy_score(Y, pred)
conf_mat = confusion_matrix(
    pred, Y, labels=['xss', 'normal']
)
print("=====================================")
print(" RESULT")
print("=====================================")
print("acc: \n", acc_score)
print("confusion matrix: \n", conf_mat)
print()

for x, y, result in zip(X, Y, pred):
        if y != result:
            print('[actual: \"%s\" == predict: \"%s\"] : %r' % (y, result, x))
            print("parsed: ", parse_text(x))
            print()

 RESULT
acc: 
 0.9209183673469388
confusion matrix: 
 [[169   1]
 [ 30 192]]

[actual: "xss" == predict: "normal"] : '<object/data=evil.html #</a>\t'
parsed:  ['/data', '=', 'evil.html', '#']

[actual: "xss" == predict: "normal"] : '"onKeyDown="document.write(\'<x73cript>x61lert(x22XSSx22)</x73cript>\')"\t'
parsed:  ['"', 'onkeydown', '=', '"', 'document.write', '(', "'", 'x61lert', '(', 'x22xssx22', ')', "'", ')', '"']

[actual: "xss" == predict: "normal"] : '"onSelectStart="/123441/[\'cons\'+\'truc\'+\'tor\'][\'cons\'+\'truc\'+\'tor\'](\'a\'+\'l\'+\'e\'+\'r\'+\'t\'+\'(\'+\'\'\'+\'XS\'+\'S\'+\'\'\'+\')\')()\t'
parsed:  ['"', 'onselectstart', '=', '"', "/123441/['cons'+'truc'+'tor']['cons'+'truc'+'tor']", '(', "'a'+'l'+'e'+'r'+'t'+'", '(', "'+'''+'xs'+'s'+'''+'", ')', "'", ')', '(', ')']

[actual: "normal" == predict: "xss"] : '<script language="JavaScript" type="text/javascript">\t'
parsed:  ['<script', 'language', '=', '"', 'javascript', '"', 'type', '=', '"', 'text/javascript', '"']

In [123]:
from sklearn.mixture import GaussianMixture

clustering = GaussianMixture(n_components=2).fit_predict(tfidf.toarray())

In [125]:
vv = ["xss", "normal"]

pred = [vv[i] for i in clustering]

acc_score = accuracy_score(Y, pred)
conf_mat = confusion_matrix(
    pred, Y, labels=['xss', 'normal']
)
print("=====================================")
print(" RESULT")
print("=====================================")
print("acc: \n", acc_score)
print("confusion matrix: \n", conf_mat)
print()

for x, y, result in zip(X, Y, pred):
        if y != result:
            print('[actual: \"%s\" == predict: \"%s\"] : %r' % (y, result, x))
            print("parsed: ", parse_text(x))
            print()

 RESULT
acc: 
 0.625
confusion matrix: 
 [[138  86]
 [ 61 107]]

[actual: "xss" == predict: "normal"] : '"onMouseLeave="alert(1)"\t'
parsed:  ['"', 'onmouseleave', '=', '"', 'alert', '(', '1', ')', '"']

[actual: "xss" == predict: "normal"] : '"onMouseEnter="alert(1)" "onMouseDown="alert(1)"\t'
parsed:  ['"', 'onmouseenter', '=', '"', 'alert', '(', '1', ')', '"', '"', 'onmousedown', '=', '"', 'alert', '(', '1', ')', '"']

[actual: "normal" == predict: "xss"] : '<code>apache -k restart -n apache</code> for Apache running as a Win2000 service (-n gives name of service)\t'
parsed:  ['apache -k restart -n apache', 'for apache running as a win2000 service', '(', '-n gives name of service', ')']

[actual: "xss" == predict: "normal"] : '"onMouseMove="alert(1)" "onFocusOut="alert(1)"\t'
parsed:  ['"', 'onmousemove', '=', '"', 'alert', '(', '1', ')', '"', '"', 'onfocusout', '=', '"', 'alert', '(', '1', ')', '"']

[actual: "xss" == predict: "normal"] : '<ScrIpt>alert(1);</SCript>\t'
parsed:  ['<