In [1]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from collections import defaultdict

def data_loader(f_name):
    with open(f_name, mode='r', encoding='utf-8') as f:
        data = list(set(f.readlines()))
        return data

NON_LABEL_FILE = 'dataset/level_3_non_label.csv'
DOC2VEC_MODEL_FILE = 'dataset/doc2vec'

STOP_WORDS = []

In [2]:
fmt_tag = "</*[a-zA-Z0-9]+|>"
fmt_html_escape = "&[a-zA-Z0-9]+;"
fmt_symbol = "=|:|;|\"|\\\\\\\\|\\\\|\(|\)|`|&"

fmt = "(%s|%s|%s)" %(fmt_tag, fmt_html_escape, fmt_symbol)

def filter_not_script(w):
    return (w[0] != "<") or (w == "<script")

def parse_text(text):
    text = text.lower()
    parsed = re.split(fmt, text.rstrip("\n"))
    # remove white space in head and tail
    parsed = map(lambda x : x.strip(), parsed)
    # remove empty string
    parsed = filter(None, parsed)
    # filter not <script tag
    parsed = filter(filter_not_script, parsed)
    # remove ">"
    parsed = filter(lambda x : x != ">", parsed)
    return list(parsed)

In [3]:
data = data_loader(NON_LABEL_FILE)

In [4]:
# https://qiita.com/Ikeda_yu/items/94247d819e6a0808d0b7

from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

trainings = [TaggedDocument(words = parse_text(d), tags = [i]) for i,d in enumerate(data)]

# トレーニング（パラメータについては後日）
m = Doc2Vec(documents= trainings, dm = 1, vector_size=300, window=8, min_count=5, workers=4)

# モデルのセーブ
m.save(DOC2VEC_MODEL_FILE)

In [5]:
# m = Doc2Vec.load(DOC2VEC_MODEL_FILE)

In [6]:
#ベクトルをリストに格納
vectors_list=[m.docvecs[n] for n in range(len(m.docvecs))]

#ドキュメント番号のリスト
doc_nums=range(200,200+len(m.docvecs))

#クラスタリング設定
#クラスター数を変えたい場合はn_clustersを変えてください
n_clusters = 2
kmeans_model = KMeans(n_clusters=n_clusters, verbose=1, random_state=1, n_jobs=-1)

#クラスタリング実行
kmeans_model.fit(vectors_list)

#クラスタリングデータにラベル付け
labels=kmeans_model.labels_

labels

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,

In [7]:
#ラベルとドキュメント番号の辞書づくり
cluster_to_docs = defaultdict(list)
for cluster_id, doc_num in zip(labels, doc_nums):
    cluster_to_docs[cluster_id].append(doc_num)

#クラスター出力

for docs in cluster_to_docs.values():
    print(docs)


[200, 201, 202, 203, 204, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 231, 232, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 285, 286, 287, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 305, 306, 307, 308, 310, 311, 312, 313, 315, 316, 317, 318, 319, 320, 322, 325, 326, 327, 329, 330, 331, 332, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 351, 352, 353, 354, 355, 358, 359, 360, 362, 363, 364, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 384, 385, 386, 387, 388, 390, 391, 392, 393, 394, 395, 396, 397, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 411, 412, 414, 415, 416, 417, 420, 421, 423, 424, 425, 426, 428, 430, 431,

In [8]:
cluster0 = [data[i-200] for i in cluster_to_docs[0]]
cluster1 = [data[i-200] for i in cluster_to_docs[1]]

In [9]:
cluster0[0:20]

['"onmouseover="alert(decodeURI(decodeURI(/%58%53%53/.source)))"\n',
 '"><frame src="javascript:alert`1`">\n',
 'Type <code><b>echo hi &gt;t</b></code>\n',
 'Another useful command is <code>apache -k stop</code>.\n',
 '<SELECT NAME="" onmouseover=alert(1)></select>\n',
 'If necessary, start apache, either as a Win2000 service (using Admin Tools | Computer Management, or by typing <code>apache -k start -n apache</code>\n',
 '<strong>2. Install Apache</strong>\n',
 '&#39;><s><img src="javascript:alert(1)>\n',
 '<svg><style><img/src=x onerror=alert(1)// </b>\n',
 'The TWiki <a href="http://TWiki.org">standard installation</a>\n',
 '</iframe> /></textarea><video><source onerror=alert();\n',
 '<progress </caption><video><source <body onerror=alert();\n',
 '<a href="#Server_Requirements">Server Requirements</a>\n',
 '<div/style="width:expression(alert(1))">x</div>\n',
 "<strong>This applies only if you have root access:</strong> on hosted accounts, you shouldn't have this problem - otherwise

In [10]:
cluster1[0:20]

["!#%&)(*+,-./:;=?@]\\[^_'}|{~&#39;<s>&#39;><script>alert(1)</script>\n",
 '&#39;<s>&#39;><img src="javascript:alert(1)&#39;>\n',
 '&#39;<s>&#39;><script>alert(1)</script>\n',
 '<embed/<a/onload=alert();?n<script type="text/javascript">\n',
 '/></track><video><source onerror=javascript:window.onerror=alert();script:al\\u0065rt();\n',
 "!#%&)(*+,-./:;=?@]\\[^_'}|{~<s><script>alert(1)</script>\n",
 '<img src="http://TWiki.org/p/pub/TWiki/TWikiDocGraphics/tip.gif" border="0" alt="TIP" width="16" height="16" />\n',
 'Specify <code><b>c:\\</b></code> as the installation directory - this actually installs Apache into <code><b>c:\\apache</b></code> (if you specify <code>c:\\apache</code>\n',
 '<frameset><frame/onload=alert(1); />\n',
 '";\\r\\nalert`1`;//\n',
 '<tr><th bgcolor="#99CCCC"> <strong>Resource</strong> </th><th bgcolor="#99CCCC">\n',
 '<embed/onload=al?u0065rt();?n<script type="text/javascript">\n',
 '\\r\\nalert`1`;//\n',
 '<div style="color:&#x65;xpression(alert(1));">a</div>\n',

In [16]:
# https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

tfidf_vectorizer = TfidfVectorizer(tokenizer=parse_text)
tfidf = tfidf_vectorizer.fit_transform(data)

kmeans = KMeans(n_clusters=2).fit_predict(tfidf)

for doc, cls in zip(data, kmeans):
    print("%d : %s" % (cls, doc))

1 : icon=<iframe/<body onload=?u0061lert();>

0 : <a href="http://www.apache.org/dyn/closer.cgi" target="_top">http://www.apache.org/dyn/closer.cgi</a>

1 : " onmousemove=alert(1); 

1 : <img src=# <keygen onerror=?u0061lert``;>

0 : "onSelectStart="&#x00077indow['&#x00061lert']('&#x00058SS')"

1 : <img src=x onerror=alert(1);>

0 : <strong>Security issue:</strong> Directories <code>twiki/data</code> , <code>twiki/templates</code>

1 : javascript:\u0061lert();</script><frameset><basefont/</td>

0 : <h2><a name="Recent_updates"> Recent updates </a></h2>

0 : <h2><a name="Server_Requirements"> Server Requirements </a></h2>

1 : "; alert(1); //

1 : "onKeyPress="/127641/['constructor']['constructor']('al'+'ert'+'('+'''+1+'''+')')()

1 : %EF%BC%9Cscript%EF%BC%9Ealert(1)%EF%BC%9C/script%EF%BC%9E

1 : <svg/onload=al?u0065rt();?n<script type="text/javascript">

0 : 15 Jun 2002 - various notes on Cygwin installation and troubleshooting: use of 'Unix' as default text file type (i.e. for mountin