# Сравнение алгоритмов кластеризации по метрикам качества кластеризации

In [1]:
import json
import csv
import time
import logging
import sys
import os
import gc

import numpy as np

from itertools import groupby
from collections import Counter

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AgglomerativeClustering

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

from sklearn import metrics
from gensim import corpora
from collections import defaultdict
from gensim.sklearn_api import LdaTransformer
from scipy.sparse import csr_matrix

In [2]:
run_time = int(time.time())

In [3]:
log_path = '/data/logs/%s.log' % run_time
print("log path: %s" % log_path)

root = logging.getLogger()
root.setLevel(logging.INFO)

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('[%(asctime)s] %(levelname)-5s %(message)s')
ch.setFormatter(formatter)
root.addHandler(ch)

fh = logging.FileHandler(log_path, 'w', encoding='utf8')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
root.addHandler(fh)

log path: /data/logs/1514117172.log


In [4]:
# printt = print
printt = logging.info

## Подготовка данных для кластеризации

In [5]:
class News:
    def __init__(self, id, date, title, content, url, siteType):
        self.id = id
        self.date = date
        self.title = title
        self.content = content
        self.url = url
        self.siteType = siteType
    
    @classmethod
    def from_json(cls, json_str):
        json_dict = json.loads(json_str)
        return cls(**json_dict)

## Загрузка тестовой выборки

In [6]:
news = []
with open('/data/10k.test.normalized.json', encoding="utf8") as f:
    for line in f:
        news.append(News.from_json(line))

In [7]:
words = []

for n in news:
    words.extend(n.content.split())
counts = Counter(words)
one_time = [k for k, v in dict(counts).items() if v < 2 and v > 10000]
printt("total words: %s" % (len(words) - len(one_time)))

news_content = [x.content for x in news]

[2017-12-24 12:06:14,071] INFO  total words: 2740504


In [8]:
stopwords = one_time
with open('/data/stopwords.txt', encoding="utf8") as f:
    for line in f:
        stopwords.append(line)
printt("stop words: %s" % (len(stopwords)))

[2017-12-24 12:06:14,079] INFO  stop words: 230


In [9]:
texts = [[word for word in document.lower().split()] for document in news_content]
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

## Загрузка размеченной выборки

In [10]:
marked_map = {} # (id, label)
with open('/data/mark_news.csv', encoding="utf8") as csvfile:
    spamreader = csv.reader(csvfile, delimiter=';')
    for row in spamreader:
        marked_map[row[0]] = int(row[3])

marked_news = []
for n in news:
    label = marked_map[n.id]
    marked_news.append((n.id, label))
    
marked_labels = [label for n_id, label in marked_news]

In [11]:
marked_map['170bf9b9-d62d-437e-a75a-cac7b7c9f282']

40

# Вспомогательные функции

In [12]:
def zip_news(n,l):
    return list(map(assign_label_to_news, zip(n, l)))

def assign_label_to_news(tuplezz):
    (nws, lbl) = tuplezz
    nws.label = lbl.item()
    return nws

def filter_words(text):
    words_list = text.split()
    newWords = [x for x in words_list if len(x) > 3]
    return " ".join(newWords)

def print_clusters(cluster_news, clustre_labels):
    newsLabels = zip_news(cluster_news, clustre_labels)
    newsLabels = sorted(newsLabels, key=lambda n: n.label)
    for label, group in groupby(newsLabels, lambda n: n.label):
        groupList = list(group)
        printt("Cluster: %s, count news: %s, titles:" % (label, len(groupList)))
        for gr in groupList:
            printt("\t" + gr.title)
            
def print_topics(components, feature_names, n_top_words):
    for topic_idx, topic in enumerate(components):
        printt("Topic #%d:" % topic_idx)
        printt(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [13]:
%run /data/jupyter/semantic_group.py

[2017-12-24 12:06:16,343] INFO  loading projection weights from /data/gensim/news_0_300_2.bin.gz
[2017-12-24 12:06:22,514] INFO  loaded (124590, 300) matrix from /data/gensim/news_0_300_2.bin.gz
[2017-12-24 12:06:22,515] INFO  precomputing L2-norms of word weight vectors


# Векторизация

## TF-IDF

In [14]:
def tf_idf(content, ngram_range = (1, 1), max_features = None, min_df = 1, max_df = 1.0):
    time_start = time.time()
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, tokenizer=lambda text: text.split(" "), stop_words=stopwords, norm='l2', ngram_range=ngram_range, max_features=max_features, max_df=max_df, min_df=min_df)
    tfidf_matrix = tfidf_vectorizer.fit_transform(content)
    printt("vocabulary size: %s" % len(tfidf_vectorizer.vocabulary_))
    printt("tf_idf time: %s s" % (time.time() - time_start))
    return tfidf_matrix

## TF

In [15]:
def tf(content):
    time_start = time.time()
    tf_vectorizer = TfidfVectorizer(use_idf=False, tokenizer=lambda text: text.split(" "), stop_words=stopwords) # , ngram_range=(1, 3)
    tf_matrix = tf_vectorizer.fit_transform(content)
    printt("vocabulary size: %s" % len(tf_vectorizer.vocabulary_))
    printt("tf time: %s s" % (time.time() - time_start))
    return tf_matrix

## Semantic group

In [16]:
def semantic_group(content, ngram_range = (1, 1), min_df = 1, max_df = 1.0):
    time_start = time.time()
    sem_content = extractSemanticGroup(content)
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, tokenizer=lambda text: text.split(" "), stop_words=stopwords, norm='l2', ngram_range=ngram_range, min_df=min_df, max_df=max_df)
    tfidf_matrix = tfidf_vectorizer.fit_transform(sem_content)
    printt("vocabulary size: %s" % len(tfidf_vectorizer.vocabulary_))
    printt("semantic_group time: %s s" % (time.time() - time_start))
    return tfidf_matrix

# Семантический анализ

## LDA

In [17]:
def lda(corpus, id2word, n):
    time_start = time.time()
    lda_t = LdaTransformer(num_topics=n, id2word=id2word)
    lda_matrix = lda_t.fit_transform(corpus)
    normalizer = Normalizer()
    norm_matrix = normalizer.fit_transform(lda_matrix)
    printt("vocabulary size: %s" % norm_matrix.shape[0])
    printt("lda-%s, time: %s s" % (n, time.time() - time_start))
    return csr_matrix(norm_matrix)

# Кластеризация

# Проверка качества кластеризации

In [18]:
def score(matrix, marked_labels, clustered_labels):
    printt("V-measure: %0.3f" % metrics.v_measure_score(marked_labels, clustered_labels))
    printt("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(marked_labels, clustered_labels))
    printt("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(matrix, clustered_labels, sample_size=1000))
    printt("Cluster count: %s" % len(set(clustered_labels)))

# File

In [19]:
def score_file(matrix, marked_labels, clustered_labels, file):
    file.write("Homogeneity: %0.3f\n" % metrics.homogeneity_score(marked_labels, clustered_labels))
    file.write("Completeness: %0.3f\n" % metrics.completeness_score(marked_labels, clustered_labels))
    file.write("V-measure: %0.3f\n" % metrics.v_measure_score(marked_labels, clustered_labels))
    file.write("Adjusted Rand-Index: %.3f\n" % metrics.adjusted_rand_score(marked_labels, clustered_labels))
    file.write("Silhouette Coefficient: %0.3f\n" % metrics.silhouette_score(matrix, clustered_labels, sample_size=1000))
    file.write("Cluster count: %s\n" % len(set(clustered_labels)))

In [20]:
def print_clusters_file(cluster_news, clustre_labels, file):
    newsLabels = zip_news(cluster_news, clustre_labels)
    newsLabels = sorted(newsLabels, key=lambda n: n.label)
    for label, group in groupby(newsLabels, lambda n: n.label):
        groupList = list(group)
        file.write("Cluster: %s, count news: %s, titles:\n" % (label, len(groupList)))
        for gr in groupList:
            file.write("\t" + gr.title + "\n")

# Тест 

In [21]:
max_iter = 100 # 100
n_clusters = 130 # Количество кластеров
min_samples = 4

In [22]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

[2017-12-24 12:06:29,727] INFO  adding document #0 to Dictionary(0 unique tokens: [])
[2017-12-24 12:06:32,549] INFO  built Dictionary(43455 unique tokens: ['активность', 'ар', 'боевик', 'ввс', 'вести']...) from 10000 documents (total 2717157 corpus positions)


## LDA-Base

In [23]:
vectorization = {
    # "lda_1000": lda(corpus, dictionary, 1000),
    "lda_5000": lda(corpus, dictionary, 5000),
    # "lda_10000": lda(corpus, dictionary, 10000),
}

[2017-12-24 12:06:39,120] INFO  using symmetric alpha at 0.0002
[2017-12-24 12:06:39,122] INFO  using symmetric eta at 0.0002
[2017-12-24 12:06:39,129] INFO  using serial LDA version on this node
[2017-12-24 12:46:08,446] INFO  running online (single-pass) LDA training, 5000 topics, 1 passes over the supplied corpus of 10000 documents, updating model once every 2000 documents, evaluating perplexity every 10000 documents, iterating 50x with a convergence threshold of 0.001000
[2017-12-24 12:46:08,447] INFO  PROGRESS: pass 0, at document #2000/10000
[2017-12-24 12:49:27,605] INFO  merging changes from 2000 documents into a model of 10000 documents
[2017-12-24 12:50:07,700] INFO  topic #1492 (0.000): 0.013*"город" + 0.013*"год" + 0.009*"урал" + 0.009*"система" + 0.008*"инвест" + 0.008*"москва" + 0.007*"немец" + 0.007*"россия" + 0.006*"башнефть" + 0.006*"цена"
[2017-12-24 12:50:07,701] INFO  topic #2041 (0.000): 0.017*"самолет" + 0.014*"человек" + 0.014*"тайвань" + 0.011*"год" + 0.010*"спа

## TF-Base

In [None]:
vectorization = {
    # "tf-idf": tf_idf(news_content, (1, 1), None, 2, 0.9),
    "tf-idf_ngram_1_2": tf_idf(news_content, (1, 2), None, 2, 0.9),
    # "semantic_group": semantic_group(news_content, (1, 1), 2, 0.9),
}

## Sparce matrix

In [24]:
clasterization = {
    "kmeans": lambda: KMeans(n_clusters=n_clusters),
    "affinityPropagation": lambda: AffinityPropagation(),
    "birch": lambda: Birch(n_clusters=n_clusters),
}

In [None]:
clasterization = {
    "birch": lambda: Birch(n_clusters=None)
}

## Full matrix

In [None]:
clasterization = {
    "agglomerativeClustering": lambda: AgglomerativeClustering(n_clusters=n_clusters)
}

## Console output

In [None]:
for cl_name, cl_alg in clasterization.items():
    for vec_name, vec_matrix in vectorization.items():
        try:
            printt("----")
            printt("%s %s" % (cl_name, vec_name))
             alg = cl_alg()
            result_matrix = alg.fit(vec_matrix)
            # result_matrix = cl_alg().fit(vec_matrix.toarray())
            labels = result_matrix.labels_
            score(vec_matrix, marked_labels, labels)
            printt("%s %s: %s second" % (cl_name, vec_name, time.time() - start_time))
            
            del labels
            del result_matrix
            del alg
            gc.collect()
        except Exception as ex:
            printt("ERROR: %s, %s" % (type(ex), str(ex)))

## File output
Сохраняет результаты кластеризации вместе с примерами кластеризации в файл

In [25]:
path = "/data/results/clustering/%s" % (run_time)

In [26]:
os.makedirs(path, exist_ok=True)

In [27]:
for cl_name, cl_alg in clasterization.items():
    for vec_name, vec_matrix in vectorization.items():
        printt("%s %s" % (cl_name, vec_name))
        try:
            start_time = time.time()
            alg = cl_alg()
            result_matrix = alg.fit(vec_matrix)
            # result_matrix = cl_alg().fit(vec_matrix.toarray())
            labels = result_matrix.labels_
            printt("time: %s" % (time.time() - start_time))
            file_name = "/%s_%s.txt" % (cl_name, vec_name)
            printt("path: %s" %(path + file_name))
            
            with open(path + file_name, mode='w', encoding='utf-8') as f:
                f.write("%s %s\n" % (cl_name, vec_name))
                score_file(vec_matrix, marked_labels, labels, f)
                f.write("time: %s\n" % (time.time() - start_time))
                print_clusters_file(news, labels, f)
                
            del labels
            del result_matrix
            del alg
            gc.collect()
        except Exception as ex:
            printt("ERROR: %s, %s" % (type(ex), str(ex)))

[2017-12-24 13:21:52,202] INFO  kmeans lda_5000
[2017-12-24 13:22:39,484] INFO  time: 47.28091907501221
[2017-12-24 13:22:39,485] INFO  path: /data/results/clustering/1514117172/kmeans_lda_5000.txt
[2017-12-24 13:22:40,138] INFO  affinityPropagation lda_5000
[2017-12-24 13:24:22,870] INFO  time: 102.73041367530823
[2017-12-24 13:24:22,871] INFO  path: /data/results/clustering/1514117172/affinityPropagation_lda_5000.txt
[2017-12-24 13:24:30,088] INFO  birch lda_5000
[2017-12-24 13:26:02,856] INFO  time: 92.76637101173401
[2017-12-24 13:26:02,857] INFO  path: /data/results/clustering/1514117172/birch_lda_5000.txt


In [28]:
clasterization2 = {
    "agglomerativeClustering": lambda: AgglomerativeClustering(n_clusters=n_clusters)
}

In [29]:
for cl_name, cl_alg in clasterization2.items():
    for vec_name, vec_matrix in vectorization.items():
        printt("%s %s" % (cl_name, vec_name))
        try:
            start_time = time.time()
            alg = cl_alg()
            # result_matrix = alg.fit(vec_matrix)
            result_matrix = cl_alg().fit(vec_matrix.toarray())
            labels = result_matrix.labels_
            printt("time: %s" % (time.time() - start_time))
            file_name = "/%s_%s.txt" % (cl_name, vec_name)
            printt("path: %s" %(path + file_name))
            
            with open(path + file_name, mode='w', encoding='utf-8') as f:
                f.write("%s %s\n" % (cl_name, vec_name))
                score_file(vec_matrix, marked_labels, labels, f)
                f.write("time: %s\n" % (time.time() - start_time))
                print_clusters_file(news, labels, f)
                
            del labels
            del result_matrix
            del alg
            gc.collect()
        except Exception as ex:
            printt("ERROR: %s, %s" % (type(ex), str(ex)))

[2017-12-24 13:26:03,541] INFO  agglomerativeClustering lda_5000
[2017-12-24 13:29:11,979] INFO  time: 188.43630361557007
[2017-12-24 13:29:11,980] INFO  path: /data/results/clustering/1514117172/agglomerativeClustering_lda_5000.txt


## DBScan

In [None]:
time_start = time.time()
matrix = csr_matrix(vectorization["tf-idf"])
dbscan = DBSCAN(eps=1.1, min_samples=min_samples, metric="euclidean")
dbscan.fit(matrix)
labels = dbscan.labels_
score(matrix, marked_labels, labels)
printt("dbscan time: %s s" % (time.time() - time_start))

file_name = "/%s_%s.txt" % ("dbscan", "tf-idf")
with open(path + file_name, mode='w', encoding='utf-8') as f:
        f.write("%s %s\n" % ("dbscan", "tf-idf"))
        score_file(matrix, marked_labels, labels, f)
        f.write("time: %s\n" % (time.time() - time_start))
        print_clusters_file(news, labels, f)

In [None]:
time_start = time.time()
matrix = csr_matrix(vectorization["tf-idf_ngram_1_2"])
dbscan = DBSCAN(eps=1.2, min_samples=min_samples, metric="euclidean")
dbscan.fit(matrix)
labels = dbscan.labels_
score(matrix, marked_labels, labels)
printt("dbscan time: %s s" % (time.time() - time_start))

file_name = "/%s_%s.txt" % ("dbscan", "idf_ngram_1_2")
with open(path + file_name, mode='w', encoding='utf-8') as f:
        f.write("%s %s\n" % ("dbscan", "idf_ngram_1_2"))
        score_file(matrix, marked_labels, labels, f)
        f.write("time: %s\n" % (time.time() - time_start))
        print_clusters_file(news, labels, f)

In [None]:
time_start = time.time()
matrix = vectorization["semantic_group"]
dbscan = DBSCAN(eps=1, min_samples=min_samples, metric="euclidean")
dbscan.fit(matrix)
labels = dbscan.labels_
score(matrix, marked_labels, labels)
printt("dbscan time: %s s" % (time.time() - time_start))

file_name = "/%s_%s.txt" % ("dbscan", "semantic_group")
with open(path + file_name, mode='w', encoding='utf-8') as f:
        f.write("%s %s\n" % ("dbscan", "semantic_group"))
        score_file(matrix, marked_labels, labels, f)
        f.write("time: %s\n" % (time.time() - time_start))
        print_clusters_file(news, labels, f)

In [None]:
time_start = time.time()
matrix = vectorization["lda_1000"]
dbscan = DBSCAN(eps=0.62, min_samples=min_samples, metric="euclidean")
dbscan.fit(matrix)
labels = dbscan.labels_
score(matrix, marked_labels, labels)
printt("dbscan time: %s s" % (time.time() - time_start))

file_name = "/%s_%s.txt" % ("dbscan", "lda_1000")
with open(path + file_name, mode='w', encoding='utf-8') as f:
        f.write("%s %s\n" % ("dbscan", "lda_1000"))
        score_file(matrix, marked_labels, labels, f)
        f.write("time: %s\n" % (time.time() - time_start))
        print_clusters_file(news, labels, f)

In [34]:
time_start = time.time()
matrix = vectorization["lda_5000"]
dbscan = DBSCAN(eps=0.8, min_samples=min_samples, metric="euclidean")
dbscan.fit(matrix)
labels = dbscan.labels_
score(matrix, marked_labels, labels)
printt("dbscan time: %s s" % (time.time() - time_start))

file_name = "/%s_%s.txt" % ("dbscan", "lda_5000")
with open(path + file_name, mode='w', encoding='utf-8') as f:
        f.write("%s %s\n" % ("dbscan", "lda_5000"))
        score_file(matrix, marked_labels, labels, f)
        f.write("time: %s\n" % (time.time() - time_start))
        print_clusters_file(news, labels, f)

[2017-12-24 14:22:16,508] INFO  V-measure: 0.413
[2017-12-24 14:22:16,513] INFO  Adjusted Rand-Index: 0.011
[2017-12-24 14:22:16,860] INFO  Silhouette Coefficient: -0.033
[2017-12-24 14:22:16,862] INFO  Cluster count: 196
[2017-12-24 14:22:16,863] INFO  dbscan time: 1.8217382431030273 s
