# Сравнение алгоритмов кластеризации по метрикам качества кластеризации

In [1]:
import json
import csv
import time

import numpy as np

from itertools import groupby
from collections import Counter


from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.decomposition import LatentDirichletAllocation

from sklearn import metrics

## Подготовка данных для кластеризации

In [2]:
class News:
    """One news article as stored in the JSON-lines dataset."""

    def __init__(self, id, date, title, content, url, siteType):
        # Parameter names mirror the JSON keys so that from_json can expand
        # the decoded object straight into keyword arguments.
        self.id, self.date = id, date
        self.title, self.content = title, content
        self.url, self.siteType = url, siteType

    @classmethod
    def from_json(cls, json_str):
        """Build an instance from a string holding one JSON object.

        The decoded object is passed to the constructor as keyword
        arguments, so its keys must match the constructor's parameters.
        """
        return cls(**json.loads(json_str))

## Загрузка тестовой выборки

In [3]:
# Read the test sample: every line of the file is one JSON-encoded article.
news = []
with open('/data/10k.test.normalized.json', encoding="utf8") as f:
    news.extend(News.from_json(line) for line in f)

In [4]:
# Flatten the whitespace-separated tokens of every article into one list.
words = [token for item in news for token in item.content.split()]
counts = Counter(words)
# Tokens that appear exactly once across all articles.
one_time = [token for token, occurrences in counts.items() if occurrences == 1]
print("total words: %s" % (len(words) - len(one_time)))

# Raw article texts, in the same order as `news`.
news_content = [item.content for item in news]

total words: 2717122


In [5]:
# Words that occur exactly once in the whole corpus are treated as stop words;
# the vectorizers receive this collection through their stop_words argument.
stopwords = set(one_time)

## Загрузка размеченной выборки

In [6]:
# Map news id -> integer label, read from the labelled sample
# (column 0 holds the id, column 3 the label).
marked_map = {}
# newline="" leaves newline handling to the csv module, which its
# documentation requires for file objects handed to csv.reader; without it,
# quoted fields containing line breaks can be split incorrectly.
with open('/data/mark_news.csv', encoding="utf8", newline="") as csvfile:
    spamreader = csv.reader(csvfile, delimiter=';')
    for row in spamreader:
        marked_map[row[0]] = int(row[3])

# Pair every loaded article with its label, keeping the order of `news`.
# The plain lookup is deliberate: an article without a label raises KeyError
# instead of being skipped, because skipping would shift the labels relative
# to the rows of the feature matrices built from the same article order.
marked_news = [(n.id, marked_map[n.id]) for n in news]

marked_labels = [label for n_id, label in marked_news]

In [7]:
# Spot check: look up the label stored for one specific news id.
marked_map['170bf9b9-d62d-437e-a75a-cac7b7c9f282']

40

# Вспомогательные функции

In [8]:
def zip_news(n, l):
    """Attach each label in ``l`` to the news item at the same index in ``n``.

    Returns the list of news items after labelling. Pairing stops at the
    shorter of the two inputs.
    """
    return [assign_label_to_news(pair) for pair in zip(n, l)]

def assign_label_to_news(tuplezz):
    """Store a cluster label on a news item and return that item.

    Args:
        tuplezz: a ``(news, label)`` pair. ``label`` may be a NumPy scalar
            or a plain Python value.

    Returns:
        The same news object, with its ``label`` attribute set.
    """
    nws, lbl = tuplezz
    # NumPy scalars are unwrapped to native Python values through .item();
    # labels without that method (e.g. plain ints) are stored unchanged
    # instead of raising AttributeError.
    to_native = getattr(lbl, "item", None)
    nws.label = to_native() if callable(to_native) else lbl
    return nws

def filter_words(text):
    """Drop every whitespace-separated word of three or fewer characters.

    The remaining words are joined back together with single spaces.
    """
    return " ".join(word for word in text.split() if len(word) > 3)

def print_clusters(cluster_news, clustre_labels):
    """Print the news titles grouped by the cluster label assigned to them.

    Labels are first attached to the news items with ``zip_news``; clusters
    are then printed in ascending label order, each with its size and the
    titles of its members.
    """
    def label_of(item):
        return item.label

    # groupby only merges adjacent items, hence the sort by the same key.
    ordered = sorted(zip_news(cluster_news, clustre_labels), key=label_of)
    for label, members in groupby(ordered, label_of):
        members = list(members)
        print("Cluster: %s, count news: %s, titles:" % (label, len(members)))
        for item in members:
            print("\t" + item.title)
            
def print_topics(components, feature_names, n_top_words):
    """Print the highest-weighted feature names of every topic.

    ``components`` is iterated row by row; each row must support
    ``argsort()``. For every row the ``n_top_words`` entries with the
    largest values are printed, largest first, using ``feature_names`` to
    translate indices into names.
    """
    for topic_idx, weights in enumerate(components):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        top_names = [feature_names[i] for i in strongest]
        print(" ".join(top_names))

# Векторизация

## TF-IDF

In [9]:
def tf_idf(content):
    """Vectorize the documents with TF-IDF and return the resulting matrix.

    Documents are tokenized by splitting on single spaces, and the
    module-level ``stopwords`` collection is excluded from the vocabulary.
    The size of the fitted vocabulary is printed as a side effect.

    Args:
        content: iterable of document strings.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform``.
    """
    tfidf_vectorizer = TfidfVectorizer(
        use_idf=True,
        tokenizer=lambda text: text.split(" "),
        # scikit-learn documents stop_words as a list (or 'english' / None);
        # releases that validate parameters reject a set, so convert it.
        stop_words=list(stopwords),
    )
    # NOTE(review): the original carried a commented-out option here:
    # ngram_range=(1, 3)
    tfidf_matrix = tfidf_vectorizer.fit_transform(content)
    print("vocabulary size: %s" % len(tfidf_vectorizer.vocabulary_))
    return tfidf_matrix

## LDA

In [15]:
def lda(content, n, max_iter, n_jobs):
    """Return the document-topic matrix of an LDA model fitted on ``content``.

    The documents are turned into term counts with ``CountVectorizer``
    (excluding the module-level ``stopwords``), then an online
    ``LatentDirichletAllocation`` model is fitted on those counts.

    Args:
        content: iterable of document strings.
        n: number of topics.
        max_iter: passed to the model as ``max_iter``.
        n_jobs: passed to the model as ``n_jobs``.

    Returns:
        The matrix produced by ``LatentDirichletAllocation.fit_transform``.
    """
    # scikit-learn documents stop_words as a list (or 'english' / None);
    # releases that validate parameters reject a set, so convert it.
    tf = CountVectorizer(stop_words=list(stopwords)).fit_transform(content)

    lda_options = dict(
        max_iter=max_iter,
        learning_method='online',
        learning_offset=50.,
        n_jobs=n_jobs,
    )
    try:
        # Current name of the topic-count parameter.
        model = LatentDirichletAllocation(n_components=n, **lda_options)
    except TypeError:
        # Older scikit-learn releases only know the former name, n_topics.
        model = LatentDirichletAllocation(n_topics=n, **lda_options)
    return model.fit_transform(tf)

# Кластеризация

## DBSCAN

In [11]:
def dbscan(matrix, eps, samples):
    """Cluster ``matrix`` with DBSCAN and return the label of every row.

    Prints the number of clusters found (the label -1, when present, is not
    counted as a cluster) followed by how many rows received label -1 and
    how many received label 0.

    Args:
        matrix: feature matrix to cluster.
        eps: passed to DBSCAN as ``eps``.
        samples: passed to DBSCAN as ``min_samples``.
    """
    labels = DBSCAN(eps=eps, min_samples=samples).fit(matrix).labels_
    noise_present = 1 if -1 in labels else 0
    print('count clusters: %d' % (len(set(labels)) - noise_present))
    label_list = labels.tolist()
    print("-1: %s, 0: %s" % (label_list.count(-1), label_list.count(0)))
    return labels

## KMeans

In [12]:
def kmeans(matrix, n, n_jobs):
    """Cluster ``matrix`` into ``n`` groups with KMeans and return the labels.

    Args:
        matrix: feature matrix to cluster.
        n: number of clusters.
        n_jobs: passed to KMeans as ``n_jobs`` when the installed
            scikit-learn still accepts that argument; ignored otherwise.

    Returns:
        The ``labels_`` array of the fitted estimator.
    """
    try:
        estimator = KMeans(n_clusters=n, n_jobs=n_jobs)
    except TypeError:
        # scikit-learn 1.0 removed the n_jobs keyword from KMeans; build the
        # estimator without it rather than failing on newer installations.
        estimator = KMeans(n_clusters=n)
    return estimator.fit(matrix).labels_

# Проверка качества кластеризации

In [13]:
def score(matrix, marked_labels, clustered_labels, sample_size=1000,
          random_state=None):
    """Print clustering quality metrics for one clustering result.

    Homogeneity, completeness, V-measure and the adjusted Rand index compare
    ``clustered_labels`` against the reference ``marked_labels``; the
    silhouette coefficient is computed from ``matrix`` and
    ``clustered_labels`` on a random sample of rows.

    Args:
        matrix: feature matrix that was clustered.
        marked_labels: reference labels, one per row of ``matrix``.
        clustered_labels: labels produced by the clustering algorithm.
        sample_size: number of rows sampled for the silhouette coefficient
            (defaults to the previously hard-coded 1000).
        random_state: seed for that sampling; pass an int to make the
            silhouette value reproducible. The default ``None`` keeps the
            previous unseeded behaviour.
    """
    # One call yields all three values, so the shared work is done once
    # instead of once per metric.
    homogeneity, completeness, v_measure = (
        metrics.homogeneity_completeness_v_measure(marked_labels, clustered_labels)
    )
    print("Homogeneity: %0.3f" % homogeneity)
    print("Completeness: %0.3f" % completeness)
    print("V-measure: %0.3f" % v_measure)
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(marked_labels, clustered_labels))
    silhouette = metrics.silhouette_score(
        matrix,
        clustered_labels,
        sample_size=sample_size,
        random_state=random_state,
    )
    print("Silhouette Coefficient: %0.3f" % silhouette)

# Тест 

In [19]:
# Settings shared by the vectorization and clustering runs.
n_clusters = 130 # Number of clusters
n_topics = 1000 # Number of topics for LDA
n_jobs = 1 # Number of parallel jobs (passed to both LDA and KMeans)
max_iter = 10 # NOTE(review): the original comment here was just "100", presumably an alternative value - confirm

In [20]:
# Build both feature representations of the corpus.
tfidf_matrix = tf_idf(news_content)
# NOTE(review): this rebinds the name `lda` from the function to the matrix it
# returns, so executing this statement a second time in the same session
# tries to call the matrix and fails.
lda = lda(news_content, n_topics, max_iter, n_jobs)

vocabulary size: 43607


In [21]:
# Cluster each representation with KMeans, print the quality metrics and the
# wall-clock time spent on clustering plus scoring.
kmeans_runs = (
    ("kmeans tf-idf...", "kmeans tf-idf: %s second", tfidf_matrix),
    ("kmeans lda...", "kmeans lda: %s second", lda),
)
for run_index, (banner, timing_format, features) in enumerate(kmeans_runs):
    if run_index > 0:
        print("")  # blank line between consecutive reports
    print(banner)
    start_time = time.time()
    kk_labels = kmeans(features, n_clusters, n_jobs)
    score(features, marked_labels, kk_labels)
    print(timing_format % (time.time() - start_time))

kmeans tf-idf...
Homogeneity: 0.744
Completeness: 0.753
V-measure: 0.749
Adjusted Rand-Index: 0.355
Silhouette Coefficient: 0.041
kmeans tf-idf: 618.5275161266327 second

kmeans lda...
Homogeneity: 0.514
Completeness: 0.502
V-measure: 0.508
Adjusted Rand-Index: 0.127
Silhouette Coefficient: 0.176
kmeans lda: 34.0408833026886 second


In [24]:
# DBSCAN settings.
eps_tf_idf = 1 # eps passed to DBSCAN for the TF-IDF matrix
eps_lda = 0.1 # eps passed to DBSCAN for the LDA matrix
sampels = 10 # min_samples passed to DBSCAN in both runs

In [25]:
# Cluster each representation with DBSCAN, print the quality metrics and the
# wall-clock time spent on clustering plus scoring.
dbscan_runs = (
    ("dbscan tf-idf...", "dbscan tf-idf: %s second", tfidf_matrix, eps_tf_idf),
    ("dbscan lda...", "dbscan lda: %s second", lda, eps_lda),
)
for run_index, (banner, timing_format, features, eps_value) in enumerate(dbscan_runs):
    if run_index > 0:
        print("")  # blank line between consecutive reports
    print(banner)
    start_time = time.time()
    kk_labels = dbscan(features, eps_value, sampels)
    score(features, marked_labels, kk_labels)
    print(timing_format % (time.time() - start_time))

dbscan tf-idf...
count clusters: 76
-1: 7891, 0: 105
Homogeneity: 0.223
Completeness: 0.751
V-measure: 0.344
Adjusted Rand-Index: 0.009
Silhouette Coefficient: 0.003
dbscan tf-idf: 8.35751724243164 second

dbscan lda...
count clusters: 15
-1: 5730, 0: 3546
Homogeneity: 0.099
Completeness: 0.465
V-measure: 0.164
Adjusted Rand-Index: 0.011
Silhouette Coefficient: -0.362
dbscan lda: 25.161997318267822 second
