# Сравнение алгоритмов кластеризации по метрикам качества кластеризации

In [27]:
import json
import csv
import time
import os

import numpy as np

from itertools import groupby
from collections import Counter

from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

from sklearn import metrics

## Подготовка данных для кластеризации

In [28]:
class News:
    """A single news record with the fields stored exactly as passed in."""

    def __init__(self, id, date, title, content, url, siteType):
        (
            self.id,
            self.date,
            self.title,
            self.content,
            self.url,
            self.siteType,
        ) = (id, date, title, content, url, siteType)

    @classmethod
    def from_json(cls, json_str):
        """Create an instance from a JSON object string.

        The decoded object's keys are forwarded as keyword arguments, so they
        must match the constructor's parameter names; any missing or extra key
        makes the constructor raise ``TypeError``.
        """
        return cls(**json.loads(json_str))

## Загрузка тестовой выборки

In [29]:
# Read the test sample: every line of the file is one JSON-encoded news item.
with open('/data/kasandra/year/10k.test.normalized.json', encoding="utf8") as f:
    news = [News.from_json(line) for line in f]

In [49]:
# Keep only the first 100 news items for the rest of the notebook.
news = news[:100]

# Flatten all whitespace-separated tokens of the kept items into one list.
words = [token for item in news for token in item.content.split()]
counts = Counter(words)

# Tokens that occur exactly once in the sample.
one_time = [word for word, freq in counts.items() if freq == 1]
print("total words: %s" % (len(words) - len(one_time)))

news_content = [item.content for item in news]

total words: 23724


In [31]:
# Tokens collected in `one_time` are treated as stop words by the vectorizers below.
stopwords = set(one_time)

## Загрузка размеченной выборки

In [62]:
# Map news id -> integer label, read from the ';'-separated markup file
# (column 0 is used as the id, column 3 as the label).
with open('/data/kasandra/marked/marked_10k.csv', encoding="utf8") as csvfile:
    spamreader = csv.reader(csvfile, delimiter=';')
    marked_map = {row[0]: int(row[3]) for row in spamreader}

# (id, label) pairs in the same order as `news`; an id missing from the
# markup file raises KeyError.
marked_news = [(n.id, marked_map[n.id]) for n in news]

marked_labels = [label for _, label in marked_news]

In [33]:
# Spot check: look up the label stored for one specific news id.
marked_map['170bf9b9-d62d-437e-a75a-cac7b7c9f282']

40

# Вспомогательные функции

In [34]:
def zip_news(n,l):
    """Pair each item of *n* with the label at the same position in *l*.

    Every item gets the label attached through ``assign_label_to_news``; the
    items are returned as a new list. Pairing stops at the shorter input.
    """
    return [assign_label_to_news(pair) for pair in zip(n, l)]

def assign_label_to_news(tuplezz):
    """Attach a label to a news object and return that same object.

    Args:
        tuplezz: a ``(news, label)`` pair. ``label`` must provide ``.item()``
            (as NumPy scalars do); its result is stored in ``news.label``.
    """
    item, raw_label = tuplezz
    item.label = raw_label.item()
    return item

def filter_words(text):
    """Return *text* with every whitespace-separated token of 3 or fewer characters removed.

    The surviving tokens are joined back with single spaces.
    """
    return " ".join(token for token in text.split() if len(token) > 3)

def print_clusters(cluster_news, clustre_labels):
    """Print the news titles grouped by cluster label.

    Labels are attached to the news objects via ``zip_news``; the items are
    then sorted by label so that ``groupby`` yields one group per cluster.
    """
    labelled = zip_news(cluster_news, clustre_labels)
    labelled.sort(key=lambda item: item.label)
    for label, members in groupby(labelled, key=lambda item: item.label):
        members = list(members)
        print("Cluster: %s, count news: %s, titles:" % (label, len(members)))
        for member in members:
            print("\t" + member.title)
            
def print_topics(components, feature_names, n_top_words):
    """Print, for every row of *components*, its highest-weighted feature names.

    Args:
        components: iterable of rows; each row must support ``argsort()``.
        feature_names: sequence mapping a column index to a feature name.
        n_top_words: how many names to print per row, in descending weight order.
    """
    for topic_idx, topic in enumerate(components):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print(" ".join(feature_names[i] for i in top_indices))

# Векторизация

## TF-IDF

In [35]:
def tf_idf(content):
    """Build a TF-IDF document-term matrix for *content*.

    Documents are tokenized by splitting on single spaces, and the words in
    the module-level ``stopwords`` collection are excluded. The size of the
    fitted vocabulary is printed as a side effect.

    Args:
        content: iterable of document strings.

    Returns:
        The matrix returned by ``TfidfVectorizer.fit_transform``.
    """
    tfidf_vectorizer = TfidfVectorizer(
        use_idf=True,
        tokenizer=lambda text: text.split(" "),
        # Pass a list: recent scikit-learn releases validate constructor
        # arguments and do not accept a set here, while a list works on
        # old and new releases alike.
        stop_words=list(stopwords),
    )  # optional extra argument: ngram_range=(1, 3)
    tfidf_matrix = tfidf_vectorizer.fit_transform(content)
    print("vocabulary size: %s" % len(tfidf_vectorizer.vocabulary_))
    return tfidf_matrix

## TF

In [36]:
def tf(content):
    """Build a term-frequency document-term matrix (IDF weighting disabled).

    Documents are tokenized by splitting on single spaces, and the words in
    the module-level ``stopwords`` collection are excluded. The size of the
    fitted vocabulary is printed as a side effect.

    Args:
        content: iterable of document strings.

    Returns:
        The matrix returned by ``TfidfVectorizer.fit_transform`` with
        ``use_idf=False``.
    """
    tf_vectorizer = TfidfVectorizer(
        use_idf=False,
        tokenizer=lambda text: text.split(" "),
        # Pass a list: recent scikit-learn releases validate constructor
        # arguments and do not accept a set here, while a list works on
        # old and new releases alike.
        stop_words=list(stopwords),
    )  # optional extra argument: ngram_range=(1, 3)
    # Fit the vectorizer created above; the previous code referenced
    # `tfidf_vectorizer`, which is not defined in this function.
    tf_matrix = tf_vectorizer.fit_transform(content)
    print("vocabulary size: %s" % len(tf_vectorizer.vocabulary_))
    return tf_matrix

# Семантический анализ

## LDA

In [37]:
def lda(content, n, max_iter):
    """Project documents onto *n* LDA topics and normalize each row.

    Args:
        content: iterable of document strings.
        n: number of topics.
        max_iter: maximum number of iterations for the LDA fit.

    Returns:
        The document-topic matrix after ``Normalizer().fit_transform``.
    """
    # A list is passed for stop_words because recent scikit-learn releases do
    # not accept a set; the effect is the same on older releases.
    term_counts = CountVectorizer(stop_words=list(stopwords)).fit_transform(content)
    # `n_components` is the current name of the topic-count argument; the
    # former `n_topics` keyword was deprecated and later removed.
    model = LatentDirichletAllocation(
        n_components=n,
        max_iter=max_iter,
        learning_method='online',
        learning_offset=50.,
    )
    topic_matrix = model.fit_transform(term_counts)
    return Normalizer().fit_transform(topic_matrix)

## LSA

In [38]:
def lsa(matrix, n, max_inter):
    """Reduce *matrix* to *n* components with truncated SVD.

    Args:
        matrix: input matrix handed to ``TruncatedSVD.fit_transform``.
        n: number of output components.
        max_inter: forwarded as the solver's ``n_iter`` argument.

    Returns:
        The transformed matrix.
    """
    return TruncatedSVD(n_components=n, n_iter=max_inter).fit_transform(matrix)

# Кластеризация

# Проверка качества кластеризации

In [39]:
def score(matrix, marked_labels, clustered_labels):
    """Print clustering quality metrics to stdout.

    Args:
        matrix: feature matrix the clustering was computed on (used for the
            silhouette coefficient, called with ``sample_size=1000``).
        marked_labels: reference labels.
        clustered_labels: labels produced by the clustering.
    """
    label_based_metrics = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score),
    )
    # Each metric is computed and printed before the next one is evaluated.
    for template, metric in label_based_metrics:
        print(template % metric(marked_labels, clustered_labels))

    silhouette = metrics.silhouette_score(matrix, clustered_labels, sample_size=1000)
    print("Silhouette Coefficient: %0.3f" % silhouette)
    print("Cluster count: %s" % len(set(clustered_labels)))

# Запись результатов в файл

In [70]:
def score_file(matrix, marked_labels, clustered_labels, file):
    """Write clustering quality metrics to *file*, one metric per line.

    Args:
        matrix: feature matrix the clustering was computed on (used for the
            silhouette coefficient, called with ``sample_size=1000``).
        marked_labels: reference labels.
        clustered_labels: labels produced by the clustering.
        file: object with a ``write(str)`` method.
    """
    label_based_metrics = (
        ("Homogeneity: %0.3f\n", metrics.homogeneity_score),
        ("Completeness: %0.3f\n", metrics.completeness_score),
        ("V-measure: %0.3f\n", metrics.v_measure_score),
        ("Adjusted Rand-Index: %.3f\n", metrics.adjusted_rand_score),
    )
    # Each metric is computed and written before the next one is evaluated.
    for template, metric in label_based_metrics:
        file.write(template % metric(marked_labels, clustered_labels))

    silhouette = metrics.silhouette_score(matrix, clustered_labels, sample_size=1000)
    file.write("Silhouette Coefficient: %0.3f\n" % silhouette)
    file.write("Cluster count: %s\n" % len(set(clustered_labels)))

In [66]:
def print_clusters_file(cluster_news, clustre_labels, file):
    """Write the news titles grouped by cluster label to *file*.

    Labels are attached to the news objects via ``zip_news``; the items are
    then sorted by label so that ``groupby`` yields one group per cluster.
    *file* only needs a ``write(str)`` method.
    """
    labelled = zip_news(cluster_news, clustre_labels)
    labelled.sort(key=lambda item: item.label)
    for label, members in groupby(labelled, key=lambda item: item.label):
        members = list(members)
        file.write("Cluster: %s, count news: %s, titles:\n" % (label, len(members)))
        for member in members:
            file.write("\t" + member.title + "\n")

# Тест 

In [58]:
# Experiment parameters shared by the cells below.
max_iter = 10 # 100 (presumably an alternative setting — TODO confirm)
# n_components is only referenced by the commented-out "lda"/"lsa" vectorizations.
n_components = 10000
n_clusters = 10 # number of clusters
# min_samples and eps are only referenced by the commented-out DBSCAN entry.
min_samples = 4
eps = 1.3

In [54]:
# TF-IDF matrix of the news texts; used below as a vectorization and as the LSA input.
tfidf_matrix = tf_idf(news_content)

vocabulary size: 6514


In [55]:
# Feature matrices to cluster, keyed by the name used in reports and file names.
vectorization = {
    "tf-idf": tfidf_matrix,
    # NOTE(review): this entry calls tf_idf(), not tf(), so it holds a second TF-IDF
    # matrix rather than plain term frequencies — presumably tf() was intended; confirm.
    "tf": tf_idf(news_content),
    #"lda": lda(news_content, n_components, max_iter),
    #"lsa": lsa(tfidf_matrix, n_components, max_iter)
}

vocabulary size: 6514


In [60]:
# Clustering estimators to evaluate, keyed by name; only KMeans is enabled,
# the remaining entries are commented out.
clasterization = {
    "kmeans": KMeans(n_clusters=n_clusters),
    #"miniBatchKMeanss": MiniBatchKMeans(n_clusters=n_clusters, max_iter=max_iter),
    #"dbscan": DBSCAN(eps=eps, min_samples=min_samples),
    #"birch": Birch(n_clusters=n_clusters),
    #"affinityPropagation": AffinityPropagation()
}

In [None]:
# Fit every clustering estimator on every vectorization and print its quality metrics.
for cl_name, cl_alg in clasterization.items():
    for vec_name, vec_matrix in vectorization.items():
        try:
            print("%s %s" % (cl_name, vec_name))
            start_time = time.time()
            result_matrix = cl_alg.fit(vec_matrix)
            labels = result_matrix.labels_
            score(vec_matrix, marked_labels, labels)
            # The reported duration covers both the fit and the scoring above.
            print("%s %s: %s second" % (cl_name, vec_name, time.time() - start_time))
            print("----")
        except Exception as ex:
            # Best effort: report the error and continue with the next combination.
            print(ex)

In [67]:
# Results directory named after the current Unix time truncated to whole seconds.
path = "/data/kasandra/results/clustering/%s" % (int(time.time()))

In [68]:
# Create the results directory (and parents); exist_ok=True tolerates an existing one.
os.makedirs(path, exist_ok=True)

In [69]:
# Fit every clustering estimator on every vectorization and save the metrics
# plus the cluster contents to "<path>/<estimator>_<vectorization>.txt".
for cl_name, cl_alg in clasterization.items():
    for vec_name, vec_matrix in vectorization.items():
        print("%s %s" % (cl_name, vec_name))
        try:
            start_time = time.time()
            result_matrix = cl_alg.fit(vec_matrix)
            labels = result_matrix.labels_
            # Printed time covers the fit only.
            print("time: %s" % (time.time() - start_time))
            file_name = "/%s_%s.txt" % (cl_name, vec_name)
            print("path: %s" % (path + file_name))

            # Write as UTF-8 explicitly, matching how the input files are read;
            # the platform default encoding may be unable to encode the titles.
            with open(path + file_name, mode='w', encoding="utf8") as f:
                f.write("%s %s\n" % (cl_name, vec_name))
                score_file(vec_matrix, marked_labels, labels, f)
                # Time stored in the file additionally includes the scoring above.
                f.write("time: %s\n" % (time.time() - start_time))
                print_clusters_file(news, labels, f)
        except Exception as ex:
            # Best effort: report the error and continue with the next combination.
            print(ex)

kmeans tf-idf
time: 0.41614198684692383
path: /data/kasandra/results/clustering/1513977535/kmeans_tf-idf.txt
kmeans tf


Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, e

time: 0.3805050849914551
path: /data/kasandra/results/clustering/1513977535/kmeans_tf.txt


Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, exact=1)
Importing `comb` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.comb` instead.
  return comb(n, 2, e

## LDA

In [21]:
# Use LDA itself as the clusterer: each document is labelled with the index
# of its largest component in the matrix returned by lda().
start_time = time.time()
lda_matrix = lda(news_content, n_clusters, max_iter)
lda_labels = [x.argmax() for x in lda_matrix]
score(lda_matrix, marked_labels, lda_labels)
# The reported duration covers vectorization, the LDA fit and the scoring.
print("lda lda: %s second" % (time.time() - start_time))

Homogeneity: 0.170
Completeness: 0.773
V-measure: 0.279
Adjusted Rand-Index: 0.054
Silhouette Coefficient: 0.803
Cluster count: 9
lda lda: 12.823763608932495 second


## LSA

In [22]:
# Use LSA itself as the clusterer: each document is labelled with the index
# of its largest component in the matrix returned by lsa().
start_time = time.time()
lsa_matrix = lsa(tfidf_matrix, n_clusters, max_iter)
lsa_labels = [x.argmax() for x in lsa_matrix]
score(lsa_matrix, marked_labels, lsa_labels)
# The reported duration covers the SVD fit and the scoring.
print("lsa: %s second" % (time.time() - start_time))

Homogeneity: 0.688
Completeness: 0.714
V-measure: 0.701
Adjusted Rand-Index: 0.225
Silhouette Coefficient: 0.045
Cluster count: 109
lsa: 1.0879974365234375 second
