In [None]:
import csv
import json
import time
from collections import Counter
from itertools import groupby

import numpy as np

from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

## Подготовка данных для кластеризации

In [None]:
class News:
    """A single news record: id, date, title, content, url and site type."""

    def __init__(self, id, date, title, content, url, siteType):
        """Store the six record fields unchanged as instance attributes."""
        self.id, self.date, self.title = id, date, title
        self.content, self.url, self.siteType = content, url, siteType

    @classmethod
    def from_json(cls, json_str):
        """Create an instance from a JSON object string.

        The decoded object's keys are passed as keyword arguments to the
        constructor, so they must match its parameter names exactly.
        """
        fields = json.loads(json_str)
        return cls(**fields)

## Загрузка тестовой выборки

In [None]:
# Read the corpus: the file is parsed line by line, one JSON object per line.
with open('/data/kasandra/year/all.normalized.json', encoding="utf8") as f:
    news = [News.from_json(line) for line in f]

In [None]:
# Flatten every article into one list of whitespace-separated tokens.
words = [token for item in news for token in item.content.split()]
counts = Counter(words)
# Tokens that occur exactly once in the whole corpus.
one_time = [token for token, freq in counts.items() if freq == 1]
print("total words: %s" % (len(words) - len(one_time)))

# Raw article texts, in the same order as `news`.
news_content = [item.content for item in news]

In [None]:
# Tokens seen only once in the corpus are used as the stop-word set
# (read by tf_idf() when it builds the TfidfVectorizer).
stopwords = set(one_time)

## Загрузка размеченной выборки

In [None]:
marked_map = {}  # news id (as text, column 0) -> integer label (column 3)
# csv.reader needs a text-mode file in Python 3; opening with 'rb' makes it
# raise "iterator should return strings, not bytes". newline='' is the mode
# recommended by the csv module documentation.
with open('eggs.csv', newline='', encoding="utf8") as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
        marked_map[row[0]] = int(row[3])

marked_news = []
for n in news:
    # CSV keys are always strings, so normalise the id before the lookup;
    # this is a no-op when the JSON ids are already strings.
    label = marked_map[str(n.id)]
    marked_news.append((n.id, label))

# Ground-truth labels aligned with the order of `news`.
marked_labels = [label for n_id, label in marked_news]

# Вспомогательные функции

In [None]:
def zip_news(n,l):
    """Pair each news item in ``n`` with the label at the same position in ``l``.

    Every pair is handed to ``assign_label_to_news``; the results are returned
    as a list. Pairing stops at the shorter of the two inputs.
    """
    return [assign_label_to_news(pair) for pair in zip(n, l)]

def assign_label_to_news(tuplezz):
    """Attach a cluster label to a news object and return that object.

    Args:
        tuplezz: A ``(news, label)`` pair. ``label`` may be a NumPy scalar
            (as found in a fitted estimator's ``labels_`` array) or a plain
            Python value.

    Returns:
        The same news object, with its ``label`` attribute set.
    """
    nws, lbl = tuplezz
    # NumPy scalars are unwrapped to native Python values; plain ints have no
    # .item() and used to raise AttributeError, so they are stored unchanged.
    nws.label = lbl.item() if hasattr(lbl, "item") else lbl
    return nws

def filter_words(text):
    """Return ``text`` with every token of three characters or fewer removed.

    Tokens are split on whitespace and the survivors are re-joined with single
    spaces.
    """
    return " ".join(token for token in text.split() if len(token) > 3)

def print_clusters(cluster_news, clustre_labels):
    """Print every cluster with its size and the titles of its news items.

    Labels are first attached to the news objects via ``zip_news`` (which sets
    their ``label`` attribute); the items are then sorted and grouped by label.
    """
    def by_label(item):
        return item.label

    labelled = zip_news(cluster_news, clustre_labels)
    # groupby only merges adjacent items, so the list must be sorted first.
    labelled.sort(key=by_label)
    for label, members in groupby(labelled, by_label):
        members = list(members)
        print("Cluster: %s, count news: %s, titles:" % (label, len(members)))
        for member in members:
            print("\t" + member.title)
            
def print_topics(components, feature_names, n_top_words):
    """Print, for every topic, its ``n_top_words`` highest-weighted features.

    Args:
        components: Iterable of per-topic weight arrays (each must support
            ``argsort``).
        feature_names: Sequence mapping a feature index to its name.
        n_top_words: How many features to list per topic.
    """
    for topic_idx, weights in enumerate(components):
        print("Topic #%d:" % topic_idx)
        # Indices ordered from the largest weight down, truncated to the top n.
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_names = [feature_names[idx] for idx in top_indices]
        print(" ".join(top_names))

# Векторизация

## TF-IDF

In [None]:
def tf_idf(content):
    """Build a TF-IDF document-term matrix for a collection of texts.

    Texts are tokenised by splitting on single spaces, and the notebook-level
    ``stopwords`` collection is excluded from the vocabulary. The resulting
    vocabulary size is printed.

    Args:
        content: Iterable of document strings.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform``.
    """
    tfidf_vectorizer = TfidfVectorizer(
        use_idf=True,
        tokenizer=lambda text: text.split(" "),
        # scikit-learn documents stop_words as 'english', a list or None, and
        # recent releases reject other containers, so the set is converted.
        stop_words=list(stopwords),
    )  # an n-gram variant, ngram_range=(1, 3), was left disabled by the author
    tfidf_matrix = tfidf_vectorizer.fit_transform(content)
    print("vocabulary size: %s" % len(tfidf_vectorizer.vocabulary_))
    return tfidf_matrix

## LDA

In [None]:
def lda(matrix, n):
    """Fit Latent Dirichlet Allocation on ``matrix`` and return its transform.

    Args:
        matrix: Document-term matrix to fit on (the notebook passes the TF-IDF
            matrix).
        n: Number of topics to learn.

    Returns:
        The matrix returned by ``LatentDirichletAllocation.fit_transform``.
    """
    # `n_components` replaces the `n_topics` keyword, which was deprecated in
    # scikit-learn 0.19 and later removed. The estimator is also bound to a
    # name other than `lda` so the function is not shadowed inside its body.
    model = LatentDirichletAllocation(n_components=n, max_iter=100, learning_method='online', learning_offset=50.)
    return model.fit_transform(matrix)

# Кластеризация

## DBScan

In [None]:
def dbscan(matrix, eps, samples, n_jobs):
    """Cluster ``matrix`` with DBSCAN, print a short summary, return the labels.

    Args:
        matrix: Feature matrix to cluster.
        eps: Passed to ``DBSCAN`` as ``eps``.
        samples: Passed to ``DBSCAN`` as ``min_samples``.
        n_jobs: Passed to ``DBSCAN`` as ``n_jobs``.

    Returns:
        The fitted estimator's ``labels_`` array.
    """
    model = DBSCAN(eps=eps, min_samples=samples, n_jobs=n_jobs)
    model.fit(matrix)
    labels = model.labels_
    # The label -1 is excluded from the cluster count when it is present.
    cluster_count = len(set(labels)) - (1 if -1 in labels else 0)
    print('count clusters: %d' % cluster_count)
    label_list = labels.tolist()
    print("-1: %s, 0: %s" % (label_list.count(-1), label_list.count(0)))
    return labels

## KMeans

In [None]:
def kmeans(matrix, n, n_jobs):
    """Cluster ``matrix`` into ``n`` groups with K-Means and return the labels.

    Args:
        matrix: Feature matrix to cluster.
        n: Number of clusters.
        n_jobs: Parallelism hint; forwarded to ``KMeans`` only when the
            installed scikit-learn still accepts it.

    Returns:
        The fitted estimator's ``labels_`` array.
    """
    try:
        km = KMeans(n_clusters=n, n_jobs=n_jobs)
    except TypeError:
        # scikit-learn >= 1.0 no longer accepts n_jobs for KMeans; the
        # constructor rejects the unknown keyword, so build it without it.
        km = KMeans(n_clusters=n)
    return km.fit(matrix).labels_

# Проверка качества кластеризации

In [None]:
def score(matrix, marked_labels, clustered_labels):
    """Print clustering quality metrics for one clustering result.

    Args:
        matrix: Feature matrix the clustering was computed on (used for the
            silhouette coefficient).
        marked_labels: Ground-truth labels.
        clustered_labels: Labels assigned by the clustering algorithm.
    """
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(marked_labels, clustered_labels))
    print("Completeness: %0.3f" % metrics.completeness_score(marked_labels, clustered_labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(marked_labels, clustered_labels))
    # Score the labels that were passed in; the original read `km.labels_`,
    # a name that does not exist in this scope (NameError).
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(marked_labels, clustered_labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(matrix, clustered_labels, sample_size=1000))

# Тест 

In [4]:
n_clusters = 130 # Number of clusters
# NOTE(review): 100000 looks very large for a topic count - confirm it is intended.
n_topics = 100000 # Number of topics for LDA
# Passed to dbscan() as `samples`, i.e. DBSCAN's min_samples.
sampels = 10
# Passed to dbscan() as DBSCAN's eps.
eps = 1.3
n_jobs = 1 # Number of threads used for clustering

In [None]:
tfidf_matrix = tf_idf(news_content)
# Keep the LDA output under its own name: the original `lda = lda(...)`
# replaced the lda() function with its result, so the cell failed on re-run.
lda_matrix = lda(tfidf_matrix, n_topics)

# Each experiment is (name, feature matrix, clustering call). The four
# stanzas were copy-pasted before; the printed output is unchanged.
experiments = [
    ("kmeans tf-idf", tfidf_matrix, lambda m: kmeans(m, n_clusters, n_jobs)),
    ("kmeans lda", lda_matrix, lambda m: kmeans(m, n_clusters, n_jobs)),
    ("dbscan tf-idf", tfidf_matrix, lambda m: dbscan(m, eps, sampels, n_jobs)),
    ("dbscan lda", lda_matrix, lambda m: dbscan(m, eps, sampels, n_jobs)),
]

for experiment_name, experiment_matrix, run_clustering in experiments:
    print("%s..." % experiment_name)
    start_time = time.time()
    kk_labels = run_clustering(experiment_matrix)
    score(experiment_matrix, marked_labels, kk_labels)
    # The reported time covers both the clustering and the scoring.
    print("%s: %s second" % (experiment_name, time.time() - start_time))