In [52]:
import json
import numpy as np
from itertools import groupby
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

from gensim.sklearn_api import LdaTransformer
import scipy
import logging, gensim, bz2
from gensim import corpora
from collections import defaultdict
from scipy.sparse import csr_matrix

In [3]:
class News:
    """Plain container for the fields of one news item."""

    def __init__(self, id, date, title, content, url, siteType):
        # Parameter names (including `id`, which shadows the builtin) must stay as they are:
        # from_json forwards the decoded JSON keys as keyword arguments.
        self.id, self.date, self.title = id, date, title
        self.content, self.url, self.siteType = content, url, siteType

    @classmethod
    def from_json(cls, json_str):
        """Create an instance from a string holding one JSON object.

        The decoded object is expanded into keyword arguments for the constructor,
        so its keys have to match the constructor's parameter names exactly;
        a missing or extra key results in a TypeError.
        """
        fields = json.loads(json_str)
        return cls(**fields)

In [4]:
# Every line of the dump is parsed as one standalone JSON object (one news item per line).
news = []
with open('/data/10k.test.normalized.json', encoding="utf8") as f:
    news.extend(map(News.from_json, f))

In [5]:
# Flatten the whitespace-separated tokens of every article into a single list.
words = []
for n in news:
    words += n.content.split()

# Tokens seen exactly once in the whole collection.
counts = Counter(words)
one_time = [word for word, seen in counts.items() if seen == 1]

# Number of token occurrences left after discarding the single-occurrence tokens.
remaining = len(words) - len(one_time)
print("total words: %s" % remaining)

total words: 2717122


In [6]:
# Tokens that occur only once are collected into a set for fast membership tests.
stopwords = {*one_time}

In [7]:
# Raw text of every article, in the same order as `news`.
content = [item.content for item in news]

In [9]:
# Send INFO-level log records (timestamp : level : message) to the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [16]:
# Lower-case each document and split it on whitespace into a list of tokens.
texts = [document.lower().split() for document in content]

In [19]:
# Corpus-wide occurrence count of every token, kept as a defaultdict(int).
frequency = defaultdict(int, Counter(token for text in texts for token in text))

# Keep only the tokens that occur more than once across all documents.
texts = [[tok for tok in doc if frequency[tok] > 1] for doc in texts]

In [20]:
# Build the token <-> integer id mapping from the filtered, tokenized documents.
dictionary = corpora.Dictionary(documents=texts)

2017-12-03 17:40:41,629 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-12-03 17:40:44,520 : INFO : built Dictionary(43455 unique tokens: ['экстремист', 'исламский', 'государство', 'обезглавливать', 'второй']...) from 10000 documents (total 2717157 corpus positions)


In [21]:
# Convert every tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [22]:
# Train a 100-topic LDA model on the bag-of-words corpus in a single pass.
# random_state is fixed so that re-running the notebook reproduces the same topics;
# without it every run starts from a different random initialisation.
# NOTE(review): chunksize=10000 appears to match the corpus size, so the single pass
# amounts to one model update — consider more passes if the topics look too similar.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=100,
    update_every=1,
    chunksize=10000,
    passes=1,
    random_state=42,
)

2017-12-03 17:42:02,679 : INFO : using symmetric alpha at 0.01
2017-12-03 17:42:02,680 : INFO : using symmetric eta at 2.3012311586698885e-05
2017-12-03 17:42:02,688 : INFO : using serial LDA version on this node
2017-12-03 17:42:52,465 : INFO : running online (single-pass) LDA training, 100 topics, 1 passes over the supplied corpus of 10000 documents, updating model once every 10000 documents, evaluating perplexity every 10000 documents, iterating 50x with a convergence threshold of 0.001000
2017-12-03 17:46:24,648 : INFO : -16.408 per-word bound, 86956.6 perplexity estimate based on a held-out corpus of 10000 documents with 2717157 words
2017-12-03 17:46:24,648 : INFO : PROGRESS: pass 0, at document #10000/10000
2017-12-03 17:48:00,611 : INFO : topic #55 (0.010): 0.010*"год" + 0.006*"россия" + 0.005*"становиться" + 0.004*"человек" + 0.004*"мочь" + 0.004*"украина" + 0.004*"сообщать" + 0.004*"страна" + 0.004*"дело" + 0.003*"время"
2017-12-03 17:48:00,612 : INFO : topic #73 (0.010): 0.0

In [29]:
# Inspect 10 topics with their 10 highest-weighted words each; the call writes the topics
# to the log and also returns them as (topic_id, description) pairs.
lda.print_topics(num_topics=10, num_words=10)

2017-12-03 17:50:45,191 : INFO : topic #66 (0.010): 0.007*"год" + 0.007*"человек" + 0.006*"россия" + 0.004*"мочь" + 0.004*"страна" + 0.004*"становиться" + 0.003*"время" + 0.003*"говорить" + 0.003*"рассказывать" + 0.002*"мир"
2017-12-03 17:50:45,192 : INFO : topic #62 (0.010): 0.008*"год" + 0.006*"россия" + 0.005*"человек" + 0.004*"мочь" + 0.003*"время" + 0.003*"становиться" + 0.003*"страна" + 0.003*"российский" + 0.003*"сообщать" + 0.003*"рубль"
2017-12-03 17:50:45,193 : INFO : topic #80 (0.010): 0.010*"год" + 0.006*"россия" + 0.006*"ребенок" + 0.005*"мочь" + 0.005*"украина" + 0.004*"президент" + 0.004*"страна" + 0.004*"становиться" + 0.003*"самый" + 0.003*"российский"
2017-12-03 17:50:45,194 : INFO : topic #15 (0.010): 0.012*"год" + 0.008*"россия" + 0.005*"человек" + 0.004*"мочь" + 0.004*"время" + 0.004*"новый" + 0.003*"страна" + 0.003*"первый" + 0.003*"данные" + 0.003*"день"
2017-12-03 17:50:45,195 : INFO : topic #84 (0.010): 0.011*"год" + 0.010*"россия" + 0.005*"российский" + 0.005*

[(66,
  '0.007*"год" + 0.007*"человек" + 0.006*"россия" + 0.004*"мочь" + 0.004*"страна" + 0.004*"становиться" + 0.003*"время" + 0.003*"говорить" + 0.003*"рассказывать" + 0.002*"мир"'),
 (62,
  '0.008*"год" + 0.006*"россия" + 0.005*"человек" + 0.004*"мочь" + 0.003*"время" + 0.003*"становиться" + 0.003*"страна" + 0.003*"российский" + 0.003*"сообщать" + 0.003*"рубль"'),
 (80,
  '0.010*"год" + 0.006*"россия" + 0.006*"ребенок" + 0.005*"мочь" + 0.005*"украина" + 0.004*"президент" + 0.004*"страна" + 0.004*"становиться" + 0.003*"самый" + 0.003*"российский"'),
 (15,
  '0.012*"год" + 0.008*"россия" + 0.005*"человек" + 0.004*"мочь" + 0.004*"время" + 0.004*"новый" + 0.003*"страна" + 0.003*"первый" + 0.003*"данные" + 0.003*"день"'),
 (84,
  '0.011*"год" + 0.010*"россия" + 0.005*"российский" + 0.005*"война" + 0.004*"руб" + 0.004*"украина" + 0.004*"человек" + 0.004*"страна" + 0.004*"мочь" + 0.003*"говорить"'),
 (47,
  '0.007*"год" + 0.005*"человек" + 0.004*"российский" + 0.004*"фото" + 0.004*"мочь" +

In [45]:
def get_doc_topic(corpus, model):
    """Return the document-topic distribution of `corpus` as a sparse matrix.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents accepted by `model`.
    model : topic model whose ``__getitem__(doc, eps=...)`` yields
        ``(topic_id, probability)`` pairs and which exposes ``num_topics``.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_documents, model.num_topics) where
    entry (i, j) is the probability of topic j in document i. Topics the model
    does not report for a document are left as zeros.
    """
    row_idx, col_idx, probs = [], [], []
    n_docs = 0
    for doc_no, doc in enumerate(corpus):
        n_docs = doc_no + 1
        # The dunder is called directly because `model[doc]` cannot pass `eps`.
        for topic_id, prob in model.__getitem__(doc, eps=0):
            row_idx.append(doc_no)
            col_idx.append(topic_id)
            probs.append(prob)
    # The per-document result is a list of (topic_id, prob) pairs, not a dense row,
    # so the matrix is assembled from coordinate triplets rather than passing the nested list.
    return csr_matrix((probs, (row_idx, col_idx)), shape=(n_docs, model.num_topics))

In [47]:
# Document-topic matrix of the whole corpus under the trained `lda` model.
res = get_doc_topic(corpus=corpus, model=lda)

In [49]:
# Number of topic columns per document. len() is not defined for SciPy sparse matrices
# (it raises TypeError), so the width is read from .shape instead.
res.shape[1]

100

In [50]:
# scikit-learn style wrapper around gensim's LDA with the same number of topics.
# random_state is fixed so that fitting gives the same topics on every run.
lda_r = LdaTransformer(
    num_topics=100,
    id2word=dictionary,
    random_state=42,
)

In [51]:
# Train the wrapped LDA model on the bag-of-words corpus and transform the same documents in one call.
# NOTE(review): presumably one row of topic weights per document — confirm the returned shape.
lda_r_res = lda_r.fit_transform(corpus)

2017-12-03 18:11:07,473 : INFO : using symmetric alpha at 0.01
2017-12-03 18:11:07,474 : INFO : using symmetric eta at 2.3012311586698885e-05
2017-12-03 18:11:07,480 : INFO : using serial LDA version on this node
2017-12-03 18:11:53,160 : INFO : running online (single-pass) LDA training, 100 topics, 1 passes over the supplied corpus of 10000 documents, updating model once every 2000 documents, evaluating perplexity every 10000 documents, iterating 50x with a convergence threshold of 0.001000
2017-12-03 18:11:53,162 : INFO : PROGRESS: pass 0, at document #2000/10000
2017-12-03 18:12:09,602 : INFO : merging changes from 2000 documents into a model of 10000 documents
2017-12-03 18:12:15,085 : INFO : topic #43 (0.010): 0.009*"россия" + 0.008*"год" + 0.005*"человек" + 0.004*"президент" + 0.004*"украина" + 0.004*"украинский" + 0.004*"рубль" + 0.004*"время" + 0.003*"военный" + 0.003*"российский"
2017-12-03 18:12:15,086 : INFO : topic #33 (0.010): 0.016*"год" + 0.008*"россия" + 0.006*"мочь" + 

In [61]:
# Store the transformer output as a CSR sparse matrix.
sparse_res = csr_matrix(lda_r_res)