In [1]:
from sklearn.decomposition  import LatentDirichletAllocation 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from doc_pb2 import Document, Dictionary
from tqdm import tqdm
from pprint import pprint
from functools import reduce
import numpy as np
import heapq
import pickle
import jieba
import lmdb
import os

In [3]:
def read_all_texts(lmdb_path: str, l=None):
    """Read the text of the first ``l`` documents stored in an LMDB environment.

    Every value yielded by the cursor is parsed as a serialized ``Document``
    message and reduced to ``title + content``.

    Args:
        lmdb_path: Path handed to ``lmdb.open``.
        l: Stop as soon as this many documents have been collected.  The check
            runs after each append, so ``None`` (the default) or any value
            below 1 never matches and every record is read.

    Returns:
        A list of ``title + content`` strings in cursor iteration order.
    """
    db = lmdb.open(lmdb_path)
    try:
        docs = []
        with db.begin() as txn:
            for _key, value in txn.cursor():
                doc = Document()
                doc.ParseFromString(value)
                docs.append(doc.title + doc.content)
                if len(docs) == l:
                    break
        return docs
    finally:
        # Close the environment even when parsing or iteration raises, so a
        # failed read does not leak the open LMDB handle.
        db.close()

def read_all_indexs(lmdb_path: str, l=None):
    """Read the first ``l`` keys stored in an LMDB environment.

    Args:
        lmdb_path: Path handed to ``lmdb.open``.
        l: Stop as soon as this many keys have been collected.  The check runs
            after each append, so ``None`` (the default) or any value below 1
            never matches and every key is read.

    Returns:
        A list of the raw keys (as yielded by the cursor) in iteration order.
    """
    db = lmdb.open(lmdb_path)
    try:
        keys = []
        with db.begin() as txn:
            # Only the keys are kept, so ask the cursor not to fetch values.
            for key in txn.cursor().iternext(keys=True, values=False):
                keys.append(key)
                if len(keys) == l:
                    break
        return keys
    finally:
        # Close the environment even when iteration raises, so a failed read
        # does not leak the open LMDB handle.
        db.close()

In [4]:
%%time
# load stopwords
# Every file in the stopwords directory is read and split on '\n'; the pieces
# from all files are concatenated into one flat list.
# NOTE(review): absolute, machine-specific path; the other data paths in this
# notebook are relative ('../../data/...'), so this one probably should be too.
stopwords_dir_path = r'C:\Users\zjxua\GitHub\CAS-NLP\data\stopwords'
stopwords_filelist = [os.path.join(stopwords_dir_path, p) for p in os.listdir(stopwords_dir_path)]
stopwords = []
for stopwords_file in stopwords_filelist:
    # Close each file deterministically instead of leaving the handle to the
    # garbage collector.  The platform default encoding is used, as before --
    # TODO confirm the files' encoding and pass it explicitly.
    with open(stopwords_file) as f:
        # extend() keeps this linear and also yields [] for an empty
        # directory, where reduce() without an initial value would raise.
        stopwords.extend(f.read().split('\n'))

Wall time: 19.5 ms


In [5]:
%%time
# Load title + content for at most 100000 documents from the LMDB store.
documents_lmdb_path = '../../data/news data/documents'
max_documents = 100000
all_texts = read_all_texts(documents_lmdb_path, max_documents)

Wall time: 8.22 s


In [7]:
%%time
# features
# Term counts over jieba tokens.  Terms found in more than 90% of the
# documents or in fewer than 20 documents are dropped, as are the stopwords.
tf_vectorizer = CountVectorizer(
    tokenizer=jieba.lcut,
    stop_words=stopwords,
    min_df=20,
    max_df=0.9,
)

# Learn the vocabulary and build the document-term count matrix in one pass.
tf_matrix = tf_vectorizer.fit_transform(all_texts)
print(tf_matrix.shape)

(100000, 46981)
Wall time: 4min 56s


In [10]:
%%time
# Fit a 1000-topic LDA model with the online learning method and keep the
# document-topic matrix returned by fit_transform.
# `n_components` replaces the `n_topics` keyword, which was deprecated in
# scikit-learn 0.19 and removed in 0.21.
lda = LatentDirichletAllocation(n_components=1000, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

docres = lda.fit_transform(tf_matrix)

Wall time: 6h 53min 53s


In [3]:
%%time
# Persist the artefacts that the later cells reload from disk.
# np.save/open do not create missing directories, so make sure it exists.
os.makedirs('dumps', exist_ok=True)

np.save('dumps/doc2topic.npy', docres, allow_pickle=True, fix_imports=True)
np.save('dumps/topic2word.npy', lda.components_, allow_pickle=True, fix_imports=True)
# Wall time: 12.7 s
with open('dumps/lda.pickle', 'wb') as f:
    pickle.dump(lda, f)
# Wall time: 114 ms

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed release has it and fall back to the old name otherwise.
get_names = getattr(tf_vectorizer, 'get_feature_names_out', None) or tf_vectorizer.get_feature_names
with open('dumps/feature_names.txt', 'w', encoding='utf-8') as f:
    # One feature name per line, in vocabulary order.
    # NOTE(review): a feature that itself contains a newline would break the
    # line/column correspondence when the file is read back -- confirm that
    # the tokenizer never produces such a feature.
    f.writelines(name + '\n' for name in get_names())
# Wall time: 5.87 s

# Same path and limit as the call that produced all_texts, so line i of
# indexs.txt is meant to match row i of doc2topic (assumes the cursor yields
# records in the same order on both reads -- TODO confirm).
indexs = read_all_indexs('../../data/news data/documents', 100000)
# Wall time: 121 ms
with open('dumps/indexs.txt', 'w', encoding='utf-8') as f:
    f.writelines(index.decode('utf-8') + '\n' for index in indexs)

In [5]:
%%time
# Reload the saved matrices and their row/column labels.
doc2topic = np.load('dumps/doc2topic.npy')
topic2word = np.load('dumps/topic2word.npy')
# Iterating a text file yields each line with its trailing '\n'.  Strip it
# here so the labels are clean wherever they are used; otherwise every url
# copied into the protobuf dictionary ends with a newline.
with open('dumps/feature_names.txt', 'r', encoding='utf-8') as f:
    feature_names = [line.rstrip('\n') for line in f]
with open('dumps/indexs.txt', 'r', encoding='utf-8') as f:
    urls = [line.rstrip('\n') for line in f]

Wall time: 551 ms


In [21]:
my_dic = Dictionary()
# For every topic, find the 1000 documents with the highest probability and
# store them, keyed by rank, under that topic.
n_top_docs = 1000
topic_num = doc2topic.shape[1]
for topic in tqdm(range(topic_num)):
    topic_entry = my_dic.topic2doc.get_or_create(topic)
    topic_entry.topic_code = topic

    topic_column = doc2topic[:, topic]
    # Ascending argsort, reversed: document rows ordered by descending value.
    best_rows = topic_column.argsort()[::-1][:n_top_docs]
    for rank, row in enumerate(best_rows):
        ranked_doc = topic_entry.docs.get_or_create(rank)
        ranked_doc.proba = topic_column[row]
        ranked_doc.url = urls[row]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:24<00:00, 41.29it/s]


In [22]:
%%time
# Write the serialized bytes of my_dic to dumps/dict.dump.
with open('dumps/dict.dump', 'wb') as dump_file:
    payload = my_dic.SerializeToString()
    dump_file.write(payload)

Wall time: 1min 13s


In [32]:
%%time
# Rebuild a Dictionary message from the bytes stored in dumps/dict.dump.
parse_dict = Dictionary()
with open('dumps/dict.dump', 'rb') as dump_file:
    raw_bytes = dump_file.read()
    parse_dict.ParseFromString(raw_bytes)

Wall time: 59.2 s


In [33]:
# For every word, find the 100 topics with the largest values in its
# topic2word column and store them, keyed by rank, under that word.
n_top_topics = 100
word_num = topic2word.shape[1]
for word_index in tqdm(range(word_num)):
    word = feature_names[word_index].replace('\n', '')

    word_entry = parse_dict.word2topic.get_or_create(word)
    word_entry.word = word

    word_column = topic2word[:, word_index]
    # Ascending argsort, reversed: topic rows ordered by descending value.
    best_topics = word_column.argsort()[::-1][:n_top_topics]
    for rank, topic_code in enumerate(best_topics):
        ranked_topic = word_entry.topics.get_or_create(rank)
        ranked_topic.proba = word_column[topic_code]
        ranked_topic.topic_code = topic_code

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46981/46981 [01:31<00:00, 514.89it/s]


In [34]:
%%time
# Write the serialized bytes of parse_dict to dumps/full_dict.dump.
with open('dumps/full_dict.dump', 'wb') as dump_file:
    payload = parse_dict.SerializeToString()
    dump_file.write(payload)

Wall time: 7min 52s


In [31]:
%%time
# Parse dumps/dict.dump into a fresh Dictionary message.
# NOTE(review): this reads dict.dump, not full_dict.dump, and the cell's
# execution count precedes the cells above it -- confirm which file is meant.
dictionary = Dictionary()
with open('dumps/dict.dump', 'rb') as dump_file:
    serialized = dump_file.read()
dictionary.ParseFromString(serialized)

Wall time: 1min 34s
