In [1]:
from sklearn.decomposition  import LatentDirichletAllocation 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from doc_pb2 import Document
from tqdm import tqdm
from pprint import pprint
from functools import reduce
import pickle
import jieba
import lmdb
import os

In [2]:
def chunk_reader(lmdb_path: str, chunk_size: int):
    """
    Lazily read every record of an LMDB environment as parsed ``Document``s.

    Each stored value is parsed with ``Document.ParseFromString`` and the
    resulting messages are yielded in lists of ``chunk_size`` items, in
    cursor order. The trailing, shorter list is yielded too, so no document
    is dropped; with ``chunk_size`` set to -1 that trailing list is the only
    one and holds every document.

    Args:
        lmdb_path: path handed to ``lmdb.open``.
        chunk_size: the size of each chunk it yields, set to -1 to get all docs.

    Yields:
        Lists of parsed ``Document`` messages.
    """
    db = lmdb.open(lmdb_path)
    try:
        docs = []
        with db.begin() as txn:
            for _, raw in txn.cursor():
                doc = Document()
                doc.ParseFromString(raw)
                docs.append(doc)
                if len(docs) == chunk_size:
                    yield docs
                    docs = []
        # A `return docs` in a generator is invisible to a plain `for` loop,
        # so the leftover documents have to be yielded explicitly.
        if docs:
            yield docs
    finally:
        # Runs on exhaustion, on error, and when the consumer abandons the
        # generator early, so the environment is always released.
        db.close()

def read_all_texts(lmdb_path: str, l: int = -1):
    """
    Collect ``doc.title + doc.content`` for the records of an LMDB environment.

    Args:
        lmdb_path: path handed to ``lmdb.open``.
        l: stop once this many texts have been collected. The check is an
            equality test made after each append, so the default of -1 (or
            any value that is not a positive integer) never matches and every
            record is read.

    Returns:
        A list with one ``doc.title + doc.content`` value per record read,
        in cursor order.
    """
    db = lmdb.open(lmdb_path)
    try:
        texts = []
        with db.begin() as txn:
            for _, raw in txn.cursor():
                doc = Document()
                doc.ParseFromString(raw)
                texts.append(doc.title + doc.content)
                if len(texts) == l:
                    break
    finally:
        # Release the environment even if parsing fails part-way through.
        db.close()
    return texts

In [6]:
%%time
# load stopwords
stopwords_dir_path = r'C:\Users\zjxua\GitHub\CAS-NLP\data\stopwords'
stopwords_filelist = [os.path.join(stopwords_dir_path, p) for p in os.listdir(stopwords_dir_path)]
stopwords = list(reduce(lambda x, y: x + y, [open(p).read().split('\n') for p in stopwords_filelist]))

Wall time: 17.5 ms


In [8]:
%%time
all_texts = read_all_texts('../../data/news data/documents')

Wall time: 1min 6s


In [9]:
%%time
# features
tf_vectorizer = CountVectorizer(tokenizer=lambda text: jieba.lcut(text), max_df=0.9, min_df=20, stop_words=stopwords)
    
tf_matrix = tf_vectorizer.fit_transform(all_texts)
tf_matrix.shape

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zjxua\AppData\Local\Temp\jieba.cache
Loading model cost 0.774 seconds.
Prefix dict has been built succesfully.


Wall time: 53min


In [18]:
# Persist the term-count matrix so the slow vectorization step need not be rerun.
with open('tf_matrix.pickle', 'wb') as pickle_file:
    pickle.dump(tf_matrix, pickle_file)

In [2]:
# Restore the term-count matrix from the pickle file on disk.
with open('tf_matrix.pickle', 'rb') as pickle_file:
    tf_matrix = pickle.load(pickle_file)

In [3]:
# Display the loaded matrix's repr (shape, dtype, storage format) as a sanity check.
tf_matrix

<1133280x180968 sparse matrix of type '<class 'numpy.int64'>'
	with 136364813 stored elements in Compressed Sparse Row format>

In [None]:
%%time
lda = LatentDirichletAllocation(n_topics=1000, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

lda.fit(tf_matrix)

In [None]:
# Transform the term-count matrix with the fitted LDA model; the result is
# pickled together with the model's components in a later cell.
docres = lda.transform(tf_matrix)

In [None]:
# Save the transformed documents together with the fitted model's components.
with open('doc2verb.pickle', 'wb') as pickle_file:
    pickle.dump((docres, lda.components_), pickle_file)

In [None]:
"hello?"

In [24]:
%%time
tf_feature_names = tf_vectorizer.get_feature_names()

print_top_words(lda, tf_feature_names, 10)