In [35]:
from loader import read_articles_from_txt

In [37]:
import pkuseg

In [472]:
from scipy.sparse.linalg import svds

In [38]:
# Build a single pkuseg segmenter with its default arguments and keep it in
# `seg` so later cells can reuse the same instance.
seg = pkuseg.pkuseg()

In [39]:
def cutter(sentence):
    """Segment ``sentence`` with the notebook-level ``seg`` segmenter.

    ``seg`` is looked up when the function is called, so re-running the cell
    that creates ``seg`` is picked up without redefining ``cutter``.

    Args:
        sentence: The text to segment.

    Returns:
        Whatever ``seg.cut(sentence)`` returns; the notebook passes it on as
        the ``terms`` of an ``Article``.
    """
    return seg.cut(sentence)

In [51]:
from typing import Tuple, List

In [223]:
import numpy as np
from scipy.sparse import csr_matrix

In [475]:
from random import randint

In [518]:
from loader import read_articles_from_txt

In [224]:
class Article:
    """A named article together with its already-segmented terms."""

    def __init__(self, name: str = '', terms: List[str] = None):
        """Store the article name and its term list.

        Args:
            name: Title (or any identifier) of the article.
            terms: Segmented terms of the article body. When omitted, a fresh
                empty list is created for this instance; a shared ``[]``
                default would leak terms between articles.
        """
        self.name = name
        # Keep the caller's list object when one is given (no copy).
        self.terms = [] if terms is None else terms


In [534]:
class ArticleStats:
    """Incrementally builds a document-term count matrix and derives TF, IDF and TF-IDF.

    Rows are articles in insertion order; columns are terms in order of first
    appearance. The matrix is stored as the three raw CSR arrays
    (data, indices, indptr) so that adding an article is a plain append.
    """

    def __init__(self):

        # Article names, one per matrix row, in insertion order.
        self.article_names = list()

        # term -> column index
        self.term_index = dict()

        # column index -> term
        self.index_term = dict()

        # Raw arguments for scipy.sparse.csr_matrix. Every term occurrence is
        # appended as a separate entry of value 1, so a term repeated inside
        # one article appears several times until the matrix is built.
        self.term_document_matrix_data = list()
        self.term_document_matrix_indptr = [0]
        self.term_document_matrix_indices = list()

    def add_article(self, article: 'Article'):
        """Append one article as a new row of the document-term matrix.

        Args:
            article: Object exposing ``name`` and ``terms`` (an iterable of
                strings). Terms not seen before get the next free column index.
        """
        self.article_names.append(article.name)

        for term in article.terms:
            # setdefault returns the existing index, or registers the next one.
            column = self.term_index.setdefault(term, len(self.term_index))
            self.index_term[column] = term

            self.term_document_matrix_indices.append(column)
            self.term_document_matrix_data.append(1)

        # Close the row: indptr[i + 1] marks where row i ends in `indices`.
        self.term_document_matrix_indptr.append(len(self.term_document_matrix_indices))

    def get_term_document_matrix(self):
        """Return the sparse count matrix of shape (n_articles, n_terms).

        Entry (i, j) is the number of times term j occurs in article i.
        Repeated occurrences are merged so each (article, term) pair is stored
        once, and the shape is passed explicitly instead of being inferred
        from the largest index.
        """
        indptr = self.term_document_matrix_indptr
        shape = (len(indptr) - 1, len(self.term_index))

        matrix = csr_matrix(
            (self.term_document_matrix_data, self.term_document_matrix_indices, indptr),
            shape=shape,
            dtype=int,
        )
        # Merge the per-occurrence entries into one count per (article, term).
        matrix.sum_duplicates()

        return matrix

    def get_tf(self):
        """Return term frequencies: each row of counts divided by that row's total.

        An article without any terms keeps an all-zero row instead of
        triggering a division by zero.
        """
        doc_matrix = self.get_term_document_matrix()

        # Column vector (n_articles, 1) holding the number of terms per article.
        doc_lengths = np.asarray(doc_matrix.sum(axis=1), dtype=float)
        inverse_lengths = np.zeros_like(doc_lengths)
        np.divide(1.0, doc_lengths, out=inverse_lengths, where=doc_lengths > 0)

        return doc_matrix.multiply(inverse_lengths)

    def get_idf(self):
        """Return a 1-D array with ``log(n_articles / document_frequency)`` per term.

        The document frequency of a term is the number of articles that
        contain it at least once.
        """
        doc_matrix = self.get_term_document_matrix()
        n_docs, n_terms = doc_matrix.shape

        # The matrix stores each (article, term) pair once, so counting how
        # often a column index is stored gives the document frequency.
        document_frequency = np.bincount(doc_matrix.indices, minlength=n_terms)

        return np.log(n_docs / document_frequency)

    def get_tf_idf(self):
        """Return TF-IDF: the term frequencies scaled column-wise by the IDF vector."""
        return self.get_tf().multiply(self.get_idf())

    # term_index: { 'a': 10, 'b': 2, 'c': 9, 'e': 7 }
    # terms:      [ 'c', 'a',  'a', 'b', 'b', 'e', 'b' ]
    # indexes:    [  9,  10,   10,   2,   2,   7,   2  ]
    def terms_to_indexes(self, terms: List[str]) -> List[int]:
        """Map each term to its column index, keeping the input order and length.

        NOTE(review): a term missing from the vocabulary is mapped to a
        randomly chosen existing column (the original behaviour, kept so the
        output length matches the input) - confirm this is intended. The
        random number is now drawn only for such unknown terms. With an empty
        vocabulary an unknown term raises ValueError from ``randint``.
        """
        nterms = len(self.term_index)
        indexes = list()
        for term in terms:
            column = self.term_index.get(term)
            if column is None:
                column = randint(0, nterms - 1)
            indexes.append(column)

        return indexes

    def terms_to_new_row(self, terms: List[str]) -> np.ndarray:
        """Return a dense ``(1, n_terms)`` float array of term counts for ``terms``.

        The row uses the same column layout as the document-term matrix.
        """
        n_terms = len(self.term_index)
        # dtype=int keeps the index array integral even when `terms` is empty.
        columns = np.array(self.terms_to_indexes(terms), dtype=int)
        counts = np.bincount(columns, minlength=n_terms).astype(float)

        return counts.reshape((1, n_terms))

In [535]:
# Read the article titles and bodies from the corpus file.
# NOTE(review): assumes read_articles_from_txt returns two parallel sequences
# (names, contents) - confirm against loader.py.
article_names, article_contents = read_articles_from_txt('data/rmrb.txt')

# Segment every body into terms and pair it with its title.
articles = [
    Article(name=name, terms=cutter(content))
    for name, content in zip(article_names, article_contents)
]

# Feed every article into the corpus-level statistics, in file order.
rmrb = ArticleStats()
for article in articles:
    rmrb.add_article(article)

opening data/rmrb.txt
Found: 中共中央印发中国共产党地方组织选举工作条例 , length: 510
Found: 中共中央致电祝贺朝鲜劳动党八大召开 , length: 477
Found: 农业科技进步贡献率超60%农业农村现代化迈上新台阶 , length: 1647
Found: 提高新时代地方党组织选举质量的制度保证 , length: 1335
Found: 全国宣传部长会议在京召开王沪宁出席并讲话 , length: 843
Found: 通海蔬菜远销海外 , length: 1172
Found: 复兴号高寒动车组亮相 , length: 189
Found: 胡春华强调立足新发展阶段推动农民工工作取得更大成就 , length: 513
Found: 国办印发《意见》进一步优化地方政务服务便民热线 , length: 1080
Found: 民生欢歌，旋律更高昂 , length: 2196
Found: 推动住房和城乡建设事业高质量发展 , length: 2406
Found 11 articles.


In [532]:
# Spot check: show the column index that `rmrb` assigned to this term.
rmrb.term_index['地方']

16

In [533]:
# Spot check: show the column index that `rmrb` assigned to this term.
rmrb.term_index['组织']

17

In [526]:
# Scratch check of the (data, indices, indptr) constructor: a 1 x 100 matrix
# whose only stored entry is the value 0 at column 0, shown as a dense array.
csr_matrix(([0], [0], [0,1]), shape=(1, 100)).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [471]:
class LatentSemanticAnalyzer:
    """Truncated SVD of a document-term matrix plus projection of new rows.

    After ``perform_svd`` the factors are available as ``svd_u``, ``svd_s``
    and ``svd_vh`` exactly as returned by ``scipy.sparse.linalg.svds``.
    """

    def __init__(self, term_document_matrix):
        """Keep the matrix to factorise; no computation happens here.

        Args:
            term_document_matrix: 2-D matrix (sparse or dense) exposing
                ``shape``; passed to ``svds`` by ``perform_svd``.
        """
        self.doc_matrix = term_document_matrix

        # Filled in by perform_svd(); None means "not factorised yet".
        self.svd_u = None
        self.svd_s = None
        self.svd_vh = None

    def perform_svd(self, k = 0):
        """Compute a rank-``k`` truncated SVD and store the factors on ``self``.

        Args:
            k: Number of singular triplets to compute. ``0`` (the default)
                selects ``min(matrix.shape) - 1``.

        Raises:
            ValueError: If the resulting ``k`` is smaller than 1, e.g. when
                the matrix has a single row or column.
        """
        if k == 0:
            k = min(self.doc_matrix.shape) - 1
        if k < 1:
            raise ValueError(
                'k must be at least 1; got %r for a matrix of shape %r'
                % (k, tuple(self.doc_matrix.shape))
            )

        matrix = self.doc_matrix
        # svds works on floating-point data; integer count matrices (such as
        # a dtype=int term-document matrix) are converted first.
        dtype = getattr(matrix, 'dtype', None)
        if (
            dtype is not None
            and not np.issubdtype(dtype, np.inexact)
            and hasattr(matrix, 'astype')
        ):
            matrix = matrix.astype(float)

        u, s, vh = svds(matrix, k = k)

        self.svd_u = u
        self.svd_s = s
        self.svd_vh = vh

    def to_feature_coord(self, origin_coord):
        """Project a vector from the original column space onto the SVD features.

        Args:
            origin_coord: Array-like with as many elements as the matrix has
                columns (any shape that reshapes to one row).

        Returns:
            1-D array of length ``k``: ``origin_coord @ svd_vh.T``.

        Raises:
            RuntimeError: If ``perform_svd`` has not been called yet.
        """
        if self.svd_vh is None:
            raise RuntimeError('perform_svd() must be called before to_feature_coord()')

        basis = self.svd_vh.T  # shape (n_columns, k)
        row = np.reshape(origin_coord, (1, basis.shape[0]))
        feature_coord = np.matmul(row, basis)

        return feature_coord[0, :]
        

In [473]:
# Display the whole term -> column-index vocabulary built by `rmrb`.
# NOTE(review): this prints every entry; consider showing only a small slice.
rmrb.term_index

{'新华社': 0,
 '北京': 1,
 '1月': 2,
 '6日': 3,
 '电': 4,
 '近日': 5,
 '，': 6,
 '中共中央': 7,
 '印发': 8,
 '了': 9,
 '修订': 10,
 '后': 11,
 '的': 12,
 '《': 13,
 '中国': 14,
 '共产党': 15,
 '地方': 16,
 '组织': 17,
 '选举': 18,
 '工作': 19,
 '条例': 20,
 '》': 21,
 '（': 22,
 '以下': 23,
 '简称': 24,
 '）': 25,
 '并': 26,
 '发出': 27,
 '通知': 28,
 '要求': 29,
 '各': 30,
 '地区': 31,
 '部门': 32,
 '认真': 33,
 '遵照': 34,
 '执行': 35,
 '。': 36,
 '指出': 37,
 '以': 38,
 '习近平': 39,
 '新时代': 40,
 '特色': 41,
 '社会主义': 42,
 '思想': 43,
 '为': 44,
 '指导': 45,
 '党章': 46,
 '根本': 47,
 '遵循': 48,
 '深入': 49,
 '贯彻': 50,
 '党': 51,
 '十九大': 52,
 '和': 53,
 '十九': 54,
 '届': 55,
 '二中': 56,
 '、': 57,
 '三中': 58,
 '四中': 59,
 '五中全会': 60,
 '精神': 61,
 '落实': 62,
 '建设': 63,
 '总': 64,
 '路线': 65,
 '是': 66,
 '基本': 67,
 '实施': 68,
 '对于': 69,
 '坚持': 70,
 '加强': 71,
 '全面': 72,
 '领导': 73,
 '健全': 74,
 '维护': 75,
 '集中': 76,
 '统一': 77,
 '制度': 78,
 '规范': 79,
 '提高': 80,
 '执政': 81,
 '能力': 82,
 '水平': 83,
 '具有': 84,
 '重要': 85,
 '意义': 86,
 '各级': 87,
 '党委': 88,
 '要': 89,
 '增强': 90,
 '“': 91,
 '四': 92,

In [498]:
# Show `c` as a dense array.
# NOTE(review): `c` is not defined in any cell visible here - confirm a
# defining cell exists so this runs on a fresh kernel.
c.toarray()

array([[0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1.]])

In [485]:
# Show `c` as a dense array (this run displayed integer values).
# NOTE(review): `c` is not defined in any cell visible here - confirm a
# defining cell exists so this runs on a fresh kernel.
c.toarray()

array([[0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1]])

In [483]:
# IPython help lookup for csr_matrix (interactive aid).
# NOTE(review): consider removing this cell from the finished notebook.
?csr_matrix

Init signature: csr_matrix(arg1, shape=None, dtype=None, copy=False)
Docstring:
Compressed Sparse Row matrix

This can be instantiated in several ways:
    csr_matrix(D)
        with a dense matrix or rank-2 ndarray D

    csr_matrix(S)
        with another sparse matrix S (equivalent to S.tocsr())

    csr_matrix((M, N), [dtype])
        to construct an empty matrix with shape (M, N)
        dtype is optional, defaulting to dtype='d'.

    csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
        where ``data``, ``row_ind`` and ``col_ind`` satisfy the
        relationship ``a[row_ind[k], col_ind[k]] = data[k]``.

    csr_matrix((data, indices, indptr), [shape=(M, N)])
        is the standard CSR representation where the column indices for
        row i are stored in ``i