In [563]:
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm
import pkuseg
from typing import Tuple

In [564]:
def load_data_from_file(
    term_indexes_filename: str,
    article_indexes_filename: str,
    term_doc_matrix_filename: str
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
    """Read the two index tables and the term-document matrix from disk.

    The index tables relate the matrix's row/column labels to articles and
    terms.

    Args:
        term_indexes_filename: Path of a CSV file, parsed with ``pd.read_csv``.
        article_indexes_filename: Path of a CSV file, parsed with ``pd.read_csv``.
        term_doc_matrix_filename: Path of a comma-delimited text file, parsed
            with ``np.loadtxt``.

    Returns:
        A 3-tuple ``(term_indexes, articles, doc_matrix)`` holding the two
        DataFrames and the array, in that order.
    """
    # Tuple items are evaluated left to right, so the files are read in
    # argument order: term table, article table, then the matrix.
    return (
        pd.read_csv(term_indexes_filename),
        pd.read_csv(article_indexes_filename),
        np.loadtxt(term_doc_matrix_filename, delimiter=','),
    )
    

In [480]:
# Character class of every code point that disqualifies a term.
# Compiled once so the pattern is not rebuilt on every call.
_DISALLOWED_CHARS = re.compile(
    '[' +
    '\u2000-\u206F' +  # General Punctuation block
    '\u0000-\u002F' +  # ASCII control characters, space, and ! through /
    '\u003A-\u0040' +  # ASCII : through @
    '\u005B-\u0060' +  # ASCII [ through `
    '\u007B-\u007F' +  # ASCII { through DEL
    '\uFF00-\uFFEF' +  # Halfwidth and Fullwidth Forms (incl. full-width letters/digits)
    '\u3000-\u303F' +  # CJK Symbols and Punctuation
    ']'
)


def no_punctuations_or_letters_or_digits_Q(word):
    """Tell whether ``word`` is a string free of punctuation/symbol characters.

    Args:
        word: The candidate term. Any object is accepted.

    Returns:
        ``False`` when ``word`` is not a string (e.g. ``None`` or a NaN float),
        or when it contains at least one character from the disallowed
        ranges above; ``True`` otherwise. The empty string yields ``True``.

    NOTE(review): despite the function's name, ASCII letters (A-Z, a-z) and
    ASCII digits (0-9) are NOT part of the disallowed ranges, so a term such
    as ``'abc123'`` passes. Only their full-width forms are rejected. Confirm
    whether that is intended before tightening the pattern.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which behave like plain strings.
    if not isinstance(word, str):
        return False

    return _DISALLOWED_CHARS.search(word) is None

In [481]:
def length_equal_or_greater_than_2_Q(word):
    """Return True when ``word`` has a length of at least 2, else False.

    ``word`` must support ``len()``; nothing else is checked here.
    """
    minimum_length = 2
    return not (len(word) < minimum_length)

In [482]:
# Candidate row positions, one per row of term_indexes; these are filtered
# below using the term rules defined above.
all_indexes = list(range(term_indexes.shape[0]))

def pass_rule(x):
    """Return True only when ``x`` satisfies both term filters.

    The character check is evaluated first. It returns False for non-string
    values, and ``and`` short-circuits, so the length check (which calls
    ``len``) is never reached for such values.
    """
    return (
        no_punctuations_or_letters_or_digits_Q(x)
        and length_equal_or_greater_than_2_Q(x)
    )
    
# Row positions whose term (column 0 of term_indexes) passes pass_rule,
# kept in the same order as all_indexes.
selected_indexes = [
    row_pos
    for row_pos in all_indexes
    if pass_rule(term_indexes.iloc[row_pos, 0])
]

In [459]:
# Keep only the rows of the term table that passed the filter (all columns),
# then persist them without writing the DataFrame index as a CSV column.
selected_terms = term_indexes.iloc[selected_indexes]
selected_terms.to_csv(
    'termdocmatrix/selected_terms.csv',
    index=False,
)

In [460]:
# Load the full comma-delimited matrix from disk.
doc_matrix_full = np.loadtxt('termdocmatrix/doc_matrix.txt', delimiter=',')
# Keep every row but only the columns listed in selected_indexes, in that order.
# NOTE(review): columns of the result are renumbered 0..len(selected_indexes)-1;
# any lookup that still uses column numbers of the full matrix will not line up.
doc_matrix = doc_matrix_full[:, selected_indexes]

((73, 73), (73,), (8819, 8819))

In [724]:
def foo() -> np.ndarray:
    """Scratch helper: build and return a 1-D array holding 1, 2, 3."""
    values = [1, 2, 3]
    return np.array(values)

In [725]:
# Scratch check: evaluate foo() so the notebook displays the array it returns.
foo()

array([1, 2, 3])

In [726]:
def bar(a: np.ndarray):
    """Scratch helper: accept an argument, ignore it, and return the constant 1."""
    constant_result = 1
    return constant_result

In [727]:
# Scratch check: call bar() with an ndarray argument and display its return value.
bar(np.array([1,2,3]))

1

In [471]:
# Load the word-segmentation model (pkuseg constructed with no arguments).
seg = pkuseg.pkuseg()

In [723]:
# Scratch check: element-wise division of two equal-length lists with np.divide.
np.divide([1,2,3], [4,5,6])

array([0.25, 0.4 , 0.5 ])

In [474]:
# Map each selected term (column 0 of selected_terms) to the value stored in
# column 2 of the same row. If a term occurs more than once, the last row wins.
# NOTE(review): the original comment describes the values as indexes into the
# sub term-document matrix; confirm that column 2 really holds positions in the
# column-filtered doc_matrix rather than in the full matrix.
term_to_col_index = {
    selected_terms.iloc[row_pos, 0]: selected_terms.iloc[row_pos, 2]
    for row_pos in range(selected_terms.shape[0])
}

['多边形']

True

Unnamed: 0,article_name,row_num_in_doc_matrix,match_val
9,complex-plane-and-eulers-formular-and-liuhuis-...,9,8.072534e-01
46,methods-telling-if-a-point-is-inside-some-poly...,46,7.465705e-01
30,implementing-a-rubkis-cube-in-mathematica.md,30,1.331943e-01
34,introducing-my-new-project.md,34,9.781453e-16
8,cellular-automata-infectious-disease-simulatio...,8,8.321220e-16
...,...,...,...
12,correct-time-in-cloudcone-vps.md,12,9.507361e-18
38,learning-language-interoperability-and-linking.md,38,7.196818e-18
47,minimum-gulp-workspace.md,47,6.691183e-18
25,how-fast.md,25,4.023889e-18
