In [563]:
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm
import pkuseg
from typing import Tuple

In [564]:
def load_data_from_file(
    term_indexes_filename: str,
    article_indexes_filename: str,
    term_doc_matrix_filename: str
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
    
    # 加载 doc_matrix 的行标、列标与 term, article 的对应关系
    term_indexes = pd.read_csv(term_indexes_filename)
    articles = pd.read_csv(article_indexes_filename)
    doc_matrix = np.loadtxt(term_doc_matrix_filename, delimiter=',')
    
    return (
        term_indexes,
        articles,
        doc_matrix,
    )
    

In [480]:
# 这个函数判断一个 word(term) 里面有没有标点符号
def no_punctuations_or_letters_or_digits_Q(word):
    
    # if word is not str, then False
    if not (type(word) is str):
        return False
    
    # if word has digits, letters or punctuations
    result = re.search(
        '[' +
        '\u2000-\u206F' +  # 一般的符号
        '\u0000-\u002F' +  # 符号
        '\u003A-\u0040' +  # 符号
        '\u005B-\u0060' +  # 符号
        '\u007B-\u007F' +  # 符号
        '\uFF00-\uFFEF' +  # 全角字符
        '\u3000-\u303F' +  # 中文标点
        ']',
        word
    )
    
    return (result is None)

In [481]:
# 这个函数判断一个 word(term) 的长度是不是大于或等于 2
def length_equal_or_greater_than_2_Q(word):
    return len(word) >= 2

In [482]:
# 按照上述规则进行筛选
all_indexes = list(range(0, term_indexes.shape[0]))

def pass_rule(x):
    
    if not no_punctuations_or_letters_or_digits_Q(x):
        return False
    
    if not length_equal_or_greater_than_2_Q(x):
        return False
    
    return True
    
selected_indexes = list(filter(
    lambda i: pass_rule(term_indexes.iloc[i, 0]),
    all_indexes
))

In [459]:
selected_terms = term_indexes.iloc[selected_indexes, :]
selected_terms.to_csv('termdocmatrix/selected_terms.csv', index=False)

In [460]:
doc_matrix_full = np.loadtxt('termdocmatrix/doc_matrix.txt', delimiter=',')
doc_matrix = doc_matrix_full[:, selected_indexes]

In [467]:
# 是否没有 doc 的坐标全为 0
np.count_nonzero(np.sum(doc_matrix, axis=1)) == doc_matrix.shape[0]

True

In [468]:
# 计算tf-idf
idf = np.log(doc_matrix.shape[0]/np.count_nonzero(doc_matrix, axis = 0))
tf_idf = doc_matrix * idf 
normalization_factor = np.tile(np.sum(tf_idf, axis = 1), reps=[ doc_matrix.shape[1], 1 ]).T
tf_idf = tf_idf/normalization_factor

In [469]:
# 通过 svd 进行 lsa 分析
u, s, vh = np.linalg.svd(tf_idf, full_matrices=True)
u.shape, s.shape, vh.shape

((73, 73), (73,), (8819, 8819))

In [470]:
# 将 原始空间 的 doc 投影到 特征空间
matS = np.zeros(shape=doc_matrix.shape)
matS[0:s.shape[0], 0:s.shape[0]] = np.diag(s)
doc_coords = np.matmul(u, matS)

In [471]:
# 加载分词模型
seg = pkuseg.pkuseg()

In [473]:
# 计算子 term_doc_matrix 的角标与 term_index 的对应关系
col_num_in_new_doc_matrix = pd.DataFrame({
    'col_num_in_new_doc_matrix': np.array(range(0, doc_matrix.shape[1]))
})

selected_terms = selected_terms.reset_index(drop=True)
selected_terms = pd.concat(
    objs = [selected_terms, col_num_in_new_doc_matrix],
    axis = 1
)

In [474]:
# 建立一个 term 到 子term_doc_matrix 的角标 的对应关系
term_to_col_index = dict()
for i in range(selected_terms.shape[0]):
    term = selected_terms.iloc[i, 0]
    col_index = selected_terms.iloc[i, 2]
    term_to_col_index[term] = col_index

In [555]:
# 接受用户输入
query_terms = seg.cut('多边形')
query_terms

['多边形']

In [556]:
# 构建 query_row, 这个 query_row 相当于一个 doc
query_row = np.array(range(doc_matrix.shape[1]))
query_row[:] = 0
query_row = np.reshape(a = query_row, newshape = (1, query_row.shape[0]))
for term in query_terms:
    if term in term_to_col_index:
        col_index = term_to_col_index[term]
        query_row[0, col_index] = 1

In [557]:
# 是否可以进行搜索
np.sum(query_row) > 0

True

In [558]:
# 将 query_row 投影到 lsa 特征空间
query_coord = np.matmul(query_row, vh.T)

query_coord = query_coord[:, 0:doc_coords.shape[0]]
doc_coords = doc_coords[:, 0:doc_coords.shape[0]]

In [559]:
# 用这个函数计算两个向量的余弦值
def cos_of_two_vector(x1, x2):
    n_x1 = np.linalg.norm(x1)
    n_x2 = np.linalg.norm(x2)
    inner_prod = np.abs(np.sum(x1 * x2))
    
    if n_x1 * n_x2 == 0:
        n_x1 = n_x1 + (1E-10)
        n_x2 = n_x2 + (1E-10)
        
    return inner_prod / (n_x1 * n_x2)

In [560]:
# 计算 query_coord (它是查询关键字组成的 word_vector 的 lsa 特征空间的投影) 与
# doc 在特征空间中的投影的余弦值，以此来判断接近程度
cos_values = list()
for i in range(doc_coords.shape[0]):
    doc_coord = doc_coords[i, :]
    cos_value = cos_of_two_vector(query_coord, doc_coord)
    cos_values.append(cos_value)

In [561]:
# 更新匹配度那一列
articles['match_val'] = cos_values

In [562]:
# 展示搜索结果
articles.sort_values(by = 'match_val', ascending=False)

Unnamed: 0,article_name,row_num_in_doc_matrix,match_val
9,complex-plane-and-eulers-formular-and-liuhuis-...,9,8.072534e-01
46,methods-telling-if-a-point-is-inside-some-poly...,46,7.465705e-01
30,implementing-a-rubkis-cube-in-mathematica.md,30,1.331943e-01
34,introducing-my-new-project.md,34,9.781453e-16
8,cellular-automata-infectious-disease-simulatio...,8,8.321220e-16
...,...,...,...
12,correct-time-in-cloudcone-vps.md,12,9.507361e-18
38,learning-language-interoperability-and-linking.md,38,7.196818e-18
47,minimum-gulp-workspace.md,47,6.691183e-18
25,how-fast.md,25,4.023889e-18
