In [275]:
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm
import pkuseg

In [135]:
term_indexes = pd.read_csv('termdocmatrix/term_indexes.csv')

In [321]:
def no_punctuations_or_letters_or_digits_Q(word):
    
    # if word is not str, then False
    if not (type(word) is str):
        return False
    
    # if word has digits, letters or punctuations
    result = re.search(
        '[' +
        '\u2000-\u206F' +  # 一般的符号
        '\u0000-\u002F' +  # 符号
        '\u003A-\u0040' +  # 符号
        '\u005B-\u0060' +  # 符号
        '\u007B-\u007F' +  # 符号
        '\uFF00-\uFFEF' +  # 全角字符
        '\u3000-\u303F' +  # 中文标点
        ']',
        word
    )
    
    if result is None:
        return True
    else:
        return False

In [322]:
def length_equal_or_greater_than_2_Q(word):
    return len(word) >= 2

In [327]:
all_indexes = list(range(0, term_indexes.shape[0]))

def pass_rule(x):
    
    if not no_punctuations_or_letters_or_digits_Q(x):
        return False
    
    if not length_equal_or_greater_than_2_Q(x):
        return False
    
    return True
    
selected_indexes = list(filter(
    lambda i: pass_rule(term_indexes.iloc[i, 0]),
    all_indexes
))

In [328]:
term_indexes.iloc[selected_indexes, :].to_csv('termdocmatrix/selected_terms.csv', index=False)

In [87]:
doc_matrix_full = np.loadtxt('termdocmatrix/doc_matrix.txt', delimiter=',')
doc_matrix = doc_matrix_full[:, selected_indexes]

In [313]:
np.sum(doc_matrix, axis=1)

array([1341., 2110.,  857., 1189.,    8.,  518., 1221., 2084., 2967.,
       1248., 1989.,    5.,  400.,  885.,  581., 1195., 1741.,  842.,
        184.,  102.,   64., 1881., 3457.,  708.,  194.,    9.,  380.,
        102.,  639.,  347., 1019.,  584.,  401.,  215.,  902.,  436.,
        124.,    0.,  381.,  606., 1331.,   22.,  812.,  957.,   16.,
        741., 2192.,   68.,  558.,  795., 1516., 1810.,  813.,  878.,
        947., 4083.,   46.,  219.,  834.,   51.,  400., 3255., 1490.,
         77.,  671.,  904.,  966.,   51.,  919., 1318., 1377.,  556.,
         92.])

In [88]:
doc_matrix.shape

(73, 5173)

In [95]:
idf = np.log(doc_matrix.shape[0]/np.count_nonzero(doc_matrix, axis = 0))

In [96]:
tf_idf = doc_matrix * idf 

In [104]:
nonzeros = np.count_nonzero(tf_idf, axis=1) != 0

In [105]:
normalization_factor = np.tile(np.sum(tf_idf[nonzeros, :], axis = 1), reps=[ doc_matrix.shape[1], 1 ]).T

In [106]:
tf_idf[nonzeros, :] = tf_idf[nonzeros, :]/normalization_factor

In [107]:
tf_idf

array([[0.00553897, 0.0246926 , 0.01955867, ..., 0.        , 0.        ,
        0.        ],
       [0.00027235, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0006781 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.01252998, 0.        ,
        0.        ],
       [0.00399856, 0.        , 0.        , ..., 0.        , 0.03388642,
        0.03388642]])

In [128]:
u, s, vh = np.linalg.svd(tf_idf, full_matrices=False)
u.shape, s.shape, vh.shape

((73, 73), (73,), (73, 5173))

In [129]:
np.sum((tf_idf - np.matmul(np.matmul(u, np.diag(s)), vh))**2)

7.073832030958538e-30

In [170]:
selected_terms = term_indexes.iloc[selected_indexes, :]
selected_terms['col_index_in_doc_matrix'] = np.array(range(0, doc_matrix.shape[1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_terms['col_index_in_doc_matrix'] = np.array(range(0, doc_matrix.shape[1]))


In [289]:
u,d,vh = np.linalg.svd(doc_matrix, full_matrices=True)

In [278]:
seg = pkuseg.pkuseg()

In [279]:
query_terms = seg.cut('网页缓存')

In [280]:
query_terms

['网页', '缓存']

In [286]:
indexes_to_set = list()
for term in query_terms:
    correspond_indexes = selected_terms.iloc[selected_terms.iloc[:, 0].to_numpy() == term, :]
    if correspond_indexes.shape[0]:
        indexes_to_set.append(correspond_indexes.iloc[0, 1])

In [288]:
query_row = np.array(range(doc_matrix.shape[1]))
query_row[:] = 0
for index_to_set in indexes_to_set:
    query_row[index_to_set] = 1

In [290]:
query_coord = np.matmul(query_row.reshape((1, doc_matrix.shape[1])), vh1.T)[0, 0:doc_matrix.shape[0]]
doc_coords = np.matmul(doc_matrix, vh1.T)[:, 0:doc_matrix.shape[0]]

In [291]:
def cos_of_two_vector(x1, x2):
    n_x1 = np.linalg.norm(x1)
    n_x2 = np.linalg.norm(x2)
    inner_prod = np.abs(np.sum(x1 * x2))
    return inner_prod / (n_x1 * n_x2)

In [293]:
cos_of_query_and_docs = [cos_of_two_vector(coord, doc_coords[i,:]) for i in range(doc_coords.shape[0])]

  return inner_prod / (n_x1 * n_x2)


In [309]:
cos_of_query_and_docs

[2.3132362493006597e-16,
 0.007398161806718659,
 0.7885719195177288,
 2.6897202958542385e-16,
 4.899329720416439e-17,
 1.1881759827980173e-17,
 0.012318763189771448,
 0.20332744408799136,
 0.004133441811905724,
 3.2491792429161596e-16,
 3.3506991792932823e-16,
 1.5578600237784392e-16,
 0.020578444687700752,
 0.012948239477395873,
 0.10444513208530037,
 0.009880505571827675,
 4.437921564084582e-16,
 0.1317224140776608,
 3.432004289563128e-17,
 1.6338009787566354e-16,
 6.295490638809145e-17,
 1.5897409305238466e-16,
 1.0426864022604856e-16,
 3.516525769220923e-17,
 0.0545999024913028,
 8.795559671635933e-17,
 5.842078162358262e-17,
 0.06427562977040387,
 0.06751842931971178,
 2.0587006615862056e-16,
 4.781647602759868e-16,
 8.978382161134312e-16,
 0.07026304525981104,
 8.449848021069044e-17,
 0.013623393366589659,
 1.4499742149952298e-16,
 1.6021658682268865e-16,
 nan,
 1.562673248673795e-16,
 1.0614289035574102e-16,
 0.04011678116961417,
 2.249493740021366e-16,
 5.279404629040544e-16,
 