In [4]:
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm

In [135]:
# Load the vocabulary table from CSV; later cells read the term string from
# its first column via positional indexing (iloc[..., 0]).
term_indexes = pd.read_csv('termdocmatrix/term_indexes.csv')

In [78]:
def no_punctuations_or_letters_or_digits_Q(word):
    """Tell whether ``word`` is a string free of punctuation, Latin letters and digits.

    Parameters
    ----------
    word : object
        Candidate term. Anything whose exact type is not ``str`` is rejected.

    Returns
    -------
    bool
        ``True`` when ``word`` is a ``str`` and none of its characters falls in
        the rejected code-point ranges listed below; ``False`` otherwise.
        An empty string contains no rejected character and therefore yields
        ``True``.
    """
    # Non-string cells (for example a float NaN read from the CSV) never pass.
    if type(word) is not str:
        return False

    # Code-point ranges that disqualify a word, written as regex set members.
    rejected_ranges = (
        '\u2000-\u206F',  # General Punctuation
        '\u0000-\u002F',  # ASCII controls, space, punctuation up to '/'
        '\u003A-\u0040',  # ':' through '@'
        '\u005B-\u0060',  # '[' through '`'
        '\u007B-\u007F',  # '{' through DEL
        '\u0080-\u00FF',  # Latin-1 Supplement
        '\uFF00-\uFF0F',  # first 16 code points of the fullwidth forms block
        '\u3000-\u303F',  # CJK Symbols and Punctuation
        '\u0030-\u0039',  # ASCII digits
        '\u0040-\u007F',  # '@', ASCII letters and the remaining ASCII symbols
    )
    rejected_class = '[' + ''.join(rejected_ranges) + ']'

    # The word is acceptable exactly when no rejected character is found.
    return re.search(rejected_class, word) is None

In [79]:
def length_equal_or_greater_than_2_Q(word):
    """Return ``True`` when ``len(word)`` is at least 2, ``False`` otherwise."""
    minimum_length = 2
    return not len(word) < minimum_length

In [83]:
# Positional row numbers 0 .. n_rows-1 of the vocabulary table.
all_indexes = list(range(len(term_indexes)))

def pass_rule(x):
    """Return ``True`` when ``x`` satisfies every term-selection rule.

    A value passes when it is accepted by
    ``no_punctuations_or_letters_or_digits_Q`` and by
    ``length_equal_or_greater_than_2_Q``.
    """
    # Order matters: the character check rejects non-strings first, so the
    # length check (which calls len) only ever runs on real strings.
    return (
        no_punctuations_or_letters_or_digits_Q(x)
        and length_equal_or_greater_than_2_Q(x)
    )
    
# Positional row numbers of the terms that satisfy pass_rule.
# The first column is walked once with enumerate instead of doing one scalar
# .iloc[i, 0] lookup per row; the resulting list of ints is the same.
selected_indexes = [
    position
    for position, term in enumerate(term_indexes.iloc[:, 0])
    if pass_rule(term)
]

In [85]:
# Display the vocabulary rows whose positions were kept in selected_indexes.
term_indexes.iloc[selected_indexes, :]

Unnamed: 0,term
5,问题
7,分支
8,定界
9,求解
10,思路
...,...
14906,发向
14915,获知
14955,自制
14973,没有用


In [87]:
# Read the comma-separated matrix from disk, then keep only the columns whose
# positions are in selected_indexes (the terms that passed the filter).
# NOTE(review): rows presumably correspond to documents and columns to the
# rows of term_indexes — confirm against how the file was produced.
doc_matrix_full = np.loadtxt('termdocmatrix/doc_matrix.txt', delimiter=',')
doc_matrix = doc_matrix_full[:, selected_indexes]

In [88]:
# Sanity check: (rows, kept columns) of the filtered matrix.
doc_matrix.shape

(73, 5173)

In [95]:
# Inverse document frequency per column: log(number of rows / number of rows
# in which the column is non-zero).
idf = np.log(
    np.true_divide(
        doc_matrix.shape[0],
        np.count_nonzero(doc_matrix, axis=0),
    )
)

In [96]:
# Weight every column of the matrix by its idf value (broadcast across rows).
tf_idf = np.multiply(doc_matrix, idf)

In [104]:
# Boolean mask of the rows that have at least one non-zero entry; all-zero
# rows are left out of the normalisation to avoid dividing by zero.
nonzeros = (tf_idf != 0).any(axis=1)

In [105]:
# Per-row sum of the non-empty rows, kept as a column vector of shape
# (n_rows, 1). Broadcasting then divides every entry of a row by that row's
# sum, so there is no need to materialise a full (n_rows, n_columns) copy of
# the sums with np.tile.
normalization_factor = np.sum(tf_idf[nonzeros, :], axis=1, keepdims=True)

In [106]:
# Scale each non-empty row by its normalisation factor, writing the result
# back into tf_idf in place.
tf_idf[nonzeros, :] = np.divide(tf_idf[nonzeros, :], normalization_factor)

In [107]:
# Display the tf_idf array after the in-place row normalisation.
tf_idf

array([[0.00553897, 0.0246926 , 0.01955867, ..., 0.        , 0.        ,
        0.        ],
       [0.00027235, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0006781 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.01252998, 0.        ,
        0.        ],
       [0.00399856, 0.        , 0.        , ..., 0.        , 0.03388642,
        0.03388642]])

In [128]:
# Thin SVD of the normalised matrix: tf_idf = u @ diag(s) @ vh.
u, s, vh = np.linalg.svd(tf_idf, full_matrices=False)
# Show the shape of each factor.
tuple(factor.shape for factor in (u, s, vh))

((73, 73), (73,), (73, 5173))

In [129]:
# Squared Frobenius error of rebuilding tf_idf from its SVD factors; a value
# near machine precision confirms the decomposition.
np.square(tf_idf - u @ np.diag(s) @ vh).sum()

7.073832030958538e-30

In [130]:
# Candidate query terms.
# NOTE(review): no later cell visible here reads query_words; the query vector
# further down is built from hard-coded column indexes instead — confirm
# whether this list is still needed.
query_words = ['分支', '定界', '求解']

In [170]:
# Take an explicit copy of the selected vocabulary rows so that adding a
# column modifies an independent frame; assigning on the bare .iloc slice is
# what triggers pandas' SettingWithCopyWarning.
selected_terms = term_indexes.iloc[selected_indexes, :].copy()

# Record, for every kept term, its column position inside doc_matrix
# (0 .. n_columns-1, in the same order as selected_indexes).
selected_terms['col_index_in_doc_matrix'] = np.arange(doc_matrix.shape[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_terms['col_index_in_doc_matrix'] = np.array(range(0, doc_matrix.shape[1]))


In [261]:
# Look up the row (and hence the doc_matrix column) of the term '缓存'.
selected_terms[selected_terms.iloc[:, 0] == '缓存']

Unnamed: 0,term,col_index_in_doc_matrix
1685,缓存,830


In [262]:
# Look up the row (and hence the doc_matrix column) of the term '书桌'.
selected_terms[selected_terms.iloc[:, 0] == '书桌']

Unnamed: 0,term,col_index_in_doc_matrix
1729,书桌,862


In [263]:
# Look up the row (and hence the doc_matrix column) of the term '节省'.
selected_terms[selected_terms.iloc[:, 0] == '节省']

Unnamed: 0,term,col_index_in_doc_matrix
1744,节省,875


In [265]:
# Look up the row (and hence the doc_matrix column) of the term '网页'.
selected_terms[selected_terms.iloc[:, 0] == '网页']

Unnamed: 0,term,col_index_in_doc_matrix
1016,网页,502


In [268]:
# doc_matrix columns of the query terms, as returned by the selected_terms
# lookups: 缓存 -> 830, 书桌 -> 862, 节省 -> 875, 网页 -> 502.
query_term_columns = [830, 862, 875, 502]

# Binary query vector over the kept vocabulary: 1 for each query term, 0
# elsewhere (integer dtype, one entry per doc_matrix column).
query_row = np.zeros(doc_matrix.shape[1], dtype=int)
query_row[query_term_columns] = 1

# Thin SVD of the raw (un-normalised) matrix. full_matrices=False returns only
# the min(rows, columns) right singular vectors that are used below, instead
# of allocating a square (columns x columns) vh matrix.
u1, d1, vh1 = np.linalg.svd(doc_matrix, full_matrices=False)

# Number of leading singular directions to keep: one per row of doc_matrix.
n_components = doc_matrix.shape[0]

# Coordinates of the query in the basis of right singular vectors.
coord = np.matmul(
    query_row.reshape((1, doc_matrix.shape[1])), vh1.T
)[0, 0:n_components]

# Coordinates of every doc_matrix row in the same basis (one row each).
coords = np.matmul(doc_matrix, vh1.T)[:, 0:n_components]

In [269]:
def my_cos(x1, x2):
    """Absolute cosine similarity between two vectors.

    Parameters
    ----------
    x1, x2 : array_like
        Vectors of equal shape.

    Returns
    -------
    float
        ``|x1 . x2| / (||x1|| * ||x2||)``, a value in ``[0, 1]`` up to rounding.
        When either vector has zero norm the cosine is undefined; ``0.0`` is
        returned in that case instead of dividing by zero (which produced a
        NaN together with a RuntimeWarning).
    """
    n_x1 = np.linalg.norm(x1)
    n_x2 = np.linalg.norm(x2)
    denominator = n_x1 * n_x2

    # An all-zero vector has no direction: report "no similarity".
    if denominator == 0:
        return 0.0

    # The absolute value makes the measure ignore the sign of the projection.
    inner_prod = np.abs(np.sum(x1 * x2))
    return inner_prod / denominator

In [270]:
# Similarity between the query coordinates and each row of coords, in row order.
coss = [my_cos(coord, doc_coord) for doc_coord in coords]

  return inner_prod / (n_x1 * n_x2)


In [271]:
# Convert the list of similarities to an ndarray so it can be compared
# element-wise against a threshold.
coss = np.array(coss)

In [273]:
# Boolean mask of the entries of coss that exceed 0.7885.
# NOTE(review): the threshold looks hand-tuned — confirm how it was chosen.
coss > 0.7885

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [274]:
# Boolean mask of the entries of coss that exceed the looser threshold 0.1.
coss > 0.1

array([False, False,  True, False, False, False, False,  True, False,
       False, False, False, False, False,  True, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False,  True, False,
       False])