In [4]:
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm

In [5]:
# Term table loaded from CSV; the filtering cell further down tests the value
# stored in the first column (position 0) of every row.
term_indexes = pd.read_csv('termdocmatrix/term_indexes.csv')

In [78]:
def no_punctuations_or_letters_or_digits_Q(word):
    """Tell whether `word` is a string free of Latin letters, digits and punctuation.

    Parameters
    ----------
    word : object
        Candidate term. Anything that is not a ``str`` (e.g. a float NaN read
        from a CSV) is rejected.

    Returns
    -------
    bool
        ``True`` when `word` is a ``str`` and none of its characters falls in
        one of the rejected Unicode ranges listed below; ``False`` otherwise.
        The empty string contains no rejected character, so it yields ``True``.
    """
    # isinstance (rather than an exact type check) also accepts str subclasses.
    if not isinstance(word, str):
        return False

    rejected_character = re.search(
        r'['
        r'\u0000-\u00FF'  # ASCII + Latin-1: controls, digits, letters, punctuation
        r'\u2000-\u206F'  # General Punctuation
        r'\u3000-\u303F'  # CJK Symbols and Punctuation
        r'\uFF00-\uFF65'  # full-width ASCII variants (digits, letters, punctuation
                          # such as U+FF1A/U+FF1B/U+FF1F) and half-width CJK punctuation
        r'\uFFE0-\uFFEE'  # full-width / half-width symbol variants
        r']',
        word
    )

    return rejected_character is None

In [79]:
def length_equal_or_greater_than_2_Q(word):
    """Return True when `word` has at least two items (characters, for a str)."""
    return not len(word) < 2

In [83]:
# Every row position of the term table: 0 .. number_of_rows - 1.
all_indexes = list(range(term_indexes.shape[0]))

def pass_rule(x):
    """Return True only when `x` satisfies both term filters.

    The character filter runs first and rejects non-strings, so the length
    check is only ever reached with a value the first filter accepted.
    """
    return (
        no_punctuations_or_letters_or_digits_Q(x)
        and length_equal_or_greater_than_2_Q(x)
    )
    
# Row positions whose first-column value passes pass_rule, in ascending order.
selected_indexes = [
    i for i in all_indexes
    if pass_rule(term_indexes.iloc[i, 0])
]

In [85]:
term_indexes.iloc[selected_indexes, :]

Unnamed: 0,term
5,问题
7,分支
8,定界
9,求解
10,思路
...,...
14906,发向
14915,获知
14955,自制
14973,没有用


In [64]:
# Pick the rows at the selected positions.
# NOTE(review): `pure_words` is not defined in any cell visible here, and this
# cell's execution count (64) is lower than that of the cell building
# `selected_indexes` (83). Confirm where `pure_words` comes from; as written
# this cell cannot be reproduced from a fresh kernel.
words = pure_words.iloc[selected_indexes]

In [65]:
words

Unnamed: 0,term


In [100]:
# Persist the selected rows as CSV; index=False leaves the row index out of the file.
words.to_csv(path_or_buf='termdocmatrix/words.csv', index=False)

In [3]:
# Order the rows by their 'count' column, largest first.
# NOTE(review): `double_words` is reassigned from itself and is never created
# in a cell visible here — confirm which cell/file defines it so the notebook
# can be run top to bottom.
double_words = double_words.sort_values(
    by = 'count', 
    ascending=False
)

In [106]:
# Load the full comma-separated matrix from disk.
doc_matrix_full = np.loadtxt('termdocmatrix/matrix.txt', delimiter=',')
# Keep only the columns whose positions are listed in the first column of
# `double_words` (presumably integer term indices — TODO confirm), in that order.
doc_matrix = doc_matrix_full[:, double_words.iloc[:, 0]]

In [108]:
doc_matrix.shape

(73, 6012)

In [347]:
# Inverse document frequency per column: log(number of rows / number of rows
# in which the column is non-zero).
# The document frequency is clamped to at least 1 so that a column which is
# zero in every row gets a finite idf instead of log(n / 0) = inf; with inf the
# later product `doc_matrix * idf` would produce 0 * inf = NaN for that column.
# Columns with a document frequency >= 1 are unaffected by the clamp.
idf = np.log(
    doc_matrix.shape[0] / np.maximum(np.count_nonzero(doc_matrix, axis=0), 1)
)

In [352]:
idf.shape

(6012,)

In [349]:
# Weight every column of the count matrix by that column's idf; the 1-D `idf`
# vector is broadcast across the rows of `doc_matrix`.
tf_idf = np.multiply(doc_matrix, idf)

In [369]:
# Boolean mask over rows: True for rows holding at least one non-zero weight.
# All-zero rows are left out of the normalisation that follows.
nonzeros = (tf_idf != 0).any(axis=1)

In [390]:
# Row sums of the non-empty rows, repeated across every column so the result
# has the same shape as tf_idf[nonzeros, :] and can divide it element-wise.
normalization_factor = np.repeat(
    tf_idf[nonzeros, :].sum(axis=1)[:, np.newaxis],
    doc_matrix.shape[1],
    axis=1
)

In [391]:
# Scale each non-empty row by its row sum, writing the result back into tf_idf.
# This mutates tf_idf in place: running the cell again without recomputing
# tf_idf and normalization_factor divides the already-scaled rows a second time.
tf_idf[nonzeros, :] = np.divide(tf_idf[nonzeros, :], normalization_factor)

In [392]:
tf_idf

array([[0.        , 0.00106671, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.0032015 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [436]:
# Reduced singular value decomposition (full_matrices=False) of the weighted matrix.
u, s, vh = np.linalg.svd(tf_idf, full_matrices=False)
# Displayed only to check the shapes of the three factors.
u.shape, s.shape, vh.shape

((73, 73), (73,), (73, 6012))

In [396]:
# Number of singular values (and matching columns of u / rows of vh) to keep.
truncate_factor = 20
# NOTE: u, s and vh are overwritten with their truncated versions, so this cell
# must be preceded by a fresh run of the SVD cell whenever truncate_factor changes.
u = u[:, 0:truncate_factor]
# Unlike the slices used for u and vh, indexing with range(...) raises an
# IndexError when truncate_factor exceeds the current length of s.
s = s[range(truncate_factor)]
vh = vh[0:truncate_factor, :]
# Displayed only to check the truncated shapes.
u.shape, s.shape, vh.shape

((73, 20), (20,), (20, 6012))

In [397]:
# Sum of squared element-wise differences between tf_idf and its
# reconstruction u · diag(s) · vh from the truncated factors.
np.square(tf_idf - u @ np.diag(s) @ vh).sum()

0.4996914442851216

In [406]:
# Coordinates in the truncated space: one row per row of u (u · diag(s)) and
# one row per column of vh ((diag(s) · vh) transposed).
doc_coords = u @ np.diag(s)
term_coords = (np.diag(s) @ vh).T

In [408]:
# Article lookup table; docs_cluster() reads its second column (position 1).
articles = pd.read_csv('termdocmatrix/index_to_article.csv')

In [399]:
# Term lookup table; words_cluster() reads its second column (position 1).
terms = pd.read_csv('termdocmatrix/index_to_word.csv')

In [419]:
# Number of clusters requested from every clusterer; also read by output_to_file().
n_clusters = 3


def _make_clusterer(linkage):
    """Build an unfitted cosine-distance AgglomerativeClustering for `linkage`.

    scikit-learn renamed the `affinity` argument to `metric` (deprecated in
    1.2, removed in 1.4). The new name is tried first; releases that predate
    it raise TypeError for the unknown keyword, in which case the legacy
    spelling is used instead.
    """
    try:
        return AgglomerativeClustering(
            n_clusters=n_clusters,
            metric='cosine',
            linkage=linkage
        )
    except TypeError:
        return AgglomerativeClustering(
            n_clusters=n_clusters,
            affinity='cosine',
            linkage=linkage
        )


# One clusterer per linkage strategy; they differ only in `linkage`.
clusterer_trainer_average = _make_clusterer('average')
clusterer_trainer_single = _make_clusterer('single')
clusterer_trainer_complete = _make_clusterer('complete')

In [420]:
# Fit each clusterer on the document coordinates. The objects returned by
# fit() are the ones whose `labels_` attribute the helper functions below read.
model_average = clusterer_trainer_average.fit(doc_coords)
model_single = clusterer_trainer_single.fit(doc_coords)
model_complete = clusterer_trainer_complete.fit(doc_coords)

0             4sum-problem-branch-and-bound-solution.md
1               a-layout-algorithm-for-equal-returns.md
2                        add-your-site-cache-control.md
3     association-rules-mining-and-apriori-algorithm.md
5                              aws-budget-bill-alert.md
                            ...                        
68                                symlink-vs-harlink.md
69                       talk-about-my-recent-doings.md
70    telling-static-file-accelerating-and-whole-sit...
71                           traceroute-introduction.md
72                                    vscode-linting.md
Name: article, Length: 71, dtype: object

In [432]:
def words_cluster(n, model):
    """Return, as a NumPy array, the second column of `terms` for cluster `n`.

    `model.labels_ == n` is used as a boolean mask over the first column of
    `double_words`; the values it selects are then used as row positions into
    `terms`.
    """
    in_cluster = model.labels_ == n
    term_positions = double_words.iloc[:, 0][in_cluster]
    matching_terms = terms.iloc[term_positions, :]
    return matching_terms.iloc[:, 1].to_numpy()

def docs_cluster(n, model):
    """Return the second column of `articles` for the rows labelled `n` by `model`."""
    in_cluster = model.labels_ == n
    return articles.iloc[in_cluster, 1]

def output_to_file(filename = None, model = None):
    """Write the members of every cluster of `model` to a text file.

    For each cluster id in ``range(n_clusters)`` the file receives a
    ``n_cluster: <id>`` header, a blank line, one ``<document>; `` line per
    value returned by ``docs_cluster(id, model)``, and two trailing blank lines.

    Parameters
    ----------
    filename : str
        Path of the file to create or overwrite.
    model : fitted clusterer
        Object whose ``labels_`` attribute is read by ``docs_cluster``.
    """
    sections = []
    for i in range(0, n_clusters):
        docs_in_this_cluster = docs_cluster(i, model)
        member_lines = ''.join(doc + '; \n' for doc in docs_in_this_cluster)
        sections.append('n_cluster: ' + str(i) + '\n\n' + member_lines + '\n\n')

    # The context manager guarantees the file is flushed and closed; the
    # previous code referenced `f.close` without calling it.
    with open(filename, 'w') as f:
        f.write(''.join(sections))

In [433]:
output_to_file(
    filename = 'clusters_average.txt',
    model = model_average
)

In [434]:
output_to_file(
    filename = 'clusters_single.txt',
    model = model_single
)

In [435]:
output_to_file(
    filename = 'clusters_complete.txt',
    model = model_complete
)