In [90]:
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering

In [91]:
# Load the vocabulary table (presumably an index -> word mapping, going by the
# file name). Later cells read the token text from its second column via .iloc[..., 1].
words = pd.read_csv('termdocmatrix/index_to_word.csv')

In [92]:
# Character class for everything that disqualifies a token from being a "pure word".
# It is compiled once, up front, instead of being rebuilt for every row.
# NOTE(review): several ranges overlap, and '\u0040-\u007F' also covers the ASCII
# letters, so any token containing a Latin letter is flagged as well. That looks
# intentional for a non-Latin corpus -- confirm before simplifying.
non_word_pattern = re.compile(
    '[' +
    '\u2000-\u206F' +  # General Punctuation
    '\u0000-\u002F' +  # control characters, space, ASCII punctuation up to '/'
    '\u003A-\u0040' +  # ':' .. '@'
    '\u005B-\u0060' +  # '[' .. '`'
    '\u007B-\u007F' +  # '{' .. DEL
    '\u0080-\u00FF' +  # Latin-1 Supplement
    '\uFF00-\uFF0F' +  # first fullwidth punctuation marks
    '\u3000-\u303F' +  # CJK Symbols and Punctuation
    '\u0030-\u0039' +  # ASCII digits
    '\u0040-\u007F' +  # '@' .. DEL (includes A-Z and a-z)
    ']'
)

# Row positions (0-based, in file order) of the entries that must be dropped.
puncs_indexes = list()
for i, word in enumerate(words.iloc[:, 1]):
    # Non-string cells (e.g. NaN from an empty CSV field) are dropped outright;
    # strings are dropped as soon as one character falls in the class above.
    if type(word) is not str or non_word_pattern.search(word) is not None:
        puncs_indexes.append(i)

In [93]:
# How many vocabulary rows were flagged for removal by the filter above.
len(puncs_indexes)

5202

In [94]:
# Complement of the flagged rows: every row position of `words` that was NOT
# recorded in puncs_indexes. The set gives constant-time membership tests.
puncs_indexes_set = set(puncs_indexes)
pure_word_indexes = [
    row for row in range(words.shape[0])
    if row not in puncs_indexes_set
]

In [95]:
# pure_word_indexes holds row positions (not index labels), hence .iloc.
pure_words = words.iloc[pure_word_indexes]

In [96]:
# Positions (within pure_words) of the tokens that are at least two characters long.
double_word_indexes = [
    position
    for position, token in enumerate(pure_words.iloc[:, 1])
    if len(token) >= 2
]

In [99]:
# double_word_indexes are positions within pure_words, so .iloc is the matching accessor.
# NOTE(review): this rebinds `words`; re-running the earlier filtering cells after
# this one would operate on the already-filtered table, not on the raw CSV.
words = pure_words.iloc[double_word_indexes]

In [100]:
# Persist the filtered vocabulary; index=False keeps the DataFrame index out of the file.
words.to_csv(path_or_buf='termdocmatrix/words.csv', index=False)

In [3]:
# Order the table by its 'count' column, largest value first.
# NOTE(review): `double_words` is never assigned in the cells shown here, and this
# cell's execution count is out of sequence, so it relies on state from another
# cell or session. On a fresh kernel this raises NameError -- add the cell that
# builds/loads `double_words` (it needs a 'count' column) before this one.
double_words = double_words.sort_values(
    by = 'count', 
    ascending=False
)

In [106]:
# Read the comma-separated count matrix, then keep only the columns whose
# numbers are listed in the first column of double_words (in that table's order).
doc_matrix_full = np.loadtxt('termdocmatrix/matrix.txt', delimiter=',')
kept_term_columns = double_words.iloc[:, 0]
doc_matrix = doc_matrix_full[:, kept_term_columns]

In [108]:
# (rows, columns) after column selection; there is one column per row of double_words.
doc_matrix.shape

(73, 6012)

In [109]:
# Column totals (axis=0 sums down the rows): one value per retained term column.
# NOTE(review): dividing by this normalises each term by its corpus-wide total rather
# than by the length of each row (presumably a document) -- confirm that this is the
# intended term-frequency definition.
normalization_factor = np.sum(doc_matrix, axis = 0)

In [117]:
# Per column: log(number of rows / number of rows where the column is non-zero).
# NOTE(review): a column that is zero in every row makes the denominator 0
# (idf becomes inf, with a runtime warning); nothing here guards against that.
idf = np.log(doc_matrix.shape[0]/np.count_nonzero(doc_matrix, axis = 0))

In [118]:
# idf and normalization_factor are per-column vectors; broadcasting applies them to every row.
tf_idf = doc_matrix * idf / normalization_factor

In [306]:
# Number of leading singular triplets to keep for the low-rank approximation.
truncate_factor = 20

# Thin SVD (full_matrices=False): tf_idf == u @ diag(s) @ vh.
u, s, vh = np.linalg.svd(tf_idf, full_matrices=False)

# numpy returns the singular values sorted in descending order, so the first
# `truncate_factor` entries are the dominant components. Plain slices replace
# the former `s[range(...)]` fancy index.
u = u[:, :truncate_factor]
s = s[:truncate_factor]
vh = vh[:truncate_factor, :]

# Displayed as the cell output: shapes of the truncated factors.
u.shape, s.shape, vh.shape

((73, 20), (20,), (20, 6012))

In [307]:
# Sum of squared element-wise differences between tf_idf and its truncated
# reconstruction u @ diag(s) @ vh (i.e. the squared Frobenius norm of the residual).
np.sum((tf_idf - np.matmul(np.matmul(u, np.diag(s)), vh))**2)

25612.44969053936

In [308]:
# Transpose of diag(s) @ vh: one row per column of tf_idf, one column per retained
# singular component, with each component scaled by its singular value.
term_coords = np.matmul(np.diag(s), vh).T

In [309]:
# Re-read the complete vocabulary table; `words` was rebound to the filtered subset
# earlier, so the unfiltered table is loaded again under a separate name.
terms = pd.read_csv('termdocmatrix/index_to_word.csv')

In [314]:
# Number of flat clusters each hierarchical model is cut into.
n_clusters = 1000

# scikit-learn 1.2 renamed the `affinity` argument to `metric`, and 1.4 removed
# `affinity` altogether. Pick whichever name the installed version understands so
# the notebook runs on both old and current releases.
_distance_kwarg = (
    'metric'
    if 'metric' in AgglomerativeClustering().get_params()
    else 'affinity'
)


def _make_clusterer(linkage):
    """Return an unfitted cosine-distance AgglomerativeClustering using ``linkage``."""
    return AgglomerativeClustering(
        n_clusters = n_clusters,
        linkage = linkage,
        **{_distance_kwarg: 'cosine'}
    )


# One estimator per linkage criterion; everything else is identical.
clusterer_trainer_average = _make_clusterer('average')
clusterer_trainer_single = _make_clusterer('single')
clusterer_trainer_complete = _make_clusterer('complete')

In [316]:
# Fit the three estimators on the same term coordinates, in the order
# average -> single -> complete. `.fit` returns the estimator itself, so each
# model_* name refers to the corresponding (now fitted) clusterer_trainer_* object.
model_average, model_single, model_complete = [
    trainer.fit(term_coords)
    for trainer in (
        clusterer_trainer_average,
        clusterer_trainer_single,
        clusterer_trainer_complete,
    )
]

In [318]:
def words_cluster(n, model):
    """Return the words assigned to cluster ``n`` by a fitted ``model``.

    Reads the module-level ``double_words`` and ``terms`` tables: the rows of
    ``double_words`` whose label equals ``n`` supply (via their first column)
    the row positions to look up in ``terms``; the second column of those
    ``terms`` rows is returned as a NumPy array.
    """
    in_cluster = model.labels_ == n
    term_positions = double_words.iloc[:, 0][in_cluster]
    cluster_rows = terms.iloc[term_positions, :]
    return cluster_rows.iloc[:, 1].to_numpy()

def output_to_file(filename = None, model = None):
    """Write the words of every cluster of ``model`` to a text file.

    For each cluster number ``0 .. n_clusters - 1`` (``n_clusters`` is the
    module-level setting) the file receives a ``n_cluster: <i>`` header, a blank
    line, the cluster's words each followed by ``', '``, and another blank line.
    The words come from the module-level ``words_cluster(i, model)``.

    Parameters
    ----------
    filename : str or path-like
        Destination file; it is created or overwritten.
    model : fitted clustering estimator
        Passed through unchanged to ``words_cluster``.

    Raises
    ------
    TypeError
        If ``filename`` is left as ``None`` (``open`` rejects it).
    """
    # Collect the pieces and join once; repeated `str + str` is quadratic.
    pieces = []
    for i in range(0, n_clusters):
        pieces.append('n_cluster: ' + str(i) + '\n\n')
        pieces.extend(word + ', ' for word in words_cluster(i, model))
        pieces.append('\n\n')

    # The original ended with `f.close` (no call), so the handle was never
    # closed explicitly. The context manager closes it even if write() fails.
    # NOTE(review): the platform default encoding is used, as before.
    with open(filename, 'w') as f:
        f.write(''.join(pieces))

In [319]:
# Dump the clusters found with average linkage.
output_to_file(
    filename = 'clusters_average.txt',
    model = model_average
)

In [320]:
# Dump the clusters found with single linkage.
output_to_file(
    filename = 'clusters_single.txt',
    model = model_single
)

In [None]:
# Dump the clusters found with complete linkage.
output_to_file(
    filename = 'clusters_complete.txt',
    model = model_complete
)