# Text Clustering
```
Author: hong cui
Dataset:isamples
Reference: 
(1) https://doi.org/10.30534/ijeter/2020/04832020
(2) http://christopher5106.github.io/deep/learning/2020/04/02/fasttext_pretrained_embeddings_subword_word_representations.html
```

In [None]:
from platform import python_version

# Show which Python interpreter is executing this notebook.
print(python_version()) #3.9.7

In [None]:
import pandas as pd
import numpy as np

import re

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

import fasttext # may need pip install fasttext-0.9.2-cp310-cp310-win_amd64.whl before import fasttext

from sklearn.cluster import KMeans 
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D

## Load and preprocess dataset


In [None]:
import random

# Pick the collection to cluster; "All4" pools a sample from every entry in dslist.
#ds = "GEOME"
#ds = "OPENCONTEXT"
#ds = "SMITHSONIAN"
ds = "SESAR"
#ds = "All4"
dslist = ["GEOME", "OPENCONTEXT", "SMITHSONIAN", "SESAR"]

sample = 1000  # maximum number of records kept per collection


def _read_records(name):
    """Return every line of ``<name>.txt``; one sample record is one line (newline kept)."""
    # The handle is deliberately not called ``input``: rebinding that name at
    # notebook level would hide the builtin ``input()`` from all later cells.
    with open(name + '.txt', "r", encoding='utf-8') as handle:
        return handle.readlines()


def _downsample(records, limit, seed=1234):
    """Return ``records`` as-is when it has at most ``limit`` items, else a seeded random sample of ``limit`` items."""
    if len(records) <= limit:
        return records
    # Re-seeding before every draw keeps each collection's sample reproducible
    # regardless of how many collections were sampled before it.
    random.seed(seed)
    return random.sample(records, limit)


olines = []  # original lines, one sample record is one line
for name in (dslist if ds == "All4" else [ds]):
    records = _read_records(name)
    rowCount = len(records)  # size of the most recently read collection, before sampling
    olines.extend(_downsample(records, sample))

print(len(olines))
if olines:  # an empty input file would otherwise raise IndexError here
    print(olines[0])

In [None]:
# A '###'-delimited field made of a single token (no space, no '#') that contains a digit.
# The closing '###' is only looked ahead at, never consumed, so it can also act as the
# opening delimiter of the next field within the same left-to-right pass.
_NUMERIC_FIELD_RE = re.compile(r"###[^ #]*?\d[^ #]*?(?=###)")


def removeFieldsContainNum(text):
    """Drop single-token fields that contain a digit, then turn field separators into spaces.

    Fields are delimited by ``###``. A field is removed when it sits between two
    ``###`` delimiters, contains no space and no ``#``, and has at least one digit
    (typically identifiers and codes). Every remaining ``###`` is replaced by one space.

    Args:
        text: one record, with fields separated by ``###``.

    Returns:
        The record with the matching fields removed and delimiters replaced by spaces.
    """
    # Removing "###<field>" and keeping the trailing delimiter gives the same result as
    # repeatedly replacing the first "###<field>###" with "###", but in a single pass
    # and without the positional ``count`` argument that Python 3.13 deprecates.
    text = _NUMERIC_FIELD_RE.sub("", text)
    return text.replace("###", " ")




In [None]:
#l = "I have to go to https://doi.xyz"
#' '.join([item for item in l.split() if item not in stopwords and not item.startswith("https://") and not item.startswith("http://") ]) #no stemming

In [None]:
# Fetch the NLTK 'stopwords' resource, which stopwords.words() reads on the next line.
nltk.download('stopwords')
estopwords = stopwords.words('english')  # English stop words; used to filter tokens in the preprocessing cell
stemmer = SnowballStemmer("english")  # only referenced by the commented-out stemming variant in the preprocessing cell

In [None]:
# Build plines: one cleaned, lower-cased, space-separated string per record in olines.
_stopword_set = set(estopwords)         # constant-time membership instead of scanning the list per token
_url_prefixes = ("https://", "http://")
_starts_with_digit = re.compile(r"\d")  # raw string; "\d" in a plain literal is an invalid escape sequence

plines = list() #processed lines
for raw in olines:
    text = removeFieldsContainNum(raw) #remove fields that consist of one token containing a number (ids, codes)
    text = text.lower()
    text = re.sub(r"[,|()\'\"]", " ", text)
    # Keep a token only if it is not a stop word, not a URL, does not start with a
    # digit and is longer than two characters. No stemming is applied; to stem,
    # wrap the kept token in stemmer.stem(...).
    text = ' '.join([item for item in text.split()
                     if item not in _stopword_set
                     and not item.startswith(_url_prefixes)
                     and not _starts_with_digit.match(item)
                     and len(item) > 2])
    # Slashes are split only after the token filter above, so the pieces of an
    # "a/b" token are not filtered individually.
    text = re.sub(r"[/]", " ", text)
    plines.append(text)

print(plines[0:3])

In [None]:
# Persist the preprocessed records (plines) to disk for reuse.

import pickle

plines_path = "plines."+ds+".pkl"
with open(plines_path, 'wb') as pkl_file:
    pickle.dump(plines, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)


## Use pre-trained word vectors to obtain sentence vector for our corpus 
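Download the pre-trained model with `wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz`, then decompress it (e.g. `gunzip cc.en.300.bin.gz`) to obtain the `cc.en.300.bin` file loaded below.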



In [None]:
# Load the pre-trained fastText model from 'cc.en.300.bin' in the working directory.
model = fasttext.load_model('cc.en.300.bin') #.bin is a big file, use wget above to get it

In [None]:
from sklearn.preprocessing import normalize

# Represent every preprocessed record as a fastText sentence vector.
sentence_vectors = [model.get_sentence_vector(record) for record in plines]

# Normalize each record's vector (row-wise) to a unit vector.
vlines = normalize(sentence_vectors, axis=1)

print(vlines[0])


In [None]:
# Persist the normalized record vectors (vlines) to disk for reuse.

vlines_path = "vlines."+ds+".pkl"
with open(vlines_path, 'wb') as pkl_file:
    pickle.dump(vlines, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

# Test for clustering tendency

In [None]:
# Number of record vectors. This is not the last expression of the cell, so a notebook
# will not display it; the figures in the trailing comment are from earlier runs.
len(vlines) #2381 for GEOME, 59419 for All4
df = pd.DataFrame(vlines)  # one row per record vector

# Alternative left disabled: a full square matrix computed as 1 - cosine similarity.
#from sklearn.metrics.pairwise import cosine_similarity
#dist = 1 - cosine_similarity(df) 
from scipy.spatial.distance import pdist
# Pairwise cosine distances between the rows of df; reused by the hierarchical
# clustering and cophenetic-correlation cells further down.
dist = pdist(df, metric="cosine")

#type(dist)
#dist

In [None]:
from pyclustertend import hopkins

# Hopkins statistic computed on a random subset of rows, because hopkins is slow.
# Reading the value: close to 0.5 = no cluster structure, close to 0 = good clustering tendency.
# The subset size is capped at the number of rows in df: pyclustertend rejects a
# sampling size larger than the data, which would happen for collections with
# fewer than 1000 records.
# Values recorded in earlier runs:
#   3000 samples out of All4 = 0.05
#   1500 samples out of GEOME = 0.12
#   3000 OPENCONTEXT = 0.09
#   1000 SMITHSONIAN (value not recorded)
#   1000 SESAR = 0.05
hopkins(df, min(1000, len(df)))

In [None]:
from pyclustertend import vat
# Visual assessment of cluster tendency on the record vectors.
# NOTE(review): in recent pyclustertend releases vat() appears to draw the plot and return
# None unless return_odm=True is passed — confirm vatview actually holds data, since it is
# pickled in the following cell.
vatview = vat(df) #take data matrix, not dist matrix
vatview


In [None]:
# Persist whatever vat() returned so it can be reloaded later.
vat_path = "vat."+ds+".pkl"
with open(vat_path, 'wb') as pkl_file:
    pickle.dump(vatview, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

## Hierarchical clustering
also see https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/

In [None]:
from scipy.cluster.hierarchy import linkage, ward, dendrogram #consider https://cupy.dev/ to use gpu

# Build the cluster hierarchy from the precomputed pairwise distances in dist and draw it as a dendrogram.
#linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean distances, while
# dist is computed with metric="cosine" — confirm this combination is intended.
linkage_matrix = linkage(dist, 'ward') #single, complete, average, weighted, median, centroid, ward
fig, ax = plt.subplots(figsize=(15, 20)) # set size
# ax is rebound on the next line: afterwards it holds dendrogram's return value, not the Axes created above.
ax = dendrogram(linkage_matrix, orientation="right", labels=df.index);

# NOTE(review): the values below are the strings 'off'; recent matplotlib versions expect booleans
# and may treat a non-empty string as "on", leaving the x-axis ticks visible — confirm the intended look.
plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off')

plt.ylabel("sample index")
plt.xlabel("distance")
plt.tight_layout() #show plot with tight layout

# Save the figure as ward_clusters.<ds>.png (comment this line out to skip saving).
plt.savefig('ward_clusters.'+ds+'.png', dpi=200) #save figure as ward_clusters

In [None]:
# Distinct values found in the first column of the linkage matrix
# (presumably the index of one of the two clusters merged at each step — see SciPy's linkage format).
np.unique(linkage_matrix[:, 0])


In [None]:
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# Compare the hierarchy in linkage_matrix against the original pairwise distances in dist.
# cophenet returns the correlation coefficient (c) and the cophenetic distances (coph_dists).
c, coph_dists = cophenet(linkage_matrix,dist)
c  # displayed as the cell output

## Obtain clusters and their informative terms 

In [None]:
# Cut the hierarchy into flat clusters at a fixed distance threshold.

from scipy.cluster.hierarchy import fcluster
max_d = 0.5 #1.4  (threshold passed to fcluster; 1.4 is presumably an earlier setting)
clusters = fcluster(linkage_matrix, max_d, criterion='distance')  # one cluster label per record
num_clusters = len(np.unique(clusters))
num_clusters #count of clusters

In [None]:
# Pair every preprocessed record with its cluster label in one DataFrame.

records = {'description':plines, 'cluster':clusters}
frame = pd.DataFrame(records, columns=['description', 'cluster'])

In [None]:
# Number of records assigned to each cluster label.
frame['cluster'].value_counts()

### Terms selected using TFIDF 

In [None]:
#df = pd.DataFrame({"Id":["a", 'b', 'a','b','b'], 'seq':['X', 'Z','P','C','G']})
#df = df.groupby('Id').seq.apply(' '.join).reset_index(name='new_seq')
#df


In [None]:
#cframe = frame.groupby('cluster').description.apply(' '.join).reset_index(name='concat_descriptions')
#cframe.loc[cframe['cluster']==1, 'concat_descriptions'].values[0]
#len(cframe)
#cframe.columns

In [None]:
#create document-by-term matrix with tf*idf scores for the clusters

from sklearn.feature_extraction.text import TfidfVectorizer

# Merge descriptions by cluster: one row per cluster label, holding all of that
# cluster's descriptions joined by spaces.
cframe = frame.groupby('cluster').description.apply(' '.join).reset_index(name='concat_descriptions')

#compute tfidf matrix for the clusters, one row for a cluster
#to verify most informative terms are selected, set max_features to a small number like 6 and top n to 3.
tfidf_vectorizer = TfidfVectorizer(max_df = 0.8, max_features=2000, min_df=0.2, stop_words='english',
                                  use_idf=True)

tfidf_matrix = tfidf_vectorizer.fit_transform(cframe['concat_descriptions'])

print(tfidf_matrix.shape)

# Vocabulary terms, indexed like the columns of tfidf_matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only when the replacement method does not exist.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()

#print(terms)

#print (tfidf_matrix.toarray())


In [None]:
# For every cluster, list the n terms with the highest tf*idf score.
n = 10

print("Top "+str(n)+" terms per cluster:")
print()

# Per row, column indices sorted by descending tf*idf score.
ordered = tfidf_matrix.toarray().argsort()[:, ::-1]

for row in range(num_clusters):
    best_columns = ordered[row, :n]
    # Rows are reported with 1-based numbering.
    print("Cluster %d words:" % (row+1), end='')
    for column in best_columns:
        print(' %s' % terms[column], end=",")
    print()
    