# Text Clustering
```
Author: hong cui
Dataset:iSamples
```

In [None]:
from platform import python_version
print(python_version()) # 3.9.7 is the value noted by the author, presumably from their last run -- TODO confirm against the fasttext wheel used

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import fasttext # pip install fasttext-0.9.2-cp310-cp310-win_amd64.whl
import matplotlib.pyplot as plt 
import pickle
from joblib import Parallel, delayed
from sklearn.cluster import Birch
import fastcluster
from scipy.cluster.hierarchy import fcluster
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

## Load and preprocess dataset


In [None]:
import random 
# Pick the corpus to cluster: one collection name, "All4" (all four collections,
# every record) or "All4Even" (all four collections, at most `sample` records each).
#ds = "GEOME"
#ds = "OPENCONTEXT"
#ds = "SMITHSONIAN"
#ds = "SESAR"
ds = "All4"
#ds = "All4Even"


#TODO: index lines with collection name
dslist = ["GEOME", "OPENCONTEXT", "SMITHSONIAN", "SESAR"]
sample = 50000  # per-collection cap on the number of records; 0 means "no cap"


def _read_collection(name):
    """Read <name>.txt (one sample record per line) into a frame with 'src' and 'content' columns."""
    with open(name + '.txt', "r", encoding='utf-8') as infile:
        lines = list(infile)  # lines keep their trailing newline, as before
    return pd.DataFrame({'src': [name] * len(lines), 'content': lines})


if ds.startswith("All4"):  # combine the 4 sets
    if ds == "All4":
        sample = 0  # "All4" keeps every record of every collection
    parts = []
    for data in dslist:
        df_content1 = _read_collection(data)
        rowCount = len(df_content1)
        if sample != 0 and rowCount > sample:
            df_content1 = df_content1.sample(sample, random_state=1234)
        parts.append(df_content1)
    # One concat instead of DataFrame.append inside the loop (append was removed
    # in pandas 2.0). ignore_index gives every row a unique 0..n-1 label, so the
    # index agrees with row positions in arrays derived from this frame.
    df_content = pd.concat(parts, ignore_index=True)
else:
    df_content = _read_collection(ds)
    rowCount = len(df_content)
    if rowCount > sample:
        df_content = df_content.sample(sample, random_state=1234)
    # Sampling keeps the original (shuffled) labels; renumber them 0..n-1.
    df_content = df_content.reset_index(drop=True)

print(len(df_content))
print(df_content[0:7])
print(df_content.iloc[0]['content'])

In [None]:
def removeFieldsContainNum(text):
    """Drop '###'-delimited fields that are a single token containing a digit.

    A field is removed when the text between two '###' markers has no space,
    no '#', and at least one digit. Substitution is done one match at a time
    because neighbouring fields share a delimiter, so removing one field can
    expose the next. Afterwards every remaining '###' becomes a single space.
    """
    numeric_field = re.compile(r"###[^ #]*?\d[^ #]*?###")
    while True:
        stripped = numeric_field.sub("###", text, count=1)
        if stripped == text:
            break  # nothing left to remove
        text = stripped

    return text.replace("###", " ")


In [None]:
# Make sure the NLTK English stop word list is available locally.
nltk.download('stopwords')
# A frozenset gives constant-time membership tests; clean() checks every token
# of every record against this collection.
estopwords = frozenset(stopwords.words('english'))
stemmer = SnowballStemmer("english")

In [None]:
#process/clean content

def clean(line):
    """Normalise one raw record into a space-separated string of content tokens.

    Steps, in order:
      1. remove '###'-delimited fields that are a single token containing a
         digit (ids, codes) via removeFieldsContainNum;
      2. lower-case the text;
      3. replace , | ( ) ' " with spaces;
      4. drop tokens that are English stop words, URLs (http:// or https://),
         start with a digit, or are shorter than 3 characters;
      5. replace '/' with a space.

    No stemming is applied.

    NOTE(review): step 5 runs after the token filter, so pieces produced by
    splitting on '/' (e.g. "and/or") are not re-checked against the stop word,
    digit and length rules -- confirm this is intended.
    """
    l = removeFieldsContainNum(line)
    l = l.lower()
    l = re.sub(r"[,|()\'\"]", " ", l)
    kept = [item for item in l.split()
            if item not in estopwords
            and not item.startswith(("https://", "http://"))
            and not re.match(r"\d", item)  # raw string: "\d" is an invalid escape otherwise
            and len(item) > 2]
    l = ' '.join(kept)
    l = re.sub(r"[/]", " ", l)
    return l
                               

# Run clean() over every record's 'content' using 6 joblib workers; plines holds
# the cleaned strings in the same order as the rows of df_content.
%time plines = Parallel(n_jobs=6)(delayed(clean)(line) for line in df_content['content'])

# Show the first three cleaned records as a sanity check.
print(plines[0:3])

In [None]:
#store plines for reuse

#with open("plines."+ds+".pkl", 'wb') as outp:
#    pickle.dump(plines, outp, pickle.HIGHEST_PROTOCOL)


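## Form the dataset: use pre-trained fastText word vectors to obtain a sentence vector for each record in our corpus
Download the pre-trained vectors first and decompress them to `data/cc.en.300.bin` (the path loaded in the next cell):

wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz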



In [None]:
# Load the pre-trained fastText model from data/cc.en.300.bin; %time reports how long the load takes.
%time model = fasttext.load_model('data/cc.en.300.bin') #takes 7GB

In [None]:
# Represent every cleaned record as a fastText sentence vector, then scale each
# row to unit length (normalize(..., axis=1) works row by row).
vlines = normalize([model.get_sentence_vector(pline) for pline in plines], axis=1)
del model #reclaim memory
#print(vlines[0])
#print(len(vlines)) #2381 for GEOME, 59419 for All4



In [None]:
#store vlines for reuse
#with open("vlines."+ds+".pkl", 'wb') as outp:
#    pickle.dump(vlines, outp, pickle.HIGHEST_PROTOCOL) 

## BIRCH clustering: upper level raw clustering 

In [None]:
#read in vlines
#with open("vlines."+ds+".pkl", 'rb') as inp:
#    vlines = pickle.load(inp)


In [None]:
# BIRCH with n_clusters=None: no final global clustering step is requested, so
# the labels refer to the subclusters BIRCH builds. threshold is the knob that
# was tuned in the experiments recorded below.
brc = Birch(branching_factor=50, n_clusters=None, threshold=0.5) 
# Threshold experiments on SESAR (47000 records):
#SESAR th=0.35, 8 clusters, one contains 75% of total records (35365 out of 47000)
#SESAR th=0.3, 212 clusters, one contains 75% of total records (35365 out of 47000)
#SESAR th=0.2, 359 clusters, one contains 75% of total records (35365 out of 47000)
#SESAR th=0.1, 2480 clusters, one contains 75% of total records (35365 out of 47000)
%time brc.fit(vlines) #build the CF tree #4.5M records takes 1 min wall time
#centroids of the raw clusters formed
#print(brc.subcluster_centers_) 
#labels of input data
#print(brc.labels_)
# Print the label assigned to each subcluster.
print(brc.subcluster_labels_)
#print(brc.n_features_in_)

# Assign every record vector to a BIRCH cluster.
labels = brc.predict(vlines)

#from collections import Counter
#Counter(labels)




In [None]:
labels = brc.predict(vlines)
# One row per record, in the same order as vlines/plines. The df_content columns
# are passed as plain arrays so df gets a fresh 0..n-1 index. df_content's own
# index can hold duplicated or shuffled labels after combining or sampling
# collections, and later cells need df's index to match row positions in vlines.
# 'hcluster' starts at -1, meaning "no hierarchical cluster assigned yet".
df = pd.DataFrame({'src': df_content['src'].to_numpy(),
                   'birchcluster': labels,
                   'hcluster': -1,
                   'content': plines,
                   'original': df_content['content'].to_numpy()})

# Number of records per (source collection, BIRCH cluster), printed in full.
tab = df.groupby(['src', 'birchcluster']).size()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(tab.unstack().transpose())

# Persist the labelled frame for reuse.
with open("clusters."+ds+".pkl", 'wb') as outp:
    pickle.dump(df, outp, pickle.HIGHEST_PROTOCOL)

## Clustering of the subclusters of the BIRCH result with parallel programming
https://www.machinelearningplus.com/python/parallel-processing-python/# not working!

https://github.com/yngvem/parallelising-python 

also see https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/

https://github.com/mdimura/sparsehc-dm

https://pypi.org/project/fastcluster/

https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html


In [None]:
from fastcluster import linkage_vector
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram


#birchcluster_df: the input observation data (D=300xN)
#birchcluster_number:  cluster number of birchcluster_df in BIRCH result 
#min_obs: minimum number of observations in birchcluster_df required to run hierarchical clustering, must be >=2

def h_clustering_fast(birchcluster_df, birchcluster_number, link_method, min_obs=20 ):
    """Hierarchically cluster the observations of one BIRCH cluster.

    Parameters
    ----------
    birchcluster_df : sequence of observation vectors belonging to the BIRCH cluster.
    birchcluster_number : BIRCH cluster id; only used to name the output files.
    link_method : linkage method handed to fastcluster.linkage_vector.
        NOTE(review): fastcluster documents only 'single', 'centroid', 'median'
        and 'ward' for linkage_vector -- confirm before passing other methods.
    min_obs : minimum number of observations needed to run the clustering
        (must be >= 2).

    Returns
    -------
    The linkage matrix, or [] when there are fewer than min_obs observations.

    Side effects
    ------------
    Writes 'clusters.<number>.<method>.png' (dendrogram) and
    'linkage_matrix.<number>.<method>.pkl' to the working directory.
    """
    if len(birchcluster_df) < min_obs:
        return []

    linkage_matrix = fastcluster.linkage_vector(birchcluster_df, link_method)
    stem = str(birchcluster_number) + '.' + link_method

    fig, ax = plt.subplots(figsize=(15, 20)) # set size
    try:
        # Draw on the axes created above instead of relying on the pyplot state.
        dendrogram(linkage_matrix, orientation="right", ax=ax)
        # Booleans are required here: the strings 'off' are truthy, so they
        # leave the ticks visible instead of hiding them.
        ax.tick_params(
            axis='x',           # changes apply to the x-axis
            which='both',       # both major and minor ticks are affected
            bottom=False,       # ticks along the bottom edge are off
            top=False,          # ticks along the top edge are off
            labelbottom=False)
        ax.set_ylabel("sample index")
        ax.set_xlabel("distance")
        fig.tight_layout()
        fig.savefig('clusters.' + stem + '.png', dpi=200)
    finally:
        # This function is called once per cluster; without closing, every
        # figure stays alive in memory.
        plt.close(fig)

    with open('linkage_matrix.' + stem + ".pkl", 'wb') as outp:
        pickle.dump(linkage_matrix, outp, pickle.HIGHEST_PROTOCOL)

    return linkage_matrix
    

In [None]:
#birchcluster_df: df holding the birchcluster observation content
#birchcluster_number:  cluster number of birchcluster_df in BIRCH result 
#linkage_matrix: the linkage_matrix for the birchcluster
#max_d: distance threshold to obtain clusters from hierarchical clustering
#top_n: top n terms to return

def collect_terms(df, birchcluster_df, birchcluster_number, linkage_matrix, max_d=0.7, top_n=10):
    """Cut one BIRCH cluster's dendrogram into flat clusters and list their top terms.

    Parameters
    ----------
    df : frame with 'birchcluster', 'hcluster' and 'content' columns. It is
        modified in place: 'hcluster' of the rows in this BIRCH cluster is set
        to the flat cluster ids.
    birchcluster_df : rows of df for this BIRCH cluster (accepted for
        interface compatibility; the rows are re-selected from df).
    birchcluster_number : id of the BIRCH cluster being processed.
    linkage_matrix : linkage matrix of the BIRCH cluster; an empty value means
        the cluster was not hierarchically clustered.
    max_d : distance threshold used to cut the dendrogram.
    top_n : maximum number of terms returned per flat cluster.

    Returns
    -------
    A list with one entry per flat cluster (ordered by cluster id), each a list
    of up to top_n terms ordered by decreasing tf-idf weight. Returns [] when
    linkage_matrix is empty or the tf-idf vectorizer raises ValueError.
    """
    print(birchcluster_number)
    if linkage_matrix is None or len(linkage_matrix) == 0:
        return []

    #obtain clusters from hierarchical clustering
    #criterion: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html
    clusters = fcluster(linkage_matrix, max_d, criterion='distance')
    members = df['birchcluster'] == birchcluster_number
    df.loc[members, 'hcluster'] = clusters
    num_clusters = len(np.unique(clusters))
    print("# of clusters:" + str(num_clusters))

    #one concatenated document per flat cluster
    frame = pd.DataFrame({'description': df.loc[members, 'content'], 'cluster': clusters})
    cframe = frame.groupby('cluster').description.apply(' '.join).reset_index(name='concat_descriptions')

    #compute tfidf matrix for the clusters, one row for a cluster
    #to verify most informative terms are selected, set max_features to a small number like 6 and top n to 3.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200, min_df=0.2, stop_words='english',
                                       use_idf=True)
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(cframe['concat_descriptions'])
    except ValueError: #no term obtained from tfidf when all records hold the same set of terms
        return []

    # get_feature_names() is gone from newer scikit-learn releases; prefer the
    # replacement and fall back for old installations.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        terms = list(tfidf_vectorizer.get_feature_names_out())
    else:
        terms = list(tfidf_vectorizer.get_feature_names())

    # Per row (cluster): term indices sorted by decreasing tf-idf weight.
    ordered_index = tfidf_matrix.toarray().argsort()[:, ::-1]
    # Clamp by the number of terms (columns). Clamping by the number of rows
    # would cut the term list down to the number of clusters.
    top_n = min(ordered_index.shape[1], top_n)
    return [[terms[ind] for ind in row[:top_n]] for row in ordered_index]
    
#allterms = collect_terms(df[df['birchcluster']==4], 4, linkage_matrix=results[4], max_d=0.5, top_n=10)
#for i in range(0, len(allterms)):
#    print(allterms[i])
            
            


In [None]:
#print(len(vlines[df.index[df['cluster']==0]]))

#%time h_clustering(vlines[df.index[df['cluster']==0]], 0, 'ward') 
#35K 3min without plotting and dumping, 9GB RAM
# 5min 51s with plotting and dumping

#%time h_clustering_fast(vlines[df.index[df['cluster']==0]], 0, 'ward') 
#linkage:
#35K 2min 39s without plotting and dumping 9GB RAM
#5min 19s mins with plotting and dumping
#linkage_vector:
#35K  8min 8s without plotting and dumping 0GB RAM
#11min 1s with plotting and dumping 0.6GB RAM

In [None]:
#results = []
#terms = []
#without plot
#n_jobs = 1: 7min 10s
#n_jobs = 4: 6min 50s, all subclusters ends in 1 min, subcluster 17 takes 6min 50s

#with plot
#n_jobs = 4: 10min 13s, all subclusters ends in 1 min, subcluster 17 takes 10min 13s

%time results = Parallel(n_jobs=4, verbose=50)(delayed(h_clustering_fast)(vlines[df.index[df['birchcluster']==c]], c, 'ward', 20)  for c in range(0, len(df.birchcluster.unique())-1))
len(results)
results[0]

#import pickle
#with open("results."+ds+".pkl", 'wb') as outp:
#    pickle.dump(results, outp, pickle.HIGHEST_PROTOCOL)

%time terms = Parallel(n_jobs=1, verbose=50)(delayed(collect_terms)(df, df[df['birchcluster']==c], c, linkage_matrix=results[c], max_d=0.5, top_n=10) for c in range(0, len(df.birchcluster.unique())-1))
df.head()


In [None]:
#verify results
# Spot checks with hard-coded cluster numbers (70 and 56); adjust them to ids
# that exist in the current run.
# Size of each hierarchical cluster inside BIRCH cluster 70. Only the last
# expression of a cell is displayed, so wrap this in print()/display() to see it.
df[df['birchcluster']==70].groupby('hcluster').size()
# Term lists collected for entry 56 of `terms`.
terms[56]