## Cluster abstracts with doc2vec
Tested under Python 3.7.<br>
First run <i>convert all pdfs to dfs</i>.
<br>

### Directories and settings

In [None]:
# Pickled dataframe used to train the doc2vec model: either a single set of
# abstracts or several sets combined into one dataframe.
train_df_path = r''

# Pickled dataframe that should be clustered, or a directory when every
# dataframe in that directory should be clustered.
cluster_df_path = r''

# Directories that are searched for '*.df' files when the NLP clusters are
# compared with the human-made clusters.
human_clusters_path = r''
nlp_clusters_path = r''

# Number of clusters requested from k-means.
n_clusters = 20

# vector_size is the dimensionality of the document vectors. It must be
# smaller than the number of training documents, or else all vectors can be
# linearly independent. Usually vector_size is tens to hundreds while the
# number of documents is millions.
# vector_size = n_clusters  # probably a good setting for a very small training set (100 abstracts)
vector_size = 10

# How many times the title is repeated relative to the rest of the text (integer).
title_weight = 3

# Extra words that are filtered out in addition to the NLTK English stopwords.
custom_filter = [
    'could', 'might', 'many', 'also',
    'scan', 'abstracts', 'use', 'people',
    'new', 'researchers', 'would', 'may',
    'one', 'users', 'article', 'using',
]

### Imports

In [None]:
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import glob
import os
import nltk
import pandas as pd
from collections import Counter
import time
import pickle
import matplotlib.pyplot as plt

### Functions

In [None]:
def nint(i):
    """
    nearest integer
    input: number
    output: int, obtained by passing the result of the built-in round() to int()
    """
    rounded = round(i)
    return int(rounded)

def timestamp():
    """
    output: current local time as a string of the form '_YYYYMMDD_HHMMSS',
    meant to be appended to file names
    """
    from time import strftime
    return strftime('_%Y%m%d_%H%M%S')

def print_time(time0 = 0):
    """
    print the time that has passed since time0, in minutes
    input: time0, a start time as returned by time.time()
    output: None
    """
    from time import time as now
    elapsed_minutes = (now() - time0) / 60.
    print("Time elapsed: ", elapsed_minutes, " min")
    
def remove_stopwords(words):
    """
    Drop English stopwords and the custom filter words from a list of tokens.

    input: list of tokens (strings)
    output: list without the tokens whose lower-cased form is an NLTK English
            stopword or an entry of custom_filter; the tokens that are kept
            retain their original case and order (they are NOT lower-cased)
    """
    # Build the lookup once per call: the stopword list used to be
    # re-evaluated for every single token, and set membership is O(1).
    blocked = set(stopwords.words('english'))
    blocked.update(custom_filter)
    return [word for word in words if word.lower() not in blocked]

def assure_index(df):
    """
    make 'abstract_id' available both as index and as regular column
    input: df that has 'abstract_id' as a column or as its named index
    output: the same df object (modified in place), indexed by 'abstract_id'
            and with a column 'abstract_id' that mirrors the index
    """
    key = 'abstract_id'
    already_indexed = df.index.name == key
    if not already_indexed:
        # set_index moves the column into the index ...
        df.set_index(key, inplace = True)
    # ... so it is copied back to keep it accessible as a column as well
    df[key] = df.index
    return df

def make_text_col(df, title_weight = 3):
    """
    build one text per abstract from its title, overview, summary and implications
    input: df with columns abstract_title, overview, summary, implications;
           title_weight = how many times the title is repeated (rounded with nint)
    output: the same df with additional (or overwritten) column 'text'
    """
    # title followed by a blank, repeated title_weight times
    text = (df.abstract_title + ' ') * nint(title_weight)
    # the remaining sections follow, each preceded by a blank
    for section in ('overview', 'summary', 'implications'):
        text = text + ' ' + df[section]
    df['text'] = text
    return df

def tag_data(df_text, rm_stopwords=True):
    """
    tokenize each text and wrap it into a TaggedDocument for gensim's doc2vec
    input: column of df containing text; rm_stopwords = True to pass the tokens through remove_stopwords
    output: list of TaggedDocuments, one per text, each holding the list of words
            and a single tag, which is the position of the text in df_text as a string
    """
    # evaluated once; same equality test as before, so only values equal to True enable the filter
    strip_stopwords = rm_stopwords == True
    documents = []
    for position, doc in enumerate(df_text):
        tokens = word_tokenize(doc)
        if strip_stopwords:
            tokens = remove_stopwords(tokens)
        documents.append(TaggedDocument(words=tokens, tags=[str(position)]))
    return documents

def make_index(a, prefix):
    """
    turn numbers into id strings
    input: a = array of numbers, prefix = string put in front of each number
    output: object array shaped like a, each entry is prefix + number zero-padded to at least 3 characters
    """
    labels = np.zeros_like(a, dtype=object)
    for pos in range(len(a)):
        padded = str(a[pos]).zfill(3)
        labels[pos] = prefix + padded
    return labels

def id_to_int(id):
    """
    input: id string whose last three characters are digits
    output: those last three characters as int
    """
    last_three = id[-3:]
    return int(last_three)


### Read dataframes
Read training data and abstracts to be clustered.

In [None]:
# Load the training abstracts and make sure they are indexed by abstract_id.
df = assure_index(pd.read_pickle(train_df_path))
print('train df head:')
display(df.head())
print(len(df))

In [None]:
# Load the abstracts that are to be clustered, indexed by abstract_id.
df_cluster = assure_index(pd.read_pickle(cluster_df_path))
# Most frequent 14-character beginning among the abstract ids.
id_prefixes = df_cluster.abstract_id.apply(lambda x: x[:14])
prefix = np.array(Counter(id_prefixes).most_common())[0][0]
# Combine title (repeated title_weight times) and the other sections into column 'text'.
df_cluster = make_text_col(df_cluster, title_weight = title_weight)
print('cluster df head:')
display(df_cluster.head())
print(len(df_cluster))
print('prefix: ', prefix)

### Create column that holds all text to be transformed to vector
Title has (potentially) higher weight, see above.

In [None]:
# Add column 'text' to the training dataframe; the title is repeated title_weight times.
df = make_text_col(df, title_weight = title_weight)

### Tag training data
To make format compatible with gensim implementation of doc2vec.
Inspired by https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5

In [None]:
time0 = time.time()
# Tag the training texts; stopwords are removed (default of tag_data).
# The original author measured about 5 min for all training data.
tagged_data = tag_data(df.text)
print_time(time0)

# NOTE(review): save_tagged_data is defined in the next cell of this file. On a fresh kernel
# that cell has to be executed before this line, otherwise this call raises a NameError.
save_tagged_data(tagged_data, 'onetagperabstract_title'+str(int(title_weight))+'_nostopwords')

### Load tagged data (or store tagged data)
To avoid tagging again.

In [None]:
# print(len(tagged_data))

def save_tagged_data(tagged_data, name=''):
    """
    Pickle tagged data into the directory of the training dataframe.

    input: tagged_data = object to pickle (e.g. the list returned by tag_data),
           name = optional label that becomes part of the file name
    output: None; writes 'tagged_data_<name><timestamp>.data' into the
            directory of train_df_path and prints the path
    """
    savepath = os.path.join(os.path.dirname(train_df_path),'tagged_data_'+name+timestamp()+'.data')
    # The with-block closes the file even if pickling fails part-way.
    with open(savepath, 'wb') as f:
        pickle.dump(tagged_data, f)
    # Report only after the write has actually succeeded.
    print('Tagged data saved as ', savepath)

def read_tagged_data(name=''):
    """
    Load pickled tagged data from the directory of the training dataframe.

    input: name = part of the file name to look for; if empty, every file
           matching 'tagged_data_*.data' is a candidate
    output: the unpickled object from the alphabetically last matching file
            (files written by save_tagged_data end in a timestamp, so among
            files with the same label the most recent one sorts last)
    raises: IndexError if no file matches
    """
    directory = os.path.dirname(train_df_path)
    if name == '':
        pattern = os.path.join(directory, 'tagged_data_*.data')
    else:
        pattern = os.path.join(directory, '*'+name+'*.data')
    # glob returns matches in no particular order; sort so that "the last one"
    # is the same file on every run and on every file system.
    matches = sorted(glob.glob(pattern))
    if not matches:
        # Same exception type as the former glob(...)[-1], but with a usable message.
        raise IndexError('No tagged data file matches ' + pattern)
    name = matches[-1]
    print('Reading tagged data from ', name)
    # NOTE: unpickling can execute arbitrary code; only load files created by this notebook.
    with open(name, 'rb') as f:
        return pickle.load(f)

# Placeholder: replace 'yourfilenamehere' with (part of) the name of a saved tagged-data file.
# This rebinds tagged_data, replacing any value computed in an earlier cell.
tagged_data = read_tagged_data('yourfilenamehere')

### Train and save model

In [None]:
# Train a doc2vec model (dm=0) on tagged_data and save it next to the training dataframe.
max_epochs = 50
alpha = 0.025

time0 = time.time()
model = Doc2Vec(vector_size=vector_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =0)  
model.build_vocab(tagged_data)

# The attribute `iter` was renamed to `epochs` and no longer exists in gensim 4;
# fall back to `iter` only for old gensim versions that do not have `epochs` yet.
epochs_per_call = model.epochs if hasattr(model, 'epochs') else model.iter

# NOTE(review): every pass of this loop is a complete train() call of epochs_per_call epochs,
# i.e. the corpus is processed max_epochs * epochs_per_call times in total.
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=epochs_per_call)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

savepath = os.path.join(os.path.dirname(train_df_path),'d2v'+timestamp()+'.model')
model.save(savepath)
print("Model saved as ", savepath)
print_time(time0)

### Load trained model

In [None]:
# Keep the path in its own variable: `model` is only rebound once loading succeeded,
# so a failed load no longer leaves a plain string in `model` for the cells below.
model_path = r'your .model path here'
print("Loading model ", model_path)
model = Doc2Vec.load(model_path)

### Add fake entry to see if it is assigned to same cluster (for quality control only)

In [None]:
def add_fake_entry(df):
    """
    Append a fake row that is a near-copy of the last row of df.

    This is to check if the fake row and its original end up in the same cluster.

    input: df with columns 'abstract_id' and 'text' and at least one row
    output: new dataframe (df itself is left untouched) with a fresh integer
            index and one additional last row in which
            - 'SC-' in abstract_id is replaced by 'fak'
            - the first 12 characters of text are replaced by 'fakefakefake'
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # iloc[[-1]] (list indexer) keeps the last row as a one-row dataframe.
    df = pd.concat([df, df.iloc[[-1]]], ignore_index=True)
    id_col = df.columns.get_loc('abstract_id')
    text_col = df.columns.get_loc('text')
    df.iloc[-1, id_col] = df.iloc[-1, id_col].replace('SC-', 'fak')
    df.iloc[-1, text_col] = 'fakefakefake' + df.iloc[-1, text_col][12:]
    return df

# Append the fake row and continue with the extended dataframe.
df_fake = add_fake_entry(df_cluster)
df_fake.tail()  # result is discarded here; presumably a leftover, since only the last expression of a cell is displayed
df_cluster = df_fake
# Re-tag, so that tagged_data contains the fake row as well.
tagged_data = tag_data(df_cluster.text)

In [None]:
# Show the last rows of the dataframe that is going to be clustered.
df_cluster.tail()

### Apply model to data to be clustered (assign vector to each abstract), cluster vectors
See below to apply this to a bunch of abstract sets at once.

In [None]:
from scipy.spatial.distance import cdist
from nltk.cluster import KMeansClusterer

# Tag data to be clustered
# Use the configured title_weight (the default of make_text_col was used here before),
# consistent with how the 'text' column is built for the training data.
# NOTE(review): this rebuilds column 'text' from title/overview/summary/implications,
# so a 'text' value that was edited by add_fake_entry is overwritten at this point.
df_cluster = make_text_col(df_cluster, title_weight = title_weight)
tagged_data = tag_data(df_cluster.text)

# Assign vector to each abstract: row i of X is the inferred vector of abstract i
X = np.zeros((len(df_cluster), vector_size))
title_col = df_cluster.columns.get_loc('abstract_title')
for i, tagged in enumerate(tagged_data):
    print('Inference for ', df_cluster.iloc[i, title_col])
    X[i,:] = model.infer_vector(tagged.words)


# k-means on the document vectors, 25 repeats
# alternative distance: nltk.cluster.util.cosine_distance
kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.euclidean_distance, repeats=25)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
print(len(assigned_clusters))
print(assigned_clusters)
# one cluster number per abstract, in row order
df_cluster['assigned_cluster'] = assigned_clusters
# print(assigned_clusters)

### Print clusters to screen and to txt file

In [None]:
%%capture cap --no-stderr
# NOTE(review): IPython appears to bind `cap` only after this cell has finished running, so code
# inside the cell that reads `cap` sees the previous run (or fails on the very first run). This
# would explain why the cell sometimes had to be run twice -- confirm against the IPython docs.

min_size = 3 # minimum number of abstract in a cluster to be printed to file


def get_most_frequent_from_df(dfp, n=5):
    """
    Find the most common words in all fields of a dataframe.

    This is used to generate a title for each cluster.

    input: a dataframe and number n of most frequent words desired
    output:
    np array with up to n rows, column zero has the frequent words, column 1 has
    the number of word occurences, row 0 has the most frequent word
    (empty array if the dataframe contains no words)

    The dataframe itself is not modified. Words are lower-cased, split at
    whitespace and passed through remove_stopwords before counting.
    """
    from collections import Counter
    # Join every non-missing cell with a blank. The previous version tokenised
    # the printed form of a numpy array, which attached brackets and quotes to
    # words, glued neighbouring columns together without a blank and was
    # abbreviated with '...' by numpy for large inputs.
    pieces = [str(cell) for cell in dfp.values.ravel() if not pd.isnull(cell)]
    words = remove_stopwords(' '.join(pieces).lower().split())
    return np.array(Counter(words).most_common(n))

import contextlib
import io

# most frequent words of every printed cluster, in cluster order
nlp_clusters = []

# positions of the text columns that are used to name a cluster
colidx = [df_cluster.columns.get_loc(col)
          for col in ['abstract_title', 'overview', 'summary', 'implications']]

# Collect the listing in an own buffer. The enclosing %%capture only provides `cap`
# after the cell has finished, so `cap` cannot be used for writing the file from
# inside this cell.
report = io.StringIO()
with contextlib.redirect_stdout(report):
    print(n_clusters, ' Clusters:')    
    for i in range(n_clusters):
        idx = np.where(df_cluster.assigned_cluster.values == i)[0]
        # The size of a cluster is the NUMBER of its members; np.sum(idx), as used
        # before, added up the row positions instead. Empty clusters are skipped too.
        if len(idx) < max(min_size, 1):
            continue
        titles = df_cluster.iloc[idx, df_cluster.columns.get_loc('abstract_title')].values
        indices = df_cluster.index.values[idx]    
        authors = df_cluster.submitted_by.values[idx]
        most_frequent = get_most_frequent_from_df(df_cluster.iloc[idx, colidx])[:,0]
        print('Cluster number ', str(i), ': ', most_frequent)
        for index, author, title in zip(indices, authors, titles):
            print(index, ": (", author, ")", title)
        nlp_clusters.append(most_frequent)
        # share of the abstracts that come from the most active submitter
        top_author_count = Counter(authors).most_common(1)[0][1]
        if top_author_count / len(authors) >= .5:
            print('More than or equal to half of the abstracts in this cluster were submitted by the same person.')
        print('____________________________________________________________________________')

txt = cluster_df_path.replace('.df', '_NLP_clusters_1d_'+timestamp()+'.txt')
with open(txt, 'w', encoding='utf-8') as f:
    f.write(report.getvalue())
# Echo the listing on the regular stdout, so that it still ends up in `cap`.
print(report.getvalue(), end='')

In [None]:
# Report where the cluster listing was written (txt is set in the previous cell).
print('saved to ', txt)

### Apply model to a series of abstract sets, save as dataframes

In [None]:
from scipy.spatial.distance import cdist
from nltk.cluster import KMeansClusterer

def load_df_tagged(dfname):
    """
    Load a dataframe of abstracts together with its tagged data.

    The tagged data is cached next to the dataframe under the same name with
    '.df' replaced by '.data'. If that cache cannot be read, the texts are
    tagged (this can take a while) and the cache is written.

    input: dfname = path of a pickled dataframe
    output: list [tagged_data, df_cluster]; df_cluster is indexed by abstract_id
            and only gets the column 'text' if tagging was necessary
    """
    df_cluster = pd.read_pickle(dfname)
    df_cluster = assure_index(df_cluster)
    name = dfname.replace('.df', '.data')
    try:
        # NOTE: unpickling can execute arbitrary code; only load caches created by this notebook.
        with open(name, 'rb') as f:
            tagged_data = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError, IndexError):
        # Cache missing or unreadable (these are the errors a failed open/unpickle raises).
        # Anything else, e.g. a KeyboardInterrupt, is no longer swallowed.
        print('Reading ', dfname)
        prefix = np.array(Counter(df_cluster.abstract_id.apply(lambda x: x[:14])).most_common())[0][0]
        print('prefix: ', prefix)
        # Tag data to be clustered
        # NOTE(review): make_text_col is called with its default title_weight here -- confirm this is intended.
        df_cluster = make_text_col(df_cluster)
        print('tagging ... please wait')
        tagged_data = tag_data(df_cluster.text)
        with open(name, 'wb') as f:
            pickle.dump(tagged_data, f)
        print('Saved tagged data to ', name)
    else:
        print('Tagged data read from ', name)
    return [tagged_data, df_cluster]
        
time0 = time.time()

# Cluster every '*.df' in cluster_df_path and store the result, including the new
# column 'assigned_cluster', in the subdirectory 'clusters_NLP'.
if os.path.isdir(cluster_df_path):
    # Create the output directory up front: writing the pickle into a missing
    # directory would fail only after inference and clustering of the first set.
    out_dir = os.path.join(cluster_df_path, 'clusters_NLP')
    os.makedirs(out_dir, exist_ok=True)
    dfnames = glob.glob(os.path.join(cluster_df_path, '*.df'))
    for dfname in dfnames:
        [tagged_data, df_cluster] = load_df_tagged(dfname)
        # Assign vector to each abstract
        X = np.zeros((len(df_cluster), vector_size))
        title_col = df_cluster.columns.get_loc('abstract_title')
        for i in range(len(df_cluster)):
            print('Inference for ', df_cluster.iloc[i, title_col])
            X[i,:] = model.infer_vector(tagged_data[i].words)
        
        # cluster vectors
        # alternative distance: nltk.cluster.util.cosine_distance
        kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.euclidean_distance, repeats=25)
        assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
        df_cluster['assigned_cluster'] = assigned_clusters
        df_cluster.to_pickle(os.path.join(out_dir, os.path.basename(dfname).replace('.df', '_clusters_NLP.df')))
else:
    print('cluster_df_path is not a directory')
    
print_time(time0)

###  Similarity of clusters: Rand Index
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html<br>
Nicely explained:<br>
https://davetang.org/muse/2017/09/21/the-rand-index/<br>
https://davetang.org/muse/2017/09/21/adjusted-rand-index/<br>
The Rand Index is invariant to permutations of the cluster labels.
Only problem is that these metrics assume non-overlapping clusters, but the Scan ground truth has overlapping clusters, i.e. the same abstract can be assigned to multiple clusters. Modifying the above algorithm to account for that does not seem to be easy, given that there are so many new options now.<br>
Maybe the best way to tackle this is to assign each abstract to exactly one cluster. If SBI has it assigned to multiple clusters, pick one of them randomly. Maybe repeat multiple times and see how the random pick affects Rand Index.
<br>
Paper that deals with measuring similarity between overlapping clusters: C:\python\scansmeeting\feb2019\Measuring Similarity between Sets of Overlapping Clusters.pdf<br>The paper contains three proposed algorithms; I could not find evidence that the community has agreed that any of them is the standard.<br>
<b>Adjusted Rand Index: <br></b>
1.0: clusters are the same<br>
0.0: clusters are randomly related<br>
&lt; 0: clusters agree less than expected by chance<br>

### Comparison between NLP clusters and SBI ground truth

In [None]:
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
# Compare every NLP clustering with all human clusterings of the same abstract set
# using the adjusted Rand index.
# An abstract set only enters the statistics if at least one comparison could be
# computed; comparisons that fail are left out instead of being counted as 0.
dfnames_nlp = glob.glob(os.path.join(nlp_clusters_path, '*.df'))
max_similarities = []
avg_similarities = []
for dfname_nlp in dfnames_nlp:
    print(dfname_nlp)
    df_nlp = pd.read_pickle(dfname_nlp)
    if id_to_int(df_nlp.abstract_id.values[-1]) != len(df_nlp):
        # Skip this set: going on would compare the nlp_clusters of a previous set.
        print('Missing abstract ID in NLP')
        print('_____________________________')
        continue
    nlp_clusters = np.array(df_nlp.assigned_cluster.values)
    # NOTE(review): characters 7..16 of the file name are used to find the matching human
    # files -- presumably the identifier of the abstract set; verify against the file naming.
    root = os.path.basename(dfname_nlp)[7:17]
    print(root)
    dfnames_human = glob.glob(os.path.join(human_clusters_path, '*'+root+'*.df'))
    similarities = []
    for dfname_human in dfnames_human:
        df_human = pd.read_pickle(dfname_human)
        if df_human['Abstract number'].values[-1] != len(df_human):
            # Skip this file: going on would reuse the human_clusters of a previous file.
            print('Missing abstract ID in human')
            continue
        human_clusters = np.array(df_human['Cluster ID'].values)
        if len(df_nlp) > len(df_human):
            # human_clusters can be shorter, because if the last abstract(s) was not assigned to a cluster in the report, 
            # we don't know that the abstract even existed
            # => append human_clusters with new cluster labels
            n_missing = len(df_nlp) - len(df_human)
            human_clusters = np.append(human_clusters, np.arange(n_missing)+1+np.max(human_clusters))
            print('Fixed human clusters. Added ', n_missing, ' abstract(s) and cluster(s).')
        try:
            similarity = adjusted_rand_score(nlp_clusters, human_clusters)
        except ValueError:
            # raised by sklearn e.g. when both label arrays differ in length
            print('Could not compute similarity.', len(nlp_clusters), len(human_clusters))
            continue
        similarities.append(similarity)
        print('Similarity: ', similarity)
    if similarities:
        print('Average similarity, max similarity: ', np.average(similarities), np.max(similarities))
        max_similarities.append(np.max(similarities))
        avg_similarities.append(np.average(similarities))
    else:
        # np.max of an empty array would raise
        print('No similarity could be computed for this abstract set.')
    print(dfname_nlp)
    print(dfnames_human)
    print('_____________________________')
max_similarities = np.array(max_similarities)
avg_similarities = np.array(avg_similarities)
plt.hist(max_similarities, color='black', alpha =.5, bins=40, label='max')
plt.hist(avg_similarities, color='red', alpha =.5, bins=40, label='avg')
plt.legend()
plt.xlabel('Similarity')
plt.ylabel('Number of abstract sets')
plt.show()
if len(max_similarities) > 0:
    print('Average max. similarity: ', np.average(max_similarities))
    print('Average avg. similarity: ', np.average(avg_similarities))
else:
    print('No abstract set could be compared.')