# Creating clusters based on Word Mover Distance

Word Mover Distance 

https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.wmdistance


In [9]:
import pandas as pd
import numpy as np 
import re

from pathlib import Path

import pickle

from gensim import corpora
from gensim.models import KeyedVectors

from budget_corpus import read_raw_corpus
from gensim.utils import simple_preprocess
from gensim.utils import tokenize

from hdbscan import HDBSCAN

In [10]:
# For this one I used the smart stopwords in the Word Mover Distance 
# paper rather than my budget/legal stopwords. Since words need to be
# matched to other words, I may get better results leaving them in since
# I'm not sure that I got all the different ways the budget talks about
# money (cost, funds, accounts, expenditures, expenses, etc.) If the 
# stop words clean one document better than another, things may be 
# too far apart? 

from smart_stopwords import smart_stopwords as stopwords

# Tokenize each raw document with gensim's simple_preprocess, then strip
# the SMART stopwords from every token list.
raw_corpus = read_raw_corpus()
tokenized_docs = map(simple_preprocess, raw_corpus)
corpus = [
    [token for token in tokens if token not in stopwords]
    for tokens in tokenized_docs
]


## Load or calculate distances

In [11]:
# Cache files for the pairwise distance matrix and for the document
# subset it was computed on; both live in the shared data directory.
data_dir = Path('../data')
wmdist_path = data_dir / 'wmdistance.pkl'
subset_path = data_dir / 'subset.pkl'

In [13]:
# Reuse the cached distance matrix and document subset when both pickle
# files exist; otherwise compute them (slow) and write the cache files.
if wmdist_path.exists() and subset_path.exists():
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # cache files that this notebook wrote itself.
    with open(wmdist_path, 'rb') as fp:
        result_array = pickle.load(fp)
    with open(subset_path, 'rb') as fp:
        subset_corpus = pickle.load(fp)

else:
    google_vec_path = '/Users/jlc/Downloads/GoogleNews-vectors-negative300.bin'
    google_vec = KeyedVectors.load_word2vec_format(google_vec_path, binary=True)

    # Add an additional pruning of words not in google vec. The pruned
    # words are kept in a set so the per-token membership test below is
    # a constant-time lookup rather than a scan of a list.
    dictionary = corpora.Dictionary(corpus)
    prune = set()
    for word in dictionary.values():
        try:
            google_vec.get_vector(word)
        except KeyError:
            prune.add(word)

    corpus = [[w for w in doc if w not in prune] for doc in corpus]

    # Subset to corpus members that are between 10 and 100 words long
    # (so that this finishes in a reasonable amount of time). Each entry
    # is (position in corpus, token list) so that a row of the distance
    # matrix can be mapped back to its original document.
    subset_corpus = [
        (doc_index, doc)
        for doc_index, doc in enumerate(corpus)
        if 10 <= len(doc) <= 100
    ]
    subset_size = len(subset_corpus)

    # np.zeros already provides the zero diagonal. Every distance is
    # mirrored into both triangles, so only pairs with i < j are computed;
    # this halves the number of wmdistance calls.
    result_array = np.zeros((subset_size, subset_size))
    for i in range(subset_size):
        for j in range(i + 1, subset_size):
            distance = google_vec.wmdistance(
                subset_corpus[i][1], subset_corpus[j][1])
            result_array[i, j] = distance
            result_array[j, i] = distance

    # Write to the same paths the cache check above reads from; the
    # context managers close the files even if pickling fails.
    with open(wmdist_path, 'wb') as fp:
        pickle.dump(result_array, fp)

    with open(subset_path, 'wb') as fp:
        pickle.dump(subset_corpus, fp)

## Clustering

In [14]:
# Cluster on the precomputed pairwise distance matrix. The final bare
# fit() call is kept so the notebook displays the fitted estimator.
leaf_options = {
    'metric': 'precomputed',
    'approx_min_span_tree': False,
    'cluster_selection_method': 'leaf',
}
hdb = HDBSCAN(**leaf_options)
hdb.fit(result_array)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=False, cluster_selection_method='leaf',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='precomputed', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [15]:
# Tally how many documents received each cluster label. The -1 row
# counts the documents that were left unclustered.
labels = pd.Series(hdb.labels_)
labels.value_counts()

-1     830
 7      17
 3      14
 6      11
 4      10
 1       9
 11      8
 0       7
 12      6
 10      6
 9       6
 8       6
 5       6
 2       6
dtype: int64

In [16]:
# Print the raw text of every document in each cluster, one cluster at a
# time. The number of clusters is derived from the fitted labels rather
# than hard-coded, so the cell stays correct when hdb is refit and finds
# a different number of clusters. The -1 (unclustered) label is skipped
# because the range starts at 0.
n_clusters = max(hdb.labels_, default=-1) + 1
for i in range(n_clusters):
    print(f'-------- {i} ---------')
    for index, label in enumerate(hdb.labels_):
        if label == i:
            # subset_corpus[index][0] is the document's position in the
            # full corpus, which is also its position in raw_corpus.
            raw_label = subset_corpus[index][0]
            print("")
            print(raw_corpus[raw_label])
            print("")
            

-------- 0 ---------

532.(a) None of the funds made available in this Act may be used to maintain or establish a computer network unless such network blocks the viewing, downloading, and exchanging of pornography.(b) Nothing in subsection (a) shall limit the use of funds necessary for any Federal, State, tribal, or local law enforcement agency or any other entity carrying out criminal investigations, prosecution, adjudication, or other law enforcement- or victim assistance-related activity.


713.(a) None of the funds made available in this Act may be used to maintain or establish a computer network unless such network blocks the viewing, downloading, and exchanging of pornography.(b) Nothing in subsection (a) shall limit the use of funds necessary for any Federal, State, tribal, or local law enforcement agency or any other entity carrying out criminal investigations, prosecution, or adjudication activities.


628.(a) None of the funds made available in this Act may be used to maintai

# Can we do any better with regularization? 

With regularization, there are fewer unclustered documents, and the 
butterfly document remains in the unclustered subset.

But it is hard to evaluate the quality of the regularization: clusters that make sense to a human 
versus clusters that make sense to the word2vec math are not quite the same thing. We can calculate
the latter, but not the former. 

At any rate, we have too many unclusterable documents to say that we've made our 
search for 'butterflies' any easier. 

In [38]:
# Refit on the same precomputed distance matrix, this time with a small
# alpha and the default cluster selection method. The final bare fit()
# call is kept so the notebook displays the fitted estimator.
alpha_options = {
    'metric': 'precomputed',
    'approx_min_span_tree': False,
    'alpha': .0001,
}
hdb = HDBSCAN(**alpha_options)
hdb.fit(result_array)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=0.0001,
    approx_min_span_tree=False, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='precomputed', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [39]:
# Tally the cluster sizes for the refit model. The -1 row counts the
# documents that were left unclustered.
labels = pd.Series(hdb.labels_)
labels.value_counts()

-1     706
 26     17
 23     15
 27     13
 12     13
 5      12
 13     12
 21     11
 17     10
 28      9
 18      9
 1       8
 4       8
 2       8
 11      7
 25      7
 16      6
 19      6
 20      6
 9       6
 8       6
 7       6
 22      6
 15      5
 10      5
 6       5
 24      5
 3       5
 0       5
 14      5
dtype: int64

In [40]:
# Report the position within subset_corpus of the first document whose
# token list contains 'butterfly', then stop searching.
for index, (_, tokens) in enumerate(subset_corpus):
    if 'butterfly' not in tokens:
        continue
    print(index)
    break

174


In [41]:
# 174 is the index printed by the previous cell (the first subset
# document containing 'butterfly'); show the cluster label it received.
hdb.labels_[174]

-1