# Glove

In ths notebook, I'm trying to see if there
are unique words that could be combined into larger categories to help
documents match by using word similarity scores from a 
pretrained Glove model.

In [78]:
import pandas as pd
import numpy as np 

from pathlib import Path

from gensim import corpora
from gensim.models import KeyedVectors

from budget_corpus import read_documents

import hdbscan

In [79]:
corpus = read_documents()

In [80]:
dictionary = corpora.Dictionary(tokens for tokens in corpus)

In [35]:
glove_file = '/Users/jlc/Downloads/glove.6B/glove.6B.100d.txt'
w2v_file = '/Users/jlc/Downloads/glove.6B/glove.6B.100d.txt.w2v'

glove = KeyedVectors.load_word2vec_format(w2v_file, binary=False)

In [36]:
rare_words = []
for tokenid, count in dictionary.dfs.items():
    if count < 5:
        rare_words.append(dictionary[tokenid])

prune = []
for word in rare_words:
    try:
        glove.word_vec(word)
    except KeyError:
        prune.append(word)
        
for word in prune:
    rare_words.remove(word)
    
len(rare_words)


2839

In [37]:
distance_arr = np.zeros((len(rare_words), len(rare_words)))

for i, wordi in enumerate(rare_words):
    for j, wordj in enumerate(rare_words):
        if i < j:
            distance_arr[i,j] = glove.distance(wordi, wordj)
            distance_arr[j,i] = distance_arr[i,j]
        elif i == j:
            distance_arr[i,j] = 0

In [73]:
hdb = hdbscan.HDBSCAN(metric='precomputed', 
                      approx_min_span_tree=False, 
                      cluster_selection_method='leaf',
                     alpha=.0001)

In [74]:
hdb.fit(distance_arr)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=0.0001,
    approx_min_span_tree=False, cluster_selection_method='leaf',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='precomputed', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [75]:
labels = pd.Series(hdb.labels_)

In [76]:
labels.value_counts()

-1     2154
 57      34
 58      18
 25      16
 4       16
 60      16
 48      15
 13      15
 16      15
 65      14
 41      14
 34      13
 21      12
 64      11
 63      11
 17      11
 75      11
 27      11
 45      10
 56      10
 39      10
 49      10
 52      10
 70      10
 72       9
 19       9
 40       9
 79       9
 54       9
 31       9
       ... 
 32       6
 28       6
 42       6
 80       6
 66       6
 11       6
 51       6
 15       6
 47       6
 77       6
 3        5
 18       5
 2        5
 69       5
 6        5
 59       5
 10       5
 55       5
 14       5
 53       5
 20       5
 5        5
 76       5
 68       5
 26       5
 33       5
 23       5
 78       5
 7        5
 0        5
Length: 82, dtype: int64

In [86]:
for i in range(81):
    print(f'--- word cluster {i} ----')
    print([w for iw, w in enumerate(rare_words) if labels[iw] == i])

--- word cluster 0 ----
['link', 'particulate']
--- word cluster 1 ----
['count', 'ideological']
--- word cluster 2 ----
['ally', 'maryland']
--- word cluster 3 ----
['daily', 'apprehend', 'therapeutic', 'safest']
--- word cluster 4 ----
['hear', 'portrait', 'physically', 'history', 'antiterrorism', 'carroll', 'singapore', 'soundness']
--- word cluster 5 ----
['quota', 'repay', 'intentionally', 'centralized']
--- word cluster 6 ----
['iowa', 'siltation', 'piloting']
--- word cluster 7 ----
['probation', 'fulbright']
--- word cluster 8 ----
['undersubscribed', 'del']
--- word cluster 9 ----
['kind', 'responsive', 'coral', 'notified']
--- word cluster 10 ----
['vegetable', 'fruit', 'detoxification', 'clinic', 'breach', 'suicide', 'aftercare']
--- word cluster 11 ----
[]
--- word cluster 12 ----
['cumulative', 'man', 'residential']
--- word cluster 13 ----
['aids', 'quarters', 'light', 'blogger']
--- word cluster 14 ----
['mint', 'pretrial', 'instant', 'uniformed', 'rename', 'comma', 'tar

Some of these are pretty interesting:  Cluster 1 points to environmental issues, cluster 5 human rights, cluster 7 to education, cluster 10 to public health. Cluster 0 is maybe words ending in 'e'?? Looking at cluster 15, I get the impression that glove is better at clustering nouns (at least in English).

# and what words are closest to butterfly? 

Butterfly was unclustered, but we can still look at the distance
array and see what is closest.

In [43]:
rare_words.index('butterfly')

838

In [44]:
distance_arr[838]

array([0.96288221, 0.87520699, 0.891026  , ..., 1.0781585 , 0.81129131,
       0.98257685])

In [45]:
series = pd.Series(distance_arr[838])

In [65]:
series.sort_values().head(10)

838     0.000000
206     0.512884
398     0.529767
708     0.534224
1966    0.576681
362     0.592149
710     0.594837
357     0.599054
213     0.599221
204     0.606453
dtype: float64

In [66]:
words = series.sort_values().head(10).index

In [67]:
printable = [ rare_words[i] for i in words ] 
printable 

['butterfly',
 'bird',
 'wild',
 'olympic',
 'cat',
 'turtle',
 'paralympic',
 'elephant',
 'tree',
 'insect']

# Expanding to the full dictionary 


In [62]:
all_words = list(dictionary.values())

prune = []
for word in all_words:
    try:
        glove.word_vec(word)
    except KeyError:
        prune.append(word)

for word in prune:
    all_words.remove(word)
    
len(all_words)

4291

In [50]:
all_distance_arr = np.zeros((len(all_words), len(all_words)))

for i, wordi in enumerate(all_words):
    for j, wordj in enumerate(all_words):
        if i < j:
            all_distance_arr[i,j] = glove.distance(wordi, wordj)
            all_distance_arr[j,i] = all_distance_arr[i,j]
        elif i == j:
            all_distance_arr[i,j] = 0

In [81]:
hdb = hdbscan.HDBSCAN(metric='precomputed', 
                      approx_min_span_tree=False, 
                      cluster_selection_method='leaf', 
                     alpha=.0001)

In [82]:
hdb.fit(all_distance_arr)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=0.0001,
    approx_min_span_tree=False, cluster_selection_method='leaf',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='precomputed', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [83]:
labels = pd.Series(hdb.labels_)

In [84]:
labels.value_counts()

-1      3359
 95       28
 40       23
 70       20
 98       20
 4        16
 109      16
 77       15
 94       15
 76       14
 88       13
 68       13
 108      12
 111      12
 21       12
 56       12
 14       12
 92       12
 22       12
 47       12
 35       11
 64       11
 60       11
 10       11
 102      11
 104      11
 112      10
 59       10
 45       10
 84       10
        ... 
 107       5
 103       5
 99        5
 91        5
 87        5
 67        5
 63        5
 51        5
 39        5
 27        5
 15        5
 11        5
 3         5
 110       5
 90        5
 78        5
 62        5
 50        5
 42        5
 38        5
 34        5
 30        5
 6         5
 113       5
 105       5
 81        5
 73        5
 69        5
 61        5
 0         5
Length: 117, dtype: int64

In [85]:
for i in range(117):
    print(f'--- word cluster {i} ----')
    print([w for iw, w in enumerate(all_words) if labels[iw] == i])

--- word cluster 0 ----
['abyei', 'kordofan', 'kachin', 'rakhine', 'rohingya']
--- word cluster 1 ----
['grulla', 'abus', 'état', 'totale', 'holdback', 'minimis', 'alexandrium']
--- word cluster 2 ----
['cooperator', 'mortgagee', 'annuitant', 'submunition', 'mortgagor']
--- word cluster 3 ----
['grande', 'rio', 'del', 'mar', 'vale']
--- word cluster 4 ----
['redesignate', 'reinspection', 'commingle', 'disaggregate', 'adulterate', 'effectuate', 'securitize', 'impute', 'prorate', 'derogate', 'reflow', 'sublease', 'subleasing', 'urbanize', 'repurpose', 'repurposed']
--- word cluster 5 ----
['relocation', 'demobilization', 'reintegration', 'repatriation', 'resettlement']
--- word cluster 6 ----
['allocation', 'allocate', 'disburse', 'earmarked', 'allocated']
--- word cluster 7 ----
['temporary', 'permanent', 'disruption', 'shutdown', 'outage', 'closure']
--- word cluster 8 ----
['reimbursement', 'allowance', 'deduction', 'deduct', 'tuition']
--- word cluster 9 ----
['uniform', 'furniture',

In [68]:
all_words.index('butterfly')

2095

In [71]:
butterfly_links = [all_words[i] for i in  pd.Series(all_distance_arr[2095]).sort_values().head(10).index]
butterfly_links

['butterfly',
 'species',
 'bird',
 'wild',
 'olympic',
 'cat',
 'holder',
 'turtle',
 'paralympic',
 'event']

The butterfly is an animal, but also an event in Olympic swimming.