In [1]:
import pandas as pd
import numpy as np 
import re

from pathlib import Path

In [46]:
from gensim import corpora
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS as gs_stopwords
from gensim.models import KeyedVectors

In [28]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [29]:
docdir = Path('../data/docs')

In [30]:
raw_corpus = []

In [31]:
files = list(docdir.glob('*.body'))
for f in files:
    with open(f, 'r') as fp:
        raw_corpus.append(fp.read())
        fp.close()

In [36]:
# regex to remove three letter (or more) capitalized acronyms
# also removes a plural acronym like HMOs by grabbing
# until the end of the word
tla = re.compile(r'[A-Z]{3,}\S*?\b')
corpus = []
for doc in raw_corpus:
    doc = re.sub(tla,'',doc)
    # from this lemmatization tutorial
    # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    lemmatized = [wd.decode('utf-8').split('/')[0] for wd in lemmatize(doc, 
                                                                      min_length=4, stopwords=gs_stopwords)]
    corpus.append(lemmatized)


In [37]:
dictionary = corpora.Dictionary(tokens for tokens in corpus)

2019-02-24 17:26:51,080 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-02-24 17:26:51,161 : INFO : built Dictionary(4346 unique tokens: ['acquisition', 'aircraft', 'authorize', 'available', 'capital']...) from 1248 documents (total 92558 corpus positions)


In [38]:
type(dictionary.dfs)

dict

In [42]:
rare_words = []
for tokenid, count in dictionary.dfs.items():
    if count < 2:
        rare_words.append(dictionary[tokenid])
len(rare_words)

1647

In [None]:
wordlist = ''
for word in rare_words:
    wordlist = ' '.join([wordlist, word])
    if len(wordlist) > 100:
        print(wordlist)
        wordlist = ''

In [48]:
google_vec_path = '/Users/jlc/Downloads/GoogleNews-vectors-negative300.bin'
google_vec = KeyedVectors.load_word2vec_format(google_vec_path, binary=True)

2019-02-24 17:51:12,251 : INFO : loading projection weights from /Users/jlc/Downloads/GoogleNews-vectors-negative300.bin
2019-02-24 17:51:57,374 : INFO : loaded (3000000, 300) matrix from /Users/jlc/Downloads/GoogleNews-vectors-negative300.bin


In [56]:
for word in rare_words:
    try:
        google_vec.word_vec(word)
    except:
        rare_words.remove(word)
len(rare_words)

1533

In [57]:
distance_arr = np.zeros((len(rare_words), len(rare_words)))

for i, wordi in enumerate(rare_words):
    for j, wordj in enumerate(rare_words):
        if i < j:
            distance_arr[i,j] = google_vec.distance(wordi, wordj)
            distance_arr[j,i] = distance_arr[i,j]
        elif i == j:
            distance_arr[i,j] = 0

In [58]:
import hdbscan

In [68]:
hdb = hdbscan.HDBSCAN(metric='precomputed', approx_min_span_tree=False, cluster_selection_method='leaf')

In [69]:
hdb.fit(distance_arr)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=False, cluster_selection_method='leaf',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='precomputed', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [70]:
labels = pd.Series(hdb.labels_)

In [71]:
labels.value_counts()

-1     1356
 9       50
 12      17
 4       16
 10      14
 6       13
 13      11
 5       10
 7        8
 11       7
 3        7
 2        7
 8        6
 1        6
 0        5
dtype: int64

In [72]:
for i in range(14):
    print(i, [w for iw, w in enumerate(rare_words) if labels[iw] == i])

0 ['seventh', 'marked', 'eighth', 'consecutive', 'tenth']
1 ['blue', 'yellow', 'black', 'pink', 'brown', 'orange']
2 ['formulate', 'outline', 'articulate', 'examine', 'explain', 'clarify', 'acknowledge']
3 ['accrual', 'prepayment', 'receivable', 'undisbursed', 'repayable', 'amortizing', 'debenture']
4 ['landlord', 'nonfrivolous', 'mortgagee', 'allottee', 'purchaser', 'foreclosed', 'possessory', 'claimant', 'subaccount', 'annuitant', 'tenancy', 'mortgagor', 'lien', 'grantor', 'taxable', 'lessor']
5 ['environmentally', 'inherently', 'artistically', 'culturally', 'socially', 'economically', 'generally', 'tend', 'necessarily', 'ecological']
6 ['automate', 'automated', 'sufficiently', 'automation', 'scalable', 'software', 'connectivity', 'efficiently', 'centralized', 'reliability', 'standardization', 'properly', 'database']
7 ['redecorate', 'redecoration', 'refurnishing', 'demolish', 'remodeling', 'renovate', 'evict', 'remodel']
8 ['biosimilar', 'biologic', 'physician', 'clinical', 'healthc

291