In [1]:
import pandas as pd
import numpy as np 
import re

from pathlib import Path

In [2]:
from gensim import corpora
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS as gs_stopwords
from gensim.models import KeyedVectors

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
docdir = Path('../data/docs')

In [5]:
raw_corpus = []

In [6]:
files = list(docdir.glob('*.body'))
for f in files:
    with open(f, 'r') as fp:
        raw_corpus.append(fp.read())
        fp.close()

In [7]:
# regex to remove three letter (or more) capitalized acronyms
# also removes a plural acronym like HMOs by grabbing
# until the end of the word
tla = re.compile(r'[A-Z]{3,}\S*?\b')
corpus = []
for doc in raw_corpus:
    doc = re.sub(tla,'',doc)
    # from this lemmatization tutorial
    # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    lemmatized = [wd.decode('utf-8').split('/')[0] for wd in lemmatize(doc, 
                                                                      min_length=4, stopwords=gs_stopwords)]
    corpus.append(lemmatized)


In [8]:
dictionary = corpora.Dictionary(tokens for tokens in corpus)

2019-02-24 18:41:40,883 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-02-24 18:41:41,020 : INFO : built Dictionary(4346 unique tokens: ['acquisition', 'aircraft', 'authorize', 'available', 'capital']...) from 1248 documents (total 92558 corpus positions)


In [9]:
type(dictionary.dfs)

dict

In [10]:
rare_words = []
for tokenid, count in dictionary.dfs.items():
    if count < 2:
        rare_words.append(dictionary[tokenid])
len(rare_words)

1647

In [11]:
wordlist = ''
for word in rare_words:
    wordlist = ' '.join([wordlist, word])
    if len(wordlist) > 100:
        print(wordlist)
        wordlist = ''

 moderately focused random randomized landlord promising forbid apprentice trainee unemployed delay postage
 suffrage deobligation rehire wellness dump heroin willing nonagricultural nonimmigrant committed practical
 liaison accessory exporter shipper barrel cylinder frame breech postmaster canadian maintained escalation
 helsinki reinsurance turkey turkish indictment stand coalition procedural elizabeth beach downgrade resiliency
 telemarketing correspondingly manifest packing grape varietal wine microorganism kindred dune retitle
 lakeshore nonapplication douglas redesignation miller assurance supreme cotton avian specialty zoonotic
 stockpile scrapie screwworm formulate brucellosis escort philosophical aids improper analytic confer
 alignment earlier afford roma escobare grulla salineno multiply redistribution burned furnished capitalized
 retardant equality raise setting marriage genital cutting mutilation constrain destabilizing neighboring
 blue nile abyei referendum viable macro

In [12]:
glove_file = '/Users/jlc/Downloads/glove.6B/glove.6B.100d.txt'
w2v_file = '/Users/jlc/Downloads/glove.6B/glove.6B.100d.txt.w2v'

glove = KeyedVectors.load_word2vec_format(w2v_file, binary=False)

2019-02-24 18:44:48,298 : INFO : loading projection weights from /Users/jlc/Downloads/glove.6B/glove.6B.100d.txt.w2v
2019-02-24 18:45:22,184 : INFO : loaded (400000, 100) matrix from /Users/jlc/Downloads/glove.6B/glove.6B.100d.txt.w2v


In [15]:
for word in rare_words:
    try:
        glove.word_vec(word)
    except:
        rare_words.remove(word)
len(rare_words)

1580

In [17]:
distance_arr = np.zeros((len(rare_words), len(rare_words)))

for i, wordi in enumerate(rare_words):
    for j, wordj in enumerate(rare_words):
        if i < j:
            distance_arr[i,j] = glove.distance(wordi, wordj)
            distance_arr[j,i] = distance_arr[i,j]
        elif i == j:
            distance_arr[i,j] = 0

In [18]:
import hdbscan

In [19]:
hdb = hdbscan.HDBSCAN(metric='precomputed', approx_min_span_tree=False, cluster_selection_method='leaf')

In [20]:
hdb.fit(distance_arr)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=False, cluster_selection_method='leaf',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='precomputed', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [21]:
labels = pd.Series(hdb.labels_)

In [22]:
labels.value_counts()

-1    1422
 6      53
 5      29
 4      26
 7      24
 3       7
 2       7
 1       6
 0       6
dtype: int64

In [23]:
for i in range(7):
    print(i, [w for iw, w in enumerate(rare_words) if labels[iw] == i])

0 ['avian', 'aids', 'polio', 'epidemic', 'vaccination', 'infection']
1 ['dioxide', 'oxide', 'vapor', 'methane', 'liquid', 'hydrogen']
2 ['server', 'messaging', 'software', 'networking', 'cable', 'mobile', 'phone']
3 ['indictment', 'alleged', 'tribunal', 'jury', 'extradition', 'lawsuit', 'infringement']
4 ['alabama', 'arizona', 'colorado', 'connecticut', 'delaware', 'illinois', 'iowa', 'louisiana', 'maine', 'michigan', 'minnesota', 'montana', 'hampshire', 'dakota', 'ohio', 'pennsylvania', 'rhode', 'utah', 'vermont', 'wisconsin', 'wyoming', 'cleveland', 'arkansas', 'anaheim', 'idaho', 'maryland']
5 ['turkish', 'uganda', 'cameroon', 'niger', 'nigeria', 'malawi', 'egyptian', 'iraqi', 'lebanese', 'syrian', 'yemen', 'regime', 'corrupt', 'britain', 'ministry', 'bahrain', 'ethiopia', 'saudi', 'arabia', 'taliban', 'pakistani', 'singapore', 'australia', 'chinese', 'repressive', 'thailand', 'cambodia', 'india', 'vietnam']
6 ['focused', 'promising', 'delay', 'willing', 'practical', 'earlier', 'rai