In [1]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from adjustText import adjust_text
import operator

import gensim.downloader as api
from gensim.models import word2vec
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA

In [2]:
# notebook hparams
corpus_path = r"..\barrons_333_corpus.txt"
W2V_SIZE = 100    # Word vector dimensionality
W2V_WINDOW = 30   # Context window size
W2V_MIN_COUNT = 1    # Minimum word count
W2V_EPOCHS = 50    # w2v model training iters

np.set_printoptions(suppress=True)

In [3]:
# extracting gensim pretrained model & corpora info
def get_gensim_pretrained_info(entity, desc_len=None):
    """
    :param entity: either 'corpora' or 'models'
    :param desc_len: description length of each entity, entire description is printed if this is None
    :return: None
    """
    info = api.info()
    for entity_name, entity_data in sorted(info[entity].items()):
        print(f"{entity_name:<40} {entity_data.get('num_records', -1)} records: "
              f"{entity_data['description'][:desc_len] + '...'}")

In [4]:
# tokenize a corpus using nltk
def nltk_corpus_tokenizer(corpus):
    # tokenize sentences in corpus
    wpt = nltk.WordPunctTokenizer()
    tokenized_corpus = wpt.tokenize(corpus)
    return tokenized_corpus

In [5]:
# train a w2v model on a given corpus
def train_w2v_model(corpus, size=W2V_SIZE, window=W2V_WINDOW, min_count=W2V_MIN_COUNT, iters=W2V_EPOCHS, workers=4):
    logging.info(f'word2vec model training started with params {size, window, min_count, iters, workers}')
    w2v_model = word2vec.Word2Vec(corpus,
                                  size=size,
                                  window=window,
                                  min_count=min_count,
                                  iter=iters,
                                  workers=workers)
    logging.info(f'word2vec model training completed..')
    return w2v_model

## extracting w2v feature vectors

In [6]:
%%time
with open(corpus_path, 'r') as f:
    barrons_corpus = f.read()
    
tokenized_corpus = nltk_corpus_tokenizer(barrons_corpus)
print(f'length of raw corpus: {len(tokenized_corpus)}')

length of raw corpus: 334
Wall time: 998 µs


In [7]:
def filter_corpus_vocab(w2v_model, tokenized_corpus):
    trained_words = list()
    untrained_words = list()
    model_vocab = w2v_model.vocab
    for _word in tokenized_corpus:
        if _word in model_vocab:
            trained_words.append(_word)
        else:
            untrained_words.append(_word)
    
    print(f"w2v model doesn't have {len(untrained_words)} words: {untrained_words}")
    return trained_words

In [8]:
get_gensim_pretrained_info('models', desc_len=80)
# w2v_model = train_w2v_model(tokenized_corpus)

__testing_word2vec-matrix-synopsis       -1 records: [THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix....
conceptnet-numberbatch-17-06-300         1917247 records: ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known...
fasttext-wiki-news-subwords-300          999999 records: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt...
glove-twitter-100                        1193514 records: Pre-trained vectors based on  2B tweets, 27B tokens, 1.2M vocab, uncased (https:...
glove-twitter-200                        1193514 records: Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...
glove-twitter-25                         1193514 records: Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...
glove-twitter-50                         1193514 records: Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...
glove-wiki-gigaword-100         

In [9]:
%%time
# load pre trained model
w2v_model_type = "glove-wiki-gigaword-50"
# w2v_model_type = "glove-twitter-100"  # doesn't work well
# w2v_model_type = "glove-wiki-gigaword-300"

w2v_model = api.load(w2v_model_type)
tokenized_corpus = filter_corpus_vocab(w2v_model, tokenized_corpus)
print(f'length of filtered corpus: {len(tokenized_corpus)}')

w2v model doesn't have 3 words: ['desiccate', 'enervate', 'veracious']
length of filtered corpus: 331
Wall time: 32.2 s


In [10]:
# sample test runs
for _word in ["stop", "woman", "man", "abate", "india"]:
    neighbors = w2v_model.most_similar(_word, topn=5)
    print(neighbors)

[('stopping', 0.8896316885948181), ('stopped', 0.8306300640106201), ('trying', 0.8275439739227295), ('tried', 0.8172997832298279), ('stops', 0.8129743933677673)]
[('girl', 0.9065280556678772), ('man', 0.8860336542129517), ('mother', 0.8763703107833862), ('her', 0.86131352186203), ('boy', 0.8596119284629822)]
[('woman', 0.8860337734222412), ('boy', 0.8564431071281433), ('another', 0.8452839851379395), ('old', 0.8372182846069336), ('one', 0.8276063203811646)]
[('caputo', 0.6675522923469543), ('washpost.com', 0.6115913391113281), ('levesque', 0.6013662815093994), ('subside', 0.5964425206184387), ('greef', 0.5947305560112)]
[('indian', 0.8648794889450073), ('pakistan', 0.8529723286628723), ('malaysia', 0.816650927066803), ('bangladesh', 0.8154239058494568), ('delhi', 0.8142766952514648)]


In [11]:
# get word vectors for all corpus words
wvs = w2v_model.wv[tokenized_corpus]
wvs.shape

  


(331, 50)

In [12]:
# build feature array for all tokens in corpus to cluster
w2v_feature_array = wvs.copy()  # if using barrons word meanings, average feature vectors

#### DBScan clustering

In [13]:
%%time
# build dbscan clustering model
# dbscan_model = DBSCAN(metric='cosine', eps=0.5, min_samples=3)
dbscan_model = DBSCAN(metric='cosine', eps=0.5, min_samples=2, algorithm="auto")

w2v_feature_array = StandardScaler().fit_transform(w2v_feature_array)
clustering = dbscan_model.fit(w2v_feature_array)
labels = clustering.labels_
core_samples = clustering.core_sample_indices_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = len([lab for lab in labels if lab == -1])

print(f"clustering metrics {set(labels)}: {n_clusters}, \n{n_noise} noise clusters out of {len(labels)}")
print(labels)

clustering metrics {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, -1}: 17, 
84 noise clusters out of 331
[ 0 -1  1  0  0  0  0  0 -1  2  0  3  4  0  0 -1  0  0  0  0  0  0  0  5
  0  0  0  0  0  0  0  0  0 -1 -1  0 -1  0  0  0  0 -1  0  0 -1 -1  0  6
  0 -1  0  7  0  0 -1 -1  0  0  0  0 -1  3 -1  0  0  0  0 -1  0  0 -1  5
  0  0  0  0  0  0  0  0  0  0  8  7  0  0 -1  0  0  0  0  0  0  0  0  3
  0 -1 -1 -1  0 -1  0 -1  0 -1  1  0  0  0  9 -1  7 -1 -1  9  3 10  0  0
 11  2  0  0  0  7 -1  0  0 -1  9  0  0  0 -1  0  0  0 -1  0  0  0  0  0
  0  0  0  0 -1  0  0  0  0  0  0 -1  0 -1  0  0 -1  0 12  0 12 -1 13 -1
 -1 -1  0  0  3 -1  0 -1  0  0  0  0 -1  0  0  0  0  0 14 -1  0  0 -1 -1
  0 -1  0 -1  0 -1 -1 -1  0 -1 -1  2  0 -1  0  0  7  0 -1  0  0  0  0  0
  0  0  0  0  0  0  0 -1  0 -1 -1  0 -1  0 -1  0  0 -1 10 -1  0 12 -1  0
  0  0 -1  8  3 -1 12 -1  0 15  0  6  0  0 11  0  4  0  0 -1 -1 -1  0  0
  0  0  3  0 -1 -1  0  0  0 -1 12  0  3 -1  0  0  0  0  0  0  0 -1 16  0
  0 14 

In [14]:
corpus_clusters_df = pd.DataFrame({'words': tokenized_corpus, 'cluter_labels': labels})
print(f"the corpus is segregated into {n_clusters} clusters")

the corpus is segregated into 17 clusters


In [15]:
corpus_clusters_df

Unnamed: 0,words,cluter_labels
0,abate,0
1,aberrant,-1
2,abeyance,1
3,abscond,0
4,abstemious,0
...,...,...
326,volatile,-1
327,wary,0
328,welter,-1
329,whimsical,0


## Save clusters to CSV

In [16]:
%%time
conv_clusters_df = pd.DataFrame()

count = 0
for cluster_id, subset_df in corpus_clusters_df.sort_values(by="cluter_labels").groupby("cluter_labels"):
    conv_clusters_df = pd.concat([conv_clusters_df, subset_df.words.reset_index(drop=True)], ignore_index=True, axis=1)

conv_clusters_df.fillna(value='', inplace=True)
conv_clusters_df.to_csv(f"../res/{w2v_model_type}-dbs_clusters.csv", index=False)

Wall time: 33.9 ms


In [17]:
conv_clusters_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,implacable,loquacious,abeyance,alacrity,plethora,ambiguous,approbation,precursor,metamorphosis,digression,extrapolation,perennial,pristine,impermeable,implicit,saturate,precarious,salubrious
1,ostentatious,tirade,dormant,equanimity,disparate,viable,deference,catalyst,eulogy,platitude,empirical,endemic,ephemeral,porous,tacit,inundate,tenuous,soporific
2,paragon,mollify,,magnanimity,emulate,problematic,,,elegy,,efficacy,,,impervious,,,,
3,pate,tangential,,,relegate,,,,coda,,,,,permeable,,,,
4,dogmatic,substantiate,,,contention,,,,dirge,,,,,refractory,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,,enhance,,,,,,,,,,,,,,,,
196,,dupe,,,,,,,,,,,,,,,,
197,,ebullient,,,,,,,,,,,,,,,,
198,,document,,,,,,,,,,,,,,,,
