In [1]:
import logging
import operator
import os

import gensim.downloader as api
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from adjustText import adjust_text
from gensim.models import word2vec
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA

In [2]:
# notebook hparams
# Forward slashes are accepted by open() on Windows as well as POSIX, so this relative
# path works on every platform and matches the "../res/..." style used when saving results.
corpus_path = "../barrons_333_corpus.txt"
W2V_SIZE = 100    # Word vector dimensionality
W2V_WINDOW = 30   # Context window size
W2V_MIN_COUNT = 1    # Minimum word count
W2V_EPOCHS = 50    # w2v model training iters

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [3]:
# extracting gensim pretrained model & corpora info
def get_gensim_pretrained_info(entity, desc_len=None):
    """
    Print one summary line per entry of the gensim downloader catalogue:
    the entry name, its record count (-1 when not reported) and its description.

    :param entity: either 'corpora' or 'models'
    :param desc_len: maximum number of description characters to print; the entire
                     description is printed if this is None
    :return: None
    """
    info = api.info()
    for entity_name, entity_data in sorted(info[entity].items()):
        description = entity_data.get('description', '')
        # Add the ellipsis only when text was actually cut off, so complete
        # descriptions are not shown as if they had been truncated.
        if desc_len is not None and len(description) > desc_len:
            description = description[:desc_len] + '...'
        print(f"{entity_name:<40} {entity_data.get('num_records', -1)} records: {description}")

In [4]:
# tokenize a corpus using nltk
def nltk_corpus_tokenizer(corpus):
    """
    Tokenize raw corpus text with nltk's WordPunctTokenizer.

    :param corpus: the corpus text to tokenize
    :return: whatever ``WordPunctTokenizer.tokenize`` returns for the text
             (a flat sequence of tokens, not grouped by sentence)
    """
    return nltk.WordPunctTokenizer().tokenize(corpus)

In [5]:
# train a w2v model on a given corpus
def train_w2v_model(corpus, size=W2V_SIZE, window=W2V_WINDOW, min_count=W2V_MIN_COUNT, iters=W2V_EPOCHS, workers=4):
    """
    Train a gensim word2vec model on the given corpus and return it.

    The ``size=`` / ``iter=`` keywords passed to ``Word2Vec`` below are the gensim 3.x
    spellings (gensim 4 renamed them to ``vector_size`` / ``epochs``), so this helper
    targets gensim 3.x like the rest of the notebook.

    Progress is reported through ``logging.info``; those messages are only visible when
    logging has been configured to show INFO-level records.

    :param corpus: training data handed unchanged to ``word2vec.Word2Vec``; gensim expects an
                   iterable of tokenized sentences (lists of str) rather than a flat token list
    :param size: word vector dimensionality
    :param window: context window size
    :param min_count: minimum word count for a word to be kept
    :param iters: number of training iterations over the corpus
    :param workers: forwarded to ``Word2Vec`` as its ``workers`` argument
    :return: the trained ``Word2Vec`` model
    """
    logging.info(f'word2vec model training started with params {size, window, min_count, iters, workers}')
    w2v_model = word2vec.Word2Vec(corpus,
                                  size=size,
                                  window=window,
                                  min_count=min_count,
                                  iter=iters,
                                  workers=workers)
    logging.info('word2vec model training completed..')
    return w2v_model

## extracting w2v feature vectors

In [6]:
%%time
# Read the whole corpus file into memory. No encoding is passed to open(), so the
# platform default text encoding is used.
with open(corpus_path, 'r') as f:
    barrons_corpus = f.read()
    
# Tokenize the raw text and report how many tokens were produced.
tokenized_corpus = nltk_corpus_tokenizer(barrons_corpus)
print(f'length of raw corpus: {len(tokenized_corpus)}')

length of raw corpus: 334
Wall time: 1.99 ms


In [7]:
def filter_corpus_vocab(w2v_model, tokenized_corpus):
    """
    Keep only the corpus tokens that are present in the model's vocabulary.

    Tokens missing from ``w2v_model.vocab`` are collected and reported with a single
    print statement; they are not returned.

    :param w2v_model: object exposing a ``vocab`` container that supports ``in`` lookups
    :param tokenized_corpus: iterable of tokens to check
    :return: list of the tokens found in the vocabulary, in their original order
    """
    known_vocab = w2v_model.vocab
    kept_tokens, missing_tokens = [], []
    for token in tokenized_corpus:
        # Route each token to exactly one of the two lists.
        bucket = kept_tokens if token in known_vocab else missing_tokens
        bucket.append(token)

    print(f"w2v model doesn't have {len(missing_tokens)} words: {missing_tokens}")
    return kept_tokens

In [8]:
# List the entries under 'models' in the gensim downloader catalogue, with descriptions cut to 80 characters.
get_gensim_pretrained_info('models', desc_len=80)
# Disabled alternative: train a model on the corpus instead of loading a pretrained one.
# w2v_model = train_w2v_model(tokenized_corpus)

__testing_word2vec-matrix-synopsis       -1 records: [THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix....
conceptnet-numberbatch-17-06-300         1917247 records: ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known...
fasttext-wiki-news-subwords-300          999999 records: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt...
glove-twitter-100                        1193514 records: Pre-trained vectors based on  2B tweets, 27B tokens, 1.2M vocab, uncased (https:...
glove-twitter-200                        1193514 records: Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...
glove-twitter-25                         1193514 records: Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...
glove-twitter-50                         1193514 records: Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...
glove-wiki-gigaword-100         

In [9]:
%%time
# load pre trained model
# w2v_model_type = "glove-wiki-gigaword-50"
# w2v_model_type = "glove-twitter-100"  # doesn't work well
w2v_model_type = "glove-wiki-gigaword-300"

# Load the named model through gensim's downloader API; the recorded run of this cell took minutes.
w2v_model = api.load(w2v_model_type)
# NOTE: this rebinds tokenized_corpus (created in the corpus-loading cell) to only the
# tokens the model knows, so the unfiltered token list is no longer available afterwards.
tokenized_corpus = filter_corpus_vocab(w2v_model, tokenized_corpus)
print(f'length of filtered corpus: {len(tokenized_corpus)}')

w2v model doesn't have 3 words: ['desiccate', 'enervate', 'veracious']
length of filtered corpus: 331
Wall time: 2min 23s


In [10]:
# sample test runs
# Sanity check of the loaded vectors: print the 5 most similar entries, as
# (word, similarity) pairs, for a handful of probe words.
for _word in ["stop", "woman", "man", "abate", "india"]:
    neighbors = w2v_model.most_similar(_word, topn=5)
    print(neighbors)

[('stopping', 0.758859395980835), ('stops', 0.7051315903663635), ('stopped', 0.6975031495094299), ('halt', 0.6571016311645508), ('prevent', 0.6046357750892639)]
[('girl', 0.7296419143676758), ('man', 0.6998662948608398), ('mother', 0.689943790435791), ('she', 0.6433226466178894), ('her', 0.6327143311500549)]
[('woman', 0.6998662948608398), ('person', 0.6443442106246948), ('boy', 0.620827853679657), ('he', 0.5926738977432251), ('men', 0.5819568634033203)]
[('subside', 0.4828464388847351), ('abated', 0.42604950070381165), ('ignazio', 0.39123255014419556), ('slacken', 0.38261228799819946), ('falter', 0.37811487913131714)]
[('indian', 0.7355823516845703), ('pakistan', 0.7285579442977905), ('delhi', 0.6846905946731567), ('bangladesh', 0.620319128036499), ('lanka', 0.609517514705658)]


In [11]:
# get word vectors for all corpus words
# Indexing with the whole token list yields one row per token (see the shape shown below).
# NOTE(review): `.wv` is used on the object returned by api.load — confirm this attribute
# exists in the installed gensim version.
wvs = w2v_model.wv[tokenized_corpus]
wvs.shape

  


(331, 300)

In [12]:
# build feature array for all tokens in corpus to cluster
# Work on a copy so that the clustering cell, which rebinds w2v_feature_array to a
# scaled version, leaves wvs itself untouched.
w2v_feature_array = wvs.copy()  # if using barrons word meanings, average feature vectors

#### DBSCAN clustering

In [50]:
%%time
# build dbscan clustering model
# dbscan_model = DBSCAN(metric='cosine', eps=0.5, min_samples=3)
dbscan_model = DBSCAN(metric='cosine', eps=0.5, min_samples=2, algorithm="auto")

w2v_feature_array = StandardScaler().fit_transform(w2v_feature_array)
clustering = dbscan_model.fit(w2v_feature_array)
labels = clustering.labels_
core_samples = clustering.core_sample_indices_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = len([lab for lab in labels if lab == -1])

print(f"clustering metrics {set(labels)}: {n_clusters}, \n{n_noise} noise clusters out of {len(labels)}")
print(labels)

clustering metrics {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1}: 15, 
294 noise clusters out of 331
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1  0 -1 -1 -1 -1 -1 -1  1 -1 -1
 -1  2 -1 -1 -1  1 -1 -1 -1 -1 -1  3 -1 -1  4 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  5 -1 -1 -1 -1 -1 -1
 -1  6 -1 -1 -1 -1  7 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  6 -1
 -1 -1 -1 -1 -1 -1  5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  4
 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1  4  8 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1  9 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 -1 -1 10 -1 10 11 -1 -1 -1 12 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1  9 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0  1 -1  3 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 11 -1 -1
 -1  1 -1 -1 -1 -1 -1 -1 -1 13 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 14 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -

In [51]:
# Pair every token with its DBSCAN label (-1 is the noise label).
# NOTE(review): the column name 'cluter_labels' looks like a typo for 'cluster_labels'.
# The save-to-CSV cell groups by the same spelling, so both places must be renamed together.
corpus_clusters_df = pd.DataFrame({'words': tokenized_corpus, 'cluter_labels': labels})
print(f"the corpus is segregated into {n_clusters} clusters")

the corpus is segregated into 15 clusters


In [52]:
# Display the word -> cluster label table (one row per token).
corpus_clusters_df

Unnamed: 0,words,cluter_labels
0,abate,-1
1,aberrant,-1
2,abeyance,-1
3,abscond,-1
4,abstemious,-1
...,...,...
326,volatile,-1
327,wary,-1
328,welter,-1
329,whimsical,-1


## Save clusters to CSV

In [53]:
%%time
conv_clusters_df = pd.DataFrame()

count = 0
for cluster_id, subset_df in corpus_clusters_df.sort_values(by="cluter_labels").groupby("cluter_labels"):
    conv_clusters_df = pd.concat([conv_clusters_df, subset_df.words.reset_index(drop=True)], ignore_index=True, axis=1)

conv_clusters_df.fillna(value='', inplace=True)
conv_clusters_df.to_csv(f"../res/{w2v_model_type}-dbs_clusters.csv", index=False)

Wall time: 48.9 ms


In [54]:
# Display the wide table: one column per cluster label, rows padded with empty strings.
conv_clusters_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,abate,alleviate,placate,tortuous,banal,facilitate,converge,denigrate,harangue,specious,garrulous,impede,permeable,tacit,tenuous,substantiate
1,occlude,ameliorate,appease,arduous,mundane,enhance,diverge,disparage,diatribe,fallacious,loquacious,impair,impermeable,implicit,precarious,refute
2,obviate,exacerbate,assuage,,,bolster,,,invective,,,,,,,
3,obsequious,mitigate,mollify,,,,,,tirade,,,,,,,
4,obdurate,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,efficacy,,,,,,,,,,,,,,,
290,effrontery,,,,,,,,,,,,,,,
291,disjointed,,,,,,,,,,,,,,,
292,disparate,,,,,,,,,,,,,,,
