In [4]:
# Imports
from time import time
from os.path import join as join_path
import numpy as np
import pandas as pd

import multiprocessing
cores = multiprocessing.cpu_count()

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import logging # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import nltk
nltk.download('punkt')

from utils import clean_text, EpochSaver
from tqdm.notebook import tqdm

from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load and prepare data

In [2]:
# Constants
cord_data_dir = 'data'
cord_data_path = join_path(cord_data_dir, 'cord-19-data.csv')
w2v_saved_models_dir = 'models-word2vec'
saved_models_prefix = 'model'

In [30]:
cord_data = pd.read_csv(cord_data_path)
cord_data_eng = cord_data[cord_data['language'] == 'en']
eng_texts = cord_data_eng['body_text'].values

In [4]:
cord_num_sentences = 0
for text in tqdm(eng_texts):
    sentences = nltk.tokenize.sent_tokenize(text)
    cord_num_sentences += len(sentences)
print(f'Total number of CORD-19 sentences: {cord_num_sentences}')

HBox(children=(IntProgress(value=0, max=35708), HTML(value='')))


Total number of CORD-19 sentences: 7097680


In [4]:
class CORDDataIteratorWord2Vec():
    def __init__(self, texts: np.ndarray):
        self.texts = texts
    
    def __iter__(self):
        for text in self.texts:
            sentences = nltk.tokenize.sent_tokenize(text)
            cleaned_sentences = [clean_text(sent) for sent in sentences]
            for sentence in cleaned_sentences:
                yield sentence

In [5]:
cord_sentences = CORDDataIteratorWord2Vec(eng_texts)

## Learn word embeddings using Word2vec

In [6]:
class EpochSaver(CallbackAny2Vec):
    '''Callback to save model after each epoch.'''

    def __init__(self, output_dir: str, prefix: str, start_epoch: int = 1):
        self.output_dir = output_dir
        self.prefix = prefix
        self.epoch = start_epoch

    def on_epoch_end(self, model):
        output_path = join_path(self.output_dir, f'{self.prefix}_epoch_{self.epoch}.model')
        model.save(output_path)
        self.epoch += 1

In [7]:
# Setup initial model
w2v_model = Word2Vec(
    min_count=20,
    window=2,
    size=300,
    negative=5,
    workers=cores-1,
    callbacks=[EpochSaver(w2v_saved_models_dir, saved_models_prefix)]
)

In [12]:
# Build vocabulary
t = time()
w2v_model.build_vocab(tqdm(cord_sentences, total=cord_num_sentences), progress_per=int(cord_num_sentences / 100))
print(f'Time to build vocab: {round((time() - t) / 60, 2)} mins')

HBox(children=(IntProgress(value=0, max=7097680), HTML(value='')))

INFO - 07:48:01: collecting all words and their counts
INFO - 07:48:01: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 07:48:22: PROGRESS: at sentence #70976, processed 1013256 words, keeping 51891 word types
INFO - 07:48:40: PROGRESS: at sentence #141952, processed 2029040 words, keeping 84651 word types
INFO - 07:48:59: PROGRESS: at sentence #212928, processed 3054106 words, keeping 115992 word types
INFO - 07:49:17: PROGRESS: at sentence #283904, processed 4068911 words, keeping 142369 word types
INFO - 07:49:36: PROGRESS: at sentence #354880, processed 5105447 words, keeping 164500 word types
INFO - 07:49:55: PROGRESS: at sentence #425856, processed 6124854 words, keeping 185561 word types
INFO - 07:50:13: PROGRESS: at sentence #496832, processed 7156844 words, keeping 206875 word types
INFO - 07:50:31: PROGRESS: at sentence #567808, processed 8149173 words, keeping 233078 word types
INFO - 07:50:50: PROGRESS: at sentence #638784, processed 9174245 words, 

INFO - 08:12:20: PROGRESS: at sentence #5749056, processed 78309142 words, keeping 1293451 word types
INFO - 08:12:39: PROGRESS: at sentence #5820032, processed 79280301 words, keeping 1302643 word types
INFO - 08:12:58: PROGRESS: at sentence #5891008, processed 80236804 words, keeping 1311421 word types
INFO - 08:13:17: PROGRESS: at sentence #5961984, processed 81191278 words, keeping 1320204 word types
INFO - 08:13:36: PROGRESS: at sentence #6032960, processed 82148730 words, keeping 1328579 word types
INFO - 08:13:54: PROGRESS: at sentence #6103936, processed 83102084 words, keeping 1339290 word types
INFO - 08:14:13: PROGRESS: at sentence #6174912, processed 84058376 words, keeping 1348821 word types
INFO - 08:14:31: PROGRESS: at sentence #6245888, processed 85011177 words, keeping 1357663 word types
INFO - 08:14:49: PROGRESS: at sentence #6316864, processed 85969150 words, keeping 1367554 word types
INFO - 08:15:07: PROGRESS: at sentence #6387840, processed 86933176 words, keeping




INFO - 08:18:13: effective_min_count=20 retains 93856 unique words (6% of original 1463257, drops 1369401)
INFO - 08:18:13: effective_min_count=20 leaves 93320129 word corpus (96% of original 96497880, drops 3177751)
INFO - 08:18:14: deleting the raw counts dictionary of 1463257 items
INFO - 08:18:14: sample=0.001 downsamples 17 most-common words
INFO - 08:18:14: downsampling leaves estimated 90805423 word corpus (97.3% of prior 93320129)


RuntimeError: cannot sort vocabulary after model weights already initialized.

In [19]:
# Train model
t = time()
w2v_model.train(
    cord_sentences,
    total_examples=w2v_model.corpus_count,
    epochs=20,
    report_delay=30
)
print(f'Time to train the model: {round((time() - t) / 60, 2)} mins')

INFO - 08:35:20: training model with 15 workers on 93856 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2
INFO - 08:35:22: EPOCH 1 - PROGRESS: at 0.06% examples, 50206 words/s, in_qsize 0, out_qsize 0
INFO - 08:35:52: EPOCH 1 - PROGRESS: at 1.59% examples, 48588 words/s, in_qsize 0, out_qsize 0
INFO - 08:36:22: EPOCH 1 - PROGRESS: at 3.18% examples, 49548 words/s, in_qsize 0, out_qsize 0
INFO - 08:36:52: EPOCH 1 - PROGRESS: at 4.75% examples, 49824 words/s, in_qsize 0, out_qsize 0
INFO - 08:37:22: EPOCH 1 - PROGRESS: at 6.37% examples, 50213 words/s, in_qsize 0, out_qsize 0
INFO - 08:37:52: EPOCH 1 - PROGRESS: at 7.97% examples, 50201 words/s, in_qsize 0, out_qsize 0
INFO - 08:38:22: EPOCH 1 - PROGRESS: at 9.50% examples, 49999 words/s, in_qsize 0, out_qsize 0
INFO - 08:38:52: EPOCH 1 - PROGRESS: at 11.04% examples, 49920 words/s, in_qsize 0, out_qsize 0
INFO - 08:39:22: EPOCH 1 - PROGRESS: at 12.57% examples, 49631 words/s, in_qsize 0, out_qsize 0
INFO - 0

INFO - 09:08:21: EPOCH 2 - PROGRESS: at 6.31% examples, 49678 words/s, in_qsize 0, out_qsize 0
INFO - 09:08:51: EPOCH 2 - PROGRESS: at 7.90% examples, 49782 words/s, in_qsize 0, out_qsize 0
INFO - 09:09:21: EPOCH 2 - PROGRESS: at 9.44% examples, 49731 words/s, in_qsize 0, out_qsize 0
INFO - 09:09:51: EPOCH 2 - PROGRESS: at 11.01% examples, 49764 words/s, in_qsize 0, out_qsize 0
INFO - 09:10:21: EPOCH 2 - PROGRESS: at 12.60% examples, 49733 words/s, in_qsize 0, out_qsize 0
INFO - 09:10:51: EPOCH 2 - PROGRESS: at 14.13% examples, 49606 words/s, in_qsize 0, out_qsize 0
INFO - 09:11:21: EPOCH 2 - PROGRESS: at 15.69% examples, 49601 words/s, in_qsize 0, out_qsize 0
INFO - 09:11:52: EPOCH 2 - PROGRESS: at 17.27% examples, 49679 words/s, in_qsize 0, out_qsize 0
INFO - 09:12:22: EPOCH 2 - PROGRESS: at 18.87% examples, 49743 words/s, in_qsize 0, out_qsize 0
INFO - 09:12:52: EPOCH 2 - PROGRESS: at 20.45% examples, 49807 words/s, in_qsize 0, out_qsize 0
INFO - 09:13:22: EPOCH 2 - PROGRESS: at 22.

INFO - 09:42:07: EPOCH 3 - PROGRESS: at 14.48% examples, 50815 words/s, in_qsize 0, out_qsize 0
INFO - 09:42:37: EPOCH 3 - PROGRESS: at 16.11% examples, 50960 words/s, in_qsize 0, out_qsize 0
INFO - 09:43:07: EPOCH 3 - PROGRESS: at 17.74% examples, 51047 words/s, in_qsize 0, out_qsize 0
INFO - 09:43:37: EPOCH 3 - PROGRESS: at 19.38% examples, 51069 words/s, in_qsize 0, out_qsize 0
INFO - 09:44:07: EPOCH 3 - PROGRESS: at 20.99% examples, 51156 words/s, in_qsize 0, out_qsize 0
INFO - 09:44:37: EPOCH 3 - PROGRESS: at 22.62% examples, 51187 words/s, in_qsize 0, out_qsize 0
INFO - 09:45:07: EPOCH 3 - PROGRESS: at 24.23% examples, 51167 words/s, in_qsize 0, out_qsize 0
INFO - 09:45:37: EPOCH 3 - PROGRESS: at 25.84% examples, 51157 words/s, in_qsize 0, out_qsize 0
INFO - 09:46:07: EPOCH 3 - PROGRESS: at 27.42% examples, 51185 words/s, in_qsize 0, out_qsize 0
INFO - 09:46:38: EPOCH 3 - PROGRESS: at 29.05% examples, 51196 words/s, in_qsize 0, out_qsize 0
INFO - 09:47:08: EPOCH 3 - PROGRESS: at 

INFO - 10:15:56: EPOCH 4 - PROGRESS: at 23.25% examples, 48983 words/s, in_qsize 0, out_qsize 0
INFO - 10:16:26: EPOCH 4 - PROGRESS: at 24.83% examples, 49065 words/s, in_qsize 0, out_qsize 0
INFO - 10:16:57: EPOCH 4 - PROGRESS: at 26.40% examples, 49105 words/s, in_qsize 0, out_qsize 0
INFO - 10:17:27: EPOCH 4 - PROGRESS: at 28.00% examples, 49231 words/s, in_qsize 0, out_qsize 0
INFO - 10:17:57: EPOCH 4 - PROGRESS: at 29.60% examples, 49235 words/s, in_qsize 0, out_qsize 0
INFO - 10:18:27: EPOCH 4 - PROGRESS: at 31.26% examples, 49186 words/s, in_qsize 0, out_qsize 0
INFO - 10:18:57: EPOCH 4 - PROGRESS: at 32.85% examples, 49185 words/s, in_qsize 0, out_qsize 0
INFO - 10:19:27: EPOCH 4 - PROGRESS: at 34.45% examples, 49258 words/s, in_qsize 0, out_qsize 0
INFO - 10:19:57: EPOCH 4 - PROGRESS: at 36.04% examples, 49269 words/s, in_qsize 0, out_qsize 0
INFO - 10:20:27: EPOCH 4 - PROGRESS: at 37.64% examples, 49320 words/s, in_qsize 0, out_qsize 0
INFO - 10:20:57: EPOCH 4 - PROGRESS: at 

INFO - 10:49:40: EPOCH 5 - PROGRESS: at 31.13% examples, 49095 words/s, in_qsize 0, out_qsize 0
INFO - 10:50:10: EPOCH 5 - PROGRESS: at 32.73% examples, 49082 words/s, in_qsize 0, out_qsize 0
INFO - 10:50:41: EPOCH 5 - PROGRESS: at 34.27% examples, 49094 words/s, in_qsize 0, out_qsize 0
INFO - 10:51:11: EPOCH 5 - PROGRESS: at 35.87% examples, 49129 words/s, in_qsize 0, out_qsize 0
INFO - 10:51:41: EPOCH 5 - PROGRESS: at 37.49% examples, 49200 words/s, in_qsize 0, out_qsize 0
INFO - 10:52:11: EPOCH 5 - PROGRESS: at 39.14% examples, 49322 words/s, in_qsize 0, out_qsize 0
INFO - 10:52:41: EPOCH 5 - PROGRESS: at 40.76% examples, 49298 words/s, in_qsize 0, out_qsize 0
INFO - 10:53:11: EPOCH 5 - PROGRESS: at 42.46% examples, 49302 words/s, in_qsize 0, out_qsize 0
INFO - 10:53:42: EPOCH 5 - PROGRESS: at 44.16% examples, 49227 words/s, in_qsize 0, out_qsize 0
INFO - 10:54:12: EPOCH 5 - PROGRESS: at 45.84% examples, 49159 words/s, in_qsize 0, out_qsize 0
INFO - 10:54:42: EPOCH 5 - PROGRESS: at 

Time to train the model: 155.33 mins
