# Models for learning word embeddings


In [21]:
import re
from collections import defaultdict  # For word frequency
from time import time  # To time our operations

import pandas as pd  # For data handling

# --- Run configuration ---
# Path of a previously saved model to load; '' means a new model is constructed.
load_model = ''  # model_name
# Corpus to train on: 'speeches' or 'wiki'.
#corpus = 'wiki'
corpus = 'speeches'
# Embedding model class: 'word2vec' or 'fasttext'.
model_name = 'word2vec'
#model_name = 'fasttext'
# Preprocessing pipeline used for the speeches corpus: 'spacy' or 'nltk'.
preprocessing_type = 'spacy'
#preprocessing_type = 'nltk'

import logging  # Setting up the loggings to monitor gensim
# INFO level so that gensim's progress messages are shown, prefixed with level and HH:MM:SS time.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [22]:
import os
# Show the working directory: the data files below are opened with paths
# relative to it ("../output/...").
cwd = os.getcwd()
print(cwd)

/home/jonatan/school/nlp-project/notebooks


In [23]:
# Load the raw corpus selected by `corpus`.
#   'speeches' -> DataFrame `df`
#   'wiki'     -> one big string `data`
if corpus == 'speeches':
    # Pipe-delimited CSV with "\n" line endings. Rows containing any missing
    # value are dropped and the index is renumbered from 0.
    # (No preview here: an expression inside an `if` body is not displayed by
    # the notebook, so the former `df.head()` call had no effect.)
    df = pd.read_csv("../output/speeches-1.csv", delimiter="|", lineterminator="\n")
    df = df.dropna().reset_index(drop=True)
elif corpus == 'wiki':
    # `with` guarantees the file handle is closed once the text is read.
    # NOTE(review): UTF-8 is assumed for this Finnish text instead of relying
    # on the platform default encoding - confirm against the actual file.
    with open("../output/wikipedia2008_fi_lemmatized.txt", encoding="utf-8") as wiki_file:
        data = wiki_file.read()
else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError(f"Non allowed parameter for corpus: {corpus!r}")

## Cleaning:

In [24]:
# Clean the speeches with the pipeline chosen by `preprocessing_type`.
# Only the 'speeches' corpus is cleaned in this cell.
if corpus == 'speeches':
    # Reject unsupported settings before importing anything.
    if preprocessing_type not in ('spacy', 'nltk'):
        raise Exception("Non allowed parameter for preprocessing")

    # Import only the helper that is actually needed.
    if preprocessing_type == 'spacy':
        from preprocessing import spacy_preprocess
        df_clean = spacy_preprocess(df)
    else:
        from preprocessing import nltk_preprocess
        df_clean = nltk_preprocess(df)

    print(f"Cleaned shape: {df_clean.shape}")

Time to clean up everything: 1.96 mins
Cleaned shape: (45653, 1)


## Bigrams:

In [25]:
from gensim.models.phrases import Phrases, Phraser

`Phrases()` takes a list of lists of words as input, so each text is first split into sentences and each sentence into tokens:

In [26]:
# Build `sentences`: a list of token lists, one per sentence.
# Text is cut at '.', '!' and '?', then each piece is split on whitespace.
# The emptiness check is applied to the token list (after whitespace has been
# discarded), so whitespace-only fragments such as " " no longer end up as
# empty sentences in the corpus.
if corpus == 'speeches':
    # One list of raw sentence strings per speech.
    speeches = [re.split(r"[.!?]", row) for row in df_clean['clean']]
    sentences = [
        tokens
        for speech in speeches
        for tokens in map(str.split, speech)
        if tokens
    ]
elif corpus == 'wiki':
    sentences = [
        tokens
        for tokens in map(str.split, re.split(r"[.!?]", data))
        if tokens
    ]

In [27]:
# Collect word and bigram counts over the tokenized sentences; progress is
# logged every 10000 sentences.
# NOTE(review): min_count=30 presumably ignores words/pairs seen fewer than
# 30 times - confirm against the gensim Phrases documentation.
phrases = Phrases(sentences, min_count=30, progress_per=10000)

INFO - 22:52:33: collecting all words and their counts
INFO - 22:52:33: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 22:52:36: PROGRESS: at sentence #10000, processed 1560614 words and 935940 word types
INFO - 22:52:39: PROGRESS: at sentence #20000, processed 3306562 words and 1788752 word types
INFO - 22:52:43: PROGRESS: at sentence #30000, processed 5300193 words and 2673196 word types
INFO - 22:52:46: PROGRESS: at sentence #40000, processed 7030864 words and 3348730 word types
INFO - 22:52:48: collected 3760188 word types from a corpus of 8109030 words (unigram + bigrams) and 45653 sentences
INFO - 22:52:48: using 3760188 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>


In [28]:
# Wrap the collected statistics in a Phraser, the object that is indexed with
# the corpus in the next step to apply the detected bigrams.
bigram = Phraser(phrases)

INFO - 22:52:48: source_vocab length 3760188
INFO - 22:53:28: Phraser built with 4901 phrasegrams


Transform the corpus based on the bigrams detected:

In [29]:
# Replace the corpus with its bigram-transformed version.
# NOTE(review): this rebinds `sentences`, so re-running this cell applies the
# bigram model to an already transformed corpus; restart from the sentence
# splitting cell before re-running.
sentences = bigram[sentences]

## Most Frequent Words:

In [30]:
# Count how often each token occurs across the whole corpus.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] = word_freq[token] + 1

# Number of distinct tokens.
len(word_freq)

347714

In [31]:
# The ten most frequent tokens, most frequent first (ties keep insertion order).
[word for word, _ in sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:10]]

['on', 'ja', 'että', 'se', 'tämä', 'myös', 'nyt', 'niin', 'mutta', 'ei']

# Training the model

In [32]:
import multiprocessing

# Bind `Model` to the gensim class selected by `model_name`.
# Unsupported names are rejected before anything is imported.
if model_name not in ('fasttext', 'word2vec'):
    raise Exception("Non allowed parameter for model.")

if model_name == 'fasttext':
    from gensim.models import FastText as Model
else:
    from gensim.models import Word2Vec as Model

In [33]:
# Number of CPUs reported by the OS; used below to size the training worker pool.
cores = multiprocessing.cpu_count()

In [34]:
# Training hyperparameters. The same variables are embedded in the file name
# when the model is saved, so keep the names unchanged.
min_count = 20
window = 2
size = 300  # NOTE(review): newer gensim releases name this argument `vector_size` - confirm installed version
sample = 6e-5
alpha = 0.03
min_alpha = 0.0007
negative = 20
# Use all cores but one (presumably to keep the machine responsive), and never
# fewer than one worker: on a single-core machine `cores - 1` would be 0.
workers = max(1, cores - 1)

if load_model:
    # Reuse a previously saved model; the hyperparameters above are not
    # applied to it.
    model = Model.load(load_model)
else:
    # Construct an untrained model; vocabulary and weights are filled in by
    # the build_vocab/train steps that follow.
    model = Model(min_count=min_count,
                  window=window,
                  size=size,
                  sample=sample,
                  alpha=alpha,
                  min_alpha=min_alpha,
                  negative=negative,
                  workers=workers)

## Build vocabulary table:

In [35]:
# Build the vocabulary from the corpus and report how long it took.
# NOTE(review): this also runs when a saved model was loaded via `load_model`;
# confirm whether the vocabulary of a loaded model should really be rebuilt.
t = time()
model.build_vocab(sentences, progress_per=10000)
vocab_mins = round((time() - t) / 60, 2)
print(f'Time to build vocab: {vocab_mins} mins')

INFO - 22:53:51: collecting all words and their counts
INFO - 22:53:51: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 22:53:55: PROGRESS: at sentence #10000, processed 1418643 words, keeping 120883 word types
INFO - 22:54:00: PROGRESS: at sentence #20000, processed 3015313 words, keeping 198970 word types
INFO - 22:54:06: PROGRESS: at sentence #30000, processed 4844650 words, keeping 272131 word types
INFO - 22:54:11: PROGRESS: at sentence #40000, processed 6426255 words, keeping 319423 word types
INFO - 22:54:14: collected 347714 word types from a corpus of 7413971 raw words and 45653 sentences
INFO - 22:54:14: Loading a fresh vocabulary
INFO - 22:54:14: effective_min_count=20 retains 31435 unique words (9% of original 347714, drops 316279)
INFO - 22:54:14: effective_min_count=20 leaves 6565302 word corpus (88% of original 7413971, drops 848669)
INFO - 22:54:14: deleting the raw counts dictionary of 347714 items
INFO - 22:54:14: sample=6e-05 downsamples 704 

Time to build vocab: 0.39 mins


## Training:

In [36]:
# Train for 30 epochs over the corpus and report the wall-clock time.
# report_delay=1 makes gensim log progress roughly once per second, which is
# what produces the long log below.
t = time()
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=30,
    report_delay=1,
)
train_mins = round((time() - t) / 60, 2)
print(f'Time to train the model: {train_mins} mins')

INFO - 22:54:15: training model with 11 workers on 31435 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 22:54:16: EPOCH 1 - PROGRESS: at 4.27% examples, 127608 words/s, in_qsize 0, out_qsize 0
INFO - 22:54:17: EPOCH 1 - PROGRESS: at 8.02% examples, 131306 words/s, in_qsize 1, out_qsize 0
INFO - 22:54:18: EPOCH 1 - PROGRESS: at 11.95% examples, 134663 words/s, in_qsize 0, out_qsize 0
INFO - 22:54:19: EPOCH 1 - PROGRESS: at 15.53% examples, 134334 words/s, in_qsize 0, out_qsize 0
INFO - 22:54:20: EPOCH 1 - PROGRESS: at 19.31% examples, 135711 words/s, in_qsize 0, out_qsize 0
INFO - 22:54:21: EPOCH 1 - PROGRESS: at 22.96% examples, 135282 words/s, in_qsize 0, out_qsize 0
INFO - 22:54:22: EPOCH 1 - PROGRESS: at 25.88% examples, 133837 words/s, in_qsize 1, out_qsize 0
INFO - 22:54:23: EPOCH 1 - PROGRESS: at 29.56% examples, 134001 words/s, in_qsize 0, out_qsize 0
INFO - 22:54:24: EPOCH 1 - PROGRESS: at 33.15% examples, 134030 words/s, in_qsize 0, out_q

INFO - 22:55:22: EPOCH 3 - PROGRESS: at 26.40% examples, 137776 words/s, in_qsize 0, out_qsize 0
INFO - 22:55:23: EPOCH 3 - PROGRESS: at 29.64% examples, 136253 words/s, in_qsize 0, out_qsize 0
INFO - 22:55:24: EPOCH 3 - PROGRESS: at 33.31% examples, 135647 words/s, in_qsize 1, out_qsize 0
INFO - 22:55:25: EPOCH 3 - PROGRESS: at 36.36% examples, 134660 words/s, in_qsize 0, out_qsize 0
INFO - 22:55:26: EPOCH 3 - PROGRESS: at 39.78% examples, 134226 words/s, in_qsize 0, out_qsize 0
INFO - 22:55:27: EPOCH 3 - PROGRESS: at 43.23% examples, 133572 words/s, in_qsize 0, out_qsize 0
INFO - 22:55:28: EPOCH 3 - PROGRESS: at 45.93% examples, 133176 words/s, in_qsize 1, out_qsize 0
INFO - 22:55:29: EPOCH 3 - PROGRESS: at 48.97% examples, 132725 words/s, in_qsize 0, out_qsize 1
INFO - 22:55:30: EPOCH 3 - PROGRESS: at 52.25% examples, 133460 words/s, in_qsize 0, out_qsize 0
INFO - 22:55:31: EPOCH 3 - PROGRESS: at 55.65% examples, 133835 words/s, in_qsize 0, out_qsize 0
INFO - 22:55:32: EPOCH 3 - PRO

INFO - 22:56:30: EPOCH 5 - PROGRESS: at 51.58% examples, 131462 words/s, in_qsize 0, out_qsize 0
INFO - 22:56:31: EPOCH 5 - PROGRESS: at 54.45% examples, 131338 words/s, in_qsize 0, out_qsize 0
INFO - 22:56:32: EPOCH 5 - PROGRESS: at 57.60% examples, 130959 words/s, in_qsize 1, out_qsize 0
INFO - 22:56:33: EPOCH 5 - PROGRESS: at 60.04% examples, 130708 words/s, in_qsize 0, out_qsize 0
INFO - 22:56:34: EPOCH 5 - PROGRESS: at 62.64% examples, 130636 words/s, in_qsize 0, out_qsize 0
INFO - 22:56:35: EPOCH 5 - PROGRESS: at 65.68% examples, 130387 words/s, in_qsize 0, out_qsize 0
INFO - 22:56:36: EPOCH 5 - PROGRESS: at 69.25% examples, 130653 words/s, in_qsize 1, out_qsize 0
INFO - 22:56:37: EPOCH 5 - PROGRESS: at 73.22% examples, 131000 words/s, in_qsize 0, out_qsize 0
INFO - 22:56:38: EPOCH 5 - PROGRESS: at 77.12% examples, 131514 words/s, in_qsize 0, out_qsize 0
INFO - 22:56:39: EPOCH 5 - PROGRESS: at 80.62% examples, 131616 words/s, in_qsize 0, out_qsize 0
INFO - 22:56:40: EPOCH 5 - PRO

INFO - 22:57:36: EPOCH 7 - PROGRESS: at 75.50% examples, 141603 words/s, in_qsize 0, out_qsize 0
INFO - 22:57:37: EPOCH 7 - PROGRESS: at 79.21% examples, 141465 words/s, in_qsize 0, out_qsize 0
INFO - 22:57:38: EPOCH 7 - PROGRESS: at 82.57% examples, 141281 words/s, in_qsize 0, out_qsize 0
INFO - 22:57:39: EPOCH 7 - PROGRESS: at 86.00% examples, 141426 words/s, in_qsize 0, out_qsize 0
INFO - 22:57:40: EPOCH 7 - PROGRESS: at 89.32% examples, 141576 words/s, in_qsize 0, out_qsize 0
INFO - 22:57:41: EPOCH 7 - PROGRESS: at 92.33% examples, 141677 words/s, in_qsize 0, out_qsize 0
INFO - 22:57:42: EPOCH 7 - PROGRESS: at 95.83% examples, 141791 words/s, in_qsize 0, out_qsize 0
INFO - 22:57:43: EPOCH 7 - PROGRESS: at 99.18% examples, 141856 words/s, in_qsize 0, out_qsize 0
INFO - 22:57:43: worker thread finished; awaiting finish of 10 more threads
INFO - 22:57:43: worker thread finished; awaiting finish of 9 more threads
INFO - 22:57:43: worker thread finished; awaiting finish of 8 more thread

INFO - 22:58:42: worker thread finished; awaiting finish of 8 more threads
INFO - 22:58:42: worker thread finished; awaiting finish of 7 more threads
INFO - 22:58:42: worker thread finished; awaiting finish of 6 more threads
INFO - 22:58:42: worker thread finished; awaiting finish of 5 more threads
INFO - 22:58:42: worker thread finished; awaiting finish of 4 more threads
INFO - 22:58:42: worker thread finished; awaiting finish of 3 more threads
INFO - 22:58:42: worker thread finished; awaiting finish of 2 more threads
INFO - 22:58:42: worker thread finished; awaiting finish of 1 more threads
INFO - 22:58:42: worker thread finished; awaiting finish of 0 more threads
INFO - 22:58:42: EPOCH - 9 : training on 7413971 raw words (4074048 effective words) took 29.5s, 138044 effective words/s
INFO - 22:58:43: EPOCH 10 - PROGRESS: at 4.46% examples, 133682 words/s, in_qsize 0, out_qsize 0
INFO - 22:58:44: EPOCH 10 - PROGRESS: at 8.08% examples, 134494 words/s, in_qsize 0, out_qsize 0
INFO - 22

INFO - 22:59:42: EPOCH 12 - PROGRESS: at 4.46% examples, 136076 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:43: EPOCH 12 - PROGRESS: at 8.23% examples, 138388 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:44: EPOCH 12 - PROGRESS: at 12.05% examples, 136849 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:45: EPOCH 12 - PROGRESS: at 15.84% examples, 138300 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:46: EPOCH 12 - PROGRESS: at 19.82% examples, 139350 words/s, in_qsize 1, out_qsize 0
INFO - 22:59:47: EPOCH 12 - PROGRESS: at 23.51% examples, 139703 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:48: EPOCH 12 - PROGRESS: at 27.29% examples, 140653 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:49: EPOCH 12 - PROGRESS: at 30.87% examples, 140725 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:50: EPOCH 12 - PROGRESS: at 34.60% examples, 141028 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:51: EPOCH 12 - PROGRESS: at 38.27% examples, 141207 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:52: EPOCH

INFO - 23:00:50: EPOCH 14 - PROGRESS: at 33.73% examples, 137763 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:51: EPOCH 14 - PROGRESS: at 37.19% examples, 137467 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:52: EPOCH 14 - PROGRESS: at 40.60% examples, 136968 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:53: EPOCH 14 - PROGRESS: at 44.03% examples, 137209 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:54: EPOCH 14 - PROGRESS: at 47.21% examples, 137847 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:55: EPOCH 14 - PROGRESS: at 50.36% examples, 137759 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:56: EPOCH 14 - PROGRESS: at 53.46% examples, 137791 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:57: EPOCH 14 - PROGRESS: at 56.53% examples, 137790 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:58: EPOCH 14 - PROGRESS: at 59.74% examples, 137779 words/s, in_qsize 1, out_qsize 0
INFO - 23:00:59: EPOCH 14 - PROGRESS: at 62.47% examples, 137944 words/s, in_qsize 0, out_qsize 0
INFO - 23:01:00: EPO

INFO - 23:01:57: EPOCH 16 - PROGRESS: at 60.11% examples, 138690 words/s, in_qsize 0, out_qsize 0
INFO - 23:01:58: EPOCH 16 - PROGRESS: at 63.11% examples, 138980 words/s, in_qsize 0, out_qsize 0
INFO - 23:01:59: EPOCH 16 - PROGRESS: at 66.30% examples, 138359 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:00: EPOCH 16 - PROGRESS: at 69.70% examples, 138108 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:01: EPOCH 16 - PROGRESS: at 73.55% examples, 137852 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:02: EPOCH 16 - PROGRESS: at 77.45% examples, 137953 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:03: EPOCH 16 - PROGRESS: at 80.91% examples, 137878 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:04: EPOCH 16 - PROGRESS: at 84.12% examples, 137995 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:05: EPOCH 16 - PROGRESS: at 87.44% examples, 138002 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:06: EPOCH 16 - PROGRESS: at 90.46% examples, 137810 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:07: EPO

INFO - 23:03:05: EPOCH 18 - PROGRESS: at 85.23% examples, 135074 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:06: EPOCH 18 - PROGRESS: at 88.53% examples, 134892 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:07: EPOCH 18 - PROGRESS: at 91.22% examples, 134822 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:08: EPOCH 18 - PROGRESS: at 94.04% examples, 134813 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:09: EPOCH 18 - PROGRESS: at 97.35% examples, 134798 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:10: worker thread finished; awaiting finish of 10 more threads
INFO - 23:03:10: worker thread finished; awaiting finish of 9 more threads
INFO - 23:03:10: worker thread finished; awaiting finish of 8 more threads
INFO - 23:03:10: worker thread finished; awaiting finish of 7 more threads
INFO - 23:03:10: worker thread finished; awaiting finish of 6 more threads
INFO - 23:03:10: worker thread finished; awaiting finish of 5 more threads
INFO - 23:03:10: worker thread finished; awaiting finish of

INFO - 23:04:10: worker thread finished; awaiting finish of 7 more threads
INFO - 23:04:10: worker thread finished; awaiting finish of 6 more threads
INFO - 23:04:10: worker thread finished; awaiting finish of 5 more threads
INFO - 23:04:10: worker thread finished; awaiting finish of 4 more threads
INFO - 23:04:10: worker thread finished; awaiting finish of 3 more threads
INFO - 23:04:10: worker thread finished; awaiting finish of 2 more threads
INFO - 23:04:10: worker thread finished; awaiting finish of 1 more threads
INFO - 23:04:10: worker thread finished; awaiting finish of 0 more threads
INFO - 23:04:10: EPOCH - 20 : training on 7413971 raw words (4075022 effective words) took 30.5s, 133436 effective words/s
INFO - 23:04:11: EPOCH 21 - PROGRESS: at 4.46% examples, 134391 words/s, in_qsize 0, out_qsize 0
INFO - 23:04:12: EPOCH 21 - PROGRESS: at 8.08% examples, 135247 words/s, in_qsize 0, out_qsize 0
INFO - 23:04:13: EPOCH 21 - PROGRESS: at 11.95% examples, 134396 words/s, in_qsize 

INFO - 23:05:11: EPOCH 23 - PROGRESS: at 4.68% examples, 136440 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:12: EPOCH 23 - PROGRESS: at 8.52% examples, 136868 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:13: EPOCH 23 - PROGRESS: at 12.38% examples, 137083 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:14: EPOCH 23 - PROGRESS: at 16.15% examples, 138142 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:15: EPOCH 23 - PROGRESS: at 19.98% examples, 138516 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:16: EPOCH 23 - PROGRESS: at 23.45% examples, 137823 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:17: EPOCH 23 - PROGRESS: at 26.81% examples, 137045 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:18: EPOCH 23 - PROGRESS: at 30.39% examples, 136781 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:19: EPOCH 23 - PROGRESS: at 33.82% examples, 136557 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:20: EPOCH 23 - PROGRESS: at 37.32% examples, 136664 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:21: EPOCH

INFO - 23:06:18: EPOCH 25 - PROGRESS: at 34.60% examples, 141183 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:19: EPOCH 25 - PROGRESS: at 38.09% examples, 140543 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:20: EPOCH 25 - PROGRESS: at 41.57% examples, 139734 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:21: EPOCH 25 - PROGRESS: at 44.66% examples, 139128 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:22: EPOCH 25 - PROGRESS: at 47.58% examples, 138496 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:23: EPOCH 25 - PROGRESS: at 50.80% examples, 138104 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:24: EPOCH 25 - PROGRESS: at 53.80% examples, 138107 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:25: EPOCH 25 - PROGRESS: at 56.98% examples, 137600 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:26: EPOCH 25 - PROGRESS: at 59.80% examples, 137322 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:27: EPOCH 25 - PROGRESS: at 62.47% examples, 137401 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:29: EPO

INFO - 23:07:26: EPOCH 27 - PROGRESS: at 56.25% examples, 135110 words/s, in_qsize 0, out_qsize 0
INFO - 23:07:27: EPOCH 27 - PROGRESS: at 59.24% examples, 134885 words/s, in_qsize 0, out_qsize 0
INFO - 23:07:28: EPOCH 27 - PROGRESS: at 61.79% examples, 134969 words/s, in_qsize 1, out_qsize 0
INFO - 23:07:29: EPOCH 27 - PROGRESS: at 65.04% examples, 134998 words/s, in_qsize 0, out_qsize 0
INFO - 23:07:30: EPOCH 27 - PROGRESS: at 68.71% examples, 135043 words/s, in_qsize 0, out_qsize 0
INFO - 23:07:31: EPOCH 27 - PROGRESS: at 72.56% examples, 135035 words/s, in_qsize 0, out_qsize 0
INFO - 23:07:32: EPOCH 27 - PROGRESS: at 76.20% examples, 135157 words/s, in_qsize 1, out_qsize 0
INFO - 23:07:33: EPOCH 27 - PROGRESS: at 79.77% examples, 135086 words/s, in_qsize 0, out_qsize 0
INFO - 23:07:34: EPOCH 27 - PROGRESS: at 83.07% examples, 135254 words/s, in_qsize 0, out_qsize 0
INFO - 23:07:35: EPOCH 27 - PROGRESS: at 86.25% examples, 135273 words/s, in_qsize 0, out_qsize 0
INFO - 23:07:36: EPO

INFO - 23:08:33: EPOCH 29 - PROGRESS: at 81.17% examples, 138256 words/s, in_qsize 0, out_qsize 0
INFO - 23:08:34: EPOCH 29 - PROGRESS: at 84.53% examples, 138564 words/s, in_qsize 0, out_qsize 0
INFO - 23:08:35: EPOCH 29 - PROGRESS: at 87.81% examples, 138642 words/s, in_qsize 0, out_qsize 0
INFO - 23:08:36: EPOCH 29 - PROGRESS: at 90.78% examples, 138295 words/s, in_qsize 0, out_qsize 0
INFO - 23:08:37: EPOCH 29 - PROGRESS: at 93.60% examples, 138352 words/s, in_qsize 0, out_qsize 0
INFO - 23:08:38: EPOCH 29 - PROGRESS: at 96.96% examples, 138281 words/s, in_qsize 0, out_qsize 0
INFO - 23:08:39: worker thread finished; awaiting finish of 10 more threads
INFO - 23:08:39: worker thread finished; awaiting finish of 9 more threads
INFO - 23:08:39: worker thread finished; awaiting finish of 8 more threads
INFO - 23:08:39: worker thread finished; awaiting finish of 7 more threads
INFO - 23:08:39: worker thread finished; awaiting finish of 6 more threads
INFO - 23:08:39: worker thread finis

Time to train the model: 14.9 mins


In [37]:
# Save the model under a name that records the run configuration and the
# hyperparameters, written to the current working directory.
model_filename = "_".join([
    f"{model_name}",
    f"{corpus}",
    f"{preprocessing_type}",
    f"mincount{min_count}",
    f"window{window}",
    f"size{size}",
    f"alpha{alpha}",
    f"minalpha{min_alpha}",
    f"negative{negative}",
    f"workers{workers}",
]) + ".model"
model.save(model_filename)

INFO - 23:09:08: saving Word2Vec object under word2vec_speeches_spacy_mincount20_window2_size300_alpha0.03_minalpha0.0007_negative20_workers11.model, separately None
INFO - 23:09:08: not storing attribute vectors_norm
INFO - 23:09:08: not storing attribute cum_table
INFO - 23:09:09: saved word2vec_speeches_spacy_mincount20_window2_size300_alpha0.03_minalpha0.0007_negative20_workers11.model
