# Test Word2vec training

In [19]:
from gensim.models import Word2Vec
import os
import shutil

In [17]:
# Input: the normalized Balzac corpus read by the training cell.
NORMALIZED_TEXT_PATH = "../normalized_data/french_novel/normalized_balzac.txt"

# Output: directory that receives the trained models, and the default model file inside it.
W2V_MODEL_DESTINATION_DIR = "../models/french_novels/balzac/"
W2V_MODEL_DESTINATION = os.path.join(
    W2V_MODEL_DESTINATION_DIR, "balzac_w2v_cbow_w10_ep20_mincount5.model"
)

In [3]:
!wc $NORMALIZED_TEXT_PATH

  123218  2546113 14074554 ../normalized_data/french_novel/normalized_balzac.txt


In [4]:
# Training hyper-parameters.
SG = 0  # 0 = CBOW, 1 = skip-gram
NUM_EPOCHS = 20
WINDOW = 10
MIN_COUNT = 5
WORKERS = 4

# Create the model WITHOUT a corpus and build the vocabulary explicitly.
# Handing `corpus_file` to the constructor would already train the model for the
# constructor's default number of epochs, so the explicit train() call below would
# be a second training run on top of it instead of the only one.
model = Word2Vec(sg=SG, window=WINDOW, min_count=MIN_COUNT, workers=WORKERS)
model.build_vocab(corpus_file=NORMALIZED_TEXT_PATH)

# Use the word count gathered by gensim during the vocabulary scan rather than a
# number copied by hand from `wc`, which tokenizes differently and goes stale
# as soon as the corpus file changes.
WORD_COUNT = model.corpus_total_words

# Single training run; returns (effective word count, raw word count).
model.train(corpus_file=NORMALIZED_TEXT_PATH,
            epochs=NUM_EPOCHS, total_examples=model.corpus_count, total_words=WORD_COUNT)

(37087419, 51022780)

In [5]:
model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fb70c5a5c50>

**Save the model**

In [22]:
# Name the output after the hyper-parameters used for training.
# gensim convention: sg=1 selects skip-gram, sg=0 selects CBOW.
architecture = "sg" if SG == 1 else "cbow"
# Four placeholders for four values, so the epoch count is part of the name.
model_dir_name = "balzac_w2v_{}_w{}_mincount{}_ep{}".format(architecture, WINDOW, MIN_COUNT, NUM_EPOCHS)
model_destination_dir = os.path.join(W2V_MODEL_DESTINATION_DIR, model_dir_name)
model_destination = os.path.join(model_destination_dir, model_dir_name + ".model")
os.makedirs(model_destination_dir, exist_ok=True)

# Only the word vectors (model.wv) are persisted here, not the full trainable model.
model.wv.save(model_destination)

**Export vectors for visualization**

In [29]:
# Write to TSV: one file with a row of vector components per token, and a metadata
# file (with header) giving each token's label and corpus count. Rows of the two
# files correspond one-to-one.
vectors_path = os.path.join(model_destination_dir, model_dir_name + ".tsv")
metadata_path = os.path.join(model_destination_dir, model_dir_name + "_metadata" + ".tsv")

# Explicit UTF-8: tokens are not guaranteed to be ASCII, and the platform default
# encoding may be unable to represent them.
with open(vectors_path, "w", encoding="utf-8") as vectors_file, \
        open(metadata_path, "w", encoding="utf-8") as metadata_file:
    metadata_file.write("label\tcount\n")
    # Single pass over the vocabulary keeps both files aligned by construction.
    for w, v in model.wv.vocab.items():
        vectors_file.write("\t".join(str(component) for component in model.wv[w]))
        vectors_file.write("\n")
        metadata_file.write("\t".join([w, str(v.count)]))
        metadata_file.write("\n")

In [7]:
# Inspect the vocabulary: a mapping from each token to its Vocab entry.
model.wv.vocab

{'\ufeffproject': <gensim.models.keyedvectors.Vocab at 0x7fb70c5a55f8>,
 'gutenberg': <gensim.models.keyedvectors.Vocab at 0x7fb6fb67dcc0>,
 's': <gensim.models.keyedvectors.Vocab at 0x7fb6fb663c88>,
 'la': <gensim.models.keyedvectors.Vocab at 0x7fb6fb6028d0>,
 'maison': <gensim.models.keyedvectors.Vocab at 0x7fb6fb6026d8>,
 'du': <gensim.models.keyedvectors.Vocab at 0x7fb6fb602f98>,
 'chat': <gensim.models.keyedvectors.Vocab at 0x7fb6fb602828>,
 'qui': <gensim.models.keyedvectors.Vocab at 0x7fb6fb6022e8>,
 'pelote': <gensim.models.keyedvectors.Vocab at 0x7fb6fb603828>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7fb6fb603e10>,
 'honore': <gensim.models.keyedvectors.Vocab at 0x7fb6fb6032e8>,
 'de': <gensim.models.keyedvectors.Vocab at 0x7fb6fb603630>,
 'balzac': <gensim.models.keyedvectors.Vocab at 0x7fb6fb603c50>,
 'this': <gensim.models.keyedvectors.Vocab at 0x7fb6fb6030f0>,
 'ebook': <gensim.models.keyedvectors.Vocab at 0x7fb6fb603978>,
 'is': <gensim.models.keyedvectors.Vocab at 

In [26]:
# Corpus count of the last vocabulary entry, looked up explicitly instead of
# through a loop variable left behind by another cell.
list(model.wv.vocab.values())[-1].count

11