# Test Word2vec training

In [1]:
from gensim.models import Word2Vec
import os
import shutil

## Balzac

In [2]:
# Input corpus: the normalized Balzac text file used for training.
NORMALIZED_TEXT_PATH = "../normalized_data/french_novel/normalized_balzac.txt"

# Output locations: the directory for Balzac models, and a fixed model path inside it.
W2V_MODEL_DESTINATION_DIR = "../models/french_novels/balzac/"
W2V_MODEL_DESTINATION = os.path.join(
    W2V_MODEL_DESTINATION_DIR, "balzac_w2v_cbow_w10_ep20_mincount5.model"
)

In [3]:
# Shell out to `wc` on the corpus file; it prints line, word and byte counts.
# The word count (second column) is the value copied into WORD_COUNT in the training cell.
!wc $NORMALIZED_TEXT_PATH

  218073  4406900 25110024 ../normalized_data/french_novel/normalized_balzac.txt


In [4]:
# Training hyper-parameters.
SG = 0  # 0 = CBOW, 1 = skip-gram
NUM_EPOCHS = 20
WINDOW = 10
MIN_COUNT = 5
WORKERS = 4
WORD_COUNT = 4406900  # word count of the corpus file, as reported by `wc`

# Create the model WITHOUT a corpus. Passing `corpus_file` to the constructor
# makes gensim build the vocabulary and immediately train with its default
# epoch count, so a later train() call would be a second training run stacked
# on top of the first instead of a single NUM_EPOCHS-epoch run.
model = Word2Vec(sg=SG, window=WINDOW, min_count=MIN_COUNT, workers=WORKERS)

# Scan the corpus once to build the vocabulary (this also sets model.corpus_count).
model.build_vocab(corpus_file=NORMALIZED_TEXT_PATH)

# Single training run; train() requires total_words when reading from corpus_file.
# Its return value is the last expression, so it is shown as the cell output.
model.train(corpus_file=NORMALIZED_TEXT_PATH,
            epochs=NUM_EPOCHS, total_examples=model.corpus_count, total_words=WORD_COUNT)

(65218331, 88060080)

In [5]:
# Show the model's `wv` attribute (its keyed-vectors object) as the cell output.
model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fab8eea16d8>

**Save the model**

In [6]:
# Derive the output name from the hyper-parameters actually used for training.
# SG == 1 means skip-gram and SG == 0 means CBOW, so the label must follow that mapping;
# the trailing "_ep{}" records the number of epochs.
model_dir_name = "balzac_w2v_{}_w{}_mincount{}_ep{}".format(
    "sg" if SG == 1 else "cbow", WINDOW, MIN_COUNT, NUM_EPOCHS
)
model_destination_dir = os.path.join(W2V_MODEL_DESTINATION_DIR, model_dir_name)
model_destination = os.path.join(model_destination_dir, model_dir_name + ".model")

# Create the per-model directory (no error if it already exists).
os.makedirs(model_destination_dir, exist_ok=True)

# Note: this saves only the keyed vectors (model.wv), not the full Word2Vec model object.
model.wv.save(model_destination)

**Export vectors for visualization**

In [7]:
# Write to TSV: one file with the vectors, one with per-word metadata.
# Both files iterate the vocabulary in the same order, so row i of the vectors
# file corresponds to row i (after the header) of the metadata file.
# UTF-8 is set explicitly because the vocabulary is French text and the
# platform default encoding is not guaranteed to handle accented characters.
vectors_path = os.path.join(model_destination_dir, model_dir_name + ".tsv")
metadata_path = os.path.join(model_destination_dir, model_dir_name + "_metadata" + ".tsv")

with open(vectors_path, "w", encoding="utf-8") as f:
    for word in model.wv.vocab:
        # One line per word: tab-separated vector components.
        f.write("\t".join(str(component) for component in model.wv[word]))
        f.write("\n")

with open(metadata_path, "w", encoding="utf-8") as f:
    f.write("label\tcount\n")
    for word, vocab_entry in model.wv.vocab.items():
        # One line per word: the token and its `count` attribute from the vocabulary entry.
        f.write("\t".join([word, str(vocab_entry.count)]))
        f.write("\n")

## Zola

In [2]:
# Input corpus: the normalized Zola text file used for training.
NORMALIZED_TEXT_PATH = "../normalized_data/french_novel/normalized_zola.txt"

# Output locations: the directory for Zola models, and a fixed model path inside it.
W2V_MODEL_DESTINATION_DIR = "../models/french_novels/zola/"
W2V_MODEL_DESTINATION = os.path.join(
    W2V_MODEL_DESTINATION_DIR, "zola_w2v_cbow_w10_ep20_mincount5.model"
)

In [3]:
# Shell out to `wc` on the corpus file; it prints line, word and byte counts.
# The word count (second column) is the value copied into WORD_COUNT in the training cell.
!wc $NORMALIZED_TEXT_PATH

  257939  4634495 27084870 ../normalized_data/french_novel/normalized_zola.txt


In [4]:
# Training hyper-parameters.
SG = 0  # 0 = CBOW, 1 = skip-gram
NUM_EPOCHS = 20
WINDOW = 10
MIN_COUNT = 5
WORKERS = 4
WORD_COUNT = 4634495  # word count of the corpus file, as reported by `wc`

# Create the model WITHOUT a corpus. Passing `corpus_file` to the constructor
# makes gensim build the vocabulary and immediately train with its default
# epoch count, so a later train() call would be a second training run stacked
# on top of the first instead of a single NUM_EPOCHS-epoch run.
model = Word2Vec(sg=SG, window=WINDOW, min_count=MIN_COUNT, workers=WORKERS)

# Scan the corpus once to build the vocabulary (this also sets model.corpus_count).
model.build_vocab(corpus_file=NORMALIZED_TEXT_PATH)

# Single training run; train() requires total_words when reading from corpus_file.
# Its return value is the last expression, so it is shown as the cell output.
model.train(corpus_file=NORMALIZED_TEXT_PATH,
            epochs=NUM_EPOCHS, total_examples=model.corpus_count, total_words=WORD_COUNT)

(70298408, 92942080)

In [5]:
# Show the model's `wv` attribute (its keyed-vectors object) as the cell output.
model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fd1c6460748>

**Save the model**

In [8]:
# Derive the output name from the hyper-parameters actually used for training.
# SG == 1 means skip-gram and SG == 0 means CBOW, so the label must follow that mapping;
# the trailing "_ep{}" records the number of epochs.
model_dir_name = "zola_w2v_{}_w{}_mincount{}_ep{}".format(
    "sg" if SG == 1 else "cbow", WINDOW, MIN_COUNT, NUM_EPOCHS
)
model_destination_dir = os.path.join(W2V_MODEL_DESTINATION_DIR, model_dir_name)
model_destination = os.path.join(model_destination_dir, model_dir_name + ".model")

# Create the per-model directory (no error if it already exists).
os.makedirs(model_destination_dir, exist_ok=True)

# Note: this saves only the keyed vectors (model.wv), not the full Word2Vec model object.
model.wv.save(model_destination)

**Export vectors for visualization**

In [9]:
# Write to TSV: one file with the vectors, one with per-word metadata.
# Both files iterate the vocabulary in the same order, so row i of the vectors
# file corresponds to row i (after the header) of the metadata file.
# UTF-8 is set explicitly because the vocabulary is French text and the
# platform default encoding is not guaranteed to handle accented characters.
vectors_path = os.path.join(model_destination_dir, model_dir_name + ".tsv")
metadata_path = os.path.join(model_destination_dir, model_dir_name + "_metadata" + ".tsv")

with open(vectors_path, "w", encoding="utf-8") as f:
    for word in model.wv.vocab:
        # One line per word: tab-separated vector components.
        f.write("\t".join(str(component) for component in model.wv[word]))
        f.write("\n")

with open(metadata_path, "w", encoding="utf-8") as f:
    f.write("label\tcount\n")
    for word, vocab_entry in model.wv.vocab.items():
        # One line per word: the token and its `count` attribute from the vocabulary entry.
        f.write("\t".join([word, str(vocab_entry.count)]))
        f.write("\n")