In [1]:
import gensim
from pprint import pprint
import random
import argparse
import time
import os

epochs = 5
lang1 = "data/Books.en-es.en"
lang2 = "data/Books.en-es.es"
method = 2
dims = 300
window = 3

#Load in corpora
print("\n\n\nLoading Corpora")
t1 = time.time()
with open(lang1, encoding="utf-8") as f:
    lang1 = f.readlines()

with open(lang2, encoding="utf-8") as f:
    lang2 = f.readlines()

#Tokenize each corpus
lang1 = [gensim.utils.simple_preprocess(line) for line in lang1]
lang2 = [gensim.utils.simple_preprocess(line) for line in lang2]

#Get all word types in each corpus
lang1_words = list(set([word for sent in lang1 for word in sent]))
lang2_words = list(set([word for sent in lang2 for word in sent]))
print("-- {} -- seconds\n".format(time.time()-t1))


#Method 2
def mix2(l1,l2):
    output = []
    if len(l1) > len(l2):
        new_line = []
        for i in range(len(l1)):
            if random.randint(0,1):
                new_line.append(l1[i])
            elif i < len(l2):
                new_line.append(l2[i])
            else:
                new_line.append(l1[i])
    else:
        new_line = []
        for i in range(len(l2)):
            if random.randint(0,1):
                new_line.append(l2[i])
            elif i < len(l1):
                new_line.append(l1[i])
            else:
                new_line.append(l2[i])
    return new_line


#Use one of the methods
print("Creating bilingual data set")
t1 = time.time()
if method == 1:
    data = []
    for i,j in zip(lang1,lang2):
        data += mix1(i,j)

elif method == 2:
    data = []
    for i,j in zip(lang1[2:],lang2[2:]):
        i_n = len(i)
        j_n = len(j)
        if i_n > 4 and j_n > 4:
            for k in range(int(max(i_n,j_n)**(0.5))*2):
                data.append(mix2(i,j))
print("-- {} -- seconds\n".format(time.time()-t1))

#Load word2vec model
print("Loading gensim.Word2Vec")
t1 = time.time()
model = gensim.models.Word2Vec(
        data,
        size=dims,
        window=window,
        min_count=2,
        workers=4)
print("-- {} -- seconds\n".format(time.time()-t1))


#Train the model
print("Training Model")
t1 = time.time()
model.train(data, total_examples=len(data), epochs=epochs)
print("-- {} -- seconds\n".format(time.time()-t1))




Loading Corpora
-- 4.198770999908447 -- seconds

Creating bilingual data set
-- 20.91409420967102 -- seconds

Loading gensim.Word2Vec
-- 55.90849423408508 -- seconds

Training Model
-- 52.00492072105408 -- seconds



In [13]:
model.most_similar("butterfly")

  """Entry point for launching an IPython kernel.


[('ondulaban', 0.5498026013374329),
 ('cazamariposas', 0.47866594791412354),
 ('boulders', 0.46035659313201904),
 ('rojizo', 0.4365118145942688),
 ('ovalada', 0.424294650554657),
 ('mariposa', 0.4178538918495178),
 ('marrón', 0.41318005323410034),
 ('festoons', 0.40188270807266235),
 ('azulados', 0.40170085430145264),
 ('zarzas', 0.4010452628135681)]