In [1]:
import numpy as np
import pandas as pd
import utils

__author__ = "Olivares Castillo José Luis"

In [2]:
def read(file, threshold=0, vocabulary=None, dtype='float'):
    """Parse word embeddings from an open word2vec-style text file.

    The first line of ``file`` must hold the entry count and the vector
    dimension separated by a single space. Every following line holds a
    word, one space, and the space-separated vector components.

    Args:
        file: Text file object positioned at the header line.
        threshold: If positive, read at most this many entries; otherwise
            read as many entries as the header announces.
        vocabulary: Optional container of words to keep. When given, entries
            whose word is not in it are skipped.
        dtype: NumPy dtype used for the parsed vectors.

    Returns:
        A ``(words, matrix)`` tuple: ``words`` is the list of kept words and
        ``matrix`` is a NumPy array whose i-th row is the vector of
        ``words[i]``.
    """
    # Copyright (C) 2016-2018  Mikel Artetxe <artetxem@gmail.com>
    # https://github.com/artetxem/vecmap/blob/master/embeddings.py
    header = file.readline().split(' ')
    announced = int(header[0])
    count = min(threshold, announced) if threshold > 0 else announced
    dim = int(header[1])
    words = []

    if vocabulary is None:
        # Every entry is kept, so the matrix can be allocated up front.
        matrix = np.empty((count, dim), dtype=dtype)
        for row in range(count):
            word, vec = file.readline().split(' ', 1)
            words.append(word)
            matrix[row] = np.fromstring(vec, sep=' ', dtype=dtype)
        return (words, matrix)

    # Only some entries are kept: collect their rows and stack them at the end.
    kept_rows = []
    for _ in range(count):
        word, vec = file.readline().split(' ', 1)
        if word in vocabulary:
            words.append(word)
            kept_rows.append(np.fromstring(vec, sep=' ', dtype=dtype))
    return (words, np.array(kept_rows, dtype=dtype))

In [3]:
def read_lexicon(source, data_dir="../dataset"):
    """Load the source/target word pairs of one bilingual lexicon split.

    Args:
        source: Which split to load, either ``"train"`` or ``"test"``.
        data_dir: Directory that contains the lexicon files. The default is
            the location this notebook has always read from.

    Returns:
        A ``(src, trg)`` tuple of equally long lists. ``src[i]`` and
        ``trg[i]`` are the first and second whitespace-separated fields of
        the i-th non-blank line of the lexicon file.

    Raises:
        ValueError: If ``source`` is neither ``"train"`` nor ``"test"``.
        IndexError: If a non-blank line has fewer than two fields.
    """
    filenames = {
        "train": "OPUS_en_it_europarl_train_5K.txt",
        "test": "OPUS_en_it_europarl_test.txt",
    }
    # Validate explicitly: an unknown split must not fall through and return
    # None, and a non-string argument must not be mistaken for a valid split.
    if not isinstance(source, str) or source not in filenames:
        raise ValueError(
            "source must be 'train' or 'test', got {!r}".format(source))

    path = "{}/{}".format(data_dir, filenames[source])
    src, trg = [], []
    with open(path, "r", encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            src.append(fields[0])
            trg.append(fields[1])
    return (src, trg)

In [8]:
def get_vectors(lexicon, words, embeddings, dtype='float'):
    """Gather the embedding row of every lexicon entry.

    Args:
        lexicon: Sequence of words to look up.
        words: Sequence of vocabulary words; ``words[k]`` labels
            ``embeddings[k]``. If a word occurs more than once, its first
            position is used.
        embeddings: 2-D array whose second axis is the vector dimension.
        dtype: NumPy dtype of the returned matrix.

    Returns:
        An array of shape ``(len(lexicon), embeddings.shape[1])``. Row ``i``
        is the vector of ``lexicon[i]``, or all zeros when that word does not
        occur in ``words``.
    """
    # Index the vocabulary once instead of scanning `words` twice for every
    # lexicon entry; setdefault keeps the first occurrence, like list.index.
    first_index = {}
    for position, word in enumerate(words):
        first_index.setdefault(word, position)

    # Zeros rather than np.empty: rows of out-of-vocabulary words would
    # otherwise hold whatever bytes happened to be in memory.
    matrix = np.zeros((len(lexicon), embeddings.shape[1]), dtype=dtype)
    for row, word in enumerate(lexicon):
        position = first_index.get(word)
        if position is not None:
            matrix[row] = embeddings[position]
    return matrix

In [4]:
# Load the "test" lexicon split and show how many entries each side has.
en, it = read_lexicon(source="test")
(len(en), len(it))

(1869, 1869)

In [5]:
# Read the EN embeddings. The context manager closes the file handle as soon
# as parsing is done instead of leaving it open for the kernel's lifetime.
with open("../dataset/EN.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt",
          encoding="utf-8",errors="surrogateescape") as source_vec:
    words_en,en_vec=read(source_vec)

In [6]:
# Spot-check: display the embedding row stored for the word "for".
en_vec[words_en.index("for")]

array([ 0.053462, -0.017315,  0.039388,  0.022096,  0.097245, -0.007484,
        0.038782, -0.113592, -0.029476, -0.149046,  0.095559, -0.015483,
        0.019254, -0.019111,  0.160142, -0.043328,  0.207161, -0.044734,
        0.066466, -0.088493,  0.045842,  0.010875, -0.132724, -0.063616,
       -0.091216, -0.057338, -0.169607, -0.030058,  0.071344,  0.022969,
       -0.028265, -0.092045, -0.135797,  0.110643, -0.007282, -0.065283,
        0.014716,  0.027053, -0.130566, -0.083649, -0.045809, -0.056796,
       -0.076717, -0.006347, -0.016789, -0.058759,  0.174185,  0.051115,
       -0.041456, -0.095424,  0.062544, -0.152105, -0.208745, -0.100034,
       -0.074475, -0.032816,  0.129226, -0.161741, -0.038977,  0.164587,
       -0.091351, -0.051929, -0.013642,  0.004292, -0.12287 , -0.069695,
       -0.042102, -0.11601 , -0.024308,  0.027785,  0.0153  , -0.106739,
       -0.025478, -0.048127,  0.189194,  0.033518,  0.074332,  0.009075,
       -0.005929,  0.138593, -0.003235, -0.006764, 

In [9]:
# Build the source-side matrix (one embedding row per lexicon word) and
# display its shape.
src_vec = get_vectors(lexicon=en, words=words_en, embeddings=en_vec)
src_vec.shape

(1869, 300)

In [10]:
# Peek at the source-side lexicon entry at index 10.
en[10]

'following'

In [11]:
# Read the IT embeddings. The context manager closes the file handle as soon
# as parsing is done instead of leaving it open for the kernel's lifetime.
with open("../dataset/IT.200K.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt",
          encoding="utf-8",errors="surrogateescape") as target_vec:
    words_it,it_vec=read(target_vec)

In [12]:
# Build the target-side matrix (one embedding row per lexicon word) and
# display its shape.
trg_vec = get_vectors(lexicon=it, words=words_it, embeddings=it_vec)
trg_vec.shape

(1869, 300)

In [13]:
# Dump the source-side test vectors as text: one "<word> <v1> ... <vN>" line
# per lexicon entry, with no newline after the last line.
# The lexicon is read as UTF-8, so the output encoding is pinned to UTF-8 as
# well instead of depending on the platform default.
with open("../dataset/NEW-EN-TEST","w",encoding="utf-8") as file:
    file.write("\n".join(
        en[i]+" "+" ".join(map(str,src_vec[i]))
        for i in range(src_vec.shape[0])
    ))

In [14]:
# Dump the target-side test vectors as text: one "<word> <v1> ... <vN>" line
# per lexicon entry, with no newline after the last line.
# The lexicon is read as UTF-8, so the output encoding is pinned to UTF-8 as
# well; the platform default could fail on or garble non-ASCII words.
with open("../dataset/NEW-IT-TEST","w",encoding="utf-8") as file:
    file.write("\n".join(
        it[i]+" "+" ".join(map(str,trg_vec[i]))
        for i in range(trg_vec.shape[0])
    ))

In [13]:
# Look up the position of the word "per" in the IT vocabulary list.
words_it.index("per")

12

In [14]:
# Display the embedding row at index 12, the position that
# words_it.index("per") reported in the recorded output of the previous cell.
it_vec[12]

array([-0.004587, -0.09339 ,  0.024194, -0.160234, -0.081758,  0.02754 ,
        0.001967, -0.007735, -0.07485 ,  0.004046, -0.015186,  0.043899,
        0.060886, -0.084293,  0.02517 ,  0.019767, -0.088462, -0.100752,
       -0.076238, -0.029609, -0.037766,  0.074355, -0.011544,  0.124296,
       -0.131977,  0.016333, -0.024172,  0.039691, -0.051303, -0.046599,
        0.020477,  0.177797,  0.001731, -0.031561,  0.080046, -0.02657 ,
        0.041981,  0.024949,  0.100902, -0.018006,  0.171553, -0.153255,
        0.031861,  0.049595,  0.05172 ,  0.056013, -0.089203, -0.070196,
       -0.035462,  0.012322, -0.031717, -0.013635,  0.014406, -0.121697,
        0.07401 , -0.003795,  0.136772,  0.027047, -0.145845,  0.069214,
       -0.133257, -0.105563, -0.121019, -0.049852, -0.095362, -0.029586,
       -0.120687, -0.104069,  0.018176,  0.052571,  0.085504,  0.008815,
       -0.042495, -0.012995,  0.138161, -0.023372, -0.054404,  0.032455,
       -0.067908, -0.01588 ,  0.015618,  0.044195, 