In [22]:
import sys
import os
import pandas as pd
import string
import numpy as np
import re
import spacy
import es_core_news_sm
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from sklearn.model_selection import train_test_split
import torch

In [23]:
data_path = os.path.join(sys.path[0], 'data/spa.txt')
lines= pd.read_table(data_path,  names =['eng', 'spa', 'comments'])
train,valid = train_test_split(lines, test_size=0.1)

In [24]:
train.to_csv('train.csv')
valid.to_csv('valid.csv')

In [25]:
# use spacy for tokenization - provides good support for tokenization in languages other than english
eng_field = Field(tokenize="spacy",
                 tokenizer_language="en",
                 init_token="<sos>",
                 eos_token = "<eos>",
                 lower = True)

spa_field = Field(tokenize="spacy",
                 tokenizer_language="es",
                 init_token="<sos>",
                 eos_token = "<eos>",
                 lower=True)

In [26]:
tabular_data_fields = [("id", None), ('eng', eng_field), ('spa', spa_field),('comments',None)]

In [27]:
train, valid = TabularDataset.splits(

path = './',
train='train.csv',
validation='valid.csv',
format = 'csv',
skip_header = True,
fields = tabular_data_fields)

In [28]:
valid[122].__dict__

{'eng': ['does', 'everybody', 'understand', '?'],
 'spa': ['¿', 'todos', 'entienden', '?']}

In [29]:
eng_field.build_vocab(train, valid)

In [30]:
spa_field.build_vocab(train, valid)

In [11]:
spa_field.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x15b14a070>>,
            {'<unk>': 0,
             '<pad>': 1,
             '<sos>': 2,
             '<eos>': 3,
             '.': 4,
             'de': 5,
             'que': 6,
             'no': 7,
             'a': 8,
             'tom': 9,
             'la': 10,
             '?': 11,
             '¿': 12,
             'el': 13,
             'en': 14,
             'es': 15,
             'un': 16,
             'me': 17,
             'se': 18,
             ',': 19,
             'por': 20,
             'lo': 21,
             'él': 22,
             'una': 23,
             'los': 24,
             'su': 25,
             'está': 26,
             'con': 27,
             'mi': 28,
             'qué': 29,
             'le': 30,
             'ella': 31,
             'te': 32,
             'mary': 33,
             'para': 34,
             'y': 35,
             'las': 36,
             'más': 37,
           

In [31]:
train[0].__dict__

{'eng': ['he', 'often', 'goes', 'off', 'on', 'wild', 'goose', 'chases', '.'],
 'spa': ['él', 'a', 'menudo', 'persigue', 'objetivos', 'inalcanzables', '.']}

In [13]:
eng_field.vocab.itos[3]

'<eos>'

In [15]:
train_iter, val_iter = BucketIterator.splits((train, valid),
batch_size = 32, sort_key = lambda x: len(x.eng))



In [17]:
for i, batch in enumerate(train_iter):
    src1 = batch.eng[:,0].numpy()
    trg1 = batch.spa[:,0].numpy()
    
    break

In [18]:
def confirm_mapping(src, trg):
    src = np.copy(src)
    vfunc = np.vectorize(lambda x: eng_field.vocab.itos[x])
    saved_src = vfunc(src)
    
    trg = np.copy(trg)
    xfunc = np.vectorize(lambda x: spa_field.vocab.itos[x])
    saved_trg = xfunc(trg)
    
    return saved_src, saved_trg

In [19]:
for i, batch in enumerate(val_iter):
    src2 = batch.eng[:,0].numpy()
    trg2 = batch.spa[:,0].numpy()

In [20]:
print(src1)
print(trg1)
confirm_mapping(src1, trg1)

[   2   50  156    7 1970    6  110    4    3    1    1    1    1    1
    1    1]


(array(['<sos>', 'they', 'wanted', 'to', 'steal', 'the', 'car', '.',
        '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',
        '<pad>'], dtype='<U6'),
 array(['<sos>', 'quieren', 'robar', 'el', 'auto', '.', '<eos>', '<pad>',
        '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'],
       dtype='<U7'))

In [32]:
print(src2)
print(trg2)
confirm_mapping(src2, trg2)

[   2   16   17  241    7   37   10  406    4   73  134   65   43   10
  153   22 1510   43   10 1392   22  165   43   10  297   22   42  117
   43   10  902    4    3]
[   2   81  299   15  214    4   61    6  510   56  155   19 2292   56
   23 3118   19 5220   56   23  591   35  306   56 1000    4    3    1
    1    1    1    1    1]


(array(['<sos>', 'it', "'s", 'hard', 'to', 'be', 'a', 'woman', '.', 'one',
        'must', 'think', 'like', 'a', 'man', ',', 'act', 'like', 'a',
        'lady', ',', 'look', 'like', 'a', 'girl', ',', 'and', 'work',
        'like', 'a', 'horse', '.', '<eos>'], dtype='<U5'),
 array(['<sos>', 'ser', 'mujer', 'es', 'difícil', '.', 'hay', 'que',
        'pensar', 'como', 'hombre', ',', 'actuar', 'como', 'una', 'dama',
        ',', 'verse', 'como', 'una', 'niña', 'y', 'trabajar', 'como',
        'caballo', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>',
        '<pad>', '<pad>'], dtype='<U8'))