In [4]:
import sys
import os
import pandas as pd
import string
import numpy as np
import re
import spacy
import es_core_news_sm
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from sklearn.model_selection import train_test_split
import torch

In [5]:
data_path = os.path.join(sys.path[0], 'data/spa.txt')
lines= pd.read_table(data_path,  names =['eng', 'spa', 'comments'])
train,valid = train_test_split(lines, test_size=0.1)

In [6]:
train.to_csv('train.csv')
valid.to_csv('valid.csv')

In [7]:
# use spacy for tokenization - provides good support for tokenization in languages other than english
eng_field = Field(tokenize="spacy",
                 tokenizer_language="en",
                 init_token="<sos>",
                 eos_token = "<eos>",
                 lower = True)

spa_field = Field(tokenize="spacy",
                 tokenizer_language="es",
                 init_token="<sos>",
                 eos_token = "<eos>",
                 lower=True)

In [8]:
tabular_data_fields = [("id", None), ('eng', eng_field), ('spa', spa_field),('comments',None)]

In [9]:
train, valid = TabularDataset.splits(

path = './',
train='train.csv',
validation='valid.csv',
format = 'csv',
skip_header = True,
fields = tabular_data_fields)

In [10]:
valid[122].__dict__

{'eng': ['i', 'could', 'hardly', 'endure', 'the', 'pain', '.'],
 'spa': ['apenas', 'podía', 'soportar', 'el', 'dolor', '.']}

In [11]:
eng_field.build_vocab(train, valid)

In [12]:
spa_field.build_vocab(train, valid)

In [15]:
len(spa_field.vocab)
len(eng_field.vocab)

13365

In [16]:
train[0].__dict__

{'eng': ['have', 'you', 'known', 'tom', 'long', '?'],
 'spa': ['¿', 'conoces', 'a', 'tom', 'desde', 'hace', 'mucho', 'tiempo', '?']}

In [17]:
train_iter, val_iter = BucketIterator.splits((train, valid),
batch_size = 32, sort_key = lambda x: len(x.eng))

In [36]:
for i, (batch) in enumerate(train_iter):
    print(help(batch))
    print(batch.eng)
    break

Help on Batch in module torchtext.data.batch object:

class Batch(builtins.object)
 |  Batch(data=None, dataset=None, device=None)
 |  
 |  Defines a batch of examples along with its Fields.
 |  
 |  Attributes:
 |      batch_size: Number of examples in the batch.
 |      dataset: A reference to the dataset object the examples come from
 |          (which itself contains the dataset's Field objects).
 |      train: Deprecated: this attribute is left for backwards compatibility,
 |          however it is UNUSED as of the merger with pytorch 0.4.
 |      input_fields: The names of the fields that are used as input for the model
 |      target_fields: The names of the fields that are used as targets during
 |                     model training
 |  
 |  Also stores the Variable for each column in the batch as an attribute.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, data=None, dataset=None, device=None)
 |      Create a Batch from a list of examples.
 |  
 |  __iter__(self)
 |  

In [41]:
for i, batch in enumerate(train_iter):
    src1 = batch.eng[:,0].numpy()
    trg1 = batch.spa[:,0].numpy()
    
    break

In [42]:
def confirm_mapping(src, trg):
    src = np.copy(src)
    vfunc = np.vectorize(lambda x: eng_field.vocab.itos[x])
    saved_src = vfunc(src)
    
    trg = np.copy(trg)
    xfunc = np.vectorize(lambda x: spa_field.vocab.itos[x])
    saved_trg = xfunc(trg)
    
    return saved_src, saved_trg

In [65]:
for i, batch in enumerate(val_iter):
    print(type(batch.eng))
    if (type(batch.eng) == torch.Tensor):
        src2 = batch.eng[:,0].numpy()
        trg2 = batch.spa[:,0].numpy()
    break

<class 'torch.Tensor'>


In [44]:
print(src1)
print(trg1)
confirm_mapping(src1, trg1)

[  2   5 113 579 187 116  25 811   4   3   1   1   1   1   1]
[  2 756 904  14  55  44 435   5 179   4   3   1   1   1   1   1   1]


(array(['<sos>', 'i', "'d", 'rather', 'stay', 'home', 'this', 'weekend',
        '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'],
       dtype='<U7'),
 array(['<sos>', 'prefiero', 'quedarme', 'en', 'casa', 'este', 'fin', 'de',
        'semana', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>',
        '<pad>', '<pad>'], dtype='<U8'))

In [66]:
print(src2)
print(trg2)
confirm_mapping(src2, trg2)

[   2  136 1113    4    3]
[    2  8419 14570     4     3     1     1     1]


(array(['<sos>', 'say', 'cheese', '.', '<eos>'], dtype='<U6'),
 array(['<sos>', 'decid', 'patata', '.', '<eos>', '<pad>', '<pad>',
        '<pad>'], dtype='<U6'))

In [67]:
print(type(src2) == np.ndarray)

True
