In [2]:
import sys
import os
import pandas as pd
import string
import numpy as np
import re
import spacy
import es_core_news_sm
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from sklearn.model_selection import train_test_split
import torch

In [3]:
# Load the English/Spanish sentence pairs (pd.read_table defaults to tab-separated
# input) and hold out 10% of the rows for validation.
SEED = 42  # fixed so the train/validation split is identical after a kernel restart
data_path = os.path.join(sys.path[0], 'data/spa.txt')
lines = pd.read_table(data_path, names=['eng', 'spa', 'comments'])
train, valid = train_test_split(lines, test_size=0.1, random_state=SEED)

In [4]:
# Persist both splits as CSV files; they are read back further down by TabularDataset.
# to_csv keeps the DataFrame index by default, so it becomes the first CSV column.
train.to_csv(path_or_buf='train.csv')
valid.to_csv(path_or_buf='valid.csv')

In [5]:
# Tokenize with spaCy, which provides good tokenizer support for languages other than English.
# Both fields lowercase their text and wrap every sequence in <sos> ... <eos>.
# NOTE(review): the "en" / "es" shortcut names may not resolve on newer spaCy releases — confirm
# against the installed version.
eng_field = Field(
    tokenize="spacy",
    tokenizer_language="en",
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

spa_field = Field(
    tokenize="spacy",
    tokenizer_language="es",
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

In [6]:
# (column name, Field) pairs in CSV column order. Columns paired with None are not
# attached to the examples (the examples shown below only carry 'eng' and 'spa').
tabular_data_fields = [
    ("id", None),
    ('eng', eng_field),
    ('spa', spa_field),
    ('comments', None),
]

In [7]:
# Re-read the CSV splits as torchtext datasets, skipping the header row.
# NOTE: this rebinds `train` / `valid` from pandas DataFrames to TabularDataset objects.
train, valid = TabularDataset.splits(
    path='./',
    format='csv',
    train='train.csv',
    validation='valid.csv',
    fields=tabular_data_fields,
    skip_header=True,
)

In [8]:
# Spot-check one tokenized validation example by dumping its attributes.
valid[122].__dict__

{'eng': ['what', 'are', 'they', 'complaining', 'about', '?'],
 'spa': ['¿', 'de', 'qué', 'se', 'quejan', '?']}

In [74]:
# Build the English vocabulary from both datasets.
# NOTE(review): passing `valid` puts validation-only tokens into the vocabulary —
# confirm this is intended rather than building from `train` alone.
eng_field.build_vocab(train, valid)

In [75]:
# Build the Spanish vocabulary from both datasets.
# NOTE(review): passing `valid` puts validation-only tokens into the vocabulary —
# confirm this is intended rather than building from `train` alone.
spa_field.build_vocab(train, valid)

In [78]:
# Vocabulary sizes.
# NOTE(review): a cell only displays its last expression, so the Spanish size on the
# first line is computed but never shown; the recorded output is the English size.
len(spa_field.vocab)
len(eng_field.vocab)

13365

In [114]:
# Look up the index of a single-space token in the Spanish string-to-index mapping.
# NOTE(review): the recorded result is 0 — presumably the unknown-token index; confirm.
spa_field.vocab.stoi[' ']

0

In [12]:
# Inspect the first tokenized training example by dumping its attributes.
train[0].__dict__

{'eng': ['please',
  'put',
  'a',
  'lot',
  'of',
  'cream',
  'in',
  'my',
  'coffee',
  '.'],
 'spa': ['ponele',
  'mucha',
  'crema',
  'a',
  'mi',
  'café',
  ',',
  'por',
  'favor',
  '.']}

In [13]:
# Batch both datasets in groups of 32 examples; the length of the English token
# list is the sort key handed to the iterator.
train_iter, val_iter = BucketIterator.splits(
    (train, valid),
    sort_key=lambda example: len(example.eng),
    batch_size=32,
)

In [14]:
# Look at the first training batch only: show the Batch help text, then the English tensor.
for i, batch in enumerate(train_iter):
    # help() prints its text itself and returns None; wrapping it in print()
    # would emit a stray "None" line after the help output.
    help(batch)
    print(batch.eng)
    break

Help on Batch in module torchtext.data.batch object:

class Batch(builtins.object)
 |  Batch(data=None, dataset=None, device=None)
 |  
 |  Defines a batch of examples along with its Fields.
 |  
 |  Attributes:
 |      batch_size: Number of examples in the batch.
 |      dataset: A reference to the dataset object the examples come from
 |          (which itself contains the dataset's Field objects).
 |      train: Deprecated: this attribute is left for backwards compatibility,
 |          however it is UNUSED as of the merger with pytorch 0.4.
 |      input_fields: The names of the fields that are used as input for the model
 |      target_fields: The names of the fields that are used as targets during
 |                     model training
 |  
 |  Also stores the Variable for each column in the batch as an attribute.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, data=None, dataset=None, device=None)
 |      Create a Batch from a list of examples.
 |  
 |  __iter__(self)
 |  

In [15]:
# Take column 0 of the English and Spanish index tensors from the first training
# batch and keep them as NumPy arrays (one source/target sentence pair).
for i, batch in enumerate(train_iter):
    src1, trg1 = batch.eng[:, 0].numpy(), batch.spa[:, 0].numpy()
    break

In [16]:
def _decode_indices(indices, itos):
    """Map every integer in ``indices`` to ``itos[index]``, preserving the input shape."""
    idx = np.asarray(indices)
    # Going through a flat Python list (instead of np.vectorize without otypes)
    # keeps empty inputs working: the result is an empty string array.
    tokens = [itos[i] for i in idx.ravel().tolist()]
    return np.array(tokens, dtype=str).reshape(idx.shape)


def confirm_mapping(src, trg, src_itos=None, trg_itos=None):
    """Decode source/target index arrays back into vocabulary tokens.

    Args:
        src: Array-like of integer indices into the source vocabulary.
        trg: Array-like of integer indices into the target vocabulary.
        src_itos: Optional index-to-token sequence used for ``src``.
            Defaults to ``eng_field.vocab.itos``.
        trg_itos: Optional index-to-token sequence used for ``trg``.
            Defaults to ``spa_field.vocab.itos``.

    Returns:
        Tuple ``(src_tokens, trg_tokens)`` of NumPy string arrays shaped like
        the corresponding inputs. The inputs themselves are not modified.
    """
    # Resolve the notebook-level fields lazily so explicit vocabularies can be
    # supplied without those globals existing.
    if src_itos is None:
        src_itos = eng_field.vocab.itos
    if trg_itos is None:
        trg_itos = spa_field.vocab.itos

    return _decode_indices(src, src_itos), _decode_indices(trg, trg_itos)

In [59]:
# Pull the first validation batch: keep the whole Spanish index matrix in `saved`,
# keep column 0 of each language as NumPy arrays, and report the tensor shapes.
for i, batch in enumerate(val_iter):
    print(type(batch.eng))
    saved = batch.spa.numpy()
    # isinstance also accepts Tensor subclasses, unlike an exact type() comparison.
    if isinstance(batch.eng, torch.Tensor):
        src2 = batch.eng[:, 0].numpy()
        trg2 = batch.spa[:, 0].numpy()
    print(batch.eng.shape)
    print(batch.spa.shape)
    break

<class 'torch.Tensor'>
torch.Size([5, 32])
torch.Size([8, 32])


In [60]:
# Transpose the saved Spanish index matrix (the tensor it came from printed as [8, 32]).
# NOTE(review): rebinding `saved` to its own transpose is not idempotent — running
# this cell a second time flips the array back.
saved = saved.T

In [64]:
# Shift by one position along axis 1: x_t drops the last column and y_t drops the
# first, so y_t[:, k] holds the entry that follows x_t[:, k] in `saved`.
x_t, y_t = saved[:, :-1], saved[:, 1:]

In [65]:
# Display x_t (every row of `saved` without its final column).
x_t

array([[    2,  3456,     4,     3,     1,     1,     1],
       [    2,    17, 13706,     4,     3,     1,     1],
       [    2,   101,    26,     5,   307,     4,     3],
       [    2, 12116,    16,  1112,     4,     3,     1],
       [    2,    80, 11935,     4,     3,     1,     1],
       [    2,    65,   102,     5,   307,     4,     3],
       [    2,   940,     4,     3,     1,     1,     1],
       [    2,  4987,     8,   503,     4,     3,     1],
       [    2,    12,    61,   158,    14,    55,    11],
       [    2,   540,    74,   605,     4,     3,     1],
       [    2,  3413,   206,     5,   173,   167,     4],
       [    2,  2267,  2548,     4,     3,     1,     1],
       [    2,  1667,     4,     3,     1,     1,     1],
       [    2,   258,  1899,     4,     3,     1,     1],
       [    2,  1812,    27,   513,     4,     3,     1],
       [    2,    80,  6624,     4,     3,     1,     1],
       [    2,    12,    95,  2430,    11,     3,     1],
       [    2,

In [61]:
# Display the full `saved` index matrix for comparison with the shifted slices.
saved

array([[    2,  3456,     4,     3,     1,     1,     1,     1],
       [    2,    17, 13706,     4,     3,     1,     1,     1],
       [    2,   101,    26,     5,   307,     4,     3,     1],
       [    2, 12116,    16,  1112,     4,     3,     1,     1],
       [    2,    80, 11935,     4,     3,     1,     1,     1],
       [    2,    65,   102,     5,   307,     4,     3,     1],
       [    2,   940,     4,     3,     1,     1,     1,     1],
       [    2,  4987,     8,   503,     4,     3,     1,     1],
       [    2,    12,    61,   158,    14,    55,    11,     3],
       [    2,   540,    74,   605,     4,     3,     1,     1],
       [    2,  3413,   206,     5,   173,   167,     4,     3],
       [    2,  2267,  2548,     4,     3,     1,     1,     1],
       [    2,  1667,     4,     3,     1,     1,     1,     1],
       [    2,   258,  1899,     4,     3,     1,     1,     1],
       [    2,  1812,    27,   513,     4,     3,     1,     1],
       [    2,    80,  66

In [66]:
# Display y_t (every row of `saved` without its first column).
y_t

array([[ 3456,     4,     3,     1,     1,     1,     1],
       [   17, 13706,     4,     3,     1,     1,     1],
       [  101,    26,     5,   307,     4,     3,     1],
       [12116,    16,  1112,     4,     3,     1,     1],
       [   80, 11935,     4,     3,     1,     1,     1],
       [   65,   102,     5,   307,     4,     3,     1],
       [  940,     4,     3,     1,     1,     1,     1],
       [ 4987,     8,   503,     4,     3,     1,     1],
       [   12,    61,   158,    14,    55,    11,     3],
       [  540,    74,   605,     4,     3,     1,     1],
       [ 3413,   206,     5,   173,   167,     4,     3],
       [ 2267,  2548,     4,     3,     1,     1,     1],
       [ 1667,     4,     3,     1,     1,     1,     1],
       [  258,  1899,     4,     3,     1,     1,     1],
       [ 1812,    27,   513,     4,     3,     1,     1],
       [   80,  6624,     4,     3,     1,     1,     1],
       [   12,    95,  2430,    11,     3,     1,     1],
       [  114,

In [18]:
# Display the English index array taken from the first training batch.
src1

array([ 2,  5, 39,  8, 94,  5, 46, 86,  8,  4,  3,  1,  1,  1])

In [19]:
# Print the raw index arrays of the training pair, one per line, then decode them
# back to tokens to confirm the index-to-token mapping.
print(src1, trg1, sep="\n")
confirm_mapping(src1, trg1)

[ 2  5 39  8 94  5 46 86  8  4  3  1  1  1]
[  2  32  49 327  32 484 124   4   3   1   1   1   1   1]


(array(['<sos>', 'i', 'want', 'you', 'where', 'i', 'can', 'see', 'you',
        '.', '<eos>', '<pad>', '<pad>', '<pad>'], dtype='<U5'),
 array(['<sos>', 'te', 'quiero', 'donde', 'te', 'pueda', 'ver', '.',
        '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], dtype='<U6'))

In [20]:
# Print the raw index arrays of the validation pair, one per line, then decode them
# back to tokens to confirm the index-to-token mapping.
print(src2, trg2, sep="\n")
confirm_mapping(src2, trg2)

[  2   8 218   4   3]
[   2 3456    4    3    1    1    1    1]


(array(['<sos>', 'you', 'lost', '.', '<eos>'], dtype='<U5'),
 array(['<sos>', 'perdiste', '.', '<eos>', '<pad>', '<pad>', '<pad>',
        '<pad>'], dtype='<U8'))

In [21]:
# Sanity check that src2 is a NumPy array; isinstance is the idiomatic type test
# and, unlike an exact type() comparison, also accepts ndarray subclasses.
print(isinstance(src2, np.ndarray))

True


In [30]:
# 1-D version of the shift: x is the Spanish index sequence without its last element.
x = trg2[:-1]
x

array([   2, 3456,    4,    3,    1,    1,    1])

In [25]:
# y is the Spanish index sequence without its first element (the counterpart of x = trg2[:-1]).
y = trg2[1:]