In [39]:
from vocab import *
from utils import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## `words2charindices()`

In addition to the standard `word2id` dictionary, we have `char2id`, a short dictionary that is built from `char_list`.

In [27]:
vocab = VocabEntry()

In [28]:
print(vocab.char_list)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ',', ';', '.', '!', '?', ':', "'", '"', '/', '\\', '|', '_', '@', '#', '$', '%', '^', '&', '*', '~', '`', '+', '-', '=', '<', '>', '(', ')', '[', ']']


In [29]:
# 96 entries = the 92 characters of char_list printed above plus 4 special tokens
# ('<pad>', '{', '}', '<unk>'), which take ids 0-3 (see the listing in the next cell).
len(vocab.char2id)

96

In [30]:
print(list(vocab.char2id.items())[:15])

[('<pad>', 0), ('{', 1), ('}', 2), ('<unk>', 3), ('A', 4), ('B', 5), ('C', 6), ('D', 7), ('E', 8), ('F', 9), ('G', 10), ('H', 11), ('I', 12), ('J', 13), ('K', 14)]


Let's check that our function works correctly on a simple example from `sanity_check.py`.

In [31]:
sentences = [["a", "b", "c?"], ["~d~", "c", "b", "a"]]

In [32]:
vocab.words2charindices(sentences)

[[[1, 30, 2], [1, 31, 2], [1, 32, 70, 2]],
 [[1, 85, 33, 85, 2], [1, 32, 2], [1, 31, 2], [1, 30, 2]]]

In [33]:
# Rebuild the encoding of "c?" by hand: every encoded word in the output above starts
# with char2id['{'] (1) and ends with char2id['}'] (2), with the word's characters in between.
[vocab.char2id['{'], vocab.char2id['c'], vocab.char2id['?'], vocab.char2id['}']]

[1, 32, 70, 2]

## `pad_sents_char()`

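The specification of this function is not entirely clear. Here is what we do, which seems to pass all the tests:

- pad every sentence in the batch to `max_sent_len` (in our case `6`), using a whole padding word (a list of `0`s, i.e. `[0, 0, …]`) for each missing word rather than a bare `0`;
- pad every word with `0` up to `21` characters (the provided maximum word length), or truncate it if it is longer;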

In [35]:
sentences = [['Human:', 'What', 'do', 'we', 'want?'], 
             ['Computer:', 'Natural', 'language', 'processing!'],
             ['Human:', 'When', 'do', 'we', 'want', 'it?'], 
             ['Computer:', 'When', 'do', 'we', 'want', 'what?']]

In [42]:
[len(s) for s in sentences]

[5, 4, 6, 6]

In [36]:
word_ids = vocab.words2charindices(sentences)

In [43]:
[len(s) for s in word_ids]

[5, 4, 6, 6]

In [38]:
word_ids[0]

[[1, 11, 50, 42, 30, 43, 71, 2],
 [1, 26, 37, 30, 49, 2],
 [1, 33, 44, 2],
 [1, 52, 34, 2],
 [1, 52, 30, 43, 49, 70, 2]]

In [48]:
# Pad with the '<pad>' character id looked up from the vocabulary (char2id maps
# '<pad>' to 0, as listed earlier) instead of a hard-coded literal.
padded_sentences = pad_sents_char(word_ids, vocab.char2id['<pad>'])

In [49]:
padded_sentences[0]

[[1, 11, 50, 42, 30, 43, 71, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 26, 37, 30, 49, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 33, 44, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 52, 34, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 52, 30, 43, 49, 70, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [50]:
padded_sentences[1]

[[1, 6, 44, 42, 45, 50, 49, 34, 47, 71, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 17, 30, 49, 50, 47, 30, 41, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 41, 30, 43, 36, 50, 30, 36, 34, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 45, 47, 44, 32, 34, 48, 48, 38, 43, 36, 69, 2, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

Let's load the correct (gold) padded sentences so we can compare them with our output.

In [45]:
# Reference ("gold") padded sentences from the sanity-check data directory; the cells
# below display them for a side-by-side comparison with padded_sentences.
# NOTE(review): torch.load unpickles the file, so only point it at trusted files.
gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl')

In [46]:
gold_padded_sentences[0]

[[1, 11, 50, 42, 30, 43, 71, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 26, 37, 30, 49, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 33, 44, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 52, 34, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 52, 30, 43, 49, 70, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [51]:
gold_padded_sentences[1]

[[1, 6, 44, 42, 45, 50, 49, 34, 47, 71, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 17, 30, 49, 50, 47, 30, 41, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 41, 30, 43, 36, 50, 30, 36, 34, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 45, 47, 44, 32, 34, 48, 48, 38, 43, 36, 69, 2, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

## `to_input_tensor_char()`

In [53]:
# Convert the raw batch of sentences straight to a CPU tensor. The shape shown below,
# (6, 4, 21), matches (longest sentence length, number of sentences, padded word length)
# for this batch -- presumably the axes are ordered that way; confirm in vocab.py.
sents_var = vocab.to_input_tensor_char(sentences, torch.device('cpu'))

In [54]:
type(sents_var)

torch.Tensor

In [56]:
sents_var.shape

torch.Size([6, 4, 21])