In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import argparse
import collections
import torch
import numpy as np
import data_loader.data_loaders as module_data
import data_loader.preprocess as module_preprocess
import model.loss as module_loss
import model.metric as module_metric
import model.model as module_arch
from parse_config import ConfigParser
from trainer import Trainer


# fix random seeds for reproducibility
SEED = 123
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)

In [4]:
import json
config = ConfigParser(json.load(open('config.json', 'r')))

In [5]:
logger = config.get_logger('train')
# preprocess
# config.init_obj('preprocess', module_preprocess)

# setup data_loader instances
data_loader = config.init_obj('data_loader', module_data, save_dir=config.save_dir)



In [4]:
data_loader.TEXT.vocab.itos[:5]

['<unk>', '<pad>', '<init>', '<eos>', '.']

In [6]:
print(vars(data_loader.dataset.examples[0]))

{'talk': ['can', 'we', 'make', 'this', 'quick', '?', ' ', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break-', 'up', 'on', 'the', 'quad', '.', ' ', 'again', '.'], 'response': ['well', ',', 'i', 'thought', 'we', "'d", 'start', 'with', 'pronunciation', ',', 'if', 'that', "'s", 'okay', 'with', 'you', '.']}


In [7]:
print(vars(data_loader.TEXT))

{'sequential': True, 'use_vocab': True, 'init_token': '<init>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'fix_length': 32, 'dtype': torch.int64, 'preprocessing': None, 'postprocessing': None, 'lower': True, 'tokenizer_args': (<bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x13058f370>>, 'en'), 'tokenize': <bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x13058f370>>, 'include_lengths': True, 'batch_first': False, 'pad_token': '<pad>', 'pad_first': False, 'truncate_first': False, 'stop_words': None, 'is_target': False, 'vocab': <torchtext.vocab.Vocab object at 0x1325475e0>}


In [8]:
print(config.__dict__)

{'_config': {'name': 'Chatbot', 'n_gpu': 1, 'embed_size': 50, 'hidden_size': 256, 'encoder_arch': {'type': 'ChatbotEncoder', 'args': {'n_layers': 1, 'dropout': 0.1}}, 'attn_arch': {'type': 'Attention', 'args': {'method': 'concat'}}, 'decoder_arch': {'type': 'LuongAttnDecoderRNN', 'args': {'attn_model': 'concat', 'n_layers': 1, 'dropout': 0.1}}, 'preprocess': {'type': 'ChatbotDataPreprocess', 'args': {'data_dir': 'data/cornell movie-dialogs corpus'}}, 'data_loader': {'type': 'ChatbotDataLoader', 'args': {'data_dir': 'data/cornell movie-dialogs corpus', 'filename': 'formatted_movie_lines.csv', 'text_field_path': None, 'vocab_path': None, 'batch_size': 128, 'sent_len': 32, 'init_token': '<init>', 'eos_token': '<eos>', 'min_freq': 5, 'shuffle': True, 'validation_split': 0.1, 'debug': True}}, 'optimizer': {'type': 'Adam', 'args': {'lr': 0.0001, 'weight_decay': 0, 'amsgrad': True}}, 'loss': 'mask_nll_loss', 'metrics': ['accuracy', 'top_k_acc'], 'lr_scheduler': {'type': 'StepLR', 'args': {'st

In [8]:
print(data_loader.TEXT)

<torchtext.data.field.Field object at 0x12b6f9220>


In [9]:
data_loader.TEXT.vocab.stoi['<pad>']

1

In [27]:
data_loader.TEXT.vocab.itos[0]

'<unk>'

In [28]:
vocab = data_loader.TEXT.vocab.itos

In [34]:
t = torch.randint(10, [10,32])

In [39]:
t

tensor([[9, 2, 1, 6, 0, 6, 6, 5, 3, 9, 4, 6, 1, 8, 0, 5, 4, 1, 6, 7, 3, 1, 2, 2,
         8, 8, 3, 3, 4, 6, 1, 2],
        [3, 6, 0, 5, 0, 5, 2, 9, 4, 0, 4, 8, 1, 2, 6, 9, 7, 5, 5, 6, 4, 4, 2, 9,
         3, 8, 2, 6, 6, 1, 2, 7],
        [3, 8, 7, 3, 3, 8, 5, 1, 9, 8, 0, 0, 7, 8, 6, 5, 1, 9, 0, 3, 5, 4, 6, 1,
         3, 6, 3, 3, 7, 1, 7, 1],
        [9, 6, 7, 9, 9, 5, 6, 6, 2, 9, 7, 4, 6, 5, 9, 8, 0, 6, 1, 4, 9, 7, 4, 0,
         5, 0, 1, 9, 5, 1, 8, 5],
        [4, 5, 6, 4, 7, 7, 4, 5, 3, 3, 0, 5, 9, 5, 3, 8, 0, 3, 7, 8, 6, 3, 4, 8,
         8, 5, 6, 7, 1, 0, 2, 0],
        [1, 2, 2, 1, 6, 0, 9, 6, 6, 1, 6, 8, 2, 4, 1, 4, 3, 5, 2, 2, 8, 9, 4, 2,
         8, 3, 4, 6, 6, 5, 6, 9],
        [9, 5, 5, 8, 9, 9, 1, 1, 2, 8, 1, 1, 3, 3, 9, 2, 0, 3, 9, 5, 8, 5, 4, 2,
         2, 3, 1, 9, 6, 9, 6, 6],
        [3, 7, 1, 4, 3, 7, 2, 2, 0, 9, 4, 3, 5, 8, 0, 7, 5, 7, 7, 9, 4, 7, 5, 1,
         0, 3, 1, 2, 6, 2, 7, 9],
        [2, 2, 1, 6, 2, 4, 5, 0, 3, 3, 7, 4, 3, 1, 1, 0, 0, 2, 4, 9, 6, 1, 9, 7,

In [42]:
print(len(data_loader.valid_iter.dataset))

99


In [26]:
for idx, batch in enumerate(data_loader.train_iter):
#     print(vars(batch))
    print(batch.talk[0][:, 0])
    print(batch.talk[1][0])
#     print(vars(batch.dataset))
#     print(vars(batch.dataset.examples[0]))
#     print(len(batch.dataset.examples))
#     for ex in batch.dataset.examples:
#         print(vars(ex))
    if idx == 100:
        break
    print("---")

tensor([  2,  71,   5,  20, 212,   5,  56,   6, 190,  13,  18, 117,   4,   3,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1])
tensor(14)
---
tensor([    2,     9,     0, 16764,    25,     0,    16,    26,    35,    78,
           11,    30,    11,  6518,     9,  1798,    16,     3,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1])
tensor(18)
---
tensor([   2,   39,   22,  211,  682,   22,   36, 1012,   19,    7,  100,  397,
         527,   42,  143,  187,   92,  338,  522,    4,    6,   93,  397,    5,
          21,    5,  143,  187,    5,   19,  337,    3])
tensor(32)
---
tensor([  2,  25,  13, 179,   8,   3,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1])
tensor(6)
---
tensor([   2,  637,  133,   29,   24,    6, 7712,   57,   20,   13,    5,  118,
           4,    

In [17]:
len(splits[0].examples)

NameError: name 'splits' is not defined

In [33]:
len(splits[1].examples)

22128

In [34]:
len(dataset.examples)

221282

In [42]:
data_loader.TEXT.vocab.stoi['<PAS>']

0

In [50]:
from torchtext.data import BucketIterator

In [51]:
train_iter = BucketIterator(splits[0], 2)



In [57]:
valid_iter = BucketIterator(splits[1], 8)

In [59]:
for i in valid_iter:
    print(i)



AttributeError: 'Field' object has no attribute 'vocab'

# Model

In [8]:
import torch.nn as nn
class ChatbotModel(nn.Module):
    def __init__(self, vocab_size, padding_idx, hidden_size, embed_size, n_layers=1, dropout=0.):
        super().__init__()
        self.vocab_size = vocab_size
        self.padding_idx = padding_idx
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.dropout = dropout
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        self.gru = nn.GRU(input_size=embed_size, hidden_size=hidden_size, num_layers=1,
                          bidirectional=True, dropout=dropout)

    def forward(self, input_seq, input_lengths, hidden=None):
        emb = self.embedding(input_seq)
        packed = nn.utils.rnn.pack_padded_sequence(emb, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden

In [9]:
model = ChatbotModel(100, 0, 256, 50)

In [10]:
vars(model)

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('embedding', Embedding(100, 50, padding_idx=0)),
              ('gru', GRU(50, 256, bidirectional=True))]),
 'vocab_size': 100,
 'padding_idx': 0,
 'n_layers': 1,
 'hidden_size': 256,
 'embed_size': 50,
 'dropout': 0.0}

In [11]:
import numpy as np

In [13]:
t = torch.tensor(np.random.rand(32, 50, 256))

In [14]:
t.size()

torch.Size([32, 50, 256])

In [16]:
t.squeeze(0).size()

torch.Size([32, 50, 256])

In [18]:
gru = nn.GRU(256, 256)

In [20]:
last_hidden = None
output, hidden = gru(torch.tensor(np.random.rand(32, 50, 256)), torch.tensor(np.random.rand(1, 50, 256)))

RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #3 'mat2' in call to _th_addmm_out

In [23]:
seq = torch.tensor(np.random.randint(5, size=(5, 3)))
lens = torch.tensor([3, 2,5,1,1])

In [24]:
seq

tensor([[0, 0, 3],
        [3, 1, 1],
        [3, 4, 3],
        [1, 4, 0],
        [0, 2, 2]])

In [25]:
lens

tensor([3, 2, 5, 1, 1])

In [26]:
import torch.nn as nn
packed = nn.utils.rnn.pack_padded_sequence(seq, lens, batch_first=True, enforce_sorted=False)

In [27]:
packed

PackedSequence(data=tensor([3, 0, 3, 1, 0, 4, 0, 1, 3, 3]), batch_sizes=tensor([5, 3, 2, 1, 1]), sorted_indices=tensor([2, 0, 1, 3, 4]), unsorted_indices=tensor([1, 2, 0, 3, 4]))

In [28]:
packed.data

tensor([3, 0, 3, 1, 0, 4, 0, 1, 3, 3])

In [29]:
packed.data.size()

torch.Size([10])

In [30]:
emb = nn.Embedding(100, 50)

In [33]:
emb(packed)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not PackedSequence

In [34]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

# NLL Loss

In [38]:
import numpy as np
import torch.nn as nn

In [41]:
np.log(0.5)

-0.6931471805599453

In [26]:
encoder = config.init_obj(
        'encoder_arch', module_arch,
        vocab_size=data_loader.vocab_size,
        padding_idx=data_loader.padding_idx,
        hidden_size=config['hidden_size'],
        embed_size=config['embed_size']
    )

KeyError: 'encoder_arch'

In [30]:
encoder = config.init_obj(
        'encoder_arch', module_arch,
        vocab_size=data_loader.vocab_size,
        padding_idx=data_loader.padding_idx,
        hidden_size=config['hidden_size'],
        embed_size=config['embed_size']
    )



In [31]:
vars(encoder)

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('embedding', Embedding(21928, 50, padding_idx=1)),
              ('gru', GRU(50, 256, dropout=0.1, bidirectional=True))]),
 'vocab_size': 21928,
 'padding_idx': 1,
 'n_layers': 1,
 'hidden_size': 256,
 'embed_size': 50,
 'dropout': 0.1}

In [35]:
print(type(encoder).__name__)

ChatbotEncoder


In [15]:
import torch.nn as nn
rnn = nn.GRU(input_size=50, hidden_size=64, num_layers=2, bidirectional=True)

In [17]:
a = torch.randn(20, 32, 50)

In [18]:
h = torch.randn(2 * 2, 32, 64)

In [19]:
output, hidden = rnn(a, h)

In [20]:
output.shape

torch.Size([20, 32, 128])

In [21]:
hidden.shape

torch.Size([4, 32, 64])

In [22]:
hidden[:2].shape

torch.Size([2, 32, 64])

In [49]:
hidden[:1].shape

torch.Size([1, 32, 64])

In [51]:
hidden[:1]

tensor([[[-0.1238, -0.0801, -0.0751,  ..., -0.1777,  0.0315,  0.2147],
         [ 0.2452, -0.5233, -0.1172,  ...,  0.0260, -0.2557,  0.0231],
         [-0.3241, -0.2272, -0.1784,  ..., -0.1512, -0.1940,  0.5007],
         ...,
         [ 0.2113,  0.1761, -0.4286,  ..., -0.1313, -0.4878,  0.1818],
         [ 0.1389,  0.0497,  0.0418,  ..., -0.0419,  0.0566,  0.3900],
         [ 0.4436,  0.0784, -0.3300,  ..., -0.0707, -0.1370, -0.1538]]],
       grad_fn=<SliceBackward>)

In [23]:
hidden[-1] == output

RuntimeError: The size of tensor a (64) must match the size of tensor b (128) at non-singleton dimension 2

In [53]:
hidden[0]

tensor([[-0.1238, -0.0801, -0.0751,  ..., -0.1777,  0.0315,  0.2147],
        [ 0.2452, -0.5233, -0.1172,  ...,  0.0260, -0.2557,  0.0231],
        [-0.3241, -0.2272, -0.1784,  ..., -0.1512, -0.1940,  0.5007],
        ...,
        [ 0.2113,  0.1761, -0.4286,  ..., -0.1313, -0.4878,  0.1818],
        [ 0.1389,  0.0497,  0.0418,  ..., -0.0419,  0.0566,  0.3900],
        [ 0.4436,  0.0784, -0.3300,  ..., -0.0707, -0.1370, -0.1538]],
       grad_fn=<SelectBackward>)

In [12]:
encoder = config.init_obj(
    'encoder_arch', module_arch,
    vocab_size=data_loader.vocab_size,
    padding_idx=data_loader.padding_idx,
    hidden_size=config['hidden_size'],
    embed_size=config['embed_size']
)
logger.info(encoder)
decoder = config.init_obj(
    'decoder_arch', module_arch,
    embedding=encoder.embedding,
    hidden_size=config['hidden_size'],
    vocab_size=data_loader.vocab_size
)
logger.info(decoder)

ChatbotEncoder(
  (embedding): Embedding(100, 50, padding_idx=1)
  (gru): GRU(50, 256, dropout=0.1, bidirectional=True)
)
Trainable parameters: 478088
LuongAttnDecoderRNN(
  (embedding): Embedding(100, 50, padding_idx=1)
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(256, 256, dropout=0.1)
  (concat): Linear(in_features=512, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=100, bias=True)
  (attn): Attention(
    (attn): Linear(in_features=512, out_features=256, bias=True)
  )
)
Trainable parameters: 688364




In [24]:
decoder.n_layers

1

In [28]:
output.shape

torch.Size([20, 32, 128])

In [29]:
output

tensor([[[-4.0961e-01, -3.1895e-01, -6.4349e-02,  ...,  1.8628e-01,
           4.3406e-01, -1.7006e-03],
         [-8.6427e-01, -4.6732e-01, -1.0612e-01,  ..., -3.1088e-01,
          -1.8132e-01,  2.2435e-01],
         [-4.6097e-01,  4.7562e-01, -1.4677e-01,  ..., -2.4477e-02,
           1.8857e-01,  3.1170e-01],
         ...,
         [ 9.7691e-01, -6.8664e-01,  3.9081e-01,  ...,  7.2344e-02,
          -2.9403e-01, -1.6083e-01],
         [ 1.4220e-01, -7.6799e-01,  1.2404e-01,  ...,  3.6330e-01,
          -3.6989e-01,  4.0256e-01],
         [-1.1044e+00, -2.8971e-01,  1.0354e+00,  ...,  6.3056e-02,
          -3.6093e-01,  4.2395e-02]],

        [[-4.2769e-02, -1.7248e-01,  6.9871e-02,  ..., -1.1662e-01,
           3.2323e-01,  1.0613e-01],
         [-6.9177e-01, -2.7251e-01, -6.4256e-03,  ..., -1.9466e-01,
          -1.2310e-01,  2.2617e-01],
         [-9.1384e-02,  3.9609e-01, -2.1645e-01,  ..., -6.9060e-02,
           1.6677e-01,  1.4670e-01],
         ...,
         [ 6.9007e-01, -3

In [30]:
output.tanh()

tensor([[[-3.8814e-01, -3.0856e-01, -6.4260e-02,  ...,  1.8416e-01,
           4.0871e-01, -1.7006e-03],
         [-6.9845e-01, -4.3603e-01, -1.0573e-01,  ..., -3.0124e-01,
          -1.7935e-01,  2.2066e-01],
         [-4.3087e-01,  4.4273e-01, -1.4572e-01,  ..., -2.4472e-02,
           1.8637e-01,  3.0199e-01],
         ...,
         [ 7.5173e-01, -5.9582e-01,  3.7206e-01,  ...,  7.2218e-02,
          -2.8584e-01, -1.5946e-01],
         [ 1.4124e-01, -6.4576e-01,  1.2341e-01,  ...,  3.4811e-01,
          -3.5390e-01,  3.8214e-01],
         [-8.0207e-01, -2.8187e-01,  7.7608e-01,  ...,  6.2973e-02,
          -3.4604e-01,  4.2370e-02]],

        [[-4.2743e-02, -1.7079e-01,  6.9757e-02,  ..., -1.1610e-01,
           3.1242e-01,  1.0573e-01],
         [-5.9912e-01, -2.6596e-01, -6.4255e-03,  ..., -1.9224e-01,
          -1.2248e-01,  2.2239e-01],
         [-9.1130e-02,  3.7660e-01, -2.1313e-01,  ..., -6.8950e-02,
           1.6524e-01,  1.4566e-01],
         ...,
         [ 5.9803e-01, -3

In [32]:
h_reshape = hidden.view(2, 2, -1, 64)

In [33]:
h_reshape.shape

torch.Size([2, 2, 32, 64])

In [36]:
h_reshape[-1:, :1].shape

torch.Size([1, 1, 32, 64])

In [39]:
[1,2,3][-1:]

[3]

In [41]:
l = [[[1,2],[3,4]],[4,5,6]]

In [44]:
import numpy as np

In [49]:
torch.randn(4,5,6)[0:1].shape

torch.Size([1, 5, 6])

In [47]:
(torch.ones(1, 1, dtype=torch.long) * 3)

tensor([[3]])

In [48]:
torch.zeros([0])

tensor([])

In [49]:
data_loader.valid_iter.dataset

<torchtext.data.dataset.TabularDataset at 0x1324b2520>

In [6]:
t = torch.zeros(10)

In [8]:
t.device

device(type='cpu')

In [14]:
input_t = torch.randn(3, 10)
target = torch.tensor([0, 5, 2])

In [16]:
torch.nn.functional.nll_loss(input_t, target, reduce=False)



tensor([-0.3374,  1.6033, -1.8319])

In [38]:
x = torch.randn(5, 10)
y = torch.randint(10, (5,))

In [39]:
x.shape

torch.Size([5, 10])

In [40]:
y.shape

torch.Size([5])

In [41]:
mask = torch.randint(32, (5,))

In [42]:
g = torch.gather(x, dim=1, index=y.view(-1, 1))

In [43]:
g

tensor([[-1.1415],
        [ 0.3589],
        [-1.5390],
        [-0.4349],
        [ 2.0616]])

In [44]:
x

tensor([[ 0.6532,  0.3980,  2.1203, -1.9028, -0.9555, -1.1577,  1.3683, -1.1415,
          1.1474, -1.8784],
        [-0.9788, -0.4524,  0.6420,  0.4371,  1.6494, -0.3803, -1.8457, -0.1763,
          0.3589, -0.5085],
        [ 0.4091, -0.8702, -0.2526,  0.2752, -0.3832, -0.4173, -0.0355, -0.2302,
         -1.5390, -0.4670],
        [-0.2825, -0.0398,  0.7663,  0.0687,  0.3328, -0.4308, -0.2603,  1.2306,
         -1.2975, -0.4349],
        [-1.0758, -0.8776,  0.9318,  0.6979, -1.3016, -0.6882, -0.2885,  0.9103,
          2.0616,  1.9427]])

In [45]:
y

tensor([7, 8, 8, 9, 8])

In [50]:
torch.log(torch.tensor([1.]))

tensor([0.])

In [54]:
torch.zeros(10).masked_select(torch.tensor([False for _ in range(10)]))

tensor([])

In [57]:
for i in range(torch.randint(100, (32,)).max()):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86


In [75]:
p = data_loader.TEXT.preprocess("test, im a good bot")

In [76]:
p

['test', ',', 'i', 'm', 'a', 'good', 'bot']

In [77]:
data_loader.TEXT.numericalize(data_loader.TEXT.pad([p]))

(tensor([[   2],
         [1104],
         [   5],
         [   7],
         [1441],
         [  12],
         [  90],
         [   0],
         [   3],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1],
         [   1]]),
 tensor([9]))

In [73]:
data_loader.TEXT.pad(p)

([['<START>',
   't',
   'e',
   's',
   't',
   ',',
   '<END>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>'],
  ['<START>',
   'i',
   'm',
   '<END>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>'],
  ['<START>',
   'a',
   '<END>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '<pad>',
   '

In [78]:
vars(data_loader.TEXT)

{'sequential': True,
 'use_vocab': True,
 'init_token': '<START>',
 'eos_token': '<END>',
 'unk_token': '<unk>',
 'fix_length': 32,
 'preprocessing': None,
 'postprocessing': None,
 'lower': True,
 'tokenizer_args': (<bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x127591a60>>,
  'en'),
 'include_lengths': True,
 'batch_first': False,
 'pad_token': '<pad>',
 'pad_first': False,
 'truncate_first': False,
 'stop_words': None,
 'is_target': False,
 'dtype': torch.int64,
 'tokenize': <bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x127591a60>>,
 'vocab': <torchtext.vocab.Vocab at 0x151d14cd0>}

In [81]:
vars(data_loader.TEXT.vocab).keys()

dict_keys(['freqs', 'itos', 'unk_index', 'stoi', 'vectors'])

In [82]:
vars(data_loader.TEXT)

{'sequential': True,
 'use_vocab': True,
 'init_token': '<START>',
 'eos_token': '<END>',
 'unk_token': '<unk>',
 'fix_length': 32,
 'preprocessing': None,
 'postprocessing': None,
 'lower': True,
 'tokenizer_args': (<bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x127591a60>>,
  'en'),
 'include_lengths': True,
 'batch_first': False,
 'pad_token': '<pad>',
 'pad_first': False,
 'truncate_first': False,
 'stop_words': None,
 'is_target': False,
 'dtype': torch.int64,
 'tokenize': <bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x127591a60>>,
 'vocab': <torchtext.vocab.Vocab at 0x151d14cd0>}

In [84]:
vars(data_loader.TEXT.vocab.itos['<START>'])

TypeError: list indices must be integers or slices, not str

In [88]:
list(filter(lambda x: x!=1, [1,2,3,4]))[1:-1]

[3]