In [1]:
import argparse
import collections
import torch
import numpy as np
import data_loader.data_loaders as module_data
import data_loader.preprocess as module_preprocess
import model.loss as module_loss
import model.metric as module_metric
import model.model as module_arch
from parse_config import ConfigParser
from trainer import Trainer


# fix random seeds for reproducibility
SEED = 123
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)

In [27]:
import json
config = ConfigParser(json.load(open('config.json', 'r')))

In [3]:
logger = config.get_logger('train')
# preprocess
# config.init_obj('preprocess', module_preprocess)

# setup data_loader instances
data_loader = config.init_obj('data_loader', module_data, save_dir=config.save_dir)



In [28]:
data_loader.TEXT.vocab.itos[:5]

['<unk>', '<pad>', '<init>', '<eos>', '.']

In [29]:
print(vars(data_loader.dataset.examples[0]))

{'talk': ['can', 'we', 'make', 'this', 'quick', '?', ' ', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break-', 'up', 'on', 'the', 'quad', '.', ' ', 'again', '.'], 'response': ['well', ',', 'i', 'thought', 'we', "'d", 'start', 'with', 'pronunciation', ',', 'if', 'that', "'s", 'okay', 'with', 'you', '.']}


In [6]:
print(vars(data_loader.TEXT))

{'sequential': True, 'use_vocab': True, 'init_token': '<init>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'fix_length': 32, 'dtype': torch.int64, 'preprocessing': None, 'postprocessing': None, 'lower': True, 'tokenizer_args': (<bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x129aec190>>, 'en'), 'tokenize': <bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x129aec190>>, 'include_lengths': True, 'batch_first': False, 'pad_token': '<pad>', 'pad_first': False, 'truncate_first': False, 'stop_words': None, 'is_target': False, 'vocab': <torchtext.vocab.Vocab object at 0x12b6f91f0>}


In [7]:
print(config.__dict__)

{'_config': {'name': 'Chatbot', 'n_gpu': 1, 'arch': {'type': 'ChatbotModel', 'args': {}}, 'preprocess': {'type': 'ChatbotDataPreprocess', 'args': {'data_dir': 'data/cornell movie-dialogs corpus'}}, 'data_loader': {'type': 'ChatbotDataLoader', 'args': {'data_dir': 'data/cornell movie-dialogs corpus', 'filename': 'formatted_movie_lines.csv', 'text_field_path': None, 'vocab_path': None, 'batch_size': 128, 'sent_len': 32, 'init_token': '<init>', 'eos_token': '<eos>', 'min_freq': 5, 'shuffle': True, 'validation_split': 0.1}}, 'optimizer': {'type': 'Adam', 'args': {'lr': 0.001, 'weight_decay': 0, 'amsgrad': True}}, 'loss': 'mask_nll_loss', 'metrics': ['accuracy', 'top_k_acc'], 'lr_scheduler': {'type': 'StepLR', 'args': {'step_size': 50, 'gamma': 0.1}}, 'trainer': {'epochs': 10, 'save_dir': 'saved/', 'save_period': 1, 'verbosity': 2, 'monitor': 'min val_loss', 'early_stop': 2, 'tensorboard': True}}, 'resume': None, '_save_dir': PosixPath('saved/models/Chatbot/1020_211127'), '_log_dir': PosixP

In [8]:
print(data_loader.TEXT)

<torchtext.data.field.Field object at 0x12b6f9220>


In [9]:
data_loader.TEXT.vocab.stoi['<pad>']

1

In [10]:
from torchtext.data import BucketIterator

In [24]:
for idx, batch in enumerate(data_loader.train_iter):
    print(batch)
    if idx == 3:
        break
    print(batch.talk[0])
    print((batch.talk[0] != 1))
    print(batch.talk[1])
    print("---")


[torchtext.data.batch.Batch of size 128]
	[.talk]:('[torch.LongTensor of size 32x128]', '[torch.LongTensor of size 128]')
	[.response]:('[torch.LongTensor of size 32x128]', '[torch.LongTensor of size 128]')
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [137,   6,  29,  ...,  39,  86,  19],
        [ 26,  93,  67,  ...,   5,   8,  18],
        ...,
        [  1,   1,   1,  ...,   1,   1, 140],
        [  1,   1,   1,  ...,   1,   1,  13],
        [  1,   1,   1,  ...,   1,   1,   3]])
tensor([[ True,  True,  True,  ...,  True,  True,  True],
        [ True,  True,  True,  ...,  True,  True,  True],
        [ True,  True,  True,  ...,  True,  True,  True],
        ...,
        [False, False, False,  ..., False, False,  True],
        [False, False, False,  ..., False, False,  True],
        [False, False, False,  ..., False, False,  True]])
tensor([13, 15, 11,  6,  6, 14,  8, 13,  9,  8, 22, 31,  6,  8, 20,  5, 32,  4,
        12, 26,  7, 19,  7,  7, 11,  6, 11, 21,  7,  6, 11, 

In [17]:
len(splits[0].examples)

NameError: name 'splits' is not defined

In [33]:
len(splits[1].examples)

22128

In [34]:
len(dataset.examples)

221282

In [42]:
data_loader.TEXT.vocab.stoi['<PAS>']

0

In [50]:
from torchtext.data import BucketIterator

In [51]:
train_iter = BucketIterator(splits[0], 2)



In [57]:
valid_iter = BucketIterator(splits[1], 8)

In [59]:
for i in valid_iter:
    print(i)



AttributeError: 'Field' object has no attribute 'vocab'

# Model

In [8]:
import torch.nn as nn
class ChatbotModel(nn.Module):
    def __init__(self, vocab_size, padding_idx, hidden_size, embed_size, n_layers=1, dropout=0.):
        super().__init__()
        self.vocab_size = vocab_size
        self.padding_idx = padding_idx
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.dropout = dropout
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        self.gru = nn.GRU(input_size=embed_size, hidden_size=hidden_size, num_layers=1,
                          bidirectional=True, dropout=dropout)

    def forward(self, input_seq, input_lengths, hidden=None):
        emb = self.embedding(input_seq)
        packed = nn.utils.rnn.pack_padded_sequence(emb, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden

In [9]:
model = ChatbotModel(100, 0, 256, 50)

In [10]:
vars(model)

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('embedding', Embedding(100, 50, padding_idx=0)),
              ('gru', GRU(50, 256, bidirectional=True))]),
 'vocab_size': 100,
 'padding_idx': 0,
 'n_layers': 1,
 'hidden_size': 256,
 'embed_size': 50,
 'dropout': 0.0}

In [11]:
import numpy as np

In [13]:
t = torch.tensor(np.random.rand(32, 50, 256))

In [14]:
t.size()

torch.Size([32, 50, 256])

In [16]:
t.squeeze(0).size()

torch.Size([32, 50, 256])

In [18]:
gru = nn.GRU(256, 256)

In [20]:
last_hidden = None
output, hidden = gru(torch.tensor(np.random.rand(32, 50, 256)), torch.tensor(np.random.rand(1, 50, 256)))

RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #3 'mat2' in call to _th_addmm_out

In [23]:
seq = torch.tensor(np.random.randint(5, size=(5, 3)))
lens = torch.tensor([3, 2,5,1,1])

In [24]:
seq

tensor([[0, 0, 3],
        [3, 1, 1],
        [3, 4, 3],
        [1, 4, 0],
        [0, 2, 2]])

In [25]:
lens

tensor([3, 2, 5, 1, 1])

In [26]:
import torch.nn as nn
packed = nn.utils.rnn.pack_padded_sequence(seq, lens, batch_first=True, enforce_sorted=False)

In [27]:
packed

PackedSequence(data=tensor([3, 0, 3, 1, 0, 4, 0, 1, 3, 3]), batch_sizes=tensor([5, 3, 2, 1, 1]), sorted_indices=tensor([2, 0, 1, 3, 4]), unsorted_indices=tensor([1, 2, 0, 3, 4]))

In [28]:
packed.data

tensor([3, 0, 3, 1, 0, 4, 0, 1, 3, 3])

In [29]:
packed.data.size()

torch.Size([10])

In [30]:
emb = nn.Embedding(100, 50)

In [33]:
emb(packed)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not PackedSequence

In [34]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

# NLL Loss

In [38]:
import numpy as np
import torch.nn as nn

In [41]:
np.log(0.5)

-0.6931471805599453

In [26]:
encoder = config.init_obj(
        'encoder_arch', module_arch,
        vocab_size=data_loader.vocab_size,
        padding_idx=data_loader.padding_idx,
        hidden_size=config['hidden_size'],
        embed_size=config['embed_size']
    )

KeyError: 'encoder_arch'

In [30]:
encoder = config.init_obj(
        'encoder_arch', module_arch,
        vocab_size=data_loader.vocab_size,
        padding_idx=data_loader.padding_idx,
        hidden_size=config['hidden_size'],
        embed_size=config['embed_size']
    )



In [31]:
vars(encoder)

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('embedding', Embedding(21928, 50, padding_idx=1)),
              ('gru', GRU(50, 256, dropout=0.1, bidirectional=True))]),
 'vocab_size': 21928,
 'padding_idx': 1,
 'n_layers': 1,
 'hidden_size': 256,
 'embed_size': 50,
 'dropout': 0.1}

In [35]:
print(type(encoder).__name__)

ChatbotEncoder
