In [1]:
import argparse
import collections
import torch
import numpy as np
import data_loader.data_loaders as module_data
import data_loader.preprocess as module_preprocess
import model.loss as module_loss
import model.metric as module_metric
import model.model as module_arch
from parse_config import ConfigParser
from trainer import Trainer


# fix random seeds for reproducibility
SEED = 123
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)

In [2]:
import json
config = ConfigParser(json.load(open('config.json', 'r')))

In [3]:
logger = config.get_logger('train')
# preprocess
# config.init_obj('preprocess', module_preprocess)

# setup data_loader instances
data_loader = config.init_obj('data_loader', module_data, save_dir=config.save_dir)



In [4]:
data_loader.TEXT.vocab.itos[:5]

['<unk>', '<pad>', '<init>', '<eos>', '.']

In [5]:
print(vars(data_loader.dataset.examples[0]))

{'talk': ['can', 'we', 'make', 'this', 'quick', '?', ' ', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break-', 'up', 'on', 'the', 'quad', '.', ' ', 'again', '.'], 'response': ['well', ',', 'i', 'thought', 'we', "'d", 'start', 'with', 'pronunciation', ',', 'if', 'that', "'s", 'okay', 'with', 'you', '.']}


In [6]:
print(vars(data_loader.TEXT))

{'sequential': True, 'use_vocab': True, 'init_token': '<init>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'fix_length': 32, 'dtype': torch.int64, 'preprocessing': None, 'postprocessing': None, 'lower': True, 'tokenizer_args': (<bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x1029d1eb0>>, 'en'), 'tokenize': <bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x1029d1eb0>>, 'include_lengths': True, 'batch_first': False, 'pad_token': '<pad>', 'pad_first': False, 'truncate_first': False, 'stop_words': None, 'is_target': False, 'vocab': <torchtext.vocab.Vocab object at 0x12243f310>}


In [7]:
print(config.__dict__)

{'_config': {'name': 'Chatbot', 'n_gpu': 1, 'arch': {'type': 'ChatbotModel', 'args': {}}, 'preprocess': {'type': 'ChatbotDataPreprocess', 'args': {'data_dir': 'data/cornell movie-dialogs corpus'}}, 'data_loader': {'type': 'ChatbotDataLoader', 'args': {'data_dir': 'data/cornell movie-dialogs corpus', 'filename': 'formatted_movie_lines.csv', 'text_field_path': None, 'vocab_path': None, 'batch_size': 128, 'sent_len': 32, 'init_token': '<init>', 'eos_token': '<eos>', 'min_freq': 5, 'shuffle': True, 'validation_split': 0.1}}, 'optimizer': {'type': 'Adam', 'args': {'lr': 0.001, 'weight_decay': 0, 'amsgrad': True}}, 'loss': 'nll_loss', 'metrics': ['accuracy', 'top_k_acc'], 'lr_scheduler': {'type': 'StepLR', 'args': {'step_size': 50, 'gamma': 0.1}}, 'trainer': {'epochs': 10, 'save_dir': 'saved/', 'save_period': 1, 'verbosity': 2, 'monitor': 'min val_loss', 'early_stop': 2, 'tensorboard': True}}, 'resume': None, '_save_dir': PosixPath('saved/models/Chatbot/1015_175112'), '_log_dir': PosixPath('

In [8]:
print(data_loader.TEXT)

<torchtext.data.field.Field object at 0x12243f340>


In [9]:
data_loader.TEXT.vocab.stoi['<pad>']

1

In [8]:
datasets = data_loader.dataset.split(split_ratio=0.9)

In [15]:
from torchtext.data import BucketIterator

In [18]:
TEXT.build_vocab(dataset, max_size=20000)

In [23]:
data_iter = data_loader.train_iter

In [12]:
for idx, batch in enumerate(data_loader.train_iter):
    if idx == 5:
        break
    print(batch.talk[0])
    print(batch.talk[1])
    print("---")

tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [1949,   16,   52,  ...,    6,  184,   21],
        [   5,  111,  176,  ...,   17,   15,    8],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([20,  8, 12, 22, 18,  4, 32, 10,  7, 32, 17,  5,  4, 14,  8, 32, 11,  7,
        11, 10,  8,  6, 16, 14,  5, 11, 32, 12,  6, 14,  7, 32,  7,  8,  9, 32,
        14, 32, 22,  7,  6,  7, 24, 13, 31, 32, 16, 20, 32, 23,  8, 15, 17, 32,
        17,  9, 18, 32, 18,  7,  4, 14, 10, 27,  5, 23,  8,  8, 16, 14, 32, 22,
        29, 12,  8,  8,  5, 11, 11, 17, 11, 19,  6, 20, 16, 13, 23, 23, 12, 28,
        21, 12, 17, 13, 19,  6,  5, 11,  7, 10,  6, 10, 13, 17,  4, 13,  6,  4,
         6, 32, 10, 14,  8, 20,  6, 13, 25,  4, 18, 19, 10,  6, 10,  5, 16, 15,
        13,  4])
---
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [ 63,   6,  86,  ..., 178,   7,   6],
 



In [32]:
len(splits[0].examples)

199154

In [33]:
len(splits[1].examples)

22128

In [34]:
len(dataset.examples)

221282

In [42]:
data_loader.TEXT.vocab.stoi['<PAS>']

0

In [50]:
from torchtext.data import BucketIterator

In [51]:
train_iter = BucketIterator(splits[0], 2)



In [57]:
valid_iter = BucketIterator(splits[1], 8)

In [59]:
for i in valid_iter:
    print(i)



AttributeError: 'Field' object has no attribute 'vocab'

# Model

In [8]:
import torch.nn as nn
class ChatbotModel(nn.Module):
    def __init__(self, vocab_size, padding_idx, hidden_size, embed_size, n_layers=1, dropout=0.):
        super().__init__()
        self.vocab_size = vocab_size
        self.padding_idx = padding_idx
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.dropout = dropout
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        self.gru = nn.GRU(input_size=embed_size, hidden_size=hidden_size, num_layers=1,
                          bidirectional=True, dropout=dropout)

    def forward(self, input_seq, input_lengths, hidden=None):
        emb = self.embedding(input_seq)
        packed = nn.utils.rnn.pack_padded_sequence(emb, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden

In [9]:
model = ChatbotModel(100, 0, 256, 50)

In [10]:
vars(model)

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('embedding', Embedding(100, 50, padding_idx=0)),
              ('gru', GRU(50, 256, bidirectional=True))]),
 'vocab_size': 100,
 'padding_idx': 0,
 'n_layers': 1,
 'hidden_size': 256,
 'embed_size': 50,
 'dropout': 0.0}

In [11]:
import numpy as np

In [13]:
t = torch.tensor(np.random.rand(32, 50, 256))

In [14]:
t.size()

torch.Size([32, 50, 256])

In [16]:
t.squeeze(0).size()

torch.Size([32, 50, 256])

In [18]:
gru = nn.GRU(256, 256)

In [20]:
last_hidden = None
output, hidden = gru(torch.tensor(np.random.rand(32, 50, 256)), torch.tensor(np.random.rand(1, 50, 256)))

RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #3 'mat2' in call to _th_addmm_out