In [1]:
import argparse
import collections
import torch
import numpy as np
import data_loader.data_loaders as module_data
import data_loader.preprocess as module_preprocess
import model.loss as module_loss
import model.metric as module_metric
import model.model as module_arch
from parse_config import ConfigParser
from trainer import Trainer


# fix random seeds for reproducibility
SEED = 123
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)

In [2]:
import json
config = ConfigParser(json.load(open('config.json', 'r')))

In [3]:
logger = config.get_logger('train')
# preprocess
# config.init_obj('preprocess', module_preprocess)

# setup data_loader instances
data_loader = config.init_obj('data_loader', module_data, save_dir=config.save_dir)



In [4]:
data_loader.TEXT.vocab.itos[:5]

['<unk>', '<pad>', '<init>', '<eos>', '.']

In [5]:
print(vars(data_loader.dataset.examples[0]))

{'talk': ['can', 'we', 'make', 'this', 'quick', '?', ' ', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break-', 'up', 'on', 'the', 'quad', '.', ' ', 'again', '.'], 'response': ['well', ',', 'i', 'thought', 'we', "'d", 'start', 'with', 'pronunciation', ',', 'if', 'that', "'s", 'okay', 'with', 'you', '.']}


In [6]:
print(vars(data_loader.TEXT))

{'sequential': True, 'use_vocab': True, 'init_token': '<init>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'fix_length': 32, 'dtype': torch.int64, 'preprocessing': None, 'postprocessing': None, 'lower': True, 'tokenizer_args': (<bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x12d8e2220>>, 'en'), 'tokenize': <bound method ChatbotDataLoader._tokenizer of <data_loader.data_loaders.ChatbotDataLoader object at 0x12d8e2220>>, 'include_lengths': False, 'batch_first': False, 'pad_token': '<pad>', 'pad_first': False, 'truncate_first': False, 'stop_words': None, 'is_target': False, 'vocab': <torchtext.vocab.Vocab object at 0x12f8353d0>}


In [7]:
print(config.__dict__)

{'_config': {'name': 'Chatbot', 'n_gpu': 1, 'arch': {'type': 'ChatbotModel', 'args': {}}, 'preprocess': {'type': 'ChatbotDataPreprocess', 'args': {'data_dir': 'data/cornell movie-dialogs corpus'}}, 'data_loader': {'type': 'ChatbotDataLoader', 'args': {'data_dir': 'data/cornell movie-dialogs corpus', 'filename': 'formatted_movie_lines.csv', 'text_field_path': None, 'vocab_path': None, 'batch_size': 128, 'sent_len': 32, 'init_token': '<init>', 'eos_token': '<eos>', 'min_freq': 5, 'shuffle': True, 'validation_split': 0.1}}, 'optimizer': {'type': 'Adam', 'args': {'lr': 0.001, 'weight_decay': 0, 'amsgrad': True}}, 'loss': 'nll_loss', 'metrics': ['accuracy', 'top_k_acc'], 'lr_scheduler': {'type': 'StepLR', 'args': {'step_size': 50, 'gamma': 0.1}}, 'trainer': {'epochs': 10, 'save_dir': 'saved/', 'save_period': 1, 'verbosity': 2, 'monitor': 'min val_loss', 'early_stop': 2, 'tensorboard': True}}, 'resume': None, '_save_dir': PosixPath('saved/models/Chatbot/1015_173207'), '_log_dir': PosixPath('

In [49]:
print(data_loader.TEXT)

<torchtext.data.field.Field object at 0x10c025f40>


In [11]:
data_loader.TEXT.vocab.pad

AttributeError: 'Vocab' object has no attribute 'pad'

In [8]:
datasets = data_loader.dataset.split(split_ratio=0.9)

In [15]:
from torchtext.data import BucketIterator

In [18]:
TEXT.build_vocab(dataset, max_size=20000)

In [23]:
data_iter = data_loader.train_iter

In [10]:
for idx, i in enumerate(data_loader.train_iter):
    if idx == 5:
        break
    print(vars(i))

{'batch_size': 128, 'dataset': <torchtext.data.dataset.Dataset object at 0x1539d82b0>, 'fields': dict_keys(['talk', 'response']), 'input_fields': ['talk', 'response'], 'target_fields': [], 'talk': tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [ 83,  83, 281,  ...,  20,  18,  86],
        [ 33,  37, 366,  ..., 212,  14,   5],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]]), 'response': tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [  52,   77,    6,  ..., 6838,    7,   89],
        [  25,   17,  116,  ...,    4, 2493,   45],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])}
{'batch_size': 128, 'dataset': <torchtext.data.dataset.Dataset object at 0x1539d82b0>, 'fields': dict_keys(['talk', 'response']), 'input_fields': ['talk', 'response'], 'targ



In [32]:
len(splits[0].examples)

199154

In [33]:
len(splits[1].examples)

22128

In [34]:
len(dataset.examples)

221282

In [42]:
data_loader.TEXT.vocab.stoi['<PAS>']

0

In [50]:
from torchtext.data import BucketIterator

In [51]:
train_iter = BucketIterator(splits[0], 2)



In [57]:
valid_iter = BucketIterator(splits[1], 8)

In [59]:
for i in valid_iter:
    print(i)



AttributeError: 'Field' object has no attribute 'vocab'