In [13]:
import torch

import nltk
# Fetch the NLTK 'punkt' data package; when it is already installed locally
# the call only reports that the package is up to date.
# NOTE(review): presumably needed by src.tokenizer.Tokenizer — confirm.
nltk.download('punkt')

from src.data_parser import DataParser
from src.tokenizer import Tokenizer
from src.dataset import NMTDataset
from src.models import NMTModel, Encoder, Decoder
from src.utils import seed_all

[nltk_data] Downloading package punkt to /home/egor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
# Fix the seed (42) through the project helper before any data splitting or
# model construction happens.
# NOTE(review): presumably seeds every RNG in use (random / numpy / torch) —
# confirm in src.utils.
seed_all(42)

In [5]:
# Open the corpus file and split it into two collections, one per language.
# NOTE(review): presumably `eng[i]` and `ru[i]` are translations of each other —
# confirm against src.data_parser.
data_parser = DataParser('./data/rus.txt')
eng, ru = data_parser.split_by_languages()

In [7]:
# Build one vocabulary per language from the corresponding sentences.
# NOTE(review): presumably each call writes the vocabulary to the given path,
# since the same paths are read back below — confirm in src.tokenizer.
# NOTE(review): the meaning of `threshold=0.7` (passed for Russian only) is
# defined in src.tokenizer — confirm before tuning it.
Tokenizer.build_vocab(eng, './data/vocab_eng.txt')
Tokenizer.build_vocab(ru, './data/vocab_ru.txt', threshold=0.7)

# One tokenizer per language, each pointed at its own vocabulary file.
eng_tokenizer = Tokenizer('eng', './data/vocab_eng.txt')
ru_tokenizer = Tokenizer('ru', './data/vocab_ru.txt')

In [8]:
# Round-trip sanity check: tokenize -> encode -> decode one sentence per
# language and print every intermediate representation.
test_strings = ['I love dogs and cats', 'Я люблю собак и кошек']
inform = ['---------- test eng ----------', '---------- test ru ----------']
tokenizers = [eng_tokenizer, ru_tokenizer]

for inf, test_str, tokenizer in zip(inform, test_strings, tokenizers):
    # Banner followed by the raw sentence, one per line.
    print(inf, test_str, sep='\n')

    tokenized = tokenizer.tokenize(test_str)
    print(tokenized)

    encoded = tokenizer.encode(tokenized)
    print(encoded)

    # Decoding the ids should reproduce the token list printed above.
    decoded = tokenizer.decode(encoded)
    print(decoded)

---------- test eng ----------
I love dogs and cats
['<BOS>', 'i', 'love', 'dogs', 'and', 'cats', '<EOS>']
[0, 5, 169, 740, 46, 804, 1]
['<BOS>', 'i', 'love', 'dogs', 'and', 'cats', '<EOS>']
---------- test ru ----------
Я люблю собак и кошек
['<BOS>', 'я', 'люблю', 'собак', 'и', 'кошек', '<EOS>']
[0, 6, 172, 1417, 26, 1575, 1]
['<BOS>', 'я', 'люблю', 'собак', 'и', 'кошек', '<EOS>']


In [15]:
# Central hyper-parameter table. Each sub-dict is later unpacked with `**`
# into the constructor it is named after, or indexed directly.
config = {}

# Keyword arguments forwarded to NMTDataset.
config['dataset'] = {'source_pad_len': 100, 'target_pad_len': 100}

# Batch sizes for the training and evaluation DataLoaders.
config['dataloader'] = {'train_bs': 20, 'test_bs': 20}

# Encoder works on the English vocabulary.
config['encoder_cfg'] = {
    'vocab_size': eng_tokenizer.get_vocab_size(),
    'embedding_size': 256,
    'hidden_size': 128,
}

# Decoder works on the Russian vocabulary.
config['decoder_cfg'] = {
    'vocab_size': ru_tokenizer.get_vocab_size(),
    'embedding_size': 256,
    'hidden_size': 128,
}

# Optimizer settings.
config['optim'] = {'lr': 1e-4}

In [12]:
# Split the corpus into a training part and a held-out part.
# NOTE(review): 0.9 is presumably the fraction assigned to `train` — confirm
# in src.data_parser.
train, test = data_parser.train_test_split(0.9)

train_dataset = NMTDataset(train, eng_tokenizer, ru_tokenizer, **config['dataset'])
# The evaluation dataset must be built from the held-out `test` split, not from
# `train`; otherwise evaluation would only ever see training sentences.
test_dataset = NMTDataset(test, eng_tokenizer, ru_tokenizer, **config['dataset'])

# Shuffle only the training data; keep evaluation order fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['dataloader']['train_bs'], shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=config['dataloader']['test_bs'], shuffle=False)

In [17]:
# Prefer the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Encoder is built before the decoder so that seeded weight initialisation
# happens in the same order on every run.
encoder = Encoder(**config['encoder_cfg'])
decoder = Decoder(**config['decoder_cfg'])

model = NMTModel(encoder, decoder)
model = model.to(device)

optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=config['optim']['lr'],
)

# Positions whose target is the <PAD> id are excluded from the loss.
criterion = torch.nn.NLLLoss(
    ignore_index=ru_tokenizer.encode(['<PAD>'])[0],
)

In [19]:
def train_epoch(model, optimizer, loader, criterion, epoch, log_step=200):
    """Run one optimisation pass over ``loader`` and return windowed mean losses.

    Args:
        model: module invoked as ``model(batch)``. Its output is permuted with
            ``(0, 2, 1)`` before being handed to ``criterion``
            (assumes a (batch, seq, vocab) output — TODO confirm).
        optimizer: optimizer stepping ``model``'s parameters.
        loader: sized iterable of dict batches whose values are tensors; each
            batch must contain the key ``'target_for_loss'``.
        criterion: callable ``criterion(preds, target)`` returning a scalar loss.
        epoch: epoch number, used only in the progress line.
        log_step: number of iterations averaged into one reported loss value.

    Returns:
        list[float]: the mean loss of every ``log_step``-iteration window, in
        order, followed by the mean of the final shorter window when the number
        of batches is not a multiple of ``log_step``.
    """
    model.train()

    # Take the device from the model itself instead of relying on a
    # notebook-level `device` variable that may be stale or undefined.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    loss_val = []
    window = []
    iter_step = 0

    def _flush():
        # Average the current window, report it, and start a new one.
        avg_loss_val = sum(window) / len(window)
        print('epoch\t{}\t[{}/{}]\tloss: {:.4f}'.format(epoch, iter_step, len(loader), avg_loss_val))
        loss_val.append(avg_loss_val)
        window.clear()

    for iter_step, batch in enumerate(loader, start=1):
        optimizer.zero_grad()
        # Build a new dict so the caller's batch object is left untouched.
        batch = {key: value.to(device) for key, value in batch.items()}

        preds = model(batch).permute(0, 2, 1)
        loss = criterion(preds, batch['target_for_loss'])
        loss.backward()
        optimizer.step()

        window.append(loss.item())
        if iter_step % log_step == 0:
            _flush()

    # Report the trailing iterations too, so short loaders (fewer than
    # `log_step` batches) do not yield an empty result.
    if window:
        _flush()
    return loss_val

In [None]:
# Train for EPOCHS passes and collect the averaged losses reported by
# train_epoch across all epochs.
losses = []
EPOCHS = 1
for epoch in range(1, EPOCHS + 1):
    # The training DataLoader is named `train_loader`; the bare name `loader`
    # is not defined anywhere in this notebook and raises NameError on a
    # fresh kernel.
    epoch_loss = train_epoch(model, optimizer, train_loader, criterion, epoch)
    losses.extend(epoch_loss)

In [None]:
def translate(model, sent, max_len=100):
    """Greedily decode a translation of ``sent`` and print it.

    The sentence is tokenized and encoded with ``eng_tokenizer``. The decoder
    input starts with the ``<BOS>`` id and is extended one token at a time with
    the highest-scoring token at the last position, until ``<EOS>`` is produced
    or ``max_len`` tokens have been generated.

    Args:
        model: module invoked as ``model(batch)`` with the keys ``'source'``,
            ``'source_mask'``, ``'target'`` and ``'target_mask'``; the masks
            are passed as ``None``. The output is indexed as
            (batch, position, vocab).
        sent: source sentence as a plain string.
        max_len: upper bound on the decoded length, ``<BOS>`` included. The
            default matches ``target_pad_len`` in the notebook config. It
            guarantees termination when the model never emits ``<EOS>``.

    Returns:
        None. The decoded tokens are printed, joined by single spaces.
    """
    model.eval()

    # Inputs must live on the same device as the model's weights; CPU tensors
    # fed to a CUDA model raise a device-mismatch error.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    source_ids = eng_tokenizer.encode(eng_tokenizer.tokenize(sent))
    source = torch.LongTensor([source_ids]).to(run_device)

    # Look the special ids up in the target vocabulary rather than assuming
    # they are 0 and 1.
    bos_id = ru_tokenizer.encode(['<BOS>'])[0]
    eos_id = ru_tokenizer.encode(['<EOS>'])[0]

    dec_sent = [bos_id]
    with torch.no_grad():
        while dec_sent[-1] != eos_id and len(dec_sent) < max_len:
            target = torch.LongTensor([dec_sent]).to(run_device)
            batch = {'source': source, 'source_mask': None, 'target': target, 'target_mask': None}
            scores = model(batch)
            # The arg-max is the same with or without exponentiating the
            # scores, so the raw model output is used directly.
            next_id = scores[:, -1].argmax(dim=-1).item()
            dec_sent.append(next_id)

    print(' '.join(ru_tokenizer.decode(dec_sent)))

In [None]:
# Smoke-test the model on a short English sentence; the decoded tokens are
# printed by translate().
translate(model, 'i love cats')