In [1]:
import re
import random
from itertools import chain
from collections import Counter
from tqdm import tqdm

import torch
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Selected device: {device}')

Selected device: cuda


### load data

In [2]:
def word_preprocess(w):
    w = w.lower().strip()
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)
    w = w.strip()
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [3]:
def read_words(path, n=None, seed=None):
    random.seed(seed)
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()[:n] if seed == False else random.sample(f.readlines(), n)
        lines = [[word_preprocess(w).split() for w in ln.split('\t')[:2]] for ln in lines]    
    return zip(*lines)

### build vocabulary

In [4]:
class Vocab:
    def __init__(self, sentences):
        self.counts = Counter(list(chain(*sentences)))
        self.id2token = dict(enumerate(self.counts.keys(), 1))
        self.token2id = {v: k for k, v in self.id2token.items()}
    
    def as_tensor(self, sentences):
        t = [torch.as_tensor([idx for w in s if (idx := self.token2id.get(w)) is not None]) for s in sentences]
        return torch.nn.utils.rnn.pad_sequence(t).T
    
    @property
    def size(self):
        return len(self.counts)

In [5]:
# load data and build vocabs
en, ru = read_words('rus-eng/rus.txt', n=50000, seed=1)
en_vocab = Vocab(en)
ru_vocab = Vocab(ru)

# size overview
en_vocab.size, ru_vocab.size

(8241, 20931)

In [6]:
# build tensors
tensor_en = en_vocab.as_tensor(en)
tensor_ru = ru_vocab.as_tensor(ru)

tensor_en.shape, tensor_ru.shape

(torch.Size([50000, 35]), torch.Size([50000, 32]))

### split & build dataset

In [7]:
# train/valid split
train_en, valid_en, train_ru, valid_ru = train_test_split(tensor_en, tensor_ru, test_size=0.2)

In [8]:
class Seq2seqDataset(torch.utils.data.Dataset):
    def __init__(self, *seq):
        self.seq = seq

    def __getitem__(self, index):
        return [seq[index] for seq in self.seq]

    def __len__(self):
        return len(self.seq[0])

In [9]:
BATCH_SIZE = 128

train_dataset = Seq2seqDataset(train_ru, train_en)
valid_dataset = Seq2seqDataset(valid_ru, valid_en)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Encoder / Decoder

In [10]:
class Encoder(torch.nn.Module):
    def __init__(self, dict_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embs = torch.nn.Embedding(dict_size, embedding_dim)
        self.gru = torch.nn.GRU(embedding_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
    
    def forward(self, x):
        x = self.embs(x)
        x, h = self.gru(x)
        return h

In [11]:
class Decoder(torch.nn.Module):
    def __init__(self, dict_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embs = torch.nn.Embedding(dict_size, embedding_dim)
        self.gru = torch.nn.GRU(embedding_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = torch.nn.Linear(2 * hidden_dim, output_dim)
    
    def forward(self, x, h0):
        x = self.embs(x)
        x, h = self.gru(x, h0)
        x = self.fc(x.squeeze(0))
        return x, h

In [12]:
class Net(torch.nn.Module):
    def __init__(self, dict_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.enc = Encoder(dict_size, embedding_dim, hidden_dim)
        self.dec = Decoder(dict_size, embedding_dim, hidden_dim, output_dim)
    
    def forward(self, x, t):
        h = self.enc(x)
        out, h = self.dec(t, h)
        return out.permute(0, 2, 1)

In [13]:
DICT_SIZE = 50000

model = Net(dict_size=DICT_SIZE, embedding_dim=128, hidden_dim=256, output_dim=en_vocab.size).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

In [14]:
epochs = 5
dev = device
optim = optimizer

model.train()
for ep in range(epochs):
    sum_loss, items = 0.0, 0
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {ep + 1}/{epochs}')
    for i, batch in pbar:
        inputs, target = batch[0].to(dev), batch[1].to(dev)
        optim.zero_grad()

        outputs = model(inputs, target)

        loss = criterion(outputs, target)
        loss.backward()
        optim.step()
        
        sum_loss += loss.item()
        items += len(target)
        pbar.set_postfix({'cumulative loss per item': sum_loss / items})
model.eval()

Epoch 1/5: 100%|██████████| 313/313 [00:32<00:00,  9.54it/s, cumulative loss per item=0.00228]
Epoch 2/5: 100%|██████████| 313/313 [00:32<00:00,  9.51it/s, cumulative loss per item=0.000329]
Epoch 3/5: 100%|██████████| 313/313 [00:32<00:00,  9.50it/s, cumulative loss per item=0.000117]
Epoch 4/5: 100%|██████████| 313/313 [00:32<00:00,  9.49it/s, cumulative loss per item=4.14e-5]
Epoch 5/5: 100%|██████████| 313/313 [00:32<00:00,  9.50it/s, cumulative loss per item=3.51e-5]


Net(
  (enc): Encoder(
    (embs): Embedding(50000, 128)
    (gru): GRU(128, 256, num_layers=2, batch_first=True, bidirectional=True)
  )
  (dec): Decoder(
    (embs): Embedding(50000, 128)
    (gru): GRU(128, 256, num_layers=2, batch_first=True, bidirectional=True)
    (fc): Linear(in_features=512, out_features=8241, bias=True)
  )
)

In [15]:
pbar = tqdm(enumerate(valid_loader), total=len(valid_loader), desc=f'Epoch {ep + 1}/{epochs}')
predicts = None
for i, batch in pbar:    
    inputs, target = batch[0].to(dev), batch[1].to(dev)
    outputs = model(inputs, target).argmax(axis=1)
    predicts = outputs.detach().cpu() if predicts is None else torch.cat([predicts, outputs.detach().cpu()])

Epoch 5/5: 100%|██████████| 313/313 [00:07<00:00, 41.13it/s]


In [16]:
# OVERVIEW
IDX = 4

source_text = [ru_vocab.id2token[w] for w in valid_dataset[IDX][0].numpy() if w]
true_text = [en_vocab.id2token[w] for w in valid_dataset[IDX][1].numpy() if w]
pred_text = [[en_vocab.id2token[w] for w in sentence if w] for sentence in predicts.numpy()][IDX]

print(' '.join(source_text), ' '.join(true_text), ' '.join(pred_text), sep='\n')

<start> я видел тома , сидящего в одиночестве в баре и пьющего вино . <end>
<start> i saw tom sitting at the bar alone , drinking wine . <end>
<start> i'm sorry i've kept you waiting so long . <end>


In [17]:
#