In [1]:
import re
import random
from itertools import chain
from collections import Counter
from tqdm import tqdm

import torch
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Selected device: {device}')

Selected device: cuda


### load data

In [2]:
def word_preprocess(w):
    w = w.lower().strip()
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)
    w = w.strip()
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [3]:
def read_words(path, n=None, seed=None):
    random.seed(seed)
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()[:n] if seed == False else random.sample(f.readlines(), n)
        lines = [[word_preprocess(w).split() for w in ln.split('\t')[:2]] for ln in lines]    
    return zip(*lines)

### build vocabulary

In [4]:
class Vocab:
    def __init__(self, sentences):
        self.counts = Counter(list(chain(*sentences)))
        self.id2token = dict(enumerate(self.counts.keys(), 1))
        self.token2id = {v: k for k, v in self.id2token.items()}
    
    def as_tensor(self, sentences):
        t = [torch.as_tensor([idx for w in s if (idx := self.token2id.get(w)) is not None]) for s in sentences]
        return torch.nn.utils.rnn.pad_sequence(t).T
    
    @property
    def size(self):
        return len(self.counts)

In [5]:
# load data and build vocabs
en, ru = read_words('rus-eng/rus.txt', n=100000, seed=False)
en_vocab = Vocab(en)
ru_vocab = Vocab(ru)

# size overview
en_vocab.size, ru_vocab.size

(7334, 20502)

In [6]:
# build tensors
tensor_en = en_vocab.as_tensor(en)
tensor_ru = ru_vocab.as_tensor(ru)

tensor_en.shape, tensor_ru.shape

(torch.Size([100000, 11]), torch.Size([100000, 15]))

### split & build dataset

In [7]:
# train/valid split
train_en, valid_en, train_ru, valid_ru = train_test_split(tensor_en, tensor_ru, test_size=0.2)

In [8]:
class Seq2seqDataset(torch.utils.data.Dataset):
    def __init__(self, *seq):
        max_shape = max([sq.shape[1] for sq in seq])
        ashapes = [max_shape - sq.shape[1] for sq in seq]
        self.seq = [torch.nn.functional.pad(sq, (0, applen)) for sq, applen in zip(seq, ashapes)]
        # self.seq = seq

    def __getitem__(self, index):
        return [seq[index] for seq in self.seq]

    def __len__(self):
        return len(self.seq[0])

In [9]:
BATCH_SIZE = 512

train_dataset = Seq2seqDataset(train_ru, train_en)
valid_dataset = Seq2seqDataset(valid_ru, valid_en)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Encoder / Decoder

In [10]:
class Encoder(torch.nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        # self.rnn = torch.nn.GRU(embedding_dim, hidden_dim, num_layers=2, dropout=dropout)
        # self.rnn = torch.nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=True)
        self.rnn = torch.nn.GRU(embedding_dim, hidden_dim, num_layers=2, dropout=dropout, batch_first=True)
        # self.rnn = torch.nn.GRU(embedding_dim, hidden_dim, num_layers=2, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = torch.nn.Dropout(dropout)
        
    def forward(self, x):
        emb = self.dropout(self.embedding(x))
        out, hid = self.rnn(emb)
        return hid

In [11]:
class Decoder(torch.nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = torch.nn.Embedding(output_dim, embedding_dim)
        # self.rnn = torch.nn.GRU(embedding_dim, hidden_dim, num_layers=2, dropout=dropout)
        # self.rnn = torch.nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=True)
        self.rnn = torch.nn.GRU(embedding_dim, hidden_dim, num_layers=2, dropout=dropout, batch_first=True)
        # self.rnn = torch.nn.GRU(embedding_dim, hidden_dim, num_layers=2, dropout=dropout, batch_first=True, bidirectional=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout)
        
    def forward(self, x, h):
        # x = x.unsqueeze(0)
        emb = self.dropout(self.embedding(x))
        out, hid = self.rnn(emb, h)
        # print(f'DEC RNN output: {out.shape}')

        pred = self.fc(out)
        # pred = self.fc(out.squeeze(0))
        # print(f'PRED shape: {pred.shape}')
        return pred, hid

In [12]:
class Seq2Seq(torch.nn.Module):
    def __init__(self, input_dim, output_dim, embedding_dim, hidden_dim, n_layers=2, dropout=0.1, device=device):
        super().__init__()
        self.encoder = Encoder(input_dim, embedding_dim, hidden_dim, n_layers, dropout)
        self.decoder = Decoder(output_dim, embedding_dim, hidden_dim, n_layers, dropout)
        self.device = device
                
    def forward(self, x, t, forcing_ratio=0.5):
        # print(f'X shape: {x.shape}')
        # print(f'T shape: {t.shape}')
        
        # tensor to store decoder outputs
        outs = torch.zeros(t.shape[1], t.shape[0], self.decoder.output_dim).to(self.device)
        # print(f'outs shape: {outs.shape}')
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        h = self.encoder(x)
        # print(f'encoder hidden shape: {h.shape}')
        # first input to the decoder is the  tokens
        # ins = t[:, 0]
        
        for i in range(0, t.shape[1]):
            ins = t[:, i].unsqueeze(1)
            # print(f'ins shape: {ins.shape}')

            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            out, h = self.decoder(ins, h)
            outs[i] = out.squeeze()
            # ins = t[:, i]
            
            # if forcing, use actual next token as next input; if not, use predicted token
            force = random.random() < forcing_ratio
            ins = t[:, i] if force else outs.argmax(axis=1)
        return outs.permute(1, 2, 0)
        # return outs

In [13]:
INPUT_DIM = ru_vocab.size + 1
OUTPUT_DIM = en_vocab.size + 1

model = Seq2Seq(INPUT_DIM, OUTPUT_DIM, embedding_dim=128, hidden_dim=256, n_layers=4).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()

In [14]:
epochs = 10
dev = device
optim = optimizer

model.train()
for ep in range(epochs):
    sum_loss, items = 0.0, 0
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {ep + 1}/{epochs}')
    for i, batch in pbar:
        inputs, target = batch[0].to(dev), batch[1].to(dev)
        optim.zero_grad()

        outputs = model(inputs, target)
        
        loss = criterion(outputs, target)
        loss.backward()
        optim.step()
        
        sum_loss += loss.item()
        items += len(target)
        pbar.set_postfix({'cumulative loss per item': sum_loss / items})
model.eval()

Epoch 1/10: 100%|██████████| 157/157 [00:36<00:00,  4.27it/s, cumulative loss per item=0.00701]
Epoch 2/10: 100%|██████████| 157/157 [00:36<00:00,  4.30it/s, cumulative loss per item=0.00371]
Epoch 3/10: 100%|██████████| 157/157 [00:36<00:00,  4.31it/s, cumulative loss per item=0.00277]
Epoch 4/10: 100%|██████████| 157/157 [00:36<00:00,  4.29it/s, cumulative loss per item=0.00253]
Epoch 5/10: 100%|██████████| 157/157 [00:36<00:00,  4.31it/s, cumulative loss per item=0.00234]
Epoch 6/10: 100%|██████████| 157/157 [00:36<00:00,  4.29it/s, cumulative loss per item=0.0024] 
Epoch 7/10: 100%|██████████| 157/157 [00:36<00:00,  4.28it/s, cumulative loss per item=0.00249]
Epoch 8/10: 100%|██████████| 157/157 [00:36<00:00,  4.29it/s, cumulative loss per item=0.00226]
Epoch 9/10: 100%|██████████| 157/157 [00:38<00:00,  4.06it/s, cumulative loss per item=0.00209]
Epoch 10/10: 100%|██████████| 157/157 [00:36<00:00,  4.26it/s, cumulative loss per item=0.00227]


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(20503, 128)
    (rnn): GRU(128, 256, num_layers=2, batch_first=True, dropout=0.1)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(7335, 128)
    (rnn): GRU(128, 256, num_layers=2, batch_first=True, dropout=0.1)
    (fc): Linear(in_features=256, out_features=7335, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [15]:
pbar = tqdm(enumerate(valid_loader), total=len(valid_loader), desc=f'Epoch {ep + 1}/{epochs}')
predicts = None
for i, batch in pbar:    
    inputs, target = batch[0].to(dev), batch[1].to(dev)
    outputs = model(inputs, target).argmax(axis=1)
    predicts = outputs.detach().cpu() if predicts is None else torch.cat([predicts, outputs.detach().cpu()])

Epoch 10/10: 100%|██████████| 157/157 [00:18<00:00,  8.51it/s]


In [18]:
# OVERVIEW
IDX = 2

source_text = [ru_vocab.id2token[w] for w in valid_dataset[IDX][0].numpy() if w]
true_text = [en_vocab.id2token[w] for w in valid_dataset[IDX][1].numpy() if w]
pred_text = [[en_vocab.id2token[w] for w in sentence if w] for sentence in predicts.numpy()][IDX]

print(' '.join(source_text), ' '.join(true_text), ' '.join(pred_text), sep='\n')

<start> оно у тебя есть ? <end>
<start> do you have it ? <end>
<start> does your wife work ? <end>


In [17]:
#