In [1]:
# the Russian lemmatizer requires the pymorphy2 library
!pip install pymorphy2==0.8

Collecting pymorphy2==0.8
[?25l  Downloading https://files.pythonhosted.org/packages/a3/33/fff9675c68b5f6c63ec8c6e6ff57827dda28a1fa5b2c2d727dffff92dd47/pymorphy2-0.8-py2.py3-none-any.whl (46kB)
[K     |███████                         | 10kB 13.9MB/s eta 0:00:01[K     |██████████████▏                 | 20kB 16.5MB/s eta 0:00:01[K     |█████████████████████▎          | 30kB 10.1MB/s eta 0:00:01[K     |████████████████████████████▍   | 40kB 8.4MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.7MB/s 
[?25hCollecting dawg-python>=0.7
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Collecting pymorphy2-dicts<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/02/51/2465fd4f72328ab50877b54777764d928da8cb15b74e2680fc1bd8cb3173/pymorphy2_dicts-2.4.393442.3710985-py2.py3-none-any.whl (7.1MB)
[K     |████████████████████████████████| 7.1MB 7

In [2]:
import io
import math
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from torchtext.legacy.data import Field, Dataset, Example, BucketIterator

import spacy
from spacy.lang.ru import Russian

from tqdm import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# English: load the small pipeline with the parser, tagger and NER switched off
nlp_en = spacy.load("en_core_web_sm", disable=["parser", "tagger", "ner"])
# Russian: language object built directly from spaCy's Russian class
nlp_ru = Russian()

def tokenize_ru(text):
  """Return the list of token strings the Russian tokenizer produces for `text`."""
  doc = nlp_ru.tokenizer(text)
  return [token.text for token in doc]

def tokenize_en(text):
  """Return the list of token strings the English tokenizer produces for `text`."""
  doc = nlp_en.tokenizer(text)
  return [token.text for token in doc]


# Source (Russian) field: lower-cased tokens; batches also carry sequence lengths
SRC = Field(tokenize=tokenize_ru, lower=True, include_lengths=True)

# Target (English) field: same settings plus <sos>/<eos> markers around each sentence
TRG = Field(
    tokenize=tokenize_en,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    include_lengths=True,
)

# Field order here has to match the order of the values handed to Example.fromlist
fields = [('rus', SRC), ('eng', TRG)]


In [4]:
# Get the dataset
torchtext.utils.download_from_url('https://github.com/bsbor/data/releases/download/test3/1mcorpus.zip', '1mcorpus.zip')
torchtext.utils.extract_archive('1mcorpus.zip')

# Read both sides of the parallel corpus. The context managers close each
# file as soon as its contents are in memory (the previous version leaked
# both handles).
with io.open("corpus.en_ru.1m.ru", encoding='UTF-8') as ru_file:
  ru_lines = ru_file.read().splitlines()
with io.open("corpus.en_ru.1m.en", encoding='UTF-8') as en_file:
  en_lines = en_file.read().splitlines()


1mcorpus.zip: 100%|██████████| 129M/129M [00:06<00:00, 20.6MB/s]


In [5]:
dataset_size = 200000

# Keep only the first `dataset_size` lines of each side and pair them up
temp_ru_lines, temp_en_lines = ru_lines[:dataset_size], en_lines[:dataset_size]
sentences = list(zip(temp_ru_lines, temp_en_lines))

# Turn every (Russian, English) pair into a torchtext Example
examples = []
for pair in sentences:
  examples.append(Example.fromlist(pair, fields))
data = Dataset(examples, fields=fields)

# Both vocabularies are built from the full dataset (before any train/val split)
for field in (SRC, TRG):
  field.build_vocab(data)

In [6]:
# Peek at the first (Russian, English) sentence pair
sentences[0]

('Такое развитие характера Гарри может разочаровать читателей, полюбивших его былую мстительность, но с другой стороны это преображение укрепляет позицию тех, кто не видит глубже сюжета и изображения героев.',
 "This new development in Harry's character may be a disappointment to those readers who enjoyed his old vindictive ways, but it also reinforces the position of pro-Potter people who do not see beneath the surface appearance of the characters and plots.")

In [8]:
class Attention(nn.Module):
  """Dot-product attention of a decoder state over the encoder outputs."""

  def __init__(self, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size

  def forward(self, hidden, encoder_outputs, mask):
    """Return attention weights with a singleton axis inserted at position 1.

    `hidden` is multiplied element-wise with `encoder_outputs` and summed
    over the last axis, giving one score per (source position, batch item).
    `mask` must line up with the transposed scores; entries equal to 0 mark
    positions that must receive (practically) zero weight.
    """
    # Dot score, then swap the two remaining axes so the softmax below
    # runs across source positions
    scores = (hidden * encoder_outputs).sum(dim=2).transpose(0, 1)

    # Large negative score for masked positions so the network does not
    # attend to <pad> tokens
    scores = scores.masked_fill(mask == 0, -1e5)

    weights = F.softmax(scores, dim=1)
    return weights.unsqueeze(1)

In [9]:
class Encoder(nn.Module):
  """Multi-layer GRU encoder over embedded source tokens.

  The embedding table is sized from the module-level ``SRC`` vocabulary and
  belongs to the encoder alone.
  """

  def __init__(self, hidden_size, embedding_size, num_layers=2, dropout=0.3):
    super().__init__()

    # Keep the hyper-parameters on the module for later inspection
    self.hidden_size, self.embedding_size = hidden_size, embedding_size
    self.num_layers, self.dropout = num_layers, dropout

    # One embedding row per entry of the source vocabulary
    self.embedding = nn.Embedding(len(SRC.vocab), embedding_size)
    # Stacked GRU; `dropout` is handed straight to nn.GRU
    self.gru = nn.GRU(
        input_size=embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
    )

  def forward(self, input_sequence):
    """Embed the token indices and run them through the GRU.

    Returns the `(outputs, hidden)` pair produced by nn.GRU; `outputs` has
    shape (seq_len, batch, hidden_size).
    """
    return self.gru(self.embedding(input_sequence))

class Decoder(nn.Module):
  """Single-step GRU decoder with attention over the encoder outputs."""

  def __init__(self, embedding_size, hidden_size, output_size, n_layers=2, dropout=0.3):
    super().__init__()

    # Hyper-parameters kept on the module (output_size is read by seq2seq.forward)
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout = dropout

    # Target-side embedding table
    self.embedding = nn.Embedding(output_size, embedding_size)
    # Recurrent core
    self.gru = nn.GRU(embedding_size, hidden_size, num_layers=n_layers, dropout=dropout)
    # Projects [GRU output ; context] back down to hidden_size
    self.concat = nn.Linear(2 * hidden_size, hidden_size)
    # Maps the combined vector to one score per output token
    self.out = nn.Linear(hidden_size, output_size)
    self.attn = Attention(hidden_size)

  def forward(self, current_token, hidden_state, encoder_outputs, mask):
    """Advance the decoder by one token.

    Returns `(output, hidden_state)`: the scores from the final linear layer
    and the updated GRU hidden state.
    """
    token_embedding = self.embedding(current_token)

    # One recurrent step
    rnn_out, hidden_state = self.gru(token_embedding, hidden_state)

    # Attention weights over the encoder outputs, then their weighted sum
    weights = self.attn(rnn_out, encoder_outputs, mask)
    context = torch.bmm(weights, encoder_outputs.transpose(0, 1))

    # Drop the singleton axes and join GRU output with the context vector
    combined = torch.cat((rnn_out.squeeze(0), context.squeeze(1)), dim=1)
    attentional = torch.tanh(self.concat(combined))

    return self.out(attentional), hidden_state

In [10]:
class seq2seq(nn.Module):
  """Encoder/decoder translation model with attention.

  Training: call with `(input_sequence, output_sequence)`; the decoder is fed
  the ground-truth target tokens (teacher forcing).
  Inference: call with `output_sequence=None`; the decoder is fed its own
  greedy predictions until every sequence in the batch has produced <eos>
  or `max_len` steps have been taken.
  """

  def __init__(self, embedding_size, hidden_size, vocab_size, device, pad_idx, eos_idx, sos_idx):
    super(seq2seq, self).__init__()

    # NOTE: this embedding is never used by forward(); Encoder and Decoder
    # create their own tables. It is kept so the parameter set (and therefore
    # state_dict keys of existing checkpoints) stays unchanged.
    self.embedding = nn.Embedding(vocab_size, embedding_size)

    # Encoder network
    self.encoder = Encoder(hidden_size, embedding_size, num_layers=2, dropout=0.3)

    # Decoder network
    self.decoder = Decoder(embedding_size, hidden_size, vocab_size, n_layers=2, dropout=0.3)

    # Indices of special tokens and hardware device.
    # NOTE(review): pad_idx is compared against *source* tokens in
    # create_mask, so it must equal the source vocabulary's padding index.
    # Confirm against the caller, which may pass the target vocabulary's index.
    self.pad_idx = pad_idx
    self.eos_idx = eos_idx
    self.sos_idx = sos_idx
    self.device = device

  def create_mask(self, input_sequence):
    """Return a boolean mask, axes swapped, that is False at padding positions."""
    return (input_sequence != self.pad_idx).permute(1, 0)

  def forward(self, input_sequence, output_sequence, max_len=100):
    """Run the model.

    Args:
      input_sequence: tuple `(tokens, lengths)`; `tokens` is indexed
        (position, batch item).
      output_sequence: tuple `(tokens, lengths)` for teacher forcing, or
        None to generate greedily.
      max_len: number of decoding positions (including the initial <sos>
        slot) allocated in inference mode. Ignored when `output_sequence`
        is given.

    Returns:
      Tensor of decoder scores indexed (position, batch item, vocab entry).
      Row 0 is never written and stays zero. In inference mode the tensor is
      cut just before the step at which the last unfinished sequence
      produced <eos>; rows of sequences that finished earlier still contain
      scores for the steps after their own <eos>.
    """
    # Unpack input_sequence tuple
    input_tokens = input_sequence[0]

    # Unpack output_tokens, or create an <sos>-filled tensor for text generation
    inference = output_sequence is None
    if inference:
      output_tokens = torch.full((max_len, input_tokens.shape[1]), self.sos_idx,
                                 dtype=torch.long, device=self.device)
    else:
      output_tokens = output_sequence[0]

    vocab_size = self.decoder.output_size
    batch_size = len(input_sequence[1])
    max_seq_len = len(output_tokens)

    # tensor to store decoder outputs
    outputs = torch.zeros(max_seq_len, batch_size, vocab_size, device=self.device)

    # pass input sequence to the encoder
    encoder_outputs, hidden = self.encoder(input_tokens)

    # first input to the decoder is the <sos> tokens
    output = output_tokens[0, :]

    # create mask
    mask = self.create_mask(input_tokens)

    # Per-sequence flag: has this batch item already produced <eos>?
    finished = torch.zeros(batch_size, dtype=torch.bool, device=input_tokens.device)

    # Step through the length of the output sequence one token at a time
    for t in range(1, max_seq_len):
      output, hidden = self.decoder(output.unsqueeze(0), hidden, encoder_outputs, mask)
      outputs[t] = output

      if inference:
        # Greedy choice becomes the next decoder input
        output = output.max(1)[1]
        # Stop once every sequence has produced <eos>. The old code called
        # `.item()` on the whole batch, which only works for batch size 1.
        finished = finished | (output == self.eos_idx)
        if finished.all():
          return outputs[:t]
      else:
        # Teacher forcing: feed the ground-truth token
        output = output_tokens[t]

    return outputs

In [11]:

# 80/20 split of the examples into training and validation sets
train_data, val_data = data.split(split_ratio=0.8)

# Batches of 64 examples, sorted by source-sentence length inside each batch
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, val_data),
    batch_size=64,
    sort_within_batch=True,
    sort_key=lambda example: len(example.rus),
    device=device,
)

# Special-token indices, looked up in the target vocabulary
pad_idx, eos_idx, sos_idx = (TRG.vocab.stoi[tok] for tok in ('<pad>', '<eos>', '<sos>'))

# Model hyper-parameters. embedding_dim would have to match the dimension of
# pre-trained word vectors if any were loaded; the cells shown in this
# notebook do not load any.
embedding_dim = 100
hidden_dim = 256
vocab_size = len(TRG.vocab)

model = seq2seq(
    embedding_dim, hidden_dim, vocab_size,
    device, pad_idx, eos_idx, sos_idx,
).to(device)

optimizer = optim.Adam(model.parameters())

# Cross-entropy over the decoder scores; target positions equal to pad_idx are ignored
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

def train(model, iterator, criterion, optimizer, clip=None):
  """Run one training epoch and return the mean loss per batch.

  Args:
    model: called as `model(batch.rus, batch.eng)`; must return scores
      indexed (position, batch item, vocab entry).
    iterator: sized iterable of batches exposing `.rus` and `.eng`, each a
      `(tokens, lengths)` tuple.
    criterion: loss applied to the flattened scores and target tokens.
    optimizer: optimizer stepping the model's parameters.
    clip: optional maximum gradient norm. When given, gradients are clipped
      with `clip_grad_norm_` before each optimizer step; the default (None)
      leaves gradients untouched.

  Returns:
    Sum of the per-batch losses divided by `len(iterator)`.
  """
  # Put the model in training mode!
  model.train()

  epoch_loss = 0
  for batch in tqdm(iterator, total=len(iterator)):
    input_sequence = batch.rus
    output_sequence = batch.eng

    target_tokens = output_sequence[0]

    # zero out the gradient for the current batch
    optimizer.zero_grad()

    # Run the batch through our model
    output = model(input_sequence, output_sequence)

    # Drop position 0 (the <sos> slot, for which the model emits no
    # prediction) and flatten so each row is one token prediction
    output = output[1:].view(-1, output.shape[-1])
    target_tokens = target_tokens[1:].view(-1)

    loss = criterion(output, target_tokens)

    # Perform back-prop and calculate the gradient of our loss function
    loss.backward()

    # Optionally bound the gradient norm before updating
    if clip is not None:
      nn.utils.clip_grad_norm_(model.parameters(), clip)

    # Update model parameters
    optimizer.step()

    epoch_loss += loss.item()

  return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
  """Compute the mean per-batch loss over `iterator` without updating the model.

  Args:
    model: called as `model(batch.rus, batch.eng)`; must return scores
      indexed (position, batch item, vocab entry).
    iterator: sized iterable of batches exposing `.rus` and `.eng`, each a
      `(tokens, lengths)` tuple.
    criterion: loss applied to the flattened scores and target tokens.

  Returns:
    Sum of the per-batch losses divided by `len(iterator)`.
  """
  # Put the model in evaluation mode (disables dropout)
  model.eval()

  epoch_loss = 0

  # No gradients are needed for validation; skipping autograd bookkeeping
  # saves memory and time
  with torch.no_grad():
    for batch in iterator:
      input_sequence = batch.rus
      output_sequence = batch.eng

      target_tokens = output_sequence[0]

      # Run the batch through our model
      output = model(input_sequence, output_sequence)

      # Drop position 0 (the <sos> slot) and flatten for the loss function
      output = output[1:].view(-1, output.shape[-1])
      target_tokens = target_tokens[1:].view(-1)

      loss = criterion(output, target_tokens)

      epoch_loss += loss.item()

  return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
  """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

  Both parts are truncated with int(), so fractions of a second are dropped.
  """
  elapsed = end_time - start_time
  minutes = int(elapsed / 60)
  seconds = int(elapsed - 60 * minutes)
  return minutes, seconds



In [13]:
# %%script false
# Train the model for N_EPOCHS epochs, keeping the checkpoint with the lowest
# validation loss in 'best_model.pt'.
N_EPOCHS = 5

# Lowest validation loss seen so far; any real loss is smaller than +inf
best_valid_loss = float('inf')

# start model training (this banner is printed once, before the first epoch)
print('Epoch 1 Training started....')
for epoch in range(N_EPOCHS):
  start_time = time.time()
  
  # One pass over the training data, then one pass over the validation data
  train_loss = train(model, train_iterator, criterion, optimizer)
  valid_loss = evaluate(model, valid_iterator, criterion)
  
  end_time = time.time()
  
  # The reported time covers both training and validation
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
  
  # compare validation loss; save the weights whenever it improves
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'best_model.pt')
  
  # PPL is reported as exp(loss)
  print(f'\nEpoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'  > Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'  > Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
  print('')


  0%|          | 0/625 [00:00<?, ?it/s]

Epoch 1 Training started....


100%|██████████| 625/625 [09:57<00:00,  1.05it/s]
  0%|          | 0/625 [00:00<?, ?it/s]


Epoch: 01 | Time: 10m 45s
  > Train Loss: 6.857 | Train PPL: 950.624
  > Val. Loss: 6.377 |  Val. PPL: 588.427



100%|██████████| 625/625 [09:48<00:00,  1.06it/s]
  0%|          | 0/625 [00:00<?, ?it/s]


Epoch: 02 | Time: 10m 37s
  > Train Loss: 5.962 | Train PPL: 388.454
  > Val. Loss: 5.803 |  Val. PPL: 331.285



100%|██████████| 625/625 [09:51<00:00,  1.06it/s]
  0%|          | 0/625 [00:00<?, ?it/s]


Epoch: 03 | Time: 10m 39s
  > Train Loss: 5.485 | Train PPL: 241.073
  > Val. Loss: 5.590 |  Val. PPL: 267.815



100%|██████████| 625/625 [09:51<00:00,  1.06it/s]
  0%|          | 0/625 [00:00<?, ?it/s]


Epoch: 04 | Time: 10m 39s
  > Train Loss: 5.136 | Train PPL: 170.019
  > Val. Loss: 5.465 |  Val. PPL: 236.373



100%|██████████| 625/625 [09:54<00:00,  1.05it/s]



Epoch: 05 | Time: 10m 42s
  > Train Loss: 4.842 | Train PPL: 126.746
  > Val. Loss: 5.409 |  Val. PPL: 223.446



In [14]:
%%script false

# saving & loading the model
saved_model_path = "best_model.pt"
model.load_state_dict(torch.load(saved_model_path))
    

In [24]:
def translate_sentence(model, sentence):
    """Translate one Russian sentence with greedy decoding.

    Args:
        model: seq2seq model exposing a `.device` attribute; it is called as
            `model((tokens, lengths), None)` to generate.
        sentence: raw Russian text.

    Returns:
        The predicted target tokens joined with single spaces. An input that
        yields no tokens returns an empty string.
    """
    model.eval()

    # tokenization + lowercasing (the source field is built with lower=True)
    tokenized = [t.lower_ for t in nlp_ru(sentence)]

    # Nothing to translate; an empty sequence cannot be fed to the encoder
    if not tokenized:
        return ""

    # convert tokens to integers
    int_tokenized = [SRC.vocab.stoi[t] for t in tokenized]

    # convert list to tensors: lengths has one entry, tokens get a batch axis of size 1
    sentence_length = torch.LongTensor([len(int_tokenized)]).to(model.device)
    tensor = torch.LongTensor(int_tokenized).unsqueeze(1).to(model.device)

    # get predictions; no gradients are needed for generation
    with torch.no_grad():
        translation_tensor_logits = model((tensor, sentence_length), None)

    # get token index with highest score at every position
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    # convert indices (integers) to tokens
    translation = [TRG.vocab.itos[t] for t in translation_tensor]

    # Position 0 is the <sos> slot, for which the model emits no prediction, so skip it
    translation = translation[1:]
    return " ".join(translation)


In [23]:
# Quick sanity check of the trained model on a short Russian sentence
translate_sentence(model, "это холодно")

it s cold


In [37]:
# Read the evaluation sentences. UTF-8 is given explicitly (as for the
# training corpus) so the Russian text does not depend on the platform's
# default encoding, and the context manager closes the file after reading.
with io.open('eval-ru-100.txt', encoding='UTF-8') as eval_file:
  eval_lines = eval_file.read().splitlines()

# Translate every evaluation sentence
eval_en_out = [ translate_sentence(model, s) for s in eval_lines ]

# Pair each source sentence with its translation
eval_output = list(zip(eval_lines, eval_en_out))

In [34]:
# Show the first four (source sentence, translation) pairs
eval_output[:4]

[('26. Вопрос о лесах необходимо вывести на более высокий уровень в рамках целей устойчивого развития, в том числе посредством включения в такие цели убедительных и четких целевых и рабочих показателей по лесам.',
  'Forest need to increase with the develop goal the including with force and clear development mark of forests.'),
 ('В рамках экологической экспертизы определены пять вариантов строительства и эксплуатации замещающей электростанции, которая восстановит мощность энергораспределительной сети Управления по состоянию до стихийного бедствия.',
  'in the end of the company , the company will be provided by the international and the international sector , which is also to be able to provide the new system of the new system .'),
 ('В ходе рассмотрения данного пункта повестки дня Рабочая группа будет кратко проинформирована Секретариатом о работе УНП ООН по содействию ратификации и осуществлению Протокола об огнестрельном оружии в рамках Глобальной программы по огнестрельному оружию