### download data

In [6]:
!wget http://www.manythings.org/anki/rus-eng.zip -O 'data/rus-eng.zip'
!wget https://object.pouta.csc.fi/OPUS-OpenSubtitles/v1/moses/en-ru.txt.zip -O 'data/en-ru.txt.zip'

--2024-09-08 16:10:28--  http://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16305013 (16M) [application/zip]
Saving to: ‘data/rus-eng.zip’


2024-09-08 16:10:52 (672 KB/s) - ‘data/rus-eng.zip’ saved [16305013/16305013]

--2024-09-08 16:10:52--  https://object.pouta.csc.fi/OPUS-OpenSubtitles/v1/moses/en-ru.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 610036 (596K) [application/zip]
Saving to: ‘en-ru.txt.zip’


2024-09-08 16:10:52 (2.55 MB/s) - ‘en-ru.txt.zip’ saved [610036/610036]



In [36]:
import zipfile
from collections import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
import re
from typing import List, Dict
import torch.nn as nn
import torch.optim as optim

### read data

In [None]:
dataset_anki_path = 'data/rus-eng.zip'
with zipfile.ZipFile(dataset_anki_path, 'r') as zip_ref:
    zip_ref.extractall('anki_data')
dataset_opensub_path = 'data/en-ru.txt.zip'
with zipfile.ZipFile(dataset_opensub_path, 'r') as zip_ref:
    zip_ref.extractall('opensubtitles_data')

### prepare data

In [2]:
def load_anki_data(file_path, num_samples=100000):
    input_texts = []
    target_texts = []
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines[:num_samples]:
            en_text, ru_text = line.strip().split('\t')[:-1]
            input_texts.append(en_text)
            target_texts.append('\t' + ru_text + '\n') 
    return input_texts, target_texts

anki_input_texts, anki_target_texts = load_anki_data('anki_data/rus.txt')


In [3]:
def load_opensubtitles_data(file_path_ru, file_path_en, num_samples=100000):
    input_texts = []
    with open(file_path_en, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines[:num_samples]:
            en_text = line.strip()
            input_texts.append(en_text)
    target_texts = []
    with open(file_path_ru, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines[:num_samples]:
            ru_text = line.strip()
            target_texts.append('\t' + ru_text + '\n')
    return input_texts, target_texts

opensub_input_texts, opensub_target_texts = load_opensubtitles_data('opensubtitles_data/OpenSubtitles.en-ru.ru',
                                                                   'opensubtitles_data/OpenSubtitles.en-ru.en')


In [4]:
opensub_input_texts[0], opensub_target_texts[0]

('LOS ANGELES 2029 A. D.', '\t2029 год нашей эры.\n')

### tokenize data

In [28]:
def bsaic_english_tokenizer(text: str)->List[str]:
    return re.findall(r'w+', text.lower())

def build_vocab(sentences: str, tokenizer)->Dict[str, str]:
    counter = Counter()
    for sentence in sentences:
        counter.update(tokenizer(sentence))
    vocab = {word: i for i, (word, _) in enumerate(counter.items(), 4)}
    vocab['<unk>'] = 0
    vocab['<pad>'] = 1
    vocab['<bos>'] = 2
    vocab['<eos>'] = 3
    return vocab

def tokenizer_sentences(sentences, vocab, tokenizer):
    return [[vocab['<bos>'] + [vocab.get(token, vocab['<unk>']) for token in tokenizer(sentence)] +\
             vocab['<eos>']] for sentence in sentences]

def pad_sequences(sequence, padding_value):
    return pad_sequence([torch.tensor(seq) for seq in sequence], \
                        padding_value=padding_value,
                        batch_first=True)
    
tokenizer = basic_english_tokenizer

opensub_vocab_input = build_vocab(opensub_input_texts, tokenizer)
opensub_vocab_target = build_vocab(opensub_target_texts, tokenizer)

anki_vocab_input = build_vocab(anki_input_texts, tokenizer)
anki_vocab_target = build_vocab(anki_target_texts, tokenizer)

opensub_input_sequences = tokenize_sentences(opensub_input_texts, opensub_vocab_input, tokenizer)
opensub_target_sequences = tokenize_sentences(opensub_target_texts, opensub_vocab_target, tokenizer)

anki_input_sequences = tokenize_sentences(anki_input_texts, anki_vocab_input, tokenizer)
anki_target_sequences = tokenize_sentences(anki_target_texts, anki_vocab_target, tokenizer)

opensub_input_padded = pad_sequences(opensub_input_sequences, opensub_vocab_input['<pad>'])
opensub_target_padded = pad_sequences(opensub_target_sequences, opensub_vocab_target['<pad>'])

anki_input_padded = pad_sequences(anki_input_sequences, anki_vocab_input['<pad>'])
anki_target_padded = pad_sequences(anki_target_sequences, anki_vocab_target['<pad>'])


In [6]:
# from collections import Counter
# import torch
# from torch.nn.utils.rnn import pad_sequence
# import re

# # Простой токенизатор на основе регулярных выражений
# def basic_english_tokenizer(text):
#     return re.findall(r'\w+', text.lower())

# # Создание словаря
# def build_vocab(sentences, tokenizer):
#     counter = Counter()
#     for sentence in sentences:
#         counter.update(tokenizer(sentence))
#     vocab = {word: i for i, (word, _) in enumerate(counter.items(), 4)}
#     vocab['<unk>'] = 0
#     vocab['<pad>'] = 1
#     vocab['<bos>'] = 2
#     vocab['<eos>'] = 3
#     return vocab

# # Токенизация предложений с использованием созданного словаря
# def tokenize_sentences(sentences, vocab, tokenizer):
#     return [[vocab['<bos>']] + [vocab.get(token, vocab['<unk>']) for token in tokenizer(sentence)] + [vocab['<eos>']] for sentence in sentences]

# # Паддинг последовательностей
# def pad_sequences(sequences, padding_value):
#     return pad_sequence([torch.tensor(seq) for seq in sequences], padding_value=padding_value, batch_first=True)


# # Создание словарей для обоих наборов данных
# tokenizer = basic_english_tokenizer

# opensub_vocab_input = build_vocab(opensub_input_texts, tokenizer)
# opensub_vocab_target = build_vocab(opensub_target_texts, tokenizer)

# anki_vocab_input = build_vocab(anki_input_texts, tokenizer)
# anki_vocab_target = build_vocab(anki_target_texts, tokenizer)

# # Токенизация данных
# opensub_input_sequences = tokenize_sentences(opensub_input_texts, opensub_vocab_input, tokenizer)
# opensub_target_sequences = tokenize_sentences(opensub_target_texts, opensub_vocab_target, tokenizer)

# anki_input_sequences = tokenize_sentences(anki_input_texts, anki_vocab_input, tokenizer)
# anki_target_sequences = tokenize_sentences(anki_target_texts, anki_vocab_target, tokenizer)

# # Паддинг последовательностей
# opensub_input_padded = pad_sequences(opensub_input_sequences, opensub_vocab_input['<pad>'])
# opensub_target_padded = pad_sequences(opensub_target_sequences, opensub_vocab_target['<pad>'])

# anki_input_padded = pad_sequences(anki_input_sequences, anki_vocab_input['<pad>'])
# anki_target_padded = pad_sequences(anki_target_sequences, anki_vocab_target['<pad>'])


In [None]:
# class Decoder(nn.Module):
#     def __init__(self, output_size, hidden_size):
#         super(Decoder, self).__init__()
#         self.hidden_size = hidden_size
#         self.embedding = nn.Embedding(output_size, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size)
#         self.out = nn.Linear(hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, input, hidden, cell):
#         embedded = self.embedding(input).view(1, 1, -1)  # (1, 1, hidden_size)
#         output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
#         output = self.softmax(self.out(output[0]))
#         return output, hidden, cell

In [70]:
# class Encoder(nn.Module):
#     def __init__(self, input_size, hidden_size):
#         super(Encoder, self).__init__()
#         self.hidden_size = hidden_size
#         self.embedding = nn.Embedding(input_size, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size)

#     def forward(self, input_, hidden, cell):
#         embedded = self.embedding(input_).view(1, 1, -1) # (1, 1, hidden_size)
#         output, (hidden_cell) = self.lstm(embedded, (hidden, cell))
#         return hidden, cell

#     def init_hidden(self):
#         return (torch.zeros(1,1, self.hidden_size), # hidden state
#                 torch.zeros(1,1,self.hidden_size)) # cell state

# class Decoder(nn.Module):
#     def __init__(self, output_size, hidden_state):
#         super(Decoder, self).__init__()
#         self.hidden_size = hidden_size
#         self.embedding = nn.Embedding(output_size, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size)
#         self.out = nn.Linear(hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, input_, hidden, cell):
#         embedded = self.embedding(input_).view(1,1,-1) # (1, 1, hidden_size)
#         output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
#         output = self.softmax(self.out(output[0]))
#         return output, hidden, cell
        
  

# def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length):
#     encoder_hidden, encoder_cell = encoder.init_hidden()
#     encoder_optimizer.zero_grad()
#     decoder_optimizer.zero_grad()
#     input_length = input_tensor.size(0)
#     target_length = target_tensor.size(0)
#     loss = 0
#     for ei in range(input_length):
#         encoder_hidden, encoder_cell = encoder(input_tensor[ei], encoder_hidden, encoder_cell)
#     decoder_input = torch.tensor([[opensub_vocab_target['<bos>']]])
#     decoder_hidden, decoder_cell = encoder_hidden, encoder_cell # init decoder
#     for di in range(target_length):
#         decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)
#         _, topi = decoder_output.topk(1)
#         decoder_input = topi.squeeze().detach()
#         loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
#         if decoder_input.item() == opensub_vocab_target['<eos>']:
#             break
#     loss.backward()
#     encoder_optimizer.step()
#     decoder_optimizer.step()
#     return loss.item() / target_length

# hidden_size = 256
# learning_rate = 0.01
# n_iters = 10_000
# print_every = 1000

# encoder = Encoder(len(opensub_vocab_input), hidden_size)
# decoder = Decoder(len(opensub_vocab_target), hidden_size) 

# encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
# decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
# criterion = nn.NLLLoss()

# for iter_ in range(1, n_iters+1):
#     training_pair = [opensub_input_padded[iter_ % len(opensub_input_padded)],
#                      opensub_target_padded[iter_ % len(opensub_target_padded)]]
#     input_tensor = training_pair[0]
#     target_tensor = training_pair[1]
#     loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=10)
#     if iter_ % print_every == 0:
#         print(f'Iteration {iter}, Loss: {loss:.4f}')

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length):
    encoder_hidden, encoder_cell = encoder.init_hidden()  # Initialize hidden and cell states
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    # Encode input tensor
    for ei in range(input_length):
        encoder_hidden, encoder_cell = encoder(input_tensor[ei], encoder_hidden, encoder_cell)

    decoder_input = torch.tensor([[opensub_vocab_target['<bos>']]])  # Start token
    decoder_hidden, decoder_cell = encoder_hidden, encoder_cell  # Initialize decoder states with encoder states

    # Decode using the encoded hidden state
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)
        _, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # Use predicted token as next input

        loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))  # Calculate loss
        if decoder_input.item() == opensub_vocab_target['<eos>']:  # Stop if EOS token is predicted
            break

    loss.backward()  # Backpropagation

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input_, hidden, cell):
        embedded = self.embedding(input_).view(1, 1, -1)  # (1, 1, hidden_size)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        return hidden, cell

    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_size),  # hidden state
                torch.zeros(1, 1, self.hidden_size))  # cell state


class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_, hidden, cell):
        embedded = self.embedding(input_).view(1, 1, -1)  # (1, 1, hidden_size)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        output = self.softmax(self.out(output[0]))
        return output, hidden, cell

# Parameters
hidden_size = 256
learning_rate = 0.01
n_iters = 10000
print_every = 1000

# Create models
encoder = Encoder(len(opensub_vocab_input), hidden_size)
decoder = Decoder(len(opensub_vocab_target), hidden_size)

# Optimizers and loss function
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

# Training loop
for iter_ in range(1, n_iters + 1):
    training_pair = [opensub_input_padded[iter_ % len(opensub_input_padded)], opensub_target_padded[iter_ % len(opensub_target_padded)]]
    input_tensor = training_pair[0]
    target_tensor = training_pair[1]

    loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=10)

    if iter % print_every == 0:
        print(f'Iteration {iter_}, Loss: {loss:.4f}')


In [None]:
opensub_vocab_target_reverse = {index: word for word, index in opensub_vocab_target.items()}
opensub_vocab_input_reverse = {index: word for word, index in opensub_vocab_input.items()}

def translate_sentence(input_tensor, encoder, decoder, max_length=10):
    with torch.no_grad():
        encoder_hidden, encoder_cell = encoder.init_hidden()
        input_length = input_tensor.size(0)
        for ei in range(input_length):
            encoder_hidden, decoder_cell = encoder(input_tensor[ei], encoder_hidden, encoder_cell)
        decoder_input = torch.tensor([[opensub_vocab_target['<bos>']]])
        decoder_hidden, decoder_cell = encoder_hidden, encoder_cell

        decoder_words = []
        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_cell = decoder(decoder(decoder_input, decoder_hidden, decoder_cell)
            _, topi = decoder_output.topk(1)
            if topi.item() == opensub_vocab_target['<eos>']:
                break
            else:
                decoded_words.append(opensub_vocab_target_reverse[topi.item()])
        return decoded_words
                                                                   
        

In [21]:
# Создаем обратный словарь для целевого языка
opensub_vocab_target_reverse = {index: word for word, index in opensub_vocab_target.items()}
opensub_vocab_input_reverse = {index: word for word, index in opensub_vocab_input.items()}


# Функция для выполнения перевода одного предложения
def translate_sentence(input_tensor, encoder, decoder, max_length=10):
    with torch.no_grad():
        encoder_hidden, encoder_cell = encoder.init_hidden()

        input_length = input_tensor.size(0)
        
        for ei in range(input_length):
            encoder_hidden, encoder_cell = encoder(input_tensor[ei], encoder_hidden, encoder_cell)

        decoder_input = torch.tensor([[opensub_vocab_target['<bos>']]])
        decoder_hidden, decoder_cell = encoder_hidden, encoder_cell

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)
            topv, topi = decoder_output.topk(1)
            if topi.item() == opensub_vocab_target['<eos>']:
                break
            else:
                decoded_words.append(opensub_vocab_target_reverse[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

# Пример для предложений разной длины
sentence_lengths = [3, 6, 10]
for length in sentence_lengths:
    input_sentence = opensub_input_padded[length]  # Предложение из определенного количества слов
    translated_sentence = translate_sentence(input_sentence, encoder, decoder, max_length=length)
    print(f'Original sentence of {length} words: {" ".join([opensub_vocab_input_reverse[word.item()] for word in input_sentence])}')
    print(f'Translated sentence: {" ".join(translated_sentence)}')
    print()


Original sentence of 3 words: <bos> lt would be fought here ln our present <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Translated sentence: <bos> он

Original sentence of 6 words: <bos> what the hell <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

In [23]:
def get_word_vector_lstm(word, encoder, vocab, device='cpu'):
    # Преобразование слова в тензор
    input_tensor = torch.tensor([vocab[word]]).to(device)
    
    # Получение эмбеддинга слова
    embedded = encoder.embedding(input_tensor)
    
    # Инициализация скрытых состояний
    hidden, cell = encoder.init_hidden()
    
    # Прямой проход через LSTM
    embedded = embedded.view(1, 1, -1)  # (1, 1, hidden_size) чтобы соответствовать формату LSTM
    
    # Проверка и исправление размерностей для LSTM
    if hidden.dim() != 3:
        hidden = hidden.unsqueeze(0)  # Приводим к размерности (num_layers, batch_size, hidden_size)
    if cell.dim() != 3:
        cell = cell.unsqueeze(0)  # Приводим к размерности (num_layers, batch_size, hidden_size)
    
    output, (hidden, cell) = encoder.lstm(embedded, (hidden, cell))
    
    # Возвращаем последнее скрытое состояние
    return hidden[-1]  # Возвращаем последнюю скрытую вектору


vocab = opensub_vocab_input  # Например, словарь для кодировщика
word = 'happy'

# Получение векторов слов
vector_lstm = get_word_vector_lstm(word, encoder, vocab)

print(f"LSTM vector for '{word}': {vector_lstm}")

LSTM vector for 'happy': tensor([[ 2.3426e-03, -4.0188e-02,  1.4073e-02, -1.3712e-01, -1.8990e-01,
         -1.5245e-01,  2.8612e-02,  5.2273e-02, -2.2459e-02, -2.6216e-02,
         -2.6487e-01,  2.6727e-02, -3.0794e-01,  7.8054e-02,  3.4654e-02,
         -1.2907e-01, -9.2686e-02,  1.2740e-01, -1.0494e-01,  2.7748e-02,
          1.1791e-01, -1.5569e-01,  1.3155e-01,  9.1160e-02,  4.3471e-02,
         -4.5930e-02, -2.2109e-01,  5.1826e-02,  7.6673e-02,  1.0606e-01,
         -1.5880e-01, -1.3279e-01,  6.8736e-02,  1.7262e-02, -1.3951e-01,
         -7.9645e-02, -7.7770e-02, -6.6938e-02, -6.5604e-02, -7.3534e-02,
          1.1132e-01,  1.4896e-04, -1.4683e-01, -1.0186e-01, -8.8630e-02,
          1.5370e-01, -1.6466e-01,  2.8312e-02,  1.8305e-01, -6.5646e-02,
          4.0294e-02,  1.0834e-01,  1.3756e-02,  1.6939e-01, -3.9089e-02,
         -5.6647e-02, -5.4708e-02,  2.0035e-01, -1.5366e-01,  9.9880e-02,
          1.8550e-01,  5.2398e-02,  1.0620e-01,  1.6688e-01,  1.1317e-01,
          1.2