In [1]:
!mkdir data
!wget http://www.manythings.org/anki/rus-eng.zip -O 'data/rus-eng.zip'
!wget https://object.pouta.csc.fi/OPUS-OpenSubtitles/v1/moses/en-ru.txt.zip -O 'data/en-ru.txt.zip'

mkdir: data: File exists
--2024-09-12 08:20:52--  http://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16305013 (16M) [application/zip]
Saving to: ‘data/rus-eng.zip’


2024-09-12 08:21:02 (1.61 MB/s) - ‘data/rus-eng.zip’ saved [16305013/16305013]

--2024-09-12 08:21:02--  https://object.pouta.csc.fi/OPUS-OpenSubtitles/v1/moses/en-ru.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 610036 (596K) [application/zip]
Saving to: ‘data/en-ru.txt.zip’


2024-09-12 08:21:06 (183 KB/s) - ‘data/en-ru.txt.zip’ saved [610036/610036]



In [2]:
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd
import zipfile
from collections import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
import re

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# dataset_anki_path = 'data/rus-eng.zip'
# with zipfile.ZipFile(dataset_anki_path, 'r') as zip_ref:
#     zip_ref.extractall('anki_data')
# dataset_opensub_path = 'data/en-ru.txt.zip'
# with zipfile.ZipFile(dataset_opensub_path, 'r') as zip_ref:
#     zip_ref.extractall('opensubtitles_data')

In [4]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) \
        if unicodedata.category(c) != 'Mn')

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Zа-яёъА-ЯЁЪ.!?]+", r" ", s)
    return s


In [6]:
def load_opensubtitles_data(file_path_ru, file_path_en, num_samples=100_000):
    input_texts = []
    with open(file_path_en, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines[:num_samples]:
            en_text = line.strip()
            input_texts.append(normalizeString(en_text))
    target_texts = []
    with open(file_path_ru, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines[:num_samples]:
            ru_text = line.strip()
            target_texts.append(normalizeString(ru_text))
    return input_texts, target_texts

def load_anki_data(file_path, num_samples=100_000):
    input_texts = []
    target_texts = []
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines[:num_samples]:
            en_text, ru_text = line.strip().split('\t')[:-1]
            input_texts.append(normalizeString(en_text))
            target_texts.append(normalizeString(ru_text)) 
    return input_texts, target_texts

In [7]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # lines = open('anki_data/rus.txt', encoding='utf-8').\
    #     read().strip().split('\n')

    # pairs = [[normalizeString(s) for s in l.split('\t')][0:2] for l in lines]
    anki_input_texts, anki_target_texts = load_anki_data('anki_data/rus.txt')
    opensub_input_texts, opensub_target_texts = load_opensubtitles_data('opensubtitles_data/OpenSubtitles.en-ru.ru',
                                                                 'opensubtitles_data/OpenSubtitles.en-ru.en')
    # pairs = list(zip(anki_input_texts, anki_target_texts)) 
    pairs = list(zip(opensub_input_texts, opensub_target_texts))
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [8]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH  and \
        p[0].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [9]:
MAX_LENGTH = 10


def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    pairs = filterPairs(pairs)
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('eng', 'rus', False)
print(random.choice(pairs))

Reading lines...
Counted words:
eng 999
rus 1379
('i m not gonna hurt anyone .', 'я никого не трону !')


In [10]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size,num_layers=1):
        super(EncoderRNN, self).__init__()
        self.num_layers=num_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size,num_layers)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.rnn(output, hidden)
        return output, hidden

    def initHidden(self):
        if str(self.rnn)[:4] == 'LSTM':
            return (torch.zeros(self.num_layers, 1, self.hidden_size, device=device)
                ,torch.zeros(self.num_layers, 1, self.hidden_size, device=device))
        else:
            return torch.zeros(self.num_layers, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size,num_layers=1):
        super(DecoderRNN, self).__init__()
        self.num_layers=num_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size,num_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        if str(self.rnn)[:4] == 'LSTM':
            return (torch.zeros(self.num_layers, 1, self.hidden_size, device=device)
                ,torch.zeros(self.num_layers, 1, self.hidden_size, device=device))
        else:
            return torch.zeros(self.num_layers, 1, self.hidden_size, device=device)

In [11]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


In [12]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    # Encode input tensor

    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # Decode using the encoded hidden state
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
    else:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            _, topi = decoder_output.topk(1)

            decoder_input = topi.squeeze().detach()
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length
    



In [13]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [14]:
def trainIters(encoder, decoder, n_iters, print_every=1000,
               learning_rate=0.01):
    start = time.time()
    print_time = time.time()
    print_iter = 0
    print_loss_total = 0
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()
    for iter_ in range(1, n_iters + 1):
        training_pair = training_pairs[iter_ - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        print_iter += 1

        if iter_ % 100 == 0 : 
            if (time.time() - print_time > 30) or iter_ == n_iters:
                print_time = time.time()
                print_loss_avg = print_loss_total / print_iter
                print_iter = 0
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter_ / n_iters),
                                             iter_, iter_ / n_iters * 100, print_loss_avg))
        
            
                
        


In [15]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size,1).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words,1).to(device)
trainIters(encoder1, decoder1, 100_000, print_every=10000)

0m 30s (- 63m 15s) (800 0%) 4.0931
1m 2s (- 60m 1s) (1700 1%) 4.0058
1m 33s (- 58m 12s) (2600 2%) 3.8576
2m 3s (- 56m 55s) (3500 3%) 3.6404
2m 34s (- 56m 7s) (4400 4%) 3.6211
3m 6s (- 55m 30s) (5300 5%) 3.3818
3m 37s (- 54m 47s) (6200 6%) 3.3225
4m 8s (- 54m 17s) (7100 7%) 3.0963
4m 40s (- 54m 27s) (7900 7%) 3.0186
5m 10s (- 54m 23s) (8700 8%) 2.7887
5m 41s (- 54m 12s) (9500 9%) 2.5852
6m 14s (- 53m 48s) (10400 10%) 2.3835
6m 47s (- 53m 21s) (11300 11%) 2.1350
7m 20s (- 52m 51s) (12200 12%) 1.9401
7m 51s (- 52m 34s) (13000 13%) 1.7572
8m 24s (- 52m 3s) (13900 13%) 1.5513
8m 58s (- 51m 37s) (14800 14%) 1.3746
9m 31s (- 51m 8s) (15700 15%) 1.1503
10m 4s (- 50m 56s) (16500 16%) 0.9757
10m 35s (- 50m 39s) (17300 17%) 0.8460
11m 8s (- 50m 22s) (18100 18%) 0.7226
11m 39s (- 50m 2s) (18900 18%) 0.6368
12m 13s (- 49m 48s) (19700 19%) 0.5571
12m 45s (- 49m 26s) (20500 20%) 0.4512
13m 16s (- 49m 3s) (21300 21%) 0.3981
13m 50s (- 48m 47s) (22100 22%) 0.3546
14m 23s (- 48m 26s) (22900 22%) 0.2846


In [25]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]
        decoder_input = torch.tensor([[SOS_token]], device=device)  
        decoder_hidden = encoder_hidden
        decoded_words = []
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            _, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])
            decoder_input = topi.squeeze().detach()
        return decoded_words
            
def evaluateRandomly(encoder, decoder, l, n=3):
    pairs_ = [pair for pair in pairs if len(pair[0].split()) == l]
    for i in range(n):
        
        
        pair = random.choice(pairs_)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words[1:-1])
        print('<', output_sentence)
        print('')            
            
        
        


In [26]:
# 3 words
evaluateRandomly(encoder1, decoder1, 5)

> i m diane selwyn .
= меня зовут даиян селвин .
< зовут даиян селвин .

> i m not moving .
= я никуда не поиду .
< никуда не поиду .

> she s gonna fry .
= она просто сгорит .
< просто сгорит .



In [27]:
# 6 words
evaluateRandomly(encoder1, decoder1,7)

> you re a prince of troy .
= ты принц трои .
< принц трои .

> you re using her as bait .
= ты используешь ее в качестве приманки ?
< используешь ее в качестве приманки ?

> i m gonna write it down .
= нужно все это записать .
< все это записать .



In [28]:
# 7 words
evaluateRandomly(encoder1, decoder1,8)

> you re in prison aren t you ?
= ты же в темнице !
< же в темнице !

> you re playinq a danqerous qame here .
= ты затеял опасную игру .
< затеял опасную игру .

> she s been drinking too much coppertone .
=  ты мне это говоришь ?
< ты мне это говоришь ?



In [34]:
def get_word_vector(word, encoder, decoder, vocab, device='cpu'):
    input_tensor = torch.tensor([[vocab[word]]], device=device)
    
    encoder_hidden = encoder.initHidden()
    encoder_output, encoder_hidden = encoder(input_tensor[0], encoder_hidden)
    
    return encoder_hidden[0].squeeze(0)

def evaluate_word_pairs(encoder, decoder, vocab, synonym_pairs, antonym_pairs, device='cpu'):
    results = []

    def cosine_similarity_torch(vec1, vec2):
        cos = F.cosine_similarity(vec1, vec2)
        return cos.item()

    for word1, word2 in synonym_pairs:
        vec1 = get_word_vector(word1, encoder, decoder, vocab, device)
        vec2 = get_word_vector(word2, encoder, decoder, vocab, device)
        cosine_sim = cosine_similarity_torch(vec1, vec2)
        results.append([word1, word2, cosine_sim, "Синонимы"])

    for word1, word2 in antonym_pairs:
        vec1 = get_word_vector(word1, encoder, decoder, vocab, device)
        vec2 = get_word_vector(word2, encoder, decoder, vocab, device)
        cosine_sim = cosine_similarity_torch(vec1, vec2)
        results.append([word1, word2, cosine_sim, "Антонимы"])

    df = pd.DataFrame(results, columns=["Слово 1", "Слово 2", "Косинусное расстояние", "Тип"])
    return df


synonym_pairs = [("страшно", "ужасно")]
antonym_pairs = [("страшно", "уверен")]

df_results = evaluate_word_pairs(encoder1, decoder1, output_lang.word2index, synonym_pairs, antonym_pairs)

print(df_results)


   Слово 1 Слово 2  Косинусное расстояние       Тип
0  страшно  ужасно              -0.165247  Синонимы
1  страшно  уверен               0.040088  Антонимы
