In [1]:
from io import open
import unicodedata
import string
import re
import random
from tqdm import tqdm

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### 1. Загрузка и преобразование данных

In [4]:
# Класс для создания словаря
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
# Изменение кодировка текста на ASCII
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

# Приведение к нижнему регистру, удаление лишних символов
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^а-яА-Вa-zA-Z.!?]+", r" ", s)
    return s

In [6]:
# Чтение файла. Создание пар фраз
def readLangs(lang1, lang2, reverse = False):
    
    # Read the file and split into lines
    lines = open('D:/НЕТОЛОГИЯ/Deep ML/файлы/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')[0:2]] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [7]:
# Отбор предложений длиной 30
MAX_LENGTH = 30

def filterPair(p):
    return len(p[0]) < MAX_LENGTH and len(p[1]) < MAX_LENGTH 

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [8]:
# Формирование итогового датасета
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))
    #pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, filterPairs(pairs)

input_lang, output_lang, pairs = prepareData('eng', 'rus', True)
print(random.choice(pairs))

Read 519900 sentence pairs
Trimmed to 519900 sentence pairs
Counting words...
Counted words:
eng 17542
rus 60040
['is this your purse ?', 'это ваша сумочка ?']


### 2. Функции для обучения и теста

In [9]:
# Encoder
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers = num_layers)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output) # правка для 2-х слойной GRU
        output, hidden = self.gru(output, hidden) 
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [10]:
# Decoder
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers = num_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output) # правка для 2-х слойной GRU
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [11]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [12]:
#Обучение
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length = 30):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device = device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [13]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [14]:
def trainIters(encoder, decoder, n_iters, print_every = 1000, plot_every = 100, learning_rate = 0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    
    criterion = nn.NLLLoss()
        
    for iter in tqdm(range(1, n_iters + 1)):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [15]:
# Функция для визуализации лосса
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [21]:
# Тест
def evaluate(encoder, decoder, sentence, max_length = MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [22]:
# Функция для грубой оценки точности
def evaluateRandomly(encoder, decoder, n=10):
    acc = 0
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        if output_words == pair[1]:
            acc += 1
            print('полное совпадение')
        print('')

### 3. GRU с 1 слоем

In [167]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size, num_layers = 1).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words, num_layers = 1).to(device)

In [184]:
# Обучение
trainIters(encoder1, decoder1, n_iters = 50000, print_every = 5000)

 10%|█         | 5006/50000 [01:39<14:58, 50.07it/s]

1m 46s (- 15m 55s) (5000 10%) 2.4141


 20%|██        | 10006/50000 [03:19<14:00, 47.60it/s]

3m 25s (- 13m 42s) (10000 20%) 2.3924


 30%|███       | 15009/50000 [04:58<11:40, 49.99it/s]

5m 5s (- 11m 52s) (15000 30%) 2.3592


 40%|████      | 20005/50000 [06:38<10:23, 48.08it/s]

6m 45s (- 10m 7s) (20000 40%) 2.3574


 50%|█████     | 25007/50000 [08:19<08:20, 49.96it/s]

8m 25s (- 8m 25s) (25000 50%) 2.3350


 60%|██████    | 30008/50000 [09:59<06:25, 51.83it/s]

10m 6s (- 6m 44s) (30000 60%) 2.2958


 70%|███████   | 35009/50000 [11:39<04:51, 51.49it/s]

11m 45s (- 5m 2s) (35000 70%) 2.2839


 80%|████████  | 40007/50000 [13:19<03:06, 53.61it/s]

13m 26s (- 3m 21s) (40000 80%) 2.2666


 90%|█████████ | 45006/50000 [15:01<01:41, 49.37it/s]

15m 7s (- 1m 40s) (45000 90%) 2.2054


100%|██████████| 50000/50000 [16:41<00:00, 49.93it/s]


16m 47s (- 0m 0s) (50000 100%) 2.2149


In [191]:
# Оценка качества
evaluateRandomly(encoder1, decoder1, n = 10)

> you need to get started .
= вам надо начинать .
< тебе надо к . <EOS>

> this isn t paper .
= это не бумага .
< это не . . <EOS>

> i wish that wasn t the case .
= лучше б оно было не так .
< на было бы и не . <EOS>

> tom is crying in his room .
= том плачет в своеи комнате .
< том в в в в комнате . <EOS>

> you ve lost your mind .
= вы с ума сошли .
< ты потерял свое . <EOS>

> was tom ever married ?
= том когда нибудь был женат ?
< том когда нибудь женат ? <EOS>

> you re being hunted .
= на вас охотятся .
< ты не . <EOS>

> you re not very good .
= ты не слишком хорош .
< ты не очень хорошо . <EOS>

> take this brochure .
= возьми эту брошюру .
< возьмите эту . <EOS>

> i ll come home at ten .
= я приду домои в десять .
< я вернусь в в . . <EOS>



### 4. GRU с 2 слоями

In [18]:
hidden_size = 256
encoder2 = EncoderRNN(input_lang.n_words, hidden_size, num_layers = 2).to(device)
decoder2 = DecoderRNN(hidden_size, output_lang.n_words, num_layers = 2).to(device)

In [19]:
# Обучение
trainIters(encoder2, decoder2, n_iters = 50000, print_every = 5000)

 10%|█         | 5008/50000 [01:37<15:57, 46.98it/s]

1m 44s (- 15m 42s) (5000 10%) 4.9985


 20%|██        | 10009/50000 [03:18<13:12, 50.47it/s]

3m 25s (- 13m 40s) (10000 20%) 4.8544


 30%|███       | 15006/50000 [04:58<11:38, 50.07it/s]

5m 5s (- 11m 53s) (15000 30%) 4.8120


 40%|████      | 20005/50000 [06:39<10:02, 49.79it/s]

6m 46s (- 10m 10s) (20000 40%) 4.7132


 50%|█████     | 25009/50000 [08:20<08:16, 50.29it/s]

8m 27s (- 8m 27s) (25000 50%) 4.7213


 60%|██████    | 30009/50000 [10:01<06:36, 50.46it/s]

10m 8s (- 6m 45s) (30000 60%) 4.6603


 70%|███████   | 35007/50000 [11:40<04:52, 51.26it/s]

11m 48s (- 5m 3s) (35000 70%) 4.6481


 80%|████████  | 40005/50000 [13:20<03:19, 50.17it/s]

13m 27s (- 3m 21s) (40000 80%) 4.6866


 90%|█████████ | 45009/50000 [14:59<01:35, 52.17it/s]

15m 7s (- 1m 40s) (45000 90%) 4.6272


100%|██████████| 50000/50000 [16:39<00:00, 50.05it/s]


16m 46s (- 0m 0s) (50000 100%) 4.6218


In [24]:
# Оценка качества
evaluateRandomly(encoder2, decoder2, n = 10)

> i think tom will like you .
= думаю ты понравишься тому .
< я не <EOS>

> tom was killed on monday .
= тома убили в понедельник .
< я не <EOS>

> that isn t the same .
= это не то же самое .
< я не <EOS>

> i won t forget .
= не забуду .
< я не <EOS>

> i wanted to die .
= я хотел умереть .
< я не <EOS>

> the movie ran for two hours .
= фильм шел два часа .
< я не <EOS>

> is it hard to milk a goat ?
= трудно ли доить козу ?
< я не <EOS>

> guess whose house it is .
= угадаи чеи это дом .
< я не <EOS>

> it s poison .
= это яд .
< я не <EOS>

> what is this thing for ?
= зачем нужна эта вещь ?
< я не <EOS>



### 5. LSTM

In [35]:
# Encoder для LSTM
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.lstm(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device = device)

In [36]:
# Decoder для LSTM
class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.lstm(output)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [38]:
hidden_size = 256
encoder3 = EncoderLSTM(input_lang.n_words, hidden_size).to(device)
decoder3 = DecoderLSTM(hidden_size, output_lang.n_words).to(device)

In [39]:
# Обучение
trainIters(encoder3, decoder3, n_iters = 50000, print_every = 5000)

 10%|█         | 5011/50000 [01:20<13:00, 57.65it/s]

1m 27s (- 13m 5s) (5000 10%) 5.2503


 20%|██        | 10010/50000 [02:44<10:57, 60.84it/s]

2m 50s (- 11m 23s) (10000 20%) 4.8690


 30%|███       | 15008/50000 [04:11<09:22, 62.25it/s]

4m 18s (- 10m 3s) (15000 30%) 4.8126


 40%|████      | 20009/50000 [05:37<08:47, 56.88it/s]

5m 44s (- 8m 36s) (20000 40%) 4.7352


 50%|█████     | 25006/50000 [07:03<07:05, 58.72it/s]

7m 10s (- 7m 10s) (25000 50%) 4.6817


 60%|██████    | 30007/50000 [08:26<05:14, 63.62it/s]

8m 33s (- 5m 42s) (30000 60%) 4.6826


 70%|███████   | 35008/50000 [09:49<04:09, 60.17it/s]

9m 56s (- 4m 15s) (35000 70%) 4.6613


 80%|████████  | 40012/50000 [11:14<02:43, 60.97it/s]

11m 20s (- 2m 50s) (40000 80%) 4.6579


 90%|█████████ | 45009/50000 [12:38<01:35, 52.08it/s]

12m 45s (- 1m 25s) (45000 90%) 4.6413


100%|██████████| 50000/50000 [14:05<00:00, 59.12it/s]


14m 12s (- 0m 0s) (50000 100%) 4.6300


In [40]:
# Оценка качества
evaluateRandomly(encoder3, decoder3, n = 10)

> you re a big liar .
= ты большои лгун .
< я не . <EOS>

> are you still in college ?
= ты еще в колледже ?
< я не . <EOS>

> i m really old .
= я очень старая .
< я не . <EOS>

> you don t need to do that .
= тебе незачем это делать .
< я не . <EOS>

> tom likes ponies .
= том любит пони .
< я не . <EOS>

> nobody told us either .
= нам тоже никто не сказал .
< я не . <EOS>

> there weren t any survivors .
= никто не выжил .
< я не . <EOS>

> judge for yourself .
= сам посуди .
< я не . <EOS>

> that s mine . give it back .
= это мое . отдаи .
< я не . <EOS>

> we re very happy .
= мы очень счастливы .
< я не . <EOS>

