In [None]:
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [None]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip

--2023-02-18 16:44:20--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 18.65.3.63, 18.65.3.71, 18.65.3.38, ...
Connecting to download.pytorch.org (download.pytorch.org)|18.65.3.63|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip.1’


2023-02-18 16:44:21 (19.6 MB/s) - ‘data.zip.1’ saved [2882130/2882130]

Archive:  data.zip
replace data/eng-fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Peek at the end of the corpus: each line holds an English sentence and its
# French translation separated by a tab.
!tail data/eng-fra.txt

No matter how much you try to convince people that chocolate is vanilla, it'll still be chocolate, even though you may manage to convince yourself and a few others that it's vanilla.	Peu importe le temps que tu passeras à essayer de convaincre les gens que le chocolat est de la vanille, ça restera toujours du chocolat, même si tu réussis à convaincre toi et quelques autres que c'est de la vanille.
A child who is a native speaker usually knows many things about his or her language that a non-native speaker who has been studying for years still does not know and perhaps will never know.	Un enfant qui est un locuteur natif connaît habituellement de nombreuses choses sur son langage qu'un locuteur non-natif qui a étudié pendant des années ignore encore et peut-être ne saura jamais.
There are four main causes of alcohol-related death. Injury from car accidents or violence is one. Diseases like cirrhosis of the liver, cancer, heart and blood system diseases are the others.	Il y a quatre caus

# **1 словарь языка**

In [None]:
# Reserved vocabulary indices for the sentence boundary markers.
SOS_token = 0  # start-of-sentence: the first input fed to the decoder
EOS_token = 1  # end-of-sentence: appended to every sentence turned into a tensor

# Vocabulary object used to work with one language
class Lang:
    """Word <-> index vocabulary for a single language.

    Attributes are plain dictionaries so they can be inspected directly:
    ``word2index`` (word -> index), ``index2word`` (index -> word),
    ``word2count`` (word -> number of occurrences seen) and ``n_words``
    (vocabulary size, which is also the next free index).
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # Indices 0 and 1 are reserved for the sentence boundary markers.
        self.index2word = {0: "SOS", 1: "EOS"}
        # Starts at 2 because SOS and EOS are already counted.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every word of ``sentence``.

        The sentence is split on single spaces, so leading, trailing or doubled
        spaces produce empty-string "words".
        """
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        # New word: give it the next free index and record both directions,
        # e.g. word2index['umbrella'] = 239 and index2word[239] = 'umbrella'.
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

# **2 функция с кодировкой unicodeToAscii(s):**
# **3 функция, которая приводит в порядок текст normalizeString(s):**

In [None]:
# Strip accents from a Unicode string, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` with combining marks (accents) removed.

    The string is decomposed with NFD so that an accented letter becomes a base
    letter followed by combining marks; characters of category 'Mn' are then
    dropped. Characters without such a decomposition are kept unchanged, so the
    result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Normalise a raw sentence into space-separated lowercase tokens.

    Steps: lowercase and trim, strip accents, put a space in front of each
    '.', '!' or '?', and collapse every run of other non-letter characters
    into a single space.

    Returns:
        The normalised sentence without leading or trailing spaces.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # The substitutions above can leave a space at either end (e.g. when the
    # sentence starts or ends with a quote or digit). Trim again so that
    # splitting on ' ' never yields empty-string tokens and prefix checks
    # see the first real word.
    return s.strip()

# **4 функция, которая читает наш словарь:**

In [None]:
def readLangs(lang1, lang2, reverse=False):
    """Read the parallel corpus ``data/<lang1>-<lang2>.txt`` and normalise it.

    Each line of the file is expected to hold tab-separated sentences
    (lang1 first, then lang2); every field is passed through normalizeString.

    Args:
        lang1: name of the language in the first column of the file.
        lang2: name of the language in the second column of the file.
        reverse: when True, swap the order inside every pair and swap the
            roles of the two languages.

    Returns:
        (input_lang, output_lang, pairs): two empty Lang vocabularies and the
        list of normalised sentence pairs.
    """
    print("Reading lines...")

    # Read the file and split it into lines; the context manager makes sure
    # the file handle is closed instead of being left to the garbage collector.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')

    # Split every line into its tab-separated sentences and normalise each one.
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Optionally swap the languages (each pair is reversed in place of a copy).
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

# **5 Сокращаем текст (способ: указываем слова с которых должны начинаться предложения eng_prefixes)**

In [None]:
# Sentences with MAX_LENGTH or more space-separated tokens are dropped by filterPair.
MAX_LENGTH = 10

# Only pairs whose English sentence starts with one of these prefixes are kept.
# Sentences are normalised first, so an apostrophe shows up as a space
# ("she's" -> "she s"). The contraction prefixes end with a space so that
# "she s " matches "she s happy" but not "she speaks" or "she said".
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    """Tell whether the sentence pair ``p`` should be kept.

    A pair is kept when both sentences have fewer than MAX_LENGTH
    space-separated tokens and the second sentence starts with one of
    eng_prefixes.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)

def filterPairs(pairs):
    """Return a new list with only the pairs accepted by filterPair, in order."""
    return list(filter(filterPair, pairs))

# **6 Функция считает наши словари**

In [None]:
# Reads the corpus, filters the pairs, reports how many remain, adds the
# remaining sentences to the matching vocabularies and prints their sizes.
def prepareData(lang1, lang2, reverse=False):
    """Load the corpus and build both vocabularies.

    Args:
        lang1, lang2, reverse: forwarded unchanged to readLangs.

    Returns:
        (input_lang, output_lang, pairs) where ``pairs`` holds only the pairs
        accepted by filterPairs and both Lang objects contain their words.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))

    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    for kept in pairs:
        input_lang.addSentence(kept[0])
        output_lang.addSentence(kept[1])

    print("Counted words:")
    for vocabulary in (input_lang, output_lang):
        print(vocabulary.name, vocabulary.n_words)

    return input_lang, output_lang, pairs


# reverse=True swaps every pair, so French becomes the input language and
# English the output language.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
# Show one random [input, output] pair as a sanity check.
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10853 sentence pairs
Counting words...
Counted words:
fra 4489
eng 2925
['il a encore des doutes .', 'he is still having doubts .']


In [None]:
# Size of the input vocabulary; the count includes the SOS and EOS entries.
input_lang.n_words

4489

In [None]:
# print(input_lang.name, input_lang.n_words) == fra 4489
# hidden_size = 256
# EncoderRNN(input_lang.n_words, hidden_size) == EncoderRNN(4489, 256)

In [None]:
# nn.Embedding(input_size, hidden_size)
# Scratch example: a lookup table with 256 rows of 256 features each; the cell
# only displays the layer's repr.
nn.Embedding(256, 256)

Embedding(256, 256)

In [None]:
# nn.GRU(hidden_size, hidden_size)
# Scratch example: a GRU whose input width and hidden width are both 256; the
# cell only displays the layer's repr.
nn.GRU(256, 256)

GRU(256, 256)

In [None]:
# An embedding layer is called with a tensor of integer indices (not a Python
# int), and every index must be smaller than the number of rows, so 255 is the
# largest valid index for a 256-row table.
# The (1, 256) lookup result is reshaped to (1, 1, 256), the same reshape the
# encoder applies before calling its GRU.
nn.Embedding(256, 256)(torch.tensor([255])).view(1, 1, -1)

# **7 The Encoder** (прогоняет наши данные через модель)
-----------





In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call.

    Args:
        input_size: number of rows in the embedding table (the notebook passes
            the input vocabulary size, input_lang.n_words).
        hidden_size: width of the embedding vectors and of the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU input width, so an embedded token
        # can be fed to the GRU without any projection.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode a single token.

        Args:
            input: tensor holding one token index.
            hidden: previous hidden state, shape (1, 1, hidden_size).

        Returns:
            (output, hidden) exactly as returned by the GRU for this one step.
        """
        # Reshape the looked-up vector to (1, 1, hidden_size) before the GRU.
        token_vector = self.embedding(input).view(1, 1, -1)
        return self.gru(token_vector, hidden)

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)
In [None]:
# Tensor shapes inside EncoderRNN.forward for hidden_size = 256. These lines
# were pasted debug output, which is not valid Python, so they are kept as
# comments to let the notebook run top to bottom.
#   embedded torch.Size([1, 1, 256])
#   output, hidden torch.Size([1, 1, 256]), torch.Size([1, 1, 256])

#  **8 The Decoder**
-----------




In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder that predicts one output token per call.

    Args:
        hidden_size: width of the embedding vectors and of the GRU state.
        output_size: number of output classes (the notebook passes the output
            vocabulary size, output_lang.n_words).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Projects the GRU output onto one score per output word.
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: tensor holding the index of the previous output token.
            hidden: previous hidden state, shape (1, 1, hidden_size).

        Returns:
            (log_probs, hidden): log-probabilities of shape (1, output_size)
            and the updated hidden state.
        """
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        gru_output, hidden = self.gru(embedded, hidden)
        # gru_output[0] drops the leading length-1 dimension, leaving the
        # (1, hidden_size) matrix that the linear layer and LogSoftmax(dim=1) expect.
        log_probs = self.softmax(self.out(gru_output[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

# **9 Функции которые переводят наши пары в тензоры:**

# !!!!!!!!!!в этом уроке каждый индекс - это слово!!!!!!!!!

In [None]:
# Lang defines no __repr__, so this cell only shows the default object repr.
input_lang

<__main__.Lang at 0x7f8c9a9fdfd0>

In [None]:
# pairs[0] is an [input sentence, output sentence] list, so take its first
# element before splitting; calling .split on the list itself raised
# AttributeError.
[input_lang.word2index[i] for i in pairs[0][0].split(' ')]

AttributeError: ignored

In [None]:
# Turn a sentence into the list of its word indices
def indexesFromSentence(lang, sentence):
    """Look up every space-separated word of ``sentence`` in ``lang.word2index``.

    Raises:
        KeyError: if a word is not in the vocabulary.
    """
    indexes = []
    for word in sentence.split(' '):
        indexes.append(lang.word2index[word])
    return indexes

# Builds on indexesFromSentence
def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending with EOS.

    Returns:
        A torch.long tensor of shape (number of words + 1, 1) on ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

# Builds on tensorFromSentence
def tensorsFromPair(pair):
    """Encode a sentence pair with the module-level vocabularies.

    ``pair[0]`` is encoded with input_lang and ``pair[1]`` with output_lang.

    Returns:
        (input_tensor, target_tensor)
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

# **10 функция train обучает модель на одном предложении:**

In [None]:
# Probability that train() feeds the ground-truth token, rather than the
# decoder's own prediction, as the next decoder input for a given sentence.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    Args:
        input_tensor: input token indices, indexed along dimension 0
            (as produced by tensorsFromPair).
        target_tensor: target token indices, indexed along dimension 0.
        encoder, decoder: the modules to train.
        encoder_optimizer, decoder_optimizer: optimizers for the two modules.
        criterion: loss applied to each decoder output and target token.
        max_length: no longer used; kept so existing calls that pass it
            continue to work.

    Returns:
        The accumulated loss divided by the target length, as a Python float.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    # Only the final hidden state is passed on to the decoder. The per-step
    # encoder outputs are not read by anything, so they are not stored; storing
    # them in a max_length-sized buffer made longer inputs fail with IndexError.
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    # Decide once per sentence whether to use teacher forcing.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Teacher forcing: feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]

    else:
        # Without teacher forcing: use the decoder's own prediction as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            _, topi = decoder_output.topk(1)
            # detach so no gradient flows back through the chosen index
            decoder_input = topi.squeeze().detach()

            loss += criterion(decoder_output, target_tensor[di])
            # Stop as soon as the decoder predicts end-of-sentence.
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

# **11 функции работы со временем**

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration of ``s`` seconds as '<minutes>m <seconds>s'.

    Both parts are rendered with %d, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report elapsed time and the estimated time remaining.

    Args:
        since: start timestamp as returned by time.time().
        percent: fraction of the work already done; must be non-zero because
            the total duration is estimated as elapsed / percent.

    Returns:
        A string of the form '<elapsed> (- <remaining>)'.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

# **12 функция, которая обучает нашу модель на всех предложениях**

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train the encoder/decoder on ``n_iters`` randomly drawn sentence pairs.

    Args:
        encoder, decoder: modules to train.
        n_iters: number of single-pair training steps.
        print_every: print the running average loss every this many steps.
        plot_every: record an average loss for the plot every this many steps.
        learning_rate: learning rate of both SGD optimizers.

    The recorded averages are passed to showPlot once training ends.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every steps
    plot_loss_total = 0  # reset every plot_every steps

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All training pairs are sampled (with replacement) and encoded up front.
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

# **13 отрисовывается график lossов**

In [None]:
import matplotlib.pyplot as plt
# NOTE(review): 'agg' is a non-interactive backend; switching to it here
# presumably overrides the `%matplotlib inline` setting from the first cell, in
# which case showPlot() figures would not be displayed in the notebook.
# Confirm, and drop this line if inline plots are wanted.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Draw ``points`` (the recorded loss averages) as a line plot.

    Args:
        points: sequence of numbers, plotted against their position.
    """
    # plt.subplots() creates the figure itself; calling plt.figure() first only
    # left an extra, empty figure behind on every call.
    fig, ax = plt.subplots()
    # this locator puts y-axis ticks at regular intervals of 0.2
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

# **14 функция, которая будет генерировать предложение**

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Translate ``sentence`` by greedy decoding.

    Args:
        encoder, decoder: the trained modules.
        sentence: input sentence; it is encoded with input_lang, so every
            word must already be in that vocabulary (KeyError otherwise).
        max_length: maximum number of decoding steps, i.e. the maximum length
            of the generated sentence.

    Returns:
        The list of generated words. It ends with '<EOS>' when the decoder
        predicted end-of-sentence within ``max_length`` steps.
    """
    # No gradients are needed for inference.
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        # Only the final hidden state is passed on to the decoder. The per-step
        # encoder outputs are not read by anything, so they are not stored;
        # storing them in a max_length-sized buffer made longer inputs fail
        # with IndexError.
        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            # Greedy choice: take the most likely next word.
            _, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

# **15 Функция выводит заданное (n=10) число предложений и их переводов**

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` random pairs together with the model's translation.

    For each sample, '>' marks the input sentence, '=' the reference
    translation and '<' the model output; samples are separated by a blank line.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        print('<', ' '.join(evaluate(encoder, decoder, sample[0])))
        print('')

In [None]:
hidden_size = 256  # width of the embeddings and of the GRU hidden state
# The vocabulary sizes fix the encoder's embedding table and the decoder's
# embedding table and output layer.
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

# Train on 75000 randomly drawn pairs, printing the average loss every 5000 steps.
trainIters(encoder1, decoder1, 75000, print_every=5000)

input torch.Size([1]) hidden torch.Size([1, 1, 256])
embedded torch.Size([1, 256])


RuntimeError: ignored

In [None]:
# Print 10 random pairs: '>' input sentence, '=' reference, '<' model output.
evaluateRandomly(encoder1, decoder1)

> il est fier de sa collection .
= he is proud of his collection .
< he is proud of his of . <EOS>

> tu n es pas la bienvenue ici .
= you re not welcome here .
< you re not welcome here . <EOS>

> je suis enchantee d etre ici .
= i m delighted to be here .
< i m delighted to be here . <EOS>

> elle parle dix langues .
= she speaks ten languages .
< she speaks ten languages . <EOS>

> elles sont la pour moi .
= they re here for me .
< they re here for me . <EOS>

> tu es completement ignorant .
= you re totally ignorant .
< you re totally ignorant . <EOS>

> je fredonne .
= i m humming .
< i m interfering . <EOS>

> il est frais emoulu de l universite .
= he is fresh from college .
< he s fresh out of college . <EOS>

> je suis impartial .
= i m unbiased .
< i m rich . <EOS>

> je suis tres occupe aujourd hui .
= i m very busy today .
< i m very busy today . <EOS>

