In [None]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import re
import time
import torchtext
import torchdata

In [None]:
from torchtext import transforms
from torchdata import datapipes

In [None]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())
# Last expression of the cell: displays the installed torchtext / torchdata versions.
torchtext.__version__, torchdata.__version__

False


('0.16.0+cpu', '0.7.0')

**Функция вывода информации о датапайпе**

In [None]:
def print_simple_data_pipe(data_pipe: datapipes.iter.IterDataPipe) -> None:
    """Print the type of ``data_pipe`` followed by its first sample, if it yields one."""
    print(type(data_pipe))
    samples = iter(data_pipe)
    try:
        print(next(samples))
    except StopIteration:
        # Empty pipe: only the type line is printed.
        pass

**Создание датапайпов построчного чтения из файлов**

In [None]:
# Start each pipe from a FileLister rooted at a single corpus file (Russian / English side).
# NOTE(review): the two paths use different roots (absolute '/content/...' vs relative
# '../data/1mcorpus/...') — confirm both files exist in the same environment.
data_pipe_ru = datapipes.iter.FileLister('/content/corpus.en_ru.1m.ru')
data_pipe_en = datapipes.iter.FileLister('../data/1mcorpus/corpus.en_ru.1m.en')
print_simple_data_pipe(data_pipe_ru)

<class 'torch.utils.data.datapipes.iter.fileopener.FileOpenerIterDataPipe'>
<class 'torch.utils.data.datapipes.iter.filelister.FileListerIterDataPipe'>
/content/corpus.en_ru.1m.ru


In [None]:
# Wrap the listed paths in FileOpener so the files are opened in text mode as UTF-8.
# Both names are rebound here, so re-running this cell wraps the pipes a second time.
data_pipe_ru = datapipes.iter.FileOpener(data_pipe_ru, mode='r', encoding='UTF-8')
data_pipe_en = datapipes.iter.FileOpener(data_pipe_en, mode='r', encoding='UTF-8')
print_simple_data_pipe(data_pipe_ru)

In [None]:
# Number of leading lines handed to LineReader's skip_lines (1,000,000 - 10,000).
# NOTE(review): presumably each corpus file has 1M lines, so this keeps the last 10,000 — confirm.
skip = 1000000 - 10000
# Read the opened files line by line without returning the file path alongside each line.
data_pipe_ru = datapipes.iter.LineReader(data_pipe_ru, return_path=False, skip_lines=skip)
data_pipe_en = datapipes.iter.LineReader(data_pipe_en, return_path=False, skip_lines=skip)
print_simple_data_pipe(data_pipe_ru)

<class 'torchdata.datapipes.iter.util.plain_text_reader.LineReaderIterDataPipe'>


**Объединение датапайпов**

In [None]:
# Debug check: materialise the Russian line pipe and print every element.
# NOTE(review): this prints the whole list at once; the recorded output was an empty list.
l = list(data_pipe_ru)
print(l)

[]


In [None]:
# Zip the two line pipes so that every sample is a (Russian, English) sentence pair.
data_pipe = datapipes.iter.Zipper(data_pipe_ru, data_pipe_en)
print_simple_data_pipe(data_pipe)

<class 'torch.utils.data.datapipes.iter.combining.ZipperIterDataPipe'>
('Все задачи по диагностике и пуско-наладке выполняются копплером.', 'The Bus Coupler carries out all diagnosis and commissioning tasks.')


**Функции токенизации для текстов**

In [None]:
from razdel import tokenize
import re
def tokenizer_ru(ru_text: str) -> list[str]:
    """Lower-case a Russian sentence, strip punctuation and split it into tokens.

    Every run of characters that is not whitespace, a word character or a
    hyphen is removed before the text is tokenised with ``razdel.tokenize``.

    Args:
        ru_text: Raw sentence.

    Returns:
        The token strings in their original order.
    """
    # Raw string avoids the invalid-escape warning for \s and \w. A single leading
    # caret negates the whole class; additional carets inside a class are literal
    # and would let '^' characters survive the clean-up.
    cleaned = re.sub(r'[^\s\w-]+', '', ru_text).lower()
    return [token.text for token in tokenize(cleaned)]

In [None]:
from nltk import word_tokenize
def tokenizer_en(en_text: str) -> list[str]:
    """Lower-case an English sentence, strip punctuation and split it into tokens.

    Every run of characters that is not whitespace, a word character or a
    hyphen is removed before the text is tokenised with ``nltk.word_tokenize``.

    Args:
        en_text: Raw sentence.

    Returns:
        The token strings in their original order.
    """
    # Raw string avoids the invalid-escape warning for \s and \w. A single leading
    # caret negates the whole class; additional carets inside a class are literal
    # and would let '^' characters survive the clean-up.
    cleaned = re.sub(r'[^\s\w-]+', '', en_text).lower()
    return word_tokenize(cleaned)

**Генераторы с функциями токенизации**

In [None]:
def yield_tokens_ru(data_iter: datapipes.iter.IterDataPipe):
    """Yield the Russian token list for every (ru, en) pair produced by ``data_iter``."""
    # Only the first element of each pair is tokenised; the English side is ignored.
    yield from (tokenizer_ru(ru_sentence) for ru_sentence, _ in data_iter)
def yield_tokens_en(data_iter: datapipes.iter.IterDataPipe):
    """Yield the English token list for every (ru, en) pair produced by ``data_iter``."""
    # Only the second element of each pair is tokenised; the Russian side is ignored.
    yield from (tokenizer_en(en_sentence) for _, en_sentence in data_iter)

**Создание словарей для языков**

In [None]:
from torchtext.vocab import build_vocab_from_iterator

In [None]:
# Build the Russian vocabulary from the tokenised source side of every pair.
# The four special tokens are placed first, in the listed order.
vocab_ru = build_vocab_from_iterator(
    yield_tokens_ru(data_pipe),
    min_freq=1,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
# Build the English vocabulary the same way from the target side.
vocab_en = build_vocab_from_iterator(
    yield_tokens_en(data_pipe),
    min_freq=1,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
# Looking up a word that is not in the vocabulary returns the index of the <unk> token.
vocab_ru.set_default_index(vocab_ru['<unk>'])
vocab_en.set_default_index(vocab_en['<unk>'])

**Функции добавления тегов sos и eos**

In [None]:
def vocab_transform_ru(vocab: torchtext.vocab.Vocab) -> transforms.Sequential:
    """Build the source-side transform: tokens -> ids, wrapped in <sos> ... <eos>.

    The boundary tokens matter for translation / text generation; they would be
    redundant for a plain classification task.
    """
    steps = [
        transforms.VocabTransform(vocab=vocab),
        # <sos> goes in front of the id sequence, <eos> behind it.
        transforms.AddToken(vocab['<sos>'], begin=True),
        transforms.AddToken(vocab['<eos>'], begin=False),
    ]
    return transforms.Sequential(*steps)
def vocab_transform_en_with_sos(vocab: torchtext.vocab.Vocab) -> transforms.Sequential:
    """Build the transform for the decoder input: tokens -> ids with <sos> prepended."""
    to_ids = transforms.VocabTransform(vocab=vocab)
    prepend_sos = transforms.AddToken(vocab['<sos>'], begin=True)
    return transforms.Sequential(to_ids, prepend_sos)
def vocab_transform_en_with_eos(vocab: torchtext.vocab.Vocab) -> transforms.Sequential:
    """Build the transform for the decoder target: tokens -> ids with <eos> appended."""
    to_ids = transforms.VocabTransform(vocab=vocab)
    append_eos = transforms.AddToken(vocab['<eos>'], begin=False)
    return transforms.Sequential(to_ids, append_eos)

In [None]:
def apply_vocab_transform(pair: tuple[str, str]) -> tuple[list[int], list[int], list[int]]:
    """Turn a raw (Russian, English) sentence pair into three id sequences.

    Args:
        pair: ``(ru_text, en_text)`` raw sentences.

    Returns:
        ``(ru_ids, en_ids_with_sos, en_ids_with_eos)`` where the Russian ids are
        wrapped in <sos>/<eos>, the second element is the English ids with <sos>
        prepended and the third is the English ids with <eos> appended.
    """
    ru_text, en_text = pair[0], pair[1]
    # Tokenise the English sentence once and feed the same tokens to both transforms.
    en_tokens = tokenizer_en(en_text)
    return (
        vocab_transform_ru(vocab_ru)(tokenizer_ru(ru_text)),
        vocab_transform_en_with_sos(vocab_en)(en_tokens),
        vocab_transform_en_with_eos(vocab_en)(en_tokens),
    )

In [None]:
# Convert every raw sentence pair into (ru_ids, en_ids_with_sos, en_ids_with_eos).
data_pipe1 = data_pipe.map(apply_vocab_transform)
# print_simple_data_pipe(data_pipe1)

**Отсев длинных предложений**

In [None]:
# Column-wise median of (len(ru_ids), len(en_ids_with_sos)) over the whole pipe.
# Despite its name, `mean_lens` holds medians, not means.
mean_lens = np.median([(len(i[0]), len(i[1])) for i in list(data_pipe1)], axis=0)
print(mean_lens)
# The rounded medians are the length caps used by max_size_text and apply_padding.
max_len_ru = round(mean_lens[0])
max_len_en = round(mean_lens[1])

[16. 18.]


In [None]:
def max_size_text(tup):
    """Return True when the Russian ids and the <sos>-English ids both fit their length caps."""
    if len(tup[0]) > max_len_ru:
        return False
    return len(tup[1]) <= max_len_en

In [None]:
# Keep only the samples accepted by max_size_text (both sides within the length caps).
data_pipe1 = data_pipe1.filter(filter_fn=max_size_text)
# Count the surviving samples by materialising the pipe; the count is later passed
# to RandomSplitter as total_length.
len_data_pipe = len(list(data_pipe1))
print(len_data_pipe) #11188 18000 #35000

4490


**Добавление паддингов**

In [None]:
def apply_padding(pair):
    """Right-pad the three id sequences with <pad> up to the global length caps.

    The Russian ids are padded to ``max_len_ru``. Both English sequences are
    padded by the same amount, derived from the length of the <sos> variant,
    up to ``max_len_en``.
    """
    ru_missing = max_len_ru - len(pair[0])
    en_missing = max_len_en - len(pair[1])
    ru_filler = [vocab_ru['<pad>']] * ru_missing
    en_filler = [vocab_en['<pad>']] * en_missing
    return (pair[0] + ru_filler, pair[1] + en_filler, pair[2] + en_filler)

In [None]:
# Pad every filtered sample to the fixed lengths.
data_pipe2 = data_pipe1.map(apply_padding)
# print_simple_data_pipe(data_pipe2)

**Разделение на обучающую и тестовую выборку**

In [None]:
# Number of samples reserved for the test split.
test_size = 5
# Randomly split the padded pipe into "train" and "test" with weights proportional to
# (len_data_pipe - test_size) and test_size; the seed is fixed at 46.
data_pipe_train, data_pipe_test = datapipes.iter.RandomSplitter(data_pipe2, total_length=len_data_pipe, weights={"train": (len_data_pipe - test_size) / len_data_pipe , "test": test_size / len_data_pipe}, seed=46)
# print_simple_data_pipe(data_pipe_test)

**Создание батчей для обучения**

In [None]:
# Batch size for the training split.
BATCH_SIZE = 256
# Group training samples into batches; shuffling inside a batch is switched off.
data_pipe_train1 = data_pipe_train.bucketbatch(
    batch_size = BATCH_SIZE,
    use_in_batch_shuffle=False,
)
# The test split is batched with batch_size equal to test_size.
data_pipe_test1 = data_pipe_test.bucketbatch(
    batch_size = test_size
)

In [None]:
# [  [([а,б,в], [a,b,c]), ([г,д,е], [d,f,g])]  ,  [([а,б,в], [a,b,c]), ([г,д,е], [d,f,g])]  ]
# [  [([а,б,в],[г,д,е]), ([a,b,c], [d,f,g])]   ,  [([а,б,в],[г,д,е]), ([a,b,c], [d,f,g])]   ]
def separate(pair: list[tuple[list[int], list[int], list[int]]]) -> tuple[tuple[list[int], ...], tuple[list[int], ...], tuple[list[int], ...]]:
    """Transpose a batch of samples into three per-field tuples.

    Args:
        pair: Batch of ``(ru_ids, en_ids_with_sos, en_ids_with_eos)`` samples.

    Returns:
        ``(all_ru_ids, all_en_ids_with_sos, all_en_ids_with_eos)``, each a tuple
        holding one entry per sample.
    """
    # Unpacking into exactly three names keeps the ValueError for malformed batches.
    ru_column, sos_column, eos_column = zip(*pair)
    return (ru_column, sos_column, eos_column)

In [None]:
# Transpose every batch from a list of samples into three per-field tuples.
data_pipe_train2 = data_pipe_train1.map(separate)
data_pipe_test2 = data_pipe_test1.map(separate)
# print_simple_data_pipe(data_pipe_test2)

**Перенос данных на cuda**

In [None]:
# Describe the active CUDA device. The queries are skipped on CPU-only machines,
# where the torch.cuda calls would raise.
if torch.cuda.is_available():
    cuda_info = (
        torch.cuda.get_device_name(),
        # current_device must be called: get_device_properties expects a device
        # index, not the function object.
        torch.cuda.get_device_properties(torch.cuda.current_device()),
    )
else:
    cuda_info = None
# Last expression of the cell: displays (name, properties), or nothing without CUDA.
# Not a Google-grade Tesla, but it gets some work done.
cuda_info

('NVIDIA GeForce MX550',
 _CudaDeviceProperties(name='NVIDIA GeForce MX550', major=7, minor=5, total_memory=2047MB, multi_processor_count=16))

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
# Last expression of the cell: displays the selected device.
device

device(type='cuda')

In [None]:
def transfer_to_cuda(pair):
    """Convert the first three components of a batch to tensors on the global ``device``."""
    return tuple(torch.tensor(pair[field], device=device) for field in range(3))

In [None]:
# Turn every batch into a triple of tensors placed on `device`.
data_pipe_train3 = data_pipe_train2.map(transfer_to_cuda)
data_pipe_test3 = data_pipe_test2.map(transfer_to_cuda)

**Применение последовательной предобработки**

In [None]:
# Run the whole preprocessing chain once by materialising every batch into a list,
# then wrap the list in an IterableWrapper; only the training pipe is shuffled.
data_pipe_train_final =  datapipes.iter.IterableWrapper(list(data_pipe_train3)).shuffle()
data_pipe_test_final =  datapipes.iter.IterableWrapper(list(data_pipe_test3))

**Seq2Seq class**

In [None]:
# Hidden size of the encoder GRU.
H_SIZE_RU = 64
# Decoder hidden size = encoder hidden size + maximum source length, matching the
# encoder output (final hidden state concatenated with one weight per source position).
H_SIZE_EN = H_SIZE_RU + int(max_len_ru)
# Embedding size passed to Seq2Seq for both languages.
EMB_SIZE = 100
# Vocabulary sizes, including the four special tokens.
VOCAB_SIZE_RU = len(vocab_ru)
VOCAB_SIZE_EN = len(vocab_en)

class Encoder_with_embedding(nn.Module):
    """GRU encoder that returns its final hidden state concatenated with attention weights.

    Args:
        VOCAB_SIZE_RU: Number of rows in the source embedding table.
        EMB_SIZE_RU: Embedding dimension.
        H_SIZE_RU: Hidden size of the single-layer GRU.
    """

    def __init__(self, VOCAB_SIZE_RU, EMB_SIZE_RU, H_SIZE_RU):
        super().__init__()
        self.emb_ru = nn.Embedding(num_embeddings=VOCAB_SIZE_RU, embedding_dim=EMB_SIZE_RU)
        self.gru_ru = nn.GRU(input_size=EMB_SIZE_RU, hidden_size=H_SIZE_RU, num_layers=1, batch_first=True)

    def forward(self, x_ru):
        """Encode a batch of source id sequences.

        Args:
            x_ru: Integer id tensor indexed as (batch, seq_len).

        Returns:
            Tensor of shape (batch, H_SIZE_RU + seq_len): the final hidden state
            followed by the softmax over the dot products between that hidden
            state and the GRU output at every source position.
        """
        vec_ru = self.emb_ru(x_ru)
        encoder_out, encoder_hidden = self.gru_ru(vec_ru)
        # Drop the leading num_layers axis (the GRU has a single layer).
        encoder_hidden = encoder_hidden.squeeze(0)
        # One dot product per (sample, position), computed in a single call. The
        # result is always 2-D, so sequences of length 1 work as well.
        scores = torch.einsum('bth,bh->bt', encoder_out, encoder_hidden)
        return torch.cat((encoder_hidden, torch.softmax(scores, dim=1)), -1)
# Alternative: an encoder for inputs that are already embedded.
# class Encoder_without_embedding(nn.Module):
#     def __init__(self, EMB_SIZE_RU, H_SIZE):
#         super(Encoder_without_embedding, self).__init__()
#         self.gru_ru = nn.GRU(input_size=EMB_SIZE_RU, hidden_size=H_SIZE, num_layers=1, batch_first=True)
#     def forward(self, x_vec_ru):
#         encoder_hidden = self.gru_ru(x_vec_ru)[1].squeeze(0)
#         return encoder_hidden

class Decoder(nn.Module):
    """GRU decoder: embeds target ids, runs a GRU seeded with the encoder state, projects to logits.

    Args:
        VOCAB_SIZE_EN: Target vocabulary size (embedding rows and logit width).
        EMB_SIZE_EN: Embedding dimension.
        H_SIZE_EN: Hidden size of the single-layer GRU.
    """

    def __init__(self, VOCAB_SIZE_EN, EMB_SIZE_EN, H_SIZE_EN):
        super().__init__()
        self.emb_en = nn.Embedding(num_embeddings=VOCAB_SIZE_EN, embedding_dim=EMB_SIZE_EN)
        self.gru_en = nn.GRU(EMB_SIZE_EN, H_SIZE_EN, num_layers=1, batch_first=True)
        self.linear = nn.Linear(in_features=H_SIZE_EN, out_features=VOCAB_SIZE_EN)

    def forward(self, x_en, encoder_hidden):
        """Return per-position vocabulary logits for the id batch ``x_en``.

        ``encoder_hidden`` is used as the initial GRU state after a leading
        axis is added to it.
        """
        initial_state = encoder_hidden.unsqueeze(0)
        embedded = self.emb_en(x_en)
        hidden_states, _ = self.gru_en(embedded, initial_state)
        return self.linear(hidden_states)

class Seq2Seq(nn.Module):
    """Encoder-decoder model: the encoder output is passed to the decoder as its initial state."""

    def __init__(self, VOCAB_SIZE_RU, EMB_SIZE_RU, VOCAB_SIZE_EN, EMB_SIZE_EN, H_SIZE_RU, H_SIZE_EN):
        super().__init__()
        self.encoder = Encoder_with_embedding(VOCAB_SIZE_RU, EMB_SIZE_RU, H_SIZE_RU)
        self.decoder = Decoder(VOCAB_SIZE_EN, EMB_SIZE_EN, H_SIZE_EN)

    def forward(self, x_ru, x_en):
        """Encode ``x_ru`` and decode ``x_en`` conditioned on it; returns the decoder output."""
        return self.decoder(x_en, self.encoder(x_ru))

In [None]:
# Instantiate the model (same embedding size for both languages) and move it to `device`.
model = Seq2Seq(VOCAB_SIZE_RU, EMB_SIZE, VOCAB_SIZE_EN, EMB_SIZE, H_SIZE_RU, H_SIZE_EN)
model = model.to(device)
# Last expression of the cell: displays the module structure.
model #79000 41000 | 106000 55000 | 161000 84000

Seq2Seq(
  (encoder): Encoder_with_embedding(
    (emb_ru): Embedding(106294, 100)
    (gru_ru): GRU(100, 64, batch_first=True)
  )
  (decoder): Decoder(
    (emb_en): Embedding(55280, 100)
    (gru_en): GRU(100, 80, batch_first=True)
    (linear): Linear(in_features=80, out_features=55280, bias=True)
  )
)

**Алгоритм обучения**

In [None]:
# Cross-entropy criterion applied to the flattened decoder logits in the training loop.
# NOTE(review): no ignore_index is set, so <pad> target positions contribute to the
# loss — confirm this is intended.
loss = nn.CrossEntropyLoss()
# Adam over all model parameters with learning rate 0.0025.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0025)

In [None]:
import random

In [None]:
epochs = 100
# Threshold for choosing each decoder input token (see the inner loop).
# NOTE(review): a random draw *below* this value feeds the model's own prediction,
# so ground-truth tokens are used only about 10% of the time — confirm this is the
# intended meaning of the name.
Teacher_Enforcing = 0.9
a = time.time()
for epoch in range(epochs):
    run = time.time()
    for x_ru, x_en_with_sos, x_en_with_eos in list(data_pipe_train_final):
        x_en_shape = x_en_with_sos.shape
        # Decoder input buffer: id 1 (<sos>) in column 0, id 0 (<pad>) everywhere else.
        pred = torch.zeros(x_en_shape[0], x_en_shape[1], device=device).type(torch.int32)
        pred[:, 0] = 1
        # Fill the remaining columns left to right. Starting at 1 keeps <sos> in
        # column 0; starting at 0 would overwrite it with the argmax taken at the
        # last position (index -1).
        for i in range(1, x_en_shape[1]):
            if random.uniform(0, 1) < Teacher_Enforcing:
                # The prediction is only used as the next input token, so no
                # autograd graph is needed for this forward pass.
                with torch.no_grad():
                    step_outputs = model(x_ru, pred)
                pred[:, i] = step_outputs[:, i - 1].argmax(1)
            else:
                pred[:, i] = x_en_with_sos[:, i]
        # Single forward pass with gradients on the completed decoder input.
        outputs = model(x_ru, pred)
        loss_value = loss(outputs.reshape(-1, outputs.shape[2]), x_en_with_eos.reshape(-1))
        optimizer.zero_grad()
        loss_value.backward()
        optimizer.step()

    # Per-epoch report: total elapsed time, epoch duration and the loss of the last batch.
    print(f'Прошло {(time.time() - a)/60} минут')
    print(f'Время эпохи {(time.time() - run)/60} минут')
    print(f'Эпоха {epoch + 1}, Значение функции потерь: {loss_value.item()}')
print(f'Прошло {(time.time() - a)/60} минут. Время на эпоху: {(time.time() - a)/60/epochs}')
#500 obj - 32 batch - 1.5673409223556518 минут
#500 obj - 64 batch - 0.9876247843106588 минут
#500 obj - 128 batch - 0.9665880322456359 минут
#500 obj - 256 batch - 0.5786411007245381 минут
#2000 obj - 256 batch -

Прошло 0.5267734328905741 минут
Время эпохи 0.5267564177513122 минут
Эпоха 1, Значение функции потерь: 2.559390068054199
Прошло 1.0656983693440756 минут
Время эпохи 0.5237415989240011 минут
Эпоха 2, Значение функции потерь: 2.4577181339263916
Прошло 1.6056825955708822 минут
Время эпохи 0.5247600356737773 минут
Эпоха 3, Значение функции потерь: 2.5772221088409424
Прошло 2.145470917224884 минут
Время эпохи 0.5245388229688008 минут
Эпоха 4, Значение функции потерь: 2.5999889373779297
Прошло 2.6860827883084615 минут
Время эпохи 0.5253540277481079 минут
Эпоха 5, Значение функции потерь: 2.4643325805664062
Прошло 3.2254183292388916 минут
Время эпохи 0.524252446492513 минут
Эпоха 6, Значение функции потерь: 2.5058395862579346
Прошло 3.7661298910776773 минут
Время эпохи 0.5254616061846415 минут
Эпоха 7, Значение функции потерь: 2.5006868839263916
Прошло 4.30612268447876 минут
Время эпохи 0.5247259140014648 минут
Эпоха 8, Значение функции потерь: 2.477367401123047
Прошло 4.846101307868958 минут

**Проверка перевода**

In [None]:
# Take the first batch of the training pipe and of the test pipe for inspection.
# Each batch unpacks into (source ids, decoder input with <sos>, decoder target with <eos>).
text_ru, text_en, text_en2 = list(data_pipe_train_final)[0]
text_ru1, text_en1, text_en21 = list(data_pipe_test_final)[0]

In [None]:
# Index -> token lookup lists for both languages.
vocab1 = vocab_ru.get_itos()
vocab2 = vocab_en.get_itos()
# First sample of the training batch as words: source, decoder input, decoder target.
for itos, ids in ((vocab1, text_ru[0]), (vocab2, text_en[0]), (vocab2, text_en2[0])):
    print(*[itos[idx] for idx in ids.tolist()])
print()
# First sample of the test batch as words: source and decoder input.
for itos, ids in ((vocab1, text_ru1[0]), (vocab2, text_en1[0])):
    print(*[itos[idx] for idx in ids.tolist()])

<sos> кольцевой туристический маршрут <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<sos> rundumadum hiking trail <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
rundumadum hiking trail <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

<sos> он научится быть спокойным при завершении атаки <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<sos> he will learn to calm down when finishing <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [None]:
def _greedy_decode(seq2seq, src, max_len, sos_index=1):
    """Greedily decode ``max_len`` target ids for every source sequence in ``src``.

    Args:
        seq2seq: Callable ``(src, decoder_input) -> logits`` indexed as
            (batch, position, vocab).
        src: Batch of source id sequences (tensor).
        max_len: Number of target positions to produce, including the <sos> column.
        sos_index: Id written to column 0 (1 is <sos> in this notebook's vocabularies).

    Returns:
        int32 tensor of shape (batch, max_len) starting with ``sos_index``.
    """
    decoded = torch.zeros(src.shape[0], max_len, device=src.device).type(torch.int32)
    decoded[:, 0] = sos_index
    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        # Start at 1 so <sos> stays in column 0; starting at 0 would overwrite it
        # with the argmax taken at the last position (index -1).
        for pos in range(1, max_len):
            step_outputs = seq2seq(src, decoded)
            decoded[:, pos] = step_outputs[:, pos - 1].argmax(1)
    return decoded


# Translation of the first training sample.
pred = _greedy_decode(model, text_ru, text_en.shape[1])
print(*[vocab2[i] for i in pred[0].tolist()])

# Translation of the first test sample.
pred = _greedy_decode(model, text_ru1, text_en1.shape[1])
print(*[vocab2[i] for i in pred[0].tolist()])

<pad> rundumadum hiking trail <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> the is the the the phenomena once and and the of and tenants <eos> <pad> <pad> <pad>
