In [1]:
import os
import re
import typing as t
import numpy as np
import pandas as pd
import zipfile
from pathlib import Path

In [2]:
ru = None
en = None

with zipfile.ZipFile('../storage/1mcorpus.zip') as zf:
    for filename in zf.namelist():
        if '.ru' in filename:
            with zf.open(filename) as f:
                ru = f.read().decode().split('\n')
        elif '.en' in filename:
            with zf.open(filename) as f:
                en = f.read().decode().split('\n')

In [3]:
ru[:5], en[:5], len(ru), len(en)

(['Такое развитие характера Гарри может разочаровать читателей, полюбивших его былую мстительность, но с другой стороны это преображение укрепляет позицию тех, кто не видит глубже сюжета и изображения героев.',
  'Решение суда (группа вернулась под крыло к Elektra Entertainment) предотвратило дальнейшие нападки со стороны неугомонного Ульриха и не позволило ему обнародовать детали нового контракта.',
  'Когда тебе 18 или 19 лет, легко перенимать бандитские повадки и переносить их в группу.',
  'А сейчас куча триьютов тем же самым BLACK SABBATH и KISS.',
  'Я был единственным, кто занялся копированием демо на кассете.'],
 ["This new development in Harry's character may be a disappointment to those readers who enjoyed his old vindictive ways, but it also reinforces the position of pro-Potter people who do not see beneath the surface appearance of the characters and plots.",
  'A nondisclosure clause in the final settlement (the band is back on Elektra) prevents Ulrich, an irrepressible m

In [2]:
import torchtext
import torchdata
from torchdata import datapipes
from torchdata.datapipes.iter import FileLister, FileOpener, IterableWrapper
from torchtext import transforms
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# DEVICE = 'cpu'

torch.cuda_version, torch.cuda.is_available(), torch.cuda.get_device_name(torch.cuda.current_device()), torchtext.__version__, torchdata.__version__

('11.8', True, 'NVIDIA GeForce GTX 1650', '0.15.2', '0.6.1')

In [3]:
SEED = 212

# random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
from itertools import islice


def print_datapipe(datapipe: datapipes.iter.IterDataPipe | t.Iterable, n: int = None) -> None:
    print(type(datapipe))

    for sample in islice(datapipe, n):
        print(sample)

In [5]:
datapipe = FileLister('../storage', '1mcorpus.zip')
datapipe = FileOpener(datapipe, mode='b')
datapipe = datapipe.load_from_zip()
print_datapipe(datapipe)

<class 'torchdata.datapipes.iter.util.ziparchiveloader.ZipArchiveLoaderIterDataPipe'>
('..\\storage\\1mcorpus.zip\\1mcorpus\\corpus.en_ru.1m.en', StreamWrapper<..\storage\1mcorpus.zip\1mcorpus\corpus.en_ru.1m.en,<zipfile.ZipExtFile name='1mcorpus/corpus.en_ru.1m.en' mode='r' compress_type=deflate>>)
('..\\storage\\1mcorpus.zip\\1mcorpus\\corpus.en_ru.1m.ru', StreamWrapper<..\storage\1mcorpus.zip\1mcorpus\corpus.en_ru.1m.ru,<zipfile.ZipExtFile name='1mcorpus/corpus.en_ru.1m.ru' mode='r' compress_type=deflate>>)


In [7]:
ru = None
en = None

for pth, stream in datapipe:
    if pth.endswith('.ru'):
        ru = [line.decode() for line in stream.readlines()]
    if pth.endswith('.en'):
        en = [line.decode() for line in stream.readlines()]

In [9]:
# ru_dp = None
# en_dp = None

# for pth, stream in datapipe:
#     if pth.endswith('.ru'):
#         ru_dp = list(IterableWrapper(stream.readlines()).map(bytes.decode))
#     if pth.endswith('.en'):
#         en_dp = list(IterableWrapper(stream.readlines()).map(bytes.decode))

In [10]:
print_datapipe(ru, 1)
print_datapipe(en, 1)

<class 'list'>
Такое развитие характера Гарри может разочаровать читателей, полюбивших его былую мстительность, но с другой стороны это преображение укрепляет позицию тех, кто не видит глубже сюжета и изображения героев.

<class 'list'>
This new development in Harry's character may be a disappointment to those readers who enjoyed his old vindictive ways, but it also reinforces the position of pro-Potter people who do not see beneath the surface appearance of the characters and plots.



In [8]:
en_train, en_test, ru_train, ru_test = train_test_split(en, ru, test_size=0.3, random_state=69, shuffle=True)

print(*map(len, (en_train, en_test, ru_train, ru_test)))

700000 300000 700000 300000


In [26]:
def save_data(path: str, rows) -> None:
    with open(path, 'w', encoding='utf-8') as f:
        f.write(''.join(rows))

save_data('../storage/en_train.en', en_train)
save_data('../storage/en_test.en', en_test)
save_data('../storage/ru_train.ru', ru_train)
save_data('../storage/ru_test.ru', ru_test)

In [83]:
from operator import itemgetter

def get_datapipe_with_lang_docs(path):
    return IterableWrapper([path]).open_files('r', 'utf-8').parse_csv().map(itemgetter(0))

In [84]:
ru_dp = get_datapipe_with_lang_docs('../storage/ru_train.ru')
en_dp = get_datapipe_with_lang_docs('../storage/en_train.en')

en_ru_dp = en_dp.zip(ru_dp)
print_datapipe(en_ru_dp, 2)

<class 'torch.utils.data.datapipes.iter.combining.ZipperIterDataPipe'>
('The tasks of MCs at land level are regulated by an MC-ordinance.', 'Деятельность национальных мониторинговых комитетов регулируется решениями соответствующего национального комитета.')
('This was done to speed up the work on the documents.', 'Это было сделано для того')


In [87]:
len(tuple(ru_dp))

660384

In [13]:
# from torch.utils.data import DataLoader, random_split
# train_dataset, test_dataset = random_split(en_ru_dp, [0.75, 0.25])

In [14]:
# a, b = en_ru_dp.random_split(
#     # total_length=len(en_ru_dp),
#     weights={'train': 0.8, 'test': 0.2},
#     # target='train',
#     seed=69,
# )

In [52]:
from razdel import tokenize as razdel_tokenize


def tokenize(text: str) -> list[str]:
    return [_.text for _ in razdel_tokenize(text)]


def tokenize_iter(data_iter: datapipes.iter.IterDataPipe):
    for example in data_iter:
        yield tokenize(example)

In [88]:
ru_voc = build_vocab_from_iterator(
    tokenize_iter(ru_dp),
    min_freq=2,
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
)
ru_voc.set_default_index(ru_voc['<unk>'])

In [89]:
len(ru_voc)

183077

In [18]:
ru_voc.get_stoi()

{'невыполнимым': 258364,
 'просматривайте': 87999,
 'крупнейшем': 46514,
 'негативная': 49894,
 'моногорода': 256778,
 'Водянова': 217495,
 'соответсвует': 275399,
 'Норильский': 22156,
 'стволе': 96204,
 'поощ': 266771,
 'установлен': 2668,
 'мини-кухней': 24263,
 'MED-V': 207928,
 'партии': 886,
 'Эммануэля': 176824,
 'Swedish': 120733,
 'аранжировщиком': 239755,
 'группировки': 9878,
 'широкий': 2073,
 'обязательны': 41451,
 'DHTML': 107188,
 'геометрической': 38920,
 'уложено': 88918,
 'сохранение': 3371,
 'техникум': 54455,
 'Германские': 90421,
 'Выберите': 1363,
 'сходятся': 27216,
 'Side': 57025,
 'продержали': 95386,
 'НТРК': 226586,
 'пенджабцы': 263835,
 'питьевой': 7654,
 'ни': 148,
 'CLIENT': 162193,
 'закрытых': 13644,
 'кофе-брейки': 102501,
 'глинозема': 44904,
 'взвилась': 242522,
 'взаимосвязаны': 24879,
 'землевладельцам': 149372,
 'рекомендованного': 76729,
 '6/8': 106929,
 'номер': 536,
 '<eos>': 2,
 'расходы': 996,
 'Проекте': 84433,
 'галактики': 25681,
 'обычног

In [19]:
ru_voc.get_itos()[:10]

['<pad>', '<sos>', '<eos>', '<unk>', ',', '.', 'и', 'в', 'на', '"']

In [20]:
def make_doc_transform(vocab: torchtext.vocab.Vocab) -> transforms.Sequential:
    # TODO:
    doc_transform = transforms.Sequential(
        transforms.VocabTransform(vocab=vocab),
        transforms.AddToken(vocab['<sos>'], begin=True),
        transforms.AddToken(vocab['<eos>'], begin=False),
    )
    return doc_transform

In [21]:
ru_doc_transform = make_doc_transform(ru_voc)

In [22]:
ru_doc_transform(tokenize('кафедра хайпа это ебать как круто'))

[1, 26583, 3, 27, 181932, 21, 22899, 2]

In [23]:
ru_dp = ru_dp.map(tokenize)
print_datapipe(ru_dp, 1)
ru_dp = ru_dp.map(ru_doc_transform)
print_datapipe(ru_dp, 1)

<class 'torch.utils.data.datapipes.iter.callable.MapperIterDataPipe'>
['Деятельность', 'национальных', 'мониторинговых', 'комитетов', 'регулируется', 'решениями', 'соответствующего', 'национального', 'комитета', '.']
<class 'torch.utils.data.datapipes.iter.callable.MapperIterDataPipe'>
[1, 7523, 820, 67330, 8843, 13084, 11145, 2672, 1389, 1772, 5, 2]


In [24]:
en_voc = build_vocab_from_iterator(
    tokenize_iter(en_dp),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
)
en_voc.set_default_index(en_voc['<unk>'])

print(len(ru_voc))

en_doc_transform = make_doc_transform(en_voc)

283317


In [25]:
en_dp = en_dp.map(tokenize).map(en_doc_transform)
print_datapipe(en_dp, 2)

<class 'torch.utils.data.datapipes.iter.callable.MapperIterDataPipe'>
[1, 17, 1460, 7, 61980, 36, 623, 214, 21, 5721, 25, 39, 3, 6, 2]
[1, 71, 30, 569, 9, 1287, 83, 4, 99, 19, 4, 613, 6, 2]


In [26]:
en_ru_dp = en_dp.zip(ru_dp)
print_datapipe(en_ru_dp, 1)

<class 'torch.utils.data.datapipes.iter.combining.ZipperIterDataPipe'>
([1, 17, 1460, 7, 61980, 36, 623, 214, 21, 5721, 25, 39, 3, 6, 2], [1, 7523, 820, 67330, 8843, 13084, 11145, 2672, 1389, 1772, 5, 2])


In [27]:
dp = en_ru_dp.bucketbatch(
    batch_size = 32,
    use_in_batch_shuffle=False,
)
print_datapipe(dp, 1)

<class 'torch.utils.data.datapipes.iter.grouping.BatcherIterDataPipe'>
[([1, 1580, 1171, 21, 2836, 449, 33, 1212, 6, 2], [1, 2056, 644, 23874, 282, 1526, 11, 339, 5, 2]), ([1, 451, 4773, 431, 4, 134767, 5810, 66, 5517, 9, 152, 7592, 4405, 5, 8, 14547, 362, 4, 3819, 7, 4, 1462, 6, 2], [1, 429, 34902, 10, 3, 10376, 11997, 3, 23911, 4, 15974, 4, 66, 27750, 111963, 6, 196087, 10, 575, 1129, 71, 21418, 5, 2]), ([1, 47, 2691, 8877, 4, 63404, 7966, 1714, 30, 1756, 13, 4225, 8, 2862, 8, 10, 811, 4, 141, 139, 80, 26, 30, 1378, 391, 26, 66, 4, 1185, 7, 11, 3, 2795, 6, 2], [1, 16671, 10602, 77, 5, 98525, 125305, 64, 13140, 8, 3, 124, 4, 26, 7, 2843, 48, 56, 51, 4229, 10, 581, 52543, 21, 3, 3449, 5, 2]), ([1, 12552, 4, 5602, 9, 2319, 680, 1958, 31, 68, 11, 611, 3603, 3901, 354, 6, 2], [1, 26003, 1106, 39454, 4, 13496, 14965, 1061, 7, 1254, 301, 877, 4112, 4210, 5, 2]), ([1, 37, 10906, 9, 22, 7663, 13, 1233, 345, 85, 37, 593, 1245, 189, 3977, 31, 4, 109, 769, 6, 2], [1, 2111, 11, 8392, 4, 66, 67368

In [28]:
def bXY_to_bXbY(batch):
    """
        batch: list[tuple[[int, ...], [int, ...]], ...]
        return batch: list[tuple[[int, ...], ...], tuple[[int, ...], ...]]
    """
    return list(zip(*batch))

In [29]:
dp = dp.map(bXY_to_bXbY)
print_datapipe(dp, 1)

<class 'torch.utils.data.datapipes.iter.callable.MapperIterDataPipe'>
[([1, 134, 10, 111, 3126, 654, 5, 69, 12, 290, 11, 2840, 119, 5027, 8, 2768, 5, 43, 4, 67, 6, 2], [1, 126, 21, 11, 23748, 241, 134, 81, 29, 277, 50, 21, 3840, 5, 29, 21, 33, 449, 31, 4, 1321, 190, 2], [1, 239, 14171, 5, 3192, 8554, 14, 74, 86, 558, 8, 22, 2719, 31, 44, 5595, 8, 44, 10686, 6, 2], [1, 61, 1598, 10, 744, 8236, 6, 2], [1, 71, 447, 42, 62, 3425, 25, 5952, 1250, 273, 845, 8, 30, 93, 142, 25, 1494, 1936, 27003, 10, 644, 6, 2], [1, 13533, 9614, 1352, 1558, 15, 1838, 2079, 15, 31, 466, 5, 439, 2418, 9, 11, 185, 6363, 1199, 25, 39, 27605, 7, 1003, 31, 4, 1998, 1022, 8, 945, 6, 2], [1, 71, 15648, 26133, 585, 11, 20509, 13, 827, 379, 2], [1, 4750, 49, 1015, 16, 27, 4771, 8, 11, 3489, 802, 4248, 45, 8, 152, 2145, 2809, 3453, 10, 498, 13, 49, 1015, 6, 2], [1, 9385, 7, 4439, 25986, 8, 3787, 8161, 560, 10, 715, 5, 101, 36859, 29927, 3787, 3634, 10448, 15693, 5206, 95639, 15693, 3418, 9435, 27055, 4514, 16435, 27056,

1) Внутри батча у конкретного языка размерность одинаковая

2) В батче у разных языков могут быть разные размерности

3) У разных батчей могут быть разные размерности

In [30]:
def apply_padding(bXbY):
    X, Y = bXbY
    return (
        transforms.ToTensor(en_voc['<pad>'])(list(X)),
        transforms.ToTensor(ru_voc['<pad>'])(list(Y)),
    )

print_datapipe(dp.map(apply_padding), 1)

<class 'torch.utils.data.datapipes.iter.callable.MapperIterDataPipe'>
(tensor([[    1,  4570, 50901,  ...,     0,     0,     0],
        [    1,   184,    21,  ...,     0,     0,     0],
        [    1,    17, 67307,  ...,     0,     0,     0],
        ...,
        [    1,    17, 14270,  ...,     0,     0,     0],
        [    1,   239, 14171,  ...,     0,     0,     0],
        [    1,  8367,   678,  ...,     0,     0,     0]]), tensor([[     1,  29320,  24476,  ...,      0,      0,      0],
        [     1,    201,     30,  ...,      0,      0,      0],
        [     1, 119551,  18841,  ...,      0,      0,      0],
        ...,
        [     1,  33623,  35616,  ...,      0,      0,      0],
        [     1,  14298,  78213,  ...,      0,      0,      0],
        [     1,  17196,    124,  ...,      0,      0,      0]]))


In [31]:
dp = dp.map(apply_padding)

In [32]:
# TODO: для тестовой сделать пайплайн

# Models

In [33]:
class Encoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        dropout: float,
        hidden_size: int,
        rnn_num_layers: int,
        bidirectional: bool = False,
    ) -> None:
        super(Encoder, self).__init__()

        self.n_layers = rnn_num_layers
        self.hidden_size = hidden_size

        # bidirectional = 2 if bidirectional else 1

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )

        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=rnn_num_layers,
            dropout=dropout,
            # bidirectional=bidirectional,
            batch_first=True,
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        # inputs ~ Size([batch_size, padded_input_doc_size_in_batch])
        embedded = self.dropout(self.embedding(inputs))  # Size([batch_size, padded_input_doc_size_in_batch, embedding_dim])
        output, h_n = self.gru(embedded)
        # encoder h_n -> decoder h_0
        # TODO: output -> Attention
        # h_n ~ Size([bi * rnn_num_layers, batch_size, hidden_size])
        return h_n

In [34]:
class Decoder(nn.Module):
    def __init__(
        self,
        output_size: int,  # target vocab_size
        embedding_dim,
        dropout: float,
        hidden_size,
        rnn_num_layers,
        bidirectional: bool = False,
    ) -> None:
        super(Decoder, self).__init__()

        self.output_size = output_size
        self.hidden_size = hidden_size
        self.n_layers = rnn_num_layers

        # bidirectional = 2 if bidirectional else 1

        self.embedding = nn.Embedding(
            num_embeddings=output_size,  # макс кол-во векторов = vocab_size
            embedding_dim=embedding_dim,  # заданной размерности
        )

        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=rnn_num_layers,
            dropout=dropout,
            # bidirectional=bidirectional,
            batch_first=True,
        )
        
        self.dropout = nn.Dropout(dropout)

        # ? bi * hidden_size
        self.fc = nn.Linear(hidden_size, output_size)  # Брать ли softmax чтобы узнать следующий токен?


    def forward(
        self,
        input: torch.Tensor,  # batch of <sos> ~ Size([batch_size])
        hidden,  # Size([bi * rnn_num_layers, batch_size, hidden_size])
    ):
        inputs = input.unsqueeze(1)  # Size([batch_size, 1])
        embedded = self.dropout(self.embedding(inputs)) # Size([batch_size, 1, embedding_dim])

        output, h_n = self.gru(embedded, hidden)
        # не надо или хз че с размерностями output ~ Size([batch_size, 1, bi * hidden_size])
        # h_n ~ Size([bi * rnn_num_layers, batch_size, hidden_size]) ~ [1, batch_size, hidden_size]

        prediction = self.fc(h_n.squeeze(0))  # Size([batch_size, output_size]) ~ [batch_size, target_vocab_size] = batch of logits
        return prediction, h_n

In [35]:
class Seq2Seq(nn.Module):
    def __init__(
        self,
        encoder: Encoder,
        decoder: Decoder,
        teacher_forcing_ratio: float | None = None,
    ) -> None:
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.device = DEVICE

        assert encoder.hidden_size == decoder.hidden_size, 'hidden_size of Encoder and Decoder did not match'
        assert encoder.n_layers == decoder.n_layers, 'n_layers of Encoder and Decoder did not match'

    def forward(
        self,
        inputs,  # batch, Size([batch_size, padded_input_doc_size_for_batch])
        target_outputs=None,  # batch, Size([batch_size, padded_target_doc_size_for_batch])
    ):
        batch_size = inputs.shape[0]
        max_len = target_outputs.shape[1]
        target_vocab_size = self.decoder.output_size

        target_pad = 0  # TODO: откуда брать?
        target_sos = 0

        # TODO: потеря градиента? => concat
        outputs = torch.empty(batch_size, max_len, target_vocab_size).fill_(target_pad).to(self.device)  # batch of arrays of logits
        # outputs ~ Size([batch_size, max_len, target_vocab_size])

        hidden = self.encoder.forward(inputs)
        input = torch.empty(batch_size).fill_(target_sos).type(torch.int64).to(self.device)  # TODO: maybe [b,1]

        for t in range(1, max_len):
            pred, hidden = self.decoder.forward(input=input, hidden=hidden)
            # pred ~ Size([batch_size, target_vocab_size])
            outputs.permute(1, 0, 2)[t] = pred  # TODO: мб потеря градиента

            input = pred.softmax(1).argmax(1)  # Size([batch_size])  # TODO: 2x softmax?
            if self.teacher_forcing_ratio is not None:
                if np.random.sample() <= self.teacher_forcing_ratio:
                    input = target_outputs[:, t].squeeze(-1)

        return outputs

In [36]:
a = torch.tensor([[1, 2, 1]], dtype=torch.float32)

In [37]:
a.softmax(1).argmax(1)

tensor([1])

In [38]:
np.random.sample()

0.23290163402733643

In [39]:
z = torch.zeros(4, 3, 2)
z

tensor([[[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]]])

In [40]:
torch.ones(4, 1, 2)

tensor([[[1., 1.]],

        [[1., 1.]],

        [[1., 1.]],

        [[1., 1.]]])

In [41]:
z.permute(1, 0, 2)[0] = torch.ones(1, 2)
# z.permute(1, 0, 2)
z

tensor([[[1., 1.],
         [0., 0.],
         [0., 0.]],

        [[1., 1.],
         [0., 0.],
         [0., 0.]],

        [[1., 1.],
         [0., 0.],
         [0., 0.]],

        [[1., 1.],
         [0., 0.],
         [0., 0.]]])

In [42]:
z.permute(1, 0, 2)[0]

tensor([[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]])

In [43]:
torch.empty(4,3)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

In [44]:
torch.empty(4, 1, dtype=torch.long, device=DEVICE).fill_(0)

tensor([[0],
        [0],
        [0],
        [0]], device='cuda:0')

# Обучение

In [45]:
len(en_voc)

135778

In [46]:
lr = 1e-3
loss = nn.CrossEntropyLoss()

embedding_dim = 128
hidden_size = 128
rnn_num_layers = 1
encoder = Encoder(
    vocab_size=len(en_voc),
    embedding_dim=embedding_dim,
    dropout=0.1,
    hidden_size=hidden_size,
    rnn_num_layers=rnn_num_layers,
).to(DEVICE)
decoder = Decoder(
    output_size=len(ru_voc),
    embedding_dim=embedding_dim,
    dropout=0.1,
    hidden_size=hidden_size,
    rnn_num_layers=rnn_num_layers,
).to(DEVICE)
seq2seq = Seq2Seq(
    encoder=encoder,
    decoder=decoder,
    teacher_forcing_ratio=0.2,
).to(DEVICE)
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

list(param.is_cuda for param in seq2seq.parameters())



[True, True, True, True, True, True, True, True, True, True, True, True]

In [47]:
epochs = 1

In [48]:
len(dp)

21875

In [49]:
for epoch in range(epochs):
    print(f'[Seq2Seq] Epoch {epoch + 1}...')
    for x_b, y_b in islice(dp, 100):
        x_b = x_b.to(DEVICE)
        y_b = y_b.to(DEVICE)
        optimizer.zero_grad()
        outputs = seq2seq(x_b, y_b)
        loss_value = loss(outputs.permute(0, 2, 1), y_b)
        print(f'\tLoss value is {loss_value:.3f}.')
        loss_value.backward()
        torch.nn.utils.clip_grad_norm_(seq2seq.parameters(), 1)
        optimizer.step()
        scheduler.step()

[Seq2Seq] Epoch 1...
	Loss value is 12.541.
	Loss value is 12.471.
	Loss value is 12.336.


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.50 GiB (GPU 0; 4.00 GiB total capacity; 8.55 GiB already allocated; 0 bytes free; 9.48 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
with torch.no_grad():
    for x_b, y_b in islice(dp, 1):
        x_b = x_b.to(DEVICE)
        y_b = y_b.to(DEVICE)
        outputs = seq2seq(x_b, y_b)
        res = outputs.softmax(1).argmax(1)
        for r in res:
            a = [ru_voc.get_itos()[r_i] for r_i in list(r)]
            print(a)

['всегда', '<pad>', 'а', '<eos>', 'и', 'к', '<unk>', 'к', '<eos>', '<eos>', '<eos>', '.', '.', '<unk>', '<eos>', 'а', '<pad>', '<eos>', '<pad>', '<eos>', '<pad>', 'и', '<pad>', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<sos>', '<pad>', '<sos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<sos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<sos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<eos>']
['более', '<pad>', '-', '<eos>', 'в', '-', '<unk>', 'так', '<eos>', '<unk>', '<eos>', '.', '.', '<unk>', '<eos>', '-

In [None]:
a = torch.zeros(4, 3, 2)
a.view(-1, 6)

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [110]:
from IPython.display import display, clear_output
import numpy as np
import matplotlib.pyplot as plt
from collections import deque


%matplotlib inline

m = 100
n = 100
matrix = np.random.normal(0, 1, size=(m, n))
d = deque(maxlen=2)
d.append([0, 0])

fig = plt.figure()
ax = fig.add_subplot(111)

for i in range(1, m):
    # ax.clear()
    d.append([i, i * i])
    ax.plot(*zip(d[0], d[1]), color='royalblue')
    display(fig)
    clear_output(wait=True)
    plt.pause(0.01)

ValueError: Single argument to subplot must be a three-digit integer, not 1

<Figure size 640x480 with 0 Axes>

# СБПИ

In [2]:
import typing as t
from collections import defaultdict
from copy import deepcopy
import orjson
from pydantic import TypeAdapter
from operator import attrgetter


class Attribute(t.NamedTuple):
    id: int  # [1; +inf)
    name: str = '<unk>'


class FunctionalDependence(t.NamedTuple):
    X: frozenset[Attribute]
    Y: Attribute


class Relation(t.NamedTuple):
    X: frozenset[Attribute]
    Y: frozenset[Attribute]


adapter = TypeAdapter(frozenset[FunctionalDependence])

In [3]:
def print_sbpi(table: list[list[str]]):
    print('\n'.join(map(''.join, table)))
    print()


def sbpi(fzs: t.Collection[FunctionalDependence]):
    def make_relations(fzs: t.Iterable[FunctionalDependence]) -> t.Collection[Relation]:
        relations = defaultdict(list)
        for fz in fzs:
            if fz.X:
                relations[tuple(fz.X)].append(fz.Y)
        return {Relation(frozenset(X), frozenset(Y)) for X, Y in relations.items()}

    def get_attributes(fzs: t.Iterable[FunctionalDependence]) -> t.Collection[Attribute]:
        return {fz.Y for fz in fzs}

    all_attributes = get_attributes(fzs)
    relations = sorted(make_relations(fzs))
    
    attributes_count = len(all_attributes)

    table = [['_'] * attributes_count for _ in range(len(relations))]
    
    for i, relation in enumerate(relations):
        for attr in {*relation.X, *relation.Y}:
            table[i][attr.id - 1] = 'a'

    i = 0
    print_sbpi(table)
    sorted_fzs = filter(attrgetter('X'), fzs)

    while True:
        i += 1
        if any(all(el == 'a' for el in row) for row in table):
            return table, True

        table_buf = deepcopy(table)

        for fz in sorted_fzs:
            rows = [
                i for i, _ in enumerate(relations)
                if all(table[i][x.id - 1] == 'a' for x in fz.X)
            ]

            check_column = fz.Y.id - 1

            if any(table[row][check_column] == 'a' for row in rows):
                for row in rows:
                    table[row][check_column] = 'a'

        print(f'Итерация {i}:')
        print_sbpi(table)

        if table == table_buf:
            return table, False

In [24]:
with open('../../shailushai/rgr_spbi.json', 'r', encoding='utf-8') as f:
    data = orjson.loads(f.read())

fzs = adapter.validate_python(data)
fzs

frozenset({FunctionalDependence(X=frozenset(), Y=Attribute(id=7, name='<unk>')),
           FunctionalDependence(X=frozenset({Attribute(id=4, name='<unk>')}), Y=Attribute(id=5, name='<unk>')),
           FunctionalDependence(X=frozenset({Attribute(id=7, name='<unk>')}), Y=Attribute(id=11, name='<unk>')),
           FunctionalDependence(X=frozenset({Attribute(id=1, name='<unk>')}), Y=Attribute(id=2, name='<unk>')),
           FunctionalDependence(X=frozenset({Attribute(id=1, name='<unk>')}), Y=Attribute(id=3, name='<unk>')),
           FunctionalDependence(X=frozenset({Attribute(id=7, name='<unk>')}), Y=Attribute(id=1, name='<unk>')),
           FunctionalDependence(X=frozenset({Attribute(id=12, name='<unk>')}), Y=Attribute(id=10, name='<unk>')),
           FunctionalDependence(X=frozenset({Attribute(id=7, name='<unk>')}), Y=Attribute(id=9, name='<unk>')),
           FunctionalDependence(X=frozenset({Attribute(id=4, name='<unk>')}), Y=Attribute(id=13, name='<unk>')),
           Function

In [25]:
table, is_success_sbpi = sbpi(fzs)
is_success_sbpi

_____a______a
aaa__________
___aa_______a
a__a__aaa_aa_
_________a_a_

Итерация 1:
_____a______a
aaa__________
___aaa______a
aaaaaaaaaaaaa
_________a_a_



True