Свободно переиспользуйте код с семинаров / прошлой домашки. Если копируете код из интернета, укажите ссылку откуда скопировали, это тоже можно. 
Дедлайн: 23 декабря 23:59. 

**ВНИМАНИЕ**: сдавать на почту `almarkv@yandex.ru` с темой письма в формате `[ML_in_SWE] Задание 2 - ФИО`. Вместо `ФИО` подставьте ваше ФИО. Если **тема** письма **не подходит** под этот формат, задание будет **считаться не сданным**.

Ваш код должен выполнять на GPU, если оно доступно.

In [217]:
import os
import math
import random
from random import choices

import torch
from torch import nn
from einops import rearrange
import youtokentome as yttm

device = 'cuda' if torch.cuda.is_available() else 'cpu'
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'

### Задание 1 (3 балла + возможность 1 дополнительного балла в задание 2)

Реализуйте TransformerDecoder

Для этого вам необходимо внести изменения в код с семинара. Основные изменения (а может и все) нужно внести в AttentionLayer. Мы обсуждали их на занятии.

<img src="homework02.jpeg" width="1600" height="800">

In [3]:
class PreNorm(nn.Module):
    """Apply layer normalization to the input and pass it through the layer."""

    def __init__(self, dim: int, layer: nn.Module) -> None:
        super().__init__()
        self.layer = layer
        self.norm = nn.LayerNorm(dim)

    def forward(self, x: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
        x = self.norm(x)
        return self.layer(x, **kwargs)


class FeedForward(nn.Module):
    """ Implement Linear(d, h) -> GeLU() -> Linear(h, d) """

    def __init__(self, dim: int, hidden_dim: int) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim)
        )

    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
        return self.net(x)


class Attention(nn.Module):
    """ Multi-Head Self Attention """

    def __init__(self, dim: int, heads: int = 8, dim_head: int = 64, mask: bool = False) -> None:
        super().__init__()

        inner_dim = dim_head * heads
        need_project_out = not (heads == 1 and dim_head == dim)

        self.mask = mask
        self.heads = heads
        self.scale = dim_head ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.attend = nn.Softmax(dim=-1)

        self.to_out = nn.Linear(inner_dim, dim) if need_project_out else nn.Identity()

    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
        qkv = self.to_qkv(x).chunk(3, dim=-1)  # [batch_size, seq_len, (dim_head * heads)]
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=self.heads), qkv)

        attn = torch.matmul(q, k.transpose(-1, -2))  # [batch_size, seq_len, seq_len]
        attn = attn * self.scale

        if self.mask:
            attn += torch.triu(torch.ones_like(attn), diagonal=1) * float('-inf')

        attn_weights = self.attend(attn)

        out = torch.matmul(attn_weights, v)  # [batch_size, heads, seq_len, dim_head]
        out = rearrange(out, 'b h s d -> b s (h d)')  # [batch_size, seq_len, dim_head * heads == inner_dim]
        return self.to_out(out)

    # def forward_one_head(self, x: torch.FloatTensor) -> torch.FloatTensor:
    #     qkv = self.to_qkv(x) # [batch_size, seq_len, dim * 3]
    #     qkv = qkv.chunk(3, dim=-1) # [batch_size, seq_len, dim] * 3
    #     # qkv[0] # 0, ..., dim - 1 ||| dim, ..., 2 * dim - 1 ||| 2 * dim, ..., 3 * dim - 1
    #     q, k, v = qkv

    #     # q [batch_size, seq_len, dim]
    #     # k [batch_size, seq_len, dim] -> k.transpose(-1, -2) [batch_size, dim, seq_len]

    #     attn = torch.matmul(q, k.transpose(-1, -2)) # [batch_size, seq_len, seq_len]
    #     attn_weights = self.attend(attn) # [batch_size, seq_len, seq_len]

    #     out = torch.matmul(attn_weights, v) # [batch_size, seq_len, dim]
    #     return out

# class TransformerEncoder(nn.Module):
#     def __init__(self, dim: int, depth: int, heads: int, dim_head: int, mlp_dim: int) -> None:
#         super().__init__()
#         self.layers = nn.ModuleList([])
#         for _ in range(depth):
#             layer = nn.ModuleList([
#                 PreNorm(dim, Attention(dim, heads, dim_head)),
#                 PreNorm(dim, FeedForward(dim, mlp_dim)),
#             ])
#             self.layers.append(layer)
#
#         self.norm = nn.LayerNorm(dim)
#
#     def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
#         for attn, ff in self.layers:
#             x = attn(x) + x
#             x = ff(x) + x
#
#         return self.norm(x)


In [None]:
class EthiliumDecoder(nn.Module):
    """Ethilium decoder."""

    def __init__(self, dim: int = 768, nlayes: int = 3, nheads: int = 8, dim_head: int = 64,
                 dim_hid: int = 768 * 4) -> None:
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(nlayes):
            layer = nn.ModuleList([
                PreNorm(dim, Attention(dim, nheads, dim_head, mask=True)),
                # здесь мог быть ваш Encoder-Decoder attention, но его нет
                PreNorm(dim, FeedForward(dim, dim_hid)),
            ])
            self.layers.append(layer)

        self.norm = nn.LayerNorm(dim)

    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x

        return self.norm(x)

### Задание 2 (7 баллов)

Используя TransformerDecoder, реализуйте языковое моделирование на задаче сонетов.

#### Задание 2.1 (1 балл) 

Прочитайте датасет из файла sonnets.txt. Обратите внимание на то, что не все строки в файле относятся к соннетам. Предобработайте соннеты, чтобы сделать из них датасет. Датасет должен иметь следующий вид: это 4 подярд идущие строки в каком-либо сонете в нижнем регистре, такие что 4 строка оканчивается на точку, вопросительный знак или восклицательный знак. 

На примере первого сонета: строчки
```
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
```

не должны быть объектом в датасете, а строчки

```
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.
``` 
должны (вообще, только 4 последнеи строки в 1 сонете пойдут в датасет). Строки могут пересекаться между собой, например на основе 2го сонета будет получено 2 примера в датасет: строчки 9-12 и строчки 11-14.


После этого преобразуйте все полученные примеры в формат `@LINE1@ {line_1} @LINE2@ {line_2} @LINE3@ {line_3} @LINE4@ {line_4}`, например четверостишие выше должно иметь вид `"@LINE1@ within thine own bud buriest thy content, @LINE2@ and tender churl mak'st waste in niggarding: @LINE3@ pity the world, or else this glutton be, @LINE4@ to eat the world's due, by the grave and thee."`

In [35]:
if not os.path.exists('sonnets.txt'):
    !wget https: // raw.githubusercontent.com / girafe-ai / ml-course / master / homeworks_basic / Lab2_DL / sonnets.txt

with open('sonnets.txt', 'r') as iofile:
    text = iofile.readlines()

TEXT_START = 45
TEXT_END = -368
text = text[TEXT_START: TEXT_END]
headline_indices = [0]

for i in range(len(text)):
    if text[i] == '\n':
        headline_indices.append(i)
    text[i] = text[i].lower()[2::]

paragraphs = []

for i in range(0, len(headline_indices) - 1, 2):
    l, r = headline_indices[i], headline_indices[i + 1]
    paragraphs.append(''.join(text[l:r]))

paragraphs = [p.split('\n')[:-1] for p in paragraphs]

suitable_paragraphs = []
for p in paragraphs:
    for i in range(3, len(p)):
        if p[i].endswith(('.', '?', '!')):
            suitable_paragraphs.append(p[i - 3:i + 1])

suitable_paragraphs

[['within thine own bud buriest thy content,',
  "and tender churl mak'st waste in niggarding:",
  '  pity the world, or else this glutton be,',
  "  to eat the world's due, by the grave and thee."],
 ['then being asked, where all thy beauty lies,',
  'where all the treasure of thy lusty days;',
  'to say, within thine own deep sunken eyes,',
  'were an all-eating shame, and thriftless praise.'],
 ["how much more praise deserv'd thy beauty's use,",
  "if thou couldst answer 'this fair child of mine",
  "shall sum my count, and make my old excuse,'",
  'proving his beauty by succession thine!'],
 ["shall sum my count, and make my old excuse,'",
  'proving his beauty by succession thine!',
  '  this were to be new made when thou art old,',
  "  and see thy blood warm when thou feel'st it cold."],
 ['look in thy glass and tell the face thou viewest',
  'now is the time that face should form another;',
  'whose fresh repair if now thou not renewest,',
  'thou dost beguile the world, unbles

In [52]:
data = [f"@LINE1@ {p[0]} @LINE2@ {p[1]} @LINE3@ {p[2].strip()} @LINE4@ {p[3].strip()}" for p in suitable_paragraphs]
data

["@LINE1@ within thine own bud buriest thy content, @LINE2@ and tender churl mak'st waste in niggarding: @LINE3@ pity the world, or else this glutton be, @LINE4@ to eat the world's due, by the grave and thee.",
 '@LINE1@ then being asked, where all thy beauty lies, @LINE2@ where all the treasure of thy lusty days; @LINE3@ to say, within thine own deep sunken eyes, @LINE4@ were an all-eating shame, and thriftless praise.',
 "@LINE1@ how much more praise deserv'd thy beauty's use, @LINE2@ if thou couldst answer 'this fair child of mine @LINE3@ shall sum my count, and make my old excuse,' @LINE4@ proving his beauty by succession thine!",
 "@LINE1@ shall sum my count, and make my old excuse,' @LINE2@ proving his beauty by succession thine! @LINE3@ this were to be new made when thou art old, @LINE4@ and see thy blood warm when thou feel'st it cold.",
 '@LINE1@ look in thy glass and tell the face thou viewest @LINE2@ now is the time that face should form another; @LINE3@ whose fresh repair i

In [53]:
assert "@LINE1@ within thine own bud buriest thy content, @LINE2@ and tender churl mak'st waste in niggarding: @LINE3@ pity the world, or else this glutton be, @LINE4@ to eat the world's due, by the grave and thee." == data[0]

#### Задание 2.2 (1 балл)

Обучите токенизатор BPE на полученном датасете  (воспользуйтесь библиотекой [YTTM](https://github.com/VKCOM/YouTokenToMe)), проверье что `@LINE1@`, `@LINE2@`, `@LINE3@`, `@LINE4@` представлены в виде одного токена. Используйте `vocab_size = 6000`.  

Токенизируйте все примеры в датасете, составьте Vocabulary, проверьте что у вас получается преобразовывать токенизированные примеры в `input_ids` для deep learning (этот код можно скопировать из прошлой домашки + семинаров).

In [69]:
with open('data.txt', 'w') as file:
    file.writelines(' '.join(data))

In [209]:
# Training model
yttm.BPE.train(data='data.txt', vocab_size=6000, model='model.model')

# Loading model
bpe = yttm.BPE(model='model.model')

tokenized = bpe.encode(data, bos=True, eos=True)

Training parameters
  input: data.txt
  model: model.model
  vocab_size: 6000
  n_threads: 8
  character_coverage: 1
  pad: 0
  unk: 1
  bos: 2
  eos: 3

reading file...
learning bpe...
number of unique characters in the training data: 43
number of deleted characters: 0
number of unique characters left: 43
id: 1000=55+612               freq: 1           subword: ▁sake?=▁s+ake?
model saved to: model.model


In [210]:
bpe.decode(tokenized[0])

["<BOS> @LINE1@ within thine own bud buriest thy content, @LINE2@ and tender churl mak'st waste in niggarding: @LINE3@ pity the world, or else this glutton be, @LINE4@ to eat the world's due, by the grave and thee.<EOS>"]

In [211]:
bpe.vocab()[:5]

['<PAD>', '<UNK>', '<BOS>', '<EOS>', '▁']

In [212]:
#padding
max_len = max([len(t) for t in tokenized])
for i in range(len(tokenized)):
    while len(tokenized[i]) < max_len:
        tokenized[i].append(0)

In [214]:
padd_tokens_tensor = torch.LongTensor(tokenized)
padd_tokens_tensor.shape

torch.Size([53, 44])

In [289]:
from torch.utils.data import Dataset

class CustomTokenDataset(Dataset):
    def __init__(self, padd_tokens_tensor):
        self.padd_tokens_tensor = padd_tokens_tensor

    def __len__(self):
        return len(self.padd_tokens_tensor)

    def __getitem__(self, idx):
        return padd_tokens_tensor[idx], padd_tokens_tensor[idx, :-1]

In [290]:
train_dataset = CustomTokenDataset(padd_tokens_tensor[:37])
val_dataset = CustomTokenDataset(padd_tokens_tensor[37:])

In [302]:
from torch.utils.data import DataLoader

batch_size = 12

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [294]:
next(iter(val_dataloader))[1].shape

torch.Size([12, 43])

In [254]:
# def get_batch(data, idx, batch_size = 12):
#     choosen_idx = random.sample(range(padd_tokens_tensor.shape[0]), batch_size)
#     input = padd_tokens_tensor[choosen_idx].transpose(1,0)
#     target = padd_tokens_tensor[choosen_idx , :-1].transpose(1,0)
#     return input, target

In [255]:
# def get_batch(index: int):
#     if index >= padd_tokens_tensor.shape[1]:
#         raise IndexError("what's up?")
#     input = padd_tokens_tensor[:, :index]
#     target = padd_tokens_tensor[:, index].type(torch.LongTensor)
#     return input, target

In [258]:
# get_batch()[0].shape # [seq_len, batch_size]

torch.Size([44, 12])

In [222]:
bpe.vocab_size()

1495

In [131]:
# bpe.encode(choices(bpe.vocab(), weights=torch.rand(1495,1,53).tolist(), k = 1))

TypeError: can only concatenate list (not "float") to list

#### Задание 2.3 (3 балла + 1 бонус)

Реализуйте языковую модель на основе TransformerDecoder-а. Если Вы не выполнили задание выше, используйте `torch.nn.TransformerDecoder`. Если вы выполнили первое задание, вы получите 1 дополнительный балл за использование собственного TransformerDecoder-а.

In [295]:
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
# нам понадобится этот пацан, чтобы че-то заработало
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [296]:
# https://jalammar.github.io/illustrated-gpt2/
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class EthanolDecepticon(nn.Module):
    def __init__(self, vocab_size: int, dim_model: int = 768, nlayers: int = 3, nheads: int = 8, dim_head: int = 64,
                 dim_hid: int = 768 * 4, dropout: float = 0.5):
        super().__init__()

        self.vocab_size = vocab_size
        self.dim_model = dim_model
        self.nlayers = nlayers
        self.nheads = nheads
        self.dim_head = dim_head
        self.dim_hid = dim_hid
        self.dropout = dropout

        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim_model)
        self.pos_encoder = PositionalEncoding(dim_model, dropout)
        self.decoder_layers = EthiliumDecoder(dim_model, nlayers, nheads, dim_head, dim_hid)
        # todo: add fc + softmax
        self.fc = nn.LazyLinear(vocab_size)
        self.softmax = nn.Softmax(dim = 0)
        # encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        # self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        # self.encoder = nn.Embedding(ntoken, d_model)
        # self.d_model = d_model
        # self.decoder = nn.Linear(d_model, ntoken)

    #     self.init_weights()
    #
    # def init_weights(self) -> None:
    #     initrange = 0.1
    #     self.encoder.weight.data.uniform_(-initrange, initrange)
    #     self.decoder.bias.data.zero_()
    #     self.decoder.weight.data.uniform_(-initrange, initrange)

    # takes tokenized input
    # returns probability distribution vector
    def forward(self, src: torch.Tensor) -> torch.Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, vocab_size]
        """
        src = self.embed(src) * math.sqrt(self.dim_model)
        src = self.pos_encoder(src)
        src = self.decoder_layers(src)
        src = self.fc(src)
        return self.softmax(src)


model = EthanolDecepticon(vocab_size=bpe.vocab_size())



In [305]:
model(next(iter(val_dataloader))[0].transpose(1,0)) # transpose!

tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        ...,

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]

In [306]:
criterion = nn.CrossEntropyLoss()
input, target = next(iter(val_dataloader))
pred = model(input)
loss = criterion(pred.view(-1, model.vocab_size), target.view(-1))
loss

ValueError: Expected input batch_size (528) to match target batch_size (516).

##### Задание 2.4 (2 балла)

Обучите модель. Изобразите как меняется лосс, сгенерируйте несколько примеров стихов. Баллы за качество генерации снижаться не будут, но лосс должен понижаться + должна быть функция для получения случайного стиха.

In [186]:
from tqdm import tqdm

# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(nepochs: int, model: nn.Module) -> None:
    # log_interval = 200

    for epoch in range(nepochs):
        # total_loss = 0.
        model.train()
        for input, target in train_dataloader:
            pred = model(input)

            loss = criterion(pred.view(-1, model.vocab_size), target.view(-1)) # вообще, так неправильно, но уже ладно

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            if i % log_interval == 0 and i > 0:
                lr = scheduler.get_last_lr()[0]
                loss_interval_mean = total_loss / log_interval
                print(f'| epoch {epoch:3d} | {i:5d}/{768:5d} batches | '
                      f'lr {lr:02.2f} | '
                      f'loss mean {loss_interval_mean:5.2f} | '
                      f'current batch loss {loss}')
                total_loss = 0

        # total_loss = 0.
        model.eval()
        for input, target in val_dataloader:
            pred = model(input)

            loss = criterion(pred, target) # вообще, так неправильно, но уже ладно

            total_loss += loss.item()
            if i % log_interval == 0 and i > 0:
                lr = scheduler.get_last_lr()[0]
                loss_interval_mean = total_loss / log_interval
                print(f'| epoch {epoch:3d} | {i:5d}/{768:5d} batches | '
                      f'lr {lr:02.2f} | '
                      f'loss mean {loss_interval_mean:5.2f} | '
                      f'current batch loss {loss}')
                total_loss = 0

    scheduler.step()

In [162]:
train(1, model)

  return self.softmax(output)
  0%|          | 0/768 [00:01<?, ?it/s]


RuntimeError: Expected target size [53, 1495], got [53]

In [10]:


def compute_loss(predictions: torch.Tensor, answers):


# YOUR CODE GOES HERE

def traininng_step(model, optimizer, inputs):
    """
    This function performs a training step.
    param: model -- SonetModel object
    param: optimizer -- torch.optim object
    param: inputs -- output of torch DataLoader, one batch 
    """
    pass


def train(model, optimizer, train_loader, val_loader, num_epochs=10):
    """
    This function performs a training loop.
    param: model -- SonetModel object
    param: optimizer -- torch.optim object
    param: train_loader -- torch DataLoader object
    param: val_loader -- torch DataLoader object
    param: num_epochs -- number of epochs
    """
    pass

Обратите внимание, что специальные токены были нужны нам чтобы после генерации мы могли разложить стихи на четверостишия обратно и красиво его нариовать (ну и чтобы моделе помочь выучить рифму/ритм на самом деле, но для этого нужны большие трансформеры)