In [65]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (here the project's `src.*` helpers) are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:

import math
import time
from tqdm import tqdm
import torch.nn.functional as F

import torch
import numpy as np
import torch.nn as nn
from torch import Tensor

from torch.utils.data import DataLoader
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)

from src.prepare_data import download_data, build_train_vocab, get_train_test_val, check_tokens,tokens_to_sentence , generate_batch , visualize_iter_data , get_embed
# from src.LSTM import 
from src.train import create_mask,generate_square_subsequent_mask, train_epoch , evaluate, bleu_calculate
from src.transformer import Seq2SeqTransformer,PositionalEncoding,TokenEmbedding


#### Requirements

In [67]:
# !pip freeze > requirements.txt
# !python3 -m spacy info # spaCy works on Python versions starting from 3.10
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm

#### Data preparing

In [68]:
# Obtain the file paths of the three splits, then build the German/English
# vocabularies and tokenizers from the training files only.
train_filepaths, val_filepaths, test_filepaths = download_data()
(
    de_vocab,
    en_vocab,
    de_tokenizer,
    en_tokenizer,
) = build_train_vocab(train_filepaths)
print('De vocab En vocab: ', len(de_vocab), len(en_vocab))

# Mind the ordering: the helper is called with (train, test, val) paths but its
# result is unpacked as (train, val, test).
train_data, val_data, test_data = get_train_test_val(
    train_filepaths,
    test_filepaths,
    val_filepaths,
    de_vocab,
    en_vocab,
    de_tokenizer,
    en_tokenizer,
)
print('Train Test Val: ', len(train_data), len(test_data), len(val_data))

De vocab En vocab:  19215 10838
Train Test Val:  29000 1014 1000


#### Special symbols
EXAMPLE: [BOS_IDX , token1 , token2 , token3 , EOS_IDX , PAD_IDX , PAD_IDX , PAD_IDX]

In [69]:
# Mini-batch size picked up by the train/test DataLoaders created in the next cell.
BATCH_SIZE = 128

# Ids of the special tokens, looked up in the German vocabulary.
# NOTE(review): these ids are also used on the target side (padding mask, loss),
# which presumes the English vocabulary assigns the same ids - confirm.
PAD_IDX, BOS_IDX, EOS_IDX = (
    de_vocab[special] for special in ('<pad>', '<bos>', '<eos>')
)
print(PAD_IDX, BOS_IDX, EOS_IDX)

1 2 3


In [70]:
def collate_batch(batch):
    """Forward a list of samples to `generate_batch` with the notebook's special-token ids.

    Defined once so the three loaders are guaranteed to collate identically
    (the original repeated the same lambda three times).
    """
    return generate_batch(batch, BOS_IDX=BOS_IDX, PAD_IDX=PAD_IDX, EOS_IDX=EOS_IDX)


# Only the training split is shuffled. Validation and test are iterated in a
# fixed order so that loss/BLEU numbers are reproducible from run to run.
train_iter = DataLoader(
    train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
# Validation keeps batch_size=1, as in the original cell.
valid_iter = DataLoader(
    val_data, batch_size=1, shuffle=False, collate_fn=collate_batch
)
test_iter = DataLoader(
    test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

# TRAIN

In [71]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DEVICE

'cpu'

# SEQ2SEQ train

In [107]:
# --- Transformer hyper-parameters --------------------------------------------
SRC_VOCAB_SIZE, TGT_VOCAB_SIZE = len(de_vocab), len(en_vocab)
EMB_SIZE = 32
NHEAD = 2
FFN_HID_DIM = 32
# NOTE(review): the DataLoaders were already constructed with the earlier
# BATCH_SIZE value; rebinding the name here does not change their batch size.
BATCH_SIZE = 32
NUM_ENCODER_LAYERS = NUM_DECODER_LAYERS = 2
NUM_EPOCHS = 8

# --- Model ---------------------------------------------------------------------
transformer = Seq2SeqTransformer(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    emb_size=EMB_SIZE,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
    NHEAD=NHEAD,
)

# Xavier-initialise every parameter that has more than one dimension;
# 1-D parameters keep whatever initialisation the model gave them.
for param in transformer.parameters():
    if param.dim() <= 1:
        continue
    nn.init.xavier_uniform_(param)

transformer = transformer.to(DEVICE)

# --- Loss and optimiser ----------------------------------------------------------
# Targets equal to PAD_IDX do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)



In [None]:
# Train the transformer for NUM_EPOCHS epochs. After every epoch the validation
# loss and the validation BLEU score are computed and a one-line summary is printed.
for epoch in range(1, NUM_EPOCHS + 1):
    start_time = time.time()
    print('train')
    train_loss = train_epoch(
        transformer, train_iter, optimizer,
        DEVICE=DEVICE, loss_fn=loss_fn, pad_idx=PAD_IDX,
    )
    end_time = time.time()  # marks the end of the training pass only
    print('eval')
    val_loss = evaluate(
        transformer, valid_iter,
        DEVICE=DEVICE, loss_fn=loss_fn, pad_idx=PAD_IDX,
    )
    print('bleu')
    bleu = bleu_calculate(
        transformer, valid_iter,
        en_vocab=en_vocab, de_vocab=de_vocab, de_tokenizer=de_tokenizer,
        DEVICE=DEVICE, EOS_IDX=EOS_IDX, BOS_IDX=BOS_IDX,
    )
    all_time = time.time()  # training + validation + BLEU
    # The metric label was misspelled "Blue" in the original summary line.
    print(f"Epoch: {epoch}, "
          f"Train loss: {train_loss:.3f}, "
          f"Val loss: {val_loss:.3f}, "
          f"BLEU: {bleu:.3f}, "
          f"Epoch time = {(end_time - start_time):.3f}s, "
          f"All time = {(all_time - start_time):.3f}s")

# RNN encoder-decoder train

In [98]:
# --- Hyper-parameters for the recurrent encoder-decoder ------------------------
# These assignments rebind the same global names used by the transformer cell.
SRC_VOCAB_SIZE, TGT_VOCAB_SIZE = len(de_vocab), len(en_vocab)
EMB_SIZE = 512
NHEAD = 8  # not passed to Seq2SeqRNN below
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = NUM_DECODER_LAYERS = 3
NUM_EPOCHS = 16

# NOTE(review): `Seq2SeqRNN` is not imported by any cell visible in this notebook
# (the `src.LSTM` import line is commented out) - confirm where it comes from.
rnn = Seq2SeqRNN(
    NUM_ENCODER_LAYERS,
    NUM_DECODER_LAYERS,
    EMB_SIZE,
    SRC_VOCAB_SIZE,
    TGT_VOCAB_SIZE,
    FFN_HID_DIM,
)

# Xavier-initialise every parameter that has more than one dimension.
for param in rnn.parameters():
    if param.dim() <= 1:
        continue
    nn.init.xavier_uniform_(param)

In [94]:
# Call the module itself rather than `.forward(...)`, so that nn.Module's
# __call__ machinery (registered hooks) runs as well.
# NOTE(review): `src` and `tgt` are not assigned in any visible cell; they
# presumably hold one batch from a DataLoader - confirm.
out_forward = rnn(src, tgt)

In [95]:
# Shape of the full forward pass. In the recorded run the last dimension (10838)
# equals the English vocabulary size printed by the data-preparation cell.
out_forward.shape

torch.Size([26, 128, 10838])

In [99]:
# Run the encoder and the decoder as two separate calls.
# The first two items of the encoder result are handed to the decoder
# (NOTE(review): presumably encoder outputs and final state - confirm against Seq2SeqRNN).
out_encode = rnn.encode(src)
decoder_inputs = (out_encode[0], out_encode[1])
out_decode = rnn.decode(tgt, *decoder_inputs)

In [100]:
# Shape of the decoder result. In the recorded run the last dimension is 512,
# not the vocabulary-sized 10838 recorded for the full forward pass.
out_decode.shape

torch.Size([26, 128, 512])

In [None]:
# Train the recurrent encoder-decoder. The original cell passed `transformer`
# and the transformer's `optimizer` here, so the RNN was never trained.
# NOTE(review): this assumes the `src.train` helpers accept the RNN model the
# same way they accept the transformer - confirm.
rnn = rnn.to(DEVICE)
# The optimizer has to be built from the RNN's own parameters.
rnn_optimizer = torch.optim.Adam(
    rnn.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
)

for epoch in range(1, NUM_EPOCHS + 1):
    start_time = time.time()
    train_loss = train_epoch(
        rnn, train_iter, rnn_optimizer,
        DEVICE=DEVICE, loss_fn=loss_fn, pad_idx=PAD_IDX,
    )
    end_time = time.time()  # marks the end of the training pass only
    val_loss = evaluate(
        rnn, valid_iter,
        DEVICE=DEVICE, loss_fn=loss_fn, pad_idx=PAD_IDX,
    )
    bleu = bleu_calculate(
        rnn, valid_iter,
        en_vocab=en_vocab, de_vocab=de_vocab, de_tokenizer=de_tokenizer,
        DEVICE=DEVICE, EOS_IDX=EOS_IDX, BOS_IDX=BOS_IDX,
    )
    all_time = time.time()  # training + validation + BLEU
    print(f"Epoch: {epoch}, "
          f"Train loss: {train_loss:.3f}, "
          f"Val loss: {val_loss:.3f}, "
          f"BLEU: {bleu:.3f}, "
          f"Epoch time = {(end_time - start_time):.3f}s, "
          f"All time = {(all_time - start_time):.3f}s")

# Добавляем ATTENTION


In [None]:
class Attention(torch.nn.Module):
    """Base attention layer: a softmax-weighted sum of the encoder states.

    Subclasses provide the scoring function by overriding `_get_weights`.

    Args:
        encoder_dim: size of the last dimension of `values`.
        decoder_dim: size of the last dimension of `query`.
    """

    def __init__(self, encoder_dim: int, decoder_dim: int):
        super().__init__()
        self.encoder_dim = encoder_dim
        self.decoder_dim = decoder_dim

    def _get_weights(self, query: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
        """Return unnormalised scores of shape [..., seq_length].

        Raises:
            NotImplementedError: always, in the base class. Without this hook
                the base class failed with an opaque AttributeError.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement _get_weights(query, values)"
        )

    def forward(self,
        query: torch.Tensor,  # [decoder_dim] or [batch, decoder_dim]
        values: torch.Tensor, # [seq_length, encoder_dim] or [batch, seq_length, encoder_dim]
        ):
        """Return the context vector: [encoder_dim] or [batch, encoder_dim]."""
        scores = self._get_weights(query, values)  # [..., seq_length]
        # Normalise over the sequence axis. For the unbatched case this is the
        # same axis as the original `dim=0`.
        weights = torch.nn.functional.softmax(scores, dim=-1)
        # [..., 1, seq] @ [..., seq, enc] -> [..., 1, enc] -> [..., enc]
        return (weights.unsqueeze(-2) @ values).squeeze(-2)


class AdditiveAttention(Attention):
    """Additive scoring: score_t = v . tanh(W_1 query + W_2 values_t)."""

    def __init__(self, encoder_dim, decoder_dim):
        super().__init__(encoder_dim, decoder_dim)
        # torch.empty(...) replaces the legacy torch.FloatTensor(n) constructor.
        self.v = torch.nn.Parameter(
            torch.empty(self.decoder_dim).uniform_(-0.1, 0.1))
        self.W_1 = torch.nn.Linear(self.decoder_dim, self.decoder_dim)
        self.W_2 = torch.nn.Linear(self.encoder_dim, self.decoder_dim)

    def _get_weights(self,
        query: torch.Tensor,  # [decoder_dim] or [batch, decoder_dim]
        values: torch.Tensor,  # [seq_length, encoder_dim] or [batch, seq_length, encoder_dim]
    ):
        """Return unnormalised additive-attention scores of shape [..., seq_length]."""
        # Project the query once and let broadcasting spread it over the
        # sequence axis, instead of repeating it seq_length times first.
        projected_query = self.W_1(query).unsqueeze(-2)  # [..., 1, decoder_dim]
        hidden = torch.tanh(projected_query + self.W_2(values))  # [..., seq_length, decoder_dim]
        return hidden @ self.v  # [..., seq_length]

In [None]:
# Smoke test for the attention layer defined in this notebook. Its constructor
# takes (encoder_dim, decoder_dim) and it is called with (query, values); the
# base `Attention` class has no scoring function, so `AdditiveAttention` is used.
attention = AdditiveAttention(encoder_dim=20, decoder_dim=10)
hidden = torch.randn(1, 1, 10)  # query, last dimension = decoder_dim
embs = torch.randn(1, 5, 20)    # values, last dimension = encoder_dim

# Strip the leading singleton axes for the call, then restore a (1, 1, dim) context.
context = attention(query=hidden[0, 0], values=embs[0]).reshape(1, 1, -1)
print(context.shape)

torch.Size([1, 1, 20])


In [None]:
# Number of decoding steps performed by RNN_Att.forward.
MAX_GENERATE = 20


class RNN_Att(nn.Module):
    """Encoder -> attention -> decoder model that always emits MAX_GENERATE steps.

    Args:
        input_size: passed to RNNEncoder as `input_size`.
        encoder_size: hidden size of the encoder; also the attention's encoder_dim.
        decoder_size: hidden size of the decoder; also the attention's decoder_dim.
        output_size: size of the last dimension of each decoder output.

    NOTE(review): RNNEncoder and RNNDecoder are not defined in the visible part
    of the notebook, so the shape handling below is an assumption to confirm.
    """

    def __init__(self, input_size, encoder_size, decoder_size, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_encoder_size = encoder_size
        self.hidden_decoder_size = decoder_size
        self.output_size = output_size

        self.encoder = RNNEncoder(input_size=self.input_size, hidden_size=self.hidden_encoder_size)
        # The attention classes in this notebook take (encoder_dim, decoder_dim);
        # the previous `Attention(query_size=..., key_size=...)` call did not match.
        self.attention = AdditiveAttention(encoder_dim=self.hidden_encoder_size,
                                           decoder_dim=self.hidden_decoder_size)
        self.decoder = RNNDecoder(self.hidden_encoder_size, self.hidden_decoder_size, self.output_size)

    def forward(self, input_sequence):
        """Encode `input_sequence` and decode MAX_GENERATE steps with attention.

        Returns the per-step decoder outputs concatenated along dim 1.
        """
        # Create the initial tensors on the same device as the input.
        device = input_sequence.device
        initial_hidden = torch.zeros(1, 1, self.hidden_encoder_size, device=device)
        encoder_hidden, _ = self.encoder(input_sequence, initial_hidden)  # hidden states of the encoder
        # Flatten to [seq_length, encoder_dim] for the unbatched attention call
        # (assumes the last dimension is hidden_encoder_size and batch size 1 - TODO confirm).
        encoder_states = encoder_hidden.reshape(-1, self.hidden_encoder_size)

        # Initial decoder state and "previous output".
        decoder_hidden = torch.zeros(1, 1, self.hidden_decoder_size, device=device)
        current_output = torch.zeros(1, 1, self.output_size, device=device)

        output_sequence = []
        for _step in range(MAX_GENERATE):
            # Query with the flattened decoder state (assumes it holds exactly
            # hidden_decoder_size elements - TODO confirm), then restore (1, 1, dim).
            context = self.attention(
                query=decoder_hidden.reshape(-1), values=encoder_states
            ).reshape(1, 1, -1)
            decoder_hidden, current_output = self.decoder(context, decoder_hidden, current_output)
            output_sequence.append(current_output)
            # TODO: stop early once the end-of-sequence token has been generated.

        return torch.cat(output_sequence, dim=1)

In [None]:
# Smoke test: build RNN_Att and run it on one random tensor of shape (1, 5, input_size).
input_size, encoder_size, decoder_size, output_size = 30, 15, 20, 30
rnn_att = RNN_Att(
    input_size=input_size,
    encoder_size=encoder_size,
    decoder_size=decoder_size,
    output_size=output_size,
)
input_sequence = torch.randn(1, 5, input_size)

# NOTE(review): `ouput_sequence` is a misspelling of `output_sequence`; the name
# is kept as-is in case another cell refers to it.
ouput_sequence = rnn_att(input_sequence)




In [None]:
# Shape experiment: swap the first two axes and the last two axes, then drop
# the trailing singleton axis.
a = torch.randn(5, 4, 1, 15)
b = a.permute((1, 0, 3, 2))  # (5, 4, 1, 15) -> (4, 5, 15, 1)
print(b.shape)
c = b.squeeze(dim=3)         # (4, 5, 15, 1) -> (4, 5, 15)
print(c.shape)

torch.Size([4, 5, 15, 1])
torch.Size([4, 5, 15])
