In [None]:
# sacrebleu provides the corpus-level BLEU score used to validate the model.
%pip install sacrebleu

In [None]:
!pip install torchtext
!pip install torch==2.3.0

In [None]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from torchtext.utils import download_from_url, extract_archive
import io

In [None]:
import numpy as np 
import pandas as pd 
import os
import math
from tqdm import tqdm
import sacrebleu 
import random
import wandb
import re
import time

In [None]:
# Work from the dataset directory so the bare file names used below resolve.
%cd ../input/bhw2-data/data

Практически весь код взят из туториала PyTorch:
https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html

In [None]:
# Alternative that is not used here: en_tokenizer = get_tokenizer("basic_english")
def _lowercase_whitespace_split(text):
    """Lower-case ``text`` and split it on runs of whitespace."""
    # str.split() without a separator already ignores leading and trailing
    # whitespace, so this yields the same tokens as lower().strip().split().
    return text.lower().split()


def english_tokenizer(text):
    """Tokenize an English line: lower-case it and split on whitespace."""
    return _lowercase_whitespace_split(text)


def german_tokenizer(text):
    """Tokenize a German line: lower-case it and split on whitespace."""
    return _lowercase_whitespace_split(text)


# Short aliases used by the vocabulary and data-processing code.
en_tokenizer = english_tokenizer
de_tokenizer = german_tokenizer

In [None]:
# Parallel splits are (German file, English file) pairs; the names are relative
# to the current working directory.
train_filepaths = (
    'train.de-en.de',
    'train.de-en.en',
)
val_filepaths = (
    'val.de-en.de',
    'val.de-en.en',
)
# The test split is given as a single German file name.
test_filepaths = 'test1.de-en.de'

In [None]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(filepath, tokenizer):
    """Lazily yield the tokenized form of every line of a UTF-8 text file.

    Args:
        filepath: path of the text file to read.
        tokenizer: callable applied to each raw line (trailing newline included).

    Yields:
        Whatever ``tokenizer`` returns for each line, in file order.
    """
    with io.open(filepath, encoding="utf8") as handle:
        # map() is lazy, so lines are still read and tokenized one at a time.
        yield from map(tokenizer, handle)

def _build_vocab(filepath, tokenizer):
    """Build a vocabulary from one training file.

    Tokens seen fewer than 8 times are left out; unknown tokens are mapped to
    the index of ``'<unk>'`` through the vocabulary's default index.
    """
    built = build_vocab_from_iterator(
        yield_tokens(filepath, tokenizer),
        specials=['<unk>', '<pad>', '<bos>', '<eos>'],
        min_freq=8,
    )
    built.set_default_index(built['<unk>'])
    return built


# German (source) vocabulary first, then English (target), both from the training files.
de_vocab = _build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = _build_vocab(train_filepaths[1], en_tokenizer)

In [None]:
def data_process(filepaths):
    """Convert raw text file(s) into tensors of vocabulary indices.

    Args:
        filepaths: either a ``(german_path, english_path)`` pair describing a
            parallel corpus, or a single path (``str`` / ``os.PathLike``) to a
            German-only file.

    Returns:
        For a pair: a list of ``(de_tensor, en_tensor)`` tuples, one per line
        pair. If the two files differ in length, the extra lines of the longer
        one are ignored (``zip`` semantics, as before).
        For a single path: a list of ``de_tensor``.
        Every tensor is 1-D ``torch.long`` and holds the indices of the
        tokenized line; no ``<bos>``/``<eos>`` markers are added here.

    Raises:
        ValueError: if ``filepaths`` is a sequence that does not hold exactly
            two paths.
    """
    def encode(raw_line, tokenizer, vocabulary):
        # One line of text -> 1-D LongTensor of token indices.
        return torch.tensor([vocabulary[token] for token in tokenizer(raw_line)],
                            dtype=torch.long)

    # Decide by type, not by len(): a two-character string is still one path.
    if isinstance(filepaths, (str, os.PathLike)):
        with io.open(filepaths, encoding="utf8") as de_file:
            return [encode(raw_de, de_tokenizer, de_vocab) for raw_de in de_file]

    de_path, en_path = filepaths
    # The context manager closes both files even if encoding a line fails.
    with io.open(de_path, encoding="utf8") as de_file, \
            io.open(en_path, encoding="utf8") as en_file:
        return [
            (encode(raw_de, de_tokenizer, de_vocab),
             encode(raw_en, en_tokenizer, en_vocab))
            for raw_de, raw_en in zip(de_file, en_file)
        ]
        

# Encode the three splits in order: train and validation are lists of
# (German, English) tensor pairs, test is a list of German tensors.
train_data, val_data, test_data = (
    data_process(split_paths)
    for split_paths in (train_filepaths, val_filepaths, test_filepaths)
)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

BATCH_SIZE = 64

# Indices of the padding and sentence-boundary markers, looked up in the
# German vocabulary.
PAD_IDX, BOS_IDX, EOS_IDX = (
    de_vocab[marker] for marker in ('<pad>', '<bos>', '<eos>')
)

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    """Collate ``(German, English)`` index tensors into two padded batches.

    Every sequence is wrapped in ``BOS_IDX`` ... ``EOS_IDX`` and the sequences
    of each language are then padded with ``PAD_IDX`` to a common length.

    Args:
        data_batch: iterable of ``(de_item, en_item)`` 1-D index tensors.

    Returns:
        ``(de_batch, en_batch)`` as produced by ``pad_sequence`` with its
        default layout (sequence dimension first, batch dimension second).
    """
    def add_boundaries(sequence):
        return torch.cat(
            [torch.tensor([BOS_IDX]), sequence, torch.tensor([EOS_IDX])], dim=0)

    framed_pairs = [(add_boundaries(de_item), add_boundaries(en_item))
                    for de_item, en_item in data_batch]

    de_batch = pad_sequence([pair[0] for pair in framed_pairs], padding_value=PAD_IDX)
    en_batch = pad_sequence([pair[1] for pair in framed_pairs], padding_value=PAD_IDX)
    return de_batch, en_batch

# Training batches are reshuffled every epoch.
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
# Validation batches keep a fixed order so that every epoch is scored on the
# same sentences; shuffling here would make epoch-to-epoch BLEU comparisons
# (and therefore best-checkpoint selection) depend on the random batch layout.
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=generate_batch)


In [None]:
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor


class Encoder(nn.Module):
    """Bidirectional GRU encoder that also produces the decoder's initial state.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        emb_dim: embedding width.
        enc_hid_dim: hidden size of each GRU direction.
        dec_hid_dim: width of the returned initial decoder hidden state.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        # The raw dropout probability is not stored separately: it used to be
        # assigned to ``self.dropout`` and was immediately overwritten by the
        # nn.Dropout module below. It stays available as ``self.dropout.p``.

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)

        # Maps the concatenated forward/backward final states to the decoder size.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self,
                src: Tensor) -> Tuple[Tensor, Tensor]:
        """Encode a batch of source index sequences.

        Args:
            src: integer token indices laid out as ``(src_len, batch)`` (the
                GRU is created with the default ``batch_first=False``).

        Returns:
            ``(outputs, hidden)`` where ``outputs`` has shape
            ``(src_len, batch, 2 * enc_hid_dim)`` and ``hidden`` has shape
            ``(batch, dec_hid_dim)``.
        """
        embedded = self.dropout(self.embedding(src))

        outputs, hidden = self.rnn(embedded)

        # hidden[-2] / hidden[-1] are the final forward / backward states of the
        # GRU; they are concatenated and squashed into the decoder's initial state.
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))

        return outputs, hidden


class Attention(nn.Module):
    """Scores every encoder position against the current decoder state.

    Args:
        enc_hid_dim: hidden size of one encoder direction (the encoder outputs
            are expected to be ``2 * enc_hid_dim`` wide).
        dec_hid_dim: decoder hidden size.
        attn_dim: width of the intermediate energy vector.
    """

    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # Width of [decoder hidden ; encoder output]; the decoder reads this too.
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim
        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tensor:
        """Return attention weights of shape ``(batch, src_len)``.

        Args:
            decoder_hidden: ``(batch, dec_hid_dim)`` decoder state.
            encoder_outputs: ``(src_len, batch, 2 * enc_hid_dim)`` encoder outputs.
        """
        steps = encoder_outputs.shape[0]

        # Copy the decoder state once per source position: (batch, src_len, dec_hid).
        hidden_per_step = decoder_hidden.unsqueeze(1).repeat(1, steps, 1)
        # Bring the encoder outputs to batch-first layout to match.
        outputs_batch_first = encoder_outputs.permute(1, 0, 2)

        scorer_input = torch.cat((hidden_per_step, outputs_batch_first), dim=2)
        energy = torch.tanh(self.attn(scorer_input))

        # Collapse the energy vector into one score per source position.
        scores = torch.sum(energy, dim=2)
        return F.softmax(scores, dim=1)


class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Args:
        output_dim: size of the target vocabulary (width of the returned logits).
        emb_dim: embedding width.
        enc_hid_dim: hidden size of one encoder direction.
        dec_hid_dim: decoder hidden size.
        dropout: dropout probability applied to the embeddings.
        attention: module that maps ``(decoder_hidden, encoder_outputs)`` to
            ``(batch, src_len)`` weights and exposes an ``attn_in`` attribute.
    """

    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float,
                 attention: nn.Module):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        # The raw dropout probability is not stored separately: it used to be
        # assigned to ``self.dropout`` and was immediately overwritten by the
        # nn.Dropout module below. It stays available as ``self.dropout.p``.
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # The GRU consumes [embedded token ; attention-weighted encoder summary].
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        # attn_in == 2 * enc_hid_dim + dec_hid_dim, so this layer takes
        # [GRU output ; weighted encoder summary ; embedded token].
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def _weighted_encoder_rep(self,
                              decoder_hidden: Tensor,
                              encoder_outputs: Tensor) -> Tensor:
        """Return the attention-weighted sum of encoder outputs, shape ``(1, batch, 2 * enc_hid_dim)``."""
        a = self.attention(decoder_hidden, encoder_outputs)

        # (batch, src_len) -> (batch, 1, src_len) so bmm yields one row per sample.
        a = a.unsqueeze(1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        weighted_encoder_rep = torch.bmm(a, encoder_outputs)

        # Back to sequence-first layout to match the GRU input.
        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)

        return weighted_encoder_rep

    def forward(self,
                input: Tensor,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tuple[Tensor, Tensor]:
        """Run one decoding step.

        Args:
            input: ``(batch,)`` indices of the previous target tokens.
            decoder_hidden: ``(batch, dec_hid_dim)`` previous decoder state.
            encoder_outputs: ``(src_len, batch, 2 * enc_hid_dim)`` encoder outputs.

        Returns:
            ``(logits, hidden)`` with shapes ``(batch, output_dim)`` and
            ``(batch, dec_hid_dim)``.
        """
        # Add a length-1 sequence dimension: (batch,) -> (1, batch).
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden,
                                                          encoder_outputs)

        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim=2)

        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        # Drop the length-1 sequence dimension again before the projection.
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)

        output = self.out(torch.cat((output,
                                     weighted_encoder_rep,
                                     embedded), dim=1))

        return output, decoder_hidden.squeeze(0)


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that unrolls the decoder over a target sequence.

    Args:
        encoder: module returning ``(encoder_outputs, hidden)`` for a source batch.
        decoder: module exposing ``output_dim`` and returning
            ``(logits, hidden)`` for one decoding step.
        device: device on which the tensor of collected logits is allocated.
    """

    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: Tensor,
                trg: Tensor,
                teacher_forcing_ratio: float = 0.5) -> Tensor:
        """Decode ``trg.shape[0] - 1`` steps and return the logits of every step.

        Args:
            src: ``(src_len, batch)`` source indices.
            trg: ``(trg_len, batch)`` target indices; row 0 feeds the first step.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own arg-max prediction.

        Returns:
            Tensor of shape ``(trg_len, batch, decoder.output_dim)``; row 0 is
            never written and stays all zeros.
        """
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # Allocate directly on the target device instead of building the tensor
        # on the CPU and copying it over.
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size, device=self.device)

        encoder_outputs, hidden = self.encoder(src)

        # The first decoder input is row 0 of the target.
        output = trg[0, :]

        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            # One random draw per step, exactly as before.
            teacher_force = random.random() < teacher_forcing_ratio
            # The greedy prediction is only computed when it is actually fed back.
            output = trg[t] if teacher_force else output.argmax(dim=1)

        return outputs


# Vocabulary sizes fix the embedding tables and the width of the output logits.
INPUT_DIM = len(de_vocab)
OUTPUT_DIM = len(en_vocab)

# Model hyperparameters.
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
ENC_HID_DIM = 256
DEC_HID_DIM = 256
ATTN_DIM = 8
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2

# Build the parts in the same order as before: encoder, attention, decoder.
enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=ENC_DROPOUT,
)

attn = Attention(
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    attn_dim=ATTN_DIM,
)

dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=DEC_DROPOUT,
    attention=attn,
)

model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)


def init_weights(m: nn.Module):
    """Re-initialise every parameter reachable from ``m``.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution with mean 0 and standard deviation 0.01; all other parameters
    are set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)


# nn.Module.apply runs init_weights on every submodule and on the model itself.
model.apply(init_weights)

# Adam over all model parameters; no optimizer hyperparameters are overridden here.
optimizer = optim.Adam(model.parameters())

In [None]:
# Index of '<pad>' in the English (target) vocabulary; '<pad>' is one of the
# specials the vocabulary was built with, so a direct lookup finds it.
PAD_IDX = en_vocab['<pad>']

# Padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
def translate_sentence(model, src_tensor, src_vocab, trg_vocab, max_len=100):
    """Greedily decode a translation for one source sentence.

    Args:
        model: object exposing ``eval()``, ``encoder(src)`` and
            ``decoder(token, hidden, encoder_outputs)``. It is switched to
            evaluation mode and left in that mode.
        src_tensor: source token indices; a 1-D tensor is reshaped to
            ``(src_len, 1)`` before encoding.
        src_vocab: not used by this function; kept for call compatibility.
        trg_vocab: target vocabulary supporting ``trg_vocab[token]`` and
            ``get_itos()``.
        max_len: maximum number of decoding steps.

    Returns:
        The decoded target tokens without the leading ``<bos>``. The list ends
        with ``'<eos>'`` when the model emitted it within ``max_len`` steps.
    """
    model.eval()
    if src_tensor.dim() == 1:
        src_tensor = src_tensor.unsqueeze(1)

    # Look these up once; previously the <eos> index was fetched on every step
    # and the full index-to-token list was rebuilt for every output token.
    eos_idx = trg_vocab['<eos>']
    itos = trg_vocab.get_itos()

    predicted = []
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

        previous = trg_vocab['<bos>']
        for _ in range(max_len):
            step_input = torch.tensor([previous], dtype=torch.long,
                                      device=src_tensor.device)
            output, hidden = model.decoder(step_input, hidden, encoder_outputs)

            previous = output.argmax(dim=1).item()
            predicted.append(previous)

            # Stop at <eos> (it is kept as the last returned token).
            if previous == eos_idx:
                break

    return [itos[i] for i in predicted]

In [None]:
def evaluate_bleu(model, iterator, src_vocab, trg_vocab, device, sentences_per_batch=1):
    """Compute corpus BLEU of greedy translations against the target sentences.

    Args:
        model: model passed on to ``translate_sentence``; it is put in eval mode.
        iterator: iterable of ``(src, trg)`` batches laid out as ``(seq_len, batch)``.
        src_vocab: source vocabulary (forwarded to ``translate_sentence``).
        trg_vocab: target vocabulary supporting ``trg_vocab[token]`` and ``get_itos()``.
        device: device the batches are moved to.
        sentences_per_batch: how many columns of every batch are translated.
            The default of 1 keeps the original behaviour of scoring only the
            first sentence of each batch; pass ``None`` to score every sentence.

    Returns:
        The sacrebleu corpus BLEU score as a float, or ``0.0`` if the iterator
        produced no sentences.
    """
    model.eval()

    # Built once instead of once per sentence.
    itos = trg_vocab.get_itos()
    # Boundary and padding markers are bookkeeping, not text; leaving them in
    # would make "<eos>" / "<pad>" count as matched words.
    marker_tokens = ('<bos>', '<eos>', '<pad>')
    marker_ids = {trg_vocab[token] for token in marker_tokens}

    hypotheses = []
    references = []

    with torch.no_grad():
        for src, trg in iterator:
            src, trg = src.to(device), trg.to(device)

            batch_size = src.shape[1]
            if sentences_per_batch is None:
                n_scored = batch_size
            else:
                n_scored = min(sentences_per_batch, batch_size)

            for col in range(n_scored):
                pred_tokens = translate_sentence(model, src[:, col], src_vocab, trg_vocab)
                hypotheses.append(
                    " ".join(tok for tok in pred_tokens if tok not in marker_tokens))

                references.append(
                    " ".join(itos[i] for i in trg[:, col].tolist() if i not in marker_ids))

    if not hypotheses:
        return 0.0

    # sacrebleu expects a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. With a single reference per sentence that is one
    # stream holding all references, not one single-element list per sentence.
    bleu = sacrebleu.corpus_bleu(hypotheses, [references])
    return bleu.score


In [None]:
def train(model: nn.Module,
          iterator: torch.utils.data.DataLoader,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    """Run one optimisation pass over ``iterator``.

    For every batch the model is called with ``(src, trg)``; the first row of
    both the logits and the targets is dropped before the loss is computed,
    gradients are clipped to a total norm of ``clip`` and the optimizer takes
    one step.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    batch_losses = []

    for src, trg in iterator:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        logits = model(src, trg)

        # Flatten (steps, batch, vocab) -> (steps * batch, vocab), skipping row 0.
        vocab_size = logits.shape[-1]
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(iterator)


def epoch_time(start_time: float, end_time: float):
    """Split the interval ``end_time - start_time`` (seconds) into whole minutes and whole seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


In [None]:
epochs = 5
clip = 1
# Start below any attainable score so the first epoch always writes a
# checkpoint, even if validation BLEU is exactly 0 (with the strict ">" below,
# an initial value of 0 would never save anything in that case).
best_valid_bleu = float('-inf')
for epoch in tqdm(range(epochs)):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, clip)
    valid_bleu = evaluate_bleu(model, valid_iter, de_vocab, en_vocab, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. BLEU: {valid_bleu:.2f}')

    # Keep only the weights of the best-scoring epoch so far.
    if valid_bleu > best_valid_bleu:
        best_valid_bleu = valid_bleu
        torch.save(model.state_dict(), '/kaggle/working/best-model.pt')


In [21]:
# Wrap the existing encoder/decoder again; they are the same module objects
# that were created (and moved to `device`) above.
model = Seq2Seq(enc, dec, device)

# No re-initialisation before loading: load_state_dict (strict by default)
# overwrites every parameter, and re-initialising first would only destroy the
# in-memory weights if the load failed.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict holds.
model.load_state_dict(torch.load("../../model-pts/best-model.pt",
                                 map_location=device, weights_only=True))

model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(17706, 128)
    (rnn): GRU(128, 256, bidirectional=True)
    (fc): Linear(in_features=512, out_features=256, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=8, bias=True)
    )
    (embedding): Embedding(14266, 128)
    (rnn): GRU(640, 256)
    (out): Linear(in_features=896, out_features=14266, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

In [22]:
def generate_batch_test(data_batch):
    """Collate German-only index tensors into one padded batch.

    Every sequence is wrapped in ``BOS_IDX`` ... ``EOS_IDX`` and the batch is
    padded with ``PAD_IDX``.

    Args:
        data_batch: iterable of 1-D index tensors.

    Returns:
        The padded batch as produced by ``pad_sequence`` with its default
        layout (sequence dimension first, batch dimension second).
    """
    framed = [
        torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])])
        for de_item in data_batch
    ]
    return pad_sequence(framed, padding_value=PAD_IDX)


# Keep the file order (shuffle=False) so line i of the output translates line i
# of the test file.
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=False, collate_fn=generate_batch_test)

# translate_sentence keeps the terminating '<eos>' in its result; boundary and
# padding markers must not end up in the written translations.
marker_tokens = ('<bos>', '<eos>', '<pad>')

predictions = []
model.eval()
with torch.no_grad():
    for batch in test_iter:
        # Move the whole batch once instead of once per column.
        batch = batch.to(device)
        for i in range(batch.size(1)):
            predicted_tokens = translate_sentence(model, batch[:, i], de_vocab, en_vocab)
            predictions.append(
                " ".join(tok for tok in predicted_tokens if tok not in marker_tokens))


# One translation per line, in the order of the test file.
with open("/kaggle/working/test.de-en.en", "w", encoding="utf-8") as f:
    f.writelines(pred + "\n" for pred in predictions)
