In [None]:
!pip install tqdm

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the project root used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
import os
import json
import math
import random
import time

import numpy as np
from datetime import datetime
from typing import Optional
from collections import Counter
from configparser import ConfigParser
from typing import List, Dict, Tuple, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch import Tensor
import transformers

from nltk.tokenize import word_tokenize
import nltk
from gensim.models import Word2Vec

from torch.nn.utils import clip_grad_norm_
from tqdm.auto import tqdm, trange
from gensim.models import KeyedVectors
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

In [None]:
# Fetch the NLTK tokenizer data packages (presumably what word_tokenize needs at runtime).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Project layout on the mounted drive: every other directory is derived from `root`.
root = "/content/drive/MyDrive/nlp_train_contests/1"
models_dir = os.path.join(root, "models")
data_dir = os.path.join(root, "data")
submission_dir = os.path.join(root, "submissions")
config_dir = os.path.join(root, "configs")
config_path = os.path.join(config_dir, "conf.ini")


# Passed to Word2Vec as min_count when the vocabularies are trained.
MIN_FREQ = 3 # second variant uses 5

In [None]:
def get_date_id():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``.

    The string is used as a suffix in checkpoint and submission file names.
    """
    return datetime.now().strftime("%Y%m%d_%H%M%S")


# def save_submission(submission:List, save_dir:str):
#   os.makedirs(save_dir, exist_ok=True)
#   filename = os.path.join(f"submission_{get_date_id()}.txt")
#   with open(os.path.join(save_dir, filename), "w") as f:
#       for line in submission:
#           f.write("{}\n".format(" ".join(list(map(str, map(int, line))))))

def save_submission(submission:List[dict], save_dir:str):
    """Write ``submission`` to a timestamped ``.jsonl`` file inside ``save_dir``.

    Each dict becomes one compact JSON line (keys sorted, non-ASCII characters
    kept as-is). ``save_dir`` is created if it does not exist.

    :param submission: records to serialise, one per output line.
    :param save_dir: directory that receives ``submission_<timestamp>.jsonl``.
    """
    os.makedirs(save_dir, exist_ok=True)
    target = os.path.join(save_dir, f"submission_{get_date_id()}.jsonl")
    with open(target, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, sort_keys=True, ensure_ascii=False, separators=(',', ':')) + "\n"
            for record in submission
        )

def save_models(model:nn.Module,optimizer: torch.optim.Adam, scheduler, model_name:str, save_dir:str):
    """Save a training checkpoint as ``<model_name>_<timestamp>.pt`` in ``save_dir``.

    The checkpoint is a dict with three entries: ``'model_state_dict'``,
    ``'optimizer_state_dict'`` and ``'scheduler_step_dict'``.
    Saving is best-effort: any exception is printed and swallowed so a failed
    write does not interrupt the caller.

    :param model: module whose weights are stored.
    :param optimizer: optimizer whose state is stored.
    :param scheduler: LR scheduler whose state is stored.
    :param model_name: file-name prefix.
    :param save_dir: destination directory (created if missing).
    """
    os.makedirs(save_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, f"{model_name}_{get_date_id()}.pt")
    try:
        state = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_step_dict': scheduler.state_dict(),
        }
        torch.save(state, checkpoint_path)
    except Exception as e:
        print(f"ошибка в сохранении модели {e}")

def load_weights(model:nn.Module, path:str, device:torch.device):
    """Load model weights from ``path`` into ``model`` (in place).

    Accepts both file layouts used in this notebook: a bare ``state_dict``
    and the checkpoint dict written by ``save_models`` (weights stored under
    the ``'model_state_dict'`` key). If ``path`` does not exist a message is
    printed and the model is left untouched.

    :param model: module that receives the weights.
    :param path: checkpoint file path.
    :param device: device the stored tensors are mapped to while loading.
    """
    if not os.path.exists(path):
        print("Путь модели не существует!")
        return

    loaded = torch.load(path, map_location=device, weights_only=True)
    # Full training checkpoints wrap the weights; unwrap so load_state_dict
    # receives only the parameter mapping.
    if isinstance(loaded, dict) and "model_state_dict" in loaded:
        loaded = loaded["model_state_dict"]
    model.load_state_dict(loaded)
    print(f"Модель успешно загружена из {path}")


def load_config(path: str) ->ConfigParser:
    """Read an INI file into a ``ConfigParser``.

    ``ConfigParser.read`` silently skips files it cannot open, so a wrong path
    would otherwise produce an empty config without any hint. This function
    prints a message in that case and still returns the (empty) parser.

    :param path: path of the INI file.
    :return: the populated parser; an empty parser if the file could not be
        read; ``None`` if parsing raised an exception (the error is printed).
    """
    try:
        config = ConfigParser()
        # read() returns the list of files that were actually parsed.
        parsed_files = config.read(path)
        if not parsed_files:
            print(f"ошибка чтения конфига {path}")
        return config
    except Exception as e:
        print(f"ошибка чтения конфига {e}")
        return None

In [None]:
# Locations of the three dataset splits inside data_dir.
train_path = os.path.join(data_dir, "train")
val_path = os.path.join(data_dir, "val")
test_path = os.path.join(data_dir, "test_no_reference")
# Bare expression: displays the resolved validation path as the cell output.
val_path

'/content/drive/MyDrive/nlp_train_contests/1/data/val'

In [None]:
# Read the validation split: one JSON object per line, kept as [src, dst] pairs.
with open(val_path, "r", encoding="utf-8") as f:
    val_data = [[record["src"], record["dst"]] for record in map(json.loads, f)]

In [None]:
# Read the training split: one JSON object per line, kept as [src, dst] pairs.
with open(train_path, "r", encoding="utf-8") as f:
    train_data = [[record["src"], record["dst"]] for record in map(json.loads, f)]

In [None]:
##### Format: {"dst":"....", "src":....}
# Report how many sentence pairs each split contains.
print(f"Длина трейна = {len(train_data)}")
print(f"Длина вала = {len(val_data)}")

Длина трейна = 300000
Длина вала = 500


In [None]:
# Detach the "▵" marker from the preceding word on the source side, then
# tokenize both sides; each pair becomes (src_tokens, dst_tokens).
val_data = [
    (word_tokenize(src_text.replace("▵", " ▵")), word_tokenize(dst_text))
    for src_text, dst_text in val_data
]
train_data = [
    (word_tokenize(src_text.replace("▵", " ▵")), word_tokenize(dst_text))
    for src_text, dst_text in train_data
]

In [None]:
# Model and training hyper-parameters shared by the cells below.
BATCH_SIZE = 256
EMB_SIZE = ENC_EMB_DIM = DEC_EMB_DIM = 192 # second variant uses 192; the local-machine run currently uses 96
NHEAD = 6
FFN_HID_DIM = 512 # second variant (for 192); the local-machine run currently uses 256
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
# Both spellings (DEVICE and device) are referenced later in the notebook.
DEVICE = device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
NUM_EPOCHS = 70

SRC_LANGUAGE = 'ze'
TGT_LANGUAGE = 'en'

In [None]:
def train_wv_src():
    """Train source-side Word2Vec vectors and save both the model and its vocabulary.

    Every training source sentence is wrapped in ``<sos>``/``<eos>`` before
    training. After training, ``<unk>`` and ``<pad>`` entries with random
    unit-norm vectors (NumPy global seed 1234) are appended to the keyed
    vectors. Two files are written to ``models_dir``:
    ``model_src_<EMB_SIZE>`` and ``model_src_vocab_<EMB_SIZE>``.
    """
    sentences = [["<sos>", *pair[0], "<eos>"] for pair in train_data]
    w2v = Word2Vec(sentences=sentences, vector_size=ENC_EMB_DIM, window=5, min_count=MIN_FREQ, workers=4)
    w2v.save(os.path.join(models_dir, f"model_src_{EMB_SIZE}"))

    def _random_unit_vector():
        # Uniform random float32 vector scaled to unit L2 norm.
        vec = np.random.random(size = ENC_EMB_DIM).astype(np.float32)
        vec /= np.linalg.norm(vec).astype(np.float32)
        return vec

    # Only the words and their trained embeddings are kept from here on.
    keyed_vectors = w2v.wv

    np.random.seed(1234)
    # Dict literal evaluates in order, so <unk> is drawn before <pad>.
    specials = {"<unk>": _random_unit_vector(), "<pad>": _random_unit_vector()}
    for token, vector in specials.items():
        keyed_vectors.add_vector(token, vector)

    keyed_vectors.save(os.path.join(models_dir, f"model_src_vocab_{EMB_SIZE}"))

In [None]:
def train_wv_trg():
    """Train target-side Word2Vec vectors and save both the model and its vocabulary.

    Every training target sentence is wrapped in ``<sos>``/``<eos>`` before
    training. After training, ``<unk>`` and ``<pad>`` entries with random
    unit-norm vectors are appended to the keyed vectors. Two files are written
    to ``models_dir``: ``model_trg_<EMB_SIZE>`` and ``model_trg_vocab_<EMB_SIZE>``.

    The special-token vectors are drawn from a locally seeded generator so
    they are reproducible, matching the seeded source-side function, without
    touching NumPy's global random state.
    """
    train_data_trg = [["<sos>"] +el[1]+["<eos>"]  for el in train_data]
    model_trg = Word2Vec(sentences=train_data_trg, vector_size=DEC_EMB_DIM, window=5, min_count=MIN_FREQ, workers=4)
    model_trg.save(os.path.join(models_dir, f"model_trg_{EMB_SIZE}"))

    # Store just the words + their trained embeddings.
    word_vectors = model_trg.wv

    rng = np.random.RandomState(1234)

    def _random_unit_vector():
        # Uniform random float32 vector scaled to unit L2 norm.
        vec = rng.random_sample(size=DEC_EMB_DIM).astype(np.float32)
        vec /= np.linalg.norm(vec).astype(np.float32)
        return vec

    additional = {"<unk>": _random_unit_vector(), "<pad>": _random_unit_vector()}

    for k, v in additional.items():
        word_vectors.add_vector(k, v)

    word_vectors.save(os.path.join(models_dir, f"model_trg_vocab_{EMB_SIZE}"))

In [None]:
# train_wv_src()
# train_wv_trg()



In [None]:
# Load the saved source/target keyed vectors; their lengths define the model's vocabulary sizes.
wv_src= KeyedVectors.load(os.path.join(models_dir, f"model_src_vocab_{EMB_SIZE}"), mmap='r')
SRC_VOCAB_SIZE  = len(wv_src)
wv_trg = KeyedVectors.load(os.path.join(models_dir, f"model_trg_vocab_{EMB_SIZE}"), mmap='r')
TGT_VOCAB_SIZE = len(wv_trg) # "Vocabulary and word2vec sizes must match!"
print(f"input = {SRC_VOCAB_SIZE }, out = {TGT_VOCAB_SIZE }")

input = 54069, out = 23161


In [None]:
# Integer ids of the special tokens in each vocabulary.
trg_unk_idx = wv_trg.key_to_index["<unk>"]
trg_pad_idx = wv_trg.key_to_index["<pad>"]
src_unk_idx = wv_src.key_to_index["<unk>"]
src_pad_idx =wv_src.key_to_index["<pad>"]

trg_sos_idx = wv_trg.key_to_index["<sos>"]
trg_eos_idx = wv_trg.key_to_index["<eos>"]
src_sos_idx = wv_src.key_to_index["<sos>"]
src_eos_idx =wv_src.key_to_index["<eos>"]

# String forms of the special tokens, used for vocabulary lookups.
sos_token, eos_token, pad_token = "<sos>", "<eos>", "<pad>"
unk_token = "<unk>"

In [None]:
def encode_W2V(sent, wv, default_idx:int):
    """Convert a token list into vocabulary indices, framed by ``<sos>``/``<eos>``.

    :param sent: list of string tokens.
    :param wv: object exposing ``get_index(key, default=...)``.
    :param default_idx: index returned for tokens missing from ``wv``.
    :return: list of indices, two longer than ``sent``.
    """
    framed = ["<sos>"] + sent
    framed.append("<eos>")
    return list(map(lambda token: wv.get_index(token, default=default_idx), framed))

In [None]:
def collate_batch(batch):
    """Turn a list of (src_tokens, trg_tokens) pairs into two padded index tensors.

    The encoded source sequence (including its ``<sos>``/``<eos>`` frame) is
    reversed before being turned into a tensor; the target keeps its natural
    order. Both sides are padded with their vocabulary's ``<pad>`` index and
    returned batch-first.

    :param batch: iterable of ``(src_tokens, trg_tokens)``.
    :return: ``(src_padded, trg_padded)`` tensors.
    """
    src_tensors, trg_tensors = [], []
    for src_tokens, trg_tokens in batch:
        reversed_src = encode_W2V(src_tokens, wv=wv_src, default_idx=src_unk_idx)[::-1]
        src_tensors.append(torch.tensor(reversed_src))
        trg_tensors.append(torch.tensor(encode_W2V(trg_tokens, wv=wv_trg, default_idx=trg_unk_idx)))

    src_pad = wv_src.key_to_index[pad_token]
    trg_pad = wv_trg.key_to_index[pad_token]
    return (
        pad_sequence(src_tensors, padding_value=src_pad, batch_first=True),
        pad_sequence(trg_tensors, padding_value=trg_pad, batch_first=True),
    )


# Training batches are shuffled; validation batches keep dataset order.
train_dataloader = DataLoader(train_data, BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
# Pull one batch as a sanity check of the collated tensor shapes.
src_batch, trg_batch = next(iter(train_dataloader))
print(src_batch.shape, trg_batch.shape)
val_dataloader = DataLoader(val_data, BATCH_SIZE, collate_fn=collate_batch)

torch.Size([256, 25]) torch.Size([256, 41])


In [None]:
def generate_square_subsequent_mask(sz):
    """Build an additive causal mask of shape ``(sz, sz)`` on ``DEVICE``.

    Entries on and below the diagonal are ``0.0``; entries strictly above the
    diagonal are ``-inf``, so position *i* cannot attend to positions after *i*.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=DEVICE)
    # Keep -inf only strictly above the diagonal; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)
def create_mask(src, tgt):
    """Create the attention and padding masks for one batch-first (src, tgt) pair.

    :param src: source index tensor; dimension 1 is the sequence length.
    :param tgt: target index tensor; dimension 1 is the sequence length.
    :return: ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-False boolean square matrix, ``tgt_mask`` is the
        causal mask, and the padding masks are True where the token equals the
        respective ``<pad>`` index.
    """
    src_len, tgt_len = src.shape[1], tgt.shape[1]
    causal_tgt_mask = generate_square_subsequent_mask(tgt_len)
    open_src_mask = torch.zeros((src_len, src_len), device=DEVICE, dtype=torch.bool)
    return open_src_mask, causal_tgt_mask, src == src_pad_idx, tgt == trg_pad_idx

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings."""

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """
        :param d_model: Embedding dimension (even or odd).
        :param dropout: Dropout probability applied after the addition (default=0.1).
        :param max_len: Longest sequence length the table is precomputed for.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine table is trimmed to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading axis of size 1 broadcasts over the batch dimension.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self, x):
        """
        Inputs of forward function
        :param x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [batch size, sequence length, embed dim]
            output: [batch size, sequence length, embed dim]
        """
        # Dimension 1 is the sequence axis; take that many rows of the table.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the scaled embeddings for ``tokens`` (cast to long before lookup)."""
        scale = math.sqrt(self.emb_size)
        looked_up = self.embedding(tokens.long())
        return looked_up * scale

class Seq2SeqTransformer(nn.Module):
    """Batch-first encoder-decoder translation model built around ``nn.Transformer``.

    Source and target tokens go through their own ``TokenEmbedding`` and a
    shared ``PositionalEncoding``; the transformer output is projected to
    target-vocabulary logits by ``generator``.
    """

    def __init__(
        self,
        num_encoder_layers: int,
        num_decoder_layers: int,
        emb_size: int,
        nhead: int,
        src_vocab_size: int,
        tgt_vocab_size: int,
        dim_feedforward: int = 512,
        dropout: float = 0.1
    ):
        super().__init__()
        # Attribute names and creation order are kept stable: saved state
        # dicts are keyed by these names.
        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        No memory mask is used; all other masks are forwarded unchanged to
        ``nn.Transformer``.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        decoded = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(decoded)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and run only the encoder stack; returns the encoder memory."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and run only the decoder stack over ``memory`` (no generator projection)."""
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

In [None]:
# Build the model from the hyper-parameters above and move it to DEVICE.
model = Seq2SeqTransformer(
    NUM_ENCODER_LAYERS,
    NUM_DECODER_LAYERS,
    EMB_SIZE,
    NHEAD,
    SRC_VOCAB_SIZE,
    TGT_VOCAB_SIZE,
    FFN_HID_DIM
).to(DEVICE)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")


print(model)


# Padding positions in the target do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=wv_trg.key_to_index[pad_token])
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.98), eps=1e-9)
# scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=40, num_training_steps=100*300_000//30_000)
# can later be switched to
# NOTE(review): the second argument evaluates to 500.0 (a float), sized from 50 passes over
# 300_000 samples at one scheduler step per 30_000 samples, while NUM_EPOCHS is 70 — confirm
# the schedule is meant to finish before training does.
scheduler =torch.optim.lr_scheduler.PolynomialLR(optimizer, 50*300_000/30_000) # set lr to the last value obtained

# save_models(model, optimizer, scheduler, model_name="combo_512", save_dir=models_dir)


# Load the model and the optimizer state
# Save right away and check that it works

21,822,649 total parameters.
21,822,649 training parameters.
Seq2SeqTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
          )
          (linear1): Linear(in_features=192, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=192, bias=True)
          (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-2): 3 x Tr

In [None]:
def train_epoch(model, optimizer, scheduler):
    """Run one training pass over ``train_dataloader`` and return the mean batch loss.

    Uses teacher forcing: the decoder input is the target without its last
    token and the loss is computed against the target without its first
    token. The scheduler is stepped once each time at least 30_000 samples
    have been processed since the previous step (the counter restarts every
    epoch), and the current learning rate and running mean loss are printed.

    :param model: the seq2seq model; called with the same seven arguments as
        ``Seq2SeqTransformer.forward``.
    :param optimizer: optimizer updated after every batch.
    :param scheduler: LR scheduler stepped roughly every 30_000 samples.
    :return: sum of per-batch losses divided by the number of batches.
    """
    print('Training')
    model.train()
    losses = 0
    batches_seen = 0
    samples_since_step = 0
    # len(DataLoader) gives the batch count directly; materialising the loader
    # with list(...) would tokenise and collate the whole dataset just to count it.
    num_batches = len(train_dataloader)
    for src, tgt in tqdm(train_dataloader, total=num_batches):
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        samples_since_step += src.shape[0]

        tgt_input = tgt[:, :-1]
        tgt_out = tgt[:, 1:]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        optimizer.zero_grad()
        logits = model(
            src,
            tgt_input,
            src_mask,
            tgt_mask,
            src_padding_mask,
            tgt_padding_mask,
            src_padding_mask
        )
        loss = loss_fn(logits.reshape(-1, TGT_VOCAB_SIZE), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()
        losses += loss.item()
        batches_seen += 1

        if samples_since_step >= 30_000:
            scheduler.step()
            samples_since_step = 0
            # `losses` accumulates per-batch means, so the running average is
            # taken over batches, not over samples.
            print(f"lr = {scheduler.get_last_lr()}, loss = {losses / batches_seen}")

    return losses / num_batches


def evaluate(model):
    """Compute the mean teacher-forced loss of ``model`` over ``val_dataloader``.

    The model is switched to eval mode and gradients are disabled, so no
    autograd graph is built during validation.

    :param model: the seq2seq model; called with the same seven arguments as
        ``Seq2SeqTransformer.forward``.
    :return: sum of per-batch losses divided by the number of batches.
    """
    print('Validating')
    model.eval()
    losses = 0
    # len(DataLoader) gives the batch count without iterating the loader.
    num_batches = len(val_dataloader)
    with torch.no_grad():
        for src, tgt in tqdm(val_dataloader, total=num_batches):
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            tgt_input = tgt[:, :-1]
            tgt_out = tgt[:, 1:]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(
                src,
                tgt_input,
                src_mask,
                tgt_mask,
                src_padding_mask,
                tgt_padding_mask,
                src_padding_mask
            )
            loss = loss_fn(logits.reshape(-1, TGT_VOCAB_SIZE), tgt_out.reshape(-1))
            losses += loss.item()

    return losses / num_batches

In [None]:
# Resume from a saved checkpoint: restores model weights plus optimizer and scheduler state.
# map_location remaps the stored tensors to the active device, so a checkpoint
# written on a GPU runtime still loads on a CPU-only one.
checkpoint = torch.load(
    os.path.join(models_dir, "combo_512_20241119_161553.pt"),
    map_location=DEVICE,
    weights_only=True,
)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
scheduler.load_state_dict(checkpoint["scheduler_step_dict"])

train_loss_list, valid_loss_list = [], []

for epoch in range(1, NUM_EPOCHS+1):
    start_time = time.time()
    train_loss = train_epoch(model, optimizer, scheduler)
    valid_loss = evaluate(model)
    end_time = time.time()
    train_loss_list.append(train_loss)
    valid_loss_list.append(valid_loss)
    # Checkpoint every third epoch.
    if epoch%3==0:
        save_models(model, optimizer, scheduler, model_name="combo_512", save_dir=models_dir)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {valid_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s \n"))

In [None]:
import matplotlib.pyplot as plt
def save_plots(train_loss, valid_loss):
    """
    Plot the training and validation loss curves, save them to
    ``outputs/loss.png`` and show the figure.

    The ``outputs`` directory is created when missing so that saving does not
    fail on a fresh runtime.

    :param train_loss: sequence of per-epoch training losses.
    :param valid_loss: sequence of per-epoch validation losses.
    """
    output_dir = 'outputs'
    os.makedirs(output_dir, exist_ok=True)
    # Loss plots.
    plt.figure(figsize=(10, 7))
    plt.plot(
        train_loss, color='blue', linestyle='-',
        label='train loss'
    )
    plt.plot(
        valid_loss, color='red', linestyle='-',
        label='validataion loss'
    )
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(os.path.join(output_dir, 'loss.png'))
    plt.show()

In [None]:
# Plot the per-epoch losses collected by the training loop.
save_plots(train_loss_list, valid_loss_list)

In [None]:
# Helper function to generate output sequence using greedy algorithm.
def greedy_decode(model, src, src_mask, max_len, start_symbol, temperature:float = 0.3):
    """Greedily decode one target sequence for a single source sentence.

    The source is encoded once; the decoder is then re-run on the growing
    prefix and the highest-scoring token is appended each step, until
    ``trg_eos_idx`` is produced or the sequence reaches ``max_len`` tokens
    (the start symbol included). Runs under ``torch.no_grad()`` so no autograd
    graph is kept during inference.

    :param model: object exposing ``encode``, ``decode`` and ``generator``.
    :param src: source index tensor holding one sentence (batch of 1).
    :param src_mask: encoder attention mask for ``src``.
    :param max_len: maximum length of the returned sequence.
    :param start_symbol: index of the token that starts decoding.
    :param temperature: currently unused (sampling is disabled); kept so
        existing callers keep working.
    :return: long tensor of shape ``(1, length)`` starting with ``start_symbol``.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)
        for _ in range(max_len-1):
            # Boolean view of the causal mask: True marks positions that are
            # not allowed to be attended to (the -inf entries).
            tgt_mask = generate_square_subsequent_mask(ys.size(1)).type(torch.bool)
            out = model.decode(ys, memory, tgt_mask)

            # Only the last position predicts the next token.
            scores = model.generator(out[:, -1])
            _, next_word = torch.max(scores, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
            if next_word == trg_eos_idx:
                break
    return ys

In [None]:
def translate(model: torch.nn.Module, src_sentence: list):
    """Translate one tokenised source sentence with greedy decoding.

    :param model: trained seq2seq model.
    :param src_sentence: list of source tokens (without ``<sos>``/``<eos>``).
    :return: decoded target tokens joined by spaces, with the ``<sos>`` and
        ``<eos>`` markers removed from the string.
    """
    model.eval()
    # collate_batch feeds the model the *reversed* encoded source sequence
    # during training and validation; inference must use the same ordering,
    # otherwise the position-aware encoder sees inputs it was never trained on.
    src_ids = encode_W2V(src_sentence, wv=wv_src, default_idx=src_unk_idx)[::-1]
    src = torch.tensor(src_ids).view(1, -1).to(device)
    num_tokens = src.shape[1]
    # All-False mask: every source position may attend to every other one.
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    with torch.no_grad():
        tgt_tokens = greedy_decode(
            model,  src, src_mask, max_len=num_tokens + 5, start_symbol=trg_sos_idx).flatten()
    return " ".join([wv_trg.index_to_key[tok] for tok in tgt_tokens.cpu().numpy()]).replace("<sos>", "").replace("<eos>", "")

In [None]:
# Reload and re-tokenize the validation split, then print ten randomly chosen
# examples with their reference and model translation.
val_path = os.path.join(data_dir, "val")
with open(val_path, "r", encoding="utf-8") as f:
    val_data = [json.loads(row) for row in f]

val_data = [
    (word_tokenize(record["src"].replace("▵", " ▵")), word_tokenize(record["dst"]))
    for record in val_data
]

for src_tokens, ref_tokens in random.sample(val_data, 10):
    print(f"SRC: {src_tokens}")
    print(f"GT: {ref_tokens}")
    print(f"PRED: {translate(model, src_tokens)}\n")

SRC: ['◝▴▱▱◠▫◂◓', "206'◳◬", '◗▢▱▴◎◪▨', '◗◉◫▦', '◀◭▫▩▦', '◕◪▼◪', '◠▽◠▨▫◠', '▨◠▱◈▪◐▪▦▪▢◬', '◚▴', '◠◞▪▱', '▴▫▨◗▦▱◗◐◗', '◗▢▱◪◎▴', '◍◬◓◞◠▫◬▦▪▦', '◚▴◓◗▱◎◪◈◫◐◗▦◫', '◈◭◒▩▦◭▦', '▵']
GT: ['Imagine', 'you', 'stayed', 'up', 'all', 'night', 'to', 'watch', 'the', 'full', 'Bellator', '206', ',', 'but', 'you', 'did', "n't", 'get', 'to', 'see', 'the', 'main', 'event', '.']
PRED:  I 'm not going to be a <unk> , and I 'm not going to be a <unk> , and I 'm

SRC: ['◄◠▨◪◈◂▦▽◠', '◝◠◒◀◠▨◠▦▪', '◁◧◓◠▦', '◁◠▴◚■', '◪◒◗', '◁◧◓◫▼◠', '◚▴', '◧◐▱◨', '◙◨◞▷▨◂■', '◄◠▨◪◈◂▦◳◠', '◓◪◍▴◓◠▦◈▾◎◨▦◈◠', '◭▱▨◪▦◫▦', '◠◈◬▦▪', '◈▴◐◫◒▫◫◓◪◓◪▨', '▯◦▶◱', '◚◪', '○◚◓▾▻◠', "◝◗◓▱◗◐◫'▦▴", '▨◠▫▪▱▪◎▪▦', '◣▦◭▦◭', '◠◉◎◠', '◳◣▦▩▦◈◪▨◫', '◂◳▱◠◓◬▦◬', '30', '◰◳▱▩▱', "2018'◈▴", '▮▫◓◨◎◗▼◠■', "◄◠▨▴◈◧▦◳◠'◈◠", '▨▾▱▱◠▦◈◬▱◠◓', '▵']
GT: ['Macedonian', 'Prime', 'Minister', 'Zoran', 'Zaev', ',', 'his', 'wife', 'Zorica', ',', 'and', 'his', 'son', 'Dusko', 'voted', 'for', 'the', 'Macedonian', 'referendum', 'on', 'changing', 'the', 'country', "'s", 'name', ',', 'wh

In [None]:
# Read the unlabeled test split: only the raw "src" string of each JSON line is kept.
test_path = os.path.join(data_dir, "test_no_reference")
with open(test_path, "r", encoding="utf-8") as f:
    test_data = [json.loads(row)["src"] for row in f]


In [None]:
def make_submission_transformer(test_data:list[str])->list[dict]:
    """Translate raw source strings with the global ``model`` and build submission records.

    Each sentence gets a space inserted before every "▵", is tokenized, and is
    translated; the record keeps the original, unmodified source string.

    :param test_data: raw source sentences.
    :return: one ``{"dst": translation, "src": sentence}`` dict per input, in order.
    """
    model.eval()
    with torch.no_grad():
        return [
            {
                "dst": translate(model, word_tokenize(sentence.replace("▵", " ▵"))),
                "src": sentence,
            }
            for sentence in test_data
        ]

In [None]:
# Smoke test: translate only the first ten test sentences and print one record per line.
sld = make_submission_transformer(test_data[:10])
print(*sld, sep = "\n")

{'dst': " I 'm not sure you need to tell that how you can get a , was the leave your husband in the", 'src': '◲▦◠▦◬▦■ ◉◗▢◕◗ ◍◗▱◎ ▽◠▽▪▦◠ ◕▴◉◗▦▼▴ ◀◗◓◉◧▨ ◎▴◞◠▸ ◠▱◈▪▨ ◚◪ ◀◨ ◎◪◞◠▸▱◠◓◬▦ ◀◠▢▪▱◠◓▪ ▻◪▨ ◈◂◞▫◉◠ ◈▴◐◫▱◈◗▵'}
{'dst': " Let 's see if I 'll be gon na start with you , we can get off the <unk> , wrong And make it if we might be <unk> , then the you 'll put another one", 'src': '▯▴▥ ◟◧◓▨▱◨ ◀◫◓ ◈◠◈◬■ ◉◂▼◨◐◨▦ ◠▦▦◪◞◗▦◗▦ ▽◠▢◈◬◐▪ ◚◪ ◳◠▦▱▪◒▱▪▨▱◠ ▨▴▦◈◗◞◗▦▴ ◕◣▦◈▴◓◈◗◐◫■ "◀◫◓ ◞◫◳◠▷◗ ◈◠▷◠" ◳◠▢◠▦ ◀◗◓ ◎▴◞◠▸◈◠▦ ◞◧▦◓◠ ▨◪▦◈◫◞◗▦◪ ◠◳◓▪◎▼◬▱◬▨ ◳◠▻▪▱◈▪◐▪ ◚◪ ◗◒◫▦◈◪▦ ▨◂◚◨▱◈▾◐▾ ◫◉◫▦ ◉◗◍▫▴ ◈◠◚◠ ◠◉◬▽◂◓▵'}
{'dst': ' The kill a your are going to do something , with the <unk> . ', 'src': '◡◠▻◧▦ ◂▫◧◎◂◀◗▱ ◍◗◓◎◠◞◬ ◠▦▱◠◒◎◠◞▪▢ ◝◓▴▹◗▫ ◈◨◓▾◎▾▦◈◠ ◞▪▦◬◓◈◠ ◀◪▨▱▴◎◪ ◞◭◓◪◞◫▦◫▦ ◨▢◠◎◠◞▪▦▪▦ ◫▦◞◠▦▱◠◓▪▦ ◗◒◗▦◫ ▨◠▽◀◪▫◎◪◞◫▦◪ ▦▴◈◪▦ ◂▱◠◀◫▱▴▼▴◐◫▦◫ ◞◇◳▱◪◈◗▵'}
{'dst': ' No , we were the know their a need to see the , then you were going on the in to stick in the hold of your ass ,', 'src': "◝▾◀◀◠ ▰◠▫◞◂▦ ◚▴ ▰▴◀◀ ▮◫◎▻◞◂▦■ ◞◠◀◠▷ ◂◳▦◠▦◠▦ ◍◂◨◓◀◠▱▱

In [None]:
# Translate the whole test split and write the timestamped submission file.
sld = make_submission_transformer(test_data)
save_submission(sld, save_dir=submission_dir)

### Проверка

In [None]:
# Read back a previously written submission file and display a slice of it as a spot check.
with open(os.path.join(submission_dir, "submission_20241116_123840.jsonl"), "r", encoding="utf-8") as f:
    check_data = [[record["src"], record["dst"]] for record in map(json.loads, f)]
check_data[500:600]