In [None]:
# Seq2Seq RNN + Attention (Bahdanau)

!pip -q install datasets sacrebleu tqdm sentencepiece

import os, re, math, random, json, time
from dataclasses import dataclass
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm
from datasets import load_dataset
import sacrebleu
import sentencepiece as spm

def hr(title=None, ch="="):
    if title:
        print(f"\n{ch*10} {title} {ch*10}")
    else:
        print(ch*32)

def print_kv(k, v, pad=22):
    print(f"{k:<{pad}}: {v}")

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hr("Runtime")
print_kv("torch.cuda.is_available()", torch.cuda.is_available())
if torch.cuda.is_available():
    print_kv("GPU", torch.cuda.get_device_name(0))
print_kv("Device", DEVICE)

from google.colab import drive
hr("Mount Drive")
drive.mount("/content/drive")
DRIVE_DIR = "/content/drive/MyDrive/rnn_attn_ru_en"
os.makedirs(DRIVE_DIR, exist_ok=True)
print_kv("Drive save dir", DRIVE_DIR)

DATASET_NAME   = "Helsinki-NLP/opus-100"
DATASET_CONFIG = "en-ru"
SRC_LANG = "ru"
TGT_LANG = "en"

N_TRAIN = 50_000
N_DEV   = 2_000
N_TEST  = 2_000

MAX_LEN_SRC = 80
MAX_LEN_TGT = 80

SP_VOCAB_SIZE = 8000
SP_CHAR_COV_RU = 0.9995
SP_CHAR_COV_EN = 1.0

BATCH_SIZE = 32
EMB_DIM = 256
HID_DIM = 512
ENC_LAYERS = 1
DEC_LAYERS = 1
DROPOUT = 0.2

EPOCHS = 5
LR = 3e-4
CLIP = 1.0
TEACHER_FORCING = 0.9

EVAL_SENTS = 500
MAX_DECODE_LEN = 80

WORK_DIR = "/content/rnn_attn_ru_en"
TOK_DIR  = f"{WORK_DIR}/tokenizers"
CKPT_DIR = f"{WORK_DIR}/checkpoints"
os.makedirs(TOK_DIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

hr("Load dataset")
ds = load_dataset(DATASET_NAME, DATASET_CONFIG)

train_data = ds["train"]
dev_data   = ds["validation"]
test_data  = ds["test"]

print_kv("Train size", len(train_data))
print_kv("Dev size",   len(dev_data))
print_kv("Test size",  len(test_data))
print_kv("Train used", N_TRAIN)

_tok_re = re.compile(r"[^\w]+", flags=re.UNICODE)

def basic_tokenize(text: str) -> str:
    text = text.strip().lower()
    toks = [t for t in _tok_re.split(text) if t]
    return " ".join(toks)

def ok_len(src: str, tgt: str) -> bool:
    s = basic_tokenize(src).split()
    t = basic_tokenize(tgt).split()
    return (1 <= len(s) <= MAX_LEN_SRC) and (1 <= len(t) <= MAX_LEN_TGT)

hr("Build SP training text")
ru_txt = f"{TOK_DIR}/sp_ru_train.txt"
en_txt = f"{TOK_DIR}/sp_en_train.txt"

def write_sp_training_text(split, out_ru, out_en, n_limit):
    kept = 0
    with open(out_ru, "w", encoding="utf-8") as fru, open(out_en, "w", encoding="utf-8") as fen:
        for i in tqdm(range(min(n_limit, len(split))), desc=f"Writing {n_limit} lines"):
            ex = split[i]["translation"]
            ru = ex[SRC_LANG]
            en = ex[TGT_LANG]
            if not ok_len(ru, en):
                continue
            fru.write(basic_tokenize(ru) + "\n")
            fen.write(basic_tokenize(en) + "\n")
            kept += 1
    return kept

kept = write_sp_training_text(train_data, ru_txt, en_txt, N_TRAIN)
print_kv("Pairs written to SP text", kept)

hr("Train SentencePiece (RU + EN)")

def train_sp(input_txt, model_prefix, vocab_size, char_cov):
    model_file = model_prefix + ".model"
    if os.path.exists(model_file):
        return model_file

    args = (
        f"--input={input_txt} "
        f"--model_prefix={model_prefix} "
        f"--vocab_size={vocab_size} "
        f"--character_coverage={char_cov} "
        f"--model_type=unigram "
        f"--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
        f"--hard_vocab_limit=false"
    )
    spm.SentencePieceTrainer.Train(args)
    return model_file

ru_model_prefix = f"{TOK_DIR}/sp_ru"
en_model_prefix = f"{TOK_DIR}/sp_en"

ru_sp_model = train_sp(ru_txt, ru_model_prefix, SP_VOCAB_SIZE, SP_CHAR_COV_RU)
en_sp_model = train_sp(en_txt, en_model_prefix, SP_VOCAB_SIZE, SP_CHAR_COV_EN)

ru_sp = spm.SentencePieceProcessor(model_file=ru_sp_model)
en_sp = spm.SentencePieceProcessor(model_file=en_sp_model)

PAD_ID = 0
UNK_ID = 1
BOS_ID = 2
EOS_ID = 3

print_kv("RU SP model", ru_sp_model)
print_kv("EN SP model", en_sp_model)
print_kv("PAD/UNK/BOS/EOS", f"{PAD_ID}/{UNK_ID}/{BOS_ID}/{EOS_ID}")
print_kv("RU vocab size", ru_sp.get_piece_size())
print_kv("EN vocab size", en_sp.get_piece_size())

for f in [ru_sp_model, en_sp_model, ru_model_prefix + ".vocab", en_model_prefix + ".vocab"]:
    if os.path.exists(f):
        os.system(f'cp "{f}" "{DRIVE_DIR}/"')
print_kv("Drive tokenizers", "saved ✅")

hr("Encode train/dev/test")

def encode_ru(text: str) -> List[int]:
    ids = ru_sp.encode(text.strip().lower(), out_type=int)
    ids = ids[:MAX_LEN_SRC-1] + [EOS_ID]
    return ids

def encode_en(text: str) -> List[int]:
    ids = en_sp.encode(text.strip().lower(), out_type=int)
    ids = ids[:MAX_LEN_TGT-2]
    return [BOS_ID] + ids + [EOS_ID]

def build_encoded_pairs(split, n_limit):
    pairs = []
    for i in tqdm(range(min(n_limit, len(split))), desc=f"Encoding {n_limit} pairs"):
        ex = split[i]["translation"]
        ru = ex[SRC_LANG]
        en = ex[TGT_LANG]
        if not ok_len(ru, en):
            continue
        pairs.append((encode_ru(ru), encode_en(en)))
    return pairs

train_pairs = build_encoded_pairs(train_data, N_TRAIN)
dev_pairs   = build_encoded_pairs(dev_data, N_DEV)
test_pairs  = build_encoded_pairs(test_data, N_TEST)

print_kv("Train pairs", len(train_pairs))
print_kv("Dev pairs",   len(dev_pairs))
print_kv("Test pairs",  len(test_pairs))

class PairDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        return self.pairs[idx]

def collate_batch(batch):
    srcs, tgts = zip(*batch)
    src_lens = torch.tensor([len(s) for s in srcs], dtype=torch.long)
    tgt_lens = torch.tensor([len(t) for t in tgts], dtype=torch.long)

    max_s = max(len(s) for s in srcs)
    max_t = max(len(t) for t in tgts)

    src_pad = torch.full((len(batch), max_s), PAD_ID, dtype=torch.long)
    tgt_pad = torch.full((len(batch), max_t), PAD_ID, dtype=torch.long)

    for i, (s, t) in enumerate(zip(srcs, tgts)):
        src_pad[i, :len(s)] = torch.tensor(s, dtype=torch.long)
        tgt_pad[i, :len(t)] = torch.tensor(t, dtype=torch.long)

    return src_pad, src_lens, tgt_pad, tgt_lens

train_loader = DataLoader(PairDataset(train_pairs), batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_batch, num_workers=2, pin_memory=True)
dev_loader   = DataLoader(PairDataset(dev_pairs), batch_size=BATCH_SIZE, shuffle=False,
                          collate_fn=collate_batch, num_workers=2, pin_memory=True)
test_loader  = DataLoader(PairDataset(test_pairs), batch_size=BATCH_SIZE, shuffle=False,
                          collate_fn=collate_batch, num_workers=2, pin_memory=True)

class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers=1, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD_ID)
        self.rnn = nn.GRU(emb_dim, hid_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_lens):
        emb = self.dropout(self.embedding(src))
        packed = nn.utils.rnn.pack_padded_sequence(emb, src_lens.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, h = self.rnn(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # [B,S,2H]
        return out, h
class BahdanauAttention(nn.Module):
    def __init__(self, enc_dim, dec_dim, attn_dim):
        super().__init__()
        self.W = nn.Linear(enc_dim, attn_dim, bias=False)
        self.U = nn.Linear(dec_dim, attn_dim, bias=False)
        self.v = nn.Linear(attn_dim, 1, bias=False)

    def forward(self, enc_out, dec_state, src_mask):
        scores = self.v(torch.tanh(self.W(enc_out) + self.U(dec_state).unsqueeze(1))).squeeze(-1)
        scores = scores.masked_fill(src_mask == 0, -1e9)
        attn = F.softmax(scores, dim=-1)
        ctx = torch.bmm(attn.unsqueeze(1), enc_out).squeeze(1)
        return ctx, attn

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, enc_out_dim, dec_dim, num_layers=1, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD_ID)
        self.attn = BahdanauAttention(enc_out_dim, dec_dim, attn_dim=dec_dim)
        self.rnn = nn.GRU(emb_dim + enc_out_dim, dec_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(dec_dim + enc_out_dim + emb_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward_step(self, y_prev, dec_state, enc_out, src_mask):
        emb = self.dropout(self.embedding(y_prev))
        ctx, attn = self.attn(enc_out, dec_state.squeeze(0), src_mask)
        rnn_in = torch.cat([emb, ctx], dim=-1).unsqueeze(1)
        out, dec_state = self.rnn(rnn_in, dec_state)
        out = out.squeeze(1)
        logits = self.fc(torch.cat([out, ctx, emb], dim=-1))
        return logits, dec_state, attn

class Seq2Seq(nn.Module):
    def __init__(self, enc: Encoder, dec: Decoder, hid_dim):
        super().__init__()
        self.enc = enc
        self.dec = dec
        self.bridge = nn.Linear(hid_dim * 2, hid_dim)

    def forward(self, src, src_lens, tgt, teacher_forcing=0.9):
        B, T = tgt.size()
        enc_out, enc_h = self.enc(src, src_lens)
        src_mask = (src != PAD_ID).long()

        fwd = enc_h[-2]
        bwd = enc_h[-1]
        dec0 = torch.tanh(self.bridge(torch.cat([fwd, bwd], dim=-1))).unsqueeze(0)

        logits_all = []
        y = tgt[:, 0]
        dec_state = dec0

        for t in range(1, T):
            logits, dec_state, _ = self.dec.forward_step(y, dec_state, enc_out, src_mask)
            logits_all.append(logits.unsqueeze(1))
            use_teacher = (random.random() < teacher_forcing)
            y = tgt[:, t] if use_teacher else logits.argmax(dim=-1)

        return torch.cat(logits_all, dim=1)

ENC_V = ru_sp.get_piece_size()
DEC_V = en_sp.get_piece_size()

encoder = Encoder(ENC_V, EMB_DIM, HID_DIM, num_layers=ENC_LAYERS, dropout=DROPOUT)
decoder = Decoder(DEC_V, EMB_DIM, enc_out_dim=HID_DIM*2, dec_dim=HID_DIM, num_layers=DEC_LAYERS, dropout=DROPOUT)
model = Seq2Seq(encoder, decoder, hid_dim=HID_DIM).to(DEVICE)

hr("Model")
n_params = sum(p.numel() for p in model.parameters())
print_kv("params", f"{n_params:,}")

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)

def train_one_epoch(model, loader):
    model.train()
    total_loss = 0.0
    for src, src_lens, tgt, tgt_lens in tqdm(loader, desc="Train", leave=False):
        src, src_lens, tgt = src.to(DEVICE), src_lens.to(DEVICE), tgt.to(DEVICE)

        optimizer.zero_grad()
        logits = model(src, src_lens, tgt, teacher_forcing=TEACHER_FORCING)
        gold = tgt[:, 1:]

        loss = criterion(logits.reshape(-1, logits.size(-1)), gold.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()

        total_loss += float(loss.item())

    return total_loss / max(1, len(loader))

@torch.no_grad()
def greedy_translate_one(model, ru_text: str):
    model.eval()
    src_ids = encode_ru(ru_text)
    src = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(DEVICE)
    src_lens = torch.tensor([len(src_ids)], dtype=torch.long).to(DEVICE)

    enc_out, enc_h = model.enc(src, src_lens)
    src_mask = (src != PAD_ID).long()

    fwd = enc_h[-2]
    bwd = enc_h[-1]
    dec_state = torch.tanh(model.bridge(torch.cat([fwd, bwd], dim=-1))).unsqueeze(0)

    y = torch.tensor([BOS_ID], dtype=torch.long).to(DEVICE)
    out_ids = []

    for _ in range(MAX_DECODE_LEN):
        logits, dec_state, _ = model.dec.forward_step(y, dec_state, enc_out, src_mask)
        y = logits.argmax(dim=-1)
        token = int(y.item())
        if token == EOS_ID:
            break
        out_ids.append(token)

    return en_sp.decode(out_ids)

@torch.no_grad()
def eval_bleu(model, split_data, n_sent=500):
    model.eval()
    n = min(n_sent, len(split_data))
    hyps = []
    refs = []

    for i in tqdm(range(n), desc=f"BLEU ({n} sents)", leave=False):
        src_ids, tgt_ids = split_data[i]

        src_text = ru_sp.decode([x for x in src_ids if x not in (PAD_ID,)])
        hyp = greedy_translate_one(model, src_text)
        hyps.append(hyp)

        ref_ids = [x for x in tgt_ids if x not in (BOS_ID, EOS_ID, PAD_ID)]
        refs.append(en_sp.decode(ref_ids))

    bleu = sacrebleu.corpus_bleu(hyps, [refs])
    return bleu

def save_checkpoint(epoch, dev_bleu, test_bleu, tag="epoch"):
    ckpt = {
        "epoch": epoch,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "config": {
            "EMB_DIM": EMB_DIM, "HID_DIM": HID_DIM, "DROPOUT": DROPOUT,
            "SP_VOCAB_SIZE": SP_VOCAB_SIZE, "MAX_LEN_SRC": MAX_LEN_SRC, "MAX_LEN_TGT": MAX_LEN_TGT
        },
        "scores": {"dev_bleu": float(dev_bleu), "test_bleu": float(test_bleu)},
        "sp_models": {"ru": ru_sp_model, "en": en_sp_model},
        "seed": SEED
    }
    local_path = f"{CKPT_DIR}/{tag}_{epoch}.pt"
    torch.save(ckpt, local_path)

    drive_path = f"{DRIVE_DIR}/{tag}_{epoch}.pt"
    os.system(f'cp "{local_path}" "{drive_path}"')

    print_kv("Saved local", local_path)
    print_kv("Saved drive", drive_path)

hr("Train")
bleu_curve = []

for epoch in range(1, EPOCHS + 1):
    avg_loss = train_one_epoch(model, train_loader)

    hr(f"Epoch {epoch} evaluation", ch="-")
    dev_bleu  = eval_bleu(model, dev_pairs,  n_sent=EVAL_SENTS)
    test_bleu = eval_bleu(model, test_pairs, n_sent=EVAL_SENTS)

    print_kv("Avg train loss", f"{avg_loss:.4f}")
    print_kv("Dev BLEU",       f"{dev_bleu.score:.2f}")
    print_kv("Test BLEU",      f"{test_bleu.score:.2f}")

    bleu_curve.append((float(dev_bleu.score), float(test_bleu.score)))
    save_checkpoint(epoch, dev_bleu.score, test_bleu.score, tag="epoch")

hr("Demo translations")
examples_ru = [
    "я люблю машинное обучение",
    "сегодня погода очень хорошая, но немного холодно",
    "пожалуйста, скажи мне где находится ближайшая станция метро",
]

for i, ru_sent in enumerate(examples_ru, 1):
    en_hyp = greedy_translate_one(model, ru_sent)
    print(f"\nExample {i}")
    print_kv("RU", ru_sent, pad=6)
    print_kv("EN*", en_hyp,  pad=6)

hr("BLEU per epoch")
for i, (d, t) in enumerate(bleu_curve, 1):
    print(f"Epoch {i}: Dev {d:.2f} | Test {t:.2f}")

hr("Save final model")
final_local = f"{WORK_DIR}/final"
final_drive = f"{DRIVE_DIR}/final"
os.makedirs(final_local, exist_ok=True)
os.makedirs(final_drive, exist_ok=True)

torch.save({"model_state": model.state_dict(), "config": {
    "ENC_V": ENC_V, "DEC_V": DEC_V, "EMB_DIM": EMB_DIM, "HID_DIM": HID_DIM,
    "ENC_LAYERS": ENC_LAYERS, "DEC_LAYERS": DEC_LAYERS, "DROPOUT": DROPOUT,
    "PAD_ID": PAD_ID, "UNK_ID": UNK_ID, "BOS_ID": BOS_ID, "EOS_ID": EOS_ID,
    "MAX_LEN_SRC": MAX_LEN_SRC, "MAX_LEN_TGT": MAX_LEN_TGT
}}, f"{final_local}/model.pt")

for f in [ru_sp_model, en_sp_model, ru_model_prefix + ".vocab", en_model_prefix + ".vocab"]:
    if os.path.exists(f):
        os.system(f'cp "{f}" "{final_local}/"')

os.system(f'cp -r "{final_local}/." "{final_drive}/"')

print_kv("Final local", final_local)
print_kv("Final drive", final_drive)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h
torch.cuda.is_available(): True
GPU                   : Tesla T4
Device                : cuda

Mounted at /content/drive
Drive save dir        : /content/drive/MyDrive/rnn_attn_ru_en



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

en-ru/test-00000-of-00001.parquet:   0%|          | 0.00/310k [00:00<?, ?B/s]

en-ru/train-00000-of-00001.parquet:   0%|          | 0.00/124M [00:00<?, ?B/s]

en-ru/validation-00000-of-00001.parquet:   0%|          | 0.00/310k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Train size            : 1000000
Dev size              : 2000
Test size             : 2000
Train used            : 50000



Writing 50000 lines:   0%|          | 0/50000 [00:00<?, ?it/s]

Pairs written to SP text: 49709

RU SP model           : /content/rnn_attn_ru_en/tokenizers/sp_ru.model
EN SP model           : /content/rnn_attn_ru_en/tokenizers/sp_en.model
PAD/UNK/BOS/EOS       : 0/1/2/3
RU vocab size         : 8000
EN vocab size         : 8000
Drive tokenizers      : saved ✅



Encoding 50000 pairs:   0%|          | 0/50000 [00:00<?, ?it/s]

Encoding 2000 pairs:   0%|          | 0/2000 [00:00<?, ?it/s]

Encoding 2000 pairs:   0%|          | 0/2000 [00:00<?, ?it/s]

Train pairs           : 49709
Dev pairs             : 1992
Test pairs            : 1985

params                : 24,872,768



Train:   0%|          | 0/1554 [00:00<?, ?it/s]


---------- Epoch 1 evaluation ----------


BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

Avg train loss        : 5.0936
Dev BLEU              : 2.71
Test BLEU             : 2.46
Saved local           : /content/rnn_attn_ru_en/checkpoints/epoch_1.pt
Saved drive           : /content/drive/MyDrive/rnn_attn_ru_en/epoch_1.pt


Train:   0%|          | 0/1554 [00:00<?, ?it/s]


---------- Epoch 2 evaluation ----------


BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

Avg train loss        : 4.2427
Dev BLEU              : 4.33
Test BLEU             : 4.00
Saved local           : /content/rnn_attn_ru_en/checkpoints/epoch_2.pt
Saved drive           : /content/drive/MyDrive/rnn_attn_ru_en/epoch_2.pt


Train:   0%|          | 0/1554 [00:00<?, ?it/s]


---------- Epoch 3 evaluation ----------


BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

Avg train loss        : 3.7305
Dev BLEU              : 5.75
Test BLEU             : 5.88
Saved local           : /content/rnn_attn_ru_en/checkpoints/epoch_3.pt
Saved drive           : /content/drive/MyDrive/rnn_attn_ru_en/epoch_3.pt


Train:   0%|          | 0/1554 [00:00<?, ?it/s]


---------- Epoch 4 evaluation ----------


BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

Avg train loss        : 3.3515
Dev BLEU              : 6.07
Test BLEU             : 6.40
Saved local           : /content/rnn_attn_ru_en/checkpoints/epoch_4.pt
Saved drive           : /content/drive/MyDrive/rnn_attn_ru_en/epoch_4.pt


Train:   0%|          | 0/1554 [00:00<?, ?it/s]


---------- Epoch 5 evaluation ----------


BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

BLEU (500 sents):   0%|          | 0/500 [00:00<?, ?it/s]

Avg train loss        : 3.0425
Dev BLEU              : 8.32
Test BLEU             : 8.01
Saved local           : /content/rnn_attn_ru_en/checkpoints/epoch_5.pt
Saved drive           : /content/drive/MyDrive/rnn_attn_ru_en/epoch_5.pt


Example 1
RU    : я люблю машинное обучение
EN*   : i love the phone

Example 2
RU    : сегодня погода очень хорошая, но немного холодно
EN*   : today ⁇ s a very good ⁇  but i ⁇ m cold

Example 3
RU    : пожалуйста, скажи мне где находится ближайшая станция метро
EN*   : please tell me where the siii ⁇ s the station ⁇ 

Epoch 1: Dev 2.71 | Test 2.46
Epoch 2: Dev 4.33 | Test 4.00
Epoch 3: Dev 5.75 | Test 5.88
Epoch 4: Dev 6.07 | Test 6.40
Epoch 5: Dev 8.32 | Test 8.01

Final local           : /content/rnn_attn_ru_en/final
Final drive           : /content/drive/MyDrive/rnn_attn_ru_en/final


In [None]:
# NLLB-200 "Beast" baseline (RU -> EN)

!pip -q install -U datasets sacrebleu transformers accelerate sentencepiece

import os
import random
import torch
from tqdm.auto import tqdm
from datasets import load_dataset
import sacrebleu
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

DATASET_NAME = "Helsinki-NLP/opus-100"
DATASET_CONFIG = "en-ru"     # has both "en" and "ru" in translation field
SRC_DATA_LANG = "ru"
TGT_DATA_LANG = "en"

SRC_LANG = "rus_Cyrl"
TGT_LANG = "eng_Latn"

MODEL_NAME = "facebook/nllb-200-distilled-600M"

MAX_NEW_TOKENS = 128
NUM_BEAMS = 5
BATCH_SIZE = 16

DEV_EVAL = 2000   # use full dev (opus-100 validation is 2000 anyway)
TEST_EVAL = 2000  # use full test (2000 anyway)

DRIVE_DIR = "/content/drive/MyDrive/nllb_beast_ru_en"

def hr(title=None, ch="="):
    if title:
        print(f"\n{ch*10} {title} {ch*10}")
    else:
        print(ch*32)

def print_kv(k, v, pad=22):
    print(f"{k:<{pad}}: {v}")

hr("Runtime")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print_kv("torch.cuda.is_available()", torch.cuda.is_available())
if device.type == "cuda":
    print_kv("GPU", torch.cuda.get_device_name(0))
print_kv("Device", device)

hr("Mount Drive")
from google.colab import drive
drive.mount("/content/drive", force_remount=False)
os.makedirs(DRIVE_DIR, exist_ok=True)
print_kv("Drive save dir", DRIVE_DIR)

hr("Load dataset")
ds = load_dataset(DATASET_NAME, DATASET_CONFIG)
train_data = ds["train"]
dev_data   = ds["validation"]
test_data  = ds["test"]
print_kv("Train size", len(train_data))
print_kv("Dev size",   len(dev_data))
print_kv("Test size",  len(test_data))

hr("Load model/tokenizer")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
model.eval()

print_kv("Model", MODEL_NAME)
print_kv("Params", f"{sum(p.numel() for p in model.parameters()):,}")

tokenizer.src_lang = SRC_LANG

def get_forced_bos_id(tok, tgt_lang_code: str):
    if hasattr(tok, "lang_code_to_id"):
        return tok.lang_code_to_id[tgt_lang_code]

    bos_id = tok.convert_tokens_to_ids(tgt_lang_code)

    if bos_id is None or bos_id == tok.unk_token_id:
        if hasattr(tok, "additional_special_tokens") and tgt_lang_code in tok.additional_special_tokens:
            bos_id = tok.convert_tokens_to_ids(tgt_lang_code)

    return bos_id

FORCED_BOS_ID = get_forced_bos_id(tokenizer, TGT_LANG)
print_kv("SRC_LANG", SRC_LANG)
print_kv("TGT_LANG", TGT_LANG)
print_kv("forced_bos_token_id", FORCED_BOS_ID)

if FORCED_BOS_ID is None or FORCED_BOS_ID == tokenizer.unk_token_id:
    raise ValueError(
        f"Could not resolve forced BOS id for {TGT_LANG}. "
        f"Check that the language code is correct (e.g., 'eng_Latn')."
    )

@torch.no_grad()
def translate_batch(texts, max_new_tokens=128, num_beams=5):
    batch = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    ).to(device)

    use_amp = (device.type == "cuda")
    with torch.autocast(device_type="cuda", enabled=use_amp):
        out = model.generate(
            **batch,
            forced_bos_token_id=FORCED_BOS_ID,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
        )

    return tokenizer.batch_decode(out, skip_special_tokens=True)

def eval_bleu(split, n_samples=2000, batch_size=16, name="dev"):
    n = min(n_samples, len(split))
    refs, hyps = [], []

    for i in tqdm(range(0, n, batch_size), desc=f"Translating {name} ({n})"):
        batch_ex = split[i:i+batch_size]["translation"]
        src_texts = [ex[SRC_DATA_LANG] for ex in batch_ex]
        ref_texts = [ex[TGT_DATA_LANG] for ex in batch_ex]

        hyp_texts = translate_batch(
            src_texts,
            max_new_tokens=MAX_NEW_TOKENS,
            num_beams=NUM_BEAMS
        )

        hyps.extend(hyp_texts)
        refs.extend(ref_texts)

    bleu = sacrebleu.corpus_bleu(hyps, [refs])

    out_path = os.path.join(DRIVE_DIR, f"nllb_{name}_hyps.txt")
    ref_path = os.path.join(DRIVE_DIR, f"nllb_{name}_refs.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(hyps))
    with open(ref_path, "w", encoding="utf-8") as f:
        f.write("\n".join(refs))

    return bleu, out_path, ref_path

hr("Evaluate BLEU (pretrained baseline)")
dev_bleu, dev_hyp_path, dev_ref_path = eval_bleu(dev_data, n_samples=DEV_EVAL, batch_size=BATCH_SIZE, name="dev")
test_bleu, test_hyp_path, test_ref_path = eval_bleu(test_data, n_samples=TEST_EVAL, batch_size=BATCH_SIZE, name="test")

print_kv("Dev BLEU",  f"{dev_bleu.score:.2f}")
print_kv("Test BLEU", f"{test_bleu.score:.2f}")
print_kv("Saved dev hyps",  dev_hyp_path)
print_kv("Saved test hyps", test_hyp_path)

hr("Demo translations (3 sentences)")

examples_ru = [
    "я люблю машинное обучение",
    "сегодня погода очень хорошая, но немного холодно",
    "пожалуйста, скажи мне где находится ближайшая станция метро"
]

demo_out = translate_batch(examples_ru, max_new_tokens=MAX_NEW_TOKENS, num_beams=NUM_BEAMS)

for i, (ru, en) in enumerate(zip(examples_ru, demo_out), 1):
    print(f"\nExample {i}")
    print_kv("RU", ru, pad=6)
    print_kv("EN*", en, pad=6)

demo_path = os.path.join(DRIVE_DIR, "demo_3_sentences.txt")
with open(demo_path, "w", encoding="utf-8") as f:
    for i, (ru, en) in enumerate(zip(examples_ru, demo_out), 1):
        f.write(f"[{i}] RU: {ru}\n    EN: {en}\n\n")
print_kv("Saved demo", demo_path)

hr("Done")



torch.cuda.is_available(): True
GPU                   : Tesla T4
Device                : cuda

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive save dir        : /content/drive/MyDrive/nllb_beast_ru_en

Train size            : 1000000
Dev size              : 2000
Test size             : 2000

Model                 : facebook/nllb-200-distilled-600M
Params                : 615,073,792
SRC_LANG              : rus_Cyrl
TGT_LANG              : eng_Latn
forced_bos_token_id   : 256047



Translating dev (2000):   0%|          | 0/125 [00:00<?, ?it/s]

Translating test (2000):   0%|          | 0/125 [00:00<?, ?it/s]

Dev BLEU              : 34.62
Test BLEU             : 32.18
Saved dev hyps        : /content/drive/MyDrive/nllb_beast_ru_en/nllb_dev_hyps.txt
Saved test hyps       : /content/drive/MyDrive/nllb_beast_ru_en/nllb_test_hyps.txt


Example 1
RU    : я люблю машинное обучение
EN*   : I like machine learning.

Example 2
RU    : сегодня погода очень хорошая, но немного холодно
EN*   : It's very nice today, but it's a little cold.

Example 3
RU    : пожалуйста, скажи мне где находится ближайшая станция метро
EN*   : Please tell me where the nearest subway station is.
Saved demo            : /content/drive/MyDrive/nllb_beast_ru_en/demo_3_sentences.txt



In [5]:
# WMT19 RU→EN (facebook/wmt19-ru-en)

!pip -q install -U transformers sentencepiece sacrebleu tqdm datasets

import os, glob, torch
from tqdm.auto import tqdm
import sacrebleu
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset

MODEL = "facebook/wmt19-ru-en"

CANDIDATE_BASES = [
    "/content/pbsmt_ru_en",
    "/content",
]

SAVE_TO_DRIVE = True
DRIVE_DIR = "/content/drive/MyDrive/wmt19_ru_en_outputs"

BATCH_SIZE = 8
MAX_NEW_TOKENS = 128
NUM_BEAMS = 5

examples_ru = [
    "я люблю машинное обучение",
    "сегодня погода очень хорошая, но немного холодно",
    "пожалуйста, скажи мне где находится ближайшая станция метро",
]

def hr(title=None, ch="="):
    if title:
        print(f"\n{ch*10} {title} {ch*10}")
    else:
        print(ch*32)

def print_kv(k, v, pad=22):
    print(f"{k:<{pad}}: {v}")

def find_tok_pair(split_name):
    for base in CANDIDATE_BASES:
        ru = os.path.join(base, "tok", f"{split_name}.tok.ru")
        en = os.path.join(base, "tok", f"{split_name}.tok.en")
        if os.path.exists(ru) and os.path.exists(en):
            return ru, en
    return None, None

DEV_RU, DEV_EN = find_tok_pair("dev")
TEST_RU, TEST_EN = find_tok_pair("test")

use_local = all([DEV_RU, DEV_EN, TEST_RU, TEST_EN])

hr("Data source")
if use_local:
    print_kv("DEV_RU", DEV_RU)
    print_kv("DEV_EN", DEV_EN)
    print_kv("TEST_RU", TEST_RU)
    print_kv("TEST_EN", TEST_EN)

hr("Runtime")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print_kv("torch.cuda.is_available()", torch.cuda.is_available())
if device.type == "cuda":
    print_kv("GPU", torch.cuda.get_device_name(0))
print_kv("Device", device)

hr(f"Model = {MODEL}")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL,
    torch_dtype=(torch.float16 if device.type == "cuda" else torch.float32),
).to(device)
model.eval()

@torch.inference_mode()
def translate_lines(lines):
    outs = []
    for i in tqdm(range(0, len(lines), BATCH_SIZE), desc="Translating", leave=False):
        batch = lines[i:i+BATCH_SIZE]
        enc = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,
        ).to(device)

        gen = model.generate(
            **enc,
            num_beams=NUM_BEAMS,
            max_new_tokens=MAX_NEW_TOKENS,
        )
        txt = tokenizer.batch_decode(gen, skip_special_tokens=True)
        outs.extend([t.strip() for t in txt])
    return outs

def bleu_score(hyps, refs):
    return sacrebleu.corpus_bleu(hyps, [refs]).score

def load_lines(path):
    with open(path, "r", encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

def decode_and_bleu_from_files(src_path, ref_path, out_path=None):
    src = load_lines(src_path)
    ref = load_lines(ref_path)
    assert len(src) == len(ref), f"Mismatch: {len(src)} src vs {len(ref)} ref"
    hyps = translate_lines(src)
    score = bleu_score(hyps, ref)
    if out_path:
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\n".join(hyps) + "\n")
    return score

def decode_and_bleu_from_opus(split, out_path=None, limit=2000):
    ds = load_dataset("Helsinki-NLP/opus-100", "en-ru", split=split)
    ds = ds.select(range(min(limit, len(ds))))
    src = [ex["translation"]["ru"] for ex in ds]
    ref = [ex["translation"]["en"] for ex in ds]
    hyps = translate_lines(src)
    score = bleu_score(hyps, ref)
    if out_path:
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\n".join(hyps) + "\n")
    return score

hr("Decode + BLEU (dev)")
dev_out = f"{DRIVE_DIR}/wmt19_dev_hyps.txt" if SAVE_TO_DRIVE else None
if use_local:
    dev_bleu = decode_and_bleu_from_files(DEV_RU, DEV_EN, out_path=dev_out)
else:
    dev_bleu = decode_and_bleu_from_opus("validation", out_path=dev_out, limit=2000)
print_kv("Dev BLEU", f"{dev_bleu:.2f}")
if dev_out:
    print_kv("Saved dev hyps", dev_out)

hr("Decode + BLEU (test)")
test_out = f"{DRIVE_DIR}/wmt19_test_hyps.txt" if SAVE_TO_DRIVE else None
if use_local:
    test_bleu = decode_and_bleu_from_files(TEST_RU, TEST_EN, out_path=test_out)
else:
    test_bleu = decode_and_bleu_from_opus("test", out_path=test_out, limit=2000)
print_kv("Test BLEU", f"{test_bleu:.2f}")
if test_out:
    print_kv("Saved test hyps", test_out)

hr("Demo (3 sentences)")
demo_hyps = translate_lines(examples_ru)
for i, (ru, en) in enumerate(zip(examples_ru, demo_hyps), 1):
    print(f"\n[{i}] RU: {ru}\n    EN: {en}")




torch.cuda.is_available(): True
GPU                   : Tesla T4
Device                : cuda




Translating:   0%|          | 0/250 [00:00<?, ?it/s]

Dev BLEU              : 39.10
Saved dev hyps        : /content/drive/MyDrive/wmt19_ru_en_outputs/wmt19_dev_hyps.txt



Translating:   0%|          | 0/250 [00:00<?, ?it/s]

Test BLEU             : 38.63
Saved test hyps       : /content/drive/MyDrive/wmt19_ru_en_outputs/wmt19_test_hyps.txt



Translating:   0%|          | 0/1 [00:00<?, ?it/s]


[1] RU: я люблю машинное обучение
    EN: i love machine learning

[2] RU: сегодня погода очень хорошая, но немного холодно
    EN: Today the weather is very good, but a little cold

[3] RU: пожалуйста, скажи мне где находится ближайшая станция метро
    EN: please tell me where the nearest metro station is located
