In [None]:
# IBM Model 1

!pip -q install datasets sacrebleu tqdm

import random
import re
import pickle
import os
from collections import Counter, defaultdict
from tqdm.auto import tqdm

from datasets import load_dataset
import sacrebleu

# Seed the `random` module so any use of it in this cell is repeatable.
SEED = 42
random.seed(SEED)

# Dataset selection; the source side is Russian and the target side English.
DATASET_NAME = "Helsinki-NLP/opus-100"
DATASET_CONFIG = "en-ru"   # contains both en and ru in 'translation'
SRC_LANG = "ru"
TGT_LANG = "en"

# N_TRAIN    : number of training examples read before length filtering.
# MAX_LEN    : maximum tokens per side accepted by filter_pair.
# MIN_FREQ   : minimum count for a word to enter a vocabulary (make_vocab).
# MAX_VOCAB_*: cap on the number of words kept per vocabulary (make_vocab).
N_TRAIN = 50_000
MAX_LEN = 40
MIN_FREQ = 2
MAX_VOCAB_RU = 50_000
MAX_VOCAB_EN = 50_000

# EM_ITERS is passed as the number of EM iterations; BLEU_SAMPLES caps how
# many examples are scored for BLEU after each iteration.
EM_ITERS = 5
BLEU_SAMPLES = 2000

# NULL_TOKEN is prepended to every source sentence of the training pairs;
# UNK_TOKEN replaces out-of-vocabulary words on both sides.
NULL_TOKEN = "<NULL>"
UNK_TOKEN  = "<UNK>"

# Separator pattern: one or more consecutive non-word characters.
_tok_re = re.compile(r"[^\w]+", flags=re.UNICODE)
# Whitespace directly in front of a punctuation mark (the mark is captured).
_punct_fix = re.compile(r"\s+([,.!?;:])")


def tokenize(text: str):
    """Lowercase *text* and split it into word tokens.

    Every run of non-word characters (punctuation, whitespace, symbols) acts
    as a separator and is discarded.

    Returns:
        A list of non-empty lowercase tokens; empty for text without any
        word characters.
    """
    normalized = text.lower().strip()
    pieces = _tok_re.split(normalized)
    # A leading or trailing separator leaves an empty string at the edge.
    return list(filter(None, pieces))

def filter_pair(src_toks, tgt_toks):
    """Return True when both sides hold between 1 and MAX_LEN tokens."""
    for toks in (src_toks, tgt_toks):
        n_toks = len(toks)
        if n_toks < 1 or n_toks > MAX_LEN:
            return False
    return True

def map_to_vocab(tokens, vocab):
    """Return *tokens* with every word missing from *vocab* replaced by UNK_TOKEN."""
    mapped = []
    for tok in tokens:
        mapped.append(tok if tok in vocab else UNK_TOKEN)
    return mapped

def hr(title=None, ch="="):
    """Print a section header, or a plain 32-character rule without a title.

    Args:
        title: Header text; when falsy only the rule is printed.
        ch: Character the rule is drawn with.
    """
    if not title:
        print(ch*32)
        return
    print(f"\n{ch*10} {title} {ch*10}")

def print_kv(k, v, pad=18):
    """Print a ``key: value`` line with the key left-aligned in *pad* columns."""
    line = f"{k:<{pad}}: {v}"
    print(line)

hr("Load dataset")
# Load the configured dataset and keep a handle on each of its three splits.
ds = load_dataset(DATASET_NAME, DATASET_CONFIG)
train_data = ds["train"]
dev_data   = ds["validation"]
test_data  = ds["test"]

# Report split sizes before any filtering.
print_kv("Train size", len(train_data))
print_kv("Valid size", len(dev_data))
print_kv("Test size",  len(test_data))

hr("Tokenize + build vocab")

def build_pairs_and_vocab(dataset_split, n_limit=None):
    """Tokenize a dataset split and count token frequencies on both sides.

    Args:
        dataset_split: Indexable split whose items carry a ``"translation"``
            dict keyed by language code.
        n_limit: When not None, only the first ``n_limit`` examples are read
            (same semantics as slicing with ``[:n_limit]``).

    Returns:
        ``(pairs, src_counter, tgt_counter)``: ``pairs`` holds the
        ``(src_tokens, tgt_tokens)`` tuples accepted by ``filter_pair``; the
        counters hold token frequencies over those accepted pairs only.
    """
    src_counter = Counter()
    tgt_counter = Counter()
    pairs = []

    # Slice the lazy range instead of a materialized list: reading the first
    # 50k examples of a 1M-example split should not allocate 1M indices.
    idxs = range(len(dataset_split))
    if n_limit is not None:
        idxs = idxs[:n_limit]

    for i in tqdm(idxs, desc="Reading + tokenizing"):
        ex = dataset_split[i]["translation"]
        src = tokenize(ex[SRC_LANG])
        tgt = tokenize(ex[TGT_LANG])
        # Skip pairs where either side is empty or too long.
        if not filter_pair(src, tgt):
            continue

        pairs.append((src, tgt))
        src_counter.update(src)
        tgt_counter.update(tgt)

    return pairs, src_counter, tgt_counter

# Tokenize the first N_TRAIN training examples; pairs rejected by filter_pair
# are dropped, so the printed count can be smaller than N_TRAIN.
train_pairs_raw, ru_counts, en_counts = build_pairs_and_vocab(train_data, n_limit=N_TRAIN)
print_kv("Filtered train pairs", len(train_pairs_raw))

def make_vocab(counter: Counter, min_freq: int, max_vocab: int):
    """Build a word -> index mapping from the most frequent words.

    Args:
        counter: Word frequencies.
        min_freq: Words with a smaller count are excluded.
        max_vocab: Maximum number of words kept.

    Returns:
        A dict assigning consecutive indices, starting at 0, to the kept
        words in descending frequency order.
    """
    # most_common() sorts by descending count and keeps first-seen order
    # among equal counts.
    ranked = counter.most_common()
    frequent = [word for word, freq in ranked if freq >= min_freq]
    return {word: idx for idx, word in enumerate(frequent[:max_vocab])}

# Build one vocabulary per language from the training counts.
ru_vocab = make_vocab(ru_counts, MIN_FREQ, MAX_VOCAB_RU)
en_vocab = make_vocab(en_counts, MIN_FREQ, MAX_VOCAB_EN)

# Give UNK the next free index in each vocabulary.
ru_vocab[UNK_TOKEN] = len(ru_vocab)
en_vocab[UNK_TOKEN] = len(en_vocab)

print_kv("RU vocab size", len(ru_vocab))
print_kv("EN vocab size", len(en_vocab))

# Training pairs: out-of-vocabulary words become UNK on both sides and every
# source sentence gets NULL_TOKEN prepended.
train_pairs = [
    ([NULL_TOKEN] + map_to_vocab(ru, ru_vocab), map_to_vocab(en, en_vocab))
    for (ru, en) in train_pairs_raw
]

hr("Initialize t(e|f) (sparse)")

# For each source word f, collect the set of target words that appear in at
# least one sentence pair together with f.
cooc = defaultdict(set)
for ru_sent, en_sent in tqdm(train_pairs, desc="Building co-occurrence"):
    en_set = set(en_sent)
    for f in ru_sent:             # includes NULL_TOKEN
        cooc[f].update(en_set)

# Start from a uniform distribution over each f's co-occurring target words;
# pairs that never co-occur are not stored at all.
t_probs = {}
for f, e_set in cooc.items():
    if not e_set:
        continue
    uniform = 1.0 / len(e_set)
    t_probs[f] = {e: uniform for e in e_set}

print_kv("Source types (f)", len(t_probs))
# Average number of stored target words per source word.
avg_deg = sum(len(v) for v in t_probs.values()) / max(1, len(t_probs))
print_kv("Avg targets per f", f"{avg_deg:.2f}")

def best_translation_for_f(f, t_probs, default=UNK_TOKEN):
    """Return the target word with the highest probability for source word *f*.

    Falls back to *default* when *f* has no entry, or an empty entry, in
    *t_probs*. Ties go to the candidate that comes first in iteration order.
    """
    candidates = t_probs.get(f)
    if not candidates:
        return default
    return max(candidates, key=candidates.get)

def translate_ru_to_en(ru_text: str, t_probs):
    """Translate *ru_text* word by word using the best entry of *t_probs*.

    The text is tokenized, mapped onto the module-level ``ru_vocab`` and each
    token is replaced by its most probable target word; word order is kept.
    """
    words = []
    for f in map_to_vocab(tokenize(ru_text), ru_vocab):
        words.append(best_translation_for_f(f, t_probs))
    return _punct_fix.sub(r"\1", " ".join(words))

def eval_bleu(test_split, t_probs, n_samples=2000):
    """Compute corpus BLEU for word-by-word translations of a split.

    Args:
        test_split: Indexable split with ``"translation"`` dicts.
        t_probs: Translation table used by ``translate_ru_to_en``.
        n_samples: Upper bound on the number of leading examples scored.

    Returns:
        The object returned by ``sacrebleu.corpus_bleu``.
    """
    # NOTE(review): hypotheses are built from tokenize() output (lowercased,
    # separators removed) while references are the raw dataset strings —
    # confirm that scoring them with default BLEU settings is intended.
    n = min(n_samples, len(test_split))
    hypotheses, references = [], []
    for i in tqdm(range(n), desc=f"BLEU on {n} test sents"):
        pair = test_split[i]["translation"]
        hypotheses.append(translate_ru_to_en(pair[SRC_LANG], t_probs))
        references.append(pair[TGT_LANG])
    return sacrebleu.corpus_bleu(hypotheses, [references])

# Section banner for the EM training output.
hr("Train IBM Model 1")

def _ibm1_expected_counts(train_pairs, t_probs, desc):
    """E-step: accumulate expected alignment counts under the current table.

    Returns ``(count_fe, total_f)`` where ``count_fe[f][e]`` is the expected
    count of f aligning to e and ``total_f[f]`` is its sum over e.
    """
    count_fe = defaultdict(lambda: defaultdict(float))
    total_f = defaultdict(float)

    for ru_sent, en_sent in tqdm(train_pairs, desc=desc):
        # The table is not modified during the E-step, so each source word's
        # row can be fetched once per sentence.
        rows = [t_probs.get(f, {}) for f in ru_sent]
        for e in en_sent:
            scores = [row.get(e, 0.0) for row in rows]
            # Normalizer over all source positions (including NULL).
            z = 0.0
            for score in scores:
                z += score
            if z == 0.0:
                continue
            for f, tef in zip(ru_sent, scores):
                if tef == 0.0:
                    continue
                c = tef / z
                count_fe[f][e] += c
                total_f[f] += c

    return count_fe, total_f


def ibm1_train_with_bleu(train_pairs, t_probs, iters=5):
    """Run EM for IBM Model 1 and report BLEU after every iteration.

    Args:
        train_pairs: Iterable of ``(source_tokens, target_tokens)`` pairs.
        t_probs: Initial table ``{f: {e: probability}}``; only stored
            entries are ever updated.
        iters: Number of EM iterations.

    Returns:
        ``(t_probs, bleu_by_iter)``: the table after the last iteration and
        the BLEU score measured after each iteration. BLEU is computed on
        the module-level ``test_data`` with at most ``BLEU_SAMPLES`` examples.
    """
    bleu_by_iter = []
    for it in range(1, iters + 1):
        count_fe, total_f = _ibm1_expected_counts(
            train_pairs, t_probs, f"EM iter {it}/{iters}"
        )

        # M-step: renormalize the expected counts per source word. Source
        # words whose total is not positive are left out of the new table.
        t_probs = {
            f: {e: (c / total_f[f]) for e, c in e_counts.items()}
            for f, e_counts in count_fe.items()
            if not total_f[f] <= 0
        }

        stored_f = len(t_probs)
        avg_deg = sum(len(row) for row in t_probs.values()) / max(1, stored_f)

        bleu = eval_bleu(test_data, t_probs, n_samples=BLEU_SAMPLES)
        bleu_by_iter.append(float(bleu.score))

        hr(f"Iteration {it} summary", ch="-")
        print_kv("Stored f types", stored_f)
        print_kv("Avg targets per f", f"{avg_deg:.2f}")
        print_kv("BLEU", f"{bleu.score:.2f}")

    return t_probs, bleu_by_iter

# Train; `t_probs` is rebound to the table returned after the last iteration.
t_probs, bleu_curve = ibm1_train_with_bleu(train_pairs, t_probs, iters=EM_ITERS)

hr("Demo translations")

# Hand-written sentences used as a qualitative check.
examples_ru = [
    "я люблю машинное обучение",
    "сегодня погода очень хорошая, но немного холодно",
    "пожалуйста, скажи мне где находится ближайшая станция метро"
]

for i, ru_sent in enumerate(examples_ru, 1):
    en_hyp = translate_ru_to_en(ru_sent, t_probs)
    print(f"\nExample {i}")
    print_kv("RU", ru_sent, pad=6)
    print_kv("EN*", en_hyp,  pad=6)

hr("BLEU per EM iteration")
for i, b in enumerate(bleu_curve, 1):
    print(f"Iter {i}: {b:.2f}")

hr("Save model")
# Persist the translation table with pickle. Only unpickle this file from a
# trusted location: loading a pickle can execute arbitrary code.
os.makedirs("ibm1", exist_ok=True)
with open("ibm1/t_probs.pkl", "wb") as f:
    pickle.dump(t_probs, f)
print("Saved to ibm1/t_probs.pkl")



Train size        : 1000000
Valid size        : 2000
Test size         : 2000



Reading + tokenizing:   0%|          | 0/50000 [00:00<?, ?it/s]

Filtered train pairs: 47757
RU vocab size     : 24534
EN vocab size     : 14131



Building co-occurrence:   0%|          | 0/47757 [00:00<?, ?it/s]

Source types (f)  : 24535
Avg targets per f : 94.03



EM iter 1/5:   0%|          | 0/47757 [00:00<?, ?it/s]

BLEU on 2000 test sents:   0%|          | 0/2000 [00:00<?, ?it/s]


---------- Iteration 1 summary ----------
Stored f types    : 24535
Avg targets per f : 94.03
BLEU              : 0.62


EM iter 2/5:   0%|          | 0/47757 [00:00<?, ?it/s]

BLEU on 2000 test sents:   0%|          | 0/2000 [00:00<?, ?it/s]


---------- Iteration 2 summary ----------
Stored f types    : 24535
Avg targets per f : 94.03
BLEU              : 2.31


EM iter 3/5:   0%|          | 0/47757 [00:00<?, ?it/s]

BLEU on 2000 test sents:   0%|          | 0/2000 [00:00<?, ?it/s]


---------- Iteration 3 summary ----------
Stored f types    : 24535
Avg targets per f : 94.03
BLEU              : 2.79


EM iter 4/5:   0%|          | 0/47757 [00:00<?, ?it/s]

BLEU on 2000 test sents:   0%|          | 0/2000 [00:00<?, ?it/s]


---------- Iteration 4 summary ----------
Stored f types    : 24535
Avg targets per f : 94.03
BLEU              : 2.86


EM iter 5/5:   0%|          | 0/47757 [00:00<?, ?it/s]

BLEU on 2000 test sents:   0%|          | 0/2000 [00:00<?, ?it/s]


---------- Iteration 5 summary ----------
Stored f types    : 24535
Avg targets per f : 94.03
BLEU              : 2.92


Example 1
RU    : я люблю машинное обучение
EN*   : i love <UNK> training

Example 2
RU    : сегодня погода очень хорошая, но немного холодно
EN*   : today weather very good but little cold

Example 3
RU    : пожалуйста, скажи мне где находится ближайшая станция метро
EN*   : please tell me where is <UNK> station metro

Iter 1: 0.62
Iter 2: 2.31
Iter 3: 2.79
Iter 4: 2.86
Iter 5: 2.92

Saved to ibm1/t_probs.pkl


In [None]:
# Marian NMT (RU -> EN)
# - Fine-tune Helsinki-NLP/opus-mt-ru-en on OPUS-100 (en-ru)

!pip -q install datasets sacrebleu tqdm transformers sentencepiece accelerate

import os
import re
import math
import random
import numpy as np
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader

from datasets import load_dataset
import sacrebleu

from transformers import (
    MarianMTModel,
    MarianTokenizer,
    DataCollatorForSeq2Seq,
    get_linear_schedule_with_warmup,
)

# Seed every RNG used in this cell (Python, NumPy, PyTorch CPU and CUDA).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DATASET_NAME = "Helsinki-NLP/opus-100"
DATASET_CONFIG = "en-ru"   # has both en/ru in 'translation'
SRC_LANG = "ru"
TGT_LANG = "en"

MODEL_NAME = "Helsinki-NLP/opus-mt-ru-en"  # Marian baseline

# Upper bounds on examples taken from each split (clamped to the split size
# in map_filter_dataset).
N_TRAIN = 50_000
N_DEV   = 5_000
N_TEST  = 5_000

# Used both as a whitespace-word-count filter and as the tokenizer
# truncation length in preprocess_example.
MAX_LEN_SRC = 64
MAX_LEN_TGT = 64

# Optimization hyperparameters. One optimizer step is taken every
# GRAD_ACCUM_STEPS batches; WARMUP_RATIO is the fraction of total optimizer
# steps used for warmup.
EPOCHS = 3
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE  = 16
GRAD_ACCUM_STEPS = 2
LR = 5e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06

# Evaluation stops once at least this many sentences have been decoded.
BLEU_SAMPLES = 2000

# Root directory for per-epoch checkpoints and the final model.
SAVE_DIR = "/content/marian_ru_en"

def hr(title=None, ch="="):
    """Print a titled section header, or a bare 32-character rule.

    Args:
        title: Header text; a falsy value prints only the rule.
        ch: Character used to draw the rule.
    """
    if not title:
        print(ch*32)
        return
    print(f"\n{ch*10} {title} {ch*10}")

def print_kv(k, v, pad=22):
    """Print a ``key: value`` line, padding the key on the right to *pad* columns."""
    line = f"{k:<{pad}}: {v}"
    print(line)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hr("Runtime")
print_kv("Torch", torch.__version__)
print_kv("Device", device)
if device.type == "cuda":
    print_kv("GPU", torch.cuda.get_device_name(0))
    print_kv("CUDA", torch.version.cuda)

hr("Load dataset")
ds = load_dataset(DATASET_NAME, DATASET_CONFIG)
train_data = ds["train"]
dev_data   = ds["validation"]
test_data  = ds["test"]

# Split sizes before any filtering.
print_kv("Train size", len(train_data))
print_kv("Valid size", len(dev_data))
print_kv("Test size",  len(test_data))

hr("Load Marian model")
# Load the pretrained tokenizer and model; the model is moved to `device`.
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME).to(device)

def preprocess_example(ex):
    """Turn one dataset example into tokenized model inputs with labels.

    Args:
        ex: Dataset item with a ``"translation"`` dict keyed by language code.

    Returns:
        The tokenizer output for the source text with an added ``"labels"``
        entry holding the target token ids, or None when either side is
        empty after stripping or has more whitespace-separated words than
        its MAX_LEN_* limit.
    """
    tr = ex["translation"]
    src = tr[SRC_LANG].strip()
    tgt = tr[TGT_LANG].strip()
    if not src or not tgt:
        return None

    # Cheap pre-filter on whitespace word counts; the tokenizer calls below
    # additionally truncate to the same limits measured in subword tokens.
    if len(src.split()) > MAX_LEN_SRC or len(tgt.split()) > MAX_LEN_TGT:
        return None

    model_inputs = tokenizer(
        src,
        max_length=MAX_LEN_SRC,
        truncation=True,
        padding=False,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the text is encoded with the tokenizer's target-side settings.
    labels = tokenizer(
        text_target=tgt,
        max_length=MAX_LEN_TGT,
        truncation=True,
        padding=False,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def map_filter_dataset(split, n_limit):
    """Preprocess the first *n_limit* examples of *split*.

    Examples for which ``preprocess_example`` returns None are dropped.

    Returns:
        A list of the remaining preprocessed examples, in dataset order.
    """
    n_keep = min(n_limit, len(split))
    head = split.select(range(n_keep))
    kept = []
    for idx in tqdm(range(len(head)), desc="Preprocess"):
        encoded = preprocess_example(head[idx])
        if encoded is None:
            continue
        kept.append(encoded)
    return kept

hr("Preprocess splits")
# Each call takes at most N_* examples (clamped to the split size) and drops
# the ones rejected by preprocess_example.
train_proc = map_filter_dataset(train_data, N_TRAIN)
dev_proc   = map_filter_dataset(dev_data,   N_DEV)
test_proc  = map_filter_dataset(test_data,  N_TEST)

print_kv("Filtered train", len(train_proc))
print_kv("Filtered dev",   len(dev_proc))
print_kv("Filtered test",  len(test_proc))

# Examples were tokenized without padding, so the collator pads per batch.
# NOTE(review): eval_bleu_from_loader maps label value -100 back to the pad
# id, which presumes the collator pads labels with -100 — confirm.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# Only the training loader shuffles; dev/test keep dataset order.
train_loader = DataLoader(train_proc, batch_size=TRAIN_BATCH_SIZE, shuffle=True,  collate_fn=collator)
dev_loader   = DataLoader(dev_proc,   batch_size=EVAL_BATCH_SIZE,  shuffle=False, collate_fn=collator)
test_loader  = DataLoader(test_proc,  batch_size=EVAL_BATCH_SIZE,  shuffle=False, collate_fn=collator)

@torch.no_grad()
def eval_bleu_from_loader(dataloader, n_samples=2000, desc="BLEU"):
    """Decode batches from *dataloader* with beam search and score corpus BLEU.

    Uses the module-level ``model`` and ``tokenizer``. The model is switched
    to eval mode and is not switched back by this function.

    Args:
        dataloader: Yields collated batches with ``input_ids``,
            ``attention_mask`` and ``labels``.
        n_samples: Decoding stops before the first batch that starts at or
            beyond this many sentences; whole batches are always consumed.
        desc: Progress-bar label.

    Returns:
        The object returned by ``sacrebleu.corpus_bleu``. References are the
        decoded label ids, not the raw dataset strings.
    """
    model.eval()
    hyps = []
    refs = []

    for batch in tqdm(dataloader, desc=desc):
        if len(hyps) >= n_samples:
            break

        tensors = {
            name: value.to(device)
            for name, value in batch.items()
            if isinstance(value, torch.Tensor)
        }

        generated = model.generate(
            input_ids=tensors["input_ids"],
            attention_mask=tensors["attention_mask"],
            max_length=MAX_LEN_TGT,
            num_beams=4,
        )
        batch_hyps = tokenizer.batch_decode(generated, skip_special_tokens=True)

        # Label positions marked -100 are mapped back to the pad id so the
        # labels can be decoded (pad is then skipped as a special token).
        label_ids = tensors["labels"].clone()
        label_ids[label_ids == -100] = tokenizer.pad_token_id
        batch_refs = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        hyps.extend(batch_hyps)
        refs.extend(batch_refs)

    return sacrebleu.corpus_bleu(hyps, [refs])

hr("Setup optimizer")
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# One optimizer step per GRAD_ACCUM_STEPS batches; ceil() also counts the
# trailing partial group, which the loop below steps on the last batch.
total_steps = math.ceil(len(train_loader) / GRAD_ACCUM_STEPS) * EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

print_kv("Train batches", len(train_loader))
print_kv("Total steps", total_steps)
print_kv("Warmup steps", warmup_steps)
print_kv("Grad accum", GRAD_ACCUM_STEPS)

# Mixed precision only on CUDA. `torch.amp.*` replaces the deprecated
# `torch.cuda.amp.*` entry points.
use_amp = (device.type == "cuda")
scaler = torch.amp.GradScaler(device.type, enabled=use_amp)
print_kv("AMP (fp16)", use_amp)

hr("Train Marian (epochs as iterations)")

bleu_curve = []
os.makedirs(SAVE_DIR, exist_ok=True)

global_step = 0
n_batches = len(train_loader)
for epoch in range(1, EPOCHS + 1):
    model.train()
    optimizer.zero_grad(set_to_none=True)

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}")
    running_loss = 0.0
    seen = 0

    for step, batch in enumerate(pbar, start=1):
        batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

        with torch.amp.autocast(device.type, enabled=use_amp):
            out = model(**batch)
            # Scale so that gradients accumulated over a full group average
            # the per-batch losses.
            loss = out.loss / GRAD_ACCUM_STEPS

        scaler.scale(loss).backward()
        # Track the unscaled per-batch loss; accumulating the divided loss
        # would under-report the running mean by a factor of GRAD_ACCUM_STEPS.
        running_loss += out.loss.item()
        seen += 1

        # Step on every full accumulation group and on the final batch of
        # the epoch, so gradients of a trailing partial group are applied
        # instead of being discarded by the next epoch's zero_grad().
        if step % GRAD_ACCUM_STEPS == 0 or step == n_batches:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()
            global_step += 1

        if seen % 50 == 0:
            pbar.set_postfix({"loss": f"{(running_loss/seen):.4f}", "step": global_step})

    hr(f"Iteration/Epoch {epoch} evaluation", ch="-")
    # NOTE(review): test BLEU is computed every epoch alongside dev BLEU;
    # avoid using it for model selection.
    dev_bleu = eval_bleu_from_loader(dev_loader,  n_samples=BLEU_SAMPLES, desc=f"Dev BLEU (epoch {epoch})")
    test_bleu = eval_bleu_from_loader(test_loader, n_samples=BLEU_SAMPLES, desc=f"Test BLEU (epoch {epoch})")

    print_kv("Dev BLEU",  f"{dev_bleu.score:.2f}")
    print_kv("Test BLEU", f"{test_bleu.score:.2f}")

    bleu_curve.append({"epoch": epoch, "dev": float(dev_bleu.score), "test": float(test_bleu.score)})

    # Save a full checkpoint (model + tokenizer) after every epoch.
    ckpt_dir = os.path.join(SAVE_DIR, f"epoch_{epoch}")
    os.makedirs(ckpt_dir, exist_ok=True)
    model.save_pretrained(ckpt_dir)
    tokenizer.save_pretrained(ckpt_dir)
    print_kv("Saved checkpoint", ckpt_dir)

hr("Demo translations")

# Hand-written sentences used as a qualitative check.
examples_ru = [
    "я люблю машинное обучение",
    "сегодня погода очень хорошая, но немного холодно",
    "пожалуйста, скажи мне где находится ближайшая станция метро"
]

model.eval()
for i, ru_sent in enumerate(examples_ru, 1):
    # Encode a single-sentence batch, decode with beam search, keep the top hypothesis.
    enc = tokenizer([ru_sent], return_tensors="pt", truncation=True, max_length=MAX_LEN_SRC).to(device)
    gen = model.generate(**enc, max_length=MAX_LEN_TGT, num_beams=4)
    en_hyp = tokenizer.decode(gen[0], skip_special_tokens=True)

    print(f"\nExample {i}")
    print_kv("RU", ru_sent, pad=6)
    print_kv("EN*", en_hyp,  pad=6)

hr("BLEU per iteration (epoch)")
for row in bleu_curve:
    print(f"Iter {row['epoch']}: Dev {row['dev']:.2f} | Test {row['test']:.2f}")

hr("Save final model")
# The saved model is the state after the last epoch; no best-checkpoint
# selection happens here.
FINAL_DIR = os.path.join(SAVE_DIR, "final")
os.makedirs(FINAL_DIR, exist_ok=True)
model.save_pretrained(FINAL_DIR)
tokenizer.save_pretrained(FINAL_DIR)
print_kv("Final saved to", FINAL_DIR)



Torch                 : 2.9.0+cu126
Device                : cuda
GPU                   : Tesla T4
CUDA                  : 12.6

Train size            : 1000000
Valid size            : 2000
Test size             : 2000



tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]




Preprocess:   0%|          | 0/50000 [00:00<?, ?it/s]



model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

Preprocess:   0%|          | 0/2000 [00:00<?, ?it/s]

Preprocess:   0%|          | 0/2000 [00:00<?, ?it/s]

Filtered train        : 49599
Filtered dev          : 1983
Filtered test         : 1976

Train batches         : 6200
Total steps           : 9300
Warmup steps          : 558
Grad accum            : 2
AMP (fp16)            : True



  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)


Epoch 1/3:   0%|          | 0/6200 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=use_amp):



---------- Iteration/Epoch 1 evaluation ----------


Dev BLEU (epoch 1):   0%|          | 0/124 [00:00<?, ?it/s]

Test BLEU (epoch 1):   0%|          | 0/124 [00:00<?, ?it/s]

Dev BLEU              : 36.60
Test BLEU             : 35.67




Saved checkpoint      : /content/marian_ru_en/epoch_1


Epoch 2/3:   0%|          | 0/6200 [00:00<?, ?it/s]


---------- Iteration/Epoch 2 evaluation ----------


Dev BLEU (epoch 2):   0%|          | 0/124 [00:00<?, ?it/s]

Test BLEU (epoch 2):   0%|          | 0/124 [00:00<?, ?it/s]

Dev BLEU              : 36.68
Test BLEU             : 35.43
Saved checkpoint      : /content/marian_ru_en/epoch_2


Epoch 3/3:   0%|          | 0/6200 [00:00<?, ?it/s]


---------- Iteration/Epoch 3 evaluation ----------


Dev BLEU (epoch 3):   0%|          | 0/124 [00:00<?, ?it/s]

Test BLEU (epoch 3):   0%|          | 0/124 [00:00<?, ?it/s]

Dev BLEU              : 36.97
Test BLEU             : 35.67
Saved checkpoint      : /content/marian_ru_en/epoch_3


Example 1
RU    : я люблю машинное обучение
EN*   : I love machine training.

Example 2
RU    : сегодня погода очень хорошая, но немного холодно
EN*   : The weather today is very good, but it's a little cold.

Example 3
RU    : пожалуйста, скажи мне где находится ближайшая станция метро
EN*   : Please tell me where the closest subway station is.

Iter 1: Dev 36.60 | Test 35.67
Iter 2: Dev 36.68 | Test 35.43
Iter 3: Dev 36.97 | Test 35.67

Final saved to        : /content/marian_ru_en/final


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Directory containing a saved model and tokenizer.
MODEL_DIR = "/content/marian_ru_en/final"   # change if needed
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and model from MODEL_DIR and put the model in eval mode.
tok = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).to(DEVICE)
model.eval()

def translate_batch(texts, num_beams=6, max_new_tokens=128, length_penalty=1.0):
    """Translate a batch of texts with the module-level ``tok`` and ``model``.

    Args:
        texts: Texts to translate, passed to the tokenizer as one padded batch.
        num_beams: Beam width forwarded to ``model.generate``.
        max_new_tokens: Generation length limit forwarded to ``model.generate``.
        length_penalty: Forwarded to ``model.generate``.

    Returns:
        A list of decoded translations, one per input text. An empty list or
        tuple yields ``[]`` without calling the tokenizer or the model.
    """
    # An empty batch cannot be padded into tensors; answer it directly.
    if isinstance(texts, (list, tuple)) and not texts:
        return []

    # Inputs longer than 256 tokens are truncated.
    inputs = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(DEVICE)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            num_beams=num_beams,
            max_new_tokens=max_new_tokens,
            length_penalty=length_penalty,
            early_stopping=True,
        )
    return tok.batch_decode(out, skip_special_tokens=True)

# Hand-picked Russian test sentences.
hard_ru = [
    "Ну да, конечно… «лучший сервис» — ждал я всего-то два часа.",
    "Если бы я знал, что встреча перенесётся на завтра, я бы не вставал так рано и не ехал через весь город в час пик.",
    "Слушай, я вообще не выкупаю, зачем ты так заморачиваешься."
]


# Translate all sentences as a single batch.
en_out = translate_batch(hard_ru, num_beams=6, max_new_tokens=128)

print("========== Hard test translations ==========\n")
for i, (ru, en) in enumerate(zip(hard_ru, en_out), 1):
    print(f"[{i}] RU: {ru}")
    print(f"    EN: {en}\n")



[1] RU: Ну да, конечно… «лучший сервис» — ждал я всего-то два часа.
    EN: Yeah, sure... the best service I've been waiting for is just two hours.

[2] RU: Если бы я знал, что встреча перенесётся на завтра, я бы не вставал так рано и не ехал через весь город в час пик.
    EN: If I'd known the meeting was tomorrow, I wouldn't have got up so early I wouldn't have driven across the city at a peak hour.

[3] RU: Слушай, я вообще не выкупаю, зачем ты так заморачиваешься.
    EN: Look, I don't care why you get so worked up about it at all.



In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Directory containing a saved model and tokenizer.
MODEL_DIR = "/content/marian_ru_en/final"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and model from MODEL_DIR and put the model in eval mode.
tok = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).to(DEVICE)
model.eval()

def translate_batch(texts, num_beams=6, max_new_tokens=128, length_penalty=1.0):
    """Translate a batch of texts using the module-level ``tok`` and ``model``.

    Args:
        texts: Texts to translate; tokenized together as one padded batch.
        num_beams: Beam width passed to ``model.generate``.
        max_new_tokens: Generation length limit passed to ``model.generate``.
        length_penalty: Passed to ``model.generate``.

    Returns:
        Decoded translations, one per input text. For an empty list or tuple
        the result is ``[]`` and neither tokenizer nor model is invoked.
    """
    # Nothing to pad or generate for an empty batch.
    if isinstance(texts, (list, tuple)) and not texts:
        return []

    # Source texts are truncated to 256 tokens.
    inputs = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(DEVICE)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            num_beams=num_beams,
            max_new_tokens=max_new_tokens,
            length_penalty=length_penalty,
            early_stopping=True,
        )
    return tok.batch_decode(out, skip_special_tokens=True)

# Second set of hand-picked Russian test sentences.
hard_ru = [
    "В отчёте РЖД за 2024 год упоминались МЦД-3 и МЦК, но цифры по пассажиропотоку разнятся по регионам.",
    "Хотя я уже отправил письмо, мне всё равно кажется, что я мог бы сформулировать просьбу мягче, чтобы не звучать требовательно.",
    "Ну конечно, «гениальная» идея — выключить сервер в пятницу вечером, а потом удивляться, что всё упало."
]



# Translate all sentences as a single batch.
en_out = translate_batch(hard_ru, num_beams=6, max_new_tokens=128)

print("========== Harder test translations ==========\n")
for i, (ru, en) in enumerate(zip(hard_ru, en_out), 1):
    print(f"[{i}] RU: {ru}")
    print(f"    EN: {en}\n")



[1] RU: В отчёте РЖД за 2024 год упоминались МЦД-3 и МЦК, но цифры по пассажиропотоку разнятся по регионам.
    EN: The RZD's 2024 report mentioned IDC-3 and IFC, but the figures for passenger flows vary from region to region.

[2] RU: Хотя я уже отправил письмо, мне всё равно кажется, что я мог бы сформулировать просьбу мягче, чтобы не звучать требовательно.
    EN: Although I have already sent a letter, it still seems to me that I could make a soft request to avoid sounding demanding.

[3] RU: Ну конечно, «гениальная» идея — выключить сервер в пятницу вечером, а потом удивляться, что всё упало.
    EN: Of course, the genius idea is to turn off the server on Friday night, and then wonder if it's all gone down.



In [1]:
# Simple Dictionary Baseline (RU -> EN)
# - builds a word-to-word mapping using sentence-level co-occurrence counts

!pip -q install datasets sacrebleu tqdm

import re
import random
from collections import Counter, defaultdict
from tqdm.auto import tqdm
from datasets import load_dataset
import sacrebleu

# Seed the `random` module so any use of it in this cell is repeatable.
SEED = 42
random.seed(SEED)

DATASET_NAME = "Helsinki-NLP/opus-100"
DATASET_CONFIG = "en-ru"
SRC_LANG = "ru"
TGT_LANG = "en"

# MAX_LEN     : maximum tokens per side accepted by filter_pair.
# TRAIN_SIZES : training-set sizes for which a dictionary is built and scored.
# BLEU_SAMPLES: cap on the number of examples scored per split.
MAX_LEN = 40
TRAIN_SIZES = [2000, 5000, 10000, 20000]
BLEU_SAMPLES = 2000

# Output token for source words missing from the dictionary.
UNK_EN = "<unk>"

# Separator pattern: one or more consecutive non-word characters.
_tok_re = re.compile(r"[^\w]+", flags=re.UNICODE)
# Whitespace directly in front of a punctuation mark (the mark is captured).
_punct_fix = re.compile(r"\s+([,.!?;:])")


def tokenize(text: str):
    """Split *text* into lowercase word tokens.

    Runs of non-word characters separate tokens and are discarded.

    Returns:
        A list of non-empty tokens; empty when the text has no word characters.
    """
    lowered = text.lower().strip()
    parts = _tok_re.split(lowered)
    # Edge separators produce empty strings; keep only real tokens.
    return list(filter(None, parts))

def filter_pair(src_toks, tgt_toks):
    """Return True when both token lists have between 1 and MAX_LEN tokens."""
    for toks in (src_toks, tgt_toks):
        n_toks = len(toks)
        if n_toks < 1 or n_toks > MAX_LEN:
            return False
    return True

def hr(title=None, ch="="):
    """Print a section header with *title*, or a 32-character rule without one.

    Args:
        title: Header text; a falsy value prints only the rule.
        ch: Character used to draw the rule.
    """
    if not title:
        print(ch*32)
        return
    print(f"\n{ch*10} {title} {ch*10}")

def print_kv(k, v, pad=18):
    """Print ``key: value`` with the key left-aligned in a field of *pad* columns."""
    line = f"{k:<{pad}}: {v}"
    print(line)

hr("Load dataset")
# Load the configured dataset and keep a handle on each of its three splits.
ds = load_dataset(DATASET_NAME, DATASET_CONFIG)
train_data = ds["train"]
dev_data   = ds["validation"]
test_data  = ds["test"]

# Split sizes before any filtering.
print_kv("Train size", len(train_data))
print_kv("Valid size", len(dev_data))
print_kv("Test size",  len(test_data))

hr("Tokenize train pairs (once)")

def build_pairs(dataset_split, n_limit):
    """Tokenize the first *n_limit* examples of a split.

    Args:
        dataset_split: Indexable split whose items carry a ``"translation"``
            dict keyed by language code.
        n_limit: Maximum number of leading examples to read; values larger
            than the split are clamped to its length.

    Returns:
        A list of ``(ru_tokens, en_tokens)`` pairs accepted by ``filter_pair``.
    """
    # Clamp so a limit above the split size reads the whole split instead of
    # indexing past its end.
    n_read = min(n_limit, len(dataset_split))
    pairs = []
    for i in tqdm(range(n_read), desc=f"Tokenizing {n_read} pairs"):
        ex = dataset_split[i]["translation"]
        ru = tokenize(ex[SRC_LANG])
        en = tokenize(ex[TGT_LANG])
        if filter_pair(ru, en):
            pairs.append((ru, en))
    return pairs

# Tokenize once for the largest requested size; smaller experiments reuse a
# prefix of this list.
max_need = max(TRAIN_SIZES)
train_pairs_all = build_pairs(train_data, max_need)
print_kv("Filtered pairs", len(train_pairs_all))

def train_cooc_dictionary(train_pairs):
    """Build a word-to-word dictionary from sentence-level co-occurrence.

    For every source word, counts in how many sentence pairs each target word
    appears alongside it and keeps the target word with the highest count.

    Args:
        train_pairs: Iterable of ``(ru_tokens, en_tokens)`` pairs.

    Returns:
        A dict mapping each source word to its most frequently co-occurring
        target word. Ties go to the target word encountered first.
    """
    cooc = defaultdict(Counter)

    for ru, en in tqdm(train_pairs, desc="Counting co-occurrence"):
        # Deduplicate in first-occurrence order. Iterating a set instead would
        # make the Counter insertion order, and therefore tie-breaking in
        # most_common(), depend on per-process string hash randomization.
        # A list is passed because Counter.update() treats a mapping as counts.
        en_unique = list(dict.fromkeys(en))
        for f in ru:
            cooc[f].update(en_unique)

    return {f: ctr.most_common(1)[0][0] for f, ctr in cooc.items()}

def translate_ru_to_en(ru_text, best_e):
    """Translate *ru_text* word by word through the dictionary *best_e*.

    Source words without an entry are rendered as UNK_EN; word order is kept.
    """
    joined = " ".join(best_e.get(f, UNK_EN) for f in tokenize(ru_text))
    return _punct_fix.sub(r"\1", joined)

def eval_bleu(split, best_e, n_samples=2000, split_name="test"):
    """Compute corpus BLEU for dictionary translations of a split.

    Args:
        split: Indexable split with ``"translation"`` dicts.
        best_e: Source-word -> target-word dictionary.
        n_samples: Upper bound on the number of leading examples scored.
        split_name: Label shown in the progress bar.

    Returns:
        The object returned by ``sacrebleu.corpus_bleu``.
    """
    n = min(n_samples, len(split))
    hypotheses, references = [], []
    for i in tqdm(range(n), desc=f"BLEU on {split_name} ({n} sents)"):
        pair = split[i]["translation"]
        hypotheses.append(translate_ru_to_en(pair[SRC_LANG], best_e))
        references.append(pair[TGT_LANG])
    return sacrebleu.corpus_bleu(hypotheses, [references])

hr("Train + BLEU by training size")

# One (size, dev BLEU, test BLEU) tuple per training size.
bleu_curve = []
for n in TRAIN_SIZES:
    hr(f"Train size = {n}", ch="-")
    # NOTE: `n` counts already-filtered pairs, and the slice is silently
    # shorter than `n` when fewer filtered pairs exist, so the printed
    # "Train size" can overstate the number of pairs actually used.
    subset = train_pairs_all[:n]
    best_e = train_cooc_dictionary(subset)

    dev_bleu  = eval_bleu(dev_data,  best_e, n_samples=BLEU_SAMPLES, split_name="dev")
    test_bleu = eval_bleu(test_data, best_e, n_samples=BLEU_SAMPLES, split_name="test")

    print_kv("Dev BLEU",  f"{dev_bleu.score:.2f}")
    print_kv("Test BLEU", f"{test_bleu.score:.2f}")
    bleu_curve.append((n, float(dev_bleu.score), float(test_bleu.score)))

hr("Demo translations (3 sentences)")

examples_ru = [
    "я люблю машинное обучение",
    "сегодня погода очень хорошая, но немного холодно",
    "пожалуйста, скажи мне где находится ближайшая станция метро"
]

# `best_e` here is the dictionary left over from the last loop iteration,
# i.e. the one built for the final entry of TRAIN_SIZES.
for i, ru_sent in enumerate(examples_ru, 1):
    en_hyp = translate_ru_to_en(ru_sent, best_e)
    print(f"\nExample {i}")
    print_kv("RU", ru_sent, pad=6)
    print_kv("EN*", en_hyp,  pad=6)

hr("BLEU curve (more data)")
for n, d, t in bleu_curve:
    print(f"Train={n:>5} | Dev {d:.2f} | Test {t:.2f}")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

en-ru/test-00000-of-00001.parquet:   0%|          | 0.00/310k [00:00<?, ?B/s]

en-ru/train-00000-of-00001.parquet:   0%|          | 0.00/124M [00:00<?, ?B/s]

en-ru/validation-00000-of-00001.parquet:   0%|          | 0.00/310k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Train size        : 1000000
Valid size        : 2000
Test size         : 2000



Tokenizing 20000 pairs:   0%|          | 0/20000 [00:00<?, ?it/s]

Filtered pairs    : 19116


---------- Train size = 2000 ----------


Counting co-occurrence:   0%|          | 0/2000 [00:00<?, ?it/s]

BLEU on dev (2000 sents):   0%|          | 0/2000 [00:00<?, ?it/s]

BLEU on test (2000 sents):   0%|          | 0/2000 [00:00<?, ?it/s]

Dev BLEU          : 0.19
Test BLEU         : 0.10

---------- Train size = 5000 ----------


Counting co-occurrence:   0%|          | 0/5000 [00:00<?, ?it/s]

BLEU on dev (2000 sents):   0%|          | 0/2000 [00:00<?, ?it/s]

BLEU on test (2000 sents):   0%|          | 0/2000 [00:00<?, ?it/s]

Dev BLEU          : 0.49
Test BLEU         : 0.43

---------- Train size = 10000 ----------


Counting co-occurrence:   0%|          | 0/10000 [00:00<?, ?it/s]

BLEU on dev (2000 sents):   0%|          | 0/2000 [00:00<?, ?it/s]

BLEU on test (2000 sents):   0%|          | 0/2000 [00:00<?, ?it/s]

Dev BLEU          : 0.75
Test BLEU         : 0.82

---------- Train size = 20000 ----------


Counting co-occurrence:   0%|          | 0/19116 [00:00<?, ?it/s]

BLEU on dev (2000 sents):   0%|          | 0/2000 [00:00<?, ?it/s]

BLEU on test (2000 sents):   0%|          | 0/2000 [00:00<?, ?it/s]

Dev BLEU          : 0.92
Test BLEU         : 1.00


Example 1
RU    : я люблю машинное обучение
EN*   : i i <unk> training

Example 2
RU    : сегодня погода очень хорошая, но немного холодно
EN*   : today weather very good but a cold

Example 3
RU    : пожалуйста, скажи мне где находится ближайшая станция метро
EN*   : please you i where the <unk> <unk> the

Train= 2000 | Dev 0.19 | Test 0.10
Train= 5000 | Dev 0.49 | Test 0.43
Train=10000 | Dev 0.75 | Test 0.82
Train=20000 | Dev 0.92 | Test 1.00


In [4]:
!pip -q install datasets sacrebleu tqdm

import re, random
from collections import Counter, defaultdict
from tqdm.auto import tqdm
from datasets import load_dataset
import sacrebleu

# Seed the `random` module so any use of it in this cell is repeatable.
SEED = 42
random.seed(SEED)

DATASET_NAME = "Helsinki-NLP/opus-100"
DATASET_CONFIG = "en-ru"
SRC_LANG = "ru"
TGT_LANG = "en"

# N_TRAIN    : number of training examples read before length filtering.
# MAX_LEN    : maximum tokens per side accepted by filter_pair.
# MIN_FREQ   : minimum count for a word to enter a vocabulary.
# MAX_VOCAB_*: cap on the number of words kept per vocabulary.
N_TRAIN = 50_000
MAX_LEN = 40
MIN_FREQ = 2
MAX_VOCAB_RU = 50_000
MAX_VOCAB_EN = 50_000

# Number of most frequent co-occurring target words kept per source word.
TOPK_PER_F = 200

# NULL_TOKEN is prepended to every source sentence of the training pairs;
# UNK_TOKEN replaces out-of-vocabulary words on both sides.
NULL_TOKEN = "<NULL>"
UNK_TOKEN  = "<UNK>"

# Separator pattern: one or more consecutive non-word characters.
_tok_re = re.compile(r"[^\w]+", flags=re.UNICODE)
# Whitespace directly in front of a punctuation mark (the mark is captured).
_punct_fix = re.compile(r"\s+([,.!?;:])")


def tokenize(text: str):
    """Return the lowercase word tokens of *text*.

    Any run of non-word characters is treated as a separator and dropped.
    """
    cleaned = text.lower().strip()
    chunks = _tok_re.split(cleaned)
    # Discard the empty strings that edge separators leave behind.
    return list(filter(None, chunks))

def filter_pair(src_toks, tgt_toks):
    """Return True when each side has at least 1 and at most MAX_LEN tokens."""
    for toks in (src_toks, tgt_toks):
        n_toks = len(toks)
        if n_toks < 1 or n_toks > MAX_LEN:
            return False
    return True

def map_to_vocab(tokens, vocab):
    """Return *tokens* with out-of-vocabulary words replaced by UNK_TOKEN."""
    mapped = []
    for tok in tokens:
        mapped.append(tok if tok in vocab else UNK_TOKEN)
    return mapped

def best_translation_for_f(f, t_probs, default=UNK_TOKEN):
    """Return the highest-scoring target word for source word *f*.

    *default* is returned when *f* has no entry, or an empty entry, in
    *t_probs*. Ties go to the candidate that comes first in iteration order.
    """
    candidates = t_probs.get(f)
    if not candidates:
        return default
    return max(candidates, key=candidates.get)

def translate_ru_to_en(ru_text: str, t_probs, ru_vocab):
    """Translate *ru_text* word by word using the best entry of *t_probs*.

    Tokens missing from *ru_vocab* are mapped to UNK_TOKEN before lookup;
    word order is kept.
    """
    words = []
    for f in map_to_vocab(tokenize(ru_text), ru_vocab):
        words.append(best_translation_for_f(f, t_probs))
    return _punct_fix.sub(r"\1", " ".join(words))

# Load the configured dataset and keep a handle on each of its three splits.
ds = load_dataset(DATASET_NAME, DATASET_CONFIG)
train_data = ds["train"]
dev_data   = ds["validation"]
test_data  = ds["test"]

# Token frequencies are collected over the pairs that pass filter_pair only.
src_counter = Counter()
tgt_counter = Counter()
pairs = []

# Read at most N_TRAIN examples (clamped to the split size).
for i in tqdm(range(min(N_TRAIN, len(train_data))), desc=f"Tokenizing {N_TRAIN} pairs"):
    ex = train_data[i]["translation"]
    ru = tokenize(ex[SRC_LANG])
    en = tokenize(ex[TGT_LANG])
    if not filter_pair(ru, en):
        continue
    pairs.append((ru, en))
    src_counter.update(ru)
    tgt_counter.update(en)

print("Filtered pairs:", len(pairs))

def make_vocab(counter: Counter, min_freq: int, max_vocab: int):
    """Map the most frequent words of *counter* to consecutive indices.

    Args:
        counter: Word frequencies.
        min_freq: Words counted fewer times are excluded.
        max_vocab: Maximum number of words kept.

    Returns:
        A dict from word to index (0-based) in descending frequency order.
    """
    # most_common() sorts by descending count and keeps first-seen order
    # among equal counts.
    ranked = counter.most_common()
    frequent = [word for word, freq in ranked if freq >= min_freq]
    return {word: idx for idx, word in enumerate(frequent[:max_vocab])}

# Build one vocabulary per language and give UNK the next free index.
ru_vocab = make_vocab(src_counter, MIN_FREQ, MAX_VOCAB_RU)
en_vocab = make_vocab(tgt_counter, MIN_FREQ, MAX_VOCAB_EN)
ru_vocab[UNK_TOKEN] = len(ru_vocab)
en_vocab[UNK_TOKEN] = len(en_vocab)

# Training pairs: out-of-vocabulary words become UNK on both sides and every
# source sentence gets NULL_TOKEN prepended.
train_pairs = [
    ([NULL_TOKEN] + map_to_vocab(ru, ru_vocab), map_to_vocab(en, en_vocab))
    for (ru, en) in pairs
]

# fe_counts[f][e] = number of sentence pairs in which f and e both occur.
fe_counts = defaultdict(Counter)
for ru_sent, en_sent in tqdm(train_pairs, desc="Counting co-occurrence"):
    # Deduplicate in first-occurrence order. Iterating a set instead would
    # make the Counter insertion order, and therefore the tie-breaking of
    # most_common() and of the argmax at translation time, depend on
    # per-process string hash randomization. A list is passed because
    # Counter.update() treats a mapping as counts.
    en_unique = list(dict.fromkeys(en_sent))
    for f in ru_sent:
        fe_counts[f].update(en_unique)

# Keep the TOPK_PER_F most frequent target words per source word and
# normalize their counts into a distribution over the kept words.
t_probs = {}
for f, ctr in fe_counts.items():
    best = ctr.most_common(TOPK_PER_F)
    if not best:
        continue
    total = sum(c for _, c in best)
    t_probs[f] = {e: c / total for e, c in best}

def eval_bleu(split, t_probs):
    """Compute corpus BLEU for word-by-word translations of every example in *split*.

    Uses the module-level ``ru_vocab`` for source-side vocabulary mapping.

    Returns:
        The object returned by ``sacrebleu.corpus_bleu``.
    """
    hypotheses, references = [], []
    for i in tqdm(range(len(split)), desc=f"BLEU on {len(split)} sents"):
        pair = split[i]["translation"]
        hypotheses.append(translate_ru_to_en(pair[SRC_LANG], t_probs, ru_vocab))
        references.append(pair[TGT_LANG])
    return sacrebleu.corpus_bleu(hypotheses, [references])

# Score the full dev and test splits.
dev_bleu = eval_bleu(dev_data, t_probs)
test_bleu = eval_bleu(test_data, t_probs)

print("\nDev BLEU :", f"{dev_bleu.score:.2f}")
print("Test BLEU:", f"{test_bleu.score:.2f}")

# Hand-written sentences used as a qualitative check.
examples_ru = [
    "я люблю машинное обучение",
    "сегодня погода очень хорошая, но немного холодно",
    "пожалуйста, скажи мне где находится ближайшая станция метро"
]
print("\n=== Demo translations (3 sentences) ===")
for i, s in enumerate(examples_ru, 1):
    print(f"\nExample {i}")
    print("RU :", s)
    print("EN*:", translate_ru_to_en(s, t_probs, ru_vocab))


Using the latest cached version of the dataset since Helsinki-NLP/opus-100 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'en-ru' at /root/.cache/huggingface/datasets/Helsinki-NLP___opus-100/en-ru/0.0.0/805090dc28bf78897da9641cdf08b61287580df9 (last modified on Fri Jan  2 18:29:10 2026).


Tokenizing 50000 pairs:   0%|          | 0/50000 [00:00<?, ?it/s]

Filtered pairs: 47757


Counting co-occurrence:   0%|          | 0/47757 [00:00<?, ?it/s]

BLEU on 2000 sents:   0%|          | 0/2000 [00:00<?, ?it/s]

BLEU on 2000 sents:   0%|          | 0/2000 [00:00<?, ?it/s]


Dev BLEU : 1.27
Test BLEU: 1.11

=== Demo translations (3 sentences) ===

Example 1
RU : я люблю машинное обучение
EN*: i i <UNK> training

Example 2
RU : сегодня погода очень хорошая, но немного холодно
EN*: today weather very good but a s

Example 3
RU : пожалуйста, скажи мне где находится ближайшая станция метро
EN*: please tell i where the <UNK> station in
