In [None]:
pip install -U datasets

In [None]:
#@title 🚀 Colab Script: Tatoeba rus‑tuk Embedding Extraction with correct folder structure

# 1️⃣ Install dependencies
!pip install --quiet transformers torch numpy wget

import os, logging, random, numpy as np, torch, wget, tarfile
from transformers import NllbTokenizerFast, AutoModelForSeq2SeqLM, BertTokenizerFast, BertModel

# 2️⃣ Set seeds for reproducibility
seed = 42
random.seed(seed); np.random.seed(seed)
torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("rus-tuk-embed")

# 3️⃣ Download and extract rus-tuk.tar
URL = "https://object.pouta.csc.fi/Tatoeba-Challenge-v2023-09-26/rus-tuk.tar"
WORKDIR = "data"
os.makedirs(WORKDIR, exist_ok=True)
tar_path = os.path.join(WORKDIR, "rus-tuk.tar")

if not os.path.exists(tar_path):
    logger.info(f"Downloading {URL}")
    wget.download(URL, tar_path)
else:
    logger.info("Archive already exists, skipping download.")

with tarfile.open(tar_path, "r") as tar:
    # The archive was downloaded from the network, so do not trust member paths.
    # The "data" extraction filter rejects absolute paths, ".." escapes and links
    # that point outside WORKDIR; it only exists on Pythons that ship extraction
    # filters, hence the feature check.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path=WORKDIR, filter="data")
    else:
        # No extraction filters on this interpreter: keep the legacy behaviour.
        tar.extractall(path=WORKDIR)
    logger.info("Extraction finished.")

# 4️⃣ Locate files inside release/v2023-09-26/rus-tuk/
base_dir = os.path.join(WORKDIR,'data', "release", "v2023-09-26", "rus-tuk")
src = os.path.join(base_dir, "test.src")
trg = os.path.join(base_dir, "test.trg")
assert os.path.isfile(src) and os.path.isfile(trg), "Could not find test.src/test.trg."

logger.info(f"Using:\n • {src}\n • {trg}")

# 5️⃣ Load datasets
with open(src, encoding="utf-8") as fs, open(trg, encoding="utf-8") as ft:
    src_lines = [l.strip() for l in fs]
    trg_lines = [l.strip() for l in ft]

# Trailing blank lines carry no sentence; ignore them before comparing line counts.
for lines in (src_lines, trg_lines):
    while lines and not lines[-1]:
        lines.pop()
assert len(src_lines) == len(trg_lines), f"Mismatch: {len(src_lines)} src vs {len(trg_lines)} trg"

# Filter blanks pair-wise. Filtering each side on its own would shift every later
# sentence whenever only one side of a pair is empty, silently misaligning S and T
# while their lengths could still happen to match.
pairs = [(s, t) for s, t in zip(src_lines, trg_lines) if s and t]
S = [s for s, _ in pairs]
T = [t for _, t in pairs]
logger.info(f"Loaded {len(S)} parallel sentences.")

# 6️⃣ Initialize tokenizers and models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Device: {device}")

nllb_name = "facebook/nllb-200-distilled-600M"
tok_ru = NllbTokenizerFast.from_pretrained(nllb_name, src_lang="rus_Cyrl")
tok_tk = NllbTokenizerFast.from_pretrained(nllb_name, src_lang="tuk_Latn")
model_nllb = AutoModelForSeq2SeqLM.from_pretrained(nllb_name).to(device).eval()

labse_name = "setu4993/LaBSE"
tok_labse = BertTokenizerFast.from_pretrained(labse_name)
model_labse = BertModel.from_pretrained(labse_name).to(device).eval()

def mean_pool(emb, mask):
    """Average ``emb`` over dim 1, weighting each position by ``mask``.

    ``mask`` gets a trailing axis and is cast to float before being multiplied into
    ``emb``. The weighted sum over dim 1 is divided by the mask sum over dim 1,
    clamped to at least 1e-9 so a fully masked row does not divide by zero.
    """
    weights = mask.unsqueeze(-1).float()
    total = (emb * weights).sum(dim=1)
    count = weights.sum(dim=1).clamp(min=1e-9)
    return total / count

# 7️⃣ Extract embeddings
bs, mx = 32, 128
emb_n_src, emb_n_trg = [], []
emb_l_src, emb_l_trg = [], []

for i in range(0, len(S), bs):
    s_batch = S[i:i+bs]
    t_batch = T[i:i+bs]

    # NLLB
    enc_s = tok_ru(s_batch, padding=True, truncation=True, max_length=mx, return_tensors="pt").to(device)
    enc_t = tok_tk(t_batch, padding=True, truncation=True, max_length=mx, return_tensors="pt").to(device)
    with torch.no_grad():
        hs = model_nllb.get_encoder()(**enc_s).last_hidden_state
        ht = model_nllb.get_encoder()(**enc_t).last_hidden_state
        emb_n_src.append(mean_pool(hs, enc_s.attention_mask).cpu().numpy())
        emb_n_trg.append(mean_pool(ht, enc_t.attention_mask).cpu().numpy())

    # LaBSE
    enc_s2 = tok_labse(s_batch, padding=True, truncation=True, max_length=mx, return_tensors="pt").to(device)
    enc_t2 = tok_labse(t_batch, padding=True, truncation=True, max_length=mx, return_tensors="pt").to(device)
    with torch.no_grad():
        emb_l_src.append(model_labse(**enc_s2).pooler_output.cpu().numpy())
        emb_l_trg.append(model_labse(**enc_t2).pooler_output.cpu().numpy())

    logger.info(f"Batch {i}-{i+len(s_batch)} processed.")

# 8️⃣ Save embeddings
n_src = np.vstack(emb_n_src); n_trg = np.vstack(emb_n_trg)
l_src = np.vstack(emb_l_src); l_trg = np.vstack(emb_l_trg)

np.savez("embeddings_NLLB.npz", src=n_src, tgt=n_trg)
np.savez("embeddings_LaBSE.npz", src=l_src, tgt=l_trg)
logger.info("✅ Saved embeddings: embeddings_NLLB.npz, embeddings_LaBSE.npz")


# 6️⃣ Load additional model
from sentence_transformers import SentenceTransformer

pm_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# Reuse mean_pool defined earlier

# 7️⃣ Extract embeddings: add MiniLM
emb_p_src, emb_p_trg = [], []

for i in range(0, len(S), bs):
    s_batch = S[i:i+bs]; t_batch = T[i:i+bs]
    # existing NLLB + LaBSE ...

    # MiniLM
    emb_s_p = pm_model.encode(s_batch, show_progress_bar=False, convert_to_numpy=True, batch_size=bs)
    emb_t_p = pm_model.encode(t_batch, show_progress_bar=False, convert_to_numpy=True, batch_size=bs)
    emb_p_src.append(emb_s_p); emb_p_trg.append(emb_t_p)

# 8️⃣ Save embeddings
p_src = np.vstack(emb_p_src); p_trg = np.vstack(emb_p_trg)
np.savez("embeddings_MiniLM.npz", src=p_src, tgt=p_trg)



In [None]:
#@title 🎯 Bitext Retrieval Evaluation (Precision@1 & MRR)

# 0️⃣ Install dependencies
!pip install --quiet numpy tqdm

import numpy as np
import argparse
import logging
from tqdm import tqdm

def compute_retrieval_metrics(src_emb: np.ndarray, tgt_emb: np.ndarray, eps: float = 1e-12):
    """Score bitext retrieval where row ``i`` of ``src_emb`` should retrieve row ``i`` of ``tgt_emb``.

    Both matrices are L2-normalised row-wise and compared by cosine similarity.

    Args:
        src_emb: 2-D array of source embeddings, one row per sentence.
        tgt_emb: 2-D array of target embeddings, row-aligned with ``src_emb``.
        eps: Lower bound applied to each row norm before dividing, so an all-zero
            embedding row yields zero similarities instead of NaN.

    Returns:
        ``(p1, mrr)``: the fraction of rows whose best-scoring target is the aligned
        one, and the mean reciprocal rank of the aligned target. The rank counts only
        targets that score strictly higher, so ties resolve in favour of the gold pair.
    """
    src_len = np.linalg.norm(src_emb, axis=1, keepdims=True)
    tgt_len = np.linalg.norm(tgt_emb, axis=1, keepdims=True)
    # Clamping leaves non-degenerate rows unchanged and avoids a 0/0 division.
    src_norm = src_emb / np.maximum(src_len, eps)
    tgt_norm = tgt_emb / np.maximum(tgt_len, eps)
    sim = src_norm @ tgt_norm.T
    N = sim.shape[0]
    gold_idx = np.arange(N)
    top1 = sim.argmax(axis=1)
    p1 = (top1 == gold_idx).sum() / float(N)
    corr = sim[gold_idx, gold_idx]
    better = (sim > corr[:, None]).sum(axis=1)
    ranks = better + 1
    mrr = (1.0 / ranks).mean()
    return p1, mrr

def evaluate(emb_path, name):
    """Load an ``.npz`` embedding archive and report retrieval metrics for it.

    Args:
        emb_path: Path to an archive holding ``src`` and ``tgt`` arrays.
        name: Label used in the printed report and in the truncation warning.

    Returns:
        ``(p1, mrr)`` as returned by ``compute_retrieval_metrics``.
    """
    # np.load on an .npz returns a lazily-read archive that keeps the file open;
    # the context manager closes it once both arrays have been read into memory.
    with np.load(emb_path) as data:
        src, tgt = data['src'], data['tgt']
    if src.shape[0] != tgt.shape[0]:
        # Keep only the leading rows both sides have, and say so.
        L = min(src.shape[0], tgt.shape[0])
        src, tgt = src[:L], tgt[:L]
        logging.warning(f"{name}: truncated to {L} pairs due to mismatch")
    print(f"\n🔍 Evaluating {name} embeddings on {src.shape[0]} pairs...")
    p1, mrr = compute_retrieval_metrics(src, tgt)
    print(f"{name} → Precision@1: {p1:.4f}, MRR: {mrr:.4f}")
    return p1, mrr

if __name__ == "__main__":
    # Embedding archives to score; edit these names if the files live elsewhere.
    nllb_path = "embeddings_NLLB.npz"
    labse_path = "embeddings_LaBSE.npz"
    mini_path = "embeddings_MiniLM.npz"

    logging.basicConfig(level=logging.INFO)
    # Evaluate in a fixed order: NLLB, LaBSE, then MiniLM.
    for emb_file, label in (
        (nllb_path, "NLLB"),
        (labse_path, "LaBSE"),
        (mini_path, "MiniLM‑Multi"),
    ):
        evaluate(emb_file, label)




🔍 Evaluating NLLB embeddings on 9 pairs...
NLLB → Precision@1: 1.0000, MRR: 1.0000

🔍 Evaluating LaBSE embeddings on 9 pairs...
LaBSE → Precision@1: 0.8889, MRR: 0.9444

🔍 Evaluating MiniLM‑Multi embeddings on 9 pairs...
MiniLM‑Multi → Precision@1: 0.5556, MRR: 0.7011


In [None]:
# All-in-one Russian STS evaluation (ai-forever/ru-stsbenchmark-sts) with NLLB, LaBSE & MiniLM

# 1. Load STS17 Russian dataset
from datasets import load_dataset
ds = load_dataset("ai-forever/ru-stsbenchmark-sts", split="test")
sents1, sents2 = ds["sentence1"], ds["sentence2"]
scores = ds["score"]
print(f"✅ Loaded {len(scores)} Russian sentence pairs from ai-forever/ru-stsbenchmark-sts")  # test split only; the dataset totals ~7,824 pairs across train/validation/test

# 2. Initialize models on GPU
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
labse_tok = AutoTokenizer.from_pretrained("setu4993/LaBSE")
labse_model = AutoModel.from_pretrained("setu4993/LaBSE").to(device).eval()

nllb_tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to(device).eval()
nllb_encoder = nllb_model.get_encoder()

# 3. Define embedding helpers
import numpy as np

def encode_labse(texts, bs=32):
    """Embed ``texts`` with LaBSE and return the stacked ``pooler_output`` rows.

    Texts are tokenized in slices of ``bs`` (padded and truncated), run through
    ``labse_model`` without gradients, and the per-batch arrays are stacked
    vertically into one NumPy array.
    """
    chunks = []
    for start in range(0, len(texts), bs):
        encoded = labse_tok(
            texts[start:start + bs], padding=True, truncation=True, return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            pooled = labse_model(**encoded).pooler_output.cpu().numpy()
        chunks.append(pooled)
    return np.vstack(chunks)

def encode_nllb(texts, bs=32):
    """Embed ``texts`` with the NLLB encoder using masked mean pooling.

    Each batch of ``bs`` texts is tokenized (padded and truncated) and encoded without
    gradients. The last hidden states are averaged over positions kept by the
    attention mask, after dropping position 0 and every EOS position.
    Returns the per-batch results stacked into one NumPy array.
    """
    # Note: this sets the source language on the shared module-level tokenizer.
    nllb_tok.src_lang = "rus_Cyrl"
    pooled_batches = []
    for start in range(0, len(texts), bs):
        enc = nllb_tok(
            texts[start:start + bs], padding=True, truncation=True, return_tensors="pt"
        ).to(device)
        ids, attn = enc["input_ids"], enc["attention_mask"]
        with torch.no_grad():
            hidden = nllb_encoder(input_ids=ids, attention_mask=attn).last_hidden_state
        keep = attn.clone()
        # Drop position 0, intended to hold the language token.
        # Assumes the tokenizer places the language code first — TODO confirm.
        keep[:, 0] = 0
        keep[ids == nllb_tok.eos_token_id] = 0
        token_sum = (hidden * keep.unsqueeze(-1)).sum(dim=1)
        # clamp(min=1) keeps rows with no surviving tokens from dividing by zero.
        token_cnt = keep.sum(dim=1).unsqueeze(-1).clamp(min=1)
        pooled_batches.append((token_sum / token_cnt).cpu().numpy())
    return np.vstack(pooled_batches)


# 4. Generate embeddings & calculate similarities
e1_l = encode_labse(sents1)
e2_l = encode_labse(sents2)
e1_n = encode_nllb(sents1)
e2_n = encode_nllb(sents2)

def cosine(a, b, eps=1e-12):
    """Row-wise cosine similarity between two equally shaped 2-D arrays.

    Args:
        a: Array of shape (n, d).
        b: Array of shape (n, d), row-aligned with ``a``.
        eps: Lower bound on the product of the two row norms, so a pair containing
            an all-zero vector scores 0.0 instead of NaN.

    Returns:
        1-D array of length n holding the cosine similarity of each row pair.
    """
    dots = np.sum(a * b, axis=1)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    # Clamping leaves every non-degenerate pair unchanged.
    return dots / np.maximum(norms, eps)

sim_l = cosine(e1_l, e2_l)
sim_n = cosine(e1_n, e2_n)

from scipy.stats import pearsonr
r_l, _ = pearsonr(sim_l, scores)
r_n, _ = pearsonr(sim_n, scores)

print(f"✅ LaBSE Pearson r = {r_l:.4f}")
print(f"✅ NLLB  Pearson r = {r_n:.4f}")
# 2. Initialize MiniLM model
from sentence_transformers import SentenceTransformer
pm_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2').to(device)

# 3. Define encode function
def encode_minilm(texts, bs=32):
    """Encode ``texts`` with ``pm_model`` in batches of ``bs``; returns NumPy embeddings."""
    encode_kwargs = dict(batch_size=bs, convert_to_numpy=True, show_progress_bar=True)
    return pm_model.encode(texts, **encode_kwargs)

# 4. Generate embeddings & calculate similarities
e1_p = encode_minilm(sents1)
e2_p = encode_minilm(sents2)
sim_p = cosine(e1_p, e2_p)

# 5. Compute Pearson r
r_p, _ = pearsonr(sim_p, scores)
print(f"✅ MiniLM Pearson r = {r_p:.4f}")

# # 5. Visualization
# import matplotlib.pyplot as plt

# plt.figure(figsize=(6,5))
# plt.scatter(scores, sim_l, alpha=0.5, label=f"LaBSE (r={r_l:.2f})")
# plt.scatter(scores, sim_n, alpha=0.5, label=f"NLLB (r={r_n:.2f})", color='orange')
# plt.xlabel("Gold Similarity")
# plt.ylabel("Cosine Similarity")
# plt.title("Semantic Similarity — STS17 (Russian)")
# plt.legend()
# plt.grid(ls="--", alpha=0.3)
# plt.show()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/349 [00:00<?, ?B/s]

sts_rumteb_train.json: 0.00B [00:00, ?B/s]

sts_rumteb_test.json: 0.00B [00:00, ?B/s]

sts_rumteb_dev.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/5224 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1264 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1336 [00:00<?, ? examples/s]

✅ Loaded 1264 Russian sentence pairs from ai-forever/ru-stsbenchmark-sts
✅ LaBSE Pearson r = 0.7357
✅ NLLB  Pearson r = 0.6996


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

✅ MiniLM Pearson r = 0.7893


In [None]:
#@title 🚀 Fine-tuning SBERT RU–TK Embeddings

# 0. Install dependencies
# !pip install -q -U datasets sentence-transformers sklearn faiss-cpu

# 1. Imports
import gzip
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from torch.utils.data import DataLoader

# 2. Cleaning & loading data
def clean(text):
    """Replace U+00A0 and U+202F with plain spaces, then strip both ends."""
    for odd_space in ("\xa0", "\u202f"):
        text = text.replace(odd_space, " ")
    return text.strip()

def load_pairs(src_path, trg_path, gzip_in=True):
    """Read two line-aligned UTF-8 files into a list of ``{"ru": ..., "tk": ...}`` rows.

    Args:
        src_path: File whose lines become the ``"ru"`` values.
        trg_path: File whose lines become the ``"tk"`` values.
        gzip_in: When true both files are opened with ``gzip.open`` in text mode,
            otherwise with the built-in ``open``.

    Lines are paired with ``zip`` (so reading stops at the shorter file) and each
    side is passed through ``clean``.
    """
    if gzip_in:
        opener, mode = gzip.open, 'rt'
    else:
        opener, mode = open, 'r'
    with opener(src_path, mode, encoding='utf-8') as src_file, \
         opener(trg_path, mode, encoding='utf-8') as trg_file:
        return [
            {"ru": clean(ru_line), "tk": clean(tk_line)}
            for ru_line, tk_line in zip(src_file, trg_file)
        ]

# Load & split
all_rows = load_pairs("/content/train.src.gz", "/content/train.trg.gz", gzip_in=True)[:1000]
train_rows, temp = train_test_split(all_rows, test_size=0.10, random_state=42)
val_rows, extra_test = train_test_split(temp, test_size=0.5, random_state=42)
test_rows = load_pairs("/content/test.src", "/content/test.trg", gzip_in=False) + extra_test

dataset = DatasetDict({
    "train": Dataset.from_list(train_rows),
    "validation": Dataset.from_list(val_rows),
    "test": Dataset.from_list(test_rows)
})
print(dataset)

# 3. Create InputExample datasets
train_examples = [InputExample(texts=[r["ru"], r["tk"]]) for r in train_rows]
val_sents1 = [r["ru"] for r in val_rows]
val_sents2 = [r["tk"] for r in val_rows]
val_labels = [1]*len(val_rows)  # all positive

test_sents1 = [r["ru"] for r in test_rows]
test_sents2 = [r["tk"] for r in test_rows]
test_labels = [1]*len(test_rows)

# 4. Initialize model and DataLoader
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
train_loader = DataLoader(train_examples, shuffle=True, batch_size=32)

# 5. Loss & evaluators
train_loss = losses.MultipleNegativesRankingLoss(model)

val_evaluator = BinaryClassificationEvaluator(
    sentences1=val_sents1,
    sentences2=val_sents2,
    labels=val_labels,
    name="val-binary"
)
test_evaluator = BinaryClassificationEvaluator(
    sentences1=test_sents1,
    sentences2=test_sents2,
    labels=test_labels,
    name="test-binary"
)

original_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
similarity_before = original_model.similarity(
    original_model.encode(["Привет"]), original_model.encode(["Salam"])
)
print("⚖️ Similarity BEFORE fine-tuning:", similarity_before)


# 6. Training
num_epochs = 3
warmup_steps = int(len(train_loader) * num_epochs * 0.1)

model.fit(
    train_objectives=[(train_loader, train_loss)],
    evaluator=val_evaluator,
    epochs=num_epochs,
    evaluation_steps=500,
    warmup_steps=warmup_steps,
    output_path="fine-tuned-ru-tk-minilm",
    use_amp=True
)

# 7. Evaluate on test set
model = SentenceTransformer("fine-tuned-ru-tk-minilm")
print("🔎 Test Set Performance:")
# A bare call in the middle of a cell is not displayed by the notebook, so the
# evaluator's result was being discarded; capture it and print it explicitly.
# NOTE(review): test_labels are all 1, so threshold-based scores from this
# evaluator say little about how well non-matching pairs are rejected.
test_metrics = test_evaluator(model)
print(test_metrics)

# 8. Save and demo encode usage
model.save("fine_tuned_ru_tk_embeddings")
print("Sample embed similarity:",
      model.similarity(model.encode(["Привет"]), model.encode(["Salam"])))


DatasetDict({
    train: Dataset({
        features: ['ru', 'tk'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['ru', 'tk'],
        num_rows: 50
    })
    test: Dataset({
        features: ['ru', 'tk'],
        num_rows: 59
    })
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

⚖️ Similarity BEFORE fine-tuning: tensor([[0.1569]])


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjelal[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Val-binary Cosine Accuracy,Val-binary Cosine Accuracy Threshold,Val-binary Cosine F1,Val-binary Cosine F1 Threshold,Val-binary Cosine Precision,Val-binary Cosine Recall,Val-binary Cosine Ap,Val-binary Cosine Mcc
29,No log,No log,0.98,0.215772,0.989899,0.215772,1.0,0.98,1.0,0.0
58,No log,No log,0.98,0.250371,0.989899,0.250371,1.0,0.98,1.0,0.0
87,No log,No log,0.98,0.251095,0.989899,0.251095,1.0,0.98,1.0,0.0


🔎 Test Set Performance:
Sample embed similarity: tensor([[0.3924]])


In [None]:
#@title 🚀 MiniLM (Multilingual) — Before/After Fine-tuning on RU–TK for Bitext & STS

# =========================================================
# 0) Setup
# =========================================================
!pip -q install sentence-transformers datasets faiss-cpu numpy scipy tqdm wget

import os, gzip, tarfile, random, logging, math, wget, pathlib
import numpy as np
from tqdm import tqdm
from typing import List, Tuple
import torch
from datasets import load_dataset
from scipy.stats import pearsonr, spearmanr
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader

# Reproducibility
seed = 42
random.seed(seed); np.random.seed(seed)
torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🖥️ Using device: {device}")

# =========================================================
# 1) Download Tatoeba rus–tuk data & locate files
# =========================================================
URL = "https://object.pouta.csc.fi/Tatoeba-Challenge-v2023-09-26/rus-tuk.tar"
WORKDIR = "data/data"
os.makedirs(WORKDIR, exist_ok=True)
tar_path = os.path.join(WORKDIR, "rus-tuk.tar")

if not os.path.exists(tar_path):
    print("⬇️ Downloading Tatoeba rus–tuk tar...")
    wget.download(URL, tar_path)
    print()
else:
    print("✅ Archive already exists. Skipping download.")

print("📦 Extracting archive (idempotent)...")
with tarfile.open(tar_path, "r") as tar:
    def is_within_directory(directory, target):
        """Return True when ``target`` resolves to ``directory`` or a path beneath it."""
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        # commonpath compares whole path components. commonprefix compares characters,
        # so it would accept a sibling such as "<dir>_evil/x" as being inside "<dir>".
        try:
            return os.path.commonpath([abs_directory, abs_target]) == abs_directory
        except ValueError:
            # Raised when the paths cannot share a root (e.g. different drives).
            return False
    def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
        """Extract ``tar`` into ``path`` after checking no member escapes ``path``.

        Raises:
            Exception: if any member would be written outside ``path``.
        """
        for member in tar.getmembers():
            member_path = os.path.join(path, member.name)
            if not is_within_directory(path, member_path):
                raise Exception("Attempted Path Traversal in Tar File")
        # Where extraction filters exist, also use the "data" filter: it blocks unsafe
        # links and silences the extractall deprecation warning.
        filter_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        tar.extractall(path, members, numeric_owner=numeric_owner, **filter_kwargs)
    safe_extract(tar, path=WORKDIR)

base_dir = os.path.join(WORKDIR, "release", "v2023-09-26", "rus-tuk")
assert os.path.isdir(base_dir), f"Could not find base dir: {base_dir}"
print("📁 Base dir:", base_dir)

# Detect files robustly
def find(path_candidates: List[str]) -> str:
    """Return the first candidate that is an existing regular file, or "" if none is."""
    return next((p for p in path_candidates if os.path.isfile(p)), "")

# Test files (plain text)
test_src = find([os.path.join(base_dir, "test.src"),
                 os.path.join(base_dir, "test.ru"),
                 os.path.join(base_dir, "test.rus")])
test_trg = find([os.path.join(base_dir, "test.trg"),
                 os.path.join(base_dir, "test.tk"),
                 os.path.join(base_dir, "test.tuk")])

assert test_src and test_trg, "Could not find test.src/test.trg in the archive."

# Training files (gz) — fall back to dev.* if train.* missing
train_src_gz = find([os.path.join(base_dir, "train.src.gz"),
                     os.path.join(base_dir, "train.ru.gz"),
                     os.path.join(base_dir, "train.rus.gz")])
train_trg_gz = find([os.path.join(base_dir, "train.trg.gz"),
                     os.path.join(base_dir, "train.tk.gz"),
                     os.path.join(base_dir, "train.tuk.gz")])

if not (train_src_gz and train_trg_gz):
    print("⚠️ train.*.gz not found — falling back to dev.*.gz")
    train_src_gz = find([os.path.join(base_dir, "dev.src.gz"),
                         os.path.join(base_dir, "dev.ru.gz"),
                         os.path.join(base_dir, "dev.rus.gz")])
    train_trg_gz = find([os.path.join(base_dir, "dev.trg.gz"),
                         os.path.join(base_dir, "dev.tk.gz"),
                         os.path.join(base_dir, "dev.tuk.gz")])

assert train_src_gz and train_trg_gz, "Could not find train/dev gz files."

print("🧪 Test:", test_src, "|", test_trg)
print("🧑‍🏫 Train:", train_src_gz, "|", train_trg_gz)

# =========================================================
# 2) IO helpers
# =========================================================
def clean_text(s: str) -> str:
    """Map U+00A0 and U+202F to a plain space, then strip leading/trailing whitespace."""
    return s.translate({0xA0: " ", 0x202F: " "}).strip()

def read_parallel_plain(src_path: str, trg_path: str, limit: int = None) -> List[Tuple[str, str]]:
    """Read two line-aligned plain-text UTF-8 files into ``(source, target)`` pairs.

    Each line is passed through ``clean_text``; a pair is kept only when both sides
    are non-empty afterwards. Reading stops once ``limit`` pairs have been collected;
    a falsy ``limit`` (``None`` or 0) means no cap.
    """
    pairs = []
    with open(src_path, "r", encoding="utf-8") as src_file, \
         open(trg_path, "r", encoding="utf-8") as trg_file:
        for raw_src, raw_trg in zip(src_file, trg_file):
            pair = (clean_text(raw_src), clean_text(raw_trg))
            if all(pair):
                pairs.append(pair)
            # The cap is checked after every line, whether or not it was kept.
            if limit and len(pairs) >= limit:
                break
    return pairs

def read_parallel_gz(src_path: str, trg_path: str, limit: int = None) -> List[Tuple[str, str]]:
    """Read two line-aligned gzip-compressed UTF-8 files into ``(source, target)`` pairs.

    Each line is passed through ``clean_text``; a pair is kept only when both sides
    are non-empty afterwards. Reading stops once ``limit`` pairs have been collected;
    a falsy ``limit`` (``None`` or 0) means no cap.
    """
    pairs = []
    with gzip.open(src_path, "rt", encoding="utf-8") as src_file, \
         gzip.open(trg_path, "rt", encoding="utf-8") as trg_file:
        for raw_src, raw_trg in zip(src_file, trg_file):
            pair = (clean_text(raw_src), clean_text(raw_trg))
            if all(pair):
                pairs.append(pair)
            # The cap is checked after every line, whether or not it was kept.
            if limit and len(pairs) >= limit:
                break
    return pairs

# =========================================================
# 3) Load Data
# =========================================================
test_pairs = read_parallel_plain(test_src, test_trg)  # full test for evaluation
print(f"✅ Loaded {len(test_pairs)} test pairs for bitext retrieval.")

# Keep training reasonably small for Colab runtime; adjust as desired
MAX_TRAIN = 2000    # cap for speed; set to None to use all available
train_pairs = read_parallel_gz(train_src_gz, train_trg_gz, limit=MAX_TRAIN)
print(f"✅ Loaded {len(train_pairs)} training pairs for fine-tuning.")

# =========================================================
# 4) Models & Evaluation Helpers
# =========================================================
BASE_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

def embed(model: SentenceTransformer, texts: List[str], batch_size: int = 64) -> np.ndarray:
    """Encode ``texts`` with ``model.encode`` and return un-normalised NumPy embeddings.

    The progress bar is disabled and ``normalize_embeddings`` is off.
    """
    encode_options = dict(
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=False,
        normalize_embeddings=False,
    )
    return model.encode(texts, **encode_options)

def l2norm(x: np.ndarray) -> np.ndarray:
    """Scale each row of ``x`` by its L2 norm.

    1e-12 is added to every norm so an all-zero row divides safely and stays zero.
    """
    row_lengths = np.linalg.norm(x, axis=1, keepdims=True)
    return x / (row_lengths + 1e-12)

def bitext_metrics(src_emb: np.ndarray, tgt_emb: np.ndarray) -> Tuple[float, float]:
    """Compute P@1 and MRR for row-aligned source/target embeddings.

    Similarities are dot products of the ``l2norm``-ed rows. The gold target for
    source row ``i`` is target row ``i``; its rank is one plus the number of targets
    scoring strictly higher.
    """
    scores = l2norm(src_emb) @ l2norm(tgt_emb).T
    n_pairs = scores.shape[0]
    diag = np.arange(n_pairs)

    # P@1: how often the best-scoring target is the aligned one.
    hits = (scores.argmax(axis=1) == diag).sum()
    p1 = float(hits) / n_pairs

    # MRR: reciprocal rank of the aligned target, averaged over all sources.
    gold = scores[diag, diag]
    ranks = (scores > gold[:, None]).sum(axis=1) + 1
    mrr = (1.0 / ranks).mean()
    return p1, mrr

def eval_bitext(model: SentenceTransformer, pairs: List[Tuple[str,str]], tag: str):
    """Embed both sides of ``pairs`` with ``model``, print and return ``(P@1, MRR)``.

    ``tag`` only labels the printed line.
    """
    src_texts, tgt_texts = [], []
    for src_sentence, tgt_sentence in pairs:
        src_texts.append(src_sentence)
        tgt_texts.append(tgt_sentence)
    # Source side is embedded first, then the target side.
    p1, mrr = bitext_metrics(embed(model, src_texts), embed(model, tgt_texts))
    print(f"🔎 Bitext ({tag}) — P@1: {p1:.4f} | MRR: {mrr:.4f}")
    return p1, mrr

def cosine_batch(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Row-wise cosine similarity: the dot product of the ``l2norm``-ed rows of ``a`` and ``b``."""
    unit_a = l2norm(a)
    unit_b = l2norm(b)
    return np.sum(unit_a * unit_b, axis=1)

def eval_sts_ru(model: SentenceTransformer, tag: str):
    """Correlate ``model``'s cosine similarities with gold scores on the Russian STS test split.

    Loads the ``test`` split of ``ai-forever/ru-stsbenchmark-sts``, embeds both
    sentence columns, and prints and returns ``(pearson, spearman)``.
    ``tag`` only labels the printed line.
    """
    # NOTE(review): the printed label says "STS17-RU" but the dataset loaded is
    # ai-forever/ru-stsbenchmark-sts — confirm the naming.
    sts = load_dataset("ai-forever/ru-stsbenchmark-sts", split="test")
    first, second = sts["sentence1"], sts["sentence2"]
    gold = np.array(sts["score"], dtype=float)
    first_emb = embed(model, first, batch_size=256)
    second_emb = embed(model, second, batch_size=256)
    sim = cosine_batch(first_emb, second_emb)
    pr = pearsonr(sim, gold)[0]
    sr = spearmanr(sim, gold)[0]
    print(f"🔎 STS17-RU ({tag}) — Pearson: {pr:.4f} | Spearman: {sr:.4f}")
    return pr, sr

# =========================================================
# 5) BEFORE — Baseline MiniLM (multilingual)
# =========================================================
baseline = SentenceTransformer(BASE_MODEL, device=device)
print("\n=== 📊 Baseline (Before fine-tuning) ===")
b_p1, b_mrr = eval_bitext(baseline, test_pairs, tag="baseline")
b_pr, b_sr = eval_sts_ru(baseline, tag="baseline")

# =========================================================
# 6) Fine-tune on RU–TK with MultipleNegativesRankingLoss
# =========================================================
print("\n=== 🛠️ Fine-tuning MiniLM on RU–TK (MultipleNegativesRankingLoss) ===")
train_examples = [InputExample(texts=[ru, tk]) for (ru, tk) in train_pairs]
train_loader = DataLoader(train_examples, shuffle=True, batch_size=64, drop_last=True)

model = SentenceTransformer(BASE_MODEL, device=device)

# A lightweight schedule for Colab — adjust epochs for better quality if you have time/GPU
num_epochs = 10
warmup_steps = int(len(train_loader) * num_epochs * 0.1)

train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(
    train_objectives=[(train_loader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    use_amp=True,
    show_progress_bar=True,
    output_path="minilm-ru-tk-finetuned"  # saved automatically
)

# Load the saved (ensures we're using the exact persisted weights)
finetuned = SentenceTransformer("minilm-ru-tk-finetuned", device=device)

# =========================================================
# 7) AFTER — Evaluate again
# =========================================================
print("\n=== 📊 After Fine-tuning ===")
a_p1, a_mrr = eval_bitext(finetuned, test_pairs, tag="after")
a_pr, a_sr = eval_sts_ru(finetuned, tag="after")

# =========================================================
# 8) Pretty Results Table
# =========================================================
import pandas as pd
summary = pd.DataFrame([
    {"Setting":"Before (Baseline)", "Bitext P@1": b_p1, "Bitext MRR": b_mrr, "STS17-RU Pearson": b_pr, "STS17-RU Spearman": b_sr},
    {"Setting":"After (Fine-tuned)", "Bitext P@1": a_p1, "Bitext MRR": a_mrr, "STS17-RU Pearson": a_pr, "STS17-RU Spearman": a_sr},
])
# nice formatting
def fmt(x):
    """Format Python/NumPy floats with four decimals; return any other value unchanged."""
    if isinstance(x, (float, np.floating)):
        return format(x, ".4f")
    return x
print("\n==================== RESULTS ====================")
for _, row in summary.iterrows():
    print(f"{row['Setting']:>18} | "
          f"P@1: {fmt(row['Bitext P@1'])} | "
          f"MRR: {fmt(row['Bitext MRR'])} | "
          f"Pearson: {fmt(row['STS17-RU Pearson'])} | "
          f"Spearman: {fmt(row['STS17-RU Spearman'])}")
print("================================================\n")

summary


🖥️ Using device: cuda
✅ Archive already exists. Skipping download.
📦 Extracting archive (idempotent)...
📁 Base dir: data/data/release/v2023-09-26/rus-tuk
🧪 Test: data/data/release/v2023-09-26/rus-tuk/test.src | data/data/release/v2023-09-26/rus-tuk/test.trg
🧑‍🏫 Train: data/data/release/v2023-09-26/rus-tuk/train.src.gz | data/data/release/v2023-09-26/rus-tuk/train.trg.gz
✅ Loaded 9 test pairs for bitext retrieval.
✅ Loaded 2000 training pairs for fine-tuning.


  tar.extractall(path, members, numeric_owner=numeric_owner)



=== 📊 Baseline (Before fine-tuning) ===
🔎 Bitext (baseline) — P@1: 0.5556 | MRR: 0.7011
🔎 STS17-RU (baseline) — Pearson: 0.7893 | Spearman: 0.7955

=== 🛠️ Fine-tuning MiniLM on RU–TK (MultipleNegativesRankingLoss) ===


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss



=== 📊 After Fine-tuning ===
🔎 Bitext (after) — P@1: 0.7778 | MRR: 0.8704
🔎 STS17-RU (after) — Pearson: 0.7847 | Spearman: 0.7864

 Before (Baseline) | P@1: 0.5556 | MRR: 0.7011 | Pearson: 0.7893 | Spearman: 0.7955
After (Fine-tuned) | P@1: 0.7778 | MRR: 0.8704 | Pearson: 0.7847 | Spearman: 0.7864



Unnamed: 0,Setting,Bitext P@1,Bitext MRR,STS17-RU Pearson,STS17-RU Spearman
0,Before (Baseline),0.555556,0.701058,0.789284,0.795497
1,After (Fine-tuned),0.777778,0.87037,0.784675,0.786372


In [None]:
#@title 🚀 MiniLM + LoRA: Two Adapters (Bitext & STS) — Self-Contained Colab Script (Adapter A tuned)
# Trains TWO separate LoRA adapters on MiniLM:
#  • Adapter A (Bitext/MNRL) — stronger config (r=32, lr=1e-4, 5 epochs, bigger batch) + large eval set
#  • Adapter B (STS/Cosine)  — standard config
# Includes robust Tatoeba path detection and summary tables.

# --- Installs ---
!pip -q install sentence-transformers datasets peft accelerate faiss-cpu numpy scipy pandas tqdm wget

import os, tarfile, gzip, glob, random, logging, math, wget, pathlib
import numpy as np
import pandas as pd
from typing import List, Tuple, Optional
from tqdm import tqdm
from scipy.stats import pearsonr, spearmanr

import torch
from torch.utils.data import DataLoader

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

from peft import LoraConfig, get_peft_model, TaskType, PeftModel

# --------------------
# Reproducibility & Env
# --------------------
seed = 42
random.seed(seed); np.random.seed(seed)
torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"   # suppress W&B logging (may show deprecation warning)
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

device = "cuda" if torch.cuda.is_available() else "cpu"
print("🖥️ Device:", device)

BASE_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
MAX_SEQ_LEN = 128

# =============================================================================
# 1) Data: Tatoeba RU–TK (bitext)  &  STS17 Russian (ai-forever/ru-stsbenchmark-sts)
# =============================================================================
URL = "https://object.pouta.csc.fi/Tatoeba-Challenge-v2023-09-26/rus-tuk.tar"
WORKDIR = "data"; os.makedirs(WORKDIR, exist_ok=True)
tar_path = os.path.join(WORKDIR, "rus-tuk.tar")

if not os.path.exists(tar_path):
    print("⬇️ Downloading Tatoeba rus–tuk …")
    wget.download(URL, tar_path); print()

print("📦 Extracting (idempotent)…")
with tarfile.open(tar_path, "r") as tar:
    # Safe extract (prevents path traversal)
    def is_within_directory(directory, target):
        """Return True when ``target`` resolves to ``directory`` or a path beneath it."""
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        # commonpath compares whole path components. commonprefix compares characters,
        # so it would accept a sibling such as "<dir>_evil/x" as being inside "<dir>".
        try:
            return os.path.commonpath([abs_directory, abs_target]) == abs_directory
        except ValueError:
            # Raised when the paths cannot share a root (e.g. different drives).
            return False
    def safe_extract(tarobj, path=".", members=None, *, numeric_owner=False):
        """Extract ``tarobj`` into ``path`` after checking no member escapes ``path``.

        Raises:
            Exception: if any member would be written outside ``path``.
        """
        for m in tarobj.getmembers():
            mpath = os.path.join(path, m.name)
            if not is_within_directory(path, mpath):
                raise Exception("Tar path traversal")
        # Where extraction filters exist, also use the "data" filter: it blocks unsafe
        # links and avoids the extractall deprecation warning.
        filter_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        tarobj.extractall(path, members, numeric_owner=numeric_owner, **filter_kwargs)
    safe_extract(tar, path=WORKDIR)

# 🔧 Robust path detection for possible data/data/... layouts
candidates = []
for pat in [
    os.path.join(WORKDIR, "release", "v2023-09-26", "rus-tuk"),
    os.path.join(WORKDIR, "data", "release", "v2023-09-26", "rus-tuk"),
    os.path.join(WORKDIR, "**", "release", "v2023-09-26", "rus-tuk"),
]:
    candidates += glob.glob(pat, recursive=True)
if not candidates:
    raise FileNotFoundError("Could not locate 'rus-tuk' directory after extraction.")
base_dir = max(candidates, key=len)
print("📁 Using base_dir:", base_dir)

def find_one(name_options):
    """Return the first existing file in `base_dir` among `name_options`.

    Candidates are tried in the given order; each name is joined onto the
    module-level `base_dir`. Returns None when none of them is a regular file.
    """
    candidate_paths = (os.path.join(base_dir, option) for option in name_options)
    return next((path for path in candidate_paths if os.path.isfile(path)), None)

# Accept a few naming variants for each file (generic .src/.trg or language codes).
test_src = find_one(["test.src", "test.ru", "test.rus"])
test_trg = find_one(["test.trg", "test.tk", "test.tuk"])

# Optional dev files (for larger eval)
dev_src_gz = find_one(["dev.src.gz","dev.ru.gz","dev.rus.gz"])
dev_trg_gz = find_one(["dev.trg.gz","dev.tk.gz","dev.tuk.gz"])

# Training data: prefer the train split, fall back to dev when train is absent.
# In that fallback case the train_* and dev_* variables name the same files.
train_src_gz = find_one(["train.src.gz","train.ru.gz","train.rus.gz"]) or dev_src_gz
train_trg_gz = find_one(["train.trg.gz","train.tk.gz","train.tuk.gz"]) or dev_trg_gz

assert test_src and test_trg, f"Missing test files in {base_dir}"
assert train_src_gz and train_trg_gz, f"Missing train/dev gz files in {base_dir}"

print("🧪 Test:", test_src, "|", test_trg)
print("🧑‍🏫 Train:", train_src_gz, "|", train_trg_gz)
if dev_src_gz and dev_trg_gz:
    print("🧪 Dev gz:", dev_src_gz, "|", dev_trg_gz)

def clean(s:str)->str:
    """Replace no-break spaces (U+00A0, U+202F) with plain spaces and trim the ends."""
    space_map = {0x00A0: " ", 0x202F: " "}
    return s.translate(space_map).strip()

def read_parallel_plain(src, trg, limit: Optional[int]=None):
    """Read aligned sentence pairs from two plain-text UTF-8 files.

    Lines are consumed in lockstep. Each side is passed through `clean`, and
    a pair is kept only when both sides are non-empty afterwards. Reading
    stops at the end of the shorter file, or once `limit` pairs have been
    collected (a falsy `limit` means no cap).

    Returns:
        list of (source, target) string tuples.
    """
    pairs = []
    with open(src, encoding="utf-8") as src_file, open(trg, encoding="utf-8") as trg_file:
        for raw_src, raw_trg in zip(src_file, trg_file):
            src_text = clean(raw_src)
            trg_text = clean(raw_trg)
            if src_text and trg_text:
                pairs.append((src_text, trg_text))
            if limit and len(pairs) >= limit:
                break
    return pairs

def read_parallel_gz(src_gz, trg_gz, start:int=0, limit: Optional[int]=None):
    """Read aligned sentence pairs from two gzip-compressed UTF-8 text files.

    The first `start` raw line pairs are skipped without inspection. After
    that each side is passed through `clean`, and a pair is kept only when
    both sides are non-empty. Reading stops at the end of the shorter file,
    or once `limit` pairs have been collected (a falsy `limit` means no cap).

    Returns:
        list of (source, target) string tuples.
    """
    pairs = []
    with gzip.open(src_gz, "rt", encoding="utf-8") as src_file, \
         gzip.open(trg_gz, "rt", encoding="utf-8") as trg_file:
        for line_no, (raw_src, raw_trg) in enumerate(zip(src_file, trg_file)):
            if line_no < start:
                continue
            src_text = clean(raw_src)
            trg_text = clean(raw_trg)
            if src_text and trg_text:
                pairs.append((src_text, trg_text))
            if limit and len(pairs) >= limit:
                break
    return pairs

# Keep training compact but meaningful
MAX_TRAIN_BITEXT = 8000          # number of bitext pairs used for training
BITEXT_EVAL_LARGE = 1000         # number of pairs in the larger retrieval eval set

bitext_test_small = read_parallel_plain(test_src, test_trg)   # 9 pairs (coarse)

# The dev split is a valid held-out set only when it is a different file from
# the one used for training. If training fell back to dev (no train.* files),
# reading dev from offset 0 for evaluation would fully overlap the training data.
dev_is_held_out = (
    bool(dev_src_gz and dev_trg_gz)
    and (dev_src_gz, dev_trg_gz) != (train_src_gz, train_trg_gz)
)
if dev_is_held_out:
    bitext_train = read_parallel_gz(train_src_gz, train_trg_gz, start=0, limit=MAX_TRAIN_BITEXT)
    bitext_eval_large = read_parallel_gz(dev_src_gz, dev_trg_gz, start=0, limit=BITEXT_EVAL_LARGE)
else:
    # Read train + eval in a single pass and split by pair count. Slicing the
    # cleaned pairs (rather than skipping a raw-line offset) guarantees the two
    # sets are disjoint even when blank lines were filtered out.
    _bitext_pool = read_parallel_gz(train_src_gz, train_trg_gz, start=0,
                                    limit=MAX_TRAIN_BITEXT + BITEXT_EVAL_LARGE)
    bitext_train = _bitext_pool[:MAX_TRAIN_BITEXT]
    bitext_eval_large = _bitext_pool[MAX_TRAIN_BITEXT:]

assert bitext_train, "No bitext training pairs could be read."
assert bitext_eval_large, "Not enough bitext pairs left for the large eval set."

print(f"✅ Bitext train: {len(bitext_train)} | small-test: {len(bitext_test_small)} | large-eval: {len(bitext_eval_large)}")

# Russian STS Benchmark (ai-forever/ru-stsbenchmark-sts): train / validation / test splits
ds_train = load_dataset("ai-forever/ru-stsbenchmark-sts", split="train")
ds_val   = load_dataset("ai-forever/ru-stsbenchmark-sts", split="validation")
ds_test  = load_dataset("ai-forever/ru-stsbenchmark-sts", split="test")
print(f"✅ STS RU: train={len(ds_train)} | val={len(ds_val)} | test={len(ds_test)}")

# =============================================================================
# 2) Evaluation helpers (Bitext & STS)
# =============================================================================
def l2norm(x: np.ndarray) -> np.ndarray:
    """Divide each row of a 2-D array by its Euclidean norm.

    A 1e-12 epsilon is added to every norm so all-zero rows yield zeros
    instead of a division by zero.
    """
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    safe_norms = row_norms + 1e-12
    return x / safe_norms

def eval_bitext(model: SentenceTransformer, pairs: List[Tuple[str,str]], tag=""):
    """Score bitext retrieval: for each source sentence, rank all targets by cosine.

    Source and target sides are encoded separately, L2-normalised, and compared
    via a full similarity matrix. The correct target for row i is column i.

    Args:
        model: Encoder exposing ``encode``.
        pairs: Aligned (source, target) sentence pairs.
        tag: Label included in the printed report line.

    Returns:
        (p1, mrr): precision@1 and mean reciprocal rank. A target's rank is
        one plus the number of targets scoring strictly higher than it.
    """
    sources = [src_text for src_text, _ in pairs]
    targets = [trg_text for _, trg_text in pairs]

    encode_opts = dict(batch_size=256, convert_to_numpy=True,
                       show_progress_bar=False, normalize_embeddings=False)
    src_emb = l2norm(model.encode(sources, **encode_opts))
    trg_emb = l2norm(model.encode(targets, **encode_opts))

    sim = src_emb @ trg_emb.T
    N = sim.shape[0]
    diag_idx = np.arange(N)

    top_hits = sim.argmax(axis=1) == diag_idx
    p1 = float(top_hits.sum())/N

    gold_scores = sim[diag_idx, diag_idx]
    better_count = (sim > gold_scores[:,None]).sum(axis=1)
    ranks = better_count + 1
    mrr = (1.0 / ranks).mean()

    print(f"🔎 Bitext {tag}: P@1={p1:.4f} | MRR={mrr:.4f} | N={N}")
    return p1, mrr

def eval_sts(model: SentenceTransformer, split, tag=""):
    """Correlate embedding cosine similarity with gold STS scores.

    Reads the ``sentence1``, ``sentence2`` and ``score`` columns of `split`,
    encodes both sentence columns, and computes the row-wise cosine between
    the L2-normalised embeddings.

    Args:
        model: Encoder exposing ``encode``.
        split: Mapping-like dataset split with the three columns above.
        tag: Label included in the printed report line.

    Returns:
        (pearson, spearman) correlation between cosine similarity and score.
    """
    first_sentences = split["sentence1"]
    second_sentences = split["sentence2"]
    y = np.array(split["score"], dtype=float)

    encode_opts = dict(batch_size=256, convert_to_numpy=True,
                       show_progress_bar=False, normalize_embeddings=False)
    emb_first = model.encode(first_sentences, **encode_opts)
    emb_second = model.encode(second_sentences, **encode_opts)

    # Row-wise dot product of unit vectors == cosine similarity per pair.
    sim = (l2norm(emb_first)*l2norm(emb_second)).sum(axis=1)

    pr, _ = pearsonr(sim, y)
    sr, _ = spearmanr(sim, y)
    print(f"🔎 STS {tag}: Pearson={pr:.4f} | Spearman={sr:.4f} | N={len(y)}")
    return pr, sr

# =============================================================================
# 3) Baseline evaluations (no LoRA)
# =============================================================================
# Unmodified base encoder: its scores are the reference row of the summary tables.
baseline = SentenceTransformer(BASE_MODEL, device=device)
baseline.max_seq_length = MAX_SEQ_LEN
print("\n=== 📊 Baseline ===")
# Bitext retrieval on the small test set and the larger eval set, then STS on the test split.
b_p1_s, b_mrr_s = eval_bitext(baseline, bitext_test_small, tag="(baseline-small)")
b_p1_L, b_mrr_L = eval_bitext(baseline, bitext_eval_large, tag="(baseline-large)")
b_pr,   b_sr    = eval_sts(baseline, ds_test, tag="(baseline)")

# =============================================================================
# 4) LoRA utility: attach LoRA to the internal transformer
# =============================================================================
def attach_lora_to_st_model(st_model: SentenceTransformer, r=32, alpha=64, dropout=0.05,
                            target_modules=("query","value")):
    """
    Replace the HF transformer inside a SentenceTransformer with a PEFT-LoRA wrapper.

    The first module's ``auto_model`` is wrapped via ``get_peft_model`` and
    swapped back in place, so `st_model` itself is mutated and returned.
    PEFT's trainable-parameter summary is printed as a side effect.

    Args:
        st_model: SentenceTransformer whose first module holds ``auto_model``.
        r: LoRA rank.
        alpha: Value passed as ``lora_alpha``.
        dropout: Value passed as ``lora_dropout``.
        target_modules: Module names to receive LoRA weights.

    Returns:
        The same `st_model`, now carrying the PEFT-wrapped transformer.
    """
    first_module = st_model._first_module()
    lora_config = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,
        r=r,
        lora_alpha=alpha,
        lora_dropout=dropout,
        target_modules=list(target_modules),
        inference_mode=False,
    )
    lora_model = get_peft_model(first_module.auto_model, lora_config)
    lora_model.print_trainable_parameters()
    first_module.auto_model = lora_model
    return st_model

# =============================================================================
# 5) Train Adapter A (Bitext) with Stronger MNRL config
# =============================================================================
print("\n=== 🛠️ Training LoRA Adapter A (Bitext, MNRL — stronger config) ===")
adapterA_dir = "adapters/bitext_lora"
os.makedirs(adapterA_dir, exist_ok=True)

# Fresh copy of the base encoder, with LoRA attached to the query/value modules.
model_bitext = SentenceTransformer(BASE_MODEL, device=device)
model_bitext.max_seq_length = MAX_SEQ_LEN
model_bitext = attach_lora_to_st_model(model_bitext, r=32, alpha=64, dropout=0.05,
                                       target_modules=("query","value"))

# Each example is a (Russian, Turkmen) pair; MNRL treats the other targets in
# the batch as negatives, so drop_last keeps every batch at the full size.
train_examples_bitext = [InputExample(texts=[ru, tk]) for (ru, tk) in bitext_train]
loader_bitext = DataLoader(train_examples_bitext, shuffle=True, batch_size=128, drop_last=True)
loss_bitext = losses.MultipleNegativesRankingLoss(model_bitext)

epochs_A = 5
warmup_steps_A = int(len(loader_bitext)*epochs_A*0.1)   # warm up over 10% of all steps
model_bitext.fit(
    train_objectives=[(loader_bitext, loss_bitext)],
    epochs=epochs_A,
    warmup_steps=warmup_steps_A,
    # Mixed precision is only usable on a GPU; the script explicitly supports
    # a CPU fallback, where requesting AMP unconditionally would fail or be ignored.
    use_amp=str(device).startswith("cuda"),
    show_progress_bar=True,
    output_path=None,
    optimizer_params={'lr': 1e-4}
)

# Save LoRA adapter ONLY
peft_model_bitext = model_bitext._first_module().auto_model  # this is a PeftModel
peft_model_bitext.save_pretrained(adapterA_dir)
print(f"💾 Saved LoRA adapter A to: {adapterA_dir}")

# Evaluate adapter A (attach to a fresh base)
# Reloading from disk checks that the saved adapter alone reproduces the gains.
print("\n=== 📊 Evaluate Adapter A (attach & test) ===")
base_for_evalA = SentenceTransformer(BASE_MODEL, device=device); base_for_evalA.max_seq_length = MAX_SEQ_LEN
base_transformerA = base_for_evalA._first_module().auto_model
base_for_evalA._first_module().auto_model = PeftModel.from_pretrained(base_transformerA, adapterA_dir)

a_p1_s, a_mrr_s = eval_bitext(base_for_evalA, bitext_test_small, tag="(adapter A, small)")
a_p1_L, a_mrr_L = eval_bitext(base_for_evalA, bitext_eval_large, tag="(adapter A, large)")
a_pr,   a_sr    = eval_sts(base_for_evalA, ds_test, tag="(adapter A)")

# =============================================================================
# 6) Train Adapter B (STS) with CosineSimilarityLoss (standard)
# =============================================================================
print("\n=== 🛠️ Training LoRA Adapter B (STS, Cosine) ===")
# Directory that will receive the saved STS adapter weights.
adapterB_dir = "adapters/sts_lora"
os.makedirs(adapterB_dir, exist_ok=True)

# Fresh copy of the base encoder so adapter B is trained independently of adapter A.
model_sts = SentenceTransformer(BASE_MODEL, device=device)
model_sts.max_seq_length = MAX_SEQ_LEN
# Smaller rank and higher dropout than the bitext adapter (r=16 vs 32, dropout=0.1 vs 0.05).
model_sts = attach_lora_to_st_model(model_sts, r=16, alpha=32, dropout=0.1,
                                    target_modules=("query","value"))

def to_examples(split):
    """Turn an STS split into InputExample objects with labels scaled by 1/5.

    Reads the ``sentence1``, ``sentence2`` and ``score`` columns of `split`;
    both sentences are passed through `clean` and the score is divided by 5.0.
    """
    columns = zip(split["sentence1"], split["sentence2"], split["score"])
    return [
        InputExample(texts=[clean(first), clean(second)], label=float(score)/5.0)
        for first, second, score in columns
    ]

train_ex_sts = to_examples(ds_train)
val_ex_sts   = to_examples(ds_val)

loader_sts = DataLoader(train_ex_sts, shuffle=True, batch_size=64)
loss_sts = losses.CosineSimilarityLoss(model_sts)

# Validation correlation is reported during training (every 200 steps, see fit below).
val_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_ex_sts, name="sts-val")

epochs_B = 3
warmup_steps_B = int(len(loader_sts)*epochs_B*0.1)   # warm up over 10% of all steps
# NOTE(review): no optimizer_params are passed here, so fit() uses its default
# learning rate, whereas adapter A sets lr=1e-4 — confirm this is intended.
model_sts.fit(
    train_objectives=[(loader_sts, loss_sts)],
    epochs=epochs_B,
    warmup_steps=warmup_steps_B,
    # Mixed precision is only usable on a GPU; the script explicitly supports
    # a CPU fallback, where requesting AMP unconditionally would fail or be ignored.
    use_amp=str(device).startswith("cuda"),
    show_progress_bar=True,
    evaluator=val_evaluator,
    evaluation_steps=200,
    output_path=None
)

# Save only the LoRA adapter weights, not the full base model.
peft_model_sts = model_sts._first_module().auto_model
peft_model_sts.save_pretrained(adapterB_dir)
print(f"💾 Saved LoRA adapter B to: {adapterB_dir}")

# Evaluate adapter B (attach to a fresh base)
# Reloading from disk checks that the saved adapter alone reproduces the results.
print("\n=== 📊 Evaluate Adapter B (attach & test) ===")
base_for_evalB = SentenceTransformer(BASE_MODEL, device=device); base_for_evalB.max_seq_length = MAX_SEQ_LEN
base_transformerB = base_for_evalB._first_module().auto_model
base_for_evalB._first_module().auto_model = PeftModel.from_pretrained(base_transformerB, adapterB_dir)

b2_p1_s, b2_mrr_s = eval_bitext(base_for_evalB, bitext_test_small, tag="(adapter B, small)")
b2_p1_L, b2_mrr_L = eval_bitext(base_for_evalB, bitext_eval_large, tag="(adapter B, large)")
b2_pr,   b2_sr    = eval_sts(base_for_evalB, ds_test, tag="(adapter B)")

# =============================================================================
# 7) Summary Tables
# =============================================================================
# One record per setting:
# (label, small P@1, small MRR, STS Pearson, STS Spearman, large P@1, large MRR)
_summary_rows = [
    ("Baseline",                b_p1_s,  b_mrr_s,  b_pr,  b_sr,  b_p1_L,  b_mrr_L),
    ("Adapter A (Bitext/MNRL)", a_p1_s,  a_mrr_s,  a_pr,  a_sr,  a_p1_L,  a_mrr_L),
    ("Adapter B (STS/Cosine)",  b2_p1_s, b2_mrr_s, b2_pr, b2_sr, b2_p1_L, b2_mrr_L),
]

# Small bitext test set plus STS correlations.
summary_small = pd.DataFrame([
    {
        "Setting": label,
        "Bitext P@1 (9)": p1_small,
        "Bitext MRR (9)": mrr_small,
        "STS Pearson": pearson,
        "STS Spearman": spearman,
    }
    for label, p1_small, mrr_small, pearson, spearman, _p1_large, _mrr_large in _summary_rows
])

# Larger bitext retrieval set only.
summary_large = pd.DataFrame([
    {
        "Setting": label,
        "Bitext P@1 (N)": p1_large,
        "Bitext MRR (N)": mrr_large,
    }
    for label, _p1_small, _mrr_small, _pearson, _spearman, p1_large, mrr_large in _summary_rows
])

def fmt(x):
    """Render Python/NumPy floats with four decimals; return anything else unchanged."""
    if not isinstance(x, (float, np.floating)):
        return x
    return f"{x:.4f}"

# Plain-text report for the small bitext test set plus STS correlations.
# The "(9)" in the header and column names is hard-coded, not derived from the data.
print("\n==================== RESULTS — SMALL (9) ====================")
for _,r in summary_small.iterrows():
    # Setting label is right-aligned to 26 characters; metrics go through fmt().
    print(f"{r['Setting']:>26} | P@1:{fmt(r['Bitext P@1 (9)'])} | MRR:{fmt(r['Bitext MRR (9)'])} | "
          f"Pearson:{fmt(r['STS Pearson'])} | Spearman:{fmt(r['STS Spearman'])}")
print("=============================================================\n")

# Plain-text report for the larger bitext retrieval set.
print("==================== RESULTS — LARGE (N≈1000) ================")
for _,r in summary_large.iterrows():
    print(f"{r['Setting']:>26} | P@1:{fmt(r['Bitext P@1 (N)'])} | MRR:{fmt(r['Bitext MRR (N)'])}")
print("=============================================================\n")

# Rich notebook rendering of the same tables.
print("📋 DataFrames (for copy/save):")
display(summary_small)
display(summary_large)


🖥️ Device: cuda
📦 Extracting (idempotent)…
📁 Using base_dir: data/data/release/v2023-09-26/rus-tuk
🧪 Test: data/data/release/v2023-09-26/rus-tuk/test.src | data/data/release/v2023-09-26/rus-tuk/test.trg
🧑‍🏫 Train: data/data/release/v2023-09-26/rus-tuk/train.src.gz | data/data/release/v2023-09-26/rus-tuk/train.trg.gz
✅ Bitext train: 8000 | small-test: 9 | large-eval: 1000


  tarobj.extractall(path, members, numeric_owner=numeric_owner)


✅ STS RU: train=5224 | val=1336 | test=1264

=== 📊 Baseline ===
🔎 Bitext (baseline-small): P@1=0.5556 | MRR=0.7011 | N=9
🔎 Bitext (baseline-large): P@1=0.0930 | MRR=0.1350 | N=1000
🔎 STS (baseline): Pearson=0.7893 | Spearman=0.7955 | N=1264

=== 🛠️ Training LoRA Adapter A (Bitext, MNRL — stronger config) ===


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


trainable params: 589,824 || all params: 118,243,584 || trainable%: 0.4988


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


💾 Saved LoRA adapter A to: adapters/bitext_lora

=== 📊 Evaluate Adapter A (attach & test) ===
🔎 Bitext (adapter A, small): P@1=0.7778 | MRR=0.8889 | N=9
🔎 Bitext (adapter A, large): P@1=0.5020 | MRR=0.5809 | N=1000
🔎 STS (adapter A): Pearson=0.7656 | Spearman=0.7699 | N=1264

=== 🛠️ Training LoRA Adapter B (STS, Cosine) ===
trainable params: 294,912 || all params: 117,948,672 || trainable%: 0.2500


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Sts-val Pearson Cosine,Sts-val Spearman Cosine
82,No log,No log,0.843458,0.845098
164,No log,No log,0.845931,0.846517
200,No log,No log,0.846434,0.846851
246,No log,No log,0.84665,0.84692


💾 Saved LoRA adapter B to: adapters/sts_lora

=== 📊 Evaluate Adapter B (attach & test) ===
🔎 Bitext (adapter B, small): P@1=0.5556 | MRR=0.7011 | N=9
🔎 Bitext (adapter B, large): P@1=0.1020 | MRR=0.1427 | N=1000
🔎 STS (adapter B): Pearson=0.7952 | Spearman=0.7991 | N=1264

                  Baseline | P@1:0.5556 | MRR:0.7011 | Pearson:0.7893 | Spearman:0.7955
   Adapter A (Bitext/MNRL) | P@1:0.7778 | MRR:0.8889 | Pearson:0.7656 | Spearman:0.7699
    Adapter B (STS/Cosine) | P@1:0.5556 | MRR:0.7011 | Pearson:0.7952 | Spearman:0.7991

                  Baseline | P@1:0.0930 | MRR:0.1350
   Adapter A (Bitext/MNRL) | P@1:0.5020 | MRR:0.5809
    Adapter B (STS/Cosine) | P@1:0.1020 | MRR:0.1427

📋 DataFrames (for copy/save):


Unnamed: 0,Setting,Bitext P@1 (9),Bitext MRR (9),STS Pearson,STS Spearman
0,Baseline,0.555556,0.701058,0.789284,0.795497
1,Adapter A (Bitext/MNRL),0.777778,0.888889,0.765572,0.769852
2,Adapter B (STS/Cosine),0.555556,0.701058,0.795191,0.799121


Unnamed: 0,Setting,Bitext P@1 (N),Bitext MRR (N)
0,Baseline,0.093,0.13503
1,Adapter A (Bitext/MNRL),0.502,0.580905
2,Adapter B (STS/Cosine),0.102,0.142722
