In [10]:
# 0️⃣ Install dependencies
# IPython shell escape: -q keeps pip quiet, -U upgrades to the newest releases.
# NOTE(review): versions are not pinned, so the installed library APIs may differ between runs.
!pip install -qU sentence-transformers datasets scikit-learn wandb

# 1️⃣ Imports and data preparation
import os, gzip, time, datetime as dt
import wandb
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import TranslationEvaluator
from torch.utils.data import DataLoader

# --- W&B: init run ---
# Timestamped run name so repeated executions get distinct names.
run_name = f"labse-ru-tk-mnr-{dt.datetime.now().strftime('%Y%m%d-%H%M%S')}"
wandb.init(
    project="LaBSE",
    name=run_name,
    job_type="finetune",
    # These values are read back through wandb.config further down
    # (model name, batch size, epochs, warmup, evaluation interval, split seed),
    # so the logged config is also the config that drives training.
    config={
        "base_model": "sentence-transformers/LaBSE",
        "loss": "MultipleNegativesRankingLoss",
        "batch_size": 32,
        "epochs": 3,
        "warmup_steps": 100,
        "evaluation_steps": 500,
        "split_seed": 42,
    },
)

def clean(text):
    """Normalize one line of bitext.

    No-break spaces (U+00A0) and narrow no-break spaces (U+202F) become
    ordinary spaces, then leading/trailing whitespace is stripped.
    """
    space_map = {0x00A0: " ", 0x202F: " "}
    return text.translate(space_map).strip()

# Read and clean training data
# The three gzip files are read in lockstep, one line per sentence pair.
# The id file is opened but its values are discarded (`_`).
# zip() stops at the shortest file, so a length mismatch is silently truncated.
train_rows = []
with gzip.open("train.id.gz", "rt", encoding="utf-8") as idf, \
     gzip.open("train.src.gz", "rt", encoding="utf-8") as srcf, \
     gzip.open("train.trg.gz", "rt", encoding="utf-8") as trgf:
    for _, src, trg in zip(idf, srcf, trgf):
        train_rows.append({"translation": {"ru": clean(src), "tk": clean(trg)}})

# Split into train/val/test (val + extra_test come from 10% of original train)
# First hold out 10%, then halve the held-out part; both splits use the logged seed.
train_data, temp = train_test_split(train_rows, test_size=0.10, random_state=wandb.config.split_seed)
val_data, extra_test = train_test_split(temp, test_size=0.5, random_state=wandb.config.split_seed)

# Read and clean original test data
# Same lockstep reading as above, but the test files are plain text (not gzipped).
test_rows = []
with open("test.id", "r", encoding="utf-8") as idf, \
     open("test.src", "r", encoding="utf-8") as srcf, \
     open("test.trg", "r", encoding="utf-8") as trgf:
    for _, src, trg in zip(idf, srcf, trgf):
        test_rows.append({"translation": {"ru": clean(src), "tk": clean(trg)}})

# Combine original + extra test
# The original test rows come first, followed by the held-out rows from the train file.
full_test = test_rows + extra_test

# ==== SIZE REPORT: RU–TK bitext ====
# Row counts of every split, printed for the notebook and logged once to W&B.
total_train_rows = len(train_rows)                       # original "train" file rows before split
n_train = len(train_data)                                # 90% of train_rows
n_val = len(val_data)                                    # 5% of train_rows
n_extra_test = len(extra_test)                           # 5% of train_rows
n_test_orig = len(test_rows)                             # original test file rows
n_test_final = len(full_test)                            # original test + extra_test (your eval set)

print("📊 RU–TK Bitext sizes")
print(f"  original train file     : {total_train_rows}")
print(f"    ├─ train (90%)        : {n_train}")
print(f"    ├─ val (5%)           : {n_val}")
print(f"    └─ extra_test (5%)    : {n_extra_test}")
print(f"  original test file      : {n_test_orig}")
print(f"  final test used         : {n_test_final}  (original test + extra_test)")

# Log sizes to W&B
wandb.log({
    "sizes/original_train_rows": total_train_rows,
    "sizes/train": n_train,
    "sizes/val": n_val,
    "sizes/extra_test": n_extra_test,
    "sizes/test_original": n_test_orig,
    "sizes/test_final": n_test_final
})

# Build DatasetDict
# Each row is {"translation": {"ru": ..., "tk": ...}}, as built by the loaders above.
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data),
    "test": Dataset.from_list(full_test),
})

# 2️⃣ Build InputExamples
# Training pairs are (ru, tk) InputExamples; validation/test are kept as two parallel
# lists (source sentences, target sentences) for the translation evaluator.
train_examples = [InputExample(texts=[r["translation"]["ru"], r["translation"]["tk"]])
                  for r in dataset["train"]]
val_src = [r["translation"]["ru"] for r in dataset["validation"]]
val_tgt = [r["translation"]["tk"] for r in dataset["validation"]]
test_src = [r["translation"]["ru"] for r in dataset["test"]]
test_tgt = [r["translation"]["tk"] for r in dataset["test"]]

print(f"🧮 Dataset sizes → Train: {len(train_examples)}, Val: {len(val_src)}, Test: {len(test_src)}")

# 3️⃣ Setup model, dataloader, loss, evaluator
# Model name and batch size come from the W&B run config defined at the top of the cell.
model = SentenceTransformer(wandb.config.base_model)
train_loader = DataLoader(train_examples, shuffle=True, batch_size=wandb.config.batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model)

# W&B: watch model (grads & params)
# log="all" records both gradients and parameters, every 100 steps.
wandb.watch(model, log="all", log_freq=100)

# --- W&B-aware evaluator wrapper ---
class WandbTranslationEvaluator(TranslationEvaluator):
    """TranslationEvaluator that also logs every evaluation result to W&B.

    Args:
        src_sentences: Source-language sentences.
        trg_sentences: Target-language sentences, aligned by index with the sources.
        name: Evaluator name passed to ``TranslationEvaluator``.
        split_name: Prefix for the W&B metric keys (e.g. ``"val"`` or ``"test"``).
    """

    def __init__(self, src_sentences, trg_sentences, name: str, split_name: str):
        super().__init__(src_sentences, trg_sentences, name=name, batch_size=64, show_progress_bar=False)
        self.split_name = split_name

    def __call__(self, model, output_path=None, epoch=-1, steps=-1):
        """Run the evaluation, log it to W&B and return the parent's result unchanged.

        Depending on the installed sentence-transformers release the parent
        returns either a single float or a dict of named metrics. Both are
        handled so that W&B always receives scalar values.
        """
        result = super().__call__(model, output_path, epoch, steps)

        payload = {
            "epoch": epoch if epoch is not None else -1,
            "step": steps if steps is not None else -1,
        }
        if isinstance(result, dict):
            # Dict-style result: log every metric under the split prefix ...
            for metric_name, value in result.items():
                payload[f"{self.split_name}/{metric_name}"] = value
            # ... and expose the evaluator's primary metric under the stable
            # "<split>/accuracy" key used by the rest of the notebook.
            primary = getattr(self, "primary_metric", None)
            if primary in result:
                payload[f"{self.split_name}/accuracy"] = result[primary]
        else:
            # Float-style result: a single accuracy score.
            payload[f"{self.split_name}/accuracy"] = result

        wandb.log(payload)
        return result

# Validation evaluator: RU→TK / TK→RU matching on the validation pairs, logged under "val/".
val_evaluator = WandbTranslationEvaluator(val_src, val_tgt, name="val_rus-tuk", split_name="val")

# 4️⃣ Fine-tune LaBSE (validation gets logged via our evaluator)
# Epochs, warmup and evaluation interval are all taken from the W&B run config.
output_dir = "labse_finetuned_rustuk"
model.fit(
    train_objectives=[(train_loader, train_loss)],
    epochs=wandb.config.epochs,
    warmup_steps=wandb.config.warmup_steps,
    evaluator=val_evaluator,
    evaluation_steps=wandb.config.evaluation_steps,
    output_path=output_dir
)

# 5️⃣ Final evaluation on test set (also W&B-logged)
test_evaluator = WandbTranslationEvaluator(test_src, test_tgt, name="test_rus-tuk", split_name="test")
test_result = test_evaluator(model, output_path=output_dir, epoch=wandb.config.epochs, steps=-1)

# The evaluator returns a float on older sentence-transformers releases and a dict
# of named metrics on newer ones; reduce either form to one scalar so that the
# formatted print and the run summary below always receive a number.
if isinstance(test_result, dict):
    primary_key = getattr(test_evaluator, "primary_metric", None)
    if primary_key not in test_result:
        primary_key = next((k for k in test_result if k.endswith("mean_accuracy")), None)
    if primary_key is None:
        raise ValueError(f"No accuracy metric found in evaluator result: {sorted(test_result)}")
    test_score = float(test_result[primary_key])
else:
    test_score = float(test_result)

print(f"✅ Final test accuracy: {test_score:.4f}")

# 6️⃣ Upload the fine-tuned model as a W&B artifact
artifact = wandb.Artifact(name=f"{run_name}-model", type="model", description="LaBSE fine-tuned on RU–TK bitext with MNR loss")
artifact.add_dir(output_dir)
wandb.log_artifact(artifact)

# Optional: mark summary metrics
wandb.run.summary["test/accuracy"] = test_score

wandb.finish()


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/9.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m6.6/9.5 MB[0m [31m199.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m9.5/9.5 MB[0m [31m206.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m113.2 MB/s[0m eta [36m0:00:00[0m
[?25h

📊 RU–TK Bitext sizes
  original train file     : 116565
    ├─ train (90%)        : 104908
    ├─ val (5%)           : 5828
    └─ extra_test (5%)    : 5829
  original test file      : 9
  final test used         : 5838  (original test + extra_test)
🧮 Dataset sizes → Train: 104908, Val: 5828, Test: 5838


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# ✅ Install dependencies
# NOTE(review): scipy and tqdm are imported below but not listed here; presumably they are
# already present in the environment (or pulled in as dependencies) — confirm.
!pip install --quiet sentence-transformers datasets transformers torch numpy scikit-learn

import os, torch, numpy as np, logging
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# 🔹 Setup logging
# INFO-level messages with a timestamp, written to the stream handler (notebook output).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

# Use the GPU when one is available; `device` is read by get_embeddings below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")

# 🔹 Utility Functions
def get_embeddings(model, sentences, batch_size=32):
    """Encode *sentences* in slices of *batch_size* and return one NumPy array.

    Each slice is encoded with ``normalize_embeddings=True`` on the module-level
    ``device``, moved to the CPU, and the per-slice tensors are concatenated.
    """
    slice_starts = range(0, len(sentences), batch_size)
    pieces = [
        model.encode(
            sentences[lo:lo + batch_size],
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
        ).cpu()
        for lo in slice_starts
    ]
    return torch.cat(pieces).numpy()

def bitext_retrieval(src_emb, tgt_emb):
    """Score source→target retrieval where row i of each matrix forms a gold pair.

    Returns:
        (precision@1, MRR). A source's rank is one plus the number of targets
        whose similarity is strictly greater than that of its gold target.
    """
    scores = cosine_similarity(src_emb, tgt_emb)
    idx = np.arange(scores.shape[0])

    # Precision@1: the best-scoring target is the aligned one.
    p1 = (scores.argmax(axis=1) == idx).mean()

    # Reciprocal rank of the gold target (similarities on the diagonal).
    gold = scores[idx, idx]
    n_better = (scores > gold[:, None]).sum(axis=1)
    mrr = (1.0 / (n_better + 1)).mean()
    return p1, mrr

def compute_sts_metrics(emb1, emb2, gold_scores):
    cos_sim = np.sum(emb1 * emb2, axis=1)
    pearson, _ = pearsonr(cos_sim, gold_scores)
    spearman, _ = spearmanr(cos_sim, gold_scores)
    return pearson, spearman

# 🔹 Load test bitext for Rus-Tuk
# Line i of the source file is the translation of line i of the target file.
SRC_FILE = "test.src"
TGT_FILE = "test.trg"
with open(SRC_FILE, encoding="utf-8") as f1, open(TGT_FILE, encoding="utf-8") as f2:
    src_sents = [line.strip() for line in f1]
    tgt_sents = [line.strip() for line in f2]
# Explicit check rather than `assert`, which is skipped when Python runs with -O.
if len(src_sents) != len(tgt_sents):
    raise ValueError(
        f"{SRC_FILE} has {len(src_sents)} lines but {TGT_FILE} has {len(tgt_sents)}; "
        "the bitext files must be line-aligned."
    )
logging.info(f"Loaded {len(src_sents)} test bitext pairs.")

# 🔹 Load the Russian STS benchmark (ai-forever/ru-stsbenchmark-sts), test split
sts = load_dataset("ai-forever/ru-stsbenchmark-sts", split="test")
sts_sents1 = sts["sentence1"]
sts_sents2 = sts["sentence2"]
sts_scores = np.array(sts["score"])
# The message names the dataset that is actually loaded (it is not STS17).
logging.info(f"Loaded ru-STS benchmark test set: {len(sts_sents1)} sentence pairs.")

# 🔹 Load pretrained and fine-tuned LaBSE models
# The fine-tuned model is read from the local directory "labse_finetuned_rustuk".
logging.info("Loading pretrained and fine-tuned LaBSE models...")
pretrained_labse = SentenceTransformer("sentence-transformers/LaBSE").to(device)
finetuned_labse = SentenceTransformer("labse_finetuned_rustuk").to(device)

# 🔹 Encode embeddings for Bitext Retrieval
# Both models encode the same source and target sentences so the scores are comparable.
logging.info("Encoding embeddings for Bitext Retrieval...")
src_pre = get_embeddings(pretrained_labse, src_sents)
tgt_pre = get_embeddings(pretrained_labse, tgt_sents)
src_ft  = get_embeddings(finetuned_labse, src_sents)
tgt_ft  = get_embeddings(finetuned_labse, tgt_sents)

# 🔹 Evaluate Bitext Retrieval
print("\n🎯 Bitext Retrieval Results:")
for name, src, tgt in [
    ("Pretrained LaBSE", src_pre, tgt_pre),
    ("Fine-tuned LaBSE", src_ft, tgt_ft)
]:
    p1, mrr = bitext_retrieval(src, tgt)
    print(f"{name:20} → Precision@1: {p1:.4f}, MRR: {mrr:.4f}")

# 🔹 Encode embeddings for STS
# Sentence 1 and sentence 2 of every STS pair are encoded separately, per model.
logging.info("Encoding embeddings for STS Evaluation...")
sts1_pre = get_embeddings(pretrained_labse, sts_sents1)
sts2_pre = get_embeddings(pretrained_labse, sts_sents2)
sts1_ft  = get_embeddings(finetuned_labse, sts_sents1)
sts2_ft  = get_embeddings(finetuned_labse, sts_sents2)

# 🔹 Evaluate STS
# Correlation between embedding similarity and the gold similarity scores.
print("\n📐 STS Results:")
pearson_pre, spearman_pre = compute_sts_metrics(sts1_pre, sts2_pre, sts_scores)
pearson_ft,  spearman_ft  = compute_sts_metrics(sts1_ft, sts2_ft, sts_scores)
print(f"Pretrained LaBSE     → Pearson r = {pearson_pre:.4f}, Spearman ρ = {spearman_pre:.4f}")
print(f"Fine-tuned LaBSE     → Pearson r = {pearson_ft:.4f}, Spearman ρ = {spearman_ft:.4f}")


README.md:   0%|          | 0.00/349 [00:00<?, ?B/s]

sts_rumteb_train.json: 0.00B [00:00, ?B/s]

sts_rumteb_test.json: 0.00B [00:00, ?B/s]

sts_rumteb_dev.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/5224 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1264 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1336 [00:00<?, ? examples/s]


🎯 Bitext Retrieval Results:
Pretrained LaBSE     → Precision@1: 0.8889, MRR: 0.9444
Fine-tuned LaBSE     → Precision@1: 1.0000, MRR: 1.0000

📐 STS Results:
Pretrained LaBSE     → Pearson r = 0.7357, Spearman ρ = 0.7335
Fine-tuned LaBSE     → Pearson r = 0.6911, Spearman ρ = 0.6863


In [8]:
# ============================
# ⚙️ CONFIG
# ============================
# These constants are logged to W&B as the run config, so they must match what
# the training calls actually use.
BITEXT_BATCH_SIZE = 32
BITEXT_EPOCHS = 5            # the bitext adapter is trained for 5 epochs
STS_BATCH_SIZE = 32
STS_EPOCHS = 3
MAX_SEQ_LEN = 256

# Your Rus–Tuk files (as in your working code)
TRAIN_ID = "train.id.gz"
TRAIN_SRC = "train.src.gz"   # Russian
TRAIN_TRG = "train.trg.gz"   # Turkmen
TEST_ID  = "test.id"
TEST_SRC = "test.src"
TEST_TRG = "test.trg"

# Output directories for the two LoRA adapters, and the base checkpoint they attach to
BITEXT_ADAPTER_DIR = "adapters/bitext"
STS_ADAPTER_DIR    = "adapters/sts"
BASE_MODEL_NAME    = "sentence-transformers/LaBSE"

# ============================
# 0️⃣ Install deps
# ============================
!pip -qU install "sentence-transformers>=2.6.1" "transformers>=4.41.0" \
                  "datasets>=2.19.0" "peft>=0.11.1" "scikit-learn>=1.3.0" \
                  "tqdm>=4.66.0" "numpy>=1.24.0" "wandb>=0.16.0"

# ============================
# 1️⃣ Imports & helpers
# ============================
import os, gzip, random, numpy as np, torch, time
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from torch.utils.data import DataLoader
from tqdm import tqdm

from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import TranslationEvaluator, EmbeddingSimilarityEvaluator

from peft import LoraConfig, TaskType, get_peft_model, PeftModel

import wandb

# --- W&B: project + no model uploads ---
os.environ["WANDB_PROJECT"]   = os.environ.get("WANDB_PROJECT", "LaBSE")
os.environ["WANDB_LOG_MODEL"] = "false"   # don't save/upload models
os.environ["WANDB_WATCH"]     = "false"   # don't auto-watch
# (Optional) silence service chatter:
# os.environ["WANDB_SILENT"]    = "true"

# Login once per environment (or set WANDB_API_KEY env var)
# Login stays best-effort (the run continues on failure), but the reason is
# reported instead of being swallowed, so a bad argument or missing key is visible.
try:
    wandb.login(quiet=True)
except Exception as login_err:
    print(f"⚠️ wandb.login() failed ({type(login_err).__name__}: {login_err}); continuing with wandb.init().")

# One run for the whole cell; the config mirrors the CONFIG constants and input file names.
run = wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=f"LoRA-LaBSE-RuTk_{int(time.time())}",
    config={
        "base_model": BASE_MODEL_NAME,
        "max_seq_len": MAX_SEQ_LEN,
        "bitext": {"batch_size": BITEXT_BATCH_SIZE, "epochs": BITEXT_EPOCHS},
        "sts":    {"batch_size": STS_BATCH_SIZE,    "epochs": STS_EPOCHS},
        "files": {
            "train_id": TRAIN_ID, "train_src": TRAIN_SRC, "train_trg": TRAIN_TRG,
            "test_id": TEST_ID, "test_src": TEST_SRC, "test_trg": TEST_TRG
        }
    }
)

device = "cuda" if torch.cuda.is_available() else "cpu"
# Seed torch, Python's random and NumPy with the same fixed value.
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)
wandb.config.update({"device": device, "cuda": torch.cuda.is_available()}, allow_val_change=True)

def clean(text: str) -> str:
    """Return *text* with (narrow) no-break spaces turned into plain spaces and the ends stripped."""
    for odd_space in ("\xa0", "\u202f"):
        text = text.replace(odd_space, " ")
    return text.strip()

def ensure_files_exist(paths):
    """Raise FileNotFoundError naming every entry of *paths* that does not exist.

    Returns None when all paths exist (or when *paths* is empty).
    """
    missing = list(filter(lambda p: not os.path.exists(p), paths))
    if not missing:
        return
    raise FileNotFoundError(f"Missing files: {missing}\nUpload your Rus–Tuk files or fix paths in CONFIG.")

def cosine_pearson(emb1, emb2, gold):
    cs = np.sum(emb1 * emb2, axis=1)  # embeddings normalized → dot = cosine
    r, _ = pearsonr(cs, gold)
    return r

def encode_batches(st_model, texts, bsz=64):
    """Encode *texts* in slices of *bsz* and return the stacked embeddings as a NumPy array.

    Every slice is encoded on the module-level ``device`` with
    ``normalize_embeddings=True`` and moved to the CPU before concatenation.
    """
    def _encode_slice(lo):
        vectors = st_model.encode(
            texts[lo:lo + bsz],
            batch_size=bsz,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
        )
        return vectors.cpu()

    stacked = torch.cat([_encode_slice(lo) for lo in range(0, len(texts), bsz)])
    return stacked.numpy()

# ============================
# 2️⃣ Load Bitext data (Rus–Tuk)
# ============================
# Fail early with one clear error if any input file is missing.
ensure_files_exist([TRAIN_ID, TRAIN_SRC, TRAIN_TRG, TEST_ID, TEST_SRC, TEST_TRG])

# The id/src/trg files are read in lockstep; the id value is discarded and zip()
# stops at the shortest file.
train_rows = []
with gzip.open(TRAIN_ID, "rt", encoding="utf-8") as idf, \
     gzip.open(TRAIN_SRC, "rt", encoding="utf-8") as srcf, \
     gzip.open(TRAIN_TRG, "rt", encoding="utf-8") as trgf:
    for _, src, trg in zip(idf, srcf, trgf):
        train_rows.append({"translation": {"ru": clean(src), "tk": clean(trg)}})

# 90/5/5 split from your train → train/val/extra_test
train_data, temp = train_test_split(train_rows, test_size=0.10, random_state=42)
val_data, extra_test = train_test_split(temp, test_size=0.5, random_state=42)

# Original test files (plain text, not gzipped)
test_rows = []
with open(TEST_ID, "r", encoding="utf-8") as idf, \
     open(TEST_SRC, "r", encoding="utf-8") as srcf, \
     open(TEST_TRG, "r", encoding="utf-8") as trgf:
    for _, src, trg in zip(idf, srcf, trgf):
        test_rows.append({"translation": {"ru": clean(src), "tk": clean(trg)}})

# Final test set = original test rows followed by the held-out part of the train file
full_test = test_rows + extra_test

# Training pairs as InputExamples; val/test as parallel source/target lists
train_examples = [InputExample(texts=[r["translation"]["ru"], r["translation"]["tk"]])
                  for r in train_data]
val_src = [r["translation"]["ru"] for r in val_data]
val_tgt = [r["translation"]["tk"] for r in val_data]
test_src = [r["translation"]["ru"] for r in full_test]
test_tgt = [r["translation"]["tk"] for r in full_test]

# Record the split sizes in the run
wandb.log({
    "data/bitext/train": len(train_examples),
    "data/bitext/val":   len(val_src),
    "data/bitext/test":  len(test_src)
})

# ============================
# 3️⃣ Load STS data (train on ru-STS Benchmark, eval on STS17 ru-ru)
# ============================
# All three splits come from the same dataset; the STS17 evaluation set is loaded
# separately further down.
ru_stsb = load_dataset("ai-forever/ru-stsbenchmark-sts")
# Splits: train / validation / test
sts_train = ru_stsb["train"]
sts_val   = ru_stsb["validation"]
sts_test  = ru_stsb["test"]

def to_examples(dsplit):
    """Turn an STS split into InputExamples with cleaned text and the score divided by 5.

    *dsplit* must provide the columns "sentence1", "sentence2" and "score".
    """
    examples = []
    triples = zip(dsplit["sentence1"], dsplit["sentence2"], dsplit["score"])
    for first, second, raw_score in triples:
        pair = [clean(first), clean(second)]
        examples.append(InputExample(texts=pair, label=float(raw_score) / 5.0))
    return examples

# Training examples for the STS adapter, plus a dev evaluator on the validation split.
# Gold scores are divided by 5, the same scaling to_examples applies to the training labels.
sts_train_examples = to_examples(sts_train)
sts_val_eval = EmbeddingSimilarityEvaluator(
    sts_val["sentence1"], sts_val["sentence2"],
    [float(s)/5.0 for s in sts_val["score"]],
    name="dev_ru_stsb"
)

# Try to load MTEB STS17 and extract ru-ru; if absent/empty, fallback to ru-STS test
def load_sts17_ru_ru():
    """Load the ru-ru pairs of the MTEB STS17 test split.

    The language is detected from whichever column layout is present:
    ``language1``/``language2``, ``lang1``/``lang2``, or a single ``language``
    tag such as "ru-ru" / "ru_ru".

    Returns:
        ``(sentences1, sentences2, scores_divided_by_5, evaluator)``, or ``None``
        when loading fails, no known language column exists, or no pair matches.
    """
    def _tag_is_ru_ru(ex):
        tag = ex["language"]
        if not isinstance(tag, str):
            return False
        return tag.replace('_', '-').split('-') == ["ru", "ru"]

    try:
        ds = load_dataset("mteb/sts17-crosslingual-sts", split="test")
        cols = set(ds.column_names)

        ds_ru = None
        for col_a, col_b in (("language1", "language2"), ("lang1", "lang2")):
            if {col_a, col_b} <= cols:
                ds_ru = ds.filter(lambda ex: ex[col_a] == "ru" and ex[col_b] == "ru")
                break
        else:
            # No two-column layout found; try the single combined tag.
            if "language" in cols:
                ds_ru = ds.filter(_tag_is_ru_ru)

        if ds_ru is None or len(ds_ru) == 0:
            return None

        s1 = [clean(s) for s in ds_ru["sentence1"]]
        s2 = [clean(s) for s in ds_ru["sentence2"]]
        scores01 = [float(s) / 5.0 for s in ds_ru["score"]]
        evalr = EmbeddingSimilarityEvaluator(s1, s2, scores01, name="test_sts17_ru")
        return (s1, s2, scores01, evalr)
    except Exception as e:
        print(f"⚠️ Could not load STS17 ru-ru: {e}")
        return None

# Pick the final STS evaluation set: STS17 ru-ru when available, otherwise the
# ru-STS test split. NOTE: the variables keep the "sts17_" prefix in both cases.
sts17_payload = load_sts17_ru_ru()
if sts17_payload is None:
    print("🔁 Falling back to ru-STS test split for final STS evaluation.")
    sts17_s1 = [clean(s) for s in sts_test["sentence1"]]
    sts17_s2 = [clean(s) for s in sts_test["sentence2"]]
    sts17_scores01 = [float(s)/5.0 for s in sts_test["score"]]
    sts17_eval = EmbeddingSimilarityEvaluator(sts17_s1, sts17_s2, sts17_scores01, name="test_ru_stsb")
else:
    sts17_s1, sts17_s2, sts17_scores01, sts17_eval = sts17_payload

# Record the STS set sizes in the run
wandb.log({
    "data/sts/train": len(sts_train_examples),
    "data/sts/dev":   len(sts_val["sentence1"]),
    "data/sts/test":  len(sts17_s1)
})

# ============================
# 4️⃣ Build SentenceTransformer + LoRA
# ============================
base = SentenceTransformer(BASE_MODEL_NAME, device=device)
base.max_seq_length = MAX_SEQ_LEN

# Get the underlying HF encoder used by Sentence-Transformers
hf_encoder = base._first_module().auto_model

# LoRA config for BERT-like encoders
# The same config object is reused later when the second ("sts") adapter is added.
lora_cfg = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=32,                 # try 32 first; 16 was a bit tight
    lora_alpha=64,
    lora_dropout=0.05,    # slightly lower dropout
    target_modules=[
        "query","key","value",
        "output.dense",           # attention output
        "intermediate.dense",     # FFN in
        "output.dense"            # NOTE(review): same string as the entry above, so this duplicate
                                  # presumably selects nothing extra — confirm and drop if so
    ],
    inference_mode=False
)

# Wrap encoder with PEFT; create first adapter "bitext"
peft_model = get_peft_model(hf_encoder, lora_cfg, adapter_name="bitext")
base._first_module().auto_model = peft_model  # reattach to ST pipeline

# Keep the final Dense (projection) layer of the ST pipeline trainable.
# The layer is located by type: Sentence-Transformers registers its pipeline
# modules under numeric names ("0", "1", "2", ...), so searching for a module
# name ending in "Dense" never matches and would leave `dense` as None.
from sentence_transformers.models import Dense

dense = next((module for module in base.modules() if isinstance(module, Dense)), None)

if dense is not None:
    for p in dense.parameters():
        p.requires_grad = True


# Parameter stats
# Counts are taken over peft_model (the wrapped encoder) only; other modules of the
# `base` pipeline are not included in these numbers.
total_params = sum(p.numel() for p in peft_model.parameters())
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
wandb.log({
    "params/total": total_params,
    "params/trainable": trainable_params,
    "params/trainable_frac": trainable_params / total_params
})

# ============================
# 5️⃣ Train LoRA "bitext" adapter
# ============================
train_loader = DataLoader(train_examples, shuffle=True, batch_size=BITEXT_BATCH_SIZE)
bitext_loss = losses.MultipleNegativesRankingLoss(base)
val_evaluator = TranslationEvaluator(val_src, val_tgt, name="val_rus_tuk")

# W&B callback for evaluator during training
def make_wandb_callback(phase_label="bitext", eval_name="val"):
    """Return a fit() callback that logs the evaluator score as '<phase>/<eval>/score'."""
    def cb(score, epoch, steps):
        wandb.log({
            f"{phase_label}/{eval_name}/score": score,
            "epoch": epoch,
            "step": steps
        })
    return cb

print("🚀 Training 'bitext' adapter…")
base.fit(
    train_objectives=[(train_loader, bitext_loss)],
    # Use the CONFIG constant (the value logged to W&B) instead of a separate
    # hard-coded literal, so the recorded config and the actual run cannot diverge.
    epochs=BITEXT_EPOCHS,
    warmup_steps=500,
    evaluator=val_evaluator,
    evaluation_steps=500,
    output_path=None,  # we save adapter below
    use_amp=True,
    optimizer_params={"lr": 1e-4},   # higher LR for LoRA
    callback=make_wandb_callback("bitext", "val")
)

os.makedirs(BITEXT_ADAPTER_DIR, exist_ok=True)
# Save ONLY this adapter locally (not to W&B)
peft_model.save_pretrained(BITEXT_ADAPTER_DIR, selected_adapters=["bitext"])
print(f"✅ Saved bitext adapter → {BITEXT_ADAPTER_DIR}")

# Score the trained adapter on the validation set and on the combined test set.
# NOTE(review): the evaluator result is logged as-is; if the installed
# sentence-transformers returns a dict of metrics rather than a float, these two
# keys will hold dicts — confirm.
print("\n🔎 Evaluating bitext retrieval (Rus↔Tuk) with 'bitext' adapter…")
test_eval = TranslationEvaluator(test_src, test_tgt, name="test_rus_tuk")
bitext_val_score = val_evaluator(base)
bitext_test_score = test_eval(base)
wandb.log({
    "bitext/val/score_final": bitext_val_score,
    "bitext/test/score": bitext_test_score
})

# ============================
# 6️⃣ Add+train LoRA "sts" adapter (ru STS)
# ============================
# A second adapter with the same LoRA config is added to the same encoder and made active.
peft_model.add_adapter("sts", lora_cfg)
peft_model.set_adapter("sts")  # activate new adapter
print("\n🔁 Switched active adapter → 'sts'")

sts_loader = DataLoader(sts_train_examples, shuffle=True, batch_size=STS_BATCH_SIZE)
sts_loss = losses.CosineSimilarityLoss(base)  # labels in [0,1]

# Unlike the bitext phase, no optimizer_params are passed here, so fit() uses its own defaults.
print("🚀 Training 'sts' adapter…")
base.fit(
    train_objectives=[(sts_loader, sts_loss)],
    epochs=STS_EPOCHS,
    warmup_steps=100,
    evaluator=sts_val_eval,         # monitor ru-STS validation
    evaluation_steps=500,
    output_path=None,
    use_amp=True,
    callback=make_wandb_callback("sts", "val")
)

# Save only the "sts" adapter locally
os.makedirs(STS_ADAPTER_DIR, exist_ok=True)
peft_model.save_pretrained(STS_ADAPTER_DIR, selected_adapters=["sts"])
print(f"✅ Saved sts adapter → {STS_ADAPTER_DIR}")

# ============================
# 7️⃣ Swap adapters at inference & evaluate
# ============================
def bitext_metrics(st_model, src_list, tgt_list):
    """Encode aligned source/target sentences and score source→target retrieval.

    Returns:
        (P@1, MRR), where item i of *src_list* is expected to retrieve item i of
        *tgt_list*. A rank is one plus the number of targets scoring strictly
        higher than the gold target.
    """
    sim = cosine_similarity(
        encode_batches(st_model, src_list, bsz=64),
        encode_batches(st_model, tgt_list, bsz=64),
    )
    idx = np.arange(sim.shape[0])

    p1 = (sim.argmax(axis=1) == idx).mean()

    gold = sim[idx, idx]
    n_better = (sim > gold[:, None]).sum(axis=1)
    mrr = (1.0 / (n_better + 1)).mean()
    return p1, mrr

# Evaluate bitext adapter
# Switch the shared encoder to the "bitext" adapter before encoding the test pairs.
peft_model.set_adapter("bitext")
print("\n🧪 Active adapter: 'bitext'")
p1, mrr = bitext_metrics(base, test_src, test_tgt)
print(f"Rus↔Tuk Retrieval → P@1: {p1:.4f}, MRR: {mrr:.4f}")
wandb.log({"bitext/test/P@1": p1, "bitext/test/MRR": mrr})

# Evaluate STS adapter on STS17 ru-ru (or ru-STS fallback)
# NOTE(review): the evaluator results are logged under "spearman" keys as returned;
# confirm what the installed EmbeddingSimilarityEvaluator returns (float vs dict).
peft_model.set_adapter("sts")
print("\n🧪 Active adapter: 'sts'")
sts_dev_score = sts_val_eval(base)
sts_test_score = sts17_eval(base)
wandb.log({"sts/dev/spearman": sts_dev_score, "sts/test/spearman": sts_test_score})

# Also compute explicit Pearson for the final STS set
# The "sts" adapter is still active here.
print("\n📐 Final STS set (manual Pearson on cosine sim):")
s1 = encode_batches(base, sts17_s1)
s2 = encode_batches(base, sts17_s2)
pearson = cosine_pearson(s1, s2, np.array(sts17_scores01))
print(f"Pearson r = {pearson:.4f}")
wandb.log({"sts/test/pearson": pearson})

print("\n✅ Done. Two swappable adapters saved locally:")
print(f"  • Bitext adapter dir: {BITEXT_ADAPTER_DIR}")
print(f"  • STS    adapter dir: {STS_ADAPTER_DIR}")

# Mark key results in the run summary
wandb.summary["bitext_P@1"] = p1
wandb.summary["bitext_MRR"] = mrr
wandb.summary["sts_spearman_test"] = sts_test_score
wandb.summary["sts_pearson_test"] = pearson

wandb.finish()



Usage:   
  pip3 <command> [options]

no such option: -U


0,1
bitext/val/score,▁▄▄▅▆▇█
data/bitext/test,▁
data/bitext/train,▁
data/bitext/val,▁
data/sts/dev,▁
data/sts/test,▁
data/sts/train,▁
epoch,▁▂▄▅▆▇█
eval/runtime,▇▅█▁▁▇
eval/samples_per_second,▁▁▁▁▁▁

0,1
bitext/val/score,0.78423
data/bitext/test,5838
data/bitext/train,104908
data/bitext/val,5828
data/sts/dev,1336
data/sts/test,1264
data/sts/train,5224
epoch,1
eval/runtime,21.7489
eval/samples_per_second,0


🔁 Falling back to ru-STS test split for final STS evaluation.
🚀 Training 'bitext' adapter…


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Val Rus Tuk Src2trg Accuracy,Val Rus Tuk Trg2src Accuracy,Val Rus Tuk Mean Accuracy
500,0.3844,No log,0.771277,0.763727,0.767502
1000,0.2926,No log,0.783631,0.776081,0.779856
1500,0.281,No log,0.786205,0.77814,0.782172
2000,0.2767,No log,0.786719,0.777454,0.782086
2500,0.2783,No log,0.787577,0.781915,0.784746
3000,0.2562,No log,0.789636,0.777968,0.783802
3279,0.2562,No log,0.795813,0.785003,0.790408
3500,0.2267,No log,0.798559,0.78792,0.79324
4000,0.2092,No log,0.795642,0.787577,0.791609
4500,0.2016,No log,0.796843,0.792382,0.794612


Step,Training Loss,Validation Loss,Val Rus Tuk Src2trg Accuracy,Val Rus Tuk Trg2src Accuracy,Val Rus Tuk Mean Accuracy
500,0.3844,No log,0.771277,0.763727,0.767502
1000,0.2926,No log,0.783631,0.776081,0.779856
1500,0.281,No log,0.786205,0.77814,0.782172
2000,0.2767,No log,0.786719,0.777454,0.782086
2500,0.2783,No log,0.787577,0.781915,0.784746
3000,0.2562,No log,0.789636,0.777968,0.783802
3279,0.2562,No log,0.795813,0.785003,0.790408
3500,0.2267,No log,0.798559,0.78792,0.79324
4000,0.2092,No log,0.795642,0.787577,0.791609
4500,0.2016,No log,0.796843,0.792382,0.794612


✅ Saved bitext adapter → adapters/bitext

🔎 Evaluating bitext retrieval (Rus↔Tuk) with 'bitext' adapter…

🔁 Switched active adapter → 'sts'
🚀 Training 'sts' adapter…


Step,Training Loss,Validation Loss,Dev Ru Stsb Pearson Cosine,Dev Ru Stsb Spearman Cosine
164,No log,No log,0.841761,0.840372
328,No log,No log,0.855018,0.853954
492,No log,No log,0.856819,0.855873


✅ Saved sts adapter → adapters/sts

🧪 Active adapter: 'bitext'
Rus↔Tuk Retrieval → P@1: 0.8119, MRR: 0.8425

🧪 Active adapter: 'sts'

📐 Final STS set (manual Pearson on cosine sim):
Pearson r = 0.8189

✅ Done. Two swappable adapters saved locally:
  • Bitext adapter dir: adapters/bitext
  • STS    adapter dir: adapters/sts


0,1
bitext/test/MRR,▁
bitext/test/P@1,▁
bitext/val/score,▁▃▄▄▄▄▅▆▆▆▆▆▆▇▇▆▆▇▇▇▇▇▇▇▇▇▇▇█████████
data/bitext/test,▁
data/bitext/train,▁
data/bitext/val,▁
data/sts/dev,▁
data/sts/test,▁
data/sts/train,▁
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███▂▄▅

0,1
bitext/test/MRR,0.84253
bitext/test/P@1,0.81192
bitext/val/score,0.80302
bitext_MRR,0.84253
bitext_P@1,0.81192
data/bitext/test,5838
data/bitext/train,104908
data/bitext/val,5828
data/sts/dev,1336
data/sts/test,1264


In [9]:
# ============================================
# 📊 BEFORE vs AFTER — baseline vs adapters
#  (handles nested paths like /content/adapters/bitext/bitext)
# ============================================
import os, numpy as np, torch, pandas as pd
from datasets import load_dataset
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from peft import PeftModel

# Use the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Paths (parent dirs that contain the nested adapter folder) ----
# Absolute /content/... paths; resolve_adapter_dir below finds the folder that
# actually holds adapter_config.json.
BASE_MODEL_NAME = "sentence-transformers/LaBSE"
BITEXT_ADAPTER_PARENT = "/content/adapters/bitext"
STS_ADAPTER_PARENT    = "/content/adapters/sts"
MAX_SEQ_LEN = 256

# ---- Helper: resolve folder that actually has adapter_config.json ----
def resolve_adapter_dir(parent_dir: str, default_name: str):
    """Return the directory under *parent_dir* that contains ``adapter_config.json``.

    Lookup order:
      1. *parent_dir* itself;
      2. ``parent_dir/default_name``;
      3. the immediate child directories, accepted only if exactly one of them
         contains the config file.

    Raises:
        FileNotFoundError: if none of the above yields a single directory.
    """
    def _has_config(folder):
        return os.path.exists(os.path.join(folder, "adapter_config.json"))

    if _has_config(parent_dir):
        return parent_dir

    candidate = os.path.join(parent_dir, default_name)
    if _has_config(candidate):
        return candidate

    matches = []
    if os.path.isdir(parent_dir):
        children = (os.path.join(parent_dir, entry) for entry in os.listdir(parent_dir))
        matches = [p for p in children if os.path.isdir(p) and _has_config(p)]

    if len(matches) != 1:
        raise FileNotFoundError(
            f"Could not find adapter_config.json under {parent_dir}. "
            f"Tried '{parent_dir}', '{candidate}', and immediate children."
        )
    return matches[0]

# Locate both adapter folders up front; a missing adapter raises FileNotFoundError here,
# before any model is loaded.
bitext_path = resolve_adapter_dir(BITEXT_ADAPTER_PARENT, "bitext")
sts_path    = resolve_adapter_dir(STS_ADAPTER_PARENT, "sts")
print(f"🔎 Resolved bitext adapter dir: {bitext_path}")
print(f"🔎 Resolved sts    adapter dir: {sts_path}")

# ---- Load / reuse your bitext test data ----
def clean(t):
    """Map U+00A0 and U+202F to plain spaces, then strip surrounding whitespace."""
    table = str.maketrans({"\xa0": " ", "\u202f": " "})
    return t.translate(table).strip()

# Reuse test_src/test_tgt if an earlier cell already defined them in this session;
# otherwise fall back to reading the test files from disk.
# NOTE(review): the two paths can yield different test sets (an earlier cell builds
# test_src from the test file plus held-out training rows) — confirm which one is intended.
try:
    test_src, test_tgt  # noqa
except NameError:
    if not (os.path.exists("test.src") and os.path.exists("test.trg")):
        raise FileNotFoundError("test.src/test.trg not found. Re-run training or adjust paths.")
    with open("test.src", "r", encoding="utf-8") as f: test_src = [clean(x.strip()) for x in f]
    with open("test.trg", "r", encoding="utf-8") as f: test_tgt = [clean(x.strip()) for x in f]

# ---- Load STS17 ru-ru (fallback to ru-STS test) ----
def load_sts17_ru_ru():
    """Load the ru-ru pairs of the MTEB STS17 test split.

    The language is detected from whichever column layout is present:
    ``language1``/``language2``, ``lang1``/``lang2``, or a single ``language``
    tag such as "ru-ru" / "ru_ru".

    Returns:
        ``(sentences1, sentences2, scores_divided_by_5, "STS17 ru-ru")``, or
        ``None`` when loading fails, no known language column exists, or no
        pair matches. A loading failure is reported on stdout before ``None``
        is returned, so the caller's fallback is never silent.
    """
    try:
        ds = load_dataset("mteb/sts17-crosslingual-sts", split="test")
        cols = set(ds.column_names)
        if "language1" in cols and "language2" in cols:
            ds_ru = ds.filter(lambda ex: ex["language1"]=="ru" and ex["language2"]=="ru")
        elif "lang1" in cols and "lang2" in cols:
            ds_ru = ds.filter(lambda ex: ex["lang1"]=="ru" and ex["lang2"]=="ru")
        elif "language" in cols:
            def is_ruru(ex):
                val = ex["language"]
                if isinstance(val, str):
                    a = val.replace('_','-').split('-')
                    return len(a)==2 and a[0]=="ru" and a[1]=="ru"
                return False
            ds_ru = ds.filter(is_ruru)
        else:
            ds_ru = None
        if ds_ru is None or len(ds_ru)==0:
            return None
        s1 = [clean(x) for x in ds_ru["sentence1"]]
        s2 = [clean(x) for x in ds_ru["sentence2"]]
        y  = [float(x)/5.0 for x in ds_ru["score"]]
        return s1, s2, y, "STS17 ru-ru"
    except Exception as e:
        # Best-effort loader: still fall back to None, but say why, so a network
        # or schema problem is not mistaken for "dataset has no ru-ru pairs".
        print(f"⚠️ Could not load STS17 ru-ru: {e}")
        return None

# Choose the STS evaluation set: STS17 ru-ru when available, otherwise the ru-STS
# test split. `sts_name` records which one was used; scores are divided by 5.
payload = load_sts17_ru_ru()
if payload is None:
    ru_stsb = load_dataset("ai-forever/ru-stsbenchmark-sts")
    s1 = [clean(x) for x in ru_stsb["test"]["sentence1"]]
    s2 = [clean(x) for x in ru_stsb["test"]["sentence2"]]
    y  = [float(x)/5.0 for x in ru_stsb["test"]["score"]]
    sts_name = "ru-STS test"
else:
    s1, s2, y, sts_name = payload

# ---- Helpers ----
def encode_batches(st_model, texts, bsz=64):
    """Encode ``texts`` in slices of ``bsz`` and return one NumPy array.

    Each slice is passed to ``st_model.encode`` with normalised embeddings
    requested on the module-level ``device``; the resulting tensors are moved
    to the CPU and concatenated in input order.
    """
    slices = (texts[start:start + bsz] for start in range(0, len(texts), bsz))
    encoded = [
        st_model.encode(
            piece,
            batch_size=bsz,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
        ).cpu()
        for piece in slices
    ]
    return torch.cat(encoded).numpy()

def bitext_metrics(st_model, src_list, tgt_list):
    """Score source-to-target retrieval where item i of each list is a pair.

    Returns:
        ``(p_at_1, mrr)`` as floats. P@1 is the share of sources whose most
        similar target (``argmax``) sits at the same index. For MRR the rank
        of the aligned target is one plus the number of targets with strictly
        higher similarity.

    NOTE(review): ties are handled differently by the two metrics - a target
    that ties with the aligned one does not worsen the rank, but ``argmax``
    may still pick it for P@1.
    """
    scores = cosine_similarity(
        encode_batches(st_model, src_list),
        encode_batches(st_model, tgt_list),
    )
    idx = np.arange(scores.shape[0])
    gold = scores[idx, idx]  # similarity of every aligned pair
    hits = scores.argmax(axis=1) == idx
    beaten_by = (scores > gold[:, None]).sum(axis=1)
    reciprocal_ranks = 1.0 / (beaten_by + 1)
    return float(hits.mean()), float(reciprocal_ranks.mean())

def sts_metrics(st_model, s1, s2, y):
    """Correlate pairwise embedding similarity with gold scores ``y``.

    The similarity of pair i is the dot product of the two encoded sentences
    (a cosine when the embeddings are unit-normalised).

    Returns:
        ``(pearson, spearman)`` correlation coefficients as floats.
    """
    left = encode_batches(st_model, s1)
    right = encode_batches(st_model, s2)
    similarity = (left * right).sum(axis=1)
    pearson = pearsonr(similarity, y)[0]
    spearman = spearmanr(similarity, y)[0]
    return float(pearson), float(spearman)

# ---- Build three evaluable models: baseline, +bitext, +sts ----
# Plain base model, used for the "before" numbers.
baseline = SentenceTransformer(BASE_MODEL_NAME, device=device)
baseline.max_seq_length = MAX_SEQ_LEN

# Second copy of the base model that will carry both LoRA adapters.
with_adapters = SentenceTransformer(BASE_MODEL_NAME, device=device)
# Truncate exactly like the baseline so the variants are compared like-for-like.
with_adapters.max_seq_length = MAX_SEQ_LEN
auto = with_adapters._first_module().auto_model

# Load first adapter exactly from the resolved subdir; name it explicitly so we can switch
auto = PeftModel.from_pretrained(auto, bitext_path, adapter_name="bitext")
# Load second one and give it the 'sts' name
auto.load_adapter(sts_path, adapter_name="sts")

# Re-attach into Sentence-Transformers pipeline
with_adapters._first_module().auto_model = auto

# ---- Evaluate all variants on both tasks ----
# Each row is: variant label, active adapter name, bitext P@1, bitext MRR,
# STS Pearson, STS Spearman.
rows = []

# 1) Baseline
bp1, bmrr = bitext_metrics(baseline, test_src, test_tgt)
bpr, bsr  = sts_metrics(baseline, s1, s2, y)
rows.append(["Baseline LaBSE", "none", bp1, bmrr, bpr, bsr])

# 2) Bitext adapter active
# `auto` is the PEFT-wrapped model attached to `with_adapters`, so selecting
# an adapter here affects the embeddings produced by `with_adapters` below.
auto.set_adapter("bitext")
ap1, amrr = bitext_metrics(with_adapters, test_src, test_tgt)
apr, asr  = sts_metrics(with_adapters, s1, s2, y)
rows.append(["LaBSE + LoRA(bitext)", "bitext", ap1, amrr, apr, asr])

# 3) STS adapter active
# Must come after the bitext evaluation: the same model object is reused.
auto.set_adapter("sts")
sp1, smrr = bitext_metrics(with_adapters, test_src, test_tgt)
spr, ssr  = sts_metrics(with_adapters, s1, s2, y)
rows.append(["LaBSE + LoRA(sts)", "sts", sp1, smrr, spr, ssr])

# The STS column headers carry the name of whichever STS set was loaded.
df = pd.DataFrame(rows, columns=[
    "Variant", "Active Adapter",
    "Bitext P@1", "Bitext MRR",
    f"{sts_name} Pearson", f"{sts_name} Spearman"
])

# Render the table with four decimals for every metric column.
# NOTE(review): `display` is not imported in this cell - presumably the
# notebook-provided IPython display helper; confirm when running as a script.
with pd.option_context('display.max_colwidth', None, 'display.precision', 4):
    display(df.style.format({
        "Bitext P@1": "{:.4f}", "Bitext MRR": "{:.4f}",
        f"{sts_name} Pearson": "{:.4f}", f"{sts_name} Spearman": "{:.4f}"
    }))


🔎 Resolved bitext adapter dir: /content/adapters/bitext/bitext
🔎 Resolved sts    adapter dir: /content/adapters/sts/sts


Unnamed: 0,Variant,Active Adapter,Bitext P@1,Bitext MRR,ru-STS test Pearson,ru-STS test Spearman
0,Baseline LaBSE,none,0.698,0.7354,0.7357,0.7334
1,LaBSE + LoRA(bitext),bitext,0.8215,0.8502,0.6897,0.6853
2,LaBSE + LoRA(sts),sts,0.709,0.7451,0.8136,0.8089


In [12]:
# ============================================
# 📊 BEFORE vs AFTER — baseline vs adapters (N=9 small test)
# ============================================
import os, numpy as np, torch, pandas as pd
from datasets import load_dataset
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from peft import PeftModel

# Encode on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Paths (parent dirs that contain the nested adapter folder) ----
# The *_PARENT values are handed to resolve_adapter_dir, which locates the
# directory that actually contains adapter_config.json.
BASE_MODEL_NAME = "sentence-transformers/LaBSE"
BITEXT_ADAPTER_PARENT = "/content/adapters/bitext"
STS_ADAPTER_PARENT    = "/content/adapters/sts"
# Assigned to the baseline model's max_seq_length further down.
MAX_SEQ_LEN = 256

# If you have dedicated 9-line files, put them here; else it falls back to test.src|trg
# (all four paths are relative to the current working directory).
SMALL_TEST_SRC = "test.small.src"
SMALL_TEST_TRG = "test.small.trg"
TEST_SRC = "test.src"
TEST_TRG = "test.trg"

# ---- Helper: resolve folder that actually has adapter_config.json ----
def resolve_adapter_dir(parent_dir: str, default_name: str) -> str:
    """Return the directory under ``parent_dir`` that holds adapter_config.json.

    Lookup order:
      1. ``parent_dir`` itself;
      2. ``parent_dir/default_name``;
      3. the only immediate child directory containing the config file.

    Args:
        parent_dir: Directory the adapter was saved to (or its parent).
        default_name: Preferred sub-directory name to try before scanning.

    Returns:
        Path of the directory containing ``adapter_config.json``.

    Raises:
        FileNotFoundError: If no candidate contains the config file, or if
            several children do and none of them is ``default_name`` (the
            message lists the competing directories in that case).
    """
    def has_config(path):
        return os.path.exists(os.path.join(path, "adapter_config.json"))

    if has_config(parent_dir):
        return parent_dir
    candidate = os.path.join(parent_dir, default_name)
    if has_config(candidate):
        return candidate

    matches = []
    if os.path.isdir(parent_dir):
        children = (os.path.join(parent_dir, child) for child in os.listdir(parent_dir))
        # Sorted only so the ambiguity message is deterministic.
        matches = sorted(p for p in children if os.path.isdir(p) and has_config(p))
    if len(matches) == 1:
        return matches[0]
    if matches:
        # Several adapters found: do not guess, and say so explicitly instead
        # of claiming that nothing was found.
        raise FileNotFoundError(
            f"Ambiguous adapter location under {parent_dir}: multiple children "
            f"contain adapter_config.json ({', '.join(matches)}) and none is "
            f"named '{default_name}'."
        )
    raise FileNotFoundError(
        f"Could not find adapter_config.json under {parent_dir}. "
        f"Tried '{parent_dir}', '{candidate}', and immediate children."
    )

# Resolve both adapter locations up front (resolve_adapter_dir raises
# FileNotFoundError when a directory cannot be determined), then echo the
# resolved paths so the run log shows which folders were used.
bitext_path = resolve_adapter_dir(BITEXT_ADAPTER_PARENT, "bitext")
sts_path    = resolve_adapter_dir(STS_ADAPTER_PARENT, "sts")
print(f"🔎 Resolved bitext adapter dir: {bitext_path}")
print(f"🔎 Resolved sts    adapter dir: {sts_path}")

def clean(t):
    """Normalise spacing in one line of text.

    No-break spaces (U+00A0) and narrow no-break spaces (U+202F) become
    ordinary spaces, then leading/trailing whitespace is removed.
    """
    spaced = t.translate({0x00A0: " ", 0x202F: " "})
    return spaced.strip()

def load_small_bitext(expected_n=9):
    """Load the small parallel test set as two aligned lists of cleaned lines.

    The dedicated small files (``SMALL_TEST_SRC``/``SMALL_TEST_TRG``) are used
    when both exist. Otherwise the regular test files are read and, if they
    hold a different number of lines, cut down to their first ``expected_n``.

    Args:
        expected_n: Required number of sentence pairs (default 9, the size of
            the original small test).

    Returns:
        ``(src, trg)`` lists of equal length ``expected_n``.

    Raises:
        ValueError: If a source file and its target file differ in length.
        AssertionError: If the selected set does not contain exactly
            ``expected_n`` pairs (kept as AssertionError for compatibility).
    """
    def read_two(src_path, trg_path):
        # Read both sides and insist on line-for-line alignment.
        with open(src_path, encoding="utf-8") as f1:
            src = [clean(line.strip()) for line in f1]
        with open(trg_path, encoding="utf-8") as f2:
            trg = [clean(line.strip()) for line in f2]
        if len(src) != len(trg):
            raise ValueError(f"Mismatch: {src_path} ({len(src)}) vs {trg_path} ({len(trg)})")
        return src, trg

    if os.path.exists(SMALL_TEST_SRC) and os.path.exists(SMALL_TEST_TRG):
        src, trg = read_two(SMALL_TEST_SRC, SMALL_TEST_TRG)
        print(f"✅ Using SMALL files: {SMALL_TEST_SRC}/{SMALL_TEST_TRG} (N={len(src)})")
    else:
        src, trg = read_two(TEST_SRC, TEST_TRG)
        if len(src) == expected_n:
            print(f"✅ Using test files: {TEST_SRC}/{TEST_TRG} (N={expected_n})")
        else:
            print(f"⚠️ {TEST_SRC}/{TEST_TRG} have N={len(src)}; forcing first {expected_n} lines to match the original small test.")
            src, trg = src[:expected_n], trg[:expected_n]
    # Truncation cannot help when the files are too short, and the dedicated
    # small files are never truncated, so verify the final size explicitly.
    if len(src) != expected_n:
        raise AssertionError(f"Small test must be N={expected_n}, got {len(src)}")
    return src, trg

# ---- Load the 9-pair bitext test ----
# test_src[i] and test_tgt[i] form one aligned pair; load_small_bitext raises
# if the selected files do not yield exactly nine pairs.
test_src, test_tgt = load_small_bitext()

# ---- Load STS17 ru-ru (fallback to ru-STS test) ----
def load_sts17_ru_ru():
    """Best-effort loader for the ru-ru pairs of the STS17 cross-lingual set.

    Returns:
        A tuple ``(sentences1, sentences2, scores, "STS17 ru-ru")`` where the
        sentences are cleaned with ``clean`` and each score is divided by 5.0,
        or ``None`` when the dataset cannot be loaded, exposes none of the
        recognised language columns, or has no ru-ru rows. The reason for a
        ``None`` result is printed so a fallback is never taken silently.
    """
    try:
        ds = load_dataset("mteb/sts17-crosslingual-sts", split="test")
        cols = set(ds.column_names)
        # The language pair may be stored under different column layouts.
        if "language1" in cols and "language2" in cols:
            ds_ru = ds.filter(lambda ex: ex["language1"] == "ru" and ex["language2"] == "ru")
        elif "lang1" in cols and "lang2" in cols:
            ds_ru = ds.filter(lambda ex: ex["lang1"] == "ru" and ex["lang2"] == "ru")
        elif "language" in cols:
            def is_ruru(ex):
                # Accept both "ru-ru" and "ru_ru" spellings of the pair tag.
                val = ex["language"]
                if isinstance(val, str):
                    a = val.replace('_', '-').split('-')
                    return len(a) == 2 and a[0] == "ru" and a[1] == "ru"
                return False
            ds_ru = ds.filter(is_ruru)
        else:
            ds_ru = None
        if ds_ru is None or len(ds_ru) == 0:
            print("⚠️ STS17: no ru-ru rows found; returning None.")
            return None
        s1 = [clean(x) for x in ds_ru["sentence1"]]
        s2 = [clean(x) for x in ds_ru["sentence2"]]
        y = [float(x) / 5.0 for x in ds_ru["score"]]
        return s1, s2, y, "STS17 ru-ru"
    except Exception as err:
        # Deliberately broad: any failure means "not available". Report the
        # cause instead of swallowing it so the user can see why.
        print(f"⚠️ Could not load STS17 ru-ru ({type(err).__name__}: {err}); returning None.")
        return None

# Prefer STS17 ru-ru; when the loader reports nothing, use the test split of
# the Russian STS benchmark with its scores divided by 5.0.
payload = load_sts17_ru_ru()
if payload is not None:
    s1, s2, y, sts_name = payload
else:
    ru_stsb = load_dataset("ai-forever/ru-stsbenchmark-sts")
    s1 = list(map(clean, ru_stsb["test"]["sentence1"]))
    s2 = list(map(clean, ru_stsb["test"]["sentence2"]))
    y = [float(score) / 5.0 for score in ru_stsb["test"]["score"]]
    sts_name = "ru-STS test"

# ---- Helpers ----
def encode_batches(st_model, texts, bsz=64):
    """Encode ``texts`` in slices of ``bsz`` and return one NumPy array.

    Each slice is passed to ``st_model.encode`` with normalised embeddings
    requested on the module-level ``device``; the resulting tensors are moved
    to the CPU and concatenated in input order.
    """
    slices = (texts[start:start + bsz] for start in range(0, len(texts), bsz))
    encoded = [
        st_model.encode(
            piece,
            batch_size=bsz,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
        ).cpu()
        for piece in slices
    ]
    return torch.cat(encoded).numpy()

def bitext_metrics(st_model, src_list, tgt_list):
    """Score source-to-target retrieval where item i of each list is a pair.

    Returns:
        ``(p_at_1, mrr)`` as floats. P@1 is the share of sources whose most
        similar target (``argmax``) sits at the same index. For MRR the rank
        of the aligned target is one plus the number of targets with strictly
        higher similarity.

    NOTE(review): ties are handled differently by the two metrics - a target
    that ties with the aligned one does not worsen the rank, but ``argmax``
    may still pick it for P@1.
    """
    scores = cosine_similarity(
        encode_batches(st_model, src_list),
        encode_batches(st_model, tgt_list),
    )
    idx = np.arange(scores.shape[0])
    gold = scores[idx, idx]  # similarity of every aligned pair
    hits = scores.argmax(axis=1) == idx
    beaten_by = (scores > gold[:, None]).sum(axis=1)
    reciprocal_ranks = 1.0 / (beaten_by + 1)
    return float(hits.mean()), float(reciprocal_ranks.mean())

def sts_metrics(st_model, s1, s2, y):
    """Correlate pairwise embedding similarity with gold scores ``y``.

    The similarity of pair i is the dot product of the two encoded sentences
    (a cosine when the embeddings are unit-normalised).

    Returns:
        ``(pearson, spearman)`` correlation coefficients as floats.
    """
    left = encode_batches(st_model, s1)
    right = encode_batches(st_model, s2)
    similarity = (left * right).sum(axis=1)
    pearson = pearsonr(similarity, y)[0]
    spearman = spearmanr(similarity, y)[0]
    return float(pearson), float(spearman)

# ---- Build three evaluable models: baseline, +bitext, +sts ----
# Plain base model, used for the "before" numbers.
baseline = SentenceTransformer(BASE_MODEL_NAME, device=device)
baseline.max_seq_length = MAX_SEQ_LEN

# Second copy of the base model that will carry both LoRA adapters.
with_adapters = SentenceTransformer(BASE_MODEL_NAME, device=device)
# Truncate exactly like the baseline so the variants are compared like-for-like.
with_adapters.max_seq_length = MAX_SEQ_LEN
auto = with_adapters._first_module().auto_model
# Wrap the underlying transformer with the first adapter under an explicit
# name, then register the second adapter on the same wrapper.
auto = PeftModel.from_pretrained(auto, bitext_path, adapter_name="bitext")
auto.load_adapter(sts_path, adapter_name="sts")
# Put the PEFT-wrapped model back into the Sentence-Transformers pipeline.
with_adapters._first_module().auto_model = auto

# ---- Evaluate all variants on both tasks ----
# Each row is: variant label, active adapter name, bitext P@1, bitext MRR,
# STS Pearson, STS Spearman.
rows = []

# 1) Baseline
bp1, bmrr = bitext_metrics(baseline, test_src, test_tgt)
bpr, bsr  = sts_metrics(baseline, s1, s2, y)
rows.append(["Baseline LaBSE", "none", bp1, bmrr, bpr, bsr])

# 2) Bitext adapter active
# `auto` is the PEFT-wrapped model attached to `with_adapters`, so selecting
# an adapter here affects the embeddings produced by `with_adapters` below.
auto.set_adapter("bitext")
ap1, amrr = bitext_metrics(with_adapters, test_src, test_tgt)
apr, asr  = sts_metrics(with_adapters, s1, s2, y)
rows.append(["LaBSE + LoRA(bitext)", "bitext", ap1, amrr, apr, asr])

# 3) STS adapter active
# Must come after the bitext evaluation: the same model object is reused.
auto.set_adapter("sts")
sp1, smrr = bitext_metrics(with_adapters, test_src, test_tgt)
spr, ssr  = sts_metrics(with_adapters, s1, s2, y)
rows.append(["LaBSE + LoRA(sts)", "sts", sp1, smrr, spr, ssr])

# The STS column headers carry the name of whichever STS set was loaded.
df = pd.DataFrame(rows, columns=[
    "Variant", "Active Adapter",
    "Bitext P@1", "Bitext MRR",
    f"{sts_name} Pearson", f"{sts_name} Spearman"
])

# Render the table with four decimals for every metric column.
# NOTE(review): `display` is not imported in this cell - presumably the
# notebook-provided IPython display helper; confirm when running as a script.
with pd.option_context('display.max_colwidth', None, 'display.precision', 4):
    display(df.style.format({
        "Bitext P@1": "{:.4f}", "Bitext MRR": "{:.4f}",
        f"{sts_name} Pearson": "{:.4f}", f"{sts_name} Spearman": "{:.4f}"
    }))

# Quick confirmation
# Echo the number of bitext pairs that the retrieval metrics were computed on.
print(f"\n✅ Confirmed small bitext size: N={len(test_src)} (should be 9)")


🔎 Resolved bitext adapter dir: /content/adapters/bitext/bitext
🔎 Resolved sts    adapter dir: /content/adapters/sts/sts
✅ Using test files: test.src/test.trg (N=9)


Unnamed: 0,Variant,Active Adapter,Bitext P@1,Bitext MRR,ru-STS test Pearson,ru-STS test Spearman
0,Baseline LaBSE,none,0.8889,0.9444,0.7357,0.7334
1,LaBSE + LoRA(bitext),bitext,1.0,1.0,0.6897,0.6853
2,LaBSE + LoRA(sts),sts,0.8889,0.9444,0.8136,0.8089



✅ Confirmed small bitext size: N=9 (should be 9)
