In [1]:
import os
import json
import numpy as np
from nltk.tokenize import word_tokenize

class Validator:
    """Dictionary-based plausibility check for English/Russian sentence pairs.

    Each sentence is tokenized and mapped to lemmas, every lemma is looked up
    in a bilingual dictionary, and the overlap between the looked-up lemmas and
    the lemmas of the other sentence gives the score.
    """

    def __init__(self, threshold=0.5, src_dir='data/validator'):
        """Load the lemma and translation dictionaries.

        Args:
            threshold: Score a pair has to exceed in ``validate``.
            src_dir: Directory containing ``ru_w2l.json``, ``en_w2l.json``,
                ``ru_en.json`` and ``en_ru.json`` (all read as UTF-8).
        """
        self.ru_lemmas = self._load_json(src_dir, 'ru_w2l.json')
        self.en_lemmas = self._load_json(src_dir, 'en_w2l.json')
        self.ru_en = self._load_json(src_dir, 'ru_en.json')
        self.en_ru = self._load_json(src_dir, 'en_ru.json')
        self.threshold = threshold

    @staticmethod
    def _load_json(src_dir, name):
        """Read one UTF-8 JSON file from ``src_dir``."""
        with open(os.path.join(src_dir, name), encoding='utf-8') as f:
            return json.load(f)

    def lemmatize(self, sent, lang, lemmas):
        """Tokenize ``sent`` and return the lemmas of the tokens found in ``lemmas``.

        Args:
            sent: Raw sentence; it is stripped and lower-cased first.
            lang: Language name passed to ``word_tokenize``.
            lemmas: Mapping from token to lemma; unknown tokens are dropped.
        """
        tokens = word_tokenize(sent.strip().lower(), language=lang)

        return [lemmas[token] for token in tokens if token in lemmas]

    def get_score_ordered(self, src_tokens, dst_tokens, mapper):
        """Score the overlap between mapped source tokens and destination tokens.

        Args:
            src_tokens: Tokens looked up in ``mapper``; misses are skipped.
            dst_tokens: Tokens the mapped source tokens are compared with.
            mapper: Mapping from a source token to a destination token.

        Returns:
            ``0`` when ``dst_tokens`` is empty, otherwise the number of shared
            distinct tokens divided by the size of the larger of the two sets.
        """
        if len(dst_tokens) == 0:
            return 0
        trans_tokens = [mapper[token] for token in src_tokens if token in mapper]
        dst_set = set(dst_tokens)
        trans_set = set(trans_tokens)
        # dst_set is non-empty here, so the denominator is at least 1.
        score = len(dst_set & trans_set) / max(len(dst_set), len(trans_set))
        return score

    def get_score(self, en_sent, ru_sent):
        """Return the mean of the en->ru and ru->en overlap scores."""
        en_tokens = self.lemmatize(en_sent, 'english', self.en_lemmas)
        ru_tokens = self.lemmatize(ru_sent, 'russian', self.ru_lemmas)
        return (
            self.get_score_ordered(en_tokens, ru_tokens, self.en_ru) +
            self.get_score_ordered(ru_tokens, en_tokens, self.ru_en)
        ) / 2

    def fit(self, en_sents, ru_sents):
        """Set ``threshold`` to the 95th percentile of scores of random pairs.

        ``len(en_sents)`` pairs are drawn with replacement. Each side is
        sampled from its own list, so the two lists may differ in length.

        Raises:
            ValueError: If either list is empty.
        """
        if len(en_sents) == 0 or len(ru_sents) == 0:
            raise ValueError("fit() needs at least one sentence per language")
        n_pairs = len(en_sents)
        # Index each language within its own range; sharing one range raised
        # IndexError whenever ru_sents was the shorter list.
        en_ids = np.random.choice(len(en_sents), n_pairs)
        ru_ids = np.random.choice(len(ru_sents), n_pairs)
        bad_scores = [
            self.get_score(en_sents[en], ru_sents[ru])
            for en, ru in zip(en_ids, ru_ids)
        ]
        self.threshold = np.percentile(bad_scores, 95)

    def validate(self, en_sent, ru_sent):
        """Return whether the pair's score is strictly above ``threshold``."""
        return self.get_score(en_sent, ru_sent) > self.threshold

In [2]:
from torch.utils.data import Dataset, DataLoader

class LangDataset(Dataset):
    """Dataset that yields each embedding together with its position."""

    def __init__(self, emb):
        # Only a reference is stored; the embeddings are not copied.
        self.emb = emb

    def __len__(self):
        """Number of stored embeddings."""
        return len(self.emb)

    def __getitem__(self, index):
        """Return ``{'emb': <embedding at index>, 'index': index}``."""
        return dict(emb=self.emb[index], index=index)

In [3]:
import faiss
import torch
import pandas as pd

def df_to_tuples(df):
    """Return the unique ``(en, ru)`` tuples found in ``df``.

    Values whose type is exactly ``torch.Tensor`` are unwrapped with
    ``.item()``; everything else is used as is. Duplicates are removed through
    a set, so the order of the result is the set's iteration order rather than
    the row order.
    """
    def unwrap(value):
        return value.item() if type(value) is torch.Tensor else value

    en_values = [unwrap(value) for value in df['en'].to_list()]
    ru_values = [unwrap(value) for value in df['ru'].to_list()]
    return list(set(zip(en_values, ru_values)))

def get_labse_encoder(model):
    """Adapt ``model`` to the ``encode(data, lang)`` call signature.

    The returned function ignores ``lang`` and forwards ``data`` to
    ``model.encode`` with the progress bar switched off.
    """
    def encode(data, lang):
        embeddings = model.encode(data, show_progress_bar=False)
        return embeddings

    return encode

def get_laser_encoder(en_model, ru_model):
    """Build an ``encode(data, lang)`` function backed by per-language models.

    ``lang`` must be ``'en'`` or ``'ru'``; the matching model's
    ``encode_sentences`` is called on ``data``. Any other value raises
    ``KeyError``.
    """
    models = dict(en=en_model, ru=ru_model)

    def encode(data, lang):
        chosen = models[lang]
        return chosen.encode_sentences(data)

    return encode

class SentenceMiner:
    """Find candidate en/ru sentence pairs by nearest-neighbour search over embeddings.

    Each language is searched against a faiss index built from the other
    language's embeddings. With ``use_margin=True`` the raw score of a pair is
    divided by the average of the mean top-``nfind`` scores of its two
    sentences.
    """

    def __init__(self, en_sents, ru_sents, encoder, use_margin=False, batch_size=32):
        """Encode both sentence lists.

        Args:
            en_sents: English sentences.
            ru_sents: Russian sentences.
            encoder: Callable ``encoder(sentences, lang)`` returning an object
                with a 2-D ``shape`` (``lang`` is ``'en'`` or ``'ru'``).
            use_margin: Whether ``find_pairs`` rescales scores by neighbourhood means.
            batch_size: Number of query embeddings searched per faiss call.
        """
        self.sents = {
            'en': en_sents,
            'ru': ru_sents,
        }
        print("Encoding")
        self.emb = {
            'en': encoder(en_sents, 'en'),
            'ru': encoder(ru_sents, 'ru'),
        }
        self.emb_dim = self.emb['en'].shape[1]
        self.batch_size = batch_size
        self.use_margin = use_margin
        # find_pairs_ordered reads nfind in both modes, so it must always exist.
        # Without margins only the single best neighbour is needed.
        self.nfind = 5 if use_margin else 1
        if use_margin:
            self.margins = {
                'en': np.zeros(len(en_sents)),
                'ru': np.zeros(len(ru_sents)),
            }
        self.index = {}
        print("Initialized")

    def build_index(self, lang, M=32, nlist=16384, nprobe=32):
        """Train and fill a GPU faiss index with the embeddings of ``lang``.

        The index is an OPQ transform followed by an IVF-PQ index with 8-bit
        codes, moved to GPU 0.

        Args:
            lang: ``'en'`` or ``'ru'``; selects which embeddings are indexed.
            M: Number of sub-quantizers for both OPQ and PQ.
            nlist: Number of inverted lists of the IVF index.
            nprobe: Number of inverted lists visited per query.
        """
        print(f"{lang} index building")

        opq = faiss.OPQMatrix(self.emb_dim, M)

        # NOTE(review): the coarse quantizer is L2 while the IVF-PQ metric is set
        # to inner product afterwards — confirm this mix is intended.
        quantizer = faiss.IndexFlatL2(self.emb_dim)
        index_ivfpq = faiss.IndexIVFPQ(quantizer, self.emb_dim, nlist, M, 8)
        index_ivfpq.nprobe = nprobe
        index_ivfpq.metric_type = faiss.METRIC_INNER_PRODUCT

        index = faiss.IndexPreTransform(opq, index_ivfpq)
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        gpu_index.train(self.emb[lang])
        gpu_index.add(self.emb[lang])
        print("index trained")
        return gpu_index

    def find_pairs_ordered(self, lang_src, lang_dst):
        """Pair every ``lang_src`` sentence with its best ``lang_dst`` neighbour.

        Returns:
            DataFrame with columns ``[lang_src, lang_dst, 'score']`` holding the
            source position, the position of the top neighbour and its score.
            With ``use_margin`` the mean score of the ``nfind`` neighbours is
            also stored in ``self.margins[lang_src]``.
        """
        index = self.build_index(lang_dst)
        src_ds = LangDataset(self.emb[lang_src])
        src_dataloader = DataLoader(src_ds, batch_size=self.batch_size)
        pairs = []
        for batch in src_dataloader:
            # The DataLoader may hand the batch back as a torch tensor; give
            # faiss an explicit contiguous float32 numpy array instead.
            queries = np.ascontiguousarray(batch['emb'], dtype=np.float32)
            D, I = index.search(queries, self.nfind)
            for src_i, dst_i, d in zip(batch['index'], I, D):
                src_i = int(src_i)
                if self.use_margin:
                    self.margins[lang_src][src_i] = np.mean(d)
                pairs.append((src_i, dst_i[0], d[0]))
        return pd.DataFrame(pairs, columns=[lang_src, lang_dst, 'score'])

    def find_pairs(self):
        """Search in both directions and return all candidate pairs.

        Returns:
            DataFrame with columns ``en``, ``ru`` and ``score`` and a fresh
            integer index. A pair found in both directions appears twice. With
            ``use_margin`` the score is
            ``2 * score / (margins['en'][en] + margins['ru'][ru])``.
        """
        df = pd.concat([
            self.find_pairs_ordered('en', 'ru'),
            self.find_pairs_ordered('ru', 'en'),
        ], ignore_index=True)
        if self.use_margin:
            en_idx = df['en'].to_numpy(dtype=int)
            ru_idx = df['ru'].to_numpy(dtype=int)
            # Both searches have run by now, so every margin entry is filled.
            denom = self.margins['en'][en_idx] + self.margins['ru'][ru_idx]
            scores = 2 * df['score'].to_numpy(dtype=float) / denom
            df = pd.DataFrame({'en': en_idx, 'ru': ru_idx, 'score': scores})
        return df


In [4]:
def evaluate(preds, labels):
    """Print precision, recall and F0.5 of predicted pairs against gold pairs.

    Both inputs are treated as sets, so repeated entries count once. A metric
    whose denominator is zero (no predictions, or no labels) is reported as
    0.000 instead of raising ``ZeroDivisionError``.

    Args:
        preds: Iterable of hashable predicted pairs.
        labels: Iterable of hashable gold pairs.
    """
    pred_set = set(preds)
    label_set = set(labels)
    n_true = len(label_set)
    n_pred = len(pred_set)
    TP = len(label_set & pred_set)
    precision = TP / n_pred if n_pred else 0.0
    recall = TP / n_true if n_true else 0.0
    # F_beta with beta = 0.5: (1 + b^2) * TP / (b^2 * n_true + n_pred).
    f_denominator = n_true / 4 + n_pred
    f0_5 = (1 + 1/4) * TP / f_denominator if f_denominator else 0.0
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F0.5: {f0_5:.3f}")

In [5]:
test_dir = 'data/miner_test'

# Read with an explicit UTF-8 encoding so the Russian text is decoded the same
# way on every platform. One sentence per line; blank lines are kept so that
# positions stay aligned with the indices in labels.csv.
with open(os.path.join(test_dir, 'en_sents'), 'r', encoding='utf-8') as f:
    en_sents = [sent.strip() for sent in f]

with open(os.path.join(test_dir, 'ru_sents'), 'r', encoding='utf-8') as f:
    ru_sents = [sent.strip() for sent in f]

# Gold pairs as unique (en, ru) tuples.
labels_df = pd.read_csv(os.path.join(test_dir, 'labels.csv'))
label_pairs = df_to_tuples(labels_df)

In [7]:
def filter_by_score(df, threshold=0.65, drop_rate=0.1):
    """Keep rows scoring above ``threshold``, minus the weakest survivors.

    Rows with ``score`` strictly greater than ``threshold`` are sorted by
    ascending score, then the first ``int(n * drop_rate)`` of those ``n`` rows
    are removed.
    """
    above = df['score'] > threshold
    confident = df[above].sort_values(by='score')
    n_dropped = int(len(confident) * drop_rate)
    return confident[n_dropped:]

def filter_by_validator(pairs, threshold=0.1, drop_rate=0,
                        en_texts=None, ru_texts=None, validator=None):
    """Keep the index pairs whose validator score is above ``threshold``.

    Args:
        pairs: Iterable of ``(en_index, ru_index)`` tuples.
        threshold: A pair is kept only if its score is strictly greater.
        drop_rate: Fraction of the surviving pairs, lowest score first, that is
            removed after thresholding.
        en_texts: English sentences indexed by ``en_index``; defaults to the
            notebook-level ``en_sents``.
        ru_texts: Russian sentences indexed by ``ru_index``; defaults to the
            notebook-level ``ru_sents``.
        validator: Object with ``get_score(en_sentence, ru_sentence)``;
            defaults to a freshly constructed ``Validator()``.

    Returns:
        List of ``(en_index, ru_index)`` tuples sorted by ascending score.
    """
    if en_texts is None:
        en_texts = en_sents
    if ru_texts is None:
        ru_texts = ru_sents
    if validator is None:
        validator = Validator()

    scored = []
    for en, ru in pairs:
        score = validator.get_score(en_texts[en], ru_texts[ru])
        if score > threshold:
            scored.append((en, ru, score))
    scored.sort(key=lambda item: item[2])
    # Drop a fraction of the survivors (as filter_by_score does), not of the
    # unfiltered input, which could otherwise wipe out every kept pair.
    n_dropped = int(len(scored) * drop_rate)
    return [(en, ru) for en, ru, _ in scored[n_dropped:]]

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Off-the-shelf LaBSE checkpoint; used below as the baseline encoder.
model_base = SentenceTransformer("sentence-transformers/LaBSE")
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects, which
# can execute code stored in the file — only load 'lora_labse2' from a trusted source.
# Presumably this is the fine-tuned LaBSE model saved as a whole object — confirm.
lora_model = torch.load('lora_labse2', weights_only=False)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Baseline run: stock LaBSE embeddings, raw neighbour scores (no margin).
miner = SentenceMiner(
    en_sents=en_sents,
    ru_sents=ru_sents,
    encoder=get_labse_encoder(model_base),
)
preds_df = miner.find_pairs()

Encoding


Batches: 100%|██████████| 22202/22202 [04:31<00:00, 81.83it/s] 
Batches: 100%|██████████| 22202/22202 [04:50<00:00, 76.53it/s] 


Initialized
ru index building
index trained
en index building
index trained


In [None]:
# A pair found in both search directions appears twice; keep one row per pair.
preds_df = preds_df.drop_duplicates(subset=['en', 'ru'])
# Score filter first, dictionary validator second, then compare with the gold pairs.
preds = filter_by_validator(df_to_tuples(filter_by_score(preds_df)))
evaluate(preds, label_pairs)

Precision: 0.881
Recall: 0.773
F0.5: 0.857


In [None]:
# Fine-tuned run: embeddings from the loaded `lora_model`, raw neighbour scores.
miner = SentenceMiner(
    en_sents=en_sents,
    ru_sents=ru_sents,
    encoder=get_labse_encoder(lora_model),
)
preds_df = miner.find_pairs()

Encoding


Batches: 100%|██████████| 22202/22202 [07:56<00:00, 46.59it/s]
Batches: 100%|██████████| 22202/22202 [08:01<00:00, 46.12it/s]


Initialized
ru index building
index trained
en index building
index trained


In [9]:
# One row per (en, ru) pair, whichever search direction produced it.
preds_df = preds_df.drop_duplicates(subset=['en', 'ru'])
# Default score threshold, then the dictionary validator, then the metrics.
preds = filter_by_validator(df_to_tuples(filter_by_score(preds_df)))
evaluate(preds, label_pairs)

Precision: 0.179
Recall: 0.500
F0.5: 0.205


In [None]:
# Stock LaBSE embeddings with margin-rescaled scores.
miner = SentenceMiner(
    en_sents=en_sents,
    ru_sents=ru_sents,
    encoder=get_labse_encoder(model_base),
    use_margin=True,
)
preds_df = miner.find_pairs()

Encoding


Batches: 100%|██████████| 22202/22202 [04:32<00:00, 81.47it/s] 
Batches: 100%|██████████| 22202/22202 [04:48<00:00, 77.06it/s] 


Initialized
ru index building
index trained
en index building
index trained


In [31]:
# One row per (en, ru) pair, whichever search direction produced it.
preds_df = preds_df.drop_duplicates(subset=['en', 'ru'])
# Margin-rescaled scores get their own cut-off (1.06) before the validator runs.
preds = filter_by_validator(
    df_to_tuples(filter_by_score(preds_df, threshold=1.06))
)
evaluate(preds, label_pairs)

Precision: 0.946
Recall: 0.888
F0.5: 0.934


In [None]:
# Embeddings from the loaded `lora_model` with margin-rescaled scores.
miner = SentenceMiner(
    en_sents=en_sents,
    ru_sents=ru_sents,
    encoder=get_labse_encoder(lora_model),
    use_margin=True,
)
preds_df = miner.find_pairs()

Encoding


Batches: 100%|██████████| 22202/22202 [08:04<00:00, 45.87it/s]
Batches: 100%|██████████| 22202/22202 [08:25<00:00, 43.91it/s]


Initialized
ru index building
index trained
en index building
index trained


In [18]:
# One row per (en, ru) pair, whichever search direction produced it.
preds_df = preds_df.drop_duplicates(subset=['en', 'ru'])
# Same 1.06 cut-off as the other margin run, followed by the validator.
preds = filter_by_validator(
    df_to_tuples(filter_by_score(preds_df, threshold=1.06))
)
evaluate(preds, label_pairs)

Precision: 0.991
Recall: 0.040
F0.5: 0.171


In [2]:
from laser_encoders import LaserEncoderPipeline
# One LASER pipeline per language, built in the order English, then Russian.
en_encoder, ru_encoder = (
    LaserEncoderPipeline(lang=lang_code)
    for lang_code in ("eng_Latn", "rus_Cyrl")
)

2025-07-30 13:42:50,826 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-07-30 13:42:50,902 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-07-30 13:42:50,904 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-07-30 13:42:50,905 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded
2025-07-30 13:42:51,949 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-07-30 13:42:52,012 | INFO | laser_encoders.download_models |  - laser2.pt already downloaded
2025-07-30 13:42:52,013 | INFO | laser_encoders.download_models |  - laser2.spm already downloaded
2025-07-30 13:42:52,014 | INFO | laser_encoders.download_models |  - laser2.cvocab already downloaded


In [None]:
# LASER embeddings (one pipeline per language) with margin-rescaled scores.
miner = SentenceMiner(
    en_sents=en_sents,
    ru_sents=ru_sents,
    encoder=get_laser_encoder(en_encoder, ru_encoder),
    use_margin=True,
)
preds_df = miner.find_pairs()

Encoding


In [None]:
# One row per (en, ru) pair, whichever search direction produced it.
preds_df = preds_df.drop_duplicates(subset=['en', 'ru'])
# Cut at 1.06 on the margin-rescaled score, validate, then report the metrics.
preds = filter_by_validator(
    df_to_tuples(filter_by_score(preds_df, threshold=1.06))
)
evaluate(preds, label_pairs)

In [None]:
# Custom dataset: replace the two placeholder paths with your own files,
# one sentence per line.
# Lines are read as UTF-8 and stripped, as for the test data, so that the
# trailing newline does not end up in the encoded and validated sentences.
with open('YOUR/PATH/EN', 'r', encoding='utf-8') as f:
    en_sents = [sent.strip() for sent in f]

with open('YOUR/PATH/RU', 'r', encoding='utf-8') as f:
    ru_sents = [sent.strip() for sent in f]

model = SentenceTransformer("sentence-transformers/LaBSE")
# Use the model loaded in this cell; the cell previously passed `model_base`,
# which only exists if an earlier cell has been run.
miner = SentenceMiner(en_sents, ru_sents, get_labse_encoder(model), use_margin=True)

preds_df = miner.find_pairs().drop_duplicates(subset=['en', 'ru'])
preds = df_to_tuples(filter_by_score(preds_df, threshold=1.06))
# filter_by_validator reads the notebook-level en_sents / ru_sents assigned above.
preds = filter_by_validator(preds)

# Turn the surviving index pairs back into sentence pairs.
sent_pairs = [(en_sents[en], ru_sents[ru]) for en, ru in preds]