In [1]:
import sys

sys.path.append("/home/ducha/Dropbox/Projects/viet-examples/anki-examples")
import torch
import numpy as np
from find_examples import CorpusExamples

corpus_folder = "/media/ducha/SSDSHARED/VN/subs_dump/viet_subs_processed2"
corpus = CorpusExamples(corpus_folder, use_semantic_sorting=False)
vi, en_1, en_2 = "từ", "word", "from, since"

examples = corpus.find_examples(vi)
examples_np = np.array(examples)

Preparing corpus from /media/ducha/SSDSHARED/VN/subs_dump/viet_subs_processed2
Corpus prepared with 13646 files
Total Examples 7736833


# Semantic Search
Idea: Use text embeddings to find the example that is most similar to the translation.

Model: https://huggingface.co/intfloat/multilingual-e5-small

In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer


def format_translation(source, target):
    # Format according to XLIFF
    return f"query: <source>{source}</source><target>{target}</target>"


def format_example(ex):
    # Use "query: " prefix for symmetric tasks such as semantic similarity, bitext mining, paraphrase retrieval.
    return f"query: {ex}"


vi, en = "từ", "word"

In [4]:
query = format_translation(vi, en)
print(query)

query: <source>từ</source><target>word</target>


In [5]:
examples = corpus.find_examples(vi)
examples_np = np.array(examples)
examples_formatted = [format_example(ex["text"]) for ex in examples]

KeyboardInterrupt: 

## Model Inference

In [12]:
model = SentenceTransformer("intfloat/multilingual-e5-small").to("cuda")

In [29]:
model_input = [query] + examples_formatted
print(len(model_input))
all_encoded = model.encode(
    model_input,
    normalize_embeddings=True,
    device="cuda",
    convert_to_tensor=True,
    convert_to_numpy=False,
)

60495


In [30]:
query_enc, examples_enc = all_encoded[0], all_encoded[1:]

In [31]:
import torch

similarities = torch.cosine_similarity(
    query_enc[None], examples_enc, dim=1
)  # can also use query_enc @ examples_enc.T
similarities.shape
similarities_sorted = (
    torch.argsort(similarities, descending=True).cpu().detach().numpy()
)

In [38]:
import numpy as np

examples_np = np.array([ex["text"] for ex in examples])
examples_np[similarities_sorted]

array(['- Từ đây tới đây.', '(Từ này tự tìm hiểu )',
       '{\\an8}THỊT MÁ TỪ THỦ HEO', ...,
       'Cậu là bạn với Cha Eun-sang từ khi nào?',
       'Từ khi nào Matty có bạn gái?',
       'Lucid Dream đã bắt đầu từ khi nào?'], dtype='<U105')

## Different Model

- https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2
- https://huggingface.co/VoVanPhuc/sup-SimCSE-VietNamese-phobert-base#sentences-transformers

In [3]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
).to("cuda")

In [4]:
def format_translation(source, target):
    # Format according to XLIFF
    return f"<source>{source}</source><target>{target}</target>"
    # return f"{source} - {target}"


def format_example(ex):
    # Use "query: " prefix for symmetric tasks such as semantic similarity, bitext mining, paraphrase retrieval.
    return f"{ex}"

In [5]:
query_1 = format_translation(vi, en_1)
query_2 = format_translation(vi, en_2)
print(query_1)
print(query_2)

<source>từ</source><target>word</target>
<source>từ</source><target>from, since</target>


In [6]:
# examples = [
#     "Anh ta tìm từ đó trong từ điển.",
#     "Chúng ta ở ngay trên đường từ hộp đêm ra bến tàu.",
#     "từ từ, từ từ.",
# ]
# examples_formatted = [format_example(ex) for ex in examples]
# examples_np = np.array(examples)


examples = corpus.find_examples(vi)
len(examples)
examples_np = np.array([ex["text"] for ex in examples])
examples_formatted = [format_example(ex["text"]) for ex in examples]

In [8]:
model_input = [query_1, query_2] + examples_formatted
print(len(model_input))
all_encoded = model.encode(
    model_input,
    normalize_embeddings=True,
    device="cuda",
    convert_to_tensor=True,
)
query_1_enc, query_2_enc, examples_enc = all_encoded[0], all_encoded[1], all_encoded[2:]

60496


In [9]:
similarities_1 = query_1_enc @ examples_enc.T
similarities_sorted_1 = (
    torch.argsort(similarities_1, descending=True).cpu().detach().numpy()
)
similarities_2 = query_2_enc @ examples_enc.T
similarities_sorted_2 = (
    torch.argsort(similarities_2, descending=True).cpu().detach().numpy()
)

In [12]:
examples_np[similarities_sorted_1]

array(['Nhận được tin từ mục tiêu.', 'Từ khóa tìm kiếm là gì?',
       'Chọn từ khóa nè, "tấn công", "xử lý Bush".', ...,
       'Tôi không gặp cha từ khi lên tám.',
       'Đã sống một mình từ bao giờ.',
       'Chắc từ giờ tôi không thể nắm tay anh nữa?'], dtype='<U105')

In [13]:
examples_np[similarities_sorted_2]

array(['Nhận được tin từ mục tiêu.',
       'Tôi có thể nhắm vào bộ nguồn của nó từ đây.',
       'Tiếp cận mục tiêu từ phía sau.', ...,
       'Nói lời từ biệt đi, Alice.', 'Không ai có thể nói lời từ biệt.',
       'Không ai muốn nói lời chào từ biệt.'], dtype='<U105')

### Alternative Pooling Strategy

In [2]:
def perform_search(model):
    model_input = [query_1, query_2] + examples_formatted
    all_encoded = model.encode(
        model_input,
        normalize_embeddings=True,
        device="cuda",
        convert_to_tensor=True,
        output_value="token_embeddings",
    )
    query_1_enc, query_2_enc, examples_enc = (
        all_encoded[0].mean(dim=0),
        all_encoded[1].mean(dim=0),
        all_encoded[2:],
    )
    del all_encoded
    # query_1_enc, query_2_enc, examples_enc = (
    #     all_encoded[0][offsetq_start:offsetq_end].mean(dim=0),
    #     all_encoded[1][offsetq_start:offsetq_end].mean(dim=0),
    #     all_encoded[2:],
    # )
    similarities_1 = [
        torch.max(query_1_enc @ ex.T).cpu().detach().numpy() for ex in examples_enc
    ]
    similarities_2 = [
        torch.max(query_2_enc @ ex.T).cpu().detach().numpy() for ex in examples_enc
    ]
    similarities_sorted_1 = np.argsort(similarities_1)[::-1]
    similarities_sorted_2 = np.argsort(similarities_2)[::-1]

    return (
        examples_np[similarities_sorted_1[:10]],
        examples_np[similarities_sorted_2[:10]],
    )

#### mpnet

In [8]:
def format_translation(source, target):
    # Format according to XLIFF
    return f"<source>{source}</source><target>{target}</target>"
    # return f"{source} - {target}"


def format_example(ex):
    # Use "query: " prefix for symmetric tasks such as semantic similarity, bitext mining, paraphrase retrieval.
    return f"{ex}"


examples_formatted = [format_example(ex["text"]) for ex in examples]
query_1 = format_translation(vi, en_1)
query_2 = format_translation(vi, en_2)
print(query_1)
print(query_2)

<source>từ</source><target>word</target>
<source>từ</source><target>from, since</target>


In [9]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
).to("cuda")
tokens = model.tokenizer(query_1)
for i, tok in enumerate(tokens["input_ids"]):
    print(f"{i:2}. {tok:6}", model.tokenizer.decode(tok, skip_special_tokens=True))

2024-06-05 20:24:36.920030: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


 0.      0 
 1.   4426 <
 2.  60427 source
 3.   2740 >
 4.     18 t
 5.  56906 ừ
 6.  42946 </
 7.  60427 source
 8.  74047 ><
 9.    867 tar
10.   3794 get
11.   2740 >
12.  47416 word
13.  42946 </
14.    867 tar
15.   3794 get
16.   2740 >
17.      2 


In [10]:
offsetq_start = 4  # Check output above for when the first token starts
offsetq_end = 6  # Check output above for when the first token ends

model.tokenizer.decode(tokens["input_ids"][offsetq_start:offsetq_end])

'từ'

In [11]:
perform_search(model)

60496


(array([{'file': 'the.wolf.of.wall.street.(2013)', 'text': 'Để tôi hỏi anh, định hướng tương lai của Một từ à?', 'num_words': 12, 'start': 45, 'end': 47},
        {'file': 'the.sunset.limited.(2011)', 'text': 'Tôi chỉ đang tìm từ thôi, Giáo sư.', 'num_words': 8, 'start': 17, 'end': 19},
        {'file': 'hostages.s01.e11.off.the.record.(2013)', 'text': 'Tìm mấy từ như bu lông móng, cốt thép này nọ ấy.', 'num_words': 12, 'start': 8, 'end': 10},
        {'file': 'criminal.minds.s03.e13.limelight.(2008)', 'text': 'Tôi để ý cách sử dụng một từ trong các trang giấy từ kho.', 'num_words': 14, 'start': 26, 'end': 28},
        {'file': 'criminal.minds.s01.e22.the.fisher.king.part.1.(2006)', 'text': 'Mỗi bộ số đại diện cho một từ cụ thể.', 'num_words': 10, 'start': 27, 'end': 29},
        {'file': 'physical.s01.e01.lets.do.this.thing.(2021)', 'text': 'Cháu đang tìm từ gì nhỉ?', 'num_words': 6, 'start': 14, 'end': 16},
        {'file': 'dark.waters.(2019)', 'text': 'Ông có thấy từ này đây, đã đư

#### e5

In [3]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-small").to("cuda")


def format_translation(source, target):
    # Format according to XLIFF
    return f"query: <source>{source}</source><target>{target}</target>"
    # return f"{source} - {target}"


def format_example(ex):
    # Use "query: " prefix for symmetric tasks such as semantic similarity, bitext mining, paraphrase retrieval.
    return f"query: {ex}"


query_1 = format_translation(vi, en_1)
query_2 = format_translation(vi, en_2)
examples_formatted = [format_example(ex) for ex in examples]
tokens = model.tokenizer(query_1)
for i, tok in enumerate(tokens["input_ids"]):
    print(f"{i:2}. {tok:6}", model.tokenizer.decode(tok, skip_special_tokens=True))

2024-06-05 20:39:10.169075: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


 0.      0 
 1.     41 que
 2.   1294 ry
 3.     12 :
 4.   4426 <
 5.  60427 source
 6.   2740 >
 7.     18 t
 8.  56906 ừ
 9.  42946 </
10.  60427 source
11.  74047 ><
12.    867 tar
13.   3794 get
14.   2740 >
15.  47416 word
16.  42946 </
17.    867 tar
18.   3794 get
19.   2740 >
20.      2 


In [4]:
offsetq_start = 4  # Check output above for when the first token starts
offsetq_end = 6  # Check output above for when the first token ends

model.tokenizer.decode(tokens["input_ids"][offsetq_start:offsetq_end])

'<source'

In [5]:
perform_search(model)

OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB (GPU 0; 7.76 GiB total capacity; 6.24 GiB already allocated; 42.31 MiB free; 6.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF