In [None]:
import os
import warnings
from pathlib import Path

import torch
import tiktoken
from pyvi.ViTokenizer import tokenize
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers.evaluation import (
    EmbeddingSimilarityEvaluator,
    SimilarityFunction,
)


# Silence every warning for the rest of the session so library chatter does
# not drown out the cell outputs.
warnings.simplefilter("ignore")

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [4]:
# Short alias -> Hugging Face Hub repository id for each embedding model
# compared in this notebook.
MODEL_IDS = dict(
    vn_sbert="keepitreal/vietnamese-sbert",
    dvt="dangvantuan/vietnamese-embedding",
    aivn="AITeamVN/Vietnamese_Embedding",
)

## Dataset

In [9]:
# The Hub dataset is read through its "train" key; the benchmark partition of
# each row is carried in the row's own "split" column.
vi_sts = load_dataset("doanhieung/vi-stsbenchmark")["train"]

# Carve out the three partitions by matching the per-row "split" label.
# `wanted` is bound as a default argument so each lambda keeps its own label.
df_train, df_dev, df_test = (
    vi_sts.filter(lambda row, wanted=split_name: row["split"] == wanted)
    for split_name in ("train", "dev", "test")
)

Generating train split: 8628 examples [00:00, 20944.50 examples/s]
Filter: 100%|██████████| 8628/8628 [00:00<00:00, 33459.13 examples/s]
Filter: 100%|██████████| 8628/8628 [00:00<00:00, 95861.68 examples/s]
Filter: 100%|██████████| 8628/8628 [00:00<00:00, 82680.94 examples/s]


In [28]:
# Inspect the training partition: column names and row count.
df_train

Dataset({
    features: ['split', 'genre', 'dataset', 'year', 'sid', 'score', 'sentence1', 'sentence2'],
    num_rows: 5749
})

In [11]:
# Keep only the sentence pair and its similarity score in every partition;
# the remaining metadata columns are not used by the evaluator.
ds_train, ds_dev, ds_test = (
    partition.select_columns(["sentence1", "sentence2", "score"])
    for partition in (df_train, df_dev, df_test)
)

In [21]:
# Peek at the first training example to confirm the three retained fields.
ds_train[0]

{'sentence1': 'Một chiếc máy bay đang cất cánh.',
 'sentence2': 'Một chiếc máy bay đang cất cánh.',
 'score': 5.0}

## Models

In [14]:
# Load the three candidate models in the order they are listed in MODEL_IDS.
# NOTE(review): DEVICE is not passed here, so device placement is left to the
# library default — confirm that is intended.
sbert_model, dvt_model, aivn_model = (
    SentenceTransformer(MODEL_IDS[alias])
    for alias in ("vn_sbert", "dvt", "aivn")
)

In [17]:
# Display the module stack of the model loaded from MODEL_IDS["vn_sbert"].
sbert_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'RobertaModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [18]:
# Display the module stack of the model loaded from MODEL_IDS["dvt"].
dvt_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'RobertaModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [19]:
# Display the module stack of the model loaded from MODEL_IDS["aivn"].
aivn_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False, 'architecture': 'XLMRobertaModel'})
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

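## Evaluations

Summary of the test-split results produced by the evaluator cells below (cosine similarity, rounded to three decimals). The `Time (s)` column is not reproduced by the cells shown here.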


| Model                                | Pearson (cosine) | Spearman (cosine) | Time (s) |
| ------------------------------------ | ---------------- | ----------------- | -------- |
| **keepitreal/vietnamese-sbert**      | 0.774            | 0.767             | 16.6     |
| **dangvantuan/vietnamese-embedding** | **0.797**        | **0.793**         | 12.9     |
| **AITeamVN/Vietnamese\_Embedding**   | 0.781            | 0.780             | 33.1     |

In [None]:
# Disabled: one CoSENTLoss objective per model. Nothing in the cells shown
# here consumes these.
# NOTE(review): presumably placeholders for a later fine-tuning step —
# confirm, or remove this cell.
# sbert_loss = CoSENTLoss(sbert_model)
# dvt_loss = CoSENTLoss(dvt_model)
# aivn_loss = CoSENTLoss(aivn_model)

In [23]:
# Shared evaluator over the held-out test partition: every model is scored on
# the same sentence pairs against the same gold similarity scores.
# NOTE(review): sentences are passed as raw text; `tokenize` from pyvi is
# imported at the top of the notebook but never applied. The RobertaModel-based
# checkpoints may expect word-segmented input — confirm against their model
# cards before trusting the comparison.
test_evaluator = EmbeddingSimilarityEvaluator(
    name="vi-sts-bench",  # prefix of the metric keys in the returned dict
    main_similarity=SimilarityFunction.COSINE,
    scores=ds_test["score"],
    sentences1=ds_test["sentence1"],
    sentences2=ds_test["sentence2"],
)

In [25]:
# Score the MODEL_IDS["vn_sbert"] model on the shared test evaluator.
test_evaluator(sbert_model)

{'vi-sts-bench_pearson_cosine': 0.7741504472811993,
 'vi-sts-bench_spearman_cosine': 0.7672006662775815}

In [26]:
# Score the MODEL_IDS["dvt"] model on the shared test evaluator.
test_evaluator(dvt_model)

{'vi-sts-bench_pearson_cosine': 0.7966283070690964,
 'vi-sts-bench_spearman_cosine': 0.7927477894929178}

In [27]:
# Score the MODEL_IDS["aivn"] model on the shared test evaluator.
test_evaluator(aivn_model)

{'vi-sts-bench_pearson_cosine': 0.7809618671297143,
 'vi-sts-bench_spearman_cosine': 0.7801241540920268}