# STS22 Benchmark

## Install and import packages

In [None]:
!pip install datasets sentence_transformers pandas

In [None]:
from datasets import load_dataset
from  sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample
import os
import pandas as pd

## Preparation

In [None]:
# Subset names passed to `load_dataset`. Only "all_languages" is enabled;
# uncomment individual entries to benchmark specific languages or pairs.
subsets = [
    "all_languages",
    # "ar",
    # "de",
    # "de-en",
    # "de-fr",
    # "de-pl",
    # "en",
    # "es",
    # "es-en",
    # "es-it",
    # "fr",
    # "fr-pl",
    # "it",
    # "pl",
    # "pl-en",
    # "ru",
    # "tr",
    # "zh",
    # "zh-en",
]

# Edges of the length bins: each pair of neighbouring values is one bin.
BINS = [0, 10, 20, 50, 100, 200, 500, 1000, 2000]
SUBSETS = subsets

# Test split of every selected subset, keyed by subset name.
sts22 = {
    name: load_dataset("mteb/sts22-crosslingual-sts", name, split="test")
    for name in subsets
}

In [None]:
# Group the sentence pairs of every subset into length bins.
#
# Result layout: samples[subset]["<lower>-<upper>"] -> list of InputExample.
# A pair belongs to a bin when the whitespace-separated token count of its
# first sentence satisfies lower <= count < upper. Every bin key is present
# even when its list is empty; pairs outside all bins are dropped.

# (key, lower, upper) of each bin, built once from neighbouring BINS edges.
bin_ranges = [(f"{lower}-{upper}", lower, upper) for lower, upper in zip(BINS, BINS[1:])]

samples = {}
for subset in SUBSETS:
    samples_subset = {key: [] for key, _, _ in bin_ranges}

    # Single pass over the dataset: the token count is computed once per pair
    # instead of once per bin.
    for item in sts22[subset]:
        word_count = len(item["sentence1"].split())
        for key, lower, upper in bin_ranges:
            if lower <= word_count < upper:
                samples_subset[key].append(
                    InputExample(
                        texts=[item["sentence1"], item["sentence2"]],
                        # Gold score is rescaled by a constant factor of 1/5.
                        label=item["score"] / 5,
                    )
                )

    samples[subset] = samples_subset

In [None]:
# Build one similarity evaluator per (subset, length bin).
# Result layout: evaluators[subset]["<lower>-<upper>"] -> evaluator.
evaluators = {}

for subset in SUBSETS:
    binned_examples = samples[subset]
    per_bin = {}

    for lower, upper in zip(BINS, BINS[1:]):
        key = f"{lower}-{upper}"
        examples = binned_examples[key]

        # Only bins holding at least two sentence pairs get an evaluator;
        # bins with fewer pairs are left out of the result entirely.
        if examples is not None and len(examples) >= 2:
            # NOTE(review): batch_size=1 is presumably meant to bound memory
            # use on very long texts -- confirm before raising it.
            per_bin[key] = EmbeddingSimilarityEvaluator.from_input_examples(
                examples,
                name=f"sts22-{key}",
                batch_size=1,
            )

    evaluators[subset] = per_bin

## Run benchmark

In [None]:
# Hugging Face model identifiers to benchmark.
models = [
    "jinaai/jina-embeddings-v2-base-en",
    "thenlper/gte-base",
    "intfloat/multilingual-e5-base",
]

# Run every evaluator against every model. Each evaluator is given the
# directory results/sts/sts22-<subset>/<model dir>/ as its output path.
for model_name in models:
    print("Evaluating model", model_name)
    # trust_remote_code=True allows the model repository to run its own Python
    # code while loading; only list models from sources you trust.
    model = SentenceTransformer(model_name, trust_remote_code=True)

    # Directory-safe form of the identifier, kept in its own variable so the
    # loop variable is not overwritten.
    model_dir = model_name.replace("/", "-")

    for subset, evaluators_subset in evaluators.items():
        print("Evaluating subset", subset)
        output_path = f"results/sts/sts22-{subset}/{model_dir}/"

        for key, evaluator in evaluators_subset.items():
            print("Evaluating bin", key)
            os.makedirs(output_path, exist_ok=True)
            evaluator(model, output_path=output_path)

## Results

In [None]:
# Print, for every model and subset, a table with one row per length bin:
# the bin range, the cosine-Spearman score read from the evaluator's CSV
# output, and the number of sentence pairs in that bin.
for model_name in models:
    # Must match the directory name used when the results were written
    # ("/" replaced by "-"); a different separator makes every read fail
    # with FileNotFoundError.
    model_dir = model_name.replace("/", "-")

    for subset in evaluators:
        extracted_data = {'Range': [], 'cosine_spearman': [], 'Sample Count': []}

        for key in evaluators[subset]:
            file_path = f"results/sts/sts22-{subset}/{model_dir}/similarity_evaluation_sts22-{key}_results.csv"
            df = pd.read_csv(file_path)

            # Use the last row of the file (presumably the most recent run
            # when the CSV holds several -- confirm against the evaluator).
            cosine_spearman = df['cosine_spearman'].iloc[-1]

            extracted_data['Range'].append(key)
            extracted_data['cosine_spearman'].append(cosine_spearman)
            extracted_data['Sample Count'].append(len(samples[subset][key]))

        extracted_df = pd.DataFrame(extracted_data)
        print("Model", model_name)
        print("Subset", subset)
        print(extracted_df)