In [None]:
! pip install -Uq tensorboard sentence-transformers datasets transformers

In [None]:
import wandb
# Initialise Weights & Biases in "disabled" mode for this session.
# NOTE(review): presumably this stops downstream libraries from logging to /
# prompting for a W&B account during the run — confirm it is still needed here.
wandb.init(mode="disabled")

In [None]:
from datasets.utils.logging import disable_progress_bar
# Switch off the progress bars emitted by the `datasets` library for the rest
# of the session.
disable_progress_bar()

#### **Create and Prepare embedding dataset**

In [None]:
from datasets import load_dataset

# Pull the "train" split of the Amharic news retrieval dataset from the
# Hugging Face Hub; each row holds one query together with its passage.
dataset = load_dataset(
    path="rasyosef/amharic-news-retrieval-dataset",
    split="train",
)
dataset

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


Dataset({
    features: ['query_id', 'passage_id', 'query', 'passage', 'category', 'link'],
    num_rows: 44708
})

In [None]:
# Rename both text columns in a single pass:
#   query   -> anchor
#   passage -> positive
dataset = dataset.rename_columns({"query": "anchor", "passage": "positive"})

In [None]:
# Add a sequential integer `id` column (0 .. len(dataset) - 1).
# The same id later keys both the corpus passages and the test queries, which
# is how each query is linked to its relevant passage.
# `add_column` is documented to take a list / np.array, so the range is
# materialised into a list before being passed in.
dataset = dataset.add_column("id", list(range(len(dataset))))
dataset

Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'id'],
    num_rows: 44708
})

In [None]:
# Hold out 10% of the rows as a test set, stratified by news category.
# `category` is class-encoded first because it is the column used for
# stratification; the fixed seed keeps the split reproducible.
dataset = (
    dataset
    .class_encode_column("category")
    .train_test_split(test_size=0.1, seed=16, stratify_by_column="category")
)
dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'id'],
        num_rows: 40237
    })
    test: Dataset({
        features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'id'],
        num_rows: 4471
    })
})

#### **Create baseline and evaluate pretrained model**

In [None]:
from datasets import concatenate_datasets

# Unpack the two splits produced by the train/test split.
train_dataset, test_dataset = dataset["train"], dataset["test"]

# The retrieval corpus is every passage, i.e. train rows followed by test rows.
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

corpus_dataset

Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'category', 'link', 'id'],
    num_rows: 44708
})

In [None]:
# Turn the datasets into plain dictionaries keyed by the row `id`.

# Corpus: cid => document (passages from both the train and the test rows)
corpus = {
    cid: document
    for cid, document in zip(corpus_dataset["id"], corpus_dataset["positive"])
}

# Queries: qid => question (test rows only)
queries = {
    qid: question
    for qid, question in zip(test_dataset["id"], test_dataset["anchor"])
}

In [None]:
# Map every query id to the list of its relevant corpus ids. Each query has
# exactly one relevant document here: the passage stored under the same row id.
# NOTE(review): this assumes no passage text is repeated under a different id;
# the dataset also carries `passage_id` — confirm there are no duplicates,
# otherwise retrieving a duplicate would be scored as a miss.
relevant_docs = {query_id: [query_id] for query_id in queries}

#### **Evaluate Amharic Embedding model**

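Models evaluated in this notebook (one at a time, selected via `model_id` in the next cell):
- rasyosef/roberta-amharic-text-embedding-base
- rasyosef/roberta-amharic-text-embedding-medium
- rasyosef/bert-amharic-text-embedding-medium
- intfloat/multilingual-e5-large-instruct
- Alibaba-NLP/gte-modernbert-base
- Alibaba-NLP/gte-multilingual-base
- Snowflake/snowflake-arctic-embed-l-v2.0
- rasyosef/snowflake-arctic-embed-l-v2.0-finetuned-amharic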


In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Model under evaluation. To evaluate a different one, replace the assignment
# below with one of these ids:
#
# Amharic embedding models
#   model_id = "rasyosef/roberta-amharic-text-embedding-base"
#   model_id = "rasyosef/roberta-amharic-text-embedding-medium"
#   model_id = "rasyosef/bert-amharic-text-embedding-medium"
#
# Other models
#   model_id = "intfloat/multilingual-e5-large-instruct"
#   model_id = "Alibaba-NLP/gte-modernbert-base"
#   model_id = "Alibaba-NLP/gte-multilingual-base"
#   model_id = "Snowflake/snowflake-arctic-embed-l-v2.0"
model_id = "rasyosef/snowflake-arctic-embed-l-v2.0-finetuned-amharic"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): trust_remote_code=True lets the model repository supply custom
# code that is executed locally — only use it with repositories you trust.
model = SentenceTransformer(model_id, device=device, trust_remote_code=True)

# Show the sequence length the model ships with, then cap it at 1024 tokens.
print(model.max_seq_length)
model.max_seq_length = min(model.max_seq_length, 1024)
model.max_seq_length

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim

# Evaluate at the model's full embedding size only; further (smaller)
# dimensions can be appended to `matryoshka_dimensions`.
EMBED_DIM = model.get_sentence_embedding_dimension()
matryoshka_dimensions = [EMBED_DIM]

print("Embedding Dimension:", EMBED_DIM)


def _make_ir_evaluator(dim):
  """Build the retrieval evaluator for embeddings truncated to `dim` dimensions.

  The evaluator scores `queries` against `corpus` with cosine similarity and
  is named `dim_{dim}`, which becomes the prefix of its metric keys.
  """
  return InformationRetrievalEvaluator(
      queries=queries,
      corpus=corpus,
      relevant_docs=relevant_docs,
      batch_size=256,
      name=f"dim_{dim}",
      truncate_dim=dim,
      score_functions={"cosine": cos_sim},
      mrr_at_k=[10, 100],
      ndcg_at_k=[10, 100],
      precision_recall_at_k=[5, 10, 50, 100],
      corpus_chunk_size=8192,
      show_progress_bar=True
  )


# One evaluator per requested dimension, run one after another.
matryoshka_evaluators = [_make_ir_evaluator(dim) for dim in matryoshka_dimensions]
evaluator = SequentialEvaluator(matryoshka_evaluators)

Embedding Dimension: 1024


In [None]:
# Run the sequential evaluator on the loaded model.
# `results` is indexed by metric key (e.g. "dim_<dim>_cosine_recall@10") in the
# reporting cell that follows.
results = evaluator(model)

In [None]:
# Scores for: rasyosef/snowflake-arctic-embed-l-v2.0-finetuned-amharic

dim = EMBED_DIM

# Metrics to report, in display order.
metric_names = [
    "recall@10",
    "recall@50",
    "recall@100",
    "mrr@10",
    "mrr@100",
    "ndcg@10",
    "ndcg@100",
]

# Keys under which the evaluator stored them: dim_<dim>_cosine_<metric>
metrics = [f"dim_{dim}_cosine_{name}" for name in metric_names]

for metric_name, key in zip(metric_names, metrics):
  print(f"{metric_name}: {round(results[key], 3)}")

recall@10: 0.942
recall@50: 0.977
recall@100: 0.985
mrr@10: 0.827
mrr@100: 0.829
ndcg@10: 0.855
ndcg@100: 0.865


### **Results**

##### **rasyosef/roberta-amharic-text-embedding-base**
- recall@10: 0.913
- recall@50: 0.964
- recall@100: 0.979
- mrr@10: 0.775
- mrr@100: 0.778
- ndcg@10: 0.808
- ndcg@100: 0.823

##### **rasyosef/roberta-amharic-text-embedding-medium**
- recall@10: 0.884
- recall@50: 0.955
- recall@100: 0.971
- mrr@10: 0.735
- mrr@100: 0.739
- ndcg@10: 0.771
- ndcg@100: 0.79

##### **rasyosef/bert-amharic-text-embedding-medium**
- recall@10: 0.843
- recall@50: 0.931
- recall@100: 0.954
- mrr@10: 0.682
- mrr@100: 0.686
- ndcg@10: 0.72
- ndcg@100: 0.744

##### **intfloat/multilingual-e5-large-instruct**
- recall@10: 0.825
- recall@50: 0.911
- recall@100: 0.931
- mrr@10: 0.672
- mrr@100: 0.676
- ndcg@10: 0.709
- ndcg@100: 0.732

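##### **Alibaba-NLP/gte-modernbert-base**

*Note: the scores below are close to zero, far below every other model in this comparison. This is likely because the model is primarily English-oriented and has little or no Amharic coverage (check the model card to confirm), rather than an evaluation error.*
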
- recall@10: 0.033
- recall@50: 0.051
- recall@100: 0.067
- mrr@10: 0.019
- mrr@100: 0.021
- ndcg@10: 0.023
- ndcg@100: 0.029


##### **Alibaba-NLP/gte-multilingual-base**
- recall@10: 0.76
- recall@50: 0.851
- recall@100: 0.882
- mrr@10: 0.6
- mrr@100: 0.605
- ndcg@10: 0.638
- ndcg@100: 0.664

##### **Snowflake/snowflake-arctic-embed-l-v2.0**
- recall@10: 0.831
- recall@50: 0.922
- recall@100: 0.942
- mrr@10: 0.659
- mrr@100: 0.664
- ndcg@10: 0.701
- ndcg@100: 0.725

##### **rasyosef/snowflake-arctic-embed-l-v2.0-finetuned-amharic**

- recall@10: 0.942
- recall@50: 0.977
- recall@100: 0.985
- mrr@10: 0.827
- mrr@100: 0.829
- ndcg@10: 0.855
- ndcg@100: 0.865

### **Example**

In [None]:
# Four Amharic texts: two short headline-style texts ([0], [1]) followed by
# two longer passages ([2], [3]).
# NOTE(review): judging by the recorded similarity matrix, [0] pairs with [2]
# and [1] pairs with [3] — confirm against the source articles.
sentences = [
  "የተደጋገመው የመሬት መንቀጥቀጥና የእሳተ ገሞራ ምልክት በአፋር ክልል",
  "የዋጋ ግሽበት በባለሙያዎች እና ነዋሪዎች አተያይ",
  "ከተደጋጋሚ መሬት መንቀጥቀጥ በኋላ አፋር ክልል እሳት ከመሬት ውስጥ ሲፈላ ታይቷል፡፡ ከመሬት ውስጥ እሳትና ጭስ የሚተፋው እንፋሎቱ ዛሬ ማለዳውን 11 ሰዓት ግድም ከከባድ ፍንዳታ በኋላየተስተዋለ መሆኑን የአከባቢው ነዋሪዎች እና ባለስልጣናት ለዶቼ ቬለ ተናግረዋል፡፡ አለት የሚያፈናጥር እሳት ነው የተባለው እንፋሎቱ በክልሉ ጋቢረሱ (ዞን 03) ዱለቻ ወረዳ ሰጋንቶ ቀበሌ መከሰቱን የገለጹት የአከባቢው የአይን እማኞች ከዋናው ፍንዳታ በተጨማሪ በዙሪያው ተጨማሪ ፍንዳታዎች መታየት ቀጥሏል ባይ ናቸው፡፡",
  "ለኢትዮጵያ ብሔራዊ ባንክ ዋጋን የማረጋጋት ቀዳሚ ዓላማ ጋር የተጣጣሙ የገንዘብ ፖሊሲ ምክረ ሀሳቦችን እንዲሰጥ የተቋቋመው የኢትዮጵያ ብሔራዊ ባንክ የገንዘብ ፖሊሲ ኮሚቴ እስካለፈው ህዳር ወር የነበረው እአአ የ2024 የዋጋ ግሽበት በተለይምምግብ ነክ ምርቶች ላይ ከአንድ ዓመት በፊት ከነበው ጋር ሲነጻጸር መረጋጋት ማሳየቱን ጠቁሟል፡፡ ዶይቼ ቬለ ያነጋገራቸው የአዲስ አበባ ነዋሪዎች ግን በዚህ የሚስማሙ አይመስልም፡፡ ከአምና አንጻር ያልጨመረ ነገር የለም ባይ ናቸው፡፡ የኢኮኖሚ  ባለሙያም በሰጡን አስተያየት ጭማሪው በሁሉም ረገድ የተስተዋለ በመሆኑ የመንግስት ወጪን በመቀነስ ግብርናው ላይ አተኩሮ መስራት ምናልባትም የዋጋ መረጋጋቱን ሊያመጣ ይችላል ይላሉ፡፡"
]
# Embed all four texts with the model loaded earlier in the notebook.
embeddings = model.encode(sentences)

# Score every text against every other one; entry [i][j] of the printed
# matrix is the similarity between sentences[i] and sentences[j].
similarities = model.similarity(embeddings, embeddings)
print(similarities)

tensor([[1.0000, 0.1400, 0.6069, 0.0815],
        [0.1400, 1.0000, 0.0104, 0.6810],
        [0.6069, 0.0104, 1.0000, 0.0133],
        [0.0815, 0.6810, 0.0133, 1.0000]])
