In [None]:
! pip install -Uq torch tensorboard sentence-transformers datasets transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatibl

In [None]:
# Start Weights & Biases in "disabled" mode.
# NOTE(review): presumably this keeps the trainer further down from logging to W&B — confirm.
import wandb
wandb.init(mode="disabled")

#### **Create and Prepare embedding dataset**

In [None]:
from datasets import load_dataset

# Load the train split and keep only the rows whose label equals 1.
# (Chain `.select(range(4000))` after the filter to work on a smaller subset.)
dataset = load_dataset("./amharic-dataset-binary", split="train").filter(
    lambda example: example["label"] == 1
)
dataset

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


README.md:   0%|          | 0.00/432 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/124652 [00:00<?, ? examples/s]

Filter:   0%|          | 0/124652 [00:00<?, ? examples/s]

Dataset({
    features: ['query_id', 'passage_id', 'query', 'passage', 'label'],
    num_rows: 31163
})

In [None]:
# Rename the text columns: "query" -> "anchor" and "passage" -> "positive".
dataset = (
    dataset
    .rename_column("query", "anchor")
    .rename_column("passage", "positive")
)

In [None]:
# Give every row a sequential integer "id" (0 .. len(dataset) - 1).
row_ids = range(len(dataset))
dataset = dataset.add_column("id", row_ids)
dataset

Flattening the indices:   0%|          | 0/31163 [00:00<?, ? examples/s]

Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'label', 'id'],
    num_rows: 31163
})

In [None]:
# Hold out 10% of the rows as the test split; the seed is fixed at 7.
dataset = dataset.train_test_split(seed=7, test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'passage_id', 'anchor', 'positive', 'label', 'id'],
        num_rows: 28046
    })
    test: Dataset({
        features: ['query_id', 'passage_id', 'anchor', 'positive', 'label', 'id'],
        num_rows: 3117
    })
})

In [None]:
# Write each split to its own JSON file in the working directory.
train_split, test_split = dataset["train"], dataset["test"]
train_split.to_json("train_dataset.json", orient="records")
test_split.to_json("test_dataset.json", orient="records")

Creating json from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

12602505

#### **Create baseline and evaluate pretrained model**

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim
from datasets import load_dataset, concatenate_datasets

# Read both splits back from the JSON files written earlier.
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")
test_dataset = load_dataset("json", data_files="test_dataset.json", split="train")

# The retrieval corpus is the train rows followed by the test rows.
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

corpus_dataset

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['query_id', 'passage_id', 'anchor', 'positive', 'label', 'id'],
    num_rows: 31163
})

In [None]:
# Corpus: id -> passage text, taken from every row (train + test).
corpus = {
    cid: document
    for cid, document in zip(corpus_dataset["id"], corpus_dataset["positive"])
}

# Queries: id -> query text, taken from the test rows only.
queries = {
    qid: question
    for qid, question in zip(test_dataset["id"], test_dataset["anchor"])
}

In [None]:
# Each query has exactly one relevant document: the corpus entry sharing its id.
relevant_docs = {q_id: [q_id] for q_id in queries}

In [None]:
matryoshka_dimensions = [768, 512, 384, 256, 128, 64]


def _build_ir_evaluator(dim):
  """Return an InformationRetrievalEvaluator named `dim_<dim>` with `truncate_dim=dim`."""
  return InformationRetrievalEvaluator(
      queries=queries,
      corpus=corpus,
      relevant_docs=relevant_docs,
      name=f"dim_{dim}",
      truncate_dim=dim,
      score_functions={"cosine": cos_sim},
  )


# One evaluator per dimension, combined into a single sequential evaluator.
matryoshka_evaluators = [
    _build_ir_evaluator(dim) for dim in matryoshka_dimensions
]
evaluator = SequentialEvaluator(matryoshka_evaluators)

#### **Evaluate RoBERTa Base Amharic Embed (33M)**

In [None]:
import torch
from sentence_transformers import SentenceTransformer

model_id = "yosefw/roberta-base-am-embed"

# Load the baseline checkpoint, on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_id, device=device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.00k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Run the sequential evaluator on the model; `results` is indexed below by metric keys
# of the form "dim_<dim>_cosine_<metric>".
results = evaluator(model)

# Print NDCG@10 for every embedding dimension.
for dim in matryoshka_dimensions:
  key = f"dim_{dim}_cosine_ndcg@10"
  print(f"{key}: {results[key]}")

dim_768_cosine_ndcg@10: 0.1452150968839238
dim_512_cosine_ndcg@10: 0.13653228366555137
dim_384_cosine_ndcg@10: 0.13786939505355503
dim_256_cosine_ndcg@10: 0.11244131687353581
dim_128_cosine_ndcg@10: 0.070480582228537
dim_64_cosine_ndcg@10: 0.03256639005101119


In [None]:
# Print recall@5 for every embedding dimension (reuses `results` from the evaluation cell).
for dim in matryoshka_dimensions:
  key = f"dim_{dim}_cosine_recall@5"
  print(f"{key}: {results[key]}")

dim_768_cosine_recall@5: 0.17131857555341676
dim_512_cosine_recall@5: 0.15944818735964067
dim_384_cosine_recall@5: 0.16041065126724416
dim_256_cosine_recall@5: 0.1286493423163298
dim_128_cosine_recall@5: 0.08533846647417388
dim_64_cosine_recall@5: 0.0381777350016041


#### **Define loss function with Matryoshka Representation**

In [None]:
from sentence_transformers import SentenceTransformer, SentenceTransformerModelCardData

model_id = "yosefw/roberta-base-am-embed"

# Reload the base checkpoint for fine-tuning, requesting SDPA attention and
# attaching the metadata used for the model card.
model = SentenceTransformer(
    model_id,
    model_kwargs={"attn_implementation": "sdpa"},
    model_card_data=SentenceTransformerModelCardData(
        language="am",  # ISO 639-1 code for Amharic; "en" mislabelled the model as English
        license="apache-2.0",
        model_name="RoBERTa Amharic Text Embedding Base"
    )
)

In [None]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 384, 256, 128, 64]

# MatryoshkaLoss wraps the MultipleNegativesRankingLoss and receives the list of dimensions.
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

#### **Fine-tune embedding model with** `SentenceTransformerTrainer`

In [None]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# Reload the training split from the JSON file written earlier in the notebook.
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")

# Training configuration: 4 epochs, evaluation and checkpointing once per epoch,
# and the best checkpoint (by the metric below) restored when training ends.
args = SentenceTransformerTrainingArguments(
    output_dir="roberta-base-amharic-embedding-matryoshka",
    num_train_epochs=4,
    per_device_train_batch_size=128, #32
    # gradient_accumulation_steps=4,
    per_device_eval_batch_size=128, #32
    warmup_ratio=0.1,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",
    fp16=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="epoch",
    save_strategy="epoch",
    # The string "none" turns off every logging integration. Passing None does not:
    # transformers treats None as "use the default", which reports to all installed integrations.
    report_to="none",
    logging_steps=10,
    save_total_limit=3,
    load_best_model_at_end=True,
    # NDCG@10 of the evaluator named "dim_128", with the trainer's "eval_" prefix.
    metric_for_best_model="eval_dim_128_cosine_ndcg@10",
)

In [None]:
from sentence_transformers import SentenceTransformerTrainer

# Only the two text columns are passed to the trainer.
train_pairs = train_dataset.select_columns(["anchor", "positive"])

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_pairs,
    loss=train_loss,
    evaluator=evaluator,
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Dim 768 Cosine Accuracy@1,Dim 768 Cosine Accuracy@3,Dim 768 Cosine Accuracy@5,Dim 768 Cosine Accuracy@10,Dim 768 Cosine Precision@1,Dim 768 Cosine Precision@3,Dim 768 Cosine Precision@5,Dim 768 Cosine Precision@10,Dim 768 Cosine Recall@1,Dim 768 Cosine Recall@3,Dim 768 Cosine Recall@5,Dim 768 Cosine Recall@10,Dim 768 Cosine Ndcg@10,Dim 768 Cosine Mrr@10,Dim 768 Cosine Map@100,Dim 512 Cosine Accuracy@1,Dim 512 Cosine Accuracy@3,Dim 512 Cosine Accuracy@5,Dim 512 Cosine Accuracy@10,Dim 512 Cosine Precision@1,Dim 512 Cosine Precision@3,Dim 512 Cosine Precision@5,Dim 512 Cosine Precision@10,Dim 512 Cosine Recall@1,Dim 512 Cosine Recall@3,Dim 512 Cosine Recall@5,Dim 512 Cosine Recall@10,Dim 512 Cosine Ndcg@10,Dim 512 Cosine Mrr@10,Dim 512 Cosine Map@100,Dim 384 Cosine Accuracy@1,Dim 384 Cosine Accuracy@3,Dim 384 Cosine Accuracy@5,Dim 384 Cosine Accuracy@10,Dim 384 Cosine Precision@1,Dim 384 Cosine Precision@3,Dim 384 Cosine Precision@5,Dim 384 Cosine Precision@10,Dim 384 Cosine Recall@1,Dim 384 Cosine Recall@3,Dim 384 Cosine Recall@5,Dim 384 Cosine Recall@10,Dim 384 Cosine Ndcg@10,Dim 384 Cosine Mrr@10,Dim 384 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100,Dim 64 Cosine Accuracy@1,Dim 64 Cosine Accuracy@3,Dim 64 Cosine 
Accuracy@5,Dim 64 Cosine Accuracy@10,Dim 64 Cosine Precision@1,Dim 64 Cosine Precision@3,Dim 64 Cosine Precision@5,Dim 64 Cosine Precision@10,Dim 64 Cosine Recall@1,Dim 64 Cosine Recall@3,Dim 64 Cosine Recall@5,Dim 64 Cosine Recall@10,Dim 64 Cosine Ndcg@10,Dim 64 Cosine Mrr@10,Dim 64 Cosine Map@100,Sequential Score
1,0.9636,No log,0.65608,0.784408,0.827719,0.875521,0.65608,0.261469,0.165544,0.087552,0.65608,0.784408,0.827719,0.875521,0.765574,0.730417,0.734337,0.649021,0.783125,0.824832,0.873596,0.649021,0.261042,0.164966,0.08736,0.649021,0.783125,0.824832,0.873596,0.761366,0.725501,0.729454,0.648701,0.780879,0.819698,0.867822,0.648701,0.260293,0.16394,0.086782,0.648701,0.780879,0.819698,0.867822,0.758766,0.72379,0.727979,0.639076,0.774783,0.818415,0.865897,0.639076,0.258261,0.163683,0.08659,0.639076,0.774783,0.818415,0.865897,0.753442,0.717411,0.721637,0.618864,0.764196,0.805903,0.855951,0.618864,0.254732,0.161181,0.085595,0.618864,0.764196,0.805903,0.855951,0.738546,0.700887,0.705295,0.58197,0.734039,0.7812,0.832852,0.58197,0.24468,0.15624,0.083285,0.58197,0.734039,0.7812,0.832852,0.708112,0.668097,0.672888,0.708112
2,0.2214,No log,0.666987,0.792749,0.838627,0.888354,0.666987,0.26425,0.167725,0.088835,0.666987,0.792749,0.838627,0.888354,0.777269,0.741739,0.745424,0.6641,0.795637,0.838948,0.885467,0.6641,0.265212,0.16779,0.088547,0.6641,0.795637,0.838948,0.885467,0.775397,0.740113,0.743875,0.661534,0.792749,0.834135,0.885788,0.661534,0.26425,0.166827,0.088579,0.661534,0.792749,0.834135,0.885788,0.773764,0.737919,0.741543,0.651588,0.78922,0.836702,0.87905,0.651588,0.263073,0.16734,0.087905,0.651588,0.78922,0.836702,0.87905,0.767002,0.730878,0.734856,0.638114,0.775425,0.823548,0.872955,0.638114,0.258475,0.16471,0.087295,0.638114,0.775425,0.823548,0.872955,0.755169,0.717454,0.721409,0.611165,0.753288,0.80077,0.857235,0.611165,0.251096,0.160154,0.085723,0.611165,0.753288,0.80077,0.857235,0.733219,0.693666,0.698206,0.733219
3,0.0598,No log,0.677254,0.808149,0.847931,0.892525,0.677254,0.269383,0.169586,0.089252,0.677254,0.808149,0.847931,0.892525,0.785733,0.751439,0.754782,0.6718,0.80847,0.846968,0.888033,0.6718,0.26949,0.169394,0.088803,0.6718,0.80847,0.846968,0.888033,0.782079,0.747893,0.751559,0.671158,0.803657,0.846327,0.889317,0.671158,0.267886,0.169265,0.088932,0.671158,0.803657,0.846327,0.889317,0.781089,0.746292,0.749759,0.666987,0.798524,0.844402,0.884825,0.666987,0.266175,0.16888,0.088483,0.666987,0.798524,0.844402,0.884825,0.776605,0.741803,0.74552,0.652871,0.789862,0.834135,0.874559,0.652871,0.263287,0.166827,0.087456,0.652871,0.789862,0.834135,0.874559,0.76505,0.729727,0.733757,0.623997,0.766442,0.817132,0.865255,0.623997,0.255481,0.163426,0.086526,0.623997,0.766442,0.817132,0.865255,0.744313,0.705564,0.709792,0.744313
4,0.0417,No log,0.683349,0.807507,0.851139,0.896696,0.683349,0.269169,0.170228,0.08967,0.683349,0.807507,0.851139,0.896696,0.789919,0.755753,0.758915,0.679179,0.810395,0.852101,0.893808,0.679179,0.270132,0.17042,0.089381,0.679179,0.810395,0.852101,0.893808,0.788128,0.754105,0.757428,0.674366,0.807828,0.852422,0.891562,0.674366,0.269276,0.170484,0.089156,0.674366,0.807828,0.852422,0.891562,0.784618,0.750132,0.753583,0.675008,0.803978,0.847931,0.889637,0.675008,0.267993,0.169586,0.088964,0.675008,0.803978,0.847931,0.889637,0.783319,0.749137,0.752595,0.658646,0.793712,0.838306,0.877446,0.658646,0.264571,0.167661,0.087745,0.658646,0.793712,0.838306,0.877446,0.769539,0.734728,0.738713,0.633622,0.777671,0.823227,0.866859,0.633622,0.259224,0.164645,0.086686,0.633622,0.777671,0.823227,0.866859,0.751518,0.714395,0.718559,0.751518


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=880, training_loss=0.8123623002997854, metrics={'train_runtime': 1078.2205, 'train_samples_per_second': 104.046, 'train_steps_per_second': 0.816, 'total_flos': 0.0, 'train_loss': 0.8123623002997854, 'epoch': 4.0})

In [None]:
# Save the trainer's model. The training arguments set load_best_model_at_end=True,
# so this is expected to be the best checkpoint rather than the last one.
# NOTE(review): no path is given; a later cell reloads from args.output_dir — confirm that is the save location.
trainer.save_model()

#### **Evaluate fine-tuned model against baseline**

In [None]:
from sentence_transformers import SentenceTransformer

# Reload the fine-tuned model from the training output directory,
# on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
fine_tuned_model = SentenceTransformer(args.output_dir, device=device)

# Run the same sequential evaluator that was used for the baseline.
results = evaluator(fine_tuned_model)

# Print NDCG@10 for every embedding dimension.
for dim in matryoshka_dimensions:
  key = f"dim_{dim}_cosine_ndcg@10"
  print(f"{key}: {results[key]}")

dim_768_cosine_ndcg@10: 0.789918502095199
dim_512_cosine_ndcg@10: 0.788021297021498
dim_384_cosine_ndcg@10: 0.7843401629873883
dim_256_cosine_ndcg@10: 0.783158767546139
dim_128_cosine_ndcg@10: 0.7695288822602219
dim_64_cosine_ndcg@10: 0.7513815826597857


In [None]:
# Print recall@5 for every embedding dimension (reuses `results` from the evaluation cell).
for dim in matryoshka_dimensions:
  key = f"dim_{dim}_cosine_recall@5"
  print(f"{key}: {results[key]}")

dim_768_cosine_recall@5: 0.8511389156239975
dim_512_cosine_recall@5: 0.8521013795316009
dim_384_cosine_recall@5: 0.8524222008341353
dim_256_cosine_recall@5: 0.8479307025986526
dim_128_cosine_recall@5: 0.8379852422200834
dim_64_cosine_recall@5: 0.823227462303497


In [None]:
# Display the module stack of the in-memory `model`.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 510, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
# 1. The sentences to encode
sentences = [
  "የተደጋገመው የመሬት መንቀጥቀጥና የእሳተ ገሞራ ምልክት በአፋር ክልል",
  "በአክሱም ከተማ የሚገኙ ሙስሊም ሴት ተማሪዎች ከሒጃብ መልበስ ጋር በተያያዘ ውዝግብ ከትምህርት ገበታ ውጭ ሆነው እንደሚገኙ የትግራይ እስልምና ጉዳዮች ምክርቤት ስታወቀ። ይህን ለመፍታት ከክልሉ ትምህርት ቢሮ ጋር ንግግር ላይ መሆኑም የክልሉ እስልምና ጉዳዮች ምክርቤት ለዶቼቬለ ገልጿል።",
  "በማዕከላዊ ኢትዮጵያ ክልል ሃድያ ዞን ጊቤ ወረዳ በሚገኙ 12 ቀበሌዎች መሠረታዊ የመንግሥት አገልግሎት መስጫ ተቋማት በሙሉና በከፊል በመዘጋታቸው መቸገራቸውን ነዋሪዎች አመለከቱ። ከባለፈው ዓመት ጀምሮ የጤና፣ የትምህርት እና የግብር አሰባሰብ ሥራዎች በየአካባቢያቸው እየተከናወኑ አለመሆናቸውንም ለዶቼ ቬለ ተናግረዋል።",
  "የሕዝብ ተወካዮች ምክር ቤት አባል እና የቋሚ ኮሚቴ ሰብሳቢ የነበሩት አቶ ክርስቲያን ታደለ እና የአማራ ክልል ምክር ቤት አባል የሆኑት አቶ ዮሐንስ ቧያለው ከቃሊቲ ወደ ቂሊንጦ ማረሚያ ቤት መዛወራቸውን ጠበቃቸው ተናገሩ።",
  "ከተደጋጋሚ መሬት መንቀጥቀጥ በኋላ አፋር ክልል እሳት ከመሬት ውስጥ ሲፈላ ታይቷል፡፡ ከመሬት ውስጥ እሳትና ጭስ የሚተፋው እንፋሎቱ ዛሬ ማለዳውን 11 ሰዓት ግድም ከከባድ ፍንዳታ በኋላየተስተዋለ መሆኑን የአከባቢው ነዋሪዎች እና ባለስልጣናት ለዶቼ ቬለ ተናግረዋል፡፡ አለት የሚያፈናጥር እሳት ነው የተባለው እንፋሎቱ በክልሉ ጋቢረሱ (ዞን 03) ዱለቻ ወረዳ ሰጋንቶ ቀበሌ መከሰቱን የገለጹት የአከባቢው የአይን እማኞች ከዋናው ፍንዳታ በተጨማሪ በዙሪያው ተጨማሪ ፍንዳታዎች መታየት ቀጥሏል ባይ ናቸው፡፡"
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# (5, 768) — one row per sentence

# 3. Calculate the pairwise embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)

(5, 768)
tensor([[ 1.0000,  0.0926,  0.0297, -0.0675,  0.6645],
        [ 0.0926,  1.0000,  0.2751,  0.0907,  0.0994],
        [ 0.0297,  0.2751,  1.0000,  0.0474,  0.1517],
        [-0.0675,  0.0907,  0.0474,  1.0000,  0.0028],
        [ 0.6645,  0.0994,  0.1517,  0.0028,  1.0000]])


In [None]:
from google.colab import userdata

# push model to hub
# trainer.model.push_to_hub("roberta-amharic-text-embedding-base", token=userdata.get("HF_WRITE"))

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

'https://huggingface.co/rasyosef/roberta-amharic-text-embedding-base/commit/1452588155c7dd4a76c00ccac713f9a8f18514af'

In [None]:
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("rasyosef/roberta-amharic-text-embedding-base")
# Run inference
sentences = [
  "የተደጋገመው የመሬት መንቀጥቀጥና የእሳተ ገሞራ ምልክት በአፋር ክልል",
  "ከተደጋጋሚ መሬት መንቀጥቀጥ በኋላ አፋር ክልል እሳት ከመሬት ውስጥ ሲፈላ ታይቷል፡፡ ከመሬት ውስጥ እሳትና ጭስ የሚተፋው እንፋሎቱ ዛሬ ማለዳውን 11 ሰዓት ግድም ከከባድ ፍንዳታ በኋላየተስተዋለ መሆኑን የአከባቢው ነዋሪዎች እና ባለስልጣናት ለዶቼ ቬለ ተናግረዋል፡፡ አለት የሚያፈናጥር እሳት ነው የተባለው እንፋሎቱ በክልሉ ጋቢረሱ (ዞን 03) ዱለቻ ወረዳ ሰጋንቶ ቀበሌ መከሰቱን የገለጹት የአከባቢው የአይን እማኞች ከዋናው ፍንዳታ በተጨማሪ በዙሪያው ተጨማሪ ፍንዳታዎች መታየት ቀጥሏል ባይ ናቸው፡፡",
  "ለኢትዮጵያ ብሔራዊ ባንክ ዋጋን የማረጋጋት ቀዳሚ ዓላማ ጋር የተጣጣሙ የገንዘብ ፖሊሲ ምክረ ሀሳቦችን እንዲሰጥ የተቋቋመው የኢትዮጵያ ብሔራዊ ባንክ የገንዘብ ፖሊሲ ኮሚቴ እስካለፈው ህዳር ወር የነበረው እአአ የ2024 የዋጋ ግሽበት በተለይምምግብ ነክ ምርቶች ላይ ከአንድ ዓመት በፊት ከነበው ጋር ሲነጻጸር መረጋጋት ማሳየቱን ጠቁሟል፡፡ ዶይቼ ቬለ ያነጋገራቸው የአዲስ አበባ ነዋሪዎች ግን በዚህ የሚስማሙ አይመስልም፡፡ ከአምና አንጻር ያልጨመረ ነገር የለም ባይ ናቸው፡፡ የኢኮኖሚ  ባለሙያም በሰጡን አስተያየት ጭማሪው በሁሉም ረገድ የተስተዋለ በመሆኑ የመንግስት ወጪን በመቀነስ ግብርናው ላይ አተኩሮ መስራት ምናልባትም የዋጋ መረጋጋቱን ሊያመጣ ይችላል ይላሉ፡፡"
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# (3, 768) — one row per sentence

# Get the pairwise similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities, similarities.shape)
# [3, 3]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/83.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

(3, 768)
tensor([[1.0000, 0.6644, 0.0478],
        [0.6644, 1.0000, 0.0021],
        [0.0478, 0.0021, 1.0000]]) torch.Size([3, 3])


In [None]:
# Two short texts (indices 0 and 1) followed by two longer passages (indices 2 and 3).
sentences = [
  "የተደጋገመው የመሬት መንቀጥቀጥና የእሳተ ገሞራ ምልክት በአፋር ክልል",
  "የዋጋ ግሽበት በባለሙያዎች እና ነዋሪዎች አተያይ",
  "ከተደጋጋሚ መሬት መንቀጥቀጥ በኋላ አፋር ክልል እሳት ከመሬት ውስጥ ሲፈላ ታይቷል፡፡ ከመሬት ውስጥ እሳትና ጭስ የሚተፋው እንፋሎቱ ዛሬ ማለዳውን 11 ሰዓት ግድም ከከባድ ፍንዳታ በኋላየተስተዋለ መሆኑን የአከባቢው ነዋሪዎች እና ባለስልጣናት ለዶቼ ቬለ ተናግረዋል፡፡ አለት የሚያፈናጥር እሳት ነው የተባለው እንፋሎቱ በክልሉ ጋቢረሱ (ዞን 03) ዱለቻ ወረዳ ሰጋንቶ ቀበሌ መከሰቱን የገለጹት የአከባቢው የአይን እማኞች ከዋናው ፍንዳታ በተጨማሪ በዙሪያው ተጨማሪ ፍንዳታዎች መታየት ቀጥሏል ባይ ናቸው፡፡",
  "ለኢትዮጵያ ብሔራዊ ባንክ ዋጋን የማረጋጋት ቀዳሚ ዓላማ ጋር የተጣጣሙ የገንዘብ ፖሊሲ ምክረ ሀሳቦችን እንዲሰጥ የተቋቋመው የኢትዮጵያ ብሔራዊ ባንክ የገንዘብ ፖሊሲ ኮሚቴ እስካለፈው ህዳር ወር የነበረው እአአ የ2024 የዋጋ ግሽበት በተለይምምግብ ነክ ምርቶች ላይ ከአንድ ዓመት በፊት ከነበው ጋር ሲነጻጸር መረጋጋት ማሳየቱን ጠቁሟል፡፡ ዶይቼ ቬለ ያነጋገራቸው የአዲስ አበባ ነዋሪዎች ግን በዚህ የሚስማሙ አይመስልም፡፡ ከአምና አንጻር ያልጨመረ ነገር የለም ባይ ናቸው፡፡ የኢኮኖሚ  ባለሙያም በሰጡን አስተያየት ጭማሪው በሁሉም ረገድ የተስተዋለ በመሆኑ የመንግስት ወጪን በመቀነስ ግብርናው ላይ አተኩሮ መስራት ምናልባትም የዋጋ መረጋጋቱን ሊያመጣ ይችላል ይላሉ፡፡"
]
# Encode all four texts with the model loaded in the previous cell.
embeddings = model.encode(sentences)

# Get the pairwise similarity scores for the embeddings (a 4 x 4 matrix)
similarities = model.similarity(embeddings, embeddings)
print(similarities)

tensor([[1.0000, 0.1963, 0.6644, 0.0478],
        [0.1963, 1.0000, 0.1050, 0.7985],
        [0.6644, 0.1050, 1.0000, 0.0021],
        [0.0478, 0.7985, 0.0021, 1.0000]])
