# Evaluation of the cosine similarities computed by  different types of models

In [1]:
from sentence_transformers import SentenceTransformer, util

## Loading models
*requires setting the HF_TOKEN secret*

In [3]:
# Sentence Transformer
sent_model = SentenceTransformer("all-MiniLM-L6-v2")
# SimCSE unsupervised
unsup_simcse = SentenceTransformer("princeton-nlp/unsup-simcse-roberta-base")
# SimCSE supervised
sup_simcse = SentenceTransformer("princeton-nlp/sup-simcse-roberta-base")
# E5
e5_model = SentenceTransformer('intfloat/multilingual-e5-base')



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

## Initializations

In [4]:
dict_models = {
    "sent_model": sent_model,
    "unsup_simcse": unsup_simcse,
    "sup_simcse": sup_simcse,
    "e5_model": e5_model
}

dict_model_labels = {
    "sent_model": "Classical SentenceTransformer",
    "unsup_simcse": "SimCSE - unsupervised",
    "sup_simcse": "SimCSE - supervised",
    "e5_model": "E5"
}

list_corpus = [
    "A man is slicing a cake carefully with a knife.",
    "Someone cuts a dessert with precision.",
    "A woman is slicing a cake.",
    "Elle découpe soigneusement la tarte en parts égales."
]

## Embedding corpus

In [5]:
# Corpus encoding for all models
dict_corpus_embeddings = {
    name: model.encode(list_corpus, convert_to_tensor=True)
    for name, model in dict_models.items()
}

## Embedding query

In [6]:
query = "Someone cuts a dessert with a knife."

# Query encoding for all models
dict_query_embeddings = {
    name: model.encode(query, convert_to_tensor=True)
    for name, model in dict_models.items()
}

## Processing

In [7]:
# Semantic search for all models
dict_hits = {
    name: util.semantic_search(emb, dict_corpus_embeddings[name])[0]
    for name, emb in dict_query_embeddings.items()
}

## Displaying results

In [8]:
print(f"\nQuery: {query}\n\nCosine similarity for each model:")

for name, hit_list in dict_hits.items():
    print(f"\n{dict_model_labels.get(name, name)}:")
    for hit in hit_list:
        print(f"  {hit['score']:.3f} - {list_corpus[hit['corpus_id']]}")


Query: Someone cuts a dessert with a knife.

Cosine similarity for each model:

Classical SentenceTransformer:
  0.870 - Someone cuts a dessert with precision.
  0.746 - A man is slicing a cake carefully with a knife.
  0.607 - A woman is slicing a cake.
  0.020 - Elle découpe soigneusement la tarte en parts égales.

SimCSE - unsupervised:
  0.861 - Someone cuts a dessert with precision.
  0.827 - A man is slicing a cake carefully with a knife.
  0.785 - A woman is slicing a cake.
  0.518 - Elle découpe soigneusement la tarte en parts égales.

SimCSE - supervised:
  0.957 - Someone cuts a dessert with precision.
  0.897 - A man is slicing a cake carefully with a knife.
  0.857 - A woman is slicing a cake.
  0.755 - Elle découpe soigneusement la tarte en parts égales.

E5:
  0.959 - Someone cuts a dessert with precision.
  0.918 - A man is slicing a cake carefully with a knife.
  0.881 - A woman is slicing a cake.
  0.817 - Elle découpe soigneusement la tarte en parts égales.
