In [1]:
import torch
from datasets import load_dataset

from sentence_transformers import SentenceTransformer, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

In [2]:
# Limit torch to 4 threads; set here, before any model is loaded or run.
torch.set_num_threads(4)

In [3]:
# Reference implementation of EmbeddingSimilarityEvaluator (used in the cells below):
# https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py

In [4]:
# Load the validation split of the STS benchmark: sentence pairs (sentence1, sentence2)
# with a gold similarity score.
validation_dataset = load_dataset("sentence-transformers/stsb", split="validation")
# Render the split as a DataFrame for inspection; the result is displayed, not stored.
validation_dataset.to_pandas()

Unnamed: 0,sentence1,sentence2,score
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,1.00
1,A young child is riding a horse.,A child is riding a horse.,0.95
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,1.00
3,A woman is playing the guitar.,A man is playing guitar.,0.48
4,A woman is playing the flute.,A man is playing a flute.,0.55
...,...,...,...
1495,Scientists prove there is water on Mars,Has Nasa discovered water on Mars?,0.40
1496,Pranab stresses need to strive for peace by na...,WTO: India regrets action of developed nations,0.00
1497,Volkswagen skids into red in wake of pollution...,"Volkswagen's ""gesture of goodwill"" to diesel o...",0.40
1498,Obama is right: Africa deserves better leadership,Obama waiting for midterm to name attorney gen...,0.00


In [6]:
# Load a pretrained sentence-embedding model by name and score it on the
# STS-B validation pairs. A local path may be passed to SentenceTransformer() instead.
model = SentenceTransformer("paraphrase-multilingual-minilm-l12-v2")

# Sentences are encoded one at a time (batch_size=1). No main_similarity is
# passed, so the evaluator falls back to its own default.
dev_evaluator = EmbeddingSimilarityEvaluator(
    validation_dataset["sentence1"],
    validation_dataset["sentence2"],
    validation_dataset["score"],
    name="sts-dev",
    batch_size=1,
)
dev_evaluator(model)

{'sts-dev_pearson_cosine': 0.8700415020412305,
 'sts-dev_spearman_cosine': 0.8745994206581585,
 'sts-dev_pearson_manhattan': 0.8582551158246639,
 'sts-dev_spearman_manhattan': 0.8601235801773527,
 'sts-dev_pearson_euclidean': 0.8591308863530069,
 'sts-dev_spearman_euclidean': 0.8611762359530402,
 'sts-dev_pearson_dot': 0.7708352627737998,
 'sts-dev_spearman_dot': 0.7778711785168627,
 'sts-dev_pearson_max': 0.8700415020412305,
 'sts-dev_spearman_max': 0.8745994206581585}

In [8]:
# Load the test split of the STS benchmark (same columns as the validation split).
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
# Render the split as a DataFrame for inspection; the result is displayed, not stored.
test_dataset.to_pandas()

Unnamed: 0,sentence1,sentence2,score
0,A girl is styling her hair.,A girl is brushing her hair.,0.50
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,0.72
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,1.00
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,0.84
4,A man is playing a harp.,A man is playing a keyboard.,0.30
...,...,...,...
1374,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks,0.00
1375,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe...",0.20
1376,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...,0.20
1377,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...,0.00


In [9]:
# Score the already-loaded `model` on the STS-B test pairs, with cosine
# similarity declared as the main similarity function.
test_evaluator = EmbeddingSimilarityEvaluator(
    test_dataset["sentence1"],
    test_dataset["sentence2"],
    test_dataset["score"],
    name="sts-test",
    main_similarity=SimilarityFunction.COSINE,
)
test_evaluator(model)

{'sts-test_pearson_cosine': 0.8342051141830861,
 'sts-test_spearman_cosine': 0.8441498881524113,
 'sts-test_pearson_manhattan': 0.8361050224552014,
 'sts-test_spearman_manhattan': 0.8378574034123987,
 'sts-test_pearson_euclidean': 0.8358012203669963,
 'sts-test_spearman_euclidean': 0.8374013083693628,
 'sts-test_pearson_dot': 0.7035295122002462,
 'sts-test_spearman_dot': 0.6981041091263,
 'sts-test_pearson_max': 0.8361050224552014,
 'sts-test_spearman_max': 0.8441498881524113}

### OpenAI Embeddings

In [14]:
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI

# Load variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())
# Read the API key from the environment instead of hard-coding it in the notebook.
openai_api_key = os.getenv('OPENAI_API_KEY')
# Shared OpenAI client; the embedding helpers defined later in the notebook use it as a global.
client = OpenAI(api_key=openai_api_key)

In [15]:
from typing import List

def get_embedding(text: str, model="text-embedding-3-small") -> List[float]:
    """Return the OpenAI embedding vector for a single piece of text.

    Newlines in ``text`` are replaced by spaces, then one request is sent
    through the module-level ``client``.

    Args:
        text: Text to embed.
        model: Name of the OpenAI embedding model to query.

    Returns:
        The ``embedding`` of the first item in the response data.
    """
    flattened = " ".join(text.split("\n"))
    request = {"input": [flattened], "model": model}
    first_item = client.embeddings.create(**request).data[0]
    return first_item.embedding

In [11]:
# Reload the validation split and keep it as a pandas DataFrame this time.
# NOTE: this rebinds `validation_dataset`, replacing the object loaded earlier
# in the notebook with a DataFrame.
validation_dataset = load_dataset("sentence-transformers/stsb", split="validation")
validation_dataset = validation_dataset.to_pandas()

In [17]:
# Embed every sentence of both columns; get_embedding is called once per
# sentence, first for all of sentence1 and then for all of sentence2.
embeddings1, embeddings2 = (
    validation_dataset[column].apply(
        lambda sentence: get_embedding(sentence, model='text-embedding-3-small')
    )
    for column in ("sentence1", "sentence2")
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
import ast

import numpy as np
import pandas as pd

# Persist the embeddings so they can be reloaded without calling the API again.
embeddings1.to_csv('embeddings1.csv', index=False)
embeddings2.to_csv('embeddings2.csv', index=False)

# Each CSV cell holds the text form of a Python list of floats. Parse it with
# ast.literal_eval, which accepts only literals, rather than eval, which would
# execute any code that ended up in the file.
df = pd.read_csv('embeddings1.csv')
embeddings1 = df.sentence1.apply(ast.literal_eval).apply(np.array)
df = pd.read_csv('embeddings2.csv')
embeddings2 = df.sentence2.apply(ast.literal_eval).apply(np.array)

In [19]:
# Inspect the sentence2 embeddings that were reloaded from embeddings2.csv.
embeddings2

0       [-0.003388892626389861, 0.018921080976724625, ...
1       [-0.011210653930902481, -0.034644100815057755,...
2       [0.024244142696261406, 0.004071642644703388, -...
3       [-0.010717172175645828, -0.01387102622538805, ...
4       [-0.018342604860663414, 0.0007186566945165396,...
                              ...                        
1495    [-0.000687376013956964, 0.021258048713207245, ...
1496    [-0.0047498648054897785, 0.021038316190242767,...
1497    [0.028107900172472, 0.01971922069787979, -0.02...
1498    [-0.00634743319824338, -0.0047687264159321785,...
1499    [-0.015754982829093933, -0.0007625228026881814...
Name: sentence2, Length: 1500, dtype: object

In [29]:
import openai
import numpy as np
import os
import csv
import logging
from sklearn.metrics.pairwise import paired_cosine_distances, paired_manhattan_distances, paired_euclidean_distances
from scipy.stats import pearsonr, spearmanr
from typing import List, Dict, Optional

logger = logging.getLogger(__name__)  # Module-level logger; the evaluator class below reports its correlation scores through it

def get_embedding(text: str, model: str = "text-embedding-3-small") -> List[float]:
    """Fetch the embedding of one text from the OpenAI embeddings endpoint.

    Line breaks in ``text`` are swapped for spaces before the request is
    issued through the module-level ``client``. This definition shadows the
    helper of the same name defined earlier in the notebook.

    Args:
        text: Text to embed.
        model: Name of the OpenAI embedding model to query.

    Returns:
        The embedding of the first (and only) input in the response.
    """
    cleaned = text.replace("\n", " ")
    response = client.embeddings.create(input=[cleaned], model=model)
    embedding = response.data[0].embedding
    return embedding

class OpenAIEmbeddingSimilarityEvaluator:
    """Correlate OpenAI-embedding similarities with gold similarity scores.

    For every pair ``(sentences1[i], sentences2[i])`` both sentences are
    embedded through the module-level OpenAI ``client``. Cosine similarity,
    negative Manhattan distance, negative Euclidean distance and dot product
    are computed per pair, and the Pearson and Spearman correlation of each
    against ``scores`` is reported.
    """

    # Log-line prefix for each similarity function, in reporting order.
    _LOG_LABELS = {
        "cosine": "Cosine-Similarity :",
        "manhattan": "Manhattan-Distance:",
        "euclidean": "Euclidean-Distance:",
        "dot": "Dot-Product-Similarity:",
    }

    def __init__(self, sentences1: List[str], sentences2: List[str], scores: List[float],
                 batch_size: int = 16, main_similarity: str = 'COSINE', name: str = "",
                 show_progress_bar: bool = False, write_csv: bool = True,
                 precision: Optional[str] = None, truncate_dim: Optional[int] = None,
                 model: str = "text-embedding-3-small"):
        """Initialize evaluator with sentences, scores, and other settings.

        Args:
            sentences1: First sentence of each pair.
            sentences2: Second sentence of each pair.
            scores: Gold similarity score of each pair.
            batch_size: Number of sentences sent per embeddings request
                (values below 1 are treated as 1).
            main_similarity: Stored for interface parity; not used by this class.
            name: Dataset label used in the log line and the CSV file name.
            show_progress_bar: Stored for interface parity; not used by this class.
            write_csv: Whether ``__call__`` appends results to a CSV file when
                it is given an ``output_path``.
            precision: ``"binary"`` or ``"ubinary"`` binarises the embeddings
                (1 where a component is > 0, else 0) before scoring; any other
                value leaves them untouched.
            truncate_dim: If set, only the first ``truncate_dim`` components of
                each embedding are kept (no re-normalisation is applied).
            model: Name of the OpenAI embedding model to query.

        Raises:
            AssertionError: If the three input sequences differ in length.
        """
        # Validate before storing anything so a bad call leaves no half-built state.
        assert len(sentences1) == len(sentences2) == len(scores), "Sentence lists and scores must be of equal length."

        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.scores = scores
        self.batch_size = batch_size
        self.main_similarity = main_similarity
        self.name = name
        self.show_progress_bar = show_progress_bar
        self.write_csv = write_csv
        self.precision = precision
        self.truncate_dim = truncate_dim
        self.model = model
        self.csv_file = f"similarity_evaluation_{name}.csv" if name else "similarity_evaluation_results.csv"
        self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman",
                            "euclidean_pearson", "euclidean_spearman", "manhattan_pearson",
                            "manhattan_spearman", "dot_pearson", "dot_spearman"]

    def _embed(self, sentences) -> np.ndarray:
        """Embed ``sentences`` in batches of ``batch_size`` and return a 2-D array."""
        texts = [sentence.replace("\n", " ") for sentence in sentences]
        step = max(1, int(self.batch_size))
        vectors = []
        for start in range(0, len(texts), step):
            response = client.embeddings.create(input=texts[start:start + step], model=self.model)
            # Each returned item carries the index of its input; sort on it so
            # the rows stay aligned with the sentences that were sent.
            ordered = sorted(response.data, key=lambda item: item.index)
            vectors.extend(item.embedding for item in ordered)
        return np.array(vectors)

    def _postprocess(self, embeddings: np.ndarray) -> np.ndarray:
        """Apply the configured truncation and precision to an embedding matrix."""
        if self.truncate_dim is not None:
            embeddings = embeddings[:, :self.truncate_dim]
        if self.precision in ("binary", "ubinary"):
            # The API returns floats, so binarise by sign. Keep a float dtype so
            # distances and dot products cannot overflow a small integer type.
            embeddings = (embeddings > 0).astype(np.float64)
        return embeddings

    def __call__(self, output_path: Optional[str] = None, epoch: int = -1, steps: int = -1) -> Dict[str, float]:
        """Evaluate the similarity between the two sentence sets.

        Args:
            output_path: Directory for the results CSV; nothing is written when
                it is falsy or ``write_csv`` is False. Created if missing.
            epoch: Epoch number recorded in the log line and CSV (-1 for none).
            steps: Step number recorded in the log line and CSV (-1 for none).

        Returns:
            Dict with ``pearson_*`` and ``spearman_*`` entries for ``cosine``,
            ``manhattan``, ``euclidean`` and ``dot``, plus ``pearson_max`` and
            ``spearman_max`` over the four similarity functions.
        """
        if epoch != -1:
            out_txt = f" after epoch {epoch}" if steps == -1 else f" in epoch {epoch} after {steps} steps"
        else:
            out_txt = ""
        logger.info(f"OpenAIEmbeddingSimilarityEvaluator: Evaluating the model on the {self.name} dataset{out_txt}:")

        embeddings1 = self._postprocess(self._embed(self.sentences1))
        embeddings2 = self._postprocess(self._embed(self.sentences2))

        # Ground truth labels for the sentence pairs
        labels = np.array(self.scores)

        # Distances are negated so that, like the similarities, larger means "more similar".
        similarities = {
            "cosine": 1 - paired_cosine_distances(embeddings1, embeddings2),
            "manhattan": -paired_manhattan_distances(embeddings1, embeddings2),
            "euclidean": -paired_euclidean_distances(embeddings1, embeddings2),
            "dot": np.sum(embeddings1 * embeddings2, axis=1),
        }

        metrics: Dict[str, float] = {}
        for key, values in similarities.items():
            pearson, _ = pearsonr(labels, values)
            spearman, _ = spearmanr(labels, values)
            metrics[f"pearson_{key}"] = pearson
            metrics[f"spearman_{key}"] = spearman
            logger.info(f"{self._LOG_LABELS[key]}\tPearson: {pearson:.4f}\tSpearman: {spearman:.4f}")

        # Write results to CSV if needed
        if output_path and self.write_csv:
            os.makedirs(output_path, exist_ok=True)
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            with open(csv_path, mode="a" if output_file_exists else "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)
                # Column order follows csv_headers: cosine, euclidean, manhattan, dot.
                writer.writerow([epoch, steps] + [
                    metrics[f"{stat}_{key}"]
                    for key in ("cosine", "euclidean", "manhattan", "dot")
                    for stat in ("pearson", "spearman")
                ])

        metrics["pearson_max"] = max(metrics[f"pearson_{key}"] for key in similarities)
        metrics["spearman_max"] = max(metrics[f"spearman_{key}"] for key in similarities)

        return metrics

In [30]:
# Initialize the evaluator with corrected class
evaluator = OpenAIEmbeddingSimilarityEvaluator(
    sentences1=validation_dataset["sentence1"],
    sentences2=validation_dataset["sentence2"],
    scores=validation_dataset["score"],
    main_similarity='COSINE',
    batch_size=16,
    name="similarity-eval",
    show_progress_bar=True,
    write_csv=True,
    precision="float32"
)

# Run the evaluation
metrics = evaluator(output_path=".", epoch=1, steps=100)

# Display the results
print("Evaluation Metrics:", metrics)

[13/Sep/2024 20:59:23] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 20:59:23] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 20:59:23] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 20:59:23] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 20:59:24] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 20:59:24] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 20:59:24] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 20:59:24] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 20:59:24] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 20:59:24] INFO - HTTP Request: POST https://api.ope

In [31]:
# Reload the test split and keep it as a pandas DataFrame.
# NOTE: this rebinds `test_dataset`, replacing the object loaded earlier in the notebook.
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
test_dataset = test_dataset.to_pandas()

In [33]:
# Initialize the evaluator with corrected class
evaluator = OpenAIEmbeddingSimilarityEvaluator(
    sentences1=test_dataset["sentence1"],
    sentences2=test_dataset["sentence2"],
    scores=test_dataset["score"],
    main_similarity='COSINE',
    batch_size=16,
    name="similarity-eval",
    show_progress_bar=True,
    write_csv=True,
    precision="float32"
)

# Run the evaluation
metrics = evaluator(output_path=".", epoch=1, steps=100)

# Display the results
print("Evaluation Metrics:", metrics)

[13/Sep/2024 21:51:55] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 21:51:55] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 21:51:55] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 21:51:55] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 21:51:56] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 21:51:56] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 21:51:56] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 21:51:56] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 21:51:56] INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[13/Sep/2024 21:51:56] INFO - HTTP Request: POST https://api.ope

In [None]:
from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Run the Banking77Classification MTEB task for one named model and write the
# results under results/<model_name>.
# NOTE: this rebinds `model`, replacing the model loaded earlier in the notebook.
model_name = "average_word_embeddings_komninos"
evaluation = MTEB(tasks=["Banking77Classification"])

model = SentenceTransformer(model_name)
results = evaluation.run(model, output_folder="results/" + model_name)