### Snowflake Artic Embed 2.0 Experiments

In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
from typing import List, Dict, Any, Tuple
import torch
from sentence_transformers import SentenceTransformer

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
def embed_query_docs(docs: List[str], model: SentenceTransformer, is_query: bool = False) -> List[List[float]]:
    """
    Embeds a list of documents using a SentenceTransformer model
    Args:
    docs: List of documents to embed
    model: SentenceTransformer model to use
    is_query: If True, the input is a query, otherwise it is a document
    Returns:
    Tuple of a list of document IDs and a list of embeddings
    """
    prompt_name = "query"
    if is_query:
        embeddings = model.encode(docs, prompt_name=prompt_name)
    else:
        embeddings = model.encode(docs)
    return embeddings

In [5]:
def find_similarity(
    query_embeddings: List[float], document_embeddings: List[List[float]],
    model: SentenceTransformer
) -> List[float]:
    """
    Finds the cosine similarity between a query and a list of documents
    Args:
    query: List of embeddings for the query
    docs: List of embeddings for the documents
    Returns:
    List of cosine similarities
    """
    return model.similarity(query_embeddings, document_embeddings)

In [6]:
model_name = "Snowflake/snowflake-arctic-embed-xs"
model = SentenceTransformer(model_name, device=device, trust_remote_code=True)

##### Note
The documentes for test are generated using Google Gemini.
Prompt: Prepare one English query and two Englosh and French documents set for IR evaluation.

In [7]:
query = ["What are the benefits of using renewable energy sources?"]

french_docs = [
    "Les énergies renouvelables offrent de nombreux avantages, notamment la réduction des émissions de gaz à effet de serre et la diminution de la dépendance aux combustibles fossiles.",
    "L'utilisation de sources d'énergie renouvelables contribue à la préservation de l'environnement et à la lutte contre le changement climatique.",
    "Investir dans les énergies renouvelables est un choix judicieux pour l'avenir de la planète.",
]

# English Documents
english_docs = [
    "Renewable energy sources offer numerous benefits, including reduced greenhouse gas emissions and decreased reliance on fossil fuels.",
    "Using renewable energy sources helps protect the environment and combat climate change.",
    "Investing in renewable energy is a smart choice for the future of the planet.",
]


# Embed query and documents
query_embedding = embed_query_docs(query, model, is_query=True)
french_doc_embeddings = embed_query_docs(french_docs, model)
english_doc_embeddings = embed_query_docs(english_docs, model)


# Calculate similarities
french_similarities = find_similarity(query_embedding, french_doc_embeddings, model)
english_similarities = find_similarity(query_embedding, english_doc_embeddings, model)

# Example output
print("Query:", query)
print("\nFrench Documents and Similarities:")
for idx, sim in enumerate(french_similarities):
    print(f"Document {idx+1}: {french_docs[idx]}")
    print(f"Similarity Score: {sim}")

print("\nEnglish Documents and Similarities:")
for idx, sim in enumerate(english_similarities):
    print(f"Document {idx+1}: {english_docs[idx]}")
    print(f"Similarity Score: {sim}")

Query: ['What are the benefits of using renewable energy sources?']

French Documents and Similarities:
Document 1: Les énergies renouvelables offrent de nombreux avantages, notamment la réduction des émissions de gaz à effet de serre et la diminution de la dépendance aux combustibles fossiles.
Similarity Score: tensor([0.5670, 0.5841, 0.5591])

English Documents and Similarities:
Document 1: Renewable energy sources offer numerous benefits, including reduced greenhouse gas emissions and decreased reliance on fossil fuels.
Similarity Score: tensor([0.8008, 0.7642, 0.6709])


#### TODO
Try using `scores = query_embeddings @ document_embeddings.T`