In [None]:
'''
# For models that have a rate limit of 1 per second (like Mistral)
# Update venv\Lib\site-packages\llama_index\core\indices\vector_store\base.py
    def _add_nodes_to_index(
        self,
        index_struct: IndexDict,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **insert_kwargs: Any,
    ) -> None:
        """Add document to index."""
        if not nodes:
            return


        # added --------------------------------------------------------------
        import time
        # --------------------------------------------------------------------
        for nodes_batch in iter_batch(nodes, self._insert_batch_size):
            # added (for Mistral) ------------------------------------------------
            # time.sleep(2)
            # --------------------------------------------------------------------
            # added (for Voyage) -------------------------------------------------
            # time.sleep(60)
            # --------------------------------------------------------------------
''';

In [None]:
import os
import time
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer, losses
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.finetuning import SentenceTransformersFinetuneEngine
from sentence_transformers.evaluation import InformationRetrievalEvaluator

from llama_index.embeddings.openai import OpenAIEmbedding, OpenAIEmbeddingModelType
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.embeddings.voyageai import VoyageEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [None]:
# Local cache directories for downloaded model and tokenizer files.
stransformers_cache_dir = "../models/stransformers"
HF_CACHE_DIR = "../models/hf"
TIKTOKEN_CACHE_DIR = "../models/tiktoken"

# Export the cache locations through environment variables in one step.
os.environ.update({
    "SENTENCE_TRANSFORMERS_HOME": stransformers_cache_dir,
    'HF_HOME': HF_CACHE_DIR,
    "TIKTOKEN_CACHE_DIR": TIKTOKEN_CACHE_DIR,
})

# API keys live outside the notebook in a JSON file; the first element of each
# entry is the one used below.
with open('../reqs/tokens.json', 'r') as file:
    tokens = json.load(file)

GOOGLE_API_KEY = tokens['GOOGLE_API_KEY'][0]
OPENAI_API_KEY = tokens['OPENAI_API_KEY'][0]
COHERE_API_KEY = tokens['COHERE_API_KEY'][0]
MISTRALAI_API_KEY = tokens['MISTRALAI_API_KEY'][0]
VOYAGEAI_API_KEY = tokens['VOYAGEAI_API_KEY'][0]

In [None]:
# Candidate sentence-transformers base models; `model_index` below is 1-based.
# NOTE(review): the third id ends in "v5l" — confirm this is the intended
# model name and not a typo.
models = [
    "Snowflake/snowflake-arctic-embed-m",
    "Snowflake/snowflake-arctic-embed-l",
    "dunzhang/stella_en_400M_v5l",
]

# Run settings shared by the fine-tuning and evaluation cells.
model_index = 1   # which entry of `models` to use (1-based)
start = 3         # first fine-tuning round to run
num_times = 3     # number of fine-tuning rounds
epochs = 2        # epochs per fine-tuning round
# max_epoch = num_times * epochs

# Question/answer datasets used for fine-tuning and for evaluation.
train_dataset = EmbeddingQAFinetuneDataset.from_json("../data/finetune/datasets/train.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("../data/finetune/datasets/val.json")

In [None]:
# Number of corpus entries in the training and validation datasets.
len(train_dataset.corpus), len(val_dataset.corpus)

## Finetune

In [None]:
# # Gradually fine-tune models while increasing number of epochs
# for i in range(start, num_times + 1):
#     model_id = \
#         models[model_index-1] if i == 1 else \
#         f"../models/stransformers/{models[model_index-1].split('/')[-1]}-token-finetuned-{(i-1)*epochs}"

#     # model = SentenceTransformer(embedding_name, trust_remote_code=True)
#     # loss = losses.MultipleNegativesRankingLoss(model)
#     # loss = losses.MatryoshkaLoss(model, loss, [768, 256])  # for m
#     # loss = losses.MatryoshkaLoss(model, loss, [1024, 512])  # for l

#     finetune_engine = SentenceTransformersFinetuneEngine(
#         dataset = train_dataset,
#         model_id = model_id,
#         model_output_path = \
#             f"../models/stransformers/{model_id.split('/')[-1]}-token-finetuned-{i*epochs}" if i == 1 else \
#             f"../models/stransformers/{model_id.split('/')[-1].replace(str((i-1)*epochs), str(i*epochs))}",
#         val_dataset = val_dataset,
#         epochs = epochs,
#         trust_remote_code = True,
#         batch_size = 12 if model_index == 1 else 2,
#         evaluation_steps = 100 if model_index == 1 else 200,
#         # loss = loss,
#     )

#     finetune_engine.finetune()

## Evaluate

In [None]:
def evaluate_for_all_models(
        embed_model, dataset, output_path,
        batch_size, show_progress, sleep, rate_limit,
        top_k=5, local=False,
    ):
    """Index the dataset corpus and check, per query, whether retrieval hits.

    Args:
        embed_model: Embedding model handed to ``VectorStoreIndex``. When
            ``local`` is true it is treated as a model name/path and wrapped
            in ``HuggingFaceEmbedding`` first.
        dataset: Object exposing ``corpus``, ``queries`` and ``relevant_docs``
            mappings.
        output_path: Directory that is created if missing (nothing is written
            to it by this function).
        batch_size: Forwarded as ``insert_batch_size`` to the index.
        show_progress: Forwarded to the index constructor.
        sleep: Seconds to sleep before every query.
        rate_limit: After this many queries an extra 60 second pause is taken
            and the counter restarts.
        top_k: Number of nodes retrieved per query.
        local: Whether ``embed_model`` names a local HuggingFace model.

    Returns:
        A list with one dict per query holding ``is_hit``, ``retrieved``,
        ``expected`` and ``query``.
    """
    Path(output_path).mkdir(parents=True, exist_ok=True)

    queries, relevant_docs = dataset.queries, dataset.relevant_docs
    nodes = [TextNode(id_=doc_id, text=doc_text) for doc_id, doc_text in dataset.corpus.items()]

    if local:
        embed_model = HuggingFaceEmbedding(model_name=embed_model)

    retriever = VectorStoreIndex(
        nodes,
        embed_model=embed_model,
        show_progress=show_progress,
        insert_batch_size=batch_size,
    ).as_retriever(similarity_top_k=top_k)

    eval_results = []
    queries_since_pause = 0
    for query_id, query in tqdm(queries.items()):
        # Throttle: fixed delay per query plus a one-minute pause whenever the
        # per-minute budget is reached.
        time.sleep(sleep)
        queries_since_pause += 1
        if queries_since_pause == rate_limit:
            time.sleep(60)
            queries_since_pause = 0

        retrieved_ids = [hit.node.node_id for hit in retriever.retrieve(query)]
        # Only the first relevant document is considered (assume 1 relevant doc).
        expected_id = relevant_docs[query_id][0]

        eval_results.append({
            "is_hit": expected_id in retrieved_ids,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        })

    return eval_results

In [None]:
def evaluate_for_st_models(model_id, dataset, name, output_path):
    """Run the sentence-transformers information-retrieval evaluator on a model.

    Args:
        model_id: Model name or path passed to ``SentenceTransformer``.
        dataset: Object exposing ``queries``, ``corpus`` and ``relevant_docs``.
        name: Label given to the evaluator.
        output_path: Directory (created if missing) passed to the evaluator as
            its output location.

    Returns:
        Whatever the evaluator call returns for the loaded model.
    """
    Path(output_path).mkdir(parents=True, exist_ok=True)

    ir_evaluator = InformationRetrievalEvaluator(
        dataset.queries, dataset.corpus, dataset.relevant_docs, name=name,
    )
    st_model = SentenceTransformer(model_id)

    return ir_evaluator(st_model, output_path=output_path)

In [None]:
def get_api_based_results(
        company, embedding_name, embedding, dataset, results,
        batch_size, show_progress, sleep, rate_limit, load):
    """Return the per-query hit table for one API-based embedding model.

    The table lives in ``../results/<company>-<embedding_name>/hit_rates.csv``.
    With ``load`` true it is read from that file; otherwise the model is
    evaluated with ``evaluate_for_all_models``, the raw records are stored in
    ``results`` and the table is written to that same file.

    Args:
        company: Provider label used as the directory-name prefix.
        embedding_name: Model name, either a plain string or an ``Enum``
            member whose value is the model name.
        embedding: Embedding model instance to evaluate (unused when loading).
        dataset: Evaluation dataset (unused when loading).
        results: Dict that receives the raw evaluation records.
        batch_size, show_progress, sleep, rate_limit: Forwarded unchanged to
            ``evaluate_for_all_models``.
        load: Read the saved CSV instead of evaluating.

    Returns:
        pandas.DataFrame with one row per query.
    """
    from enum import Enum  # local import keeps this cell self-contained

    # An Enum member may render as "Class.MEMBER" inside an f-string on some
    # Python versions, while slicing it yields the raw value. Resolve the
    # plain model name once so the directory, the written file and the loaded
    # file always agree.
    if isinstance(embedding_name, Enum):
        plain_name = str(embedding_name.value)
    else:
        plain_name = str(embedding_name)

    output_path = f"../results/{company}-{plain_name}"
    csv_path = f"{output_path}/hit_rates.csv"

    if load:
        return pd.read_csv(csv_path)

    records = evaluate_for_all_models(
        embedding, dataset, output_path,
        batch_size, show_progress, sleep, rate_limit,
    )
    # Same key expression as the calling cells use, so the caller's own
    # assignment replaces this entry instead of adding a second one.
    results[f'{company}-{embedding_name}'] = records

    df = pd.DataFrame(records)
    df.to_csv(csv_path, index=False)
    return df

In [None]:
# Per-query hit tables, keyed by model/variant name.
results = dict()
# sentence-transformers evaluator scores, keyed by variant name.
results_st = dict()

### Google's

In [None]:
# Settings for this provider (throttling values only matter when load is False).
load = True            # read the saved hit table instead of calling the API
show_progress = True
batch_size = None
rate_limit = 1000  # per min
sleep = 0

company = 'google'
embedding_name = 'text-embedding-004'
embedding = GeminiEmbedding(
    model_name = f"models/{embedding_name}",
    api_key = GOOGLE_API_KEY,
)

results[f'{company}-{embedding_name}'] = get_api_based_results(
    company=company,
    embedding_name=embedding_name,
    embedding=embedding,
    dataset=val_dataset,
    results=results,
    batch_size=batch_size,
    show_progress=show_progress,
    sleep=sleep,
    rate_limit=rate_limit,
    load=load,
)

embedding

In [None]:
# load = False

# rate_limit = 9  # per min
# sleep = 60
# show_progress = True
# batch_size = 9

# company = 'google'
# embedding_name = 'gemini-embedding-exp-03-07'
# embedding = GeminiEmbedding(
#     api_key = GOOGLE_API_KEY,
#     model_name = f"models/{embedding_name}",
# )

# results[f'{company}-{embedding_name}'] = get_api_based_results(
#     company, embedding_name, embedding,
#     val_dataset, results,
#     batch_size, show_progress, sleep, rate_limit,
#     load,
# )

# embedding

### OpenAI's

In [None]:
# Settings for this provider (throttling values only matter when load is False).
load = True

rate_limit = 50  # per min
sleep = 0
show_progress = True
batch_size = None

company = 'openai'
# Use the enum's plain string value: the name is interpolated into the result
# key and the results directory, and an Enum member can render as
# "OpenAIEmbeddingModelType.TEXT_EMBED_3_LARGE" inside an f-string on some
# Python versions, which would no longer match the `model_size` key
# 'openai-text-embedding-3-large' or the saved results folder.
# embedding_name = OpenAIEmbeddingModelType.TEXT_EMBED_ADA_002.value
embedding_name = OpenAIEmbeddingModelType.TEXT_EMBED_3_LARGE.value
embedding = OpenAIEmbedding(
    api_key = OPENAI_API_KEY,
    model = embedding_name,
)

results[f'{company}-{embedding_name}'] = get_api_based_results(
    company, embedding_name, embedding,
    val_dataset, results,
    batch_size, show_progress, sleep, rate_limit,
    load,
)

embedding

### Cohere's

In [None]:
# Settings for this provider (throttling values only matter when load is False).
load = True            # read the saved hit table instead of calling the API
show_progress = True
batch_size = None
rate_limit = 50  # per min
sleep = 0

company = 'cohere'
embedding_name = "embed-english-v3.0"
embedding = CohereEmbedding(
    model = embedding_name,
    api_key = COHERE_API_KEY,
    input_type="search_query",
)

results[f'{company}-{embedding_name}'] = get_api_based_results(
    company=company,
    embedding_name=embedding_name,
    embedding=embedding,
    dataset=val_dataset,
    results=results,
    batch_size=batch_size,
    show_progress=show_progress,
    sleep=sleep,
    rate_limit=rate_limit,
    load=load,
)

embedding

### Mistral's

In [None]:
# Settings for this provider (throttling values only matter when load is False).
load = True            # read the saved hit table instead of calling the API
show_progress = False
batch_size = 1         # one node per insert batch
rate_limit = 1000  # per min
sleep = 2              # seconds slept before each query

company = 'mistralai'
embedding_name = "mistral-embed"
embedding = MistralAIEmbedding(
    model = embedding_name,
    api_key = MISTRALAI_API_KEY,
    input_type="search_query",
)

results[f'{company}-{embedding_name}'] = get_api_based_results(
    company=company,
    embedding_name=embedding_name,
    embedding=embedding,
    dataset=val_dataset,
    results=results,
    batch_size=batch_size,
    show_progress=show_progress,
    sleep=sleep,
    rate_limit=rate_limit,
    load=load,
)

embedding

### VoyageAI's

In [None]:
# Settings for this provider (throttling values only matter when load is False).
load = True            # read the saved hit table instead of calling the API
show_progress = True
batch_size = 3         # nodes per insert batch
rate_limit = 3  # per min
sleep = 0

company = 'voyageai'
# embedding_name = "voyage-3-large"
embedding_name = "voyage-finance-2"
embedding = VoyageEmbedding(
    model_name = embedding_name,
    voyage_api_key = VOYAGEAI_API_KEY,
)

results[f'{company}-{embedding_name}'] = get_api_based_results(
    company=company,
    embedding_name=embedding_name,
    embedding=embedding,
    dataset=val_dataset,
    results=results,
    batch_size=batch_size,
    show_progress=show_progress,
    sleep=sleep,
    rate_limit=rate_limit,
    load=load,
)

embedding

### Ours

In [None]:
# Sentence transformers comprehensive evaluation
load = True   # read saved hit tables instead of re-running retrieval
st = False    # additionally run the sentence-transformers IR evaluator

rate_limit = 1e10  # per min
sleep = 0
show_progress = False
batch_size = None
local = True


def _load_or_evaluate(key, variant, st_model_id):
    """Fill ``results[key]`` (and optionally ``results_st[key]``) for one model variant.

    ``variant`` is the directory name shared by ``../models/stransformers/`` and
    ``../results/``. ``st_model_id`` is the model name or path given to the
    sentence-transformers evaluator when ``st`` is enabled.
    """
    output_path = f"../results/{variant}"
    csv_path = f"{output_path}/hit_rates.csv"

    if not load:
        embed_model = f"../models/stransformers/{variant}"
        # `local` must be passed by keyword: positionally it would land in the
        # `top_k` parameter and the model path would never be wrapped in a
        # HuggingFace embedding.
        results[key] = pd.DataFrame(evaluate_for_all_models(
            embed_model, val_dataset, output_path,
            batch_size, show_progress, sleep, rate_limit,
            local=local,
        ))
        results[key].to_csv(csv_path, index=False)
    else:
        results[key] = pd.read_csv(csv_path)

    if st:
        results_st[key] = evaluate_for_st_models(st_model_id, val_dataset, key, output_path)


model_base = models[model_index-1]
model_name = model_base.split('/')[-1]

# (result key, variant directory name, model id for the ST evaluator)
variants = [
    # Base: the ST evaluator uses the original model id
    ('base', model_name, model_base),
    # Finetuned: evaluated once (it does not depend on the round index)
    ('finetuned-2', f"{model_name}-finetuned-2",
     f"../models/stransformers/{model_name}-finetuned-2"),
]
# Token finetuned: one variant per fine-tuning round
for i in range(2, num_times + 2):
    token_key = f"token-finetuned-{(i-1)*epochs}"
    variants.append((
        token_key,
        f"{model_name}-{token_key}",
        f"../models/stransformers/{model_name}-{token_key}",
    ))

for key, variant, st_model_id in variants:
    # Best effort: a variant whose model or saved results are missing is
    # reported and skipped so the remaining variants are still processed.
    try:
        _load_or_evaluate(key, variant, st_model_id)
    except Exception as e:
        print(f"Skipping '{key}': {e}")

### Comparison

#### Hit Rate @ 5

In [None]:
# Number of parameters, and Embedding dimension
# Each row is (result key, parameter count, embedding dimension); both values
# are kept as strings because they are only used for display.
# NOTE(review): 'X' looks like a placeholder for an unknown parameter count — confirm.
model_size = {
    result_key: [n_params, embed_dim]
    for result_key, n_params, embed_dim in [
        ('google-text-embedding-004', 'X', '768'),
        ('google-gemini-embedding-exp-03-07', 'X', '3072'),
        ('openai-text-embedding-3-large', 'X', '3072'),
        ('openai-text-embedding-3-small', 'X', '1536'),  # better than ada
        ('openai-text-embedding-ada-002', 'X', '1536'),
        ('cohere-embed-english-v3.0', 'X', '1024'),
        ('mistralai-mistral-embed', 'X', '1024'),
        ('voyageai-voyage-finance-2', 'X', '1024'),
        ('base', '305M', '768'),
        ('finetuned-2', '305M', '768'),
        ('token-finetuned-2', '305M+', '768'),
        ('token-finetuned-4', '305M+', '768'),
        ('token-finetuned-6', '305M+', '768'),
    ]
}

In [None]:
# Print the hit rate of every evaluated model; size information is appended
# only for models that have an entry in `model_size`.
print('Hit Rate @ 5 Comparison:')
for key, df in results.items():
    # Entries may be raw record lists rather than DataFrames.
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(df)

    hit_rate = round(df["is_hit"].mean()*100, 1)
    # The local base model is shown under its real name.
    label = (models[model_index-1].split('/')[-1] + ' (base)') if key == 'base' else key

    line = f' {label:31}: {hit_rate:4}%'
    # An explicit lookup replaces the former bare `except:`, which also hid
    # unrelated errors (and KeyboardInterrupt).
    size = model_size.get(key)
    if size is not None:
        line += f'  (with model size of {size[0]:5} and embedding size of {size[1]:4})'
    print(line)

#### MAP

In [None]:
# Print each variant's evaluator score together with its relative change
# against the 'base' entry.
for key, score in results_st.items():
    baseline = results_st['base']
    improvement = round((score - baseline) / baseline * 100)
    print(f'{key}: {round(score*100, 1)}% (with {improvement}% improvement)')

#### Plot

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

colors = px.colors.sequential.Turbo
# colors = px.colors.qualitative.G10

# Hard-coded retrieval metrics (in percent) for the base and fine-tuned model.
df = pd.DataFrame({
    "Metric": ["MRR@10", "MAP@100", "NDCG@10"],
    "Base": [38, 39, 43],
    "Finetuned": [84, 84, 86]
})


def _outlined_bar(column, outline_color):
    """Build the bar trace for ``column``: transparent fill with a thick coloured outline."""
    return go.Bar(
        name=column,
        x=df['Metric'],
        y=df[column],
        marker=dict(
            color='rgba(255, 255, 255, 0)',  # Transparent fill color
            line=dict(color=outline_color, width=10)  # Coloured border around bars
        ),
        text=[str(i)+'%' for i in df[column].values],
        textposition='auto',
    )


fig = go.Figure(
    data=[
        _outlined_bar('Base', colors[-3]),
        _outlined_bar('Finetuned', colors[1]),
    ],
    layout=dict(
        barcornerradius=15,
    ),
)

fig.update_layout(
    height=600, width=1400,
    # title='Performance Comparison',
    xaxis_title='Metric',
    template="plotly_white", # use white background
    yaxis=dict(range=[-5, 100]),
    font=dict(size=18),
    barmode='group',
    bargap=0.2, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1 # gap between bars of the same location coordinate.
)

fig.show()
fig.write_image('../results/perf.png', scale=2)

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

# Hard-coded benchmark table: one entry per model.
data = {
    'Model': ['Google Text-Embedding-004', 'Cohere Embed-English-v3.0', 'OpenAI Text-Embedding-3-Large', 'MistralAI Mistral-Embed', 'Voyage AI Voyage-Finance-2', 'Ours'],
    'HR@5 [%]': [84, 85, 86, 87, 88, 88],
    'Improvement [%]': [5, 4, 2, 1, 0, None],
    'Embedding Size': [768, 1024, 3072, 1024, 1024, 768]
}
df = pd.DataFrame(data)
# The missing improvement value is shown as 0 so it can be mapped to a colour.
df['Improvement [%]'] = df['Improvement [%]'].fillna(0)

# Marker styling: bubble diameter follows the embedding size, colour follows
# the improvement value.
bubble_marker = dict(
    symbol="octagon-open-dot",
    size=df['Embedding Size'] / 15,  # Scale the size
    sizemode='diameter',
    color=df['Improvement [%]'],  # Color based on Improvement
    # colorscale='turbo',
    colorscale=px.colors.sequential.Turbo[1:4] + px.colors.sequential.Turbo[-4:-1],
    colorbar=dict(title='Imp.'),  # Add a colorbar
    opacity=1,
    line_width=10,
)

# Create a bubble chart with color based on Improvement
bubble_trace = go.Scatter(
    x=df['Model'],
    y=df['HR@5 [%]'],
    mode='markers',
    marker=bubble_marker,
    text=df['Model']
)
fig_bubble_color = go.Figure(data=bubble_trace)

fig_bubble_color.update_layout(
    height=600, width=1400,
    # title='Performance Benchmarking',
    # xaxis_title='Model',
    yaxis_title='HR@5 [%]',
    template="plotly_white", # use white background
    yaxis=dict(range=[79, 90]), # set the y-axis range
    font=dict(size=18) # Increase font size
)

fig_bubble_color.show()
fig_bubble_color.write_image('../results/bench.png', scale=2)