<a href="https://colab.research.google.com/github/joaowinderfeldbussolotto/assistente-ppc-ciencia-da-computacao/blob/main/embeddings_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata

class Settings:
  HF_TOKEN          = userdata.get('HF_TOKEN')
  PINECONE_API_KEY  = userdata.get('PINECONE_API_KEY')
  GROQ_API_KEY      = userdata.get('GROQ_API_KEY')
  MISTRAL_AI_KEY    = userdata.get('MISTRAL_AI_KEY')
  GROQ_API_KEY2     = userdata.get('GROQ_API_KEY2')
  MISTRAL_AI_KEY2   = userdata.get('MISTRAL_AI_KEY2')


settings = Settings()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install -qU langchain_community langchain_core langchain_openai langchain_huggingface langchain_mistralai faiss-cpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.8/64.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.8/50.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m408.7/408.7 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
from langchain_core.load import dumpd, dumps, load, loads
import json
import time
import hashlib
import os


def save_document_locally(documents, version):

  output_file_name = f'ppc{version}_documents.json'
  json_string = dumps(documents, pretty=True, ensure_ascii=False)
  with open(output_file_name, "w", encoding="utf-8") as file:
      file.write(json_string)

def load_document_locally(version):
  output_file_name = f'ppc{version}_documents.json'
  with open(output_file_name, "r") as fp:
      doc = json.load(fp)
  return load(doc)


def save_dict_to_json_on_drive(documents, drive_folder_path, drive_file_name):

    os.makedirs(drive_folder_path, exist_ok=True)
    timestamp = str(int(time.time()))
    hash_suffix = hashlib.md5(timestamp.encode()).hexdigest()[:6]  # Gera um hash curto com 6 caracteres

    drive_file_name = f'{hash_suffix}_{drive_file_name}'
    drive_path = f'{drive_folder_path}/{os.path.basename(drive_file_name)}'

    json_string = dumpd(documents)
    print(json_string)
    with open(drive_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_string, json_file, ensure_ascii=False)
    print(f'File saved to: {drive_path}')
    return drive_path

In [6]:
import json

def load_json_from_drive(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            return data
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {file_path}")
        return None


paths = ['/content/drive/MyDrive/tcc/preprocessing/ft/538ac8_train_dataset.json',
 '/content/drive/MyDrive/tcc/preprocessing/ft/538ac8_val_dataset.json',
 '/content/drive/MyDrive/tcc/preprocessing/ft/538ac8_test_dataset.json']

loaded_data = []
for path in paths:
  loaded_data.append(load_json_from_drive(path))

train_dataset = loaded_data[0]
val_dataset = loaded_data[1]
test_dataset = loaded_data[2]

In [7]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

In [8]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from typing import List, Dict
from sklearn.metrics import ndcg_score
from collections import defaultdict
import matplotlib.pyplot as plt

from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

def calculate_metrics(eval_results: List[Dict], k: int, dataset) -> dict:
    """Calculate various retrieval metrics for top k results"""
    metrics = {}

    # Hit Rate (HR@k)
    hits = sum(1 for result in eval_results
              if result["expected_id"] in result["retrieved_ids"][:k])
    metrics["hit_rate"] = (hits / len(eval_results)) * 100

    # Mean Reciprocal Rank (MRR@k)
    mrr_sum = 0
    for result in eval_results:
        try:
            rank = result["retrieved_ids"][:k].index(result["expected_id"]) + 1
            mrr_sum += 1/rank
        except ValueError:
            mrr_sum += 0
    metrics["mrr"] = (mrr_sum / len(eval_results)) * 100

    # Mean Average Precision (MAP@k)
    ap_sum = 0
    for result in eval_results:
        precision_sum = 0
        num_relevant = 0
        for i, doc_id in enumerate(result["retrieved_ids"][:k], 1):
            if doc_id in dataset['relevant_contexts'][result["id"]]:
                num_relevant += 1
                precision_sum += num_relevant / i
        if len(dataset['relevant_contexts'][result["id"]]) > 0:
            ap_sum += precision_sum / len(dataset['relevant_contexts'][result["id"]])
    metrics["map"] = (ap_sum / len(eval_results)) * 100

    # Normalized Discounted Cumulative Gain (NDCG@k)
    ndcg_sum = 0
    for result in eval_results:
        relevance = np.zeros(k)
        for i, doc_id in enumerate(result["retrieved_ids"][:k]):
            if doc_id in dataset['relevant_contexts'][result["id"]]:
                relevance[i] = 1
        ideal_relevance = np.sort(relevance)[::-1]
        if np.sum(ideal_relevance) > 0:  # Only if there are relevant documents
            ndcg_sum += ndcg_score([ideal_relevance], [relevance])
    metrics["ndcg"] = (ndcg_sum / len(eval_results)) * 100

    return metrics

def evaluate_embeddings(
    dataset,
    models_to_evaluate,
    top_k_values=[3, 5, 10],
    verbose=True
):
    # Define models to evaluate based on the list passed as a parameter
    models = {
        model_name: HuggingFaceEmbeddings(model_name=model_name)
        for model_name in models_to_evaluate
    }

    # Prepare results storage
    results = []

    # Convert corpus to documents
    documents = [
        Document(page_content=content, metadata={"id": doc_id})
        for doc_id, content in dataset['corpus'].items()
    ]

    # Evaluate each model
    for model_name, embed_model in tqdm(models.items(), desc="Evaluating models"):
        if verbose:
            print(f"\nEvaluating {model_name}")

        try:
            # Create vector store
            vectorstore = FAISS.from_documents(documents, embed_model)

            # Evaluate for max k
            max_k = max(top_k_values)
            retriever = vectorstore.as_retriever(search_kwargs={"k": max_k})

            eval_results = []
            for id, question in tqdm(dataset['questions'].items(),
                                   desc="Processing questions",
                                   leave=False):
                retrieved_nodes = retriever.invoke(question)
                retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
                expected_id = dataset['relevant_contexts'][id][0]
                is_hit = expected_id in retrieved_ids
                eval_results.append({
                    "id": id,
                    "question": question,
                    "expected_id": expected_id,
                    "is_hit": is_hit,
                    "retrieved_ids": retrieved_ids
                })

            # Calculate metrics for different k values
            model_metrics = {
                "model_name": model_name,
                "status": "success"
            }

            for k in top_k_values:
                # Adjust results for current k
                k_results = []
                for result in eval_results:
                    k_result = result.copy()
                    k_result["retrieved_ids"] = result["retrieved_ids"][:k]
                    k_results.append(k_result)

                metrics = calculate_metrics(k_results, k, dataset)

                # Add metrics with k suffix
                for metric_name, value in metrics.items():
                    model_metrics[f"{metric_name}@{k}"] = value

            results.append(model_metrics)

        except Exception as e:
            results.append({
                "model_name": model_name,
                "status": f"error: {str(e)}",
                **{f"{metric}@{k}": np.nan
                   for k in top_k_values
                   for metric in ["hit_rate", "mrr", "map", "ndcg"]}
            })
            if verbose:
                print(f"Error evaluating {model_name}: {str(e)}")

    # Create DataFrame with results
    df_results = pd.DataFrame(results)

    # Reorder columns to group by metric instead of k
    base_columns = ['model_name', 'status']
    metric_names = ['mrr', 'hit_rate', 'map', 'ndcg']

    # Create ordered list of metric columns
    metric_columns = []
    for metric in metric_names:
        metric_columns.extend([f"{metric}@{k}" for k in top_k_values])

    # Reorder columns
    ordered_columns = base_columns + metric_columns

    # Return reordered DataFrame
    df_results = df_results[ordered_columns]

    # Sort by MRR@5 (or adjust as needed)
    df_results = df_results.sort_values(by="hit_rate@5", ascending=False)

    return df_results

def format_results(df):
    """Format results DataFrame for better visualization"""
    # Create a copy to avoid modifying the original
    formatted_df = df.copy()

    # Round all numeric columns to 2 decimal places
    numeric_columns = formatted_df.select_dtypes(include=[np.number]).columns
    formatted_df[numeric_columns] = formatted_df[numeric_columns].round(2)

    # Add % symbol to metric values
    for col in numeric_columns:
        formatted_df[col] = formatted_df[col].apply(lambda x: f"{x}%" if pd.notnull(x) else x)

    return formatted_df

def display_metrics_summary(df, k_values=[3, 5, 10]):
    """Display a summary of the best model for each metric and k value"""
    metrics = ['mrr', 'hit_rate', 'map', 'ndcg']
    summary = []

    for metric in metrics:
        for k in k_values:
            col = f"{metric}@{k}"
            best_model = df.loc[df[col].astype(float).idxmax()]
            summary.append({
                'Metric': col,
                'Best Model': best_model['model_name'],
                'Score': best_model[col]
            })

    return pd.DataFrame(summary)

def plot_metric_progression(df_results, metric_name='mrr', top_n_models=5):
    """Plot progression of a metric across different k values for top N models"""
    # Get top N models based on metric@5
    top_models = df_results.nlargest(top_n_models, f'{metric_name}@5')['model_name']

    plt.figure(figsize=(10, 6))
    k_values = [3, 5, 10]

    for model in top_models:
        values = [float(str(df_results[df_results['model_name'] == model][f'{metric_name}@{k}'].iloc[0]).rstrip('%'))
                 for k in k_values]
        plt.plot(k_values, values, marker='o', label=model)

    plt.xlabel('k')
    plt.ylabel(f'{metric_name} Score (%)')
    plt.title(f'{metric_name.upper()} Progression for Top {top_n_models} Models')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.grid(True)
    plt.show()

def plot_metrics_comparison(df_results, k=5):
    """Plot comparison of metrics for different models"""
    metrics = [f"hit_rate@{k}", f"mrr@{k}", f"map@{k}", f"ndcg@{k}"]

    plt.figure(figsize=(12, 6))
    x = np.arange(len(df_results))
    width = 0.2

    for i, metric in enumerate(metrics):
        values = [float(str(val).rstrip('%')) for val in df_results[metric]]
        plt.bar(x + i*width, values, width, label=metric.split('@')[0])

    plt.xlabel('Models')
    plt.ylabel('Score (%)')
    plt.title(f'Comparison of Metrics @{k} Across Models')
    plt.xticks(x + width*1.5, df_results['model_name'], rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
results_df = evaluate_embeddings(test_dataset)
results_df['status']



Evaluating models:   0%|          | 0/1 [00:00<?, ?it/s]


Evaluating distiluse-base-multilingual-cased-v1


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]

Unnamed: 0,status
0,success


In [None]:
models_to_evaluate = [
    "sentence-transformers/all-distilroberta-v1",
    "sentence-transformers/all-MiniLM-L12-v2",
    "sentence-transformers/multi-qa-distilbert-cos-v1",
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    "sentence-transformers/paraphrase-albert-small-v2",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "sentence-transformers/paraphrase-MiniLM-L3-v2",
    "sentence-transformers/distiluse-base-multilingual-cased-v1",
    "sentence-transformers/distiluse-base-multilingual-cased-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/multi-qa-mpnet-base-dot-v1",
    "intfloat/multilingual-e5-large-instruct",
    "rufimelo/Legal-BERTimbau-sts-large",
    "mixedbread-ai/mxbai-embed-large-v1",
    "hkunlp/instructor-xl",
    "BAAI/bge-small-en-v1.5",
    "winderfeld/cc-uffs-ppc-ft-test-multiqa"
]

# Chamada da função
df_results = evaluate_embeddings(test_dataset, models_to_evaluate)


model.safetensors:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.69k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.80k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/542 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/678k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.


1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/114k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Evaluating models:   0%|          | 0/18 [00:00<?, ?it/s]


Evaluating sentence-transformers/all-distilroberta-v1


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/all-MiniLM-L12-v2


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/multi-qa-distilbert-cos-v1


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/all-MiniLM-L6-v2


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/multi-qa-MiniLM-L6-cos-v1


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/paraphrase-multilingual-mpnet-base-v2


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/paraphrase-albert-small-v2


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/paraphrase-MiniLM-L3-v2


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/distiluse-base-multilingual-cased-v1


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/distiluse-base-multilingual-cased-v2


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/all-mpnet-base-v2


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating sentence-transformers/multi-qa-mpnet-base-dot-v1


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating intfloat/multilingual-e5-large-instruct


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating rufimelo/Legal-BERTimbau-sts-large


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating mixedbread-ai/mxbai-embed-large-v1


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating hkunlp/instructor-xl


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]


Evaluating BAAI/bge-small-en-v1.5


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]

In [None]:
save_dict_to_json_on_drive(df_results.to_json(orient='records', indent=4, force_ascii=False), '/content/drive/MyDrive/tcc/embeddings', 'embedding_eval_results_test_dataset.json')

[
    {
        "model_name":"hkunlp\/instructor-xl",
        "status":"success",
        "mrr@3":56.3211382114,
        "mrr@5":57.3577235772,
        "mrr@10":58.2114789005,
        "hit_rate@3":65.3658536585,
        "hit_rate@5":70.0,
        "hit_rate@10":76.2195121951,
        "map@3":56.3211382114,
        "map@5":57.3577235772,
        "map@10":58.2114789005,
        "ndcg@3":58.2649138402,
        "ndcg@5":59.2419197697,
        "ndcg@10":59.7318933929
    },
    {
        "model_name":"intfloat\/multilingual-e5-large-instruct",
        "status":"success",
        "mrr@3":49.8780487805,
        "mrr@5":51.506097561,
        "mrr@10":52.8482384824,
        "hit_rate@3":60.8536585366,
        "hit_rate@5":67.8048780488,
        "hit_rate@10":78.1707317073,
        "map@3":49.8780487805,
        "map@5":51.506097561,
        "map@10":52.8482384824,
        "ndcg@3":52.1629560723,
        "ndcg@5":53.9819958924,
        "ndcg@10":55.5464655482
    },
    {
        "model_name":"mi

'/content/drive/MyDrive/tcc/embeddings/38a3dc_embedding_eval_results_test_dataset.json'

In [None]:
results_json = load_json_from_drive('/content/drive/MyDrive/tcc/embeddings/38a3dc_embedding_eval_results_test_dataset.json')

In [None]:
import pandas as pd

results_json_df = pd.DataFrame(json.loads(results_json))
results_json_df

Unnamed: 0,model_name,status,mrr@3,mrr@5,mrr@10,hit_rate@3,hit_rate@5,hit_rate@10,map@3,map@5,map@10,ndcg@3,ndcg@5,ndcg@10
0,hkunlp/instructor-xl,success,56.321138,57.357724,58.211479,65.365854,70.0,76.219512,56.321138,57.357724,58.211479,58.264914,59.24192,59.731893
1,intfloat/multilingual-e5-large-instruct,success,49.878049,51.506098,52.848238,60.853659,67.804878,78.170732,49.878049,51.506098,52.848238,52.162956,53.981996,55.546466
2,mixedbread-ai/mxbai-embed-large-v1,success,47.357724,49.132114,50.277052,55.609756,63.414634,71.95122,47.357724,49.132114,50.277052,49.091729,51.718349,52.949793
3,sentence-transformers/multi-qa-mpnet-base-dot-v1,success,45.50813,46.800813,48.139373,53.780488,59.512195,69.146341,45.50813,46.800813,48.139373,47.209469,48.816662,50.662464
4,BAAI/bge-small-en-v1.5,success,42.174797,43.583333,44.727642,50.243902,56.585366,65.121951,42.174797,43.583333,44.727642,43.990836,45.95238,47.377429
5,sentence-transformers/all-MiniLM-L12-v2,success,35.650407,37.412602,38.419425,43.780488,51.463415,58.902439,35.650407,37.412602,38.419425,37.580413,40.204959,41.083981
6,sentence-transformers/all-MiniLM-L6-v2,success,36.056911,37.386179,38.508759,42.926829,48.902439,57.560976,36.056911,37.386179,38.508759,37.521636,39.457845,41.147292
7,sentence-transformers/multi-qa-distilbert-cos-v1,success,33.536585,35.134146,36.163908,41.341463,48.414634,55.853659,33.536585,35.134146,36.163908,35.194381,37.53146,38.478814
8,sentence-transformers/multi-qa-MiniLM-L6-cos-v1,success,33.373984,34.977642,36.128097,40.609756,47.682927,56.341463,33.373984,34.977642,36.128097,35.045587,37.487769,39.040554
9,sentence-transformers/distiluse-base-multiling...,success,33.963415,35.195122,36.263018,40.365854,45.853659,53.902439,33.963415,35.195122,36.263018,35.437589,37.222176,38.819595


In [None]:

import pandas as pd
import json

selected_columns = ['model_name', 'mrr@5', 'hit_rate@5', 'map@5', 'ndcg@5']
top_5_results = results_json_df[selected_columns].head(5)

# Display or further process the top_5_results DataFrame
top_5_results

Unnamed: 0,model_name,mrr@5,hit_rate@5,map@5,ndcg@5
0,hkunlp/instructor-xl,57.357724,70.0,57.357724,59.24192
1,intfloat/multilingual-e5-large-instruct,51.506098,67.804878,51.506098,53.981996
2,mixedbread-ai/mxbai-embed-large-v1,49.132114,63.414634,49.132114,51.718349
3,sentence-transformers/multi-qa-mpnet-base-dot-v1,46.800813,59.512195,46.800813,48.816662
4,BAAI/bge-small-en-v1.5,43.583333,56.585366,43.583333,45.95238


In [None]:
top_5_results.to_json()

'{"model_name":{"0":"hkunlp\\/instructor-xl","1":"intfloat\\/multilingual-e5-large-instruct","2":"mixedbread-ai\\/mxbai-embed-large-v1","3":"sentence-transformers\\/multi-qa-mpnet-base-dot-v1","4":"BAAI\\/bge-small-en-v1.5"},"mrr@5":{"0":57.3577235772,"1":51.506097561,"2":49.1321138211,"3":46.8008130081,"4":43.5833333333},"hit_rate@5":{"0":70.0,"1":67.8048780488,"2":63.4146341463,"3":59.512195122,"4":56.5853658537},"map@5":{"0":57.3577235772,"1":51.506097561,"2":49.1321138211,"3":46.8008130081,"4":43.5833333333},"ndcg@5":{"0":59.2419197697,"1":53.9819958924,"2":51.7183492448,"3":48.8166618698,"4":45.9523795796}}'

In [None]:
models_to_evaluate = [
    "winderfeld/cc-uffs-ppc-ft-test-multiqa",
]

df_results_finetuned = evaluate_embeddings(test_dataset, models_to_evaluate)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/202 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Evaluating models:   0%|          | 0/1 [00:00<?, ?it/s]


Evaluating winderfeld/cc-uffs-ppc-ft-test-multiqa


Processing questions:   0%|          | 0/820 [00:00<?, ?it/s]

In [None]:
df_results_finetuned


Unnamed: 0,model_name,status,mrr@3,mrr@5,mrr@10,hit_rate@3,hit_rate@5,hit_rate@10,map@3,map@5,map@10,ndcg@3,ndcg@5,ndcg@10
0,winderfeld/cc-uffs-ppc-ft-test-multiqa,success,72.276423,73.349593,74.128968,81.463415,86.097561,91.829268,72.276423,73.349593,74.128968,74.203499,75.15184,75.415585


In [None]:
embeddings_results = pd.concat([results_json_df, df_results_finetuned], axis=0, ignore_index=True)

In [None]:
embeddings_results

Unnamed: 0,model_name,status,mrr@3,mrr@5,mrr@10,hit_rate@3,hit_rate@5,hit_rate@10,map@3,map@5,map@10,ndcg@3,ndcg@5,ndcg@10
0,hkunlp/instructor-xl,success,56.321138,57.357724,58.211479,65.365854,70.0,76.219512,56.321138,57.357724,58.211479,58.264914,59.24192,59.731893
1,intfloat/multilingual-e5-large-instruct,success,49.878049,51.506098,52.848238,60.853659,67.804878,78.170732,49.878049,51.506098,52.848238,52.162956,53.981996,55.546466
2,mixedbread-ai/mxbai-embed-large-v1,success,47.357724,49.132114,50.277052,55.609756,63.414634,71.95122,47.357724,49.132114,50.277052,49.091729,51.718349,52.949793
3,sentence-transformers/multi-qa-mpnet-base-dot-v1,success,45.50813,46.800813,48.139373,53.780488,59.512195,69.146341,45.50813,46.800813,48.139373,47.209469,48.816662,50.662464
4,BAAI/bge-small-en-v1.5,success,42.174797,43.583333,44.727642,50.243902,56.585366,65.121951,42.174797,43.583333,44.727642,43.990836,45.95238,47.377429
5,sentence-transformers/all-MiniLM-L12-v2,success,35.650407,37.412602,38.419425,43.780488,51.463415,58.902439,35.650407,37.412602,38.419425,37.580413,40.204959,41.083981
6,sentence-transformers/all-MiniLM-L6-v2,success,36.056911,37.386179,38.508759,42.926829,48.902439,57.560976,36.056911,37.386179,38.508759,37.521636,39.457845,41.147292
7,sentence-transformers/multi-qa-distilbert-cos-v1,success,33.536585,35.134146,36.163908,41.341463,48.414634,55.853659,33.536585,35.134146,36.163908,35.194381,37.53146,38.478814
8,sentence-transformers/multi-qa-MiniLM-L6-cos-v1,success,33.373984,34.977642,36.128097,40.609756,47.682927,56.341463,33.373984,34.977642,36.128097,35.045587,37.487769,39.040554
9,sentence-transformers/distiluse-base-multiling...,success,33.963415,35.195122,36.263018,40.365854,45.853659,53.902439,33.963415,35.195122,36.263018,35.437589,37.222176,38.819595


In [None]:
# prompt: sort by hit_rate@5, no comments. One liner, reset index

embeddings_results.sort_values(by="hit_rate@5", ascending=False).reset_index(drop=True)


Unnamed: 0,model_name,status,mrr@3,mrr@5,mrr@10,hit_rate@3,hit_rate@5,hit_rate@10,map@3,map@5,map@10,ndcg@3,ndcg@5,ndcg@10
0,winderfeld/cc-uffs-ppc-ft-test-multiqa,success,72.276423,73.349593,74.128968,81.463415,86.097561,91.829268,72.276423,73.349593,74.128968,74.203499,75.15184,75.415585
1,hkunlp/instructor-xl,success,56.321138,57.357724,58.211479,65.365854,70.0,76.219512,56.321138,57.357724,58.211479,58.264914,59.24192,59.731893
2,intfloat/multilingual-e5-large-instruct,success,49.878049,51.506098,52.848238,60.853659,67.804878,78.170732,49.878049,51.506098,52.848238,52.162956,53.981996,55.546466
3,mixedbread-ai/mxbai-embed-large-v1,success,47.357724,49.132114,50.277052,55.609756,63.414634,71.95122,47.357724,49.132114,50.277052,49.091729,51.718349,52.949793
4,sentence-transformers/multi-qa-mpnet-base-dot-v1,success,45.50813,46.800813,48.139373,53.780488,59.512195,69.146341,45.50813,46.800813,48.139373,47.209469,48.816662,50.662464
5,BAAI/bge-small-en-v1.5,success,42.174797,43.583333,44.727642,50.243902,56.585366,65.121951,42.174797,43.583333,44.727642,43.990836,45.95238,47.377429
6,sentence-transformers/all-MiniLM-L12-v2,success,35.650407,37.412602,38.419425,43.780488,51.463415,58.902439,35.650407,37.412602,38.419425,37.580413,40.204959,41.083981
7,sentence-transformers/all-MiniLM-L6-v2,success,36.056911,37.386179,38.508759,42.926829,48.902439,57.560976,36.056911,37.386179,38.508759,37.521636,39.457845,41.147292
8,sentence-transformers/multi-qa-distilbert-cos-v1,success,33.536585,35.134146,36.163908,41.341463,48.414634,55.853659,33.536585,35.134146,36.163908,35.194381,37.53146,38.478814
9,sentence-transformers/multi-qa-MiniLM-L6-cos-v1,success,33.373984,34.977642,36.128097,40.609756,47.682927,56.341463,33.373984,34.977642,36.128097,35.045587,37.487769,39.040554
