In [None]:
import json
import logging
import os
import time

import pandas as pd
import torch
import voyageai
from tqdm import tqdm

In [None]:
# Dataset names processed by this notebook; each name is substituted into the
# input CSV paths and the output JSON paths.
tabular_retrieval = [
    'TATQA',
    'FinQA',
    'ConvFinQA',
    'MultiHiertt',
]

In [None]:
# Do not hardcode the API key in the notebook. With no explicit key the
# Voyage client looks it up itself (e.g. from the VOYAGE_API_KEY environment
# variable), so export that variable before starting the kernel.
vo = voyageai.Client()

In [None]:
## Embedding
model_name = "voyage-3"
batch_size = 32
delay = 0.1  # seconds to pause after every request

# Create the output directory up front so that a missing folder cannot make
# the final write fail after all the API calls have already been made.
output_dir = f"./{model_name}/embed"
os.makedirs(output_dir, exist_ok=True)

for task in tabular_retrieval:
    for attr in ["corpus", "queries"]:
        # The converted variant is embedded for the corpus only.
        versions = ["convert", "original"] if attr == "corpus" else ["original"]
        # Constant for the whole attr, so computed once instead of per batch.
        emb_type = "query" if attr == "queries" else "document"

        for version in versions:
            is_convert = version == "convert"
            file_path = f"./data/{task}_{attr}_convert.csv" if is_convert else f"./data/{task}_{attr}.csv"
            text_col = "convert_text" if is_convert else "text"
            output_filename = f"{task}_{attr}.json" if version == "original" else f"{task}_{attr}_convert.json"

            # Rows without "text" are dropped as before; for the converted
            # variant also drop rows whose "convert_text" is missing, because
            # that is the column actually sent to the API and a NaN in it
            # would make the whole batch fail.
            required_cols = ["text", "convert_text"] if is_convert else ["text"]
            data = pd.read_csv(file_path)
            data = data.dropna(subset=required_cols).reset_index(drop=True)

            embeddings = {}
            failed_batches = 0
            for i in tqdm(range(0, len(data), batch_size), desc=f"{attr} - {version} Batches", leave=False):
                batch = data.iloc[i:i + batch_size]
                batch_ids = batch["_id"].tolist()
                batch_texts = batch[text_col].tolist()

                try:
                    result = vo.embed(batch_texts, model=model_name, input_type=emb_type).embeddings
                    embeddings.update(zip(batch_ids, result))
                except Exception as e:
                    # Best effort: keep going, but remember that the output is incomplete.
                    failed_batches += 1
                    logging.error(f"Error embedding batch starting at index {i} for task '{task}' - {e}")

                # Sleep to respect rate limits
                time.sleep(delay)

            if failed_batches:
                # Make the gap visible instead of silently writing a partial file.
                logging.warning(
                    f"{failed_batches} batch(es) failed for task '{task}' ({attr}, {version}); "
                    f"{output_filename} will be missing their embeddings"
                )

            # save
            output_path = f"{output_dir}/{output_filename}"
            with open(output_path, "w") as f:
                json.dump(embeddings, f)

In [None]:
## Calculate Score

@torch.no_grad()
def cos_sim(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Return the pairwise cosine similarity between the rows of ``a`` and ``b``.

    Each row is L2-normalised and the two normalised matrices are multiplied,
    so entry ``[i, j]`` of the result is the cosine similarity between row
    ``i`` of ``a`` and row ``j`` of ``b``. Runs with gradient tracking disabled.

    Args:
        a: Tensor of shape ``(n, d)``, or a single vector of shape ``(d,)``.
        b: Tensor of shape ``(m, d)``, or a single vector of shape ``(d,)``.

    Returns:
        Tensor of shape ``(n, m)``; a 1-D input counts as one row.
    """
    # Promote a single vector to a one-row matrix: both the dim=1
    # normalisation and torch.mm require 2-D operands.
    if a.dim() == 1:
        a = a.unsqueeze(0)
    if b.dim() == 1:
        b = b.unsqueeze(0)

    return torch.mm(
        torch.nn.functional.normalize(a, p=2, dim=1),
        torch.nn.functional.normalize(b, p=2, dim=1).transpose(0, 1),
    )


def load_embeddings(path):
    """Load an ``{id: vector}`` JSON file.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``(ids, tensor)`` pair where ``ids`` lists the keys in file order
        and row ``i`` of ``tensor`` is the vector stored under ``ids[i]``.

    Raises:
        ValueError: If the file contains no embeddings.
    """
    with open(path, "r") as f:
        id_to_vector = json.load(f)
    if not id_to_vector:
        # An empty file would otherwise surface later as an obscure
        # dimension error inside the similarity computation.
        raise ValueError(f"No embeddings found in {path}")
    ids = list(id_to_vector.keys())
    return ids, torch.tensor([id_to_vector[_id] for _id in ids])


for task in tabular_retrieval:
    # Load query embeddings
    query_ids, queries_embeddings = load_embeddings(f"./{model_name}/embed/{task}_queries.json")

    # Load corpus embeddings
    corpus_ids, corpus_embeddings = load_embeddings(f"./{model_name}/embed/{task}_corpus_convert.json")

    similarity_matrix = cos_sim(queries_embeddings, corpus_embeddings)

    top_k = 50 if task in ['FinQABench', 'FinanceBench'] else 500
    # torch.topk raises when k exceeds the number of candidates, so never ask
    # for more results than there are corpus entries.
    k = min(top_k, len(corpus_ids))

    top_matches = {}

    for i, query_id in enumerate(query_ids):
        top_values, top_indices = torch.topk(similarity_matrix[i], k)
        # Convert to plain Python lists once; the scores keep topk's order.
        top_matches[query_id] = {
            corpus_ids[idx]: score
            for idx, score in zip(top_indices.tolist(), top_values.tolist())
        }

    output_path = f"./{model_name}/{task}_convert.json"
    with open(output_path, "w") as f:
        json.dump(top_matches, f)