In [16]:
from datasets import load_dataset
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sentence_transformers.util import cos_sim
import time
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer
from torch import no_grad
import torch
import pickle
import os
import json

In [6]:
CHECKPOINT_DIR = "../checkpoints"

# Map each run directory under CHECKPOINT_DIR to its "best-model" folder,
# skipping entries that do not contain one.
MODELS_PATH = {}
for run_name in os.listdir(CHECKPOINT_DIR):
    best_model_dir = os.path.join(CHECKPOINT_DIR, run_name, "best-model")
    if os.path.isdir(best_model_dir):
        MODELS_PATH[run_name] = best_model_dir

print(json.dumps(MODELS_PATH, indent=4))

{
    "e5-mnrl": "../checkpoints/e5-mnrl/best-model",
    "stella-contrastive": "../checkpoints/stella-contrastive/best-model",
    "stella-csmnrl": "../checkpoints/stella-csmnrl/best-model",
    "e5-onlinecontrastive": "../checkpoints/e5-onlinecontrastive/best-model",
    "gte-csmnrl": "../checkpoints/gte-csmnrl/best-model",
    "stella-onlinecontrastive": "../checkpoints/stella-onlinecontrastive/best-model",
    "stella-mnrl": "../checkpoints/stella-mnrl/best-model",
    "e5-csmnrl": "../checkpoints/e5-csmnrl/best-model",
    "gte-mnrl": "../checkpoints/gte-mnrl/best-model",
    "gte-contrastive": "../checkpoints/gte-contrastive/best-model",
    "gte-onlinecontrastive": "../checkpoints/gte-onlinecontrastive/best-model",
    "e5-contrastive": "../checkpoints/e5-contrastive/best-model"
}


In [11]:
# Fetch the GitHub issue-pair dataset.
ds = load_dataset("WhereIsAI/github-issue-similarity", "default")


def _has_both_texts(row):
    """Return True when neither raw text field of the pair is the empty string."""
    return not (row["text1"] == "" or row["text2"] == "")


# Drop pairs with an empty side from each of the three splits.
for split_name in ("train", "valid", "test"):
    ds[split_name] = ds[split_name].filter(_has_both_texts)

# Later cells refer to the two texts as sentence1 / sentence2.
ds = ds.rename_columns({"text1": "sentence1", "text2": "sentence2"})

def remove_html_tags(sample):
    """Replace both sentences of one example with their HTML-free text.

    Each of ``sentence1`` and ``sentence2`` is parsed with BeautifulSoup's
    ``html.parser``, reduced to its text content and stripped of surrounding
    whitespace. The example is modified in place and returned.

    NOTE(review): this runs after the empty-string filter, so a text made up
    only of markup or whitespace may end up empty here - confirm whether such
    pairs should be filtered again.
    """
    for column in ("sentence1", "sentence2"):
        plain_text = BeautifulSoup(sample[column], "html.parser").get_text()
        sample[column] = plain_text.strip()
    return sample


# Clean every split, using 8 worker processes.
ds = ds.map(remove_html_tags, num_proc=8)

Map (num_proc=8):   0%|          | 0/18426 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/1538 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/1539 [00:00<?, ? examples/s]

In [13]:
# TF-IDF baseline: NLTK word tokenisation with scikit-learn's English stop-word list.
baseline_tfidf = TfidfVectorizer(tokenizer=word_tokenize, stop_words="english")

# Fit on every training sentence, i.e. both sides of each training pair.
train_split = ds["train"]
baseline_tfidf.fit(train_split["sentence1"] + train_split["sentence2"])



In [15]:
# Persist the fitted TF-IDF vectorizer alongside the model checkpoints.
baseline_dir = os.path.join(CHECKPOINT_DIR, "baseline_tfidf")
os.makedirs(baseline_dir, exist_ok=True)

baseline_pickle_path = os.path.join(baseline_dir, "model.pkl")
with open(baseline_pickle_path, "wb") as f:
    pickle.dump(baseline_tfidf, f)

In [17]:
class RetrievalEvaluator:
    """Retrieval benchmark for sentence-embedding checkpoints and a TF-IDF baseline.

    Row ``i`` of ``dataset["test"]`` is treated as a (query, document) pair:
    ``sentence1[i]`` is the query, every ``sentence2`` entry is a candidate
    document, and ``sentence2[i]`` is the single relevant document. For each
    system the mean reciprocal rank (MRR), hits@k and the vectorisation time
    per text are recorded.

    NOTE(review): every pair is counted as relevant regardless of any ``label``
    column the split may carry - confirm that non-matching pairs should be
    scored this way.

    Args:
        models_path: Mapping of display name -> path loadable by
            ``SentenceTransformer``.
        baseline: An already fitted vectorizer exposing ``transform`` and
            returning sparse matrices (e.g. a fitted ``TfidfVectorizer``).
        dataset: Mapping with a ``"test"`` split exposing ``"sentence1"`` and
            ``"sentence2"`` columns of equal length.
        k_values: Cut-offs for hits@k. Copied, so the caller's sequence is
            never shared or mutated.
    """

    def __init__(self, models_path, baseline, dataset, k_values=(1, 3, 5, 10)):
        self.models_path = models_path
        self.baseline = baseline
        self.dataset = dataset
        # Copy into a fresh list: an immutable default plus a copy avoids state
        # shared between instances.
        self.k_values = list(k_values)
        self.results = []

    def _compute_metrics(self, ranked_indices, relevant_idx):
        """Return ``(reciprocal_rank, hits)`` for a single query.

        Args:
            ranked_indices: Document indices ordered from most to least similar.
            relevant_idx: Index of the one relevant document.

        Returns:
            The reciprocal rank (``0`` when the relevant document is absent from
            the ranking) and a dict mapping each ``k`` to ``1`` if the relevant
            document is within the top ``k``, else ``0``.
        """
        positions = np.flatnonzero(np.asarray(ranked_indices) == relevant_idx)
        if positions.size == 0:
            return 0, {k: 0 for k in self.k_values}

        rank = int(positions[0]) + 1  # 1-based rank of the relevant document
        return 1 / rank, {k: int(rank <= k) for k in self.k_values}

    def _test_pairs(self):
        """Return the test queries and documents as plain Python lists."""
        test_split = self.dataset["test"]
        # list(...) keeps `+` and len() working even if the column is not a list.
        return list(test_split["sentence1"]), list(test_split["sentence2"])

    def _score_similarities(self, similarity_matrix):
        """Average MRR and hits@k for a (n_queries, n_docs) similarity matrix.

        Query ``i`` is scored against relevant document ``i``.
        """
        rankings = np.argsort(-similarity_matrix, axis=1)

        mrr_scores = []
        hits_scores = {k: [] for k in self.k_values}
        for query_idx, ranked_indices in enumerate(rankings):
            mrr, hits = self._compute_metrics(ranked_indices, query_idx)
            mrr_scores.append(mrr)
            for k in self.k_values:
                hits_scores[k].append(hits[k])

        avg_hits = {k: np.mean(hits_scores[k]) for k in self.k_values}
        return np.mean(mrr_scores), avg_hits

    def _result_row(self, name, avg_mrr, avg_hits, time_per_sample):
        """Build one row of the results table."""
        return {
            "model": name,
            "mrr": avg_mrr,
            **{f"hits@{k}": avg_hits[k] for k in self.k_values},
            "time": time_per_sample
        }

    def _evaluate_model(self,
                        model: "SentenceTransformer"):
        """Encode the test split with ``model`` and score the retrieval ranking.

        Returns:
            ``(avg_mrr, avg_hits, time_per_sample)`` where ``time_per_sample`` is
            the encoding wall time divided by the number of encoded texts.
        """
        test_queries, test_docs = self._test_pairs()
        all_texts = test_queries + test_docs

        start_encode = time.perf_counter()
        embeddings = model.encode(all_texts, convert_to_tensor=True, show_progress_bar=True)
        time_per_sample = (time.perf_counter() - start_encode) / len(all_texts)

        embeddings_queries = embeddings[:len(test_queries)]
        embeddings_docs = embeddings[len(test_queries):]

        similarity_matrix = cos_sim(embeddings_queries, embeddings_docs).cpu().numpy()
        avg_mrr, avg_hits = self._score_similarities(similarity_matrix)

        return avg_mrr, avg_hits, time_per_sample

    def _evaluate_baseline(self):
        """Score the fitted baseline vectorizer on the test split.

        The baseline is only *applied* (``transform``), never refitted, so the
        vectorizer handed to the constructor is left untouched. The reported
        time covers vectorising queries and documents, mirroring the encoding
        time measured for the embedding models.
        """
        test_queries, test_docs = self._test_pairs()

        start_vectorize = time.perf_counter()
        doc_vectors = self.baseline.transform(test_docs)
        query_vectors = self.baseline.transform(test_queries)
        elapsed = time.perf_counter() - start_vectorize
        time_per_sample = elapsed / (len(test_queries) + len(test_docs))

        # Dot product of the sparse rows; this equals cosine similarity when the
        # vectorizer L2-normalises its output (TfidfVectorizer's default).
        similarity_matrix = (query_vectors @ doc_vectors.T).toarray()
        avg_mrr, avg_hits = self._score_similarities(similarity_matrix)

        return avg_mrr, avg_hits, time_per_sample

    def evaluate(self):
        """Evaluate every checkpoint and the baseline, then save the results.

        ``self.results`` is reset first, so calling this again (for example
        after an interrupted run) does not duplicate rows.
        """
        self.results = []

        for model_name, model_path in self.models_path.items():
            print(f"Evaluating model: {model_name}...")
            model = SentenceTransformer(model_path)
            try:
                avg_mrr, avg_hits, time_per_sample = self._evaluate_model(model)
            finally:
                # Release the model even if encoding fails or is interrupted.
                del model
                torch.cuda.empty_cache()
            self.results.append(
                self._result_row(model_name, avg_mrr, avg_hits, time_per_sample)
            )

        print("Evaluating baseline TF-IDF...")
        avg_mrr, avg_hits, time_per_sample = self._evaluate_baseline()
        self.results.append(
            self._result_row("TF-IDF Baseline", avg_mrr, avg_hits, time_per_sample)
        )

        self._save_results()

    def _save_results(self):
        """Write ``self.results`` to a timestamped CSV file under ``rsrc/``."""
        results_df = pd.DataFrame(self.results)
        os.makedirs("rsrc", exist_ok=True)
        filename = f"rsrc/retrieval_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
        results_df.to_csv(filename, index=False)
        print(f"Results saved to {filename}")

In [18]:
# Benchmark every discovered checkpoint plus the TF-IDF baseline at hits@1/5/10.
retr_eval = RetrievalEvaluator(
    models_path=MODELS_PATH,
    baseline=baseline_tfidf,
    dataset=ds,
    k_values=[1, 5, 10],
)

In [19]:
# Run the full benchmark: each checkpoint in MODELS_PATH, then the TF-IDF
# baseline; results are written to a timestamped CSV under rsrc/.
# NOTE: the recorded output of this cell ends in KeyboardInterrupt, so the
# saved run did not complete.
retr_eval.evaluate()

Evaluating model: e5-mnrl...


Batches:   0%|          | 0/97 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [24]:
# BinaryClassificationEvaluator is already imported in the notebook's import
# cell, so it is not re-imported here.

# Number of test pairs used for this quick check. With so few pairs the
# resulting metrics are only a smoke test, not a benchmark.
N_SMOKE_TEST_PAIRS = 10

test_split = ds["test"]
binary_acc_evaluator = BinaryClassificationEvaluator(
    sentences1=test_split["sentence1"][:N_SMOKE_TEST_PAIRS],
    sentences2=test_split["sentence2"][:N_SMOKE_TEST_PAIRS],
    labels=test_split["label"][:N_SMOKE_TEST_PAIRS],
    name="git-issues",
    write_csv=False
)

In [26]:
# Load the "gte-mnrl" checkpoint and score it with the binary evaluator.
results = binary_acc_evaluator(
    model=SentenceTransformer(MODELS_PATH["gte-mnrl"]),
)

In [27]:
# Show the evaluator's metric dict. The evaluator was built from only ten test
# pairs, so these values are a sanity check rather than a benchmark.
results

{'git-issues_cosine_accuracy': 1.0,
 'git-issues_cosine_accuracy_threshold': 0.43238911032676697,
 'git-issues_cosine_f1': 1.0,
 'git-issues_cosine_f1_threshold': 0.43238911032676697,
 'git-issues_cosine_precision': 1.0,
 'git-issues_cosine_recall': 1.0,
 'git-issues_cosine_ap': 1.0,
 'git-issues_cosine_mcc': 1.0}