In [1]:
import pandas as pd
import json

from beir.retrieval.search.lexical import BM25Search as BM25


from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.models import SPLADE, SentenceBERT, UniCOIL
from beir.retrieval.search.sparse import SparseSearch


from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from github_search.evaluation.beir_evaluation import EvaluateRetrievalCustom as EvaluateRetrieval, CorpusDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25

import sentence_transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pickle 

with open("/home/kuba/Projects/github_search/.dagster/storage/corpus_information", "rb") as f:
    corpora = json.loads(pickle.load(f))

[(cname, len(corpora[cname].keys())) for cname in corpora.keys()]

[('readme', 6773),
 ('repomap', 6773),
 ('selected_code', 6773),
 ('dependency_signature', 6773),
 ('repository_signature', 6773),
 ('generated_tasks', 6773),
 ('code2doc_generated_readme', 6773),
 ('code2doc_files_summary', 6773),
 ('repomap_code2doc_generated_readme', 6773),
 ('repomap_code2doc_files_summary', 6773)]

In [3]:
list(corpora["repository_signature"].items())[:10]

[('0',
  {'text': 'repo: 008karan/SincNet_demo\n\nrepo-file:\ncompute_d_vector.py, data_io.py, speaker_id.py, dnn_models.py, data_io.py, similarity.py, dnn_models.py, TIMIT_preparation.py, inference.py, dnn_models.py\n\nfile-import:\nnumpy, sys, torch, os, soundfile, torch.autograd.Variable, torch.nn, json, sklearn.metrics.pairwise.cosine_similarity, data_io.ReadList\n\nfile-class:\nLayerNorm, dnn_models.py, dnn_models.py, dnn_models.py, SincConv_fast, MLP, sinc_conv, SincNet\n\nfile-function:\nflip, ReadList, create_batches_rnd, act_fun, sinc, speaker_id.py, speaker_id.py, dnn_models.py, data_io.py, dnn_models.py\ntasks:\nspeaker recognition, speech feature extraction',
   'title': '008karan/SincNet_demo'}),
 ('1',
  {'text': 'repo: 011235813/cm3\n\nrepo-file:\nalg/replay_buffer.py, alg/alg_baseline_checkers.py, alg/alg_qmix.py, env/multiagent-particle-envs/multiagent/policy.py, env/multiagent-particle-envs/multiagent/core.py, env/multiagent-particle-envs/multiagent/multi_discrete.py,

In [4]:
corpora.keys()

dict_keys(['readme', 'repomap', 'selected_code', 'dependency_signature', 'repository_signature', 'generated_tasks', 'code2doc_generated_readme', 'code2doc_files_summary', 'repomap_code2doc_generated_readme', 'repomap_code2doc_files_summary'])

In [5]:
def get_repos_for_query(query, repos_df):
    return repos_df[repos_df["tasks"].apply(lambda ts: query in ts)]


def get_queries(repos_df, min_query_count):
    all_queries = repos_df["query_tasks"].explode()
    qcounts = all_queries.value_counts()
    return qcounts[qcounts >= min_query_count].index.to_list()

def prepare_query_data(repos_df, min_query_count=5):
    task_queries = {str(i): query for (i, query) in enumerate(get_queries(repos_df, min_query_count=min_query_count))}

    task_qrels = {
        qid: {str(corpus_id): 1 for corpus_id in get_repos_for_query(task_queries[qid], repos_df).index}
        for qid in task_queries.keys()
    }
    return task_queries, task_qrels

In [6]:
with open("/home/kuba/Projects/github_search/.dagster/storage/repos_with_representations_df", "rb") as f:
    sampled_repos_df = pickle.load(f)


repos_sorted = [rec["title"] for rec in list(corpora["readme"].values())]
sampled_repos_df = pd.Series(repos_sorted, name="repo").reset_index().merge(sampled_repos_df, on="repo")
task_queries, task_qrels = prepare_query_data(sampled_repos_df, min_query_count=10)

In [7]:
len(task_queries.values())

742

In [8]:
pd.Series(map(len, task_qrels.values())).describe()

count    742.000000
mean      23.832884
std       30.464228
min       10.000000
25%       11.000000
50%       13.000000
75%       22.000000
max      271.000000
dtype: float64

In [9]:
sampled_repos_df = sampled_repos_df[sampled_repos_df["tasks"].apply(len) <= 10]

with open("../output/elasticsearch/queries_qrels.json", "w") as f:
    json.dump({"task_queries": task_queries, "task_qrels": task_qrels}, f)

In [10]:
for cid in corpora["readme"].keys():
    assert corpora["readme"][cid]["title"] == corpora["readme"][cid]["title"], f"no match at {cid}"
    #assert corpora["readme"][cid]["title"] == corpora[("dependency_signature", 0)][cid]["title"], f"no match at {cid}"

In [11]:
## Checking elasticsearch

In [12]:
import elasticsearch

es_client = elasticsearch.Elasticsearch()
def retrieve_repos_with_es(query, k=50, index="readme", es_client=es_client):
    es_result = es_client.search(index=index, body={"query": {"match": {"txt": query}}}, size=k)
    return [
        hit["_source"]["title"]
        for hit in es_result["hits"]["hits"]
    ]



def get_elasticsearch_results():
    retrieved_repo_tasks = {}

    qcounts = sampled_repos_df["tasks"].explode().value_counts()
    used_queries = [
        query
        for query in sampled_repos_df["tasks"].explode().drop_duplicates()
        if qcounts.loc[query] > 5
    ]
    # [task_queries[qid] for qid in task_queries.keys()]
    
    index="selected_code"
    for query in used_queries:
        retrieved_tasks = sampled_repos_df[sampled_repos_df["repo"].isin(retrieve_repos_with_es(query, index=index))]["tasks"].to_list()
        retrieved_repo_tasks[query] = retrieved_tasks
    
    k = 10
    query_hits = pd.Series({
        query: sum([query in tasks for tasks in retrieved_repo_tasks[query][:k]])
        for query in retrieved_repo_tasks.keys()
    })

def show_elasticsearch_results(qid='10'):
    query = task_queries[qid]
    
    print(query)
    print(query_hits[query], "hits")
    
    for hit in es_client.search(index=index, body={"query": {"match": {"txt": task_queries[qid]}}}, size=k)["hits"]["hits"]:
        print("#" * 100)
        print("#" * 100)
        repo_name = hit["_source"]["title"]
        repo_record = sampled_repos_df[sampled_repos_df["repo"] == repo_name].iloc[0]
        is_hit = query in repo_record["tasks"]
        print(repo_name, "HIT" if is_hit else "NO HIT")
        
        if is_hit:
            print("#" * 100)
            print("#" * 100)
            print(hit['_source']['txt'])

## Evaluating with BEIR

In [13]:
def load_w2v_sentence_transformer(w2v_model_path):
    w2v_layer = sentence_transformers.models.WordEmbeddings.load(w2v_model_path)
    model = sentence_transformers.SentenceTransformer(modules=[w2v_layer, sentence_transformers.models.Pooling(200)])
    model.max_seq_length = 2048
    return model


def load_sentence_bert(model_name):
    st_model = SentenceBERT("sentence-transformers/all-mpnet-base-v2")
    st_model.doc_model = sentence_transformers.SentenceTransformer(model_name, trust_remote_code=True)
    st_model.q_model = st_model.doc_model
    return st_model

def get_w2v_retriever(w2v_model_path="../models/rnn_abstract_readme_w2v/0_WordEmbeddings"):
    w2v_model = load_w2v_sentence_transformer(w2v_model_path)
    st_model = SentenceBERT("sentence-transformers/all-mpnet-base-v2")
    st_model.q_model = w2v_model
    st_model.doc_model = w2v_model
    return EvaluateRetrieval(DRES(st_model), score_function="cos_sim")

def get_splade_retriever(splade_model_path = "splade/weights/distilsplade_max", batch_size=128):
    splade_model = DRES(SPLADE(splade_model_path), batch_size=128)
    return EvaluateRetrieval(splade_model, score_function="dot")


def get_bm25_retrievers(corpora):
    def sanitize_index_name(index_name):
        if type(index_name) is str:
            return index_name
        else:
            return "".join(map(str, index_name))
    
    bm25_retrievers = {}
    for corpus_name, corpus in corpora.items():
        model = BM25(index_name=sanitize_index_name(corpus_name))
        retriever = EvaluateRetrieval(model)
        bm25_retrievers[corpus_name] = retriever
    return bm25_retrievers


sentence_transformer_model_names = [
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/all-MiniLM-L12-v2",
    #"nomic-ai/modernbert-embed-base",
    
    #"estrogen/ModernBERT-base-nli-v3"
    #"BAAI/bge-large-en-v1.5",
    #"mixedbread-ai/mxbai-embed-large-v1"
]

def get_sentence_transformer_retriever(model_name="sentence-transformers/all-mpnet-base-v2", batch_size=8):
    model = DRES(load_sentence_bert(model_name), batch_size=batch_size)
    return EvaluateRetrieval(model, score_function="cos_sim")

def get_unicoil_retriever(model_name="castorini/unicoil-msmarco-passage"):
    """
    THERE IS A BUG WITH BEIR THAT MAKES THIS UNUSABLE
    """
    model = SparseSearch(UniCOIL(model_path=model_name), batch_size=32)
    return EvaluateRetrieval(model, score_function="dot")

In [14]:
corpora.keys()

dict_keys(['readme', 'repomap', 'selected_code', 'dependency_signature', 'repository_signature', 'generated_tasks', 'code2doc_generated_readme', 'code2doc_files_summary', 'repomap_code2doc_generated_readme', 'repomap_code2doc_files_summary'])

In [15]:
def get_corpus_samples(corpora, n_repos=10):
    records = []


    for k in range(n_repos):
        for cname in corpora.keys():
            if type(cname) is tuple:
                if 0 in cname:
                    display_name = cname[0]
                else:
                    continue
            else:
                display_name = cname
            record = corpora[cname][str(k)]
            record["corpus"] = display_name
            records.append(record)
    
    return pd.DataFrame.from_records(records).rename(columns = {"title": "repo_name", "corpus": "representation"}).fillna(method="ffill")

In [16]:
def get_repomaps_df(repo_names, repomap_path="../output/aider/selected_repo_maps_1024.json"):
    with open(repomap_path) as f:
        repomaps = json.load(f)

    records = []
    for repo in repo_names:
        records.append({"repo_name": repo, "text": repomaps[repo], "representation": "repomap"})
    return pd.DataFrame.from_records(records)

from pylate import indexes, models, retrieve


class PyLateBEIRWrapper:

    def __init__(self, model_name="lightonai/colbertv2.0"):
        
        self.model = models.ColBERT(
            model_name_or_path=model_name,
        )
        self.index = indexes.Voyager(
            index_folder=f"../output/pylate-index/{model_name}",
            index_name="index",
            override=True,
        )
        self.retriever = None

    def index_corpus(self, corpus):
        documents = corpus.values()
        documents_embeddings = self.model.encode(
            documents,
            batch_size=32,
            is_query=False, # Encoding documents
            show_progress_bar=True,
        )
        
        # Add the documents ids and embeddings to the Voyager index
        self.index.add_documents(
            documents_ids=corpus.keys(),
            documents_embeddings=documents_embeddings,
        )
        self.retriever = retrieve.ColBERT(index=self.index)

    def retrieve(self, query):
        return self.retriever.retrieve(query)

pylate_model = PyLateBEIRWrapper()

In [17]:
import sentence_transformers

In [18]:
w2v_retriever = get_w2v_retriever()

In [19]:
w2v_retriever

<github_search.evaluation.beir_evaluation.EvaluateRetrievalCustom at 0x7c13e14bb380>

In [20]:
#splade_retriever = get_splade_retriever() 

# change sentence-transformers to 2.7?
sentence_transformer_retrievers = {
    model_name: get_sentence_transformer_retriever(model_name)
    for model_name in sentence_transformer_model_names
}

In [21]:
bm25_retrievers = get_bm25_retrievers(corpora)

## Per query results

In [22]:
from pydantic import BaseModel
from typing import Dict

class RetrieverInput(BaseModel):
    corpus: Dict[str, dict]
    queries: Dict[str, str]
    qrels: Dict[str, Dict[str, int]]


class RetrievalEvaluationResults(BaseModel):
    retrieval_results: Dict[str, Dict[str, float]]
    metrics: dict
    model_type: str

    @classmethod
    def from_retriever(cls, retriever, retriever_input, metric_names=["accuracy@k", "hits@k", "r_cap@k", "mrr@k"]):
        retrieval_results = retriever.retrieve(retriever_input.corpus, retriever_input.queries)
        custom_metrics = retriever.evaluate_custom_multi(retriever_input.qrels, retrieval_results, retriever.k_values, metrics=metric_names)
        other_metrics = retriever.evaluate(retriever_input.qrels, retrieval_results, retriever.k_values, ignore_identical_ids=False)
        metrics = custom_metrics | cls.tuple_to_dict(other_metrics)
        try:
            model_type = str(retriever.retriever.model)
        except:
            model_type = "bm25"
        return RetrievalEvaluationResults(metrics=metrics, model_type=model_type, retrieval_results=retrieval_results)


    @classmethod
    def tuple_to_dict(cls, dicts):
        merged_dict = {}
        for d in dicts:
            merged_dict = d | merged_dict
        return merged_dict




In [23]:
retriever_inputs = {
    corpus_name: RetrieverInput(corpus=corpus, queries=task_queries, qrels=task_qrels)
    for (corpus_name, corpus) in corpora.items()
}

In [24]:
from github_search.evaluation.beir_evaluation import PerQueryIREvaluator

In [25]:
per_query_evaluator = PerQueryIREvaluator(k_values=[1, 5, 10, 25])

In [26]:
retriever_inputs = {
    corpus_name: RetrieverInput(corpus=corpus, queries=task_queries, qrels=task_qrels)
    for (corpus_name, corpus) in corpora.items()
}

In [27]:
retriever_inputs.keys()

dict_keys(['readme', 'repomap', 'selected_code', 'dependency_signature', 'repository_signature', 'generated_tasks', 'code2doc_generated_readme', 'code2doc_files_summary', 'repomap_code2doc_generated_readme', 'repomap_code2doc_files_summary'])

In [28]:
named_retrievers = {
    corpus_name: [
        ("bm25", bm25_retrievers[corpus_name]),
        ("word2vec", w2v_retriever),
    ] + list(sentence_transformer_retrievers.items())
    for corpus_name in retriever_inputs.keys()
}

In [32]:
rc = named_retrievers["readme"][1][1]

In [29]:
retriever_inputs.keys()

dict_keys(['readme', 'repomap', 'selected_code', 'dependency_signature', 'repository_signature', 'generated_tasks', 'code2doc_generated_readme', 'code2doc_files_summary', 'repomap_code2doc_generated_readme', 'repomap_code2doc_files_summary'])

In [35]:
%%time
per_query_results = {
    (corpus_name, retriever_name): per_query_evaluator.get_scores(retriever=retriever, ir_data=retriever_inputs[corpus_name])
    for corpus_name in retriever_inputs.keys()
    for (retriever_name, retriever) in named_retrievers[corpus_name]
}

  0%|                                                                                                                                                                                   | 0/6773 [00:00<?, ?docs/s]
que: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.37it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 109.17it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:01<00:00, 35.06it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

CPU times: user 25min 14s, sys: 59.1 s, total: 26min 14s
Wall time: 7min 59s


In [36]:
raw_per_query_results_df = pd.concat([
    df.assign(retriever=[retriever_name]*len(df)).assign(corpus=[corpus_name]*len(df))
    for ((corpus_name, retriever_name), df) in per_query_results.items()
])

In [37]:
per_query_results_df = raw_per_query_results_df.assign(
    corpus=raw_per_query_results_df["corpus"].apply(lambda cname: cname if type(cname) is str else cname[0]),
    generation=raw_per_query_results_df["corpus"].apply(lambda cname: 0 if type(cname) is str else cname[1])
)

In [38]:
per_query_results_df = (
    per_query_results_df
        .groupby(["query", "retriever", "corpus"]).agg("mean").drop(columns=["generation"])
        .reset_index()
)

In [39]:
per_query_results_df

Unnamed: 0,query,retriever,corpus,Hits@1,Hits@5,Hits@10,Hits@25,Accuracy@1,Precision@1,Accuracy@5,Precision@5,Accuracy@10,Precision@10,Accuracy@25,Precision@25
0,2d human pose estimation,bm25,code2doc_files_summary,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.0,1.0,0.2,1.0,0.24
1,2d human pose estimation,bm25,code2doc_generated_readme,0.0,2.0,2.0,8.0,0.0,0.0,1.0,0.4,1.0,0.2,1.0,0.32
2,2d human pose estimation,bm25,dependency_signature,0.0,1.0,4.0,5.0,0.0,0.0,1.0,0.2,1.0,0.4,1.0,0.20
3,2d human pose estimation,bm25,generated_tasks,0.0,2.0,5.0,9.0,0.0,0.0,1.0,0.4,1.0,0.5,1.0,0.36
4,2d human pose estimation,bm25,readme,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,1.0,0.2,1.0,0.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29675,zero shot learning,word2vec,repomap,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
29676,zero shot learning,word2vec,repomap_code2doc_files_summary,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.04
29677,zero shot learning,word2vec,repomap_code2doc_generated_readme,0.0,1.0,1.0,2.0,0.0,0.0,1.0,0.2,1.0,0.1,1.0,0.08
29678,zero shot learning,word2vec,repository_signature,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,1.0,0.1,1.0,0.12


In [40]:
per_query_results_df.to_csv("../results/per_query_ir_results.csv", index=False)

In [41]:
(per_query_results_df
    .drop(columns=["query"])
    .groupby(["corpus", "retriever"])
    .agg("mean").reset_index(drop=False)
    .sort_values("Accuracy@10")
)[["corpus", "retriever", "Precision@10", "Accuracy@10"]]

Unnamed: 0,corpus,retriever,Precision@10,Accuracy@10
23,repomap,word2vec,0.015768,0.130728
11,dependency_signature,word2vec,0.024394,0.160377
39,selected_code,word2vec,0.027763,0.203504
27,repomap_code2doc_files_summary,word2vec,0.077763,0.431267
3,code2doc_files_summary,word2vec,0.078437,0.443396
31,repomap_code2doc_generated_readme,word2vec,0.118329,0.571429
7,code2doc_generated_readme,word2vec,0.124124,0.583558
19,readme,word2vec,0.121563,0.586253
35,repository_signature,word2vec,0.130863,0.592992
8,dependency_signature,bm25,0.13483,0.593197


## Aggregated results

In [36]:
for corpus_name in corpora.keys():
    try:
        RetrievalEvaluationResults.from_retriever(bm25_retrievers[corpus_name], retriever_inputs[corpus_name])
    except:
        print(corpus_name)

  0%|                                                                                                            | 0/6776 [00:00<?, ?docs/s]
que: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.73it/s]
  0%|                                                                                                            | 0/6776 [00:00<?, ?docs/s]
que: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.21it/s]
  0%|                                                                                                            | 0/6776 [00:00<?, ?docs/s]
que: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.63it/s]
  0%|                                                                                                            | 0/6776 [00:00<?, ?docs/s]
que: 100%|███

In [37]:
bm25_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(bm25_retrievers[corpus_name], retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

  0%|                                                                                                            | 0/6776 [00:00<?, ?docs/s]
que: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.71it/s]
  0%|                                                                                                            | 0/6776 [00:00<?, ?docs/s]
que: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.77it/s]
  0%|                                                                                                            | 0/6776 [00:00<?, ?docs/s]
que: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.60it/s]
  0%|                                                                                                            | 0/6776 [00:00<?, ?docs/s]
que: 100%|███

splade_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(splade_retriever, retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

In [38]:
word2vec_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(w2v_retriever, retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 350.39it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:01<00:00, 39.09it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1510.74it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:00<00:00, 226.02it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1476.61it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:02<00:00, 24.48it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1590.66it/s]
Batches: 100%

In [39]:
sentence_transformer_results = {
    (corpus_name, model_name.split("/")[1]): RetrievalEvaluationResults.from_retriever(sentence_transformer_retrievers[model_name], retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
    for model_name in sentence_transformer_model_names
}

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 93/93 [00:00<00:00, 159.87it/s]
Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 847/847 [00:35<00:00, 23.70it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 93/93 [00:00<00:00, 172.51it/s]
Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 847/847 [00:09<00:00, 89.90it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 93/93 [00:00<00:00, 164.30it/s]
Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 847/847 [00:18<00:00, 44.68it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 93/93 [00:00<00:00, 171.20it/s]
Batches: 100%

In [40]:
bm25_metrics = [
    {"corpus": corpus_name, "retriever": "bm25", **bm25_results[corpus_name].metrics}
    for corpus_name in corpora.keys()
]

In [41]:
word2vec_metrics = [
    {"corpus": corpus_name, "retriever": "Python code word2vec", **word2vec_results[corpus_name].metrics}
    for corpus_name in corpora.keys()
]

In [42]:
#splade_metrics = [
#    {"corpus": corpus_name, "retriever": "splade", **splade_results[corpus_name].metrics}
#     for corpus_name in corpora.keys()
#]
 
sentence_transformer_metrics = [
    {"corpus": corpus_name, "retriever": f"{model_name} (sentence_transformer)", **sentence_transformer_results[(corpus_name, model_name)].metrics}
    for (corpus_name, model_name) in sentence_transformer_results.keys()
]

all_metrics_df = pd.DataFrame.from_records(bm25_metrics + word2vec_metrics +  sentence_transformer_metrics).sort_values("Hits@10", ascending=False)

In [43]:
all_metrics_df.shape

(32, 50)

In [44]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]]

Unnamed: 0,corpus,retriever,Accuracy@10
16,readme,all-mpnet-base-v2 (sentence_transformer),0.92867
0,readme,bm25,0.91386
17,readme,all-MiniLM-L12-v2 (sentence_transformer),0.91925
18,code2doc_generated_readme,all-mpnet-base-v2 (sentence_transformer),0.85599
19,code2doc_generated_readme,all-MiniLM-L12-v2 (sentence_transformer),0.86272
22,code2doc_reasoning,all-mpnet-base-v2 (sentence_transformer),0.84657
23,code2doc_reasoning,all-MiniLM-L12-v2 (sentence_transformer),0.84522
1,code2doc_generated_readme,bm25,0.83176
3,code2doc_reasoning,bm25,0.83445
24,code2doc_generation_context,all-mpnet-base-v2 (sentence_transformer),0.79139


In [45]:
all_metrics_df.columns

Index(['corpus', 'retriever', 'Accuracy@1', 'Accuracy@3', 'Accuracy@5',
       'Accuracy@10', 'Accuracy@100', 'Accuracy@1000', 'Hits@1', 'Hits@3',
       'Hits@5', 'Hits@10', 'Hits@100', 'Hits@1000', 'R_cap@1', 'R_cap@3',
       'R_cap@5', 'R_cap@10', 'R_cap@100', 'R_cap@1000', 'MRR@1', 'MRR@3',
       'MRR@5', 'MRR@10', 'MRR@100', 'MRR@1000', 'P@1', 'P@3', 'P@5', 'P@10',
       'P@100', 'P@1000', 'Recall@1', 'Recall@3', 'Recall@5', 'Recall@10',
       'Recall@100', 'Recall@1000', 'MAP@1', 'MAP@3', 'MAP@5', 'MAP@10',
       'MAP@100', 'MAP@1000', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10',
       'NDCG@100', 'NDCG@1000'],
      dtype='object')

In [68]:
model_name = "qwen2.5:7b-instruct"

In [47]:
all_metrics_df.to_csv(f"../output/code2doc/beir_results_{model_name}.csv", index=False)

In [48]:
#all_metrics_df.to_csv(f"../output/code2doc/beir_results_with_modernbert_{model_name}.csv", index=False)

## Results

By default we will use min_task_count=10 (as we used originally)

We can switch to smaller task counts like 5 to incorporate the fact that we use sample of repos

In [49]:
metric_df_cols = ["corpus", "retriever", "Accuracy@10", "Hits@10", "R_cap@10", "P@1", "P@5", "P@10"]

In [50]:
all_metrics_df[metric_df_cols]

Unnamed: 0,corpus,retriever,Accuracy@10,Hits@10,R_cap@10,P@1,P@5,P@10
16,readme,all-mpnet-base-v2 (sentence_transformer),0.92867,4.25841,0.42571,0.64334,0.51252,0.42571
0,readme,bm25,0.91386,4.23149,0.42301,0.60162,0.50148,0.42301
17,readme,all-MiniLM-L12-v2 (sentence_transformer),0.91925,4.03769,0.40363,0.66353,0.51602,0.40363
18,code2doc_generated_readme,all-mpnet-base-v2 (sentence_transformer),0.85599,3.21669,0.32153,0.52086,0.40215,0.32153
19,code2doc_generated_readme,all-MiniLM-L12-v2 (sentence_transformer),0.86272,3.15882,0.31575,0.51952,0.40054,0.31575
22,code2doc_reasoning,all-mpnet-base-v2 (sentence_transformer),0.84657,3.06326,0.30619,0.52355,0.39112,0.30619
23,code2doc_reasoning,all-MiniLM-L12-v2 (sentence_transformer),0.84522,2.95424,0.29529,0.52221,0.37362,0.29529
1,code2doc_generated_readme,bm25,0.83176,2.77793,0.27766,0.50338,0.36022,0.27889
3,code2doc_reasoning,bm25,0.83445,2.7214,0.27201,0.49933,0.35088,0.27301
24,code2doc_generation_context,all-mpnet-base-v2 (sentence_transformer),0.79139,2.50067,0.24993,0.4428,0.32463,0.24993


In [51]:
all_metrics_df[metric_df_cols].sort_values("Accuracy@10", ascending=False)

Unnamed: 0,corpus,retriever,Accuracy@10,Hits@10,R_cap@10,P@1,P@5,P@10
16,readme,all-mpnet-base-v2 (sentence_transformer),0.92867,4.25841,0.42571,0.64334,0.51252,0.42571
17,readme,all-MiniLM-L12-v2 (sentence_transformer),0.91925,4.03769,0.40363,0.66353,0.51602,0.40363
0,readme,bm25,0.91386,4.23149,0.42301,0.60162,0.50148,0.42301
19,code2doc_generated_readme,all-MiniLM-L12-v2 (sentence_transformer),0.86272,3.15882,0.31575,0.51952,0.40054,0.31575
18,code2doc_generated_readme,all-mpnet-base-v2 (sentence_transformer),0.85599,3.21669,0.32153,0.52086,0.40215,0.32153
22,code2doc_reasoning,all-mpnet-base-v2 (sentence_transformer),0.84657,3.06326,0.30619,0.52355,0.39112,0.30619
23,code2doc_reasoning,all-MiniLM-L12-v2 (sentence_transformer),0.84522,2.95424,0.29529,0.52221,0.37362,0.29529
3,code2doc_reasoning,bm25,0.83445,2.7214,0.27201,0.49933,0.35088,0.27301
1,code2doc_generated_readme,bm25,0.83176,2.77793,0.27766,0.50338,0.36022,0.27889
4,code2doc_generation_context,bm25,0.80619,2.4751,0.24738,0.46559,0.3247,0.24777


In [52]:
all_metrics_df.groupby("corpus").apply(lambda df: df.sort_values("Accuracy@10", ascending=False).iloc[0])[metric_df_cols].sort_values("Accuracy@10", ascending=False)

  all_metrics_df.groupby("corpus").apply(lambda df: df.sort_values("Accuracy@10", ascending=False).iloc[0])[metric_df_cols].sort_values("Accuracy@10", ascending=False)


Unnamed: 0_level_0,corpus,retriever,Accuracy@10,Hits@10,R_cap@10,P@1,P@5,P@10
corpus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
readme,readme,all-mpnet-base-v2 (sentence_transformer),0.92867,4.25841,0.42571,0.64334,0.51252,0.42571
code2doc_generated_readme,code2doc_generated_readme,all-MiniLM-L12-v2 (sentence_transformer),0.86272,3.15882,0.31575,0.51952,0.40054,0.31575
code2doc_reasoning,code2doc_reasoning,all-mpnet-base-v2 (sentence_transformer),0.84657,3.06326,0.30619,0.52355,0.39112,0.30619
code2doc_generation_context,code2doc_generation_context,bm25,0.80619,2.4751,0.24738,0.46559,0.3247,0.24777
selected_code,selected_code,all-MiniLM-L12-v2 (sentence_transformer),0.69044,1.63122,0.16299,0.36205,0.22503,0.16299
generated_tasks,generated_tasks,all-MiniLM-L12-v2 (sentence_transformer),0.27995,0.38089,0.03795,0.05114,0.04199,0.03795
repository_signature,repository_signature,all-mpnet-base-v2 (sentence_transformer),0.23957,0.32301,0.03217,0.05114,0.04118,0.03217
dependency_signature,dependency_signature,all-mpnet-base-v2 (sentence_transformer),0.19785,0.26514,0.02638,0.04711,0.03634,0.02638


In [53]:
all_metrics_df.groupby("retriever").apply(lambda df: df.sort_values("Accuracy@10", ascending=False).iloc[0])[metric_df_cols].sort_values("Accuracy@10", ascending=False)

  all_metrics_df.groupby("retriever").apply(lambda df: df.sort_values("Accuracy@10", ascending=False).iloc[0])[metric_df_cols].sort_values("Accuracy@10", ascending=False)


Unnamed: 0_level_0,corpus,retriever,Accuracy@10,Hits@10,R_cap@10,P@1,P@5,P@10
retriever,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
all-mpnet-base-v2 (sentence_transformer),readme,all-mpnet-base-v2 (sentence_transformer),0.92867,4.25841,0.42571,0.64334,0.51252,0.42571
all-MiniLM-L12-v2 (sentence_transformer),readme,all-MiniLM-L12-v2 (sentence_transformer),0.91925,4.03769,0.40363,0.66353,0.51602,0.40363
bm25,readme,bm25,0.91386,4.23149,0.42301,0.60162,0.50148,0.42301
Python code word2vec,readme,Python code word2vec,0.58681,1.21669,0.12153,0.27322,0.15693,0.12153


In [54]:
all_metrics_df[all_metrics_df["retriever"] == "bm25"][metric_df_cols]

Unnamed: 0,corpus,retriever,Accuracy@10,Hits@10,R_cap@10,P@1,P@5,P@10
0,readme,bm25,0.91386,4.23149,0.42301,0.60162,0.50148,0.42301
1,code2doc_generated_readme,bm25,0.83176,2.77793,0.27766,0.50338,0.36022,0.27889
3,code2doc_reasoning,bm25,0.83445,2.7214,0.27201,0.49933,0.35088,0.27301
4,code2doc_generation_context,bm25,0.80619,2.4751,0.24738,0.46559,0.3247,0.24777
2,selected_code,bm25,0.67968,1.80619,0.18048,0.37348,0.23843,0.18146
7,generated_tasks,bm25,0.25168,0.33244,0.03311,0.0625,0.03723,0.03139
6,repository_signature,bm25,0.21534,0.27725,0.02759,0.06495,0.03599,0.0276
5,dependency_signature,bm25,0.14939,0.18304,0.01817,0.04348,0.02582,0.01834


In [55]:
len(task_queries)

743

In [56]:
# task count = 5

In [57]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

Unnamed: 0,corpus,retriever,Accuracy@10
16,readme,all-mpnet-base-v2 (sentence_transformer),0.92867
17,readme,all-MiniLM-L12-v2 (sentence_transformer),0.91925
0,readme,bm25,0.91386
19,code2doc_generated_readme,all-MiniLM-L12-v2 (sentence_transformer),0.86272
18,code2doc_generated_readme,all-mpnet-base-v2 (sentence_transformer),0.85599
22,code2doc_reasoning,all-mpnet-base-v2 (sentence_transformer),0.84657
23,code2doc_reasoning,all-MiniLM-L12-v2 (sentence_transformer),0.84522
3,code2doc_reasoning,bm25,0.83445
1,code2doc_generated_readme,bm25,0.83176
4,code2doc_generation_context,bm25,0.80619


In [58]:
# task count = 10

In [59]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

Unnamed: 0,corpus,retriever,Accuracy@10
16,readme,all-mpnet-base-v2 (sentence_transformer),0.92867
17,readme,all-MiniLM-L12-v2 (sentence_transformer),0.91925
0,readme,bm25,0.91386
19,code2doc_generated_readme,all-MiniLM-L12-v2 (sentence_transformer),0.86272
18,code2doc_generated_readme,all-mpnet-base-v2 (sentence_transformer),0.85599
22,code2doc_reasoning,all-mpnet-base-v2 (sentence_transformer),0.84657
23,code2doc_reasoning,all-MiniLM-L12-v2 (sentence_transformer),0.84522
3,code2doc_reasoning,bm25,0.83445
1,code2doc_generated_readme,bm25,0.83176
4,code2doc_generation_context,bm25,0.80619


In [60]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

retriever
Python code word2vec                        0.347915
bm25                                        0.585294
all-MiniLM-L12-v2 (sentence_transformer)    0.596399
all-mpnet-base-v2 (sentence_transformer)    0.602290
Name: Accuracy@10, dtype: float64

In [61]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

retriever
Python code word2vec                        0.347915
bm25                                        0.585294
all-MiniLM-L12-v2 (sentence_transformer)    0.596399
all-mpnet-base-v2 (sentence_transformer)    0.602290
Name: Accuracy@10, dtype: float64

In [62]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

retriever
Python code word2vec                        0.347915
bm25                                        0.585294
all-MiniLM-L12-v2 (sentence_transformer)    0.596399
all-mpnet-base-v2 (sentence_transformer)    0.602290
Name: Accuracy@10, dtype: float64

In [63]:
all_metrics_df.groupby("corpus")["Accuracy@10"].agg("mean").sort_values()

corpus
dependency_signature           0.150740
repository_signature           0.204575
generated_tasks                0.251010
selected_code                  0.560902
code2doc_generation_context    0.707268
code2doc_reasoning             0.768505
code2doc_generated_readme      0.783648
readme                         0.837148
Name: Accuracy@10, dtype: float64

In [64]:
sampled_repos_df["tasks"].explode().value_counts().loc[list(task_queries.values())]

tasks
image classification            271
representation learning         252
frame                           246
question answering              224
transfer learning               219
                               ... 
aerial scene classification      10
backdoor attack                  10
keyphrase generation             10
multi frame super resolution     10
sequential recommendation        10
Name: count, Length: 743, dtype: int64

In [65]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

Unnamed: 0,corpus,retriever,Accuracy@10
16,readme,all-mpnet-base-v2 (sentence_transformer),0.92867
17,readme,all-MiniLM-L12-v2 (sentence_transformer),0.91925
0,readme,bm25,0.91386
19,code2doc_generated_readme,all-MiniLM-L12-v2 (sentence_transformer),0.86272
18,code2doc_generated_readme,all-mpnet-base-v2 (sentence_transformer),0.85599
22,code2doc_reasoning,all-mpnet-base-v2 (sentence_transformer),0.84657
23,code2doc_reasoning,all-MiniLM-L12-v2 (sentence_transformer),0.84522
3,code2doc_reasoning,bm25,0.83445
1,code2doc_generated_readme,bm25,0.83176
4,code2doc_generation_context,bm25,0.80619


## Does combining rationale with generated readme help?

It seems that the best sentence transformer retrievers can only get worse when using any other information!

In [66]:
sentence_transformer_results.keys()

dict_keys([('readme', 'all-mpnet-base-v2'), ('readme', 'all-MiniLM-L12-v2'), ('code2doc_generated_readme', 'all-mpnet-base-v2'), ('code2doc_generated_readme', 'all-MiniLM-L12-v2'), ('selected_code', 'all-mpnet-base-v2'), ('selected_code', 'all-MiniLM-L12-v2'), ('code2doc_reasoning', 'all-mpnet-base-v2'), ('code2doc_reasoning', 'all-MiniLM-L12-v2'), ('code2doc_generation_context', 'all-mpnet-base-v2'), ('code2doc_generation_context', 'all-MiniLM-L12-v2'), ('dependency_signature', 'all-mpnet-base-v2'), ('dependency_signature', 'all-MiniLM-L12-v2'), ('repository_signature', 'all-mpnet-base-v2'), ('repository_signature', 'all-MiniLM-L12-v2'), ('generated_tasks', 'all-mpnet-base-v2'), ('generated_tasks', 'all-MiniLM-L12-v2')])

In [67]:
st_generated_readme_results= sentence_transformer_results[('generated_readme', 'all-mpnet-base-v2')].retrieval_results
st_rationale_results = sentence_transformer_results[('generated_rationale', 'all-mpnet-base-v2')].retrieval_results
bm25_generated_readme_results = bm25_results["generated_readme"].retrieval_results
st_context_results = sentence_transformer_results[('generation_context', 'all-mpnet-base-v2')].retrieval_results

KeyError: ('generated_readme', 'all-mpnet-base-v2')

In [None]:
len(list(bm25_generated_readme_results.keys()))

In [None]:
len(list(st_generated_readme_results.keys()))

In [None]:
def merge_qrels(qrels1, qrels2):
    merged_qrels = {}
    for k in qrels1.keys():
        tmp_rel = dict()
        for rel_k in set(qrels1[k].keys()).union(qrels2[k]):
            tmp_rel[rel_k] = qrels1[k].get(rel_k, 0) +  qrels2[k].get(rel_k, 0)
        merged_qrels[k] = tmp_rel
    return merged_qrels

In [None]:
st_generation_results = merge_qrels(bm25_generated_readme_results, st_generated_readme_results)

In [None]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_generation_results, metric="acc", k_values=[1,5,10])

In [None]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_generated_readme_results, metric="acc", k_values=[1,5,10])

In [None]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_rationale_results, metric="acc", k_values=[1,5,10])

In [None]:
all_metrics_df[all_metrics_df["retriever"] == "bm25"][["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10")

In [None]:
Splitting does not make much sense as the most of generated data is under the sentence-transformer context length (384 tokens)

In [None]:
def split_corpus_by_lengths(corpus, chunk_length):
    splitted_corpora = [dict() for _ in range(n_splits)]
    for c_id in corpus.keys():
        text = corpus[c_id]["text"]
        chunk_length =  len(text) // n_splits
        for i in range(0, n_splits):
            splitted_corpora[i] = text[i*chunk_length:(i+1)*chunk_length]
        

In [None]:
class MultiTextEvaluator(BaseModel):
    """
    Evaluate a dataframe that has multiple texts for each query (multiple generation experiments)
    iteration_col says which experiment it was
    """
    iteration_col: str
    text_cols: List[str]
    k_values: List[int] = [1,5,10,25]

    def get_ir_datas(self, df):
        for iter in df[self.iteration_col].unique():
            ir_data = load_ir_data(df[df[self.iteration_col] == iter], self.text_cols)
            yield (iter, ir_data)

    def evaluate(self, df, retriever):
        ir_datas = dict(self.get_ir_datas(df))
        dfs = []
        for iter, ir_data in ir_datas.items():
            per_query_evaluator = PerQueryIREvaluator(k_values=self.k_values)
            df = per_query_evaluator.get_scores(ir_data, retriever)
            df[self.iteration_col] = iter
            dfs.append(df)
        metrics_df = pd.concat(dfs)
        metrics_df["query"] = metrics_df.index
        return metrics_df