In [1]:
import pandas as pd
import json

from beir.retrieval.search.lexical import BM25Search as BM25


from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.models import SPLADE, SentenceBERT, UniCOIL
from beir.retrieval.search.sparse import SparseSearch


from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from github_search.evaluation.beir_evaluation import EvaluateRetrievalCustom as EvaluateRetrieval, CorpusDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25

import sentence_transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pickle 

# Load the corpora dictionary (corpus name -> {document id -> record}).
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
with open("/home/kuba/Projects/github_search/.dagster/storage/corpus_information", "rb") as f:
    # The unpickled object is itself JSON text, so it is decoded a second time.
    corpora = json.loads(pickle.load(f))

# Show the number of documents in each corpus.
[(cname, len(corpora[cname].keys())) for cname in corpora.keys()]

[('readme', 6612),
 ('generated_readme', 6612),
 ('selected_code', 6612),
 ('generated_rationale', 6612),
 ('generation_context', 6612),
 ('dependency_signature', 6612),
 ('repository_signature', 6612),
 ('generated_tasks', 6612),
 ('pagerank_dependency_signature', 6612),
 ('pagerank_repository_signature', 6612),
 ('pagerank_generated_tasks', 6612)]

In [3]:
corpora.keys()

dict_keys(['readme', 'generated_readme', 'selected_code', 'generated_rationale', 'generation_context', 'dependency_signature', 'repository_signature', 'generated_tasks', 'pagerank_dependency_signature', 'pagerank_repository_signature', 'pagerank_generated_tasks'])

In [4]:
def get_repos_for_query(query, repos_df):
    """Return the rows of ``repos_df`` whose "tasks" entry contains ``query``.

    Args:
        query: value looked up (with ``in``) inside every "tasks" entry.
        repos_df: DataFrame with a "tasks" column holding containers of tasks.

    Returns:
        The matching rows, with their original index preserved.
    """
    def mentions_query(task_list):
        return query in task_list

    has_query = repos_df["tasks"].apply(mentions_query)
    return repos_df[has_query]


def get_queries(repos_df, min_query_count):
    """List the queries that occur at least ``min_query_count`` times.

    Args:
        repos_df: DataFrame with a "query_tasks" column of list-like entries.
        min_query_count: minimum number of occurrences (inclusive).

    Returns:
        Query values in ``value_counts`` order (most frequent first).
    """
    query_frequencies = repos_df["query_tasks"].explode().value_counts()
    frequent_enough = query_frequencies.ge(min_query_count)
    return query_frequencies.loc[frequent_enough].index.tolist()

def prepare_query_data(repos_df, min_query_count=5):
    """Build query and relevance dictionaries from a repository table.

    Args:
        repos_df: DataFrame passed to ``get_queries`` and ``get_repos_for_query``.
        min_query_count: minimum number of occurrences for a query to be kept.

    Returns:
        ``(task_queries, task_qrels)`` where ``task_queries`` maps a string
        query id to the query and ``task_qrels`` maps each query id to
        ``{str(row index): 1}`` for every matching row of ``repos_df``.
    """
    # NOTE(review): queries come from the "query_tasks" column while relevance
    # is looked up in the "tasks" column — confirm the two are meant to differ.
    frequent_queries = get_queries(repos_df, min_query_count=min_query_count)

    task_queries = {}
    for position, query in enumerate(frequent_queries):
        task_queries[str(position)] = query

    task_qrels = {}
    for qid, query in task_queries.items():
        relevant_rows = get_repos_for_query(query, repos_df)
        task_qrels[qid] = {str(corpus_id): 1 for corpus_id in relevant_rows.index}

    return task_queries, task_qrels

In [5]:
# NOTE(review): absolute, machine-specific path and pickle.load — only use trusted files.
with open("/home/kuba/Projects/github_search/.dagster/storage/sampled_repos", "rb") as f:
    sampled_repos_df = pickle.load(f)

# Repository names in the order in which they appear in the "readme" corpus.
repos_sorted = [rec["title"] for rec in corpora["readme"].values()]

# Re-order the sampled repositories to follow the corpus order. reset_index()
# adds an "index" column holding each repository's position in repos_sorted.
# NOTE(review): the qrels built below are keyed by the merged frame's row
# index; that equals the corpus position only if every corpus repository
# appears exactly once in sampled_repos_df — confirm.
sampled_repos_df = (
    pd.Series(repos_sorted, name="repo")
    .reset_index()
    .merge(sampled_repos_df, on="repo")
)

task_queries, task_qrels = prepare_query_data(sampled_repos_df, min_query_count=10)

In [6]:
pd.Series(map(len, task_qrels.values())).describe()

count    693.000000
mean      24.288600
std       30.528634
min       10.000000
25%       11.000000
50%       14.000000
75%       23.000000
max      258.000000
dtype: float64

In [7]:
sampled_repos_df = sampled_repos_df[sampled_repos_df["tasks"].apply(len) <= 10]

In [8]:
sampled_repos_df.shape

(6612, 11)

In [9]:
with open("../output/elasticsearch/queries_qrels.json", "w") as f:
    json.dump({"task_queries": task_queries, "task_qrels": task_qrels}, f)

In [10]:
# Sanity check: every corpus must describe the same repository under the same
# document id. The previous version compared the readme title with itself, so
# it could never fail.
for corpus_name, corpus in corpora.items():
    for cid, readme_record in corpora["readme"].items():
        # .get() turns a missing id into a readable assertion failure instead of a KeyError.
        other_title = corpus.get(cid, {}).get("title")
        assert readme_record["title"] == other_title, f"no match at {cid} in corpus {corpus_name}"

In [11]:
## Checking elasticsearch

In [12]:
import elasticsearch

es_client = elasticsearch.Elasticsearch()
def retrieve_repos_with_es(query, k=50, index="readme", es_client=es_client):
    """Return the titles of the top ``k`` documents matching ``query``.

    Args:
        query: text matched against the "txt" field.
        k: number of hits requested from Elasticsearch.
        index: name of the index to search.
        es_client: Elasticsearch client (defaults to the notebook-level client).

    Returns:
        List of ``_source["title"]`` values in the order returned by the search.
    """
    match_query = {"query": {"match": {"txt": query}}}
    response = es_client.search(index=index, body=match_query, size=k)

    titles = []
    for hit in response["hits"]["hits"]:
        titles.append(hit["_source"]["title"])
    return titles



def get_elasticsearch_results(index="selected_code", k=10, query_count_threshold=5):
    """Count relevant repositories among the top-k Elasticsearch hits of each frequent task.

    A task is used as a query when it occurs in strictly more than
    ``query_count_threshold`` repositories of ``sampled_repos_df``.

    Args:
        index: Elasticsearch index to search.
        k: number of top-ranked (sampled) repositories inspected per query.
        query_count_threshold: a task must occur more often than this to be used.

    Returns:
        ``pd.Series`` mapping each query to the number of repositories among its
        first ``k`` ranked hits whose "tasks" contain the query. The original
        version computed this value but never returned it.
    """
    # dropna: repositories with an empty task list explode to NaN, which
    # value_counts drops and which would make the .loc lookup below fail.
    exploded_tasks = sampled_repos_df["tasks"].explode().dropna()
    qcounts = exploded_tasks.value_counts()
    used_queries = [
        query
        for query in exploded_tasks.drop_duplicates()
        if qcounts.loc[query] > query_count_threshold
    ]

    tasks_by_repo = dict(zip(sampled_repos_df["repo"], sampled_repos_df["tasks"]))

    retrieved_repo_tasks = {}
    for query in used_queries:
        ranked_repos = retrieve_repos_with_es(query, index=index)
        # Keep the Elasticsearch ranking; filtering the dataframe with isin()
        # would return rows in dataframe order and make the [:k] cut meaningless.
        # Hits outside the sample are skipped, as the isin() filter did.
        retrieved_repo_tasks[query] = [
            tasks_by_repo[repo] for repo in ranked_repos if repo in tasks_by_repo
        ]

    query_hits = pd.Series({
        query: sum(query in tasks for tasks in ranked_tasks[:k])
        for query, ranked_tasks in retrieved_repo_tasks.items()
    })
    return query_hits

def show_elasticsearch_results(qid='10', index="selected_code", k=10):
    """Print the top-k Elasticsearch hits for one task query and mark the relevant ones.

    The original body read ``query_hits``, ``index`` and ``k`` as globals that
    are never defined at notebook level; they are now parameters, and the hit
    count is computed from the displayed results.

    Args:
        qid: key into the notebook-level ``task_queries`` dictionary.
        index: Elasticsearch index to search.
        k: number of hits to display.

    Raises:
        KeyError: if ``qid`` is not a key of ``task_queries``.
    """
    query = task_queries[qid]
    es_hits = es_client.search(index=index, body={"query": {"match": {"txt": query}}}, size=k)["hits"]["hits"]

    annotated_hits = []
    for hit in es_hits:
        repo_name = hit["_source"]["title"]
        repo_rows = sampled_repos_df[sampled_repos_df["repo"] == repo_name]
        # A retrieved repository that is not part of the sample counts as a miss
        # instead of raising IndexError on .iloc[0].
        is_hit = (not repo_rows.empty) and query in repo_rows.iloc[0]["tasks"]
        annotated_hits.append((hit, repo_name, is_hit))

    print(query)
    print(sum(is_hit for _, _, is_hit in annotated_hits), "hits")

    for hit, repo_name, is_hit in annotated_hits:
        print("#" * 100)
        print("#" * 100)
        print(repo_name, "HIT" if is_hit else "NO HIT")

        if is_hit:
            print("#" * 100)
            print("#" * 100)
            print(hit['_source']['txt'])

## Evaluating with BEIR

In [13]:
def load_w2v_sentence_transformer(w2v_model_path):
    """Build a SentenceTransformer from saved word embeddings followed by a pooling layer.

    Args:
        w2v_model_path: path handed to ``WordEmbeddings.load``.

    Returns:
        A ``sentence_transformers.SentenceTransformer`` made of the two modules.
    """
    word_embeddings = sentence_transformers.models.WordEmbeddings.load(w2v_model_path)
    # NOTE(review): 200 is presumably the dimension of the saved word embeddings — confirm.
    pooling = sentence_transformers.models.Pooling(200)
    return sentence_transformers.SentenceTransformer(modules=[word_embeddings, pooling])



def load_sentence_bert(model_name):
    """Return a BEIR ``SentenceBERT`` wrapper that encodes with ``model_name``.

    The wrapper is first created from a fixed checkpoint; its query and document
    encoders are then both replaced by one SentenceTransformer loaded with
    ``trust_remote_code=True``.
    """
    wrapper = SentenceBERT("sentence-transformers/all-mpnet-base-v2")
    encoder = sentence_transformers.SentenceTransformer(model_name, trust_remote_code=True)
    wrapper.doc_model = encoder
    wrapper.q_model = encoder
    return wrapper

def get_w2v_retriever(w2v_model_path="../models/rnn_abstract_readme_w2v/0_WordEmbeddings"):
    """Create a cosine-similarity retriever backed by the word2vec sentence encoder.

    Args:
        w2v_model_path: location of the saved word-embedding module.

    Returns:
        An ``EvaluateRetrieval`` wrapping exact dense search over the encoder.
    """
    w2v_encoder = load_w2v_sentence_transformer(w2v_model_path)
    # The wrapper's own encoders are replaced by the word2vec encoder below.
    wrapper = SentenceBERT("sentence-transformers/all-mpnet-base-v2")
    wrapper.q_model = wrapper.doc_model = w2v_encoder
    dense_search = DRES(wrapper)
    return EvaluateRetrieval(dense_search, score_function="cos_sim")

def get_splade_retriever(splade_model_path = "splade/weights/distilsplade_max", batch_size=128):
    """Create a dot-product retriever backed by a SPLADE model.

    Args:
        splade_model_path: path of the SPLADE weights.
        batch_size: encoding batch size passed to the dense search wrapper
            (previously ignored in favour of a hard-coded 128).

    Returns:
        An ``EvaluateRetrieval`` using the "dot" score function.
    """
    splade_model = DRES(SPLADE(splade_model_path), batch_size=batch_size)
    return EvaluateRetrieval(splade_model, score_function="dot")


def get_bm25_retrievers(corpora):
    """Create one BM25 retriever per corpus, keyed by corpus name.

    Args:
        corpora: mapping whose keys are corpus names (strings or iterables of parts).

    Returns:
        Dict mapping each corpus name to an ``EvaluateRetrieval`` around a
        ``BM25`` model whose index is named after the corpus.
    """
    def sanitize_index_name(index_name):
        # Strings are used as they are; any other key (e.g. a tuple) is
        # flattened by concatenating the str() of its parts.
        if type(index_name) is not str:
            return "".join(str(part) for part in index_name)
        return index_name

    return {
        corpus_name: EvaluateRetrieval(BM25(index_name=sanitize_index_name(corpus_name)))
        for corpus_name, _ in corpora.items()
    }


sentence_transformer_model_names = [
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/all-MiniLM-L12-v2",
    #"nomic-ai/modernbert-embed-base",
    
    #"estrogen/ModernBERT-base-nli-v3"
    #"BAAI/bge-large-en-v1.5",
    #"mixedbread-ai/mxbai-embed-large-v1"
]

def get_sentence_transformer_retriever(model_name="sentence-transformers/all-mpnet-base-v2", batch_size=8):
    """Create a cosine-similarity retriever for a sentence-transformers model.

    Args:
        model_name: model identifier passed to ``load_sentence_bert``.
        batch_size: encoding batch size of the dense search wrapper.
    """
    encoder = load_sentence_bert(model_name)
    dense_search = DRES(encoder, batch_size=batch_size)
    return EvaluateRetrieval(dense_search, score_function="cos_sim")

def get_unicoil_retriever(model_name="castorini/unicoil-msmarco-passage"):
    """Create a dot-product sparse retriever backed by a UniCOIL model.

    WARNING: there is a bug in BEIR that makes this retriever unusable.

    Args:
        model_name: model path handed to ``UniCOIL``.
    """
    unicoil_encoder = UniCOIL(model_path=model_name)
    sparse_search = SparseSearch(unicoil_encoder, batch_size=32)
    return EvaluateRetrieval(sparse_search, score_function="dot")

In [14]:
corpora.keys()

dict_keys(['readme', 'generated_readme', 'selected_code', 'generated_rationale', 'generation_context', 'dependency_signature', 'repository_signature', 'generated_tasks'])

In [15]:
def get_corpus_samples(corpora, n_repos=10):
    """Collect the first ``n_repos`` documents of every corpus into one DataFrame.

    Args:
        corpora: mapping corpus name -> {document id -> record dict}. Document
            ids are expected to be the strings "0", "1", ... . A tuple corpus
            name is included only when it contains 0, and is displayed by its
            first element.
        n_repos: number of documents taken from each corpus.

    Returns:
        DataFrame with one row per (document, corpus); "title" is renamed to
        "repo_name", the corpus name is stored as "representation", and missing
        values are forward-filled.

    Raises:
        KeyError: if a corpus has no document with id ``str(k)`` for some k < n_repos.
    """
    records = []
    for k in range(n_repos):
        for cname in corpora.keys():
            if type(cname) is tuple:
                if 0 not in cname:
                    continue
                display_name = cname[0]
            else:
                display_name = cname
            # Build a new dict: writing "corpus" into the stored record would
            # leak that key into the corpora later handed to the retrievers.
            records.append({**corpora[cname][str(k)], "corpus": display_name})

    return (
        pd.DataFrame.from_records(records)
        .rename(columns={"title": "repo_name", "corpus": "representation"})
        # .ffill() replaces the deprecated fillna(method="ffill").
        .ffill()
    )

In [16]:
!mkdir ../output/visualization

mkdir: cannot create directory ‘../output/visualization’: File exists


In [17]:
df = get_corpus_samples(corpora)#.to_json("../output/visualization/corpus_samples.jsonl", lines=True, orient="records")

  return pd.DataFrame.from_records(records).rename(columns = {"title": "repo_name", "corpus": "representation"}).fillna(method="ffill")


In [18]:
def get_repomaps_df(repo_names, repomap_path="../output/aider/selected_repo_maps_1024.json"):
    """Load repository maps from a JSON file and return them as a DataFrame.

    Args:
        repo_names: iterable of repository names to look up in the JSON mapping.
        repomap_path: path of a JSON file mapping repository name -> repo map.

    Returns:
        DataFrame with columns "repo_name", "text" and "representation"
        (always "repomap"), one row per requested repository.

    Raises:
        KeyError: if a requested repository is missing from the file.
    """
    with open(repomap_path) as repomap_file:
        repomaps = json.load(repomap_file)

    rows = [
        {"repo_name": name, "text": repomaps[name], "representation": "repomap"}
        for name in repo_names
    ]
    return pd.DataFrame.from_records(rows)

In [19]:
corpora.keys()

dict_keys(['readme', 'generated_readme', 'selected_code', 'generated_rationale', 'generation_context', 'dependency_signature', 'repository_signature', 'generated_tasks'])

In [20]:
# Append the repo-map representation of every sampled repository.
# .ffill() replaces fillna(method="ffill"), which is deprecated in pandas.
df = pd.concat(
    [df, get_repomaps_df(df["repo_name"].unique())]
).sort_values("repo_name").ffill()
#df["representation"] = df["representation"].apply(lambda c: c if c not in ["generated_readme", "generated_rationale", "generation_context"] else c + " (repomap)")
#df.to_json("../output/visualization/corpus_samples.jsonl", lines=True, orient="records")

  ).sort_values("repo_name").fillna(method="ffill")


In [21]:
df = pd.concat([
    df,
    pd.read_json("../output/visualization/corpus_samples.jsonl", lines=True, orient="records")
]).drop_duplicates(subset=["repo_name", "representation"])

In [22]:
df.to_json("../output/visualization/corpus_samples.jsonl", lines=True, orient="records")

In [23]:
#df.to_json("../output/visualization/corpus_samples.jsonl", lines=True, orient="records")

In [24]:
#df["representation"] = df["representation"].apply(lambda c: c if c not in ["generated_readme", "generated_rationale", "generation_context"] else c + " (repomap)")

In [25]:
#df.to_json("../output/visualization/corpus_samples.jsonl", lines=True, orient="records")

In [26]:
#print(corpora["selected_code"]['0']["text"])

In [27]:
corpora["readme"]["0"]

{'text': '# PyTorch Implementation of Differentiable ODE Solvers\n\nThis library provides ordinary differential equation (ODE) solvers implemented in PyTorch. Backpropagation through all solvers is supported using the adjoint method. For usage of ODE solvers in deep learning applications, see [1].\n\nAs the solvers are implemented in PyTorch, algorithms in this repository are fully supported to run on the GPU.\n\n---\n\n<p align="center">\n  <img align="middle" src="./assets/resnet_0_viz.png" alt="Discrete-depth network" width="240" height="330" />\n  <img align="middle" src="./assets/odenet_0_viz.png" alt="Continuous-depth network" width="240" height="330" />\n</p>\n\n## Installation\n```\ngit clone https://github.com/rtqichen/torchdiffeq.git\ncd torchdiffeq\npip install -e .\n```\n\n## Examples\nExamples are placed in the [`examples`](./examples) directory.\n\nWe encourage those who are interested in using this library to take a look at [`examples/ode_demo.py`](./examples/ode_demo.py

from pylate import indexes, models, retrieve


class PyLateBEIRWrapper:
    """Index a corpus with a PyLate ColBERT model and retrieve documents from it.

    NOTE(review): despite the name, this class does not expose BEIR's search
    interface; it only offers ``index_corpus`` and ``retrieve``.
    """

    def __init__(self, model_name="lightonai/colbertv2.0"):
        """Load the ColBERT model and create (overwriting) a Voyager index for it."""
        self.model = models.ColBERT(
            model_name_or_path=model_name,
        )
        self.index = indexes.Voyager(
            index_folder=f"../output/pylate-index/{model_name}",
            index_name="index",
            override=True,
        )
        # Created by index_corpus(); retrieval is impossible before that.
        self.retriever = None

    @staticmethod
    def _document_text(document):
        """Return the text to encode: the "text" field of a dict record, else the value itself."""
        # NOTE(review): only "text" is used; the "title" of a record is ignored — confirm.
        if isinstance(document, dict):
            return document["text"]
        return document

    def index_corpus(self, corpus):
        """Encode every document of ``corpus`` and add it to the index.

        Args:
            corpus: mapping document id -> document, where a document is either
                a string or a dict with a "text" field.
        """
        # Materialize ids and texts as lists in matching order; dict views are
        # not indexable sequences.
        document_ids = list(corpus.keys())
        documents = [self._document_text(corpus[doc_id]) for doc_id in document_ids]

        documents_embeddings = self.model.encode(
            documents,
            batch_size=32,
            is_query=False, # Encoding documents
            show_progress_bar=True,
        )

        # Add the documents ids and embeddings to the Voyager index
        self.index.add_documents(
            documents_ids=document_ids,
            documents_embeddings=documents_embeddings,
        )
        self.retriever = retrieve.ColBERT(index=self.index)

    def retrieve(self, query):
        """Retrieve documents for ``query``.

        Args:
            query: a query string, a non-empty list/tuple of query strings, or
                precomputed query embeddings (passed through unchanged).

        Returns:
            Whatever the underlying PyLate retriever returns.

        Raises:
            RuntimeError: if ``index_corpus`` has not been called yet.
        """
        if self.retriever is None:
            raise RuntimeError("index_corpus() must be called before retrieve()")

        if isinstance(query, str):
            query = [query]
        is_text_batch = (
            isinstance(query, (list, tuple))
            and len(query) > 0
            and all(isinstance(item, str) for item in query)
        )
        if is_text_batch:
            # The PyLate retriever works on query embeddings, not on raw text.
            query = self.model.encode(
                list(query),
                batch_size=32,
                is_query=True,
                show_progress_bar=False,
            )
        return self.retriever.retrieve(query)

pylate_model = PyLateBEIRWrapper()

In [28]:
import sentence_transformers

In [29]:
w2v_retriever = get_w2v_retriever()

In [30]:
w2v_retriever

<github_search.evaluation.beir_evaluation.EvaluateRetrievalCustom at 0x7cebdc1e6060>

In [31]:
#splade_retriever = get_splade_retriever() 

# change sentence-transformers to 2.7?
sentence_transformer_retrievers = {
    model_name: get_sentence_transformer_retriever(model_name)
    for model_name in sentence_transformer_model_names
}

In [32]:
bm25_retrievers = get_bm25_retrievers(corpora)

## Per query results

In [33]:
from pydantic import BaseModel
from typing import Dict

class RetrieverInput(BaseModel):
    """Corpus, queries and relevance judgements bundled for one retrieval evaluation."""
    # document id -> document record (a dict)
    corpus: Dict[str, dict]
    # query id -> query text
    queries: Dict[str, str]
    # query id -> {document id -> integer relevance label}
    qrels: Dict[str, Dict[str, int]]


class RetrievalEvaluationResults(BaseModel):
    """Retrieval scores and aggregated metrics produced by one retriever on one corpus."""
    # query id -> {document id -> retrieval score}
    retrieval_results: Dict[str, Dict[str, float]]
    # metric name -> value (custom metrics merged with the standard ones)
    metrics: dict
    # str() of the retriever's underlying model, or "bm25" when it has none
    model_type: str

    @classmethod
    def from_retriever(cls, retriever, retriever_input, metric_names=None):
        """Run ``retriever`` on ``retriever_input`` and collect its metrics.

        Args:
            retriever: object providing ``retrieve``, ``evaluate``,
                ``evaluate_custom_multi`` and ``k_values``.
            retriever_input: object with ``corpus``, ``queries`` and ``qrels``.
            metric_names: custom metric names; defaults to
                ``["accuracy@k", "hits@k", "r_cap@k", "mrr@k"]``.

        Returns:
            A populated instance of this class.
        """
        if metric_names is None:
            # Fresh list per call instead of a shared mutable default argument.
            metric_names = ["accuracy@k", "hits@k", "r_cap@k", "mrr@k"]

        retrieval_results = retriever.retrieve(retriever_input.corpus, retriever_input.queries)
        custom_metrics = retriever.evaluate_custom_multi(retriever_input.qrels, retrieval_results, retriever.k_values, metrics=metric_names)
        other_metrics = retriever.evaluate(retriever_input.qrels, retrieval_results, retriever.k_values, ignore_identical_ids=False)
        metrics = custom_metrics | cls.tuple_to_dict(other_metrics)
        try:
            model_type = str(retriever.retriever.model)
        except Exception:
            # Best-effort label: retrievers that expose no ``.model`` are
            # reported as BM25. ``Exception`` (unlike a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            model_type = "bm25"
        return cls(metrics=metrics, model_type=model_type, retrieval_results=retrieval_results)


    @classmethod
    def tuple_to_dict(cls, dicts):
        """Merge an iterable of dicts into one; on duplicate keys the earliest dict wins."""
        merged_dict = {}
        for d in dicts:
            merged_dict = d | merged_dict
        return merged_dict

In [34]:
retriever_inputs = {
    corpus_name: RetrieverInput(corpus=corpus, queries=task_queries, qrels=task_qrels)
    for (corpus_name, corpus) in corpora.items()
}

In [35]:
from github_search.evaluation.beir_evaluation import PerQueryIREvaluator

In [36]:
per_query_evaluator = PerQueryIREvaluator(k_values=[1, 5, 10, 25])

In [37]:
retriever_inputs = {
    corpus_name: RetrieverInput(corpus=corpus, queries=task_queries, qrels=task_qrels)
    for (corpus_name, corpus) in corpora.items()
}

In [38]:
retriever_inputs.keys()

dict_keys(['readme', 'generated_readme', 'selected_code', 'generated_rationale', 'generation_context', 'dependency_signature', 'repository_signature', 'generated_tasks'])

In [39]:
named_retrievers = {
    corpus_name: [
        ("bm25", bm25_retrievers[corpus_name]),
        ("word2vec", w2v_retriever),
    ] + list(sentence_transformer_retrievers.items())
    for corpus_name in retriever_inputs.keys()
}

In [40]:
retriever_inputs.keys()

dict_keys(['readme', 'generated_readme', 'selected_code', 'generated_rationale', 'generation_context', 'dependency_signature', 'repository_signature', 'generated_tasks'])

In [41]:
%%time
per_query_results = {
    (corpus_name, retriever_name): per_query_evaluator.get_scores(retriever=retriever, ir_data=retriever_inputs[corpus_name])
    for corpus_name in retriever_inputs.keys()
    for (retriever_name, retriever) in named_retrievers[corpus_name]
}

  0%|                                                                                                                                                                                   | 0/6612 [00:00<?, ?docs/s]
que: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.01it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 74.53it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52/52 [00:01<00:00, 38.59it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

CPU times: user 18min 7s, sys: 39.3 s, total: 18min 47s
Wall time: 5min 34s


In [42]:
per_query_results_df = pd.concat([
    df.assign(retriever=[retriever_name]*len(df)).assign(corpus=[corpus_name]*len(df))
    for ((corpus_name, retriever_name), df) in per_query_results.items()
])

In [43]:
per_query_results_df = per_query_results_df.assign(
    corpus=per_query_results_df["corpus"].apply(lambda cname: cname if type(cname) is str else cname[0]),
    generation=per_query_results_df["corpus"].apply(lambda cname: 0 if type(cname) is str else cname[1])
)

In [44]:
per_query_results_df = (
    per_query_results_df
        .groupby(["query", "retriever", "corpus"]).agg("mean").drop(columns=["generation"])
        .reset_index()
)

In [45]:
per_query_results_df

Unnamed: 0,query,retriever,corpus,Hits@1,Hits@5,Hits@10,Hits@25,Accuracy@1,Accuracy@5,Accuracy@10,Accuracy@25
0,2d human pose estimation,bm25,dependency_signature,0.0,1.0,3.0,3.0,0.0,1.0,1.0,1.0
1,2d human pose estimation,bm25,generated_rationale,0.0,4.0,6.0,7.0,0.0,1.0,1.0,1.0
2,2d human pose estimation,bm25,generated_readme,0.0,3.0,5.0,7.0,0.0,1.0,1.0,1.0
3,2d human pose estimation,bm25,generated_tasks,0.0,1.0,1.0,3.0,0.0,1.0,1.0,1.0
4,2d human pose estimation,bm25,generation_context,0.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
22171,zero shot learning,word2vec,generated_tasks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22172,zero shot learning,word2vec,generation_context,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22173,zero shot learning,word2vec,readme,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
22174,zero shot learning,word2vec,repository_signature,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
per_query_results_df.to_csv("../results/per_query_ir_results.csv", index=False)

In [47]:
(per_query_results_df
    .drop(columns=["query"])
    .groupby(["corpus", "retriever"])
    .agg("mean").reset_index(drop=False)
    .sort_values("Accuracy@10")
)[["corpus", "retriever", "Hits@10", "Accuracy@10"]]

Unnamed: 0,corpus,retriever,Hits@10,Accuracy@10
3,dependency_signature,word2vec,0.20202,0.163059
31,selected_code,word2vec,0.266955,0.197691
19,generation_context,word2vec,0.705628,0.419913
15,generated_tasks,word2vec,0.782107,0.44733
27,repository_signature,word2vec,0.792208,0.450216
0,dependency_signature,bm25,0.97076,0.502924
23,readme,word2vec,1.209235,0.58153
12,generated_tasks,bm25,1.501453,0.640988
7,generated_rationale,word2vec,1.503608,0.646465
11,generated_readme,word2vec,1.535354,0.655123


## Aggregated results

In [48]:
# Smoke test: report every corpus on which BM25 evaluation fails, with the
# reason, instead of hiding the error behind a bare ``except``.
for corpus_name in corpora.keys():
    try:
        RetrievalEvaluationResults.from_retriever(bm25_retrievers[corpus_name], retriever_inputs[corpus_name])
    except Exception as error:
        print(corpus_name, repr(error))


  0%|                                                                                                                                                                                   | 0/6612 [00:00<?, ?docs/s]
que: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.82it/s]
  0%|                                                                                                                                                                                   | 0/6612 [00:00<?, ?docs/s]
que: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.78it/s]
  0%|                                                                                                                                                   

generation_context


  0%|                                                                                                                                                                                   | 0/6612 [00:00<?, ?docs/s]

dependency_signature



  0%|                                                                                                                                                                                   | 0/6612 [00:00<?, ?docs/s][A


repository_signature


  0%|                                                                                                                                                                                   | 0/6612 [00:00<?, ?docs/s]

generated_tasks





In [52]:
bm25_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(bm25_retrievers[corpus_name], retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}




  0%|                                                                                                                                                                                   | 0/6612 [00:00<?, ?docs/s][A[A[A


  0%|                                                                                                                                                                           | 1/6612 [00:00<26:36,  4.14docs/s][A[A[A


  8%|████████████▋                                                                                                                                                          | 501/6612 [00:00<00:04, 1524.69docs/s][A[A[A


 15%|█████████████████████████▏                                                                                                                                            | 1001/6612 [00:00<00:02, 2121.94docs/s][A[A[A


 23%|█████████████████████████████████████▋                                                              

splade_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(splade_retriever, retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

In [53]:
word2vec_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(w2v_retriever, retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 413.14it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52/52 [00:01<00:00, 39.83it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1352.64it/s]
Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52/52 [00:00<00:00, 141.94it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [54]:
sentence_transformer_results = {
    (corpus_name, model_name.split("/")[1]): RetrievalEvaluationResults.from_retriever(sentence_transformer_retrievers[model_name], retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
    for model_name in sentence_transformer_model_names
}

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [00:00<00:00, 168.07it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 827/827 [00:36<00:00, 22.75it/s]
Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [00:00<00:00, 183.88it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 827/827 [00:09<00:00, 87.93it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [55]:
bm25_metrics = [
    {"corpus": corpus_name, "retriever": "bm25", **bm25_results[corpus_name].metrics}
    for corpus_name in corpora.keys()
]

In [56]:
word2vec_metrics = [
    {"corpus": corpus_name, "retriever": "Python code word2vec", **word2vec_results[corpus_name].metrics}
    for corpus_name in corpora.keys()
]

In [57]:
#splade_metrics = [
#    {"corpus": corpus_name, "retriever": "splade", **splade_results[corpus_name].metrics}
#     for corpus_name in corpora.keys()
#]
 
sentence_transformer_metrics = [
    {"corpus": corpus_name, "retriever": f"{model_name} (sentence_transformer)", **sentence_transformer_results[(corpus_name, model_name)].metrics}
    for (corpus_name, model_name) in sentence_transformer_results.keys()
]

all_metrics_df = pd.DataFrame.from_records(bm25_metrics + word2vec_metrics +  sentence_transformer_metrics).sort_values("Hits@10", ascending=False)

In [58]:
all_metrics_df.shape

(32, 50)

In [59]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]]

Unnamed: 0,corpus,retriever,Accuracy@10
16,readme,all-mpnet-base-v2 (sentence_transformer),0.92641
0,readme,bm25,0.91919
17,readme,all-MiniLM-L12-v2 (sentence_transformer),0.91919
18,generated_readme,all-mpnet-base-v2 (sentence_transformer),0.85714
22,generated_rationale,all-mpnet-base-v2 (sentence_transformer),0.86003
19,generated_readme,all-MiniLM-L12-v2 (sentence_transformer),0.85281
23,generated_rationale,all-MiniLM-L12-v2 (sentence_transformer),0.84127
1,generated_readme,bm25,0.85137
3,generated_rationale,bm25,0.85281
4,generation_context,bm25,0.78066


In [63]:
all_metrics_df.columns

Index(['corpus', 'retriever', 'Accuracy@1', 'Accuracy@3', 'Accuracy@5',
       'Accuracy@10', 'Accuracy@100', 'Accuracy@1000', 'Hits@1', 'Hits@3',
       'Hits@5', 'Hits@10', 'Hits@100', 'Hits@1000', 'R_cap@1', 'R_cap@3',
       'R_cap@5', 'R_cap@10', 'R_cap@100', 'R_cap@1000', 'MRR@1', 'MRR@3',
       'MRR@5', 'MRR@10', 'MRR@100', 'MRR@1000', 'P@1', 'P@3', 'P@5', 'P@10',
       'P@100', 'P@1000', 'Recall@1', 'Recall@3', 'Recall@5', 'Recall@10',
       'Recall@100', 'Recall@1000', 'MAP@1', 'MAP@3', 'MAP@5', 'MAP@10',
       'MAP@100', 'MAP@1000', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10',
       'NDCG@100', 'NDCG@1000'],
      dtype='object')

In [60]:
model_name = "qwen2.5:7b-instruct"

In [61]:
all_metrics_df.to_csv(f"../output/code2doc/beir_results_{model_name}.csv", index=False)

In [62]:
#all_metrics_df.to_csv(f"../output/code2doc/beir_results_with_modernbert_{model_name}.csv", index=False)

## Results

By default we use `min_query_count=10` (the value we used originally).

We can switch to a smaller minimum query count, such as 5, to account for the fact that we only use a sample of the repos.

In [None]:
metric_df_cols = ["corpus", "retriever", "Accuracy@10", "Hits@10", "R_cap@10", "P@1", "P@5", "P@10"]

In [None]:
all_metrics_df[metric_df_cols]

In [None]:
all_metrics_df[metric_df_cols].sort_values("Accuracy@10", ascending=False)

In [None]:
all_metrics_df.groupby("corpus").apply(lambda df: df.sort_values("Accuracy@10", ascending=False).iloc[0])[metric_df_cols].sort_values("Accuracy@10", ascending=False)

In [None]:
all_metrics_df.groupby("retriever").apply(lambda df: df.sort_values("Accuracy@10", ascending=False).iloc[0])[metric_df_cols].sort_values("Accuracy@10", ascending=False)

In [None]:
all_metrics_df[all_metrics_df["retriever"] == "bm25"][metric_df_cols]

In [None]:
len(task_queries)

In [None]:
# task count = 5

In [None]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

In [None]:
# task count = 10

In [None]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

In [None]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

In [None]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

In [None]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

In [None]:
all_metrics_df.groupby("corpus")["Accuracy@10"].agg("mean").sort_values()

In [None]:
sampled_repos_df["tasks"].explode().value_counts().loc[list(task_queries.values())]

In [None]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

## Does combining rationale with generated readme help?

It seems that the best sentence transformer retrievers can only get worse when using any other information!

In [None]:
sentence_transformer_results.keys()

In [None]:
st_generated_readme_results= sentence_transformer_results[('generated_readme', 'all-mpnet-base-v2')].retrieval_results
st_rationale_results = sentence_transformer_results[('generated_rationale', 'all-mpnet-base-v2')].retrieval_results
bm25_generated_readme_results = bm25_results["generated_readme"].retrieval_results
st_context_results = sentence_transformer_results[('generation_context', 'all-mpnet-base-v2')].retrieval_results

In [None]:
len(list(bm25_generated_readme_results.keys()))

In [None]:
len(list(st_generated_readme_results.keys()))

In [None]:
def merge_qrels(qrels1, qrels2):
    """Sum two ``{query id -> {document id -> score}}`` mappings.

    A document missing on one side contributes 0 from that side. Queries that
    appear in only one of the inputs are kept (previously a query missing from
    ``qrels2`` raised KeyError and a query present only in ``qrels2`` was
    dropped).

    NOTE(review): scores are added as they are; when the inputs come from
    different retrievers their scales may not be comparable.

    Args:
        qrels1: first mapping.
        qrels2: second mapping.

    Returns:
        A new mapping; the inputs are not modified.
    """
    merged_qrels = {}
    for source in (qrels1, qrels2):
        for query_id, doc_scores in source.items():
            merged_scores = merged_qrels.setdefault(query_id, {})
            for doc_id, score in doc_scores.items():
                merged_scores[doc_id] = merged_scores.get(doc_id, 0) + score
    return merged_qrels

In [None]:
st_generation_results = merge_qrels(bm25_generated_readme_results, st_generated_readme_results)

In [None]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_generation_results, metric="acc", k_values=[1,5,10])

In [None]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_generated_readme_results, metric="acc", k_values=[1,5,10])

In [None]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_rationale_results, metric="acc", k_values=[1,5,10])

In [None]:
all_metrics_df[all_metrics_df["retriever"] == "bm25"][["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10")

In [None]:
Splitting does not make much sense, as most of the generated data is under the sentence-transformer context length (384 tokens).

In [None]:
def split_corpus_by_lengths(corpus, chunk_length):
    """Split every document into consecutive chunks of ``chunk_length`` characters.

    The original body could not run: it used an undefined ``n_splits``,
    overwrote ``chunk_length``, replaced the split dictionaries with strings
    and returned nothing.

    Args:
        corpus: mapping document id -> record dict with a "text" field.
        chunk_length: maximum number of characters per chunk (must be > 0).

    Returns:
        List of corpora; the i-th corpus maps a document id to a copy of its
        record whose "text" is the document's i-th chunk. Documents that are
        too short to have an i-th chunk (including empty texts) are absent
        from that corpus.

    Raises:
        ValueError: if ``chunk_length`` is not positive.
    """
    if chunk_length <= 0:
        raise ValueError("chunk_length must be a positive number of characters")

    splitted_corpora = []
    for c_id, record in corpus.items():
        text = record["text"]
        for i, start in enumerate(range(0, len(text), chunk_length)):
            if i == len(splitted_corpora):
                splitted_corpora.append({})
            # Copy the record so the input corpus is left untouched.
            splitted_corpora[i][c_id] = {**record, "text": text[start:start + chunk_length]}
    return splitted_corpora
        

In [None]:
class MultiTextEvaluator(BaseModel):
    """
    Evaluate a dataframe that has multiple texts for each query (multiple generation experiments)
    iteration_col says which experiment it was
    """
    # name of the column that identifies the experiment (iteration)
    iteration_col: str
    # columns handed to load_ir_data for every iteration
    # (builtin generics are used because typing.List is never imported in this notebook)
    text_cols: list[str]
    # cut-offs at which the per-query metrics are computed
    k_values: list[int] = [1,5,10,25]

    def get_ir_datas(self, df):
        """Yield ``(iteration, ir_data)`` for every distinct value of ``iteration_col`` in ``df``."""
        for iteration in df[self.iteration_col].unique():
            # NOTE(review): load_ir_data is not defined in the visible part of
            # this notebook — it must be imported or defined before use.
            ir_data = load_ir_data(df[df[self.iteration_col] == iteration], self.text_cols)
            yield (iteration, ir_data)

    def evaluate(self, df, retriever):
        """Compute per-query metrics of ``retriever`` for every iteration in ``df``.

        Returns:
            The concatenated per-iteration score frames, with the iteration
            stored in ``iteration_col`` and the index copied to a "query" column.
        """
        ir_datas = dict(self.get_ir_datas(df))
        iteration_dfs = []
        for iteration, ir_data in ir_datas.items():
            per_query_evaluator = PerQueryIREvaluator(k_values=self.k_values)
            # Keyword arguments, as in the per-query evaluation cell of this notebook.
            scores_df = per_query_evaluator.get_scores(retriever=retriever, ir_data=ir_data)
            scores_df[self.iteration_col] = iteration
            iteration_dfs.append(scores_df)
        metrics_df = pd.concat(iteration_dfs)
        metrics_df["query"] = metrics_df.index
        return metrics_df