In [1]:
import pytrec_eval
import pandas as pd

In [2]:
pd.set_option('max_colwidth', 128)

from pathlib import Path

In [5]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.lexical import BM25Search as BM25


#from github_search.ir.evaluate_bm25 import load_ir_data, load_generation_metrics_df, RetrievalConfig, get_retriever
#from github_search.pipelines.get_zenml_results import ArtifactLoader


In [None]:
## Dependency and librarian signatures

In [6]:
librarian_signatures_df = pd.read_parquet("/home/kuba/Projects/uhackathons/fastrag_util/data/librarian_signatures.parquet")

0                       xgfelicia/SRVRPG
1                 hula-ai/mc_dropconnect
2                btc-room101/bitcoin-rnn
3             leobean/CenterNet_oriented
4        bnpy/hdp-grid-image-restoration
                      ...               
36170                       ssokota/spie
36171                        xihechn/QSA
36172                     fursovia/dilma
36173                    L4TTiCe/SAR2SAR
36174    BryanPlummer/Two_branch_network
Name: repo, Length: 36175, dtype: object

In [25]:
from typing import Union
import ast
from pydantic import BaseModel

class CorpusDataLoader(BaseModel):
    repos_df_path: Union[str, Path]
    generated_readmes_df_path: Union[str, Path]
    code_df_path: Union[str, Path]

    @classmethod
    def from_dir(cls, dir):
        dir = Path(dir)
        return CorpusDataLoader(
            repos_df_path=dir / "repos_with_all_data.jsonl",
            generated_readmes_df_path=dir / "dspy_generated_readmes.json",
            code_df_path=dir.parent.parent / "code" / "python_files_with_selected_code.feather"
        )

    def load_repos_df(self):
        assert self.repos_df_path.exists()
        df = pd.read_json(self.repos_df_path, orient="records", lines=True)
        if type(df["tasks"].iloc[0]) is str:
            df["tasks"] = df["tasks"].apply(ast.literal_eval)
        for col in ["repo", "tasks", "readme"]:
            assert col in df.columns
        return df

    def load_generated_readmes_df(self):
        assert self.generated_readmes_df_path.exists()
        df = pd.read_json(self.generated_readmes_df_path, orient="records", lines=True)
        for col in ['rationale', 'answer', 'context_history', 'repo_name']:
            assert col in df.columns
        return df

    def load_python_code_df(self):
        assert self.code_df_path.exists()
        df = pd.read_feather(self.code_df_path)
        for col in ['content', 'path', 'repo_name', 'tasks', 'selected_code']:
            assert col in df.columns
        return df

    def load_corpus_dfs(self, selected_repos=None):
        readme_df = self.load_repos_df()
        generated_readme_df = self.load_generated_readmes_df()
        selected_python_code_df = self.load_python_code_df()
        repos = set(readme_df["repo"]).intersection(set(generated_readme_df["repo_name"]))
        if selected_repos is not None:
            repos = repos.intersection(set(selected_repos))
        readme_df = readme_df[readme_df["repo"].isin(repos)].reset_index()
        generated_readme_df = generated_readme_df.set_index("repo_name").loc[readme_df["repo"]].reset_index()
        selected_python_code_df = selected_python_code_df[selected_python_code_df["repo_name"].isin(repos)]
        return readme_df, generated_readme_df, selected_python_code_df

small_sample_loader = CorpusDataLoader.from_dir(Path("../output/code2doc/small_1k"))


In [26]:
#sampled_repos_df = pd.read_json("../output/code2doc/sample_2k/sampled_repos.jsonl", orient="records", lines=True)
sampled_repos_df, sampled_generated_readmes_df, sample_python_code_df = small_sample_loader.load_corpus_dfs(librarian_signatures_df["repo"])

Select only repos with signatures that were in sample

In [30]:
sampled_librarian_signatures_df = librarian_signatures_df[librarian_signatures_df["repo"].isin(sampled_repos_df["repo"])]
# use one generation
sampled_librarian_signatures_df = sampled_librarian_signatures_df[sampled_librarian_signatures_df["generation"] == 0]
sampled_librarian_signatures_df = sampled_librarian_signatures_df.set_index("repo").loc[sampled_repos_df["repo"]].reset_index()

In [31]:
sampled_librarian_signatures_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864 entries, 0 to 863
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   repo                  864 non-null    object
 1   tasks                 864 non-null    object
 2   generation            864 non-null    int64 
 3   dependency_signature  864 non-null    object
 4   repository_signature  864 non-null    object
 5   generated_tasks       864 non-null    object
dtypes: int64(1), object(5)
memory usage: 40.6+ KB


## Example BEIR dataset

In [9]:
import os
import pathlib

dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(pathlib.Path("..").parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)

In [10]:
_corpus, _queries, _qrels = GenericDataLoader(data_path).load(split="test")

[1;35mLoading Corpus...[0m


  0%|          | 0/5183 [00:00<?, ?it/s]

[1;35mLoaded 5183 TEST Documents.[0m
[1;35mDoc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 versus 1.1 microm2/ms). Relati

In [11]:
print(_corpus['4983'].keys())
_corpus['4983']

dict_keys(['text', 'title'])


{'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 versus 1.1 microm2/ms). Relative anisotropy was higher the closer birth was to term with 

In [38]:
def get_repos_for_query(query, repos_df):
    return repos_df[repos_df["tasks"].apply(lambda ts: query in ts)]


def get_queries(repos_df, min_query_count):
    all_queries = repos_df["tasks"].explode()
    qcounts = all_queries.value_counts()
    return qcounts[qcounts >= min_query_count].index.to_list()

def prepare_query_data(repos_df, min_query_count=3):
    task_queries = {str(i): query for (i, query) in enumerate(get_queries(repos_df, min_query_count=min_query_count))}

    task_qrels = {
        qid: {str(corpus_id): 1 for corpus_id in get_repos_for_query(task_queries[qid], repos_df).index}
        for qid in task_queries.keys()
    }
    return task_queries, task_qrels


def prepare_readme_corpus(repos_df):
    return {str(i): {"text": row["readme"], "title": row["repo"], 'tasks': row['tasks']} for (i, row) in repos_df.iterrows()}


def prepare_generated_readme_corpus(repos_df, generated_readmes_df, columns=["answer"]):
    generated_readmes_df = generated_readmes_df.set_index("repo_name").loc[repos_df["repo"]].reset_index()
    return {str(i): {"text": "\n".join(row[columns]), "title": row["repo_name"]} for (i, row) in generated_readmes_df.iterrows()}

    
def prepare_code_corpus(repos_df, selected_python_code_df):
    per_repo_code_df = selected_python_code_df.groupby("repo_name").apply(lambda df: "\n\n".join(df["selected_code"].fillna("")))
    per_repo_code_df = per_repo_code_df.loc[repos_df["repo"]].reset_index()
    return {str(i): {"text": row[0], "title": row["repo_name"]} for (i, row) in per_repo_code_df.iterrows()}


# THIS IS FOR ONE GENERATION ONLY NOW
def prepare_librarian_corpora(sampled_librarian_signatures_df):
    columns = ["dependency_signature", "repository_signature", "generated_tasks"]
    return {
        column: {str(i): {"text": row[column], "title": row["repo"]} for (i, row) in sampled_librarian_signatures_df[["repo", column]].iterrows()} 
        for column in columns
    }


def prepare_corpora(repos_df, generated_readmes_df, selected_python_code_df):
    readme_corpus = prepare_readme_corpus(sampled_repos_df)
    generated_readme_corpus = prepare_generated_readme_corpus(sampled_repos_df, sampled_generated_readmes_df)
    generated_rationale_corpus = prepare_generated_readme_corpus(sampled_repos_df, sampled_generated_readmes_df, columns=["rationale"])
    generated_readme_rationale_corpus = prepare_generated_readme_corpus(sampled_repos_df, sampled_generated_readmes_df, columns=["answer", "rationale"])
    generated_readme_context_corpus = prepare_generated_readme_corpus(sampled_repos_df, sampled_generated_readmes_df, columns=["context_history"])
    selected_python_code_corpus = prepare_code_corpus(sampled_repos_df, selected_python_code_df)

    assert len(readme_corpus) == len(generated_readme_corpus)
    assert len(selected_python_code_corpus) == len(readme_corpus)
    
    for k in readme_corpus.keys():
        assert readme_corpus[k]['title'] == generated_readme_corpus[k]['title'], str((readme_corpus[k]['title'], generated_readme_corpus[k]['title']))
        assert readme_corpus[k]['title'] == selected_python_code_corpus[k]['title']
    return {
        "readme": readme_corpus,
        "generated_readme": generated_readme_corpus,
        "selected_code": selected_python_code_corpus,
        "generated_rationale": generated_rationale_corpus,
        "generation_context": generated_readme_context_corpus,
    }

In [157]:
sampled_generated_readmes_df.head()

Unnamed: 0,repo_name,rationale,answer,context_history
0,21-projects-for-deep-learning/image2text,"This repository tackles the task of image captioning, which is a machine learning problem that involves generating natural l...","This repository tackles the task of image captioning, which is a machine learning problem that involves generating natural l...","* `im2txt/configuration.py`: This file contains the configuration for the Show and Tell model, including the number of input..."
1,2anchao/VovJpu,"This repository, '2anchao/VovJpu', tackles the problem of image upscaling using a deep learning model called VovJpu. The mod...","This repository, '2anchao/VovJpu', tackles the problem of image upscaling using a deep learning model called VovJpu. The mod...",* `config.py`: This file contains a class called `DefaultConfigs` that defines the default configuration for the VovJpu mode...
2,2myeonggyu/Graph-Embedding,The '2myeonggyu/Graph-Embedding' repository tackles the problem of generating high-quality embeddings for nodes in a graph u...,The '2myeonggyu/Graph-Embedding' repository tackles the problem of generating high-quality embeddings for nodes in a graph u...,"* `nodevectors-0.0.1/graph2vec/__init__.py`: This file initializes the Graph2Vec library and defines the Node2Vec class, whi..."
3,394781865/insightface,This repository is focused on developing a deep learning model for object detection using the ResNet-50 architecture with DL...,This repository tackles the task of object detection using the ResNet-50 architecture with DLA (Dilated Convolutional Layers...,* `SSH/rcnn/processing/generate_anchor.py` generates anchor boxes for object detection using the ratio and scale parameters ...
4,461054993/SDCN,This repository tackles the problem of clustering data and evaluating the performance of a self-supervised learning model ca...,The `evaluation.py` file in this repository is likely used to evaluate the performance of the SDCN model on a test dataset. ...,The `evaluation.py` file contains a function that matches two clustering results obtained from different algorithms using th...


In [158]:
sampled_generated_readmes_df

Unnamed: 0,repo_name,rationale,answer,context_history
0,21-projects-for-deep-learning/image2text,"This repository tackles the task of image captioning, which is a machine learning problem that involves generating natural l...","This repository tackles the task of image captioning, which is a machine learning problem that involves generating natural l...","* `im2txt/configuration.py`: This file contains the configuration for the Show and Tell model, including the number of input..."
1,2anchao/VovJpu,"This repository, '2anchao/VovJpu', tackles the problem of image upscaling using a deep learning model called VovJpu. The mod...","This repository, '2anchao/VovJpu', tackles the problem of image upscaling using a deep learning model called VovJpu. The mod...",* `config.py`: This file contains a class called `DefaultConfigs` that defines the default configuration for the VovJpu mode...
2,2myeonggyu/Graph-Embedding,The '2myeonggyu/Graph-Embedding' repository tackles the problem of generating high-quality embeddings for nodes in a graph u...,The '2myeonggyu/Graph-Embedding' repository tackles the problem of generating high-quality embeddings for nodes in a graph u...,"* `nodevectors-0.0.1/graph2vec/__init__.py`: This file initializes the Graph2Vec library and defines the Node2Vec class, whi..."
3,394781865/insightface,This repository is focused on developing a deep learning model for object detection using the ResNet-50 architecture with DL...,This repository tackles the task of object detection using the ResNet-50 architecture with DLA (Dilated Convolutional Layers...,* `SSH/rcnn/processing/generate_anchor.py` generates anchor boxes for object detection using the ratio and scale parameters ...
4,461054993/SDCN,This repository tackles the problem of clustering data and evaluating the performance of a self-supervised learning model ca...,The `evaluation.py` file in this repository is likely used to evaluate the performance of the SDCN model on a test dataset. ...,The `evaluation.py` file contains a function that matches two clustering results obtained from different algorithms using th...
...,...,...,...,...
898,zliucr/mixed-language-training,This repository tackles the task of dialogue state tracking and natural language understanding (NLU) for mixed-language conv...,This repository tackles the task of dialogue state tracking (DST) and natural language understanding (NLU) for mixed-languag...,"* `src/dst_loader.py`: This file defines a custom dataset class for the mixed language training task, which loads the dialog..."
899,zoj613/polya-gamma,"Using summaries of 'zoj613/polya-gamma' files from Context, write repository README. Focus on the functionalities and featur...","This repository appears to be focused on generating random points for the polyagamma distribution, which is a type of probab...",* `scripts/benchmark.py`: This script contains a function called `random_polyagamma` that generates random points for the po...
900,zuzuba/CISR_NeurIPS20,"This repository, zuzuba/CISR_NeurIPS20, tackles various machine learning problems related to constrained MDPs (CMDPs) and co...","This repository, zuzuba/CISR_NeurIPS20, tackles various machine learning problems related to constrained MDPs (CMDPs) and co...","1. `src/CMDP_solvers/test.py`: This file contains unit tests for the Lagrangian CMDP solver class, which is used to solve co..."
901,zyning/signalSeparation,"Using summaries of 'zyning/signalSeparation' files from Context, write repository README. Focus on the functionalities and f...","This repository tackles the problem of signal separation in audio data, which involves separating a mixed signal into its in...",1. `unet/.ipynb_checkpoints/unet_model-checkpoint.py`: This file contains a Python class named `UNet` that is a neural netwo...


In [39]:
task_queries, task_qrels = prepare_query_data(sampled_repos_df)

corpora = prepare_corpora(sampled_repos_df, sampled_generated_readmes_df, sample_python_code_df) | prepare_librarian_corpora(sampled_librarian_signatures_df)

In [42]:
import elasticsearch

es_client = elasticsearch.Elasticsearch()
def retrieve_repos_with_es(query, k=50, index="readme", es_client=es_client):
    es_result = es_client.search(index=index, body={"query": {"match": {"txt": query}}}, size=k)
    return [
        hit["_source"]["title"]
        for hit in es_result["hits"]["hits"]
    ]

In [44]:
print(task_queries['2'])
retrieve_repos_with_es(task_queries['2'], index="selected_code")

object detection


['ankit-vaghela30/Google-landmark-prediction',
 'JensSettelmeier/EfficientDet-DeepSORT-Tracker',
 'hanghang177/pedestrian_awareness',
 'ChristianMarzahl/ObjectDetection',
 'anonymousjack/hijacking',
 'robin-chan/decision-rules',
 'eddyhkchiu/mahalanobis_3d_multi_object_tracking',
 'Pranav21091996/Semantic_Fidelity-and-Egoshots',
 'zj463261929/darknet_mAP',
 'stigma0617/maskrcnn-benchmark-vovnet',
 'AcramBousa/darknet',
 'facebookresearch/detectron',
 'AlbertoSabater/Robust-and-efficient-post-processing-for-video-object-detection',
 'hankpark0706/darknet',
 'HongSic/DarknetAI',
 'Wangxy2180/darknetKinectDetect',
 'artxtech/darknet-rnn',
 'ghadahamed/darknet',
 'hirohiro23/Darknet',
 'iskandari/darknet',
 'tommyjtl/darknet-colab',
 'ycchiusieve/yolo3',
 'zliucr/mixed-language-training',
 'zanmange/darknet',
 'ppengtang/oicr',
 'shangtse/robust-physical-attack',
 'ahhan02/darknet-alex',
 'sdu2011/darknet_alexyab',
 'Ekim-Yurtsever/DeepTL-Lane-Change-Classification',
 'Rahmanzia3/yolo',
 '

In [45]:
retrieved_repo_tasks = {}

qcounts = sampled_repos_df["tasks"].explode().value_counts()
used_queries = [
    query
    for query in sampled_repos_df["tasks"].explode().drop_duplicates()
    if qcounts.loc[query] > 5
]
# [task_queries[qid] for qid in task_queries.keys()]

index="selected_code"
for query in used_queries:
    retrieved_tasks = sampled_repos_df[sampled_repos_df["repo"].isin(retrieve_repos_with_es(query, index=index))]["tasks"].to_list()
    retrieved_repo_tasks[query] = retrieved_tasks

In [163]:
k = 10
query_hits = pd.Series({
    query: sum([query in tasks for tasks in retrieved_repo_tasks[query][:k]])
    for query in retrieved_repo_tasks.keys()
})

In [164]:
query_hits.shape

(214,)

In [165]:
qid = '10'
query = task_queries[qid]

print(query)
print(query_hits[query], "hits")

for hit in es_client.search(index=index, body={"query": {"match": {"txt": task_queries[qid]}}}, size=k)["hits"]["hits"]:
    print("#" * 100)
    print("#" * 100)
    repo_name = hit["_source"]["title"]
    repo_record = sampled_repos_df[sampled_repos_df["repo"] == repo_name].iloc[0]
    is_hit = query in repo_record["tasks"]
    print(repo_name, "HIT" if is_hit else "NO HIT")
    
    if is_hit:
        print("#" * 100)
        print("#" * 100)
        print(hit['_source']['txt'])

representation learning
0 hits
####################################################################################################
####################################################################################################
priba/siamese_ged NO HIT
####################################################################################################
####################################################################################################
shenxiaocam/ACDNE NO HIT
####################################################################################################
####################################################################################################
murthyrudra/NeuralNER NO HIT
####################################################################################################
####################################################################################################
Zeta36/muzero NO HIT
#############################################################

In [166]:
query = "object detection"
es_client.search(index="readme", body={"query": {"match": {"txt": query}}}, size=10)["hits"]["hits"]

[{'_index': 'readme',
  '_id': '762',
  '_score': 6.161534,
  '_source': {'refresh': 'wait_for',
   'txt': "# ShapeShifter: Robust Physical Adversarial Attack on Faster R-CNN Object Detector\n\n## Overview\n\nThis is the code repository for the ECML-PKDD 2018 paper: **ShapeShifter: Robust Physical Adversarial Attack on Faster R-CNN Object Detector**\n\nThe arXiv version is available at https://arxiv.org/abs/1804.05810\n\nThe code included here reproduces our techniques presented in the paper.\n\nIn this work, we tackle the more challenging problem of crafting physical adversarial perturbations to fool image-based object detectors like Faster R-CNN.\nAttacking an object detector is more difficult than attacking an image classifier, as it needs to mislead the classification results in multiple bounding boxes with different scales.\nOur approach can generate perturbed stop signs that are consistently mis-detected by Faster R-CNN as other objects, posing a potential threat to autonomous ve

In [167]:
(1.0 * (pd.Series(query_hits) > 0)).describe()

count    214.000000
mean       0.523364
std        0.500625
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
dtype: float64

In [168]:
pd.Series(query_hits).describe()

count    214.000000
mean       1.009346
std        1.433963
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        8.000000
dtype: float64

In [217]:
print(sampled_repos_df[sampled_repos_df["repo"] == "oxwhirl/wqmix"].iloc[0]["query_tasks"])

['decision making', 'reinforcement learning', 'q learning']


## Evaluating with BEIR

In [169]:
import sys

sys.path.append("./splade")

In [49]:
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.models import SPLADE, SentenceBERT, UniCOIL
from beir.retrieval.search.sparse import SparseSearch


In [50]:
def get_splade_retriever(splade_model_path = "splade/weights/distilsplade_max", batch_size=128):
    splade_model = DRES(SPLADE(splade_model_path), batch_size=128)
    return EvaluateRetrieval(splade_model, score_function="dot")

def get_bm25_retrievers(corpora):
        
    bm25_retrievers = {}
    for corpus_name, corpus in corpora.items():
        model = BM25(index_name=corpus_name)
        retriever = EvaluateRetrieval(model)
        bm25_retrievers[corpus_name] = retriever
    return bm25_retrievers


sentence_transformer_model_names = [
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/all-MiniLM-L12-v2",
    "flax-sentence-embeddings/st-codesearch-distilroberta-base"
]

def get_sentence_transformer_retriever(model_name="sentence-transformers/all-mpnet-base-v2", batch_size=256):
    model = DRES(SentenceBERT(model_name), batch_size=batch_size)
    return EvaluateRetrieval(model, score_function="cos_sim")

def get_unicoil_retriever(model_name="castorini/unicoil-msmarco-passage"):
    """
    THERE IS A BUG WITH BEIR THAT MAKES THIS UNUSABLE
    """
    model = SparseSearch(UniCOIL(model_path=model_name), batch_size=32)
    return EvaluateRetrieval(model, score_function="dot")

In [51]:
bm25_retrievers = get_bm25_retrievers(corpora)

In [52]:
splade_retriever = get_splade_retriever()
sentence_transformer_retrievers = {
    model_name: get_sentence_transformer_retriever(model_name)
    for model_name in sentence_transformer_model_names
}

In [53]:
from pydantic import BaseModel
from typing import Dict

class RetrieverInput(BaseModel):
    corpus: Dict[str, dict]
    queries: Dict[str, str]
    qrels: Dict[str, Dict[str, int]]


class RetrievalEvaluationResults(BaseModel):
    retrieval_results: Dict[str, Dict[str, float]]
    metrics: dict
    model_type: str


    
    @classmethod
    def from_retriever(cls, retriever, retriever_input):
        retrieval_results = retriever.retrieve(retriever_input.corpus, retriever_input.queries)
        acc = retriever.evaluate_custom(retriever_input.qrels, retrieval_results, retriever.k_values, metric="accuracy")
        other_metrics = retriever.evaluate(retriever_input.qrels, retrieval_results, retriever.k_values, ignore_identical_ids=False)
        metrics = acc | cls.tuple_to_dict(other_metrics)
        try:
            model_type = str(retriever.retriever.model)
        except:
            model_type = "bm25"
        return RetrievalEvaluationResults(metrics=metrics, model_type=model_type, retrieval_results=retrieval_results)


    @classmethod
    def tuple_to_dict(cls, dicts):
        merged_dict = {}
        for d in dicts:
            merged_dict = d | merged_dict
        return merged_dict




In [54]:
retriever_inputs = {
    corpus_name: RetrieverInput(corpus=corpus, queries=task_queries, qrels=task_qrels)
    for (corpus_name, corpus) in corpora.items()
}

In [55]:
retriever_inputs.keys()

dict_keys(['readme', 'generated_readme', 'selected_code', 'generated_rationale', 'generation_context', 'dependency_signature', 'repository_signature', 'generated_tasks'])

In [56]:
bm25_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(bm25_retrievers[corpus_name], retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

  0% 0/864 [00:00<?, ?docs/s]            
que: 100% 3/3 [00:01<00:00,  2.59it/s]
  0% 0/864 [00:00<?, ?docs/s]            
que: 100% 3/3 [00:00<00:00,  7.06it/s]
  0% 0/864 [00:00<?, ?docs/s]            
que: 100% 3/3 [00:00<00:00,  3.44it/s]
  0% 0/864 [00:00<?, ?docs/s]            
que: 100% 3/3 [00:00<00:00,  6.57it/s]
  0% 0/864 [00:00<?, ?docs/s]            
que: 100% 3/3 [00:00<00:00,  6.93it/s]
  0% 0/864 [00:00<?, ?docs/s]
que: 100% 3/3 [00:00<00:00, 32.00it/s]
  0% 0/864 [00:00<?, ?docs/s]            
que: 100% 3/3 [00:00<00:00, 13.58it/s]
  0% 0/864 [00:00<?, ?docs/s]
que: 100% 3/3 [00:00<00:00, 17.56it/s]


In [60]:
splade_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(splade_retriever, retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

Batches: 100% 11/11 [00:00<00:00, 22.42it/s]
Batches: 100% 27/27 [00:02<00:00, 10.09it/s]
Batches: 100% 11/11 [00:00<00:00, 196.23it/s]
Batches: 100% 27/27 [00:01<00:00, 15.60it/s]
Batches: 100% 11/11 [00:00<00:00, 196.52it/s]
Batches: 100% 27/27 [00:02<00:00,  9.96it/s]
Batches: 100% 11/11 [00:00<00:00, 197.67it/s]
Batches: 100% 27/27 [00:01<00:00, 13.87it/s]
Batches: 100% 11/11 [00:00<00:00, 199.73it/s]
Batches: 100% 27/27 [00:02<00:00, 12.22it/s]
Batches: 100% 11/11 [00:00<00:00, 183.19it/s]
Batches: 100% 27/27 [00:01<00:00, 20.26it/s]
Batches: 100% 11/11 [00:00<00:00, 195.39it/s]
Batches: 100% 27/27 [00:01<00:00, 18.19it/s]
Batches: 100% 11/11 [00:00<00:00, 198.32it/s]
Batches: 100% 27/27 [00:00<00:00, 86.80it/s]


In [61]:
sentence_transformer_results = {
    (corpus_name, model_name.split("/")[1]): RetrievalEvaluationResults.from_retriever(sentence_transformer_retrievers[model_name], retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
    for model_name in sentence_transformer_model_names
}

Batches: 100% 2/2 [00:00<00:00, 25.44it/s]
Batches: 100% 4/4 [00:04<00:00,  1.03s/it]
Batches: 100% 2/2 [00:00<00:00, 90.42it/s]
Batches: 100% 4/4 [00:00<00:00,  5.76it/s]
Batches: 100% 2/2 [00:00<00:00, 65.80it/s]
Batches: 100% 4/4 [00:00<00:00,  4.56it/s]
Batches: 100% 2/2 [00:00<00:00, 43.45it/s]
Batches: 100% 4/4 [00:02<00:00,  1.51it/s]
Batches: 100% 2/2 [00:00<00:00, 88.32it/s]
Batches: 100% 4/4 [00:00<00:00,  9.82it/s]
Batches: 100% 2/2 [00:00<00:00, 87.02it/s]
Batches: 100% 4/4 [00:00<00:00,  7.31it/s]
Batches: 100% 2/2 [00:00<00:00, 43.15it/s]
Batches: 100% 4/4 [00:04<00:00,  1.03s/it]
Batches: 100% 2/2 [00:00<00:00, 88.01it/s]
Batches: 100% 4/4 [00:00<00:00,  5.97it/s]
Batches: 100% 2/2 [00:00<00:00, 88.46it/s]
Batches: 100% 4/4 [00:00<00:00,  4.90it/s]
Batches: 100% 2/2 [00:00<00:00, 44.95it/s]
Batches: 100% 4/4 [00:02<00:00,  1.42it/s]
Batches: 100% 2/2 [00:00<00:00, 83.17it/s]
Batches: 100% 4/4 [00:00<00:00,  9.80it/s]
Batches: 100% 2/2 [00:00<00:00, 85.82it/s]
Batches: 10

In [64]:
bm25_metrics = [
    {"corpus": corpus_name, "retriever": "bm25", **bm25_results[corpus_name].metrics}
    for corpus_name in corpora.keys()
]

splade_metrics = [
    {"corpus": corpus_name, "retriever": "splade", **splade_results[corpus_name].metrics}
     for corpus_name in corpora.keys()
]

sentence_transformer_metrics = [
    {"corpus": corpus_name, "retriever": f"{model_name} (sentence_transformer)", **sentence_transformer_results[(corpus_name, model_name)].metrics}
    for (corpus_name, model_name) in sentence_transformer_results.keys()
]

all_metrics_df = pd.DataFrame.from_records(bm25_metrics + splade_metrics + sentence_transformer_metrics)

In [65]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

Unnamed: 0,corpus,retriever,Accuracy@10
0,readme,bm25,0.86747
16,readme,all-mpnet-base-v2 (sentence_transformer),0.85241
17,readme,all-MiniLM-L12-v2 (sentence_transformer),0.81627
8,readme,splade,0.81024
25,generated_rationale,all-mpnet-base-v2 (sentence_transformer),0.79518
19,generated_readme,all-mpnet-base-v2 (sentence_transformer),0.78614
26,generated_rationale,all-MiniLM-L12-v2 (sentence_transformer),0.7741
9,generated_readme,splade,0.75602
20,generated_readme,all-MiniLM-L12-v2 (sentence_transformer),0.75602
11,generated_rationale,splade,0.74699


In [66]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

retriever
st-codesearch-distilroberta-base (sentence_transformer)    0.579066
bm25                                                       0.648720
splade                                                     0.695031
all-MiniLM-L12-v2 (sentence_transformer)                   0.700678
all-mpnet-base-v2 (sentence_transformer)                   0.742845
Name: Accuracy@10, dtype: float64

In [67]:
all_metrics_df.groupby("corpus")["Accuracy@10"].agg("mean").sort_values()

corpus
selected_code           0.576506
dependency_signature    0.595784
generated_tasks         0.615058
repository_signature    0.641566
generation_context      0.681328
generated_readme        0.737950
generated_rationale     0.738554
readme                  0.799398
Name: Accuracy@10, dtype: float64

## Does combining rationale with generated readme help?

It seems that the best sentence transformer retrievers can only get worse when using any other information!

In [267]:
sentence_transformer_results.keys()

dict_keys([('readme', 'all-mpnet-base-v2'), ('readme', 'all-MiniLM-L12-v2'), ('readme', 'st-codesearch-distilroberta-base'), ('generated_readme', 'all-mpnet-base-v2'), ('generated_readme', 'all-MiniLM-L12-v2'), ('generated_readme', 'st-codesearch-distilroberta-base'), ('selected_code', 'all-mpnet-base-v2'), ('selected_code', 'all-MiniLM-L12-v2'), ('selected_code', 'st-codesearch-distilroberta-base'), ('generated_rationale', 'all-mpnet-base-v2'), ('generated_rationale', 'all-MiniLM-L12-v2'), ('generated_rationale', 'st-codesearch-distilroberta-base'), ('generation_context', 'all-mpnet-base-v2'), ('generation_context', 'all-MiniLM-L12-v2'), ('generation_context', 'st-codesearch-distilroberta-base')])

In [270]:
st_generated_readme_results= sentence_transformer_results[('generated_readme', 'all-mpnet-base-v2')].retrieval_results
st_rationale_results = sentence_transformer_results[('generated_rationale', 'all-mpnet-base-v2')].retrieval_results
bm25_generated_readme_results = bm25_results["generated_readme"].retrieval_results
st_context_results = sentence_transformer_results[('generation_context', 'all-mpnet-base-v2')].retrieval_results

In [278]:
len(list(bm25_generated_readme_results.keys()))

335

In [279]:
len(list(st_generated_readme_results.keys()))

340

In [346]:
def merge_qrels(qrels1, qrels2):
    merged_qrels = {}
    for k in qrels1.keys():
        tmp_rel = dict()
        for rel_k in set(qrels1[k].keys()).union(qrels2[k]):
            tmp_rel[rel_k] = qrels1[k].get(rel_k, 0) +  qrels2[k].get(rel_k, 0)
        merged_qrels[k] = tmp_rel
    return merged_qrels

In [347]:
st_generation_results = merge_qrels(bm25_generated_readme_results, st_generated_readme_results)

In [348]:
st_generation_results['0']

{'219': 0.1825864315032959,
 '483': 0.0775195062160492,
 '186': 2.1769384084968566,
 '106': 0.1905636489391327,
 '189': 0.16104750335216522,
 '754': 0.32497990131378174,
 '485': 1.346730067921257,
 '248': 2.708789764968109,
 '839': 0.12559418380260468,
 '660': 0.1671973168849945,
 '629': 0.2034534215927124,
 '79': 2.318812138961792,
 '252': 0.12258391082286835,
 '835': 0.21031951904296875,
 '697': 0.18036513030529022,
 '796': 2.0657383785074233,
 '124': 0.23715591430664062,
 '422': 0.24739211797714233,
 '648': 1.902257665161705,
 '131': 0.19106096029281616,
 '546': 1.6653497374698638,
 '747': 2.103604193731117,
 '241': 0.24655652046203613,
 '751': 2.0184137257299426,
 '781': 0.15321020781993866,
 '450': 0.22829921543598175,
 '740': 2.330208239438629,
 '239': 1.3455350843093872,
 '158': 0.18039080500602722,
 '764': 0.248685821890831,
 '563': 1.9906711696355819,
 '470': 2.5413575289096832,
 '318': 0.0811888724565506,
 '509': 0.21818174421787262,
 '656': 0.2141641527414322,
 '108': 0.2142

In [349]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_generation_results, metric="acc", k_values=[1,5,10])

[1;35m
[0m
[1;35mAccuracy@1: 0.4000[0m
[1;35mAccuracy@5: 0.6500[0m
[1;35mAccuracy@10: 0.7412[0m


{'Accuracy@1': 0.4, 'Accuracy@5': 0.65, 'Accuracy@10': 0.74118}

In [350]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_generated_readme_results, metric="acc", k_values=[1,5,10])

[1;35m
[0m
[1;35mAccuracy@1: 0.4265[0m
[1;35mAccuracy@5: 0.6765[0m
[1;35mAccuracy@10: 0.7912[0m


{'Accuracy@1': 0.42647, 'Accuracy@5': 0.67647, 'Accuracy@10': 0.79118}

In [345]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_rationale_results, metric="acc", k_values=[1,5,10])

[1;35m
[0m
[1;35mAccuracy@1: 0.4294[0m
[1;35mAccuracy@5: 0.6941[0m
[1;35mAccuracy@10: 0.7882[0m


{'Accuracy@1': 0.42941, 'Accuracy@5': 0.69412, 'Accuracy@10': 0.78824}

In [339]:
all_metrics_df[all_metrics_df["retriever"] == "bm25"][["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10")

Unnamed: 0,corpus,retriever,Accuracy@10
2,selected_code,bm25,0.61176
4,generation_context,bm25,0.68824
3,generated_rationale,bm25,0.70588
1,generated_readme,bm25,0.73235
0,readme,bm25,0.85294


In [None]:
Splitting does not make much sense as the most of generated data is under the sentence-transformer context length (384 tokens)

In [131]:
def split_corpus_by_lengths(corpus, chunk_length):
    splitted_corpora = [dict() for _ in range(n_splits)]
    for c_id in corpus.keys():
        text = corpus[c_id]["text"]
        chunk_length =  len(text) // n_splits
        for i in range(0, n_splits):
            splitted_corpora[i] = text[i*chunk_length:(i+1)*chunk_length]
        

In [129]:
corpora["generated_readme"]

{'0': {'text': 'This repository tackles the task of image captioning, which is a machine learning problem that involves generating natural language descriptions for images. The data used in this repository is the MSCOCO dataset, which contains over 300,000 images with corresponding captions. The repository provides code for building and training an image captioning model using TensorFlow and Keras. It includes a configuration file that specifies the hyperparameters for training, such as the number of input shards, the image format, and the vocabulary size. The repository also includes code for building the inference graph, creating the vocabulary, loading the model from checkpoint, and preparing the caption generator. The repository also includes unit tests for the ShowAndTellModel class, which checks the number of parameters in the model, the output shapes, and the accuracy of the model on a test set. Additionally, it defines the Vocabulary class that creates the vocabulary dictionary

In [None]:
class MultiTextEvaluator(BaseModel):
    """
    Evaluate a dataframe that has multiple texts for each query (multiple generation experiments)
    iteration_col says which experiment it was
    """
    iteration_col: str
    text_cols: List[str]
    k_values: List[int] = [1,5,10,25]

    def get_ir_datas(self, df):
        for iter in df[self.iteration_col].unique():
            ir_data = load_ir_data(df[df[self.iteration_col] == iter], self.text_cols)
            yield (iter, ir_data)

    def evaluate(self, df, retriever):
        ir_datas = dict(self.get_ir_datas(df))
        dfs = []
        for iter, ir_data in ir_datas.items():
            per_query_evaluator = PerQueryIREvaluator(k_values=self.k_values)
            df = per_query_evaluator.get_scores(ir_data, retriever)
            df[self.iteration_col] = iter
            dfs.append(df)
        metrics_df = pd.concat(dfs)
        metrics_df["query"] = metrics_df.index
        return metrics_df