In [1]:
import pytrec_eval
import pandas as pd

In [2]:
pd.set_option('max_colwidth', 128)

from pathlib import Path
import json

In [3]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from github_search.beir_evaluation import EvaluateRetrievalCustom as EvaluateRetrieval
from beir.retrieval.search.lexical import BM25Search as BM25


from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.models import SPLADE, SentenceBERT, UniCOIL
from beir.retrieval.search.sparse import SparseSearch

#from github_search.ir.evaluate_bm25 import load_ir_data, load_generation_metrics_df, RetrievalConfig, get_retriever
#from github_search.pipelines.get_zenml_results import ArtifactLoader

  from tqdm.autonotebook import tqdm


In [4]:
## Dependency and librarian signatures

In [5]:
librarian_signatures_df = pd.read_parquet("/home/kuba/Projects/uhackathons/fastrag_util/data/librarian_signatures.parquet")

In [6]:
## load import2vec

In [7]:
import sentence_transformers

In [8]:
from typing import Union
import ast
from pydantic import BaseModel

class CorpusDataLoader(BaseModel):
    """Locates and loads the dataframes a retrieval corpus is built from.

    Three sources are read: the sampled repositories (JSON lines), the generated
    READMEs (JSON lines, or a Phoenix trace parquet) and the per-file Python code
    (feather). Paths may be given as ``str`` or ``Path``; they are coerced to
    ``Path`` at the point of use.
    """
    repos_df_path: Union[str, Path]
    generated_readmes_df_path: Union[str, Path]
    code_df_path: Union[str, Path]

    @classmethod
    def from_dir(cls, dir):
        """Build a loader from a sample directory using the conventional file names.

        The code feather file is looked up two levels above ``dir``, under ``code/``.
        """
        dir = Path(dir)
        # `cls` (not the hard-coded class name) so subclasses get their own type back.
        return cls(
            repos_df_path=dir / "sampled_repos.jsonl",
            generated_readmes_df_path=dir / "generated_readmes.jsonl",
            code_df_path=dir.parent.parent / "code" / "python_files_with_selected_code.feather"
        )

    def load_repos_df(self):
        """Read the sampled repos and return a frame with `repo`, `tasks`, `readme`.

        If `tasks` was serialized as the string repr of a list it is parsed back
        with ``ast.literal_eval``.
        """
        # The field accepts plain strings, which have no `.exists()`.
        repos_path = Path(self.repos_df_path)
        assert repos_path.exists(), f"missing repos file: {repos_path}"
        df = pd.read_json(repos_path, orient="records", lines=True)
        # Validate the schema before touching df["tasks"], so a missing column
        # fails on the assertion rather than on a KeyError.
        for col in ["repo", "tasks", "readme"]:
            assert col in df.columns, f"missing column {col!r} in {repos_path}"
        # Guard against an empty file: `.iloc[0]` would raise IndexError.
        if len(df) > 0 and type(df["tasks"].iloc[0]) is str:
            df["tasks"] = df["tasks"].apply(ast.literal_eval)
        return df

    def load_generated_readmes_df(self):
        """Load generated READMEs, dispatching on the file name.

        Paths containing ``.json`` are read as JSON lines; anything else is treated
        as a Phoenix trace parquet.
        """
        readmes_path = Path(self.generated_readmes_df_path)
        assert readmes_path.exists(), f"missing generated readmes file: {readmes_path}"
        if ".json" in str(readmes_path):
            return self.load_generated_readmes_from_json()
        else:
            return self.load_generated_readmes_from_phoenix(readmes_path)

    def load_generated_readmes_from_json(self):
        """Read generated READMEs from JSON lines and check the expected columns."""
        df = pd.read_json(Path(self.generated_readmes_df_path), orient="records", lines=True)
        for col in ['rationale', 'answer', 'context_history', 'repo_name']:
            assert col in df.columns, f"missing column {col!r} in generated readmes"
        return df

    def load_python_code_df(self):
        """Read the per-file Python code frame and check the expected columns."""
        code_path = Path(self.code_df_path)
        assert code_path.exists(), f"missing code file: {code_path}"
        df = pd.read_feather(code_path)
        for col in ['content', 'path', 'repo_name', 'tasks', 'selected_code']:
            assert col in df.columns, f"missing column {col!r} in {code_path}"
        return df

    def load_corpus_dfs(self, selected_repos=None):
        """Load all three frames restricted to repos present in both README sources.

        Parameters
        ----------
        selected_repos : iterable of str, optional
            When given, further restricts the result to these repo names.

        Returns
        -------
        (readme_df, generated_readme_df, selected_python_code_df)
            ``generated_readme_df`` is reordered to follow ``readme_df["repo"]``;
            the code frame is only filtered, not reordered.
        """
        readme_df = self.load_repos_df()
        generated_readme_df = self.load_generated_readmes_df()
        selected_python_code_df = self.load_python_code_df()
        repos = set(readme_df["repo"]).intersection(set(generated_readme_df["repo_name"]))
        if selected_repos is not None:
            repos = repos.intersection(set(selected_repos))
        # reset_index() keeps the old index as an "index" column (original behaviour).
        readme_df = readme_df[readme_df["repo"].isin(repos)].reset_index()
        generated_readme_df = generated_readme_df.set_index("repo_name").loc[readme_df["repo"]].reset_index()
        selected_python_code_df = selected_python_code_df[selected_python_code_df["repo_name"].isin(repos)]
        return readme_df, generated_readme_df, selected_python_code_df

    @classmethod
    def load_generated_readmes_from_phoenix(cls, path):
        """Rebuild the generated-README frame from a Phoenix trace parquet.

        Keeps only successful ``Code2Documentation.forward`` spans and places the
        JSON-decoded span inputs and outputs side by side as columns.
        """
        phoenix_trace_df = pd.read_parquet(path)
        is_ok_forward_span = (
            (phoenix_trace_df["status_code"] == "OK")
            & (phoenix_trace_df["name"] == "Code2Documentation.forward")
        )
        # One filter is enough; the original applied the name condition twice.
        forward_spans_df = phoenix_trace_df[is_ok_forward_span]
        span_inputs_df = pd.json_normalize(forward_spans_df["attributes.input.value"].apply(json.loads))
        span_outputs_df = pd.json_normalize(forward_spans_df["attributes.output.value"].apply(json.loads))
        # Both frames come from the same rows in the same order, so a column-wise
        # concat pairs each input with its output.
        return pd.concat([span_inputs_df, span_outputs_df], axis=1)


# Root of the project's output directory, relative to the notebook's working directory.
data_path = Path("../output").expanduser()

# Loader for the "sample2k" repos whose generated READMEs come from a Phoenix trace
# parquet (the path has no ".json", so `load_generated_readmes_df` takes the Phoenix branch).
# NOTE(review): the only visible use of this loader is in a commented-out cell below.
small_sample_loader = CorpusDataLoader(
    repos_df_path= data_path / "code2doc/sample2k/sampled_repos.jsonl",
    generated_readmes_df_path=Path("~/Projects").expanduser() / "torch_example/phoenix/sample_2k/trace_dataset-353a22a7-b529-4d9d-a4ec-75f442aa3eb7.parquet",
    code_df_path=data_path / "code" / "python_files_with_selected_code.feather"
)


In [9]:
class ExperimentParams:
    """Constants shared by the sampling and query-preparation cells."""
    # Interpolated into the sampled-repos file name when the sample path is built.
    sampled_repos_per_task = 20
    # Passed as `min_query_count` to `prepare_query_data`.
    min_repos_per_task = 10

In [10]:
#sampled_repos_df, sampled_generated_readmes_df, sample_python_code_df = small_sample_loader.load_corpus_dfs(librarian_signatures_df["repo"])

In [11]:
def filter_dfs_by_cols_in(dfs, col_values, colnames=["repo", "repo_name"]):
    """Keep, in every dataframe, only the rows whose key column is in `col_values`.

    The key column of a dataframe is the first entry of `colnames` that it
    contains, so frames that name the repository column differently can be
    filtered in one call. Returns the filtered frames in input order.
    """
    def _rows_with_allowed_key(frame):
        # First candidate name present in this frame; IndexError if none is.
        key_column = [name for name in colnames if name in frame.columns][0]
        return frame[frame[key_column].isin(col_values)]

    return [_rows_with_allowed_key(frame) for frame in dfs]


def align_dfs(dfs, colname="repo"):
    """Reorder every dataframe after the first to follow the first one's `colname` order.

    The first frame gets a fresh index (its old index is kept as a column); each
    remaining frame is indexed by `colname`, looked up in the first frame's
    order, and reset. A value missing from a later frame raises KeyError.
    """
    reference_df = dfs[0].reset_index()
    target_order = reference_df[colname]
    aligned = [reference_df]
    for other_df in dfs[1:]:
        by_key = other_df.set_index(colname)
        aligned.append(by_key.loc[target_order].reset_index())
    return aligned

In [14]:
bigger_sample_path = f"../output/code2doc/sample_per_task_5_repos/sampled_repos{ExperimentParams.sampled_repos_per_task}.jsonl"
sample_path = bigger_sample_path#"../output/code2doc/sample_small/sampled_repos_min10.jsonl"
sampled_repos_df = pd.read_json(sample_path, orient="records", lines=True)
sample_python_code_df = pd.read_feather(Path(data_path) / "code" / "python_files_with_selected_code.feather")

In [15]:
sampled_repos_df.shape

(6375, 9)

In [13]:
repos_with_all_data = (
    set(sampled_repos_df["repo"]) &
    set(librarian_signatures_df["repo"]) &
    set(sample_python_code_df["repo_name"])
)

In [14]:
len(repos_with_all_data)

2982

In [15]:
librarian_signatures_df = librarian_signatures_df[librarian_signatures_df["generation"] == 0]

Select only repos with signatures that were in sample

In [16]:
sampled_repos_df, sample_python_code_df, sampled_librarian_signatures_df = filter_dfs_by_cols_in([sampled_repos_df, sample_python_code_df, librarian_signatures_df], repos_with_all_data)
sampled_repos_df, sampled_librarian_signatures_df = align_dfs([sampled_repos_df, sampled_librarian_signatures_df])

## Sample with generated READMEs

In [46]:
model_name = "codellama"
sample_prefix = "sample_per_task_5_repos"

sample_loader = CorpusDataLoader(
    repos_df_path= data_path / f"code2doc/{sample_prefix}/sampled_repos5.jsonl",
    generated_readmes_df_path=data_path / f"code2doc/{sample_prefix}/{model_name}_generated_readmes5.jsonl",
    code_df_path=data_path / "code" / "python_files_with_selected_code.feather"
)

In [47]:
sample_loader.generated_readmes_df_path

PosixPath('../output/code2doc/sample_per_task_5_repos/codellama_generated_readmes5.jsonl')

In [59]:
sampled_repos_df, sampled_generated_readmes_df, sample_python_code_df = sample_loader.load_corpus_dfs(librarian_signatures_df["repo"])

In [60]:
repos_with_all_data = set(sampled_repos_df["repo"]).intersection(librarian_signatures_df["repo"])

In [61]:
sampled_repos_df, sample_python_code_df, sampled_librarian_signatures_df = filter_dfs_by_cols_in([sampled_repos_df, sample_python_code_df, librarian_signatures_df], repos_with_all_data)
sampled_repos_df, sampled_librarian_signatures_df = align_dfs([sampled_repos_df, sampled_librarian_signatures_df[sampled_librarian_signatures_df["generation"] == 0]])

## Example BEIR dataset

In [88]:
import os
import pathlib

dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(pathlib.Path("..").parent.absolute(), "datasets")
beir_data_path = util.download_and_unzip(url, out_dir)

In [89]:
_corpus, _queries, _qrels = GenericDataLoader(beir_data_path).load(split="test")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:00<00:00, 37880.68it/s]


In [90]:
print(_corpus['4983'].keys())
_corpus['4983']

dict_keys(['text', 'title'])


{'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 versus 1.1 microm2/ms). Relative anisotropy was higher the closer birth was to term with 

## Data preparation

In [62]:
def get_repos_for_query(query, repos_df):
    """Return the rows of `repos_df` whose `tasks` collection contains `query`."""
    def _has_task(tasks):
        return query in tasks

    is_relevant = repos_df["tasks"].apply(_has_task)
    return repos_df[is_relevant]


def get_queries(repos_df, min_query_count):
    """List the tasks that occur in at least `min_query_count` rows' `query_tasks`.

    Tasks come back in `value_counts` order (most frequent first).
    NOTE(review): this reads the `query_tasks` column while `get_repos_for_query`
    reads `tasks` - confirm both columns are meant to be used.
    """
    task_frequencies = repos_df["query_tasks"].explode().value_counts()
    frequent_enough = task_frequencies >= min_query_count
    return task_frequencies[frequent_enough].index.to_list()

def prepare_query_data(repos_df, min_query_count=5):
    """Build BEIR-style queries and relevance judgements from task labels.

    Returns
    -------
    (task_queries, task_qrels)
        `task_queries` maps a string query id to the task text; `task_qrels`
        maps the same id to ``{str(row_label): 1}`` for every row of `repos_df`
        that lists the task. Row labels are taken from `repos_df.index`.
    """
    task_queries = {}
    task_qrels = {}
    frequent_tasks = get_queries(repos_df, min_query_count=min_query_count)
    for position, task in enumerate(frequent_tasks):
        qid = str(position)
        task_queries[qid] = task
        relevant_rows = get_repos_for_query(task, repos_df)
        task_qrels[qid] = {str(corpus_id): 1 for corpus_id in relevant_rows.index}
    return task_queries, task_qrels


def prepare_readme_corpus(repos_df):
    """Turn each repo row into a corpus document keyed by its stringified row label.

    Each document carries the README as `text`, the repo name as `title` and the
    row's `tasks`.
    """
    corpus = {}
    for row_label, repo_row in repos_df.iterrows():
        corpus[str(row_label)] = {
            "text": repo_row["readme"],
            "title": repo_row["repo"],
            'tasks': repo_row['tasks'],
        }
    return corpus


def prepare_generated_readme_corpus(repos_df, generated_readmes_df, columns=["answer"]):
    """Build a corpus from generated texts, ordered like `repos_df["repo"]`.

    The document text is the newline-joined values of `columns` for the repo;
    the title is the repo name. Documents are keyed by their position after
    reordering, as strings.
    """
    by_repo = generated_readmes_df.set_index("repo_name")
    ordered_df = by_repo.loc[repos_df["repo"]].reset_index()
    corpus = {}
    for position, generated_row in ordered_df.iterrows():
        corpus[str(position)] = {
            "text": "\n".join(generated_row[columns]),
            "title": generated_row["repo_name"],
        }
    return corpus

    
def prepare_code_corpus(repos_df, selected_python_code_df):
    """Build a corpus with one document per repo from its selected code snippets.

    All `selected_code` values of a repo are concatenated with blank lines
    (missing values count as empty strings). Documents follow the order of
    `repos_df["repo"]` and are keyed by position, as strings. A repo without any
    code rows raises KeyError.
    """
    # Select the column before grouping: the aggregate then has a real name
    # instead of the implicit `0` label, and the function is not applied to the
    # grouping column itself.
    per_repo_code = (
        selected_python_code_df
        .groupby("repo_name")["selected_code"]
        .apply(lambda codes: "\n\n".join(codes.fillna("")))
    )
    per_repo_code_df = per_repo_code.loc[repos_df["repo"]].reset_index()
    return {
        str(i): {"text": row["selected_code"], "title": row["repo_name"]}
        for (i, row) in per_repo_code_df.iterrows()
    }


# THIS IS FOR ONE GENERATION ONLY NOW
def prepare_librarian_corpora(repos_df, sampled_librarian_signatures_df):
    """Build one corpus per librarian signature column, ordered like `repos_df["repo"]`.

    Returns a dict keyed by column name (`dependency_signature`,
    `repository_signature`, `generated_tasks`); each value maps a stringified
    position to ``{"text": <column value>, "title": <repo>}``.
    """
    columns = ["dependency_signature", "repository_signature", "generated_tasks"]
    by_repo = sampled_librarian_signatures_df.set_index("repo")
    ordered_df = by_repo.loc[repos_df["repo"]].reset_index()

    corpora = {}
    for column in columns:
        column_corpus = {}
        for position, repo, text in zip(ordered_df.index, ordered_df["repo"], ordered_df[column]):
            column_corpus[str(position)] = {"text": text, "title": repo}
        corpora[column] = column_corpus
    return corpora


def prepare_basic_corpora(repos_df, selected_python_code_df):
    """Return the two corpora that need no generated text: README and selected code."""
    return {
        "readme": prepare_readme_corpus(repos_df),
        "selected_code": prepare_code_corpus(repos_df, selected_python_code_df),
    }


def prepare_corpora(repos_df, generated_readmes_df, selected_python_code_df):
    """Build the README, selected-code and generated-text corpora for the same repos.

    Parameters
    ----------
    repos_df : DataFrame with `repo`, `readme`, `tasks`; defines the document order.
    generated_readmes_df : DataFrame with `repo_name`, `answer`, `rationale`,
        `context_history`.
    selected_python_code_df : DataFrame with `repo_name`, `selected_code`.

    Returns
    -------
    dict mapping corpus name to corpus: `readme`, `generated_readme`,
    `selected_code`, `generated_rationale`, `generation_context`.

    Raises
    ------
    AssertionError if the corpora differ in size or a document id refers to
    different repos in different corpora.
    """
    basic_corpora = prepare_basic_corpora(repos_df, selected_python_code_df)
    readme_corpus = basic_corpora["readme"]
    selected_python_code_corpus = basic_corpora["selected_code"]
    # Use the frame that was passed in. The original read the notebook global
    # `sampled_generated_readmes_df` here and silently ignored the argument.
    generated_readme_corpus = prepare_generated_readme_corpus(repos_df, generated_readmes_df)
    generated_rationale_corpus = prepare_generated_readme_corpus(repos_df, generated_readmes_df, columns=["rationale"])
    generated_readme_context_corpus = prepare_generated_readme_corpus(repos_df, generated_readmes_df, columns=["context_history"])
    # NOTE(review): a combined answer+rationale corpus used to be computed here
    # but was never returned; it is dropped rather than built and thrown away.

    assert len(readme_corpus) == len(generated_readme_corpus)
    assert len(selected_python_code_corpus) == len(readme_corpus)

    # The same document id must point at the same repo in every corpus.
    for k in readme_corpus.keys():
        assert readme_corpus[k]['title'] == generated_readme_corpus[k]['title'], str((readme_corpus[k]['title'], generated_readme_corpus[k]['title']))
        assert readme_corpus[k]['title'] == selected_python_code_corpus[k]['title']
    return {
        "readme": readme_corpus,
        "generated_readme": generated_readme_corpus,
        "selected_code": selected_python_code_corpus,
        "generated_rationale": generated_rationale_corpus,
        "generation_context": generated_readme_context_corpus,
    }

In [63]:
task_queries, task_qrels = prepare_query_data(sampled_repos_df, min_query_count=ExperimentParams.min_repos_per_task)

In [64]:
pd.Series(task_qrels).apply(len).describe()

count    831.000000
mean      28.132371
std       43.113013
min       10.000000
25%       11.000000
50%       14.000000
75%       26.000000
max      383.000000
dtype: float64

In [65]:
pd.Series([len(qrl) for qrl in task_qrels.values()]).describe()

count    831.000000
mean      28.132371
std       43.113013
min       10.000000
25%       11.000000
50%       14.000000
75%       26.000000
max      383.000000
dtype: float64

In [66]:
#corpora = prepare_basic_corpora(sampled_repos_df, sample_python_code_df) |  #
corpora =  prepare_corpora(sampled_repos_df, sampled_generated_readmes_df, sample_python_code_df) | prepare_librarian_corpora(sampled_repos_df, sampled_librarian_signatures_df)

In [67]:
corpora.keys()

dict_keys(['readme', 'generated_readme', 'selected_code', 'generated_rationale', 'generation_context', 'dependency_signature', 'repository_signature', 'generated_tasks'])

In [68]:
[len(corpora[cname].keys()) for cname in corpora.keys()]

[7060, 7060, 7060, 7060, 7060, 7060, 7060, 7060]

In [69]:
# Every corpus must hold the same repo under the same document id, otherwise the
# shared qrels would score the wrong documents. The original first assertion
# compared the README corpus with itself, so only `dependency_signature` was checked.
for corpus_name, corpus in corpora.items():
    for cid, readme_doc in corpora["readme"].items():
        assert readme_doc["title"] == corpus[cid]["title"], f"no match at {cid} ({corpus_name})"

In [70]:
## Checking elasticsearch

In [71]:
import elasticsearch

es_client = elasticsearch.Elasticsearch()
def retrieve_repos_with_es(query, k=50, index="readme", es_client=es_client):
    """Run a `match` query on the `txt` field and return the hit titles in result order.

    At most `k` hits are requested from Elasticsearch index `index`.
    """
    match_query = {"query": {"match": {"txt": query}}}
    es_result = es_client.search(index=index, body=match_query, size=k)
    titles = []
    for hit in es_result["hits"]["hits"]:
        titles.append(hit["_source"]["title"])
    return titles



def get_elasticsearch_results(index="selected_code", k=10, min_query_count=5):
    """Count, per task query, how many of the top-`k` Elasticsearch hits are relevant.

    Every task that labels more than `min_query_count` repos in
    `sampled_repos_df` is used as a query against `index`. A retrieved repo is a
    hit when the query is one of its tasks.

    Parameters
    ----------
    index : Elasticsearch index to search (default: the selected-code index).
    k : rank cut-off for counting hits.
    min_query_count : a task is used only if it occurs in more than this many repos.

    Returns
    -------
    pandas.Series mapping query text to its number of hits in the top `k`.
    (The original computed this series but never returned it.)
    """
    # dropna: repos with an empty task list explode to NaN, which is not a query.
    all_tasks = sampled_repos_df["tasks"].explode().dropna()
    qcounts = all_tasks.value_counts()
    used_queries = [
        query
        for query in all_tasks.drop_duplicates()
        if qcounts.loc[query] > min_query_count
    ]

    # Look tasks up by repo name so the Elasticsearch ranking is preserved; the
    # original filtered the dataframe with `isin`, which returns rows in dataframe
    # order and made the top-k cut meaningless.
    tasks_by_repo = dict(zip(sampled_repos_df["repo"], sampled_repos_df["tasks"]))

    retrieved_repo_tasks = {}
    for query in used_queries:
        ranked_repos = retrieve_repos_with_es(query, index=index)
        retrieved_repo_tasks[query] = [
            tasks_by_repo[repo] for repo in ranked_repos if repo in tasks_by_repo
        ]

    query_hits = pd.Series({
        query: sum(query in tasks for tasks in retrieved_tasks[:k])
        for query, retrieved_tasks in retrieved_repo_tasks.items()
    })
    return query_hits

def show_elasticsearch_results(qid='10', index="selected_code", k=10, query_hits=None):
    """Print the top-`k` Elasticsearch results for one task query, marking relevant repos.

    Parameters
    ----------
    qid : key into the notebook-level `task_queries` dict.
    index : Elasticsearch index to search.
    k : number of results to fetch and show.
    query_hits : optional mapping/Series from query text to a precomputed hit
        count (e.g. the value returned by `get_elasticsearch_results`). When
        omitted, the count is computed from the results shown here.

    The original body read `index`, `k` and `query_hits` as globals although they
    only ever existed as locals of `get_elasticsearch_results`, so it raised
    NameError; they are explicit parameters now.
    """
    query = task_queries[qid]
    hits = es_client.search(index=index, body={"query": {"match": {"txt": query}}}, size=k)["hits"]["hits"]

    # Resolve relevance up front so the hit count can be printed before the listing.
    annotated_hits = []
    for hit in hits:
        repo_name = hit["_source"]["title"]
        repo_record = sampled_repos_df[sampled_repos_df["repo"] == repo_name].iloc[0]
        annotated_hits.append((hit, repo_name, query in repo_record["tasks"]))

    if query_hits is not None:
        n_hits = query_hits[query]
    else:
        n_hits = sum(is_hit for (_, _, is_hit) in annotated_hits)

    print(query)
    print(n_hits, "hits")

    for hit, repo_name, is_hit in annotated_hits:
        print("#" * 100)
        print("#" * 100)
        print(repo_name, "HIT" if is_hit else "NO HIT")

        # Only relevant documents have their indexed text printed.
        if is_hit:
            print("#" * 100)
            print("#" * 100)
            print(hit['_source']['txt'])

## Evaluating with BEIR

In [72]:
def load_w2v_sentence_transformer(w2v_model_path):
    """Wrap saved word embeddings as a SentenceTransformer (embedding layer + pooling).

    NOTE(review): the pooling layer is built with a fixed size of 200 -
    presumably the embedding dimension of the saved model; confirm.
    """
    embedding_layer = sentence_transformers.models.WordEmbeddings.load(w2v_model_path)
    pooling_layer = sentence_transformers.models.Pooling(200)
    return sentence_transformers.SentenceTransformer(modules=[embedding_layer, pooling_layer])

def get_w2v_retriever(w2v_model_path="../models/rnn_abstract_readme_w2v/0_WordEmbeddings"):
    """Build a cosine-similarity dense retriever backed by the word2vec encoder.

    A BEIR `SentenceBERT` wrapper is created for an off-the-shelf model and both
    its query and document encoders are then replaced by the word2vec model, so
    the wrapper only supplies the encoding interface.
    """
    w2v_encoder = load_w2v_sentence_transformer(w2v_model_path)
    beir_wrapper = SentenceBERT("sentence-transformers/all-mpnet-base-v2")
    # Swap in the word2vec encoder for queries and documents alike.
    beir_wrapper.q_model = w2v_encoder
    beir_wrapper.doc_model = w2v_encoder
    dense_search = DRES(beir_wrapper)
    return EvaluateRetrieval(dense_search, score_function="cos_sim")

def get_splade_retriever(splade_model_path = "splade/weights/distilsplade_max", batch_size=128):
    """Build a dot-product retriever around a SPLADE model.

    Parameters
    ----------
    splade_model_path : location of the SPLADE weights.
    batch_size : encoding batch size handed to the exact-search wrapper. The
        original ignored this argument and always used 128 (still the default).
    """
    splade_model = DRES(SPLADE(splade_model_path), batch_size=batch_size)
    return EvaluateRetrieval(splade_model, score_function="dot")

def get_bm25_retrievers(corpora):
    """Create one BM25 retriever per corpus, using the corpus name as the index name.

    Only the keys of `corpora` are used here; the documents themselves are not read.
    """
    return {
        corpus_name: EvaluateRetrieval(BM25(index_name=corpus_name))
        for corpus_name, _ in corpora.items()
    }


sentence_transformer_model_names = [
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/all-MiniLM-L12-v2",
    "flax-sentence-embeddings/st-codesearch-distilroberta-base"
]

def get_sentence_transformer_retriever(model_name="sentence-transformers/all-mpnet-base-v2", batch_size=256):
    """Build a cosine-similarity dense retriever for the named sentence-transformer model."""
    encoder = SentenceBERT(model_name)
    dense_search = DRES(encoder, batch_size=batch_size)
    return EvaluateRetrieval(dense_search, score_function="cos_sim")

def get_unicoil_retriever(model_name="castorini/unicoil-msmarco-passage"):
    """
    Build a dot-product sparse retriever around a UniCOIL model.

    THERE IS A BUG WITH BEIR THAT MAKES THIS UNUSABLE
    """
    unicoil_encoder = UniCOIL(model_path=model_name)
    sparse_search = SparseSearch(unicoil_encoder, batch_size=32)
    return EvaluateRetrieval(sparse_search, score_function="dot")

In [74]:
bm25_retrievers = get_bm25_retrievers(corpora)

In [75]:
splade_retriever = get_splade_retriever()
sentence_transformer_retrievers = {
    model_name: get_sentence_transformer_retriever(model_name)
    for model_name in sentence_transformer_model_names
}

In [76]:
w2v_retriever = get_w2v_retriever()

In [77]:
from pydantic import BaseModel
from typing import Dict

class RetrieverInput(BaseModel):
    """Bundle of the three inputs one retrieval evaluation run needs."""
    # document id -> document dict (the corpora built in this notebook carry "text" and "title")
    corpus: Dict[str, dict]
    # query id -> query text
    queries: Dict[str, str]
    # query id -> {document id -> relevance label}
    qrels: Dict[str, Dict[str, int]]


class RetrievalEvaluationResults(BaseModel):
    """Raw retrieval scores plus the evaluation metrics computed from them."""
    # query id -> {document id -> retrieval score}
    retrieval_results: Dict[str, Dict[str, float]]
    # flat metric name -> value (custom metrics merged with the standard ones)
    metrics: dict
    # string form of the underlying model, or "bm25" when the searcher has no model
    model_type: str

    @classmethod
    def from_retriever(cls, retriever, retriever_input, metric_names=["accuracy@k", "hits@k", "r_cap@k"]):
        """Run `retriever` on `retriever_input` and collect its results and metrics.

        Parameters
        ----------
        retriever : evaluation wrapper exposing `retrieve`, `evaluate`,
            `evaluate_custom_multi` and `k_values`.
        retriever_input : RetrieverInput with corpus, queries and qrels.
        metric_names : custom metrics passed to `evaluate_custom_multi`.
        """
        retrieval_results = retriever.retrieve(retriever_input.corpus, retriever_input.queries)
        # Pass a copy so the shared default list can never be mutated by the callee.
        custom_metrics = retriever.evaluate_custom_multi(
            retriever_input.qrels, retrieval_results, retriever.k_values, metrics=list(metric_names)
        )
        other_metrics = retriever.evaluate(retriever_input.qrels, retrieval_results, retriever.k_values, ignore_identical_ids=False)
        metrics = custom_metrics | cls.tuple_to_dict(other_metrics)
        try:
            model_type = str(retriever.retriever.model)
        except AttributeError:
            # Searchers without a `.model` attribute are reported as "bm25". Only the
            # missing attribute is caught, so unrelated failures are no longer mislabeled.
            model_type = "bm25"
        return cls(metrics=metrics, model_type=model_type, retrieval_results=retrieval_results)


    @classmethod
    def tuple_to_dict(cls, dicts):
        """Merge an iterable of dicts into one; on duplicate keys the earliest dict wins."""
        merged_dict = {}
        for d in dicts:
            # `d | merged_dict` lets already-merged (earlier) entries override `d`.
            merged_dict = d | merged_dict
        return merged_dict




In [78]:
retriever_inputs = {
    corpus_name: RetrieverInput(corpus=corpus, queries=task_queries, qrels=task_qrels)
    for (corpus_name, corpus) in corpora.items()
}

In [79]:
retriever_inputs.keys()

dict_keys(['readme', 'generated_readme', 'selected_code', 'generated_rationale', 'generation_context', 'dependency_signature', 'repository_signature', 'generated_tasks'])

In [80]:
bm25_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(bm25_retrievers[corpus_name], retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

  0%|                                                                                                                                                                                 | 0/7060 [00:00<?, ?docs/s]
que: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.62it/s]
  0%|                                                                                                                                                                                 | 0/7060 [00:00<?, ?docs/s]
que: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.62it/s]
  0%|                                                                                                                                                           

In [81]:
splade_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(splade_retriever, retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 52.29it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [00:20<00:00, 10.56it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 186.36it/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221/221 [00:12<00:00, 17.74it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [82]:
word2vec_results = {
    corpus_name: RetrievalEvaluationResults.from_retriever(w2v_retriever, retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
}

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 575.49it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:01<00:00, 38.54it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1912.09it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 301.10it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [83]:
sentence_transformer_results = {
    (corpus_name, model_name.split("/")[1]): RetrievalEvaluationResults.from_retriever(sentence_transformer_retrievers[model_name], retriever_inputs[corpus_name])
    for corpus_name in corpora.keys()
    for model_name in sentence_transformer_model_names
}

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 32.97it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:35<00:00,  1.27s/it]
Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 94.00it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:04<00:00,  6.60it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [84]:
bm25_metrics = [
    {"corpus": corpus_name, "retriever": "bm25", **bm25_results[corpus_name].metrics}
    for corpus_name in corpora.keys()
]

In [85]:
word2vec_metrics = [
    {"corpus": corpus_name, "retriever": "Python code word2vec", **word2vec_results[corpus_name].metrics}
    for corpus_name in corpora.keys()
]

In [86]:
splade_metrics = [
    {"corpus": corpus_name, "retriever": "splade", **splade_results[corpus_name].metrics}
     for corpus_name in corpora.keys()
]
 
sentence_transformer_metrics = [
    {"corpus": corpus_name, "retriever": f"{model_name} (sentence_transformer)", **sentence_transformer_results[(corpus_name, model_name)].metrics}
    for (corpus_name, model_name) in sentence_transformer_results.keys()
]

all_metrics_df = pd.DataFrame.from_records(bm25_metrics + word2vec_metrics + splade_metrics +  sentence_transformer_metrics).sort_values("Hits@10", ascending=False)

In [89]:
f"../output/code2doc/{sample_prefix}/beir_results_{model_name}.csv"

'../output/code2doc/sample_per_task_5_repos/beir_results_codellama.csv'

In [88]:
all_metrics_df.to_csv(f"../output/code2doc/{sample_prefix}/beir_results_{model_name}.csv", index=False)

## Results

By default we use `min_task_count=10` (the value used originally).

We can switch to a smaller task count such as 5 to account for the fact that we use only a sample of repos.

In [None]:
metric_df_cols = ["corpus", "retriever", "Accuracy@10", "Hits@10", "R_cap@10", "NDCG@10"]

In [None]:
all_metrics_df[metric_df_cols]

In [None]:
all_metrics_df.groupby("corpus").apply(lambda df: df.sort_values("Accuracy@10", ascending=False).iloc[0])[metric_df_cols].sort_values("Accuracy@10", ascending=False)

In [None]:
all_metrics_df.groupby("retriever").apply(lambda df: df.sort_values("Accuracy@10", ascending=False).iloc[0])[metric_df_cols].sort_values("Accuracy@10", ascending=False)

In [None]:
all_metrics_df[all_metrics_df["retriever"] == "bm25"][metric_df_cols]

In [None]:
len(task_queries)

In [None]:
# task count = 5

In [None]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

In [None]:
# task count = 10

In [None]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

In [None]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

In [None]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

In [None]:
all_metrics_df.groupby("retriever")["Accuracy@10"].agg("mean").sort_values()

In [None]:
all_metrics_df.groupby("corpus")["Accuracy@10"].agg("mean").sort_values()

In [None]:
sampled_repos_df["tasks"].explode().value_counts().loc[list(task_queries.values())]

In [None]:
all_metrics_df[["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10", ascending=False)

## Does combining rationale with generated readme help?

It seems that the best sentence-transformer retrievers only get worse when any additional information is added!

In [None]:
sentence_transformer_results.keys()

In [None]:
st_generated_readme_results= sentence_transformer_results[('generated_readme', 'all-mpnet-base-v2')].retrieval_results
st_rationale_results = sentence_transformer_results[('generated_rationale', 'all-mpnet-base-v2')].retrieval_results
bm25_generated_readme_results = bm25_results["generated_readme"].retrieval_results
st_context_results = sentence_transformer_results[('generation_context', 'all-mpnet-base-v2')].retrieval_results

In [None]:
len(list(bm25_generated_readme_results.keys()))

In [None]:
len(list(st_generated_readme_results.keys()))

In [None]:
def merge_qrels(qrels1, qrels2):
    """Combine two ``{query_id: {doc_id: score}}`` mappings by summing scores.

    For every query present in either input, the result holds the union of both
    document sets; a document missing on one side contributes 0 from that side.
    Queries keep `qrels1` order, followed by queries found only in `qrels2`.
    Neither input is modified.

    The original raised KeyError when `qrels2` lacked a query of `qrels1` and
    silently dropped queries that appeared only in `qrels2`.
    """
    query_ids = list(qrels1) + [qid for qid in qrels2 if qid not in qrels1]
    merged_qrels = {}
    for qid in query_ids:
        scores1 = qrels1.get(qid, {})
        scores2 = qrels2.get(qid, {})
        merged_qrels[qid] = {
            doc_id: scores1.get(doc_id, 0) + scores2.get(doc_id, 0)
            for doc_id in scores1.keys() | scores2.keys()
        }
    return merged_qrels

In [None]:
st_generation_results = merge_qrels(bm25_generated_readme_results, st_generated_readme_results)

In [None]:
st_generation_results['0']

In [None]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_generation_results, metric="acc", k_values=[1,5,10])

In [None]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_generated_readme_results, metric="acc", k_values=[1,5,10])

In [None]:
EvaluateRetrieval().evaluate_custom(task_qrels, st_rationale_results, metric="acc", k_values=[1,5,10])

In [None]:
all_metrics_df[all_metrics_df["retriever"] == "bm25"][["corpus", "retriever", "Accuracy@10"]].sort_values("Accuracy@10")

In [None]:
Splitting does not make much sense, as most of the generated data is under the sentence-transformer context length (384 tokens).

In [None]:
def split_corpus_by_lengths(corpus, chunk_length):
    """Split every document's text into consecutive chunks of `chunk_length` characters.

    Parameters
    ----------
    corpus : dict mapping document id to a dict with at least a "text" entry.
    chunk_length : positive number of characters per chunk.

    Returns
    -------
    list of corpora; the i-th corpus maps a document id to a copy of the document
    whose "text" is that document's i-th chunk. A document appears in split `i`
    only if it has text there, except that split 0 contains every document (with
    empty text if the document is empty). The list has as many entries as the
    longest document needs, and at least one. The input corpus is not modified.

    Raises
    ------
    ValueError if `chunk_length` is not positive.

    The original body referenced an undefined `n_splits`, overwrote the
    `chunk_length` argument, replaced each split dict with a bare string and
    returned nothing.
    """
    if chunk_length <= 0:
        raise ValueError("chunk_length must be a positive number of characters")

    longest_text = max((len(doc["text"]) for doc in corpus.values()), default=0)
    # Ceiling division; always keep at least one split so split 0 exists.
    n_splits = max(1, -(-longest_text // chunk_length))

    splitted_corpora = [dict() for _ in range(n_splits)]
    for c_id, doc in corpus.items():
        text = doc["text"]
        for i in range(n_splits):
            chunk = text[i * chunk_length:(i + 1) * chunk_length]
            if chunk or i == 0:
                # Copy the document so the other fields (e.g. title) travel with each chunk.
                splitted_corpora[i][c_id] = {**doc, "text": chunk}
    return splitted_corpora
        

In [None]:
class MultiTextEvaluator(BaseModel):
    """
    Evaluate a dataframe that has multiple texts for each query (multiple generation experiments)
    iteration_col says which experiment it was
    """
    # Built-in generics are used because `typing.List` is never imported in this
    # notebook (only `Union` and `Dict` are), which made the class definition fail.
    iteration_col: str
    text_cols: list[str]
    k_values: list[int] = [1,5,10,25]

    def get_ir_datas(self, df):
        """Yield ``(iteration, ir_data)`` for every distinct value of `iteration_col`.

        NOTE(review): `load_ir_data` is not defined in the visible notebook; its
        import near the top (from github_search.ir.evaluate_bm25) is commented out.
        """
        for iteration in df[self.iteration_col].unique():
            iteration_df = df[df[self.iteration_col] == iteration]
            ir_data = load_ir_data(iteration_df, self.text_cols)
            yield (iteration, ir_data)

    def evaluate(self, df, retriever):
        """Score `retriever` separately for each iteration and stack the per-query metrics.

        Returns a dataframe with the concatenated per-iteration scores, an
        `iteration_col` column naming the iteration and a `query` column copied
        from the index.

        NOTE(review): `PerQueryIREvaluator` is not defined or imported in the
        visible notebook - confirm where it comes from.
        """
        ir_datas = dict(self.get_ir_datas(df))
        per_iteration_dfs = []
        for iteration, ir_data in ir_datas.items():
            per_query_evaluator = PerQueryIREvaluator(k_values=self.k_values)
            # Separate name: the original rebound the `df` parameter here.
            scores_df = per_query_evaluator.get_scores(ir_data, retriever)
            scores_df[self.iteration_col] = iteration
            per_iteration_dfs.append(scores_df)
        metrics_df = pd.concat(per_iteration_dfs)
        metrics_df["query"] = metrics_df.index
        return metrics_df