In [1]:
from coir.data_loader import get_tasks
from coir.evaluation import COIR
from coir.models import YourCustomDEModel
from coir.beir.retrieval.train import TrainRetriever
import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets
from collections import defaultdict

import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CosineSimilarityLoss
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
import logging

In [2]:
# CoSQA queries + corpus, and the matching relevance judgements (qrels).
queries_corpus_dataset = load_dataset("CoIR-Retrieval/cosqa-queries-corpus")
qrels_dataset = load_dataset("CoIR-Retrieval/cosqa-qrels")

# CSV of cleaned Granite explanations (going by the file name).
cosqa_granite_updated_path = '/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CS696DS-Oracle-Retrieving-Code-Explanations/Explanation_Generation/Cosqa/postprocessing/output/COSQA_granite_explanations_clean.csv'
df = pd.read_csv(cosqa_granite_updated_path)


In [3]:
# Relevance judgements, one Dataset per split.
qrels_data_train = qrels_dataset['train']
qrels_data_valid = qrels_dataset['valid']
qrels_data_test = qrels_dataset['test']
qrels_df_test = qrels_data_test.to_pandas()

# All qrels rows stacked in train -> valid -> test order.
qrels_data = concatenate_datasets([qrels_data_train, qrels_data_valid, qrels_data_test])

# Queries and corpus come from the same dataset dict.
query_data = queries_corpus_dataset['queries']
corpus_data = queries_corpus_dataset['corpus']

In [4]:
# Query ids listed in the test qrels split; the bare expression displays their count.
test_query_ids = qrels_df_test['query_id'].tolist()
len(test_query_ids)

500

In [5]:
# Display the corpus Dataset summary (feature names and row count).
corpus_data

Dataset({
    features: ['_id', 'partition', 'text', 'title', 'language', 'meta_information'],
    num_rows: 20604
})

In [6]:
# Display the query Dataset summary (feature names and row count).
query_data

Dataset({
    features: ['_id', 'partition', 'text', 'title', 'language', 'meta_information'],
    num_rows: 20604
})

In [7]:
# Display the concatenated qrels Dataset summary (feature names and row count).
qrels_data

Dataset({
    features: ['query_id', 'corpus_id', 'score'],
    num_rows: 20604
})

In [8]:
# Convert each Dataset to a pandas DataFrame for the merges that follow.
corpus_df = corpus_data.to_pandas()
query_df = query_data.to_pandas()
# The `score == 1` filter is currently disabled, so every qrels row is kept.
qrels_df = qrels_data.to_pandas()

In [9]:
# Prefix the shared column names so queries and corpus can sit side by side in one frame.
query_columns = {
    "_id": "query_id",
    "partition": "query_partition",
    "text": "query_text",
    "title": "query_title",
    "language": "query_language",
    "meta_information": "query_meta_information",
}
corpus_columns = {
    "_id": "corpus_id",
    "partition": "corpus_partition",
    "text": "corpus_text",
    "title": "corpus_title",
    "language": "corpus_language",
    "meta_information": "corpus_meta_information",
}
query_df = query_df.rename(columns=query_columns)
corpus_df = corpus_df.rename(columns=corpus_columns)

# Left-join qrels with their query and document rows, then order by query id.
merged = (
    qrels_df
    .merge(query_df[list(query_columns.values())], on="query_id", how="left")
    .merge(corpus_df[list(corpus_columns.values())], on="corpus_id", how="left")
    .sort_values(by="query_id", ascending=True)
)

In [10]:
# One row per distinct corpus text, keeping the last occurrence in `merged`.
unique_text_rows = merged.drop_duplicates(subset=["corpus_text"], keep="last")

# corpus_text -> corpus_id of that last occurrence.
last_corpus_ids = unique_text_rows.set_index("corpus_text")["corpus_id"].to_dict()

# Give documents with identical text the same id, then renumber the rows.
merged["corpus_id"] = merged["corpus_text"].map(last_corpus_ids)
merged = merged.reset_index(drop=True)

In [11]:
# Map the merge-time column names back to the original dataset schema.
query_to_plain = {
    "query_id": "_id",
    "query_partition": "partition",
    "query_text": "text",
    "query_title": "title",
    "query_language": "language",
    "query_meta_information": "meta_information",
}
corpus_to_plain = {
    "corpus_id": "_id",
    "corpus_partition": "partition",
    "corpus_text": "text",
    "corpus_title": "title",
    "corpus_language": "language",
    "corpus_meta_information": "meta_information",
}

# Query data: one row per row of `merged`.
query_df = merged[list(query_to_plain)].rename(columns=query_to_plain)
query_data = Dataset.from_pandas(query_df.reset_index(drop=True))

# Corpus data: the Dataset is built *before* the de-duplication at the end of this cell.
corpus_df = merged[list(corpus_to_plain)].rename(columns=corpus_to_plain)
corpus_data = Dataset.from_pandas(corpus_df.reset_index(drop=True))

# Qrels data: unique (query, document, score) triples, plus the test-only subset.
qrels_df = merged[["query_id", "corpus_id", "score"]].drop_duplicates()
in_test_split = qrels_df["query_id"].isin(test_query_ids)
qrels_df_test = qrels_df[in_test_split]
qrels_data = Dataset.from_pandas(qrels_df.reset_index(drop=True))
qrels_data_test = Dataset.from_pandas(qrels_df_test.reset_index(drop=True))

# Keep a single row per document id in the DataFrame used from here on.
corpus_df = corpus_df.drop_duplicates(subset="_id", keep="first").reset_index(drop=True)


In [12]:
# Display the rebuilt query Dataset summary (feature names and row count).
query_data

Dataset({
    features: ['_id', 'partition', 'text', 'title', 'language', 'meta_information'],
    num_rows: 20604
})

In [13]:
# Print the column index of each frame: corpus, queries, then qrels.
for frame in (corpus_df, query_df, qrels_df):
    print(frame.columns)

Index(['_id', 'partition', 'text', 'title', 'language', 'meta_information'], dtype='object')
Index(['_id', 'partition', 'text', 'title', 'language', 'meta_information'], dtype='object')
Index(['query_id', 'corpus_id', 'score'], dtype='object')


In [14]:
# Requires corpus_df, query_df and qrels_df from the cells above.

# 1) Split each DataFrame on its `partition` column
corpus_partition = corpus_df['partition']
train_corpus_df = corpus_df.loc[corpus_partition.eq('train')]
dev_corpus_df = corpus_df.loc[corpus_partition.eq('valid')]
test_corpus_df = corpus_df.loc[corpus_partition.eq('test')]

query_partition = query_df['partition']
train_query_df = query_df.loc[query_partition.eq('train')]
dev_query_df = query_df.loc[query_partition.eq('valid')]
test_query_df = query_df.loc[query_partition.eq('test')]

# Qrels carry no partition column; each split takes the rows whose query is in that split.
qrels_query_ids = qrels_df['query_id']
train_qrels_df = qrels_df.loc[qrels_query_ids.isin(train_query_df['_id'])]
dev_qrels_df = qrels_df.loc[qrels_query_ids.isin(dev_query_df['_id'])]
test_qrels_df = qrels_df.loc[qrels_query_ids.isin(test_query_df['_id'])]


# 2) Build dicts for each split

def make_corpus(df):
    """Build ``{_id: {'title': ..., 'text': ...}}`` from a corpus DataFrame.

    Only the ``title`` and ``text`` columns are kept, and missing values in
    them are replaced by empty strings.
    """
    docs = df.set_index('_id')
    docs = docs[['title', 'text']].fillna('')
    return docs.to_dict(orient='index')

def make_queries(df):
    """Build a ``{_id: text}`` mapping from a query DataFrame."""
    text_by_id = df.set_index('_id')['text']
    return text_by_id.to_dict()

def make_qrels(df):
    """Build ``{query_id: {corpus_id: score}}`` from a qrels DataFrame.

    Args:
        df: DataFrame with ``query_id``, ``corpus_id`` and ``score`` columns.

    Returns:
        A nested dict with one inner dict per query id. An empty ``df``
        yields ``{}``.
    """
    # Iterate the groups directly instead of `groupby(...).apply(...).to_dict()`:
    # with no groups, `apply` returns an empty DataFrame and `.to_dict()` would
    # then produce the column names as keys rather than an empty mapping.
    return {
        query_id: dict(zip(group['corpus_id'], group['score']))
        for query_id, group in df.groupby('query_id')
    }

# Per-split lookup dicts: corpus, queries and qrels for train / dev / test.
train_corpus, train_queries, train_qrels = (
    make_corpus(train_corpus_df),
    make_queries(train_query_df),
    make_qrels(train_qrels_df),
)
dev_corpus, dev_queries, dev_qrels = (
    make_corpus(dev_corpus_df),
    make_queries(dev_query_df),
    make_qrels(dev_qrels_df),
)
test_corpus, test_queries, test_qrels = (
    make_corpus(test_corpus_df),
    make_queries(test_query_df),
    make_qrels(test_qrels_df),
)

# The full corpus dict, built once from the unsplit corpus frame.
full_corpus = make_corpus(corpus_df)



In [15]:
# Add explanations: replace each document's text with its generated explanation.

expl_df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CS696DS-Oracle-Retrieving-Code-Explanations/Explanation_Generation/Cosqa/postprocessing/output/COSQA_granite_explanations_clean.csv")
expl_df = expl_df.rename(columns={"query_id": "query-id", "corpus_id": "corpus-id"})

# Every corpus dict that should receive the explanation text.
corpora_to_update = (full_corpus, train_corpus, dev_corpus, test_corpus)

for corpus_id, explanation in zip(expl_df['corpus-id'], expl_df['explanation_granite_1_cleaned']):
    # An empty CSV cell is read as float NaN, which is truthy and has no
    # `.strip()`; the type check skips it instead of raising AttributeError.
    if not isinstance(explanation, str) or not explanation.strip():
        continue
    for corpus in corpora_to_update:
        if corpus_id in corpus:
            corpus[corpus_id]['text'] = explanation


def _drop_blank_docs(corpus):
    """Return the entries of ``corpus`` whose ``text`` is present and not whitespace-only."""
    return {doc_id: doc for doc_id, doc in corpus.items() if 'text' in doc and doc['text'].strip()}


full_corpus = _drop_blank_docs(full_corpus)
train_corpus = _drop_blank_docs(train_corpus)
dev_corpus = _drop_blank_docs(dev_corpus)
test_corpus = _drop_blank_docs(test_corpus)

In [16]:
# Directory the fine-tuned encoder is written to and reloaded from.
finetuned_dir = "outputs/encoder_finetuned"

# 1) Base encoder wrapped in the retriever trainer
model_name = "intfloat/e5-base-v2"
model = SentenceTransformer(model_name)
tr = TrainRetriever(model, batch_size=32)

# 2) Training examples are drawn from the full corpus with the train queries/qrels
train_examples = tr.load_train(full_corpus, train_queries, train_qrels)
train_dataloader = tr.prepare_train(train_examples, shuffle=True)

# 3) Dev evaluator retrieves over the full corpus, not only the dev documents
dev_evaluator = tr.load_ir_evaluator(
    full_corpus,
    dev_queries,
    dev_qrels,
    max_corpus_size=None,
    name="dev",
)

# 4) Fine-tune with a cosine-similarity objective
train_objectives = [(train_dataloader, CosineSimilarityLoss(model))]
tr.fit(
    train_objectives=train_objectives,
    evaluator=dev_evaluator,
    epochs=3,
    evaluation_steps=500,
    warmup_steps=200,
    output_path=finetuned_dir,
    weight_decay=0.01,
    optimizer_params={'lr': 2e-5},
    save_best_model=True,
)

# 5) Reload the encoder saved under the output path.
# NOTE(review): `best_model` is not referenced by the visible evaluation cell - confirm it is used.
best_model = SentenceTransformer(finetuned_dir)


Scoring modified


Adding Input Examples:   0%|          | 0/613 [00:00<?, ?it/s]



Step,Training Loss,Validation Loss,Dev Cosine Accuracy@1,Dev Cosine Accuracy@3,Dev Cosine Accuracy@5,Dev Cosine Accuracy@10,Dev Cosine Precision@1,Dev Cosine Precision@3,Dev Cosine Precision@5,Dev Cosine Precision@10,Dev Cosine Recall@1,Dev Cosine Recall@3,Dev Cosine Recall@5,Dev Cosine Recall@10,Dev Cosine Ndcg@10,Dev Cosine Mrr@10,Dev Cosine Map@100,Dev Dot Accuracy@1,Dev Dot Accuracy@3,Dev Dot Accuracy@5,Dev Dot Accuracy@10,Dev Dot Precision@1,Dev Dot Precision@3,Dev Dot Precision@5,Dev Dot Precision@10,Dev Dot Recall@1,Dev Dot Recall@3,Dev Dot Recall@5,Dev Dot Recall@10,Dev Dot Ndcg@10,Dev Dot Mrr@10,Dev Dot Map@100
500,0.2641,No log,0.01,0.032,0.052,0.084,0.01,0.010667,0.0104,0.0084,0.01,0.032,0.052,0.084,0.041194,0.028168,0.03599,0.01,0.032,0.052,0.084,0.01,0.010667,0.0104,0.0084,0.01,0.032,0.052,0.084,0.041194,0.028168,0.03599
613,0.2641,No log,0.026,0.068,0.084,0.118,0.026,0.022667,0.0168,0.0118,0.026,0.068,0.084,0.118,0.068021,0.052616,0.061871,0.026,0.068,0.084,0.118,0.026,0.022667,0.0168,0.0118,0.026,0.068,0.084,0.118,0.068021,0.052616,0.061871
1000,0.2447,No log,0.106,0.2,0.254,0.33,0.106,0.066667,0.0508,0.033,0.106,0.2,0.254,0.33,0.206437,0.168199,0.17945,0.106,0.2,0.254,0.33,0.106,0.066667,0.0508,0.033,0.106,0.2,0.254,0.33,0.206437,0.168199,0.17945
1226,0.2447,No log,0.088,0.18,0.224,0.316,0.088,0.06,0.0448,0.0316,0.088,0.18,0.224,0.316,0.190364,0.151734,0.161469,0.088,0.18,0.224,0.316,0.088,0.06,0.0448,0.0316,0.088,0.18,0.224,0.316,0.190364,0.151734,0.161469
1500,0.2362,No log,0.128,0.238,0.288,0.36,0.128,0.079333,0.0576,0.036,0.128,0.238,0.288,0.36,0.235365,0.196316,0.207344,0.128,0.238,0.288,0.36,0.128,0.079333,0.0576,0.036,0.128,0.238,0.288,0.36,0.235365,0.196316,0.207344
1839,0.2362,No log,0.124,0.248,0.304,0.378,0.124,0.082667,0.0608,0.0378,0.124,0.248,0.304,0.378,0.244841,0.202894,0.213403,0.124,0.248,0.304,0.378,0.124,0.082667,0.0608,0.0378,0.124,0.248,0.304,0.378,0.244841,0.202894,0.213403


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [19]:
# Evaluate on the test queries, retrieving over the full corpus.
tasks = {"cosqa": (full_corpus, test_queries, test_qrels)}
evaluation = COIR(tasks=tasks, batch_size=256)

# Components of the results folder path.
dataset_name = "cosqa/trained_encoders"
llm_name = "granite1"
retrieval_name = "dres"
model_name = "intfloat/e5-base-v2"

# NOTE(review): `loacl_path` (sic) is assigned but never passed to the model, and the
# captured output reads "loaded from hub". The metrics below therefore appear to be for
# the base encoder rather than the fine-tuned checkpoint - confirm how YourCustomDEModel
# is meant to load local weights.
loacl_path = "/work/pi_wenlongzhao_umass_edu/27/atifabedeen/pipeline/coir_main/outputs/encoder_finetuned"
model = YourCustomDEModel(model_name=model_name)

output_folder = f"results/{dataset_name}/{llm_name}/{retrieval_name}/{model_name}"
results = evaluation.run(model, output_folder=output_folder)
print(results)

YourCustomDEModel init → loaded from hub


Encoding batches:   0%|          | 0/2 [00:00<?, ?batch/s]

Encoding batches:   0%|          | 0/25 [00:00<?, ?batch/s]

{'cosqa': {'NDCG': {'NDCG@1': 0.348, 'NDCG@3': 0.46424, 'NDCG@5': 0.50193, 'NDCG@10': 0.53837, 'NDCG@100': 0.58366, 'NDCG@1000': 0.58867}, 'MAP': {'MAP@1': 0.348, 'MAP@3': 0.43533, 'MAP@5': 0.45613, 'MAP@10': 0.4713, 'MAP@100': 0.48119, 'MAP@1000': 0.4814}, 'Recall': {'Recall@1': 0.348, 'Recall@3': 0.548, 'Recall@5': 0.64, 'Recall@10': 0.752, 'Recall@100': 0.958, 'Recall@1000': 0.996}, 'Precision': {'P@1': 0.348, 'P@3': 0.18267, 'P@5': 0.128, 'P@10': 0.0752, 'P@100': 0.00958, 'P@1000': 0.001}}}
