In [1]:
# importing the datset
import pandas as pd
import numpy as np

df=pd.read_csv("Merged_file.csv")
# Prepare inputs
df=df.head( )
queries = df['QueryText'].tolist()
bodies = df['BODY'].tolist()
labels = df['Relevance'].tolist()

In [15]:
from sklearn.metrics import accuracy_score, f1_score
from transformers import pipeline
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch

# --------- 1. Cross-Encoder ---------
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
cross_scores = cross_encoder.predict(list(zip(queries, bodies)))
cross_preds = [1 if score > 0.5 else 0 for score in cross_scores]
cross_acc = accuracy_score(labels, cross_preds)
cross_f1 = f1_score(labels, cross_preds)

# Group the necessary data by 'QueryID'
df["CrossScores"] = cross_scores
grouped = df.groupby("QueryID").agg({
    "DocumentID": list,
    "CrossScores": list,
    "Relevance": list
}).reset_index()

# Construct the data dictionary
data = {
    "query_id": grouped["QueryID"].tolist(),
    "doc_id": grouped["DocumentID"].tolist(),
    "score": grouped["CrossScores"].tolist(),
    "relevance": grouped["Relevance"].tolist()
}

# Optional: Convert to DataFrame if needed
grouped_df = pd.DataFrame(data)

# Display the resulting grouped DataFrame
from IPython.display import display
display(grouped_df)


# Obtaining MAP scores and others
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from IPython.display import display

# Assuming grouped_df is already defined with columns: query_id, doc_id, score, relevance

map_scores = []
ndcg_scores = []
p_at_5 = []
p_at_10 = []

def dcg_at_k(relevances, k):
    relevances = np.asarray(relevances, dtype=float)[:k]
    if relevances.size:
        return np.sum(relevances / np.log2(np.arange(2, relevances.size + 2)))
    return 0.0

def ndcg_at_k(y_true, y_score, k):
    order = np.argsort(y_score)[::-1]
    y_true_sorted = np.take(y_true, order)
    dcg_max = dcg_at_k(sorted(y_true, reverse=True), k)
    if not dcg_max:
        return 0.0
    return dcg_at_k(y_true_sorted, k) / dcg_max

for i, row in grouped_df.iterrows():
    y_true = row['relevance']
    y_scores = row['score']

    # MAP
    ap = average_precision_score(y_true, y_scores)
    map_scores.append(ap)

    # nDCG@10
    ndcg = ndcg_at_k(y_true, y_scores, k=10)
    ndcg_scores.append(ndcg)

    # Precision@5
    order = np.argsort(y_scores)[::-1]
    top5_relevance = np.take(y_true, order)[:5]
    p5 = np.sum(top5_relevance) / 5.0
    p_at_5.append(p5)

    # Precision@10
    top10_relevance = np.take(y_true, order)[:10]
    p10 = np.sum(top10_relevance) / 10.0
    p_at_10.append(p10)

# Aggregate results per query
results = pd.DataFrame({
    "Query ID": grouped_df['query_id'],
    "MAP": map_scores,
    "nDCG@10": ndcg_scores,
    "P@5": p_at_5,
    "P@10": p_at_10
})

display(results)

# Print average (mean) of each metric across all queries
print("Mean MAP      :", round(np.mean(map_scores), 4))
print("Mean nDCG@10  :", round(np.mean(ndcg_scores), 4))
print("Mean P@5      :", round(np.mean(p_at_5), 4))
print("Mean P@10     :", round(np.mean(p_at_10), 4))


Unnamed: 0,query_id,doc_id,score,relevance
0,1,"[4, 11, 339, 1378, 1861, 2034, 2371, 2379, 262...","[-10.408384323120117, -3.725282669067383, -8.1...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,2,"[54, 538, 606, 1704, 2064, 2980, 3336, 3938, 4...","[-5.207036972045898, -5.8463592529296875, -9.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,"[350, 1160, 1275, 1523, 1660, 1713, 1767, 1789...","[-5.147612571716309, -6.763217449188232, -10.2...","[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,"[143, 715, 2356, 3829, 3941, 5037, 5341, 5764,...","[-4.711159706115723, -8.35203742980957, -3.608...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,"[227, 546, 1087, 1712, 1925, 2108, 2371, 2390,...","[-6.6331634521484375, -3.4871554374694824, -9....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
5,7,"[62, 133, 361, 446, 721, 823, 1065, 1410, 1874...","[-10.048223495483398, -10.793760299682617, -10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,11,"[321, 801, 1122, 1430, 1712, 1753, 2293, 2484,...","[-4.5121870040893555, -7.635186195373535, -3.3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,12,"[134, 1224, 1868, 2371, 3475, 3511, 3987, 4306...","[-0.05653083324432373, -9.201903343200684, -10...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
8,13,"[372, 1101, 1564, 1787, 2144, 2343, 2512, 3167...","[-5.852274417877197, -6.481305122375488, -2.77...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ..."
9,14,"[77, 363, 446, 813, 1065, 1308, 1704, 2474, 25...","[-10.166321754455566, -10.619680404663086, -11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


Unnamed: 0,Query ID,MAP,nDCG@10,P@5,P@10
0,1,0.216178,0.44222,0.4,0.4
1,2,0.096531,0.138862,0.2,0.1
2,3,0.145468,0.334051,0.6,0.3
3,4,0.042441,0.0,0.0,0.0
4,5,0.328369,0.466277,0.6,0.4
5,7,0.566189,0.650513,0.6,0.6
6,11,0.093907,0.110046,0.2,0.1
7,12,0.33784,0.478902,0.6,0.4
8,13,0.222395,0.428385,0.4,0.3
9,14,0.077438,0.0,0.0,0.0


Mean MAP      : 0.2461
Mean nDCG@10  : 0.3218
Mean P@5      : 0.35
Mean P@10     : 0.29


In [16]:
# --------- 2. Bi-Encoder ---------
bi_encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
query_embs = bi_encoder.encode(queries, convert_to_tensor=True)
body_embs = bi_encoder.encode(bodies, convert_to_tensor=True)
bi_scores = util.cos_sim(query_embs, body_embs).diagonal().cpu().numpy()
bi_preds = [1 if score > 0.5 else 0 for score in bi_scores]
bi_acc = accuracy_score(labels, bi_preds)
bi_f1 = f1_score(labels, bi_preds)

# Group the necessary data by 'QueryID'
df["BiScores"] = bi_scores
grouped = df.groupby("QueryID").agg({
    "DocumentID": list,
    "BiScores": list,
    "Relevance": list
}).reset_index()

# Construct the data dictionary
data = {
    "query_id": grouped["QueryID"].tolist(),
    "doc_id": grouped["DocumentID"].tolist(),
    "score": grouped["BiScores"].tolist(),
    "relevance": grouped["Relevance"].tolist()
}

# Optional: Convert to DataFrame if needed
grouped_df = pd.DataFrame(data)

# Display the resulting grouped DataFrame
from IPython.display import display
display(grouped_df)


# Obtaining MAP scores and others
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from IPython.display import display

# Assuming grouped_df is already defined with columns: query_id, doc_id, score, relevance

map_scores = []
ndcg_scores = []
p_at_5 = []
p_at_10 = []

def dcg_at_k(relevances, k):
    relevances = np.asarray(relevances, dtype=float)[:k]
    if relevances.size:
        return np.sum(relevances / np.log2(np.arange(2, relevances.size + 2)))
    return 0.0

def ndcg_at_k(y_true, y_score, k):
    order = np.argsort(y_score)[::-1]
    y_true_sorted = np.take(y_true, order)
    dcg_max = dcg_at_k(sorted(y_true, reverse=True), k)
    if not dcg_max:
        return 0.0
    return dcg_at_k(y_true_sorted, k) / dcg_max

for i, row in grouped_df.iterrows():
    y_true = row['relevance']
    y_scores = row['score']

    # MAP
    ap = average_precision_score(y_true, y_scores)
    map_scores.append(ap)

    # nDCG@10
    ndcg = ndcg_at_k(y_true, y_scores, k=10)
    ndcg_scores.append(ndcg)

    # Precision@5
    order = np.argsort(y_scores)[::-1]
    top5_relevance = np.take(y_true, order)[:5]
    p5 = np.sum(top5_relevance) / 5.0
    p_at_5.append(p5)

    # Precision@10
    top10_relevance = np.take(y_true, order)[:10]
    p10 = np.sum(top10_relevance) / 10.0
    p_at_10.append(p10)

# Aggregate results per query
results = pd.DataFrame({
    "Query ID": grouped_df['query_id'],
    "MAP": map_scores,
    "nDCG@10": ndcg_scores,
    "P@5": p_at_5,
    "P@10": p_at_10
})

display(results)

# Print average (mean) of each metric across all queries
print("Mean MAP      :", round(np.mean(map_scores), 4))
print("Mean nDCG@10  :", round(np.mean(ndcg_scores), 4))
print("Mean P@5      :", round(np.mean(p_at_5), 4))
print("Mean P@10     :", round(np.mean(p_at_10), 4))


Unnamed: 0,query_id,doc_id,score,relevance
0,1,"[4, 11, 339, 1378, 1861, 2034, 2371, 2379, 262...","[0.13010098040103912, 0.386648952960968, 0.426...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,2,"[54, 538, 606, 1704, 2064, 2980, 3336, 3938, 4...","[0.636265218257904, 0.5802971124649048, 0.3900...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,"[350, 1160, 1275, 1523, 1660, 1713, 1767, 1789...","[0.5482257604598999, 0.6178483963012695, 0.477...","[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,"[143, 715, 2356, 3829, 3941, 5037, 5341, 5764,...","[0.4568575620651245, 0.2575801610946655, 0.517...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,"[227, 546, 1087, 1712, 1925, 2108, 2371, 2390,...","[0.4807475805282593, 0.5329251289367676, 0.468...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
5,7,"[62, 133, 361, 446, 721, 823, 1065, 1410, 1874...","[0.1140979528427124, 0.08277460932731628, 0.18...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,11,"[321, 801, 1122, 1430, 1712, 1753, 2293, 2484,...","[0.517747163772583, 0.5531913042068481, 0.4371...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,12,"[134, 1224, 1868, 2371, 3475, 3511, 3987, 4306...","[0.4933459758758545, 0.4012191891670227, 0.327...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
8,13,"[372, 1101, 1564, 1787, 2144, 2343, 2512, 3167...","[0.36009812355041504, 0.3587648868560791, 0.40...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ..."
9,14,"[77, 363, 446, 813, 1065, 1308, 1704, 2474, 25...","[0.07601582258939743, 0.27261608839035034, 0.2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


Unnamed: 0,Query ID,MAP,nDCG@10,P@5,P@10
0,1,0.319322,0.639944,0.8,0.6
1,2,0.110518,0.220092,0.2,0.1
2,3,0.269858,0.5795,0.8,0.5
3,4,0.346618,0.493679,0.6,0.4
4,5,0.425064,0.476071,0.6,0.5
5,7,0.461874,0.533075,0.6,0.5
6,11,0.403354,0.416567,0.2,0.5
7,12,0.087241,0.073364,0.0,0.1
8,13,0.480489,0.790951,1.0,0.7
9,14,0.662338,0.671938,0.8,0.6


Mean MAP      : 0.3514
Mean nDCG@10  : 0.4775
Mean P@5      : 0.54
Mean P@10     : 0.43


In [10]:
from sklearn.metrics import accuracy_score, f1_score
from transformers import pipeline
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch

# --------- 3. T5 Prompt-Based ---------
# Initialize the T5 pipeline
t5_pipe = pipeline("text2text-generation", model="t5-base", tokenizer="t5-base", device=0 if torch.cuda.is_available() else -1)
# Create input prompts
t5_inputs = [f'Is the query "{q}" relevant to the passage: "{b}"? Answer yes or no' for q, b in zip(queries, bodies)]
# Run T5 model to get outputs
t5_outputs = t5_pipe(t5_inputs, max_length=10, truncation=True)
# Score mapping and prediction
score_mapping = {"yes": 1.0, "no": 0.0, "somewhat": 0.5}
t5_preds = []
for out in t5_outputs:
    response = out['generated_text'].strip().lower()
    score = score_mapping.get(response, 0.0)  # default to 0.0 if unexpected output
    prediction = 1 if (score == 1.0 or score > 0.5) else 0
    t5_preds.append(prediction)

# Group the necessary data by 'QueryID'
df["T5Scores"] = score
grouped = df.groupby("QueryID").agg({
    "DocumentID": list,
    "T5Scores": list,
    "Relevance": list
}).reset_index()

# Construct the data dictionary
data = {
    "query_id": grouped["QueryID"].tolist(),
    "doc_id": grouped["DocumentID"].tolist(),
    "score": grouped["T5Scores"].tolist(),
    "relevance": grouped["Relevance"].tolist()
}

# Optional: Convert to DataFrame if needed
grouped_df = pd.DataFrame(data)

# Display the resulting grouped DataFrame
from IPython.display import display
display(grouped_df)


# Obtaining MAP scores and others
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from IPython.display import display

# Assuming grouped_df is already defined with columns: query_id, doc_id, score, relevance

map_scores = []
ndcg_scores = []
p_at_5 = []
p_at_10 = []

def dcg_at_k(relevances, k):
    relevances = np.asarray(relevances, dtype=float)[:k]
    if relevances.size:
        return np.sum(relevances / np.log2(np.arange(2, relevances.size + 2)))
    return 0.0

def ndcg_at_k(y_true, y_score, k):
    order = np.argsort(y_score)[::-1]
    y_true_sorted = np.take(y_true, order)
    dcg_max = dcg_at_k(sorted(y_true, reverse=True), k)
    if not dcg_max:
        return 0.0
    return dcg_at_k(y_true_sorted, k) / dcg_max

for i, row in grouped_df.iterrows():
    y_true = row['relevance']
    y_scores = row['score']

    # MAP
    ap = average_precision_score(y_true, y_scores)
    map_scores.append(ap)

    # nDCG@10
    ndcg = ndcg_at_k(y_true, y_scores, k=10)
    ndcg_scores.append(ndcg)

    # Precision@5
    order = np.argsort(y_scores)[::-1]
    top5_relevance = np.take(y_true, order)[:5]
    p5 = np.sum(top5_relevance) / 5.0
    p_at_5.append(p5)

    # Precision@10
    top10_relevance = np.take(y_true, order)[:10]
    p10 = np.sum(top10_relevance) / 10.0
    p_at_10.append(p10)

# Aggregate results per query
results = pd.DataFrame({
    "Query ID": grouped_df['query_id'],
    "MAP": map_scores,
    "nDCG@10": ndcg_scores,
    "P@5": p_at_5,
    "P@10": p_at_10
})

display(results)

# Print average (mean) of each metric across all queries
print("Mean MAP      :", round(np.mean(map_scores), 4))
print("Mean nDCG@10  :", round(np.mean(ndcg_scores), 4))
print("Mean P@5      :", round(np.mean(p_at_5), 4))
print("Mean P@10     :", round(np.mean(p_at_10), 4))



Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Both `max_new_tokens` (=256) and `max_length`(=10) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=10) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=10) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=10) seem to have been set. `max_new_tokens`

Unnamed: 0,query_id,doc_id,score,relevance
0,1,"[4, 11, 339, 1378, 1861]","[0.0, 0.0, 0.0, 0.0, 0.0]","[0, 0, 0, 0, 1]"


Unnamed: 0,Query ID,MAP,nDCG@10,P@5,P@10
0,1,0.2,1.0,0.2,0.1


Mean MAP      : 0.2
Mean nDCG@10  : 1.0
Mean P@5      : 0.2
Mean P@10     : 0.1
