## Best Weights

- Goal: Manual Search
    1. Initialize range for alpha (0 to 1)
    2. Combine sparse and dense scores for each combination of alpha (dense) and 1 - alpha (sparse)
    3. Use evaluation metric (MRR) to find best combination of weights

- Evaluation metric: NDCG (Normalized Discounted Cumulative Gain)
    - Prioritizes top-ranked documents by penalizing rankings that place relevant documents lower down in the ranking list

Questions:

- for both splade and bm25, since qrels only provides us with 1 confirmed relevant document, should we use mrr to as an evaluation metric since it only looks at the first relevant document?
    - use mrr

- how to handle weighting when one model ranks a document that the other does not rank at all? 
    - the second model gives it a zero

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score
import os

Load Data

In [62]:
data_dir = "/Users/hannahzhang/Desktop/Github Repos/ERSP-TeamYang/data/"

data = []

for file in os.listdir(data_dir):
    if file.endswith(".tsv") or file.endswith(".trec"):
        data.append(data_dir + file)

# print(data)


Splade dataframe

In [4]:
splade_df = pd.read_csv(data[3], sep="\t", names=['Query ID', 'Q0', 'Document ID', 'Rank', 'Score', 'R0'])
splade_df = splade_df.drop(splade_df.columns[[1,5]], axis=1)
print(splade_df.head)

<bound method NDFrame.head of          Query ID  Document ID  Rank   Score
0         1048585      7187155     0  104472
1         1048585      7187160     1  100811
2         1048585      7187157     2   99206
3         1048585      7187158     3   98698
4         1048585      3100835     4   86255
...           ...          ...   ...     ...
6979995   1048565      4838288   995   66246
6979996   1048565      2133477   996   66245
6979997   1048565      5753707   997   66239
6979998   1048565      1472257   998   66238
6979999   1048565      5637117   999   66238

[6980000 rows x 4 columns]>


In [7]:
dense_df = pd.read_csv(data[1], sep="\t", names=['Query ID', 'Document ID', 'Score'])
print(dense_df.head)

<bound method NDFrame.head of          Query ID  Document ID     Score
0         1048585      7187157  0.866932
1         1048585      7187158  0.863535
2         1048585      7187155  0.861530
3         1048585      7187160  0.858853
4         1048585      7187163  0.840336
...           ...          ...       ...
6979995   1048565      4529995  0.705006
6979996   1048565      8496497  0.704949
6979997   1048565      5713758  0.699297
6979998   1048565      1778769  0.695161
6979999   1048565      5713765  0.689829

[6980000 rows x 3 columns]>


In [8]:
qrels_df = pd.read_csv(data[2], sep="\t", names=['Query ID', '0', 'Document ID', "Relevance"])
qrels_df = qrels_df.drop(columns=['0'])

qrels_df.head

<bound method NDFrame.head of        Query ID  Document ID  Relevance
0       1102432      2026790          1
1       1102431      7066866          1
2       1102431      7066867          1
3       1090282      7066900          1
4         39449      7066905          1
...         ...          ...        ...
59268    150337      8009410          1
59269     22241      8009429          1
59270    129177      8009442          1
59271    190655      3576091          1
59272    371455      8009476          1

[59273 rows x 3 columns]>

In [11]:
dense_query_ids = []
sparse_query_ids = []


for x in dense_df['Query ID'].unique():
    dense_query_ids.append(int(x))

for x in splade_df['Query ID'].unique():
    sparse_query_ids.append(int(x))

query_ids = list(set(dense_query_ids) & set(sparse_query_ids))

print(len(query_ids))


6980


In [13]:
alpha_values = np.arange(0, 1.01, 0.01)
print(alpha_values)

[0.   0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1  0.11 0.12 0.13
 0.14 0.15 0.16 0.17 0.18 0.19 0.2  0.21 0.22 0.23 0.24 0.25 0.26 0.27
 0.28 0.29 0.3  0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.4  0.41
 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 0.5  0.51 0.52 0.53 0.54 0.55
 0.56 0.57 0.58 0.59 0.6  0.61 0.62 0.63 0.64 0.65 0.66 0.67 0.68 0.69
 0.7  0.71 0.72 0.73 0.74 0.75 0.76 0.77 0.78 0.79 0.8  0.81 0.82 0.83
 0.84 0.85 0.86 0.87 0.88 0.89 0.9  0.91 0.92 0.93 0.94 0.95 0.96 0.97
 0.98 0.99 1.  ]


In [16]:
def best_weights(dense_df, sparse_df, query_id, alpha_values, qrels_df):
    best_alpha = 0
    best_mrr = 0

    for alpha in alpha_values:
        # Filter by query ID
        filtered_dense_df = dense_df[dense_df["Query ID"] == query_id].copy()
        filtered_sparse_df = sparse_df[sparse_df["Query ID"] == query_id].copy()

        # Find weighted scores
        filtered_dense_df["Score"] *= (1 - alpha)
        filtered_sparse_df["Score"] *= alpha

        # Merge rankings
        merged = filtered_dense_df.merge(filtered_sparse_df, on="Document ID", how="outer", suffixes=("_dense", "_sparse")).fillna(0)
        merged["Final Score"] = merged["Score_dense"] + merged["Score_sparse"]

        # Rank documents
        ranked_results = merged.sort_values("Final Score", ascending=False)
        ranked_docs = ranked_results["Document ID"].tolist()

        # MRR
        relevant_doc = qrels_df[qrels_df["Query ID"] == query_id]["Document ID"].iloc[0]
        if relevant_doc not in ranked_docs:
            mrr_score = 0
        else:
            rank = ranked_docs.index(relevant_doc) + 1
            mrr_score = 1 / rank

        # Update alpha and MRR
        if mrr_score > best_mrr:
            best_mrr = mrr_score
            best_alpha = alpha

    return best_alpha, best_mrr


In [17]:
best_alphas = []
best_mrrs = []

for query in query_ids:
    best_alpha, best_mrr = best_weights(dense_df, splade_df, query, alpha_values, qrels_df)
    # print(f"Query {query}: Best alpha: {best_alpha}, Best MRR: {best_mrr}")
    best_alphas.append(best_alpha)
    best_mrrs.append(best_mrr)

best_weights = {'Query ID': query_ids, 'Best Alpha': best_alphas, 'MRR': best_mrrs}


In [31]:
unique_best_alphas = list(dict.fromkeys(best_alphas))

In [32]:
for i in unique_best_alphas:
    print(float(i))

0.0
0.01
0.02
0.05
0.04
1.0
0.09
0.03
0.22
0.15
0.1
0.07
0.08
0.06
0.3
0.11
0.13
0.17
0.23
0.47000000000000003
0.14
0.16
0.29
0.12
0.18


In [51]:
import csv

with open("best_weights.csv", "w") as outfile:
	writer = csv.writer(outfile)
	
	# convert dict keys to a list
	key_list = list(best_weights.keys())
	
	writer.writerow(best_weights.keys())
	
	# iterate each column and assign corresponding values to each column
	for i in range(6980):
		writer.writerow([best_weights[x][i] for x in key_list])


In [61]:
best_weights_df = pd.read_csv("best_weights.csv")

best_weights_df.head


<bound method NDFrame.head of       Query ID  Best Alpha   MRR
0            2        0.00  1.00
1      1048585        0.00  0.50
2       458771        0.00  1.00
3       163860        0.01  0.25
4       458774        0.00  0.50
...        ...         ...   ...
6975    884722        0.00  1.00
6976    393203        0.00  0.20
6977    196596        0.01  0.50
6978   1048565        0.01  0.20
6979   1081338        0.00  1.00

[6980 rows x 3 columns]>