In [1]:
import pandas as pd

# Location of the parquet file to analyse (an hf:// URL, resolved by pandas at read time).
DATASET_URL = "hf://datasets/vaishnavisha/retrieved_results_cosqa_upd_baseline_dres_BAAI_bge-base-en/data/train-00000-of-00001.parquet"

# Every later cell reads from this frame.
df = pd.read_parquet(DATASET_URL)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset size: number of distinct query ids and number of rows in df

total_retrieved_docs = df.shape[0]
total_queries = df['query_id'].nunique()
print(f"Total queries: {total_queries}, Total retrieved docs: {total_retrieved_docs}")


Total queries: 500, Total retrieved docs: 500000


In [3]:
# How many rows carry each ground-truth relevance label

relevance_distribution = (
    df['ground_truth_relevance']
    .value_counts()
    # Name the label axis and the count column explicitly rather than
    # overwriting .columns after the fact.
    .rename_axis('ground_truth_relevance')
    .reset_index(name='count')
)
print(relevance_distribution)


   ground_truth_relevance   count
0                       0  499510
1                       1     490


In [6]:
# Hit@5 (a.k.a. Success@5): number of queries that have at least one relevant doc
# among their 5 highest-scoring results.
# NOTE: this is not Precision@5, which would be the fraction of each query's
# top-5 results that are relevant.

TOP_K = 5  # rank cutoff used for the hit count below

# Rank every row within its query by descending score. method='first' breaks
# score ties by row order, so each row gets a distinct rank inside its query.
ranked = df.assign(
    rank=df.groupby('query_id')['score'].rank(ascending=False, method='first')
)

# Relevant rows that landed inside the cutoff. The name `prec_at_5` is historical
# and kept unchanged in case other cells refer to it.
prec_at_5 = ranked[(ranked['rank'] <= TOP_K) & (ranked['ground_truth_relevance'] == 1)]

# A query counts once no matter how many of its top-K rows are relevant.
queries_with_relevant_in_top5 = prec_at_5['query_id'].nunique()
print(f"Queries with relevant doc in top {TOP_K}: {queries_with_relevant_in_top5} out of {total_queries}")


Queries with relevant doc in top 5: 159 out of 500


In [7]:
# Mean / min / max retrieval score for each ground-truth relevance label

score_stats = (
    df.groupby('ground_truth_relevance')['score']
    .agg(mean='mean', min='min', max='max')
    .reset_index()
)
print(score_stats)

   ground_truth_relevance      mean       min       max
0                       0  0.821635  0.768250  0.921884
1                       1  0.860586  0.801716  0.921884


In [8]:
# Code snippets that were retrieved for more than one query: top 10 by number of
# distinct queries. After nunique() the 'query_id' column holds that count of
# distinct queries, not an id.

duplicates = (
    df.groupby('retrieved_code')['query_id']
    .nunique()
    .reset_index()
    .loc[lambda per_code: per_code['query_id'] > 1]
    .sort_values(by='query_id', ascending=False)
    .head(10)
)
print(duplicates)

                                         retrieved_code  query_id
1860  \ndef coords_from_query(query):\n\n    try:\n ...       390
3491  \ndef inventory(self, source_id, fetch=False, ...       282
3668  \ndef is_stats_query(query):\n\n    if not que...       274
4685  \ndef querySQL(self, sql, args=()):\n\n    if ...       271
4687  \ndef query_fetch_one(self, query, values):\n\...       267
4605  \ndef print_runs(query):\n\n    if query is No...       258
3146  \ndef get_sql(query):\n\n    sql = str(query.s...       253
1329  \ndef atlasdb_format_query(query, values):\n\n...       248
4442  \ndef parse_query_string(query):\n\n    result...       247
5459  \ndef sqlmany(self, stringname, *args):\n\n   ...       237


In [9]:
# Query ids whose rows sum to zero ground-truth relevance, i.e. no relevant doc at all

no_rel_queries = (
    df.groupby('query_id')['ground_truth_relevance']
    .sum()
    .loc[lambda relevance_total: relevance_total == 0]
    .index
    .tolist()
)
print(f"Queries with no relevant docs: {len(no_rel_queries)}")

Queries with no relevant docs: 10


In [13]:
# Number of distinct retrieved_explanation values per query, showing the 10 queries
# with the fewest (ascending)

same_expl = (
    df.groupby('query_id')['retrieved_explanation']
    .nunique()
    .rename('distinct_explanations')
    .reset_index()
    .sort_values('distinct_explanations')
    .head(10)
)
print(same_expl)

    query_id  distinct_explanations
387   q20492                    984
261   q20366                    985
375   q20480                    985
326   q20431                    985
9     q20114                    985
219   q20324                    985
110   q20215                    985
378   q20483                    986
340   q20445                    986
260   q20365                    987


In [14]:
# Average relevant vs irrelevant scores per query
#
# One grouped mean per (query, relevance label), pivoted so each query is a row.
# This replaces two per-group Python lambdas that looked labels up through
# df.loc[x.index, ...], which is slow and only correct when df's index is unique.

score_diff = (
    df.groupby(['query_id', 'ground_truth_relevance'])['score']
    .mean()
    .unstack('ground_truth_relevance')
    # Relevant (1) first, then irrelevant (0); a label absent from the data
    # becomes an all-NaN column instead of raising.
    .reindex(columns=[1, 0])
    .rename(columns={1: 'avg_rel_score', 0: 'avg_irrel_score'})
    # Drop the leftover 'ground_truth_relevance' header on the column axis.
    .rename_axis(columns=None)
    # Keep only queries that have both relevant and irrelevant rows.
    .dropna()
    .assign(score_gap=lambda means: means['avg_rel_score'] - means['avg_irrel_score'])
    # Smallest (most negative) gap first: queries where relevant docs score
    # lowest compared with irrelevant ones.
    .sort_values('score_gap')
)
print(score_diff.head(10))

          avg_rel_score  avg_irrel_score  score_gap
query_id                                           
q20148         0.801716         0.811687  -0.009970
q20357         0.813210         0.821074  -0.007864
q20349         0.830722         0.837616  -0.006894
q20368         0.819490         0.826204  -0.006715
q20371         0.818292         0.824556  -0.006264
q20163         0.814860         0.820966  -0.006106
q20374         0.823833         0.829288  -0.005455
q20398         0.824209         0.829141  -0.004932
q20419         0.822557         0.826802  -0.004245
q20393         0.821702         0.825901  -0.004200


In [17]:
# Explanations whose text is attached to more than one distinct retrieved_doc_id

# Distinct doc-id count per explanation, keeping only explanations with 2+ doc ids.
dup_expl = (
    df.groupby('retrieved_explanation')['retrieved_doc_id']
    .nunique()
    .reset_index()
    .loc[lambda id_counts: id_counts['retrieved_doc_id'] > 1]
)

# All rows of df whose explanation is one of those shared texts.
shared_expl = df.loc[df['retrieved_explanation'].isin(dup_expl['retrieved_explanation'])]

# For each shared explanation: the array of distinct doc ids and its length.
grouped_shared = (
    shared_expl.groupby('retrieved_explanation')['retrieved_doc_id']
    .unique()
    .reset_index()
    .assign(num_doc_ids=lambda per_expl: per_expl['retrieved_doc_id'].apply(len))
)

print(grouped_shared.sort_values('num_doc_ids', ascending=False).head(10))

                                retrieved_explanation  \
73  The code reads text from a file and returns th...   
75  The code removes the specified key from each d...   
81  The code sets self._should_quit to True and re...   
52  The code fills NaN values with a specified fil...   
33  The code constructs an adjacency matrix for a ...   
71  The code reads credentials from a file, extrac...   
35  The code converts a datetime object to a date ...   
68  The code reads a file from the given filepath ...   
25  The code checks if two Index objects are equal...   
66  The code reads a JSON file and returns its con...   

                            retrieved_doc_id  num_doc_ids  
73  [d18574, d18395, d19486, d17213, d17787]            5  
75          [d16680, d17535, d19453, d19517]            4  
81          [d18806, d19535, d17443, d19482]            4  
52                      [d2748, d312, d6707]            3  
33                     [d7085, d4319, d4354]            3  
71          