In [1]:
import pickle
import math
import pandas as pd
from coir.data_loader import get_tasks
from coir.evaluation import COIR
from coir.models import YourCustomDEModel

In [2]:
tasks = get_tasks(tasks=["cosqa"])
corpus, queries, qrels = tasks['cosqa']

Using the latest cached version of the dataset since CoIR-Retrieval/cosqa-queries-corpus couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/aabedeen_umass_edu/.cache/huggingface/datasets/CoIR-Retrieval___cosqa-queries-corpus/default/0.0.0/d56676dfbe7cd137229c33bd1e7dd96c688d2126 (last modified on Thu Mar 27 20:03:00 2025).
Using the latest cached version of the dataset since CoIR-Retrieval/cosqa-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/aabedeen_umass_edu/.cache/huggingface/datasets/CoIR-Retrieval___cosqa-qrels/default/0.0.0/c70cfe89508993ed4707e31be1f83908f1fd6d38 (last modified on Thu Mar 27 20:03:01 2025).


0it [00:00, ?it/s]

In [3]:
with open("colbert_results.pkl", "rb") as f:
    expl_results = pickle.load(f)
with open("baseline_results.pkl", "rb") as f:
    baseline_results = pickle.load(f)

In [11]:
print(qrels)

{'q20105': {'d20105': 1}, 'q20106': {'d20106': 1}, 'q20107': {'d20107': 1}, 'q20108': {'d20108': 1}, 'q20109': {'d20109': 1}, 'q20110': {'d20110': 1}, 'q20111': {'d20111': 1}, 'q20112': {'d20112': 1}, 'q20113': {'d20113': 1}, 'q20114': {'d20114': 1}, 'q20115': {'d20115': 1}, 'q20116': {'d20116': 1}, 'q20117': {'d20117': 1}, 'q20118': {'d20118': 1}, 'q20119': {'d20119': 1}, 'q20120': {'d20120': 1}, 'q20121': {'d20121': 1}, 'q20122': {'d20122': 1}, 'q20123': {'d20123': 1}, 'q20124': {'d20124': 1}, 'q20125': {'d20125': 1}, 'q20126': {'d20126': 1}, 'q20127': {'d20127': 1}, 'q20128': {'d20128': 1}, 'q20129': {'d20129': 1}, 'q20130': {'d20130': 1}, 'q20131': {'d20131': 1}, 'q20132': {'d20132': 1}, 'q20133': {'d20133': 1}, 'q20134': {'d20134': 1}, 'q20135': {'d20135': 1}, 'q20136': {'d20136': 1}, 'q20137': {'d20137': 1}, 'q20138': {'d20138': 1}, 'q20139': {'d20139': 1}, 'q20140': {'d20140': 1}, 'q20141': {'d20141': 1}, 'q20142': {'d20142': 1}, 'q20143': {'d20143': 1}, 'q20144': {'d20144': 1},

In [4]:
def mrr_at_k(ranked_docs, relevant, k=1000):
    for i, doc in enumerate(ranked_docs[:k], start=1):
        if doc in relevant:
            return 1.0 / i
    return 0.0

def ndcg_at_k(ranked_docs, relevant, k=1000):
    dcg = 0.0
    for i, doc in enumerate(ranked_docs[:k], start=1):
        if doc in relevant:
            dcg += 1.0 / math.log2(i + 1)
    ideal_hits = min(len(relevant), k)
    idcg = sum(1.0 / math.log2(i + 1) for i in range(1, ideal_hits + 1))
    return dcg / idcg if idcg > 0 else 0.0

def compute_per_query_metrics(results_dict, qrels):
    metrics = {}
    for qid, doc_scores in results_dict.items():
        ranked = [doc for doc, _ in sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)]
        relevant = qrels.get(qid, set())
        metrics[qid] = {
            "mrr":  mrr_at_k(ranked, relevant),
            "ndcg": ndcg_at_k(ranked, relevant)
        }
    return pd.DataFrame.from_dict(metrics, orient="index")

# Compute metrics
df_base = compute_per_query_metrics(baseline_results, qrels).rename(columns=lambda c: f"baseline_{c}")
df_expl = compute_per_query_metrics(expl_results,     qrels).rename(columns=lambda c: f"expl_{c}")

# Merge and compute deltas
metrics_df = df_base.join(df_expl)
metrics_df["delta_mrr"]  = metrics_df["expl_mrr"] - metrics_df["baseline_mrr"]
metrics_df["delta_ndcg"] = metrics_df["expl_ndcg"] - metrics_df["baseline_ndcg"]

# Show top‑10 winners and losers by ΔMRR
top_winners = metrics_df.nlargest(10, "delta_mrr")
top_losers  = metrics_df.nsmallest(10, "delta_mrr")

print("Top 10 Queries Where Explanations w/ Colbert Help Most:\n", top_winners)
print("\nTop 10 Queries Where Baseline Beats Explanations w/ Colbert:\n", top_losers)

# (Optional) Save full breakdown for further analysis
metrics_df.to_csv("per_query_comparison.csv")


Top 10 Queries Where Explanations w/ Colbert Help Most:
         baseline_mrr  baseline_ndcg  expl_mrr  expl_ndcg  delta_mrr  \
q20462      0.015152       0.164851  1.000000    1.00000   0.984848   
q20269      0.090909       0.278943  1.000000    1.00000   0.909091   
q20136      0.142857       0.333333  1.000000    1.00000   0.857143   
q20160      0.200000       0.386853  1.000000    1.00000   0.800000   
q20468      0.333333       0.500000  1.000000    1.00000   0.666667   
q20549      0.111111       0.301030  0.500000    0.63093   0.388889   
q20358      0.200000       0.386853  0.500000    0.63093   0.300000   
q20499      0.052632       0.231378  0.333333    0.50000   0.280702   
q20420      0.058824       0.239812  0.333333    0.50000   0.274510   
q20293      0.083333       0.270238  0.333333    0.50000   0.250000   

        delta_ndcg  
q20462    0.835149  
q20269    0.721057  
q20136    0.666667  
q20160    0.613147  
q20468    0.500000  
q20549    0.329900  
q20358    0.24

In [8]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from wordcloud import WordCloud

# ---------------------------
# Directory Setup for Saving Plots
# ---------------------------
base_dir = "plots"
subdirs = {
    "clusters": os.path.join(base_dir, "clusters"),
    "performance": os.path.join(base_dir, "performance"),
    "wordcloud": os.path.join(base_dir, "wordcloud")
}
for subdir in subdirs.values():
    os.makedirs(subdir, exist_ok=True)

# ---------------------------
# Sample Data Setup (Replace with your actual data)
# ---------------------------
# Assume 'metrics_df' is your DataFrame of per-query metrics.
# Assume 'queries' is a dict mapping query_id to query text.
df_metrics = metrics_df.copy()

# If the query IDs are in the index, reset the index to create a 'query_id' column.
if 'query_id' not in df_metrics.columns:
    df_metrics = df_metrics.reset_index().rename(columns={'index': 'query_id'})

# Merge query texts into the DataFrame.
df_metrics["query_text"] = df_metrics["query_id"].apply(lambda q: queries.get(q, ""))

# ---------------------------
# 1. TF-IDF Feature Extraction
# ---------------------------
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
tfidf_matrix = vectorizer.fit_transform(df_metrics["query_text"])

# ---------------------------
# 2. Clustering Queries with K-Means
# ---------------------------
num_clusters = 5  # Adjust as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df_metrics["cluster"] = kmeans.fit_predict(tfidf_matrix)

# ---------------------------
# 3. Visualizing Clusters using PCA
# ---------------------------
pca = PCA(n_components=2, random_state=42)
reduced_features = pca.fit_transform(tfidf_matrix.toarray())
df_metrics["pca1"] = reduced_features[:, 0]
df_metrics["pca2"] = reduced_features[:, 1]

plt.figure(figsize=(10, 8))
scatter = plt.scatter(df_metrics["pca1"], df_metrics["pca2"], 
                      c=df_metrics["cluster"], cmap="viridis", alpha=0.7)
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("Query Clusters (based on TF-IDF)")
plt.legend(*scatter.legend_elements(), title="Cluster")
plt.grid(True)
cluster_plot_path = os.path.join(subdirs["clusters"], "clusters_pca.png")
plt.savefig(cluster_plot_path)
plt.close()
print(f"Cluster PCA plot saved to: {cluster_plot_path}")

# ---------------------------
# 4. Analyzing Cluster Performance
# ---------------------------
# Compute average delta MRR per cluster.
cluster_perf = df_metrics.groupby("cluster")["delta_mrr"].mean().reset_index()
print("Average delta MRR per cluster:")
print(cluster_perf)

plt.figure(figsize=(8, 6))
plt.bar(cluster_perf["cluster"].astype(str), cluster_perf["delta_mrr"])
plt.xlabel("Cluster")
plt.ylabel("Average Delta MRR (expl - baseline)")
plt.title("Average Delta MRR by Query Cluster")
plt.grid(True)
performance_bar_path = os.path.join(subdirs["performance"], "avg_delta_mrr_bar.png")
plt.savefig(performance_bar_path)
plt.close()
print(f"Performance bar plot saved to: {performance_bar_path}")

# ---------------------------
# 5. Generate Word Clouds for Each Cluster
# ---------------------------
for cluster in sorted(df_metrics["cluster"].unique()):
    cluster_text = " ".join(df_metrics[df_metrics["cluster"] == cluster]["query_text"])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(cluster_text)
    
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Word Cloud for Cluster {cluster}")
    wordcloud_path = os.path.join(subdirs["wordcloud"], f"wordcloud_cluster_{cluster}.png")
    plt.savefig(wordcloud_path)
    plt.close()
    print(f"Word cloud for cluster {cluster} saved to: {wordcloud_path}")


  super()._check_params_vs_input(X, default_n_init=10)


Cluster PCA plot saved to: plots/clusters/clusters_pca.png
Average delta MRR per cluster:
   cluster  delta_mrr
0        0  -0.155488
1        1  -0.198535
2        2  -0.135041
3        3  -0.236893
4        4  -0.119186
Performance bar plot saved to: plots/performance/avg_delta_mrr_bar.png
Word cloud for cluster 0 saved to: plots/wordcloud/wordcloud_cluster_0.png
Word cloud for cluster 1 saved to: plots/wordcloud/wordcloud_cluster_1.png
Word cloud for cluster 2 saved to: plots/wordcloud/wordcloud_cluster_2.png
Word cloud for cluster 3 saved to: plots/wordcloud/wordcloud_cluster_3.png
Word cloud for cluster 4 saved to: plots/wordcloud/wordcloud_cluster_4.png


In [17]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------
# Hyperparameter: Top-K cutoff
# ---------------------------
top_k = 100  # You can adjust this (e.g., 5, 10, 20)

# ---------------------------
# Directory Setup for Saving Plots and CSVs
# ---------------------------
base_dir = "plots"
subdirs = {
    "clusters": os.path.join(base_dir, "clusters"),
    "performance": os.path.join(base_dir, "performance"),
    "wordcloud": os.path.join(base_dir, "wordcloud"),
    "topk_analysis": os.path.join(base_dir, "topk_analysis")
}
for subdir in subdirs.values():
    os.makedirs(subdir, exist_ok=True)

# ---------------------------
# Helper Function: Get Rank of Correct Document
# ---------------------------
def get_rank(query_id, results, qrels):
    """
    Returns the rank (1-indexed) of the correct document for a given query.
    Assumes that qrels is a nested dict, e.g., {query_id: {correct_doc: 1}}.
    If the correct document is not found in the results, returns None.
    """
    # Extract the correct document ID from the nested qrels structure.
    correct_doc = list(qrels.get(query_id, {}).keys())[0] if query_id in qrels else None
    if correct_doc is None:
        return None
    
    # Sort documents by descending score
    sorted_docs = sorted(results.get(query_id, {}).items(), key=lambda x: x[1], reverse=True)
    for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
        if doc_id == correct_doc:
            return rank
    return None

# ---------------------------
# Create a DataFrame with Top-K Analysis per Query
# ---------------------------
# Assumptions:
# - baseline_results: dict mapping query_id -> {doc_id: score, ...}
# - expl_results: dict mapping query_id -> {doc_id: score, ...}
# - qrels: dict mapping query_id -> {correct_doc: 1}
# - queries: dict mapping query_id -> query text (for further analysis)

data = []
for qid in qrels:
    baseline_rank = get_rank(qid, baseline_results, qrels)
    expl_rank = get_rank(qid, expl_results, qrels)
    
    # Determine if correct doc is in top_k (if rank exists and <= top_k)
    baseline_in_topk = baseline_rank is not None and baseline_rank <= top_k
    expl_in_topk = expl_rank is not None and expl_rank <= top_k
    
    data.append({
        "query_id": qid,
        "baseline_rank": baseline_rank if baseline_rank is not None else float('inf'),
        "expl_rank": expl_rank if expl_rank is not None else float('inf'),
        "baseline_in_topk": baseline_in_topk,
        "expl_in_topk": expl_in_topk
    })

df_topk = pd.DataFrame(data)

# Save the raw data for further inspection
csv_path = os.path.join(subdirs["topk_analysis"], f"top{top_k}_analysis.csv")
df_topk.to_csv(csv_path, index=False)
print(f"Top-{top_k} analysis data saved to: {csv_path}")

# ---------------------------
# Analyze Group Overlap
# ---------------------------
# Define groups based on whether the correct doc is found in top_k:
# "Both": both methods retrieve the correct doc within top_k.
# "Baseline Only": only baseline does.
# "Explanation Only": only explanation does.
# "Neither": neither method does.
def label_group(row):
    if row["baseline_in_topk"] and row["expl_in_topk"]:
        return "Both"
    elif row["baseline_in_topk"] and not row["expl_in_topk"]:
        return "Baseline Only"
    elif not row["baseline_in_topk"] and row["expl_in_topk"]:
        return "Colbert Only"
    else:
        return "Neither"

df_topk["group"] = df_topk.apply(label_group, axis=1)
group_counts = df_topk["group"].value_counts()
print("Query Group Counts based on Top-K presence:")
print(group_counts)

# ---------------------------
# Visualization: Bar Chart for Query Group Counts
# ---------------------------
plt.figure(figsize=(8, 6))
group_counts.plot(kind="bar", color="skyblue")
plt.xlabel("Query Group")
plt.ylabel("Number of Queries")
plt.title(f"Queries Grouped by Presence of Correct Doc in Top-{top_k}")
plt.grid(True, axis='y')
bar_chart_path = os.path.join(subdirs["topk_analysis"], f"query_group_bar_top{top_k}.png")
plt.savefig(bar_chart_path)
plt.close()
print(f"Query group bar chart saved to: {bar_chart_path}")

# ---------------------------
# Optional: Merge Query Text for Further Analysis
# ---------------------------
# Merge query text if available (assuming 'queries' is a dict mapping query_id to text)
df_topk["query_text"] = df_topk["query_id"].apply(lambda q: queries.get(q, ""))

# Print out a few examples from each group for manual inspection
for group in df_topk["group"].unique():
    print(f"\n=== Sample queries from group '{group}' ===")
    print(df_topk[df_topk["group"] == group][["query_id", "baseline_rank", "expl_rank", "query_text"]].head(3))

# Optionally save the samples to a CSV file for further manual analysis.
sample_csv_path = os.path.join(subdirs["topk_analysis"], f"top{top_k}_analysis_samples.csv")
df_topk.to_csv(sample_csv_path, index=False)
print(f"Top-{top_k} analysis samples saved to: {sample_csv_path}")


Top-100 analysis data saved to: plots/topk_analysis/top100_analysis.csv
Query Group Counts based on Top-K presence:
group
Both             263
Baseline Only    179
Neither           52
Colbert Only       6
Name: count, dtype: int64
Query group bar chart saved to: plots/topk_analysis/query_group_bar_top100.png

=== Sample queries from group 'Both' ===
  query_id  baseline_rank  expl_rank                             query_text
0   q20105            1.0        2.0       sort by a token in string python
3   q20108            9.0       41.0  test for iterable is string in python
4   q20109            1.0        6.0     python print results of query loop

=== Sample queries from group 'Baseline Only' ===
  query_id  baseline_rank  expl_rank                             query_text
1   q20106            1.0        inf          python check file is readonly
2   q20107            6.0        inf  declaring empty numpy array in python
7   q20112           24.0        inf            python numpy arr

In [18]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# ---------------------------
# Hyperparameter: Top-K cutoff
# ---------------------------
top_k = 100  # You can adjust this (e.g., 5, 10, 20)

# ---------------------------
# Directory Setup for Saving Plots and CSVs
# ---------------------------
base_dir = "plots"
subdirs = {
    "clusters": os.path.join(base_dir, "clusters"),
    "performance": os.path.join(base_dir, "performance"),
    "wordcloud": os.path.join(base_dir, "wordcloud"),
    "topk_analysis": os.path.join(base_dir, "topk_analysis")
}
for subdir in subdirs.values():
    os.makedirs(subdir, exist_ok=True)

# ---------------------------
# Helper Function: Get Rank of Correct Document
# ---------------------------
def get_rank(query_id, results, qrels):
    """
    Returns the rank (1-indexed) of the correct document for a given query.
    Assumes that qrels is a nested dict, e.g., {query_id: {correct_doc: 1}}.
    If the correct document is not found in the results, returns None.
    """
    # Extract the correct document ID from the nested qrels structure.
    correct_doc = list(qrels.get(query_id, {}).keys())[0] if query_id in qrels else None
    if correct_doc is None:
        return None
    
    # Sort documents by descending score
    sorted_docs = sorted(results.get(query_id, {}).items(), key=lambda x: x[1], reverse=True)
    for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
        if doc_id == correct_doc:
            return rank
    return None

# ---------------------------
# Create a DataFrame with Top-K Analysis per Query
# ---------------------------
# Assumptions:
# - baseline_results: dict mapping query_id -> {doc_id: score, ...}
# - expl_results: dict mapping query_id -> {doc_id: score, ...}
# - qrels: dict mapping query_id -> {correct_doc: 1}
# - queries: dict mapping query_id -> query text

data = []
for qid in qrels:
    baseline_rank = get_rank(qid, baseline_results, qrels)
    expl_rank = get_rank(qid, expl_results, qrels)
    
    # Determine if correct doc is in top_k (if rank exists and <= top_k)
    baseline_in_topk = baseline_rank is not None and baseline_rank <= top_k
    expl_in_topk = expl_rank is not None and expl_rank <= top_k
    
    data.append({
        "query_id": qid,
        "baseline_rank": baseline_rank if baseline_rank is not None else float('inf'),
        "expl_rank": expl_rank if expl_rank is not None else float('inf'),
        "baseline_in_topk": baseline_in_topk,
        "expl_in_topk": expl_in_topk
    })

df_topk = pd.DataFrame(data)

# ---------------------------
# Merge Query Text from the 'queries' Dictionary
# ---------------------------
# Since queries is a simple dict mapping query_id -> query text,
# we convert it into a DataFrame.
df_queries = pd.DataFrame(list(queries.items()), columns=["query_id", "query_text"])
# Merge with df_topk on 'query_id'
df_topk = pd.merge(df_topk, df_queries, on="query_id", how="left")

# Save the raw top-k analysis data for further inspection
csv_path = os.path.join(subdirs["topk_analysis"], f"top{top_k}_analysis.csv")
df_topk.to_csv(csv_path, index=False)
print(f"Top-{top_k} analysis data saved to: {csv_path}")

# ---------------------------
# Analyze Group Overlap
# ---------------------------
# Define groups based on whether the correct doc is found in top_k:
# "Both": both methods retrieve the correct doc within top_k.
# "Baseline Only": only baseline does.
# "Colbert Only": only explanation does.
# "Neither": neither method does.
def label_group(row):
    if row["baseline_in_topk"] and row["expl_in_topk"]:
        return "Both"
    elif row["baseline_in_topk"] and not row["expl_in_topk"]:
        return "Baseline Only"
    elif not row["baseline_in_topk"] and row["expl_in_topk"]:
        return "Colbert Only"
    else:
        return "Neither"

df_topk["group"] = df_topk.apply(label_group, axis=1)
group_counts = df_topk["group"].value_counts()
print("Query Group Counts based on Top-K presence:")
print(group_counts)

# ---------------------------
# Visualization: Bar Chart for Query Group Counts
# ---------------------------
plt.figure(figsize=(8, 6))
group_counts.plot(kind="bar", color="skyblue")
plt.xlabel("Query Group")
plt.ylabel("Number of Queries")
plt.title(f"Queries Grouped by Presence of Correct Doc in Top-{top_k}")
plt.grid(True, axis='y')
bar_chart_path = os.path.join(subdirs["topk_analysis"], f"query_group_bar_top{top_k}.png")
plt.savefig(bar_chart_path)
plt.close()
print(f"Query group bar chart saved to: {bar_chart_path}")

# ---------------------------
# Group-Level Text Analysis: Word Clouds & Top Terms
# ---------------------------
group_analysis_dir = os.path.join(subdirs["topk_analysis"], "group_analysis")
os.makedirs(group_analysis_dir, exist_ok=True)

# Function to generate a word cloud for a given text
def generate_wordcloud(text, title, save_path):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()
    print(f"Word cloud saved to: {save_path}")

# Function to compute and return the top N words by frequency using CountVectorizer
def top_terms(texts, top_n=10):
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(texts)
    sum_words = np.array(X.sum(axis=0)).flatten()
    words_freq = [(word, sum_words[idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

group_summary = {}
for group in df_topk["group"].unique():
    group_df = df_topk[df_topk["group"] == group]
    all_text = " ".join(group_df["query_text"].tolist())
    
    # Save word cloud for this group
    wc_path = os.path.join(group_analysis_dir, f"wordcloud_{group.replace(' ', '_')}.png")
    generate_wordcloud(all_text, f"Word Cloud for Group: {group}", wc_path)
    
    # Compute top terms
    top_words = top_terms(group_df["query_text"].tolist(), top_n=10)
    group_summary[group] = top_words
    print(f"\nGroup: {group}")
    print("Top words:")
    for word, freq in top_words:
        print(f"{word}: {freq}")

# Save group summary to CSV
summary_rows = []
for group, words in group_summary.items():
    for word, freq in words:
        summary_rows.append({"group": group, "word": word, "frequency": freq})
summary_df = pd.DataFrame(summary_rows)
summary_csv_path = os.path.join(group_analysis_dir, "group_top_words_summary.csv")
summary_df.to_csv(summary_csv_path, index=False)
print(f"\nGroup top words summary saved to: {summary_csv_path}")


Top-100 analysis data saved to: plots/topk_analysis/top100_analysis.csv
Query Group Counts based on Top-K presence:
group
Both             263
Baseline Only    179
Neither           52
Colbert Only       6
Name: count, dtype: int64
Query group bar chart saved to: plots/topk_analysis/query_group_bar_top100.png
Word cloud saved to: plots/topk_analysis/group_analysis/wordcloud_Both.png

Group: Both
Top words:
python: 261
file: 26
list: 23
string: 19
check: 12
read: 11
remove: 10
line: 10
object: 9
array: 9
Word cloud saved to: plots/topk_analysis/group_analysis/wordcloud_Baseline_Only.png

Group: Baseline Only
Top words:
python: 178
list: 19
check: 18
string: 18
file: 16
object: 14
numpy: 10
array: 10
value: 10
function: 9
Word cloud saved to: plots/topk_analysis/group_analysis/wordcloud_Neither.png

Group: Neither
Top words:
python: 51
list: 7
index: 5
string: 5
array: 4
type: 4
object: 4
element: 3
remove: 3
cast: 2
Word cloud saved to: plots/topk_analysis/group_analysis/wordcloud_Colbe

In [19]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# ---------------------------
# Hyperparameter: Top-K cutoff
# ---------------------------
top_k = 100  # Adjust as needed (e.g., 5, 10, 20)

# ---------------------------
# Directory Setup for Saving Plots and CSVs
# ---------------------------
base_dir = "plots"
subdirs = {
    "clusters": os.path.join(base_dir, "clusters"),
    "performance": os.path.join(base_dir, "performance"),
    "wordcloud": os.path.join(base_dir, "wordcloud"),
    "topk_analysis": os.path.join(base_dir, "topk_analysis")
}
for subdir in subdirs.values():
    os.makedirs(subdir, exist_ok=True)

# ---------------------------
# Helper Function: Get Rank of Correct Document
# ---------------------------
def get_rank(query_id, results, qrels):
    """
    Returns the rank (1-indexed) of the correct document for a given query.
    Assumes that qrels is a nested dict, e.g., {query_id: {correct_doc: 1}}.
    If the correct document is not found in the results, returns None.
    """
    # Extract the correct document ID from the nested qrels structure.
    correct_doc = list(qrels.get(query_id, {}).keys())[0] if query_id in qrels else None
    if correct_doc is None:
        return None
    
    # Sort documents by descending score.
    sorted_docs = sorted(results.get(query_id, {}).items(), key=lambda x: x[1], reverse=True)
    for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
        if doc_id == correct_doc:
            return rank
    return None

# ---------------------------
# Create a DataFrame with Top-K Analysis per Query
# ---------------------------
# Assumptions:
# - baseline_results: dict mapping query_id -> {doc_id: score, ...}
# - expl_results: dict mapping query_id -> {doc_id: score, ...}
# - qrels: dict mapping query_id -> {correct_doc: 1}
# - queries: nested dict mapping query_id -> {'text': query text, ...}
data = []
for qid in qrels:
    baseline_rank = get_rank(qid, baseline_results, qrels)
    expl_rank = get_rank(qid, expl_results, qrels)
    
    # Determine if correct doc is in top_k (if rank exists and <= top_k)
    baseline_in_topk = baseline_rank is not None and baseline_rank <= top_k
    expl_in_topk = expl_rank is not None and expl_rank <= top_k
    
    data.append({
        "query_id": qid,
        "baseline_rank": baseline_rank if baseline_rank is not None else float('inf'),
        "expl_rank": expl_rank if expl_rank is not None else float('inf'),
        "baseline_in_topk": baseline_in_topk,
        "expl_in_topk": expl_in_topk
    })

df_topk = pd.DataFrame(data)

# ---------------------------
# Merge Query Text from the Nested 'queries' Dictionary
# ---------------------------
# Convert the nested queries dict into a DataFrame.
# We assume the nested dict maps query_id -> {'text': query text, ...}
df_queries = pd.DataFrame.from_dict(queries, orient='index').reset_index().rename(columns={'index': 'query_id'})
if "text" in df_queries.columns:
    df_queries = df_queries.rename(columns={"text": "query_text"})
else:
    df_queries["query_text"] = df_queries.iloc[:,1]  # Fallback if needed

# Merge with df_topk on 'query_id'
df_topk = pd.merge(df_topk, df_queries[["query_id", "query_text"]], on="query_id", how="left")

# Save the raw top-k analysis data for further inspection
csv_path = os.path.join(subdirs["topk_analysis"], f"top{top_k}_analysis.csv")
df_topk.to_csv(csv_path, index=False)
print(f"Top-{top_k} analysis data saved to: {csv_path}")

# ---------------------------
# Analyze Group Overlap
# ---------------------------
# Define groups based on whether the correct doc is found in top_k:
# "Both": both methods retrieve the correct doc within top_k.
# "Baseline Only": only baseline does.
# "Colbert Only": only explanation (e.g., ColBERT) does.
# "Neither": neither method does.
def label_group(row):
    if row["baseline_in_topk"] and row["expl_in_topk"]:
        return "Both"
    elif row["baseline_in_topk"] and not row["expl_in_topk"]:
        return "Baseline Only"
    elif not row["baseline_in_topk"] and row["expl_in_topk"]:
        return "Colbert Only"
    else:
        return "Neither"

df_topk["group"] = df_topk.apply(label_group, axis=1)
group_counts = df_topk["group"].value_counts()
print("Query Group Counts based on Top-K presence:")
print(group_counts)

# ---------------------------
# Visualization: Bar Chart for Query Group Counts
# ---------------------------
plt.figure(figsize=(8, 6))
group_counts.plot(kind="bar", color="skyblue")
plt.xlabel("Query Group")
plt.ylabel("Number of Queries")
plt.title(f"Queries Grouped by Presence of Correct Doc in Top-{top_k}")
plt.grid(True, axis='y')
bar_chart_path = os.path.join(subdirs["topk_analysis"], f"query_group_bar_top{top_k}.png")
plt.savefig(bar_chart_path)
plt.close()
print(f"Query group bar chart saved to: {bar_chart_path}")

# ---------------------------
# Extract Top 10 Prominent Keywords for Each Group
# ---------------------------
group_keywords = []
# For each unique group, extract keywords using CountVectorizer.
for group in df_topk["group"].unique():
    group_df = df_topk[df_topk["group"] == group]
    # Combine all query texts in the group into one string.
    text = " ".join(group_df["query_text"].dropna().tolist())
    # Initialize CountVectorizer with English stopwords.
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    # Sum frequencies of each term.
    term_freq = np.array(X.sum(axis=0)).flatten()
    # Retrieve term names.
    terms = vectorizer.get_feature_names_out()
    # Create a DataFrame of terms and their frequencies.
    freq_df = pd.DataFrame({"term": terms, "frequency": term_freq})
    # Sort terms by frequency (descending) and select the top 10.
    freq_df = freq_df.sort_values("frequency", ascending=False).head(10)
    freq_df["group"] = group
    group_keywords.append(freq_df)

# Concatenate the results for all groups.
group_keywords_df = pd.concat(group_keywords).reset_index(drop=True)
keywords_csv_path = os.path.join(subdirs["topk_analysis"], "group_top10_keywords.csv")
group_keywords_df.to_csv(keywords_csv_path, index=False)
print(f"Group top 10 keywords saved to: {keywords_csv_path}")


Top-100 analysis data saved to: plots/topk_analysis/top100_analysis.csv
Query Group Counts based on Top-K presence:
group
Both             263
Baseline Only    179
Neither           52
Colbert Only       6
Name: count, dtype: int64
Query group bar chart saved to: plots/topk_analysis/query_group_bar_top100.png
Group top 10 keywords saved to: plots/topk_analysis/group_top10_keywords.csv
