In [2]:
# Build id_cluster_map: webpage id -> cluster id, read from the clusters CSV.
# Both columns have every ".json" occurrence removed and surrounding whitespace
# stripped. When a webpage id appears on several rows, the first row wins.
path_to_clusters = "matching/clusters.csv"
id_cluster_map = {}
with open(path_to_clusters, "r") as f:
    for i, line in enumerate(f):
        if i > 0:  # row 0 is skipped (treated as a header)
            # Each row must split into exactly two ", "-separated fields;
            # anything else raises ValueError on unpacking.
            webpage_id, cluster = (
                field.replace(".json", "").strip() for field in line.split(", ")
            )
            id_cluster_map.setdefault(webpage_id, cluster)

In [3]:
def load_rel_scores(path_to_rel_scores):
    """Read a whitespace-delimited relevance file into a dict.

    Each line is split on whitespace; the first field becomes the key and the
    third field becomes the value (query id -> webpage filename, per the
    original author's note). If a key occurs on several lines, the last line
    wins. A line with fewer than three fields raises IndexError.

    Args:
        path_to_rel_scores: path of the relevance file to read.

    Returns:
        dict mapping first-column strings to third-column strings.
    """
    with open(path_to_rel_scores, 'r') as rel_file:
        rows = (raw.split() for raw in rel_file)
        return {fields[0]: fields[2] for fields in rows}
# Module-level relevance mapping; recompute_scores looks this name up as a
# global, so this cell must run before any evaluation cell.
relevance_scores = load_rel_scores('../data_2017-09/queries/relevance_scores.txt')

In [4]:
# label the top-k retrieved docs of every query as cluster hits (1) or misses (0)
def recompute_scores(path_to_run, clusters, rel_scores=None, k=10):
    """Mark each query's first ``k`` retrieved documents as cluster hits or misses.

    Every non-blank line of the run file is split on whitespace; field 0 is
    used as the query id and field 2 as the retrieved document id. A retrieved
    document counts as a hit (1) when its cluster equals the cluster of the
    query's ground-truth document, otherwise as a miss (0). Lines are consumed
    in file order, so the run file is assumed to list each query's documents
    best-first -- TODO confirm against the run writer.

    Args:
        path_to_run: path of the run file to read.
        clusters: dict mapping document id -> cluster id. Must contain both
            the ground-truth documents and every retrieved document that is
            scored.
        rel_scores: dict mapping query id -> ground-truth document id. When
            omitted, the module-level ``relevance_scores`` is used, which is
            what the function always did before this parameter existed.
        k: maximum number of documents scored per query (default 10).

    Returns:
        dict mapping query id -> list of 0/1 flags, at most ``k`` long, in
        the order the documents appear in the file.

    Raises:
        KeyError: if a query id is missing from ``rel_scores`` or a document
            id is missing from ``clusters``.
        IndexError: if a non-blank line has fewer than three fields.
    """
    if rel_scores is None:
        # Backward-compatible default: fall back to the notebook-level global.
        rel_scores = relevance_scores

    scores = {}
    with open(path_to_run, "r") as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                continue  # tolerate blank lines (e.g. a trailing newline)

            query_id = split_line[0]
            doc_id = split_line[2]

            hits = scores.setdefault(query_id, [])
            if len(hits) >= k:
                continue  # this query already has its top-k; skip the lookups

            ground_truth_cluster = clusters[rel_scores[query_id]]
            hits.append(1 if clusters[doc_id] == ground_truth_cluster else 0)

    return scores

In [5]:
def calc_recall_clusters(scores):
    """Return the mean reciprocal rank of the first hit across all queries.

    NOTE: despite the name, this is not recall. For each query the 1-based
    position of the first ``1`` in its flag list contributes ``1 / position``;
    a query with no ``1`` contributes 0. The contributions are averaged over
    the number of queries.

    Args:
        scores: dict mapping query id -> list of 0/1 flags in ranked order
            (the structure returned by ``recompute_scores``).

    Returns:
        float in [0, 1]. An empty ``scores`` dict yields 0.0 instead of the
        ZeroDivisionError the previous implementation raised.
    """
    if not scores:
        return 0.0

    total_score = 0.0
    for hits in scores.values():
        for rank, hit in enumerate(hits, start=1):
            if hit == 1:
                total_score += 1 / rank
                break  # only the first hit counts for reciprocal rank
    return total_score / len(scores)

In [5]:
# BM25, "full" runs: score the val run file, then the test run file (split
# names taken from the filenames). The variable ends up holding the second result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25_runs/v2_run.val_8_0.99.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25_runs/v2_run.test_8_0.99.txt",
):
    bm25_full_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(bm25_full_recompute))

0.29459857275373885
0.2902240415476881


In [6]:
# BM25, "onlylast" runs: score the val run file, then the test run file.
# The variable ends up holding the second (test) result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25_runs/v2_run.onlylast.val_4_0.9.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25_runs/v2_run.onlylast.test_4_0.9.txt",
):
    bm25_onlylast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(bm25_onlylast_recompute))

0.3488577109347195
0.3437141451177583


In [7]:
# BM25, "removelast" runs: score the val run file, then the test run file.
# The variable ends up holding the second (test) result.
for run_path in (
    "../out/bm25_runs/run.removelast.val_7_0.99.txt",
    "../out/bm25_runs/run.removelast.test_7_0.99.txt",
):
    bm25_removelast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(bm25_removelast_recompute))

0.13509853200539418
0.1326275443142552


In [8]:
# Fine-tuned bi-encoder, "full" runs: score the val run file, then the test one.
# NOTE(review): the val path is "run.val_full.txt" while the test path is
# "v2_run.test_full.txt" -- confirm the two files come from the same run version.
for run_path in (
    "../out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries-2022-06-01_12-04-50/eval/run.val_full.txt",
    "../out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries-2022-06-01_12-04-50/eval/v2_run.test_full.txt",
):
    semanticfinetune_full_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(semanticfinetune_full_recompute))

0.33179860904215636
0.3253665866197834


In [9]:
# Fine-tuned bi-encoder, "onlylast" runs: score the val run file, then the test one.
# NOTE(review): the val path is "run.val_onlylast.txt" while the test path is
# "v2_run.test_onlylast.txt" -- confirm both come from the same run version.
for run_path in (
    "../out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_onlylast-2022-06-02_21-24-49/eval/run.val_onlylast.txt",
    "../out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_onlylast-2022-06-02_21-24-49/eval/v2_run.test_onlylast.txt",
):
    semanticfinetune_onlylast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(semanticfinetune_onlylast_recompute))

0.29112258346160885
0.2906473180130481


In [10]:
# Fine-tuned bi-encoder, "removelast" runs: score the val run file, then the
# test one. The variable ends up holding the second (test) result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_removelast-2022-05-22_18-44-28/eval/run.val_removelast.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_removelast-2022-05-22_18-44-28/eval/run.test_removelast.txt",
):
    semanticfinetune_removelast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(semanticfinetune_removelast_recompute))

0.09988152651571557
0.0993271481762529


In [12]:
# BM25+RM3, "full" runs: score the val run file, then the test run file.
# The variable ends up holding the second (test) result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25rm3_runs/v2_run.val_8_0.99_0.9_1_10.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25rm3_runs/v2_run.test_8_0.99_0.9_1_10.txt",
):
    bm25_rm3_full_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(bm25_rm3_full_recompute))

0.2770709511291541
0.27309466026998597


In [13]:
# BM25+RM3, "onlylast" runs: score the val run file, then the test run file.
# The variable ends up holding the second (test) result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25rm3_runs/v2_run.onlylast.val_4_0.9_0.9_1_10.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25rm3_runs/v2_run.onlylast.test_4_0.9_0.9_1_10.txt",
):
    bm25_rm3_onlylast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(bm25_rm3_onlylast_recompute))

0.3154231091093334
0.3127130538722119


In [14]:
# BM25+RM3, "removelast" runs: score the val run file, then the test run file.
# The variable ends up holding the second (test) result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25rm3_runs/run.removelast.val_7_0.99_0.9_1_10.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/bm25rm3_runs/run.removelast.test_7_0.99_0.9_1_10.txt",
):
    bm25_rm3_removelast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(bm25_rm3_removelast_recompute))

0.12748888578363587
0.12462012338504888


In [15]:
# Semantic runs (msmarco-distilbert-cos-v5), "full": score the val run file,
# then the test run file. The variable ends up holding the second result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/semantic_runs/v2_run.val_full_msmarco-distilbert-cos-v5.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/semantic_runs/v2_run.test_full_msmarco-distilbert-cos-v5.txt",
):
    semantic_full_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(semantic_full_recompute))

0.2333481780790181
0.23273563606044495


In [16]:
# Semantic runs (msmarco-distilbert-cos-v5), "onlylast": score the val run
# file, then the test run file. The variable ends up holding the second result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/semantic_runs/v2_run.val_onlylast_msmarco-distilbert-cos-v5.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/semantic_runs/v2_run.test_onlylast_msmarco-distilbert-cos-v5.txt",
):
    semantic_onlylast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(semantic_onlylast_recompute))

0.23833332298856394
0.24086232040196318


In [17]:
# Semantic runs (msmarco-distilbert-cos-v5), "removelast": score the val run
# file, then the test run file. The variable ends up holding the second result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/semantic_runs/run.val_removelast_msmarco-distilbert-cos-v5.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/semantic_runs/run.test_removelast_msmarco-distilbert-cos-v5.txt",
):
    semantic_removelast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(semantic_removelast_recompute))

0.10204345424301088
0.10264969235557457


In [18]:
# Interpolated runs, "full": score full.txt (presumably the validation split --
# TODO confirm), then test_full.txt. The variable ends up holding the second result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/interpolated/full.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/interpolated/test_full.txt",
):
    interpolated_full_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(interpolated_full_recompute))

0.36683246747604037
0.35944032759889544


In [19]:
# Interpolated runs, "onlylast": score onlylast.txt (presumably the validation
# split -- TODO confirm), then test_onlylast.txt. The variable ends up holding
# the second result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/interpolated/onlylast.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/interpolated/test_onlylast.txt",
):
    interpolated_onlylast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(interpolated_onlylast_recompute))

0.36562267242663393
0.36868280199482223


In [20]:
# Interpolated runs, "removelast": score removelast.txt (presumably the
# validation split -- TODO confirm), then test_removelast.txt. The variable
# ends up holding the second result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/interpolated/removelast.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/out/interpolated/test_removelast.txt",
):
    interpolated_removelast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(interpolated_removelast_recompute))

0.14511509591671176
0.1442418550474812


In [6]:
# Contriever outputs, "full": score val_full.txt, then test_full.txt.
# The variable ends up holding the second (test) result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/semantic_finetune_v2/contriever-main/out/val_full.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/semantic_finetune_v2/contriever-main/out/test_full.txt",
):
    contriver_full_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(contriver_full_recompute))

0.18717569144446128
0.18844683544172047


In [7]:
# Contriever outputs, "onlylast": score val_onlylast.txt, then test_onlylast.txt.
# The variable ends up holding the second (test) result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/semantic_finetune_v2/contriever-main/out/val_onlylast.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/semantic_finetune_v2/contriever-main/out/test_onlylast.txt",
):
    contriver_onlylast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(contriver_onlylast_recompute))

0.23122118981412593
0.23197919821960802


In [8]:
# Contriever outputs, "removelast": score val_removelast.txt, then
# test_removelast.txt. The variable ends up holding the second (test) result.
for run_path in (
    "/home/kjros2/contextualSearch/contextAndConnections/src/semantic_finetune_v2/contriever-main/out/val_removelast.txt",
    "/home/kjros2/contextualSearch/contextAndConnections/src/semantic_finetune_v2/contriever-main/out/test_removelast.txt",
):
    contriver_removelast_recompute = recompute_scores(run_path, id_cluster_map)
    print(calc_recall_clusters(contriver_removelast_recompute))

0.07805783657745335
0.08310110368933904
