In [1]:
import pandas as pd
import pyterrier as pt
import yaml
import os
from src.load_index import load_index, load_topics, load_qrels, tag
from src.extend_runs import extend_run_full
import sqlite3
from repro_eval.Evaluator import RpdEvaluator
import pytrec_eval
import numpy as np

from repro_eval.util import arp, arp_scores

# Boot the Terrier JVM only once per kernel. The terrier-prf package is added
# at start-up; presumably it backs the pt.rewrite.RM3 stage used further down
# (confirm against the PyTerrier documentation).
terrier_already_running = pt.started()
if not terrier_already_running:
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

  from .autonotebook import tqdm as notebook_tqdm
PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Read the collection metadata. A later cell uses
# config["subcollections"][...]["qrels"][...] as a path relative to base_path.
# NOTE(review): yaml.FullLoader can construct more than plain data types; that
# is acceptable for a trusted local file, but yaml.safe_load would be the safer
# choice if this metadata ever comes from elsewhere.
with open("data/LongEval/metadata.yml", mode="r") as metadata_file:
    config = yaml.load(metadata_file, Loader=yaml.FullLoader)

# Locations shared by the cells below.
base_path = "data"
results_path = "data/results/relevance_feedback/"

# Index and "test" topics for "t3" (presumably the t3 sub-collection; see
# src.load_index for the exact semantics).
index = load_index("t3")
topics = load_topics("t3", "test")

>>> Loaded index with  2049729 documents.


# Relevance Feedback

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_top_terms(texts, top_n=10, query=None):
    """Return the ``top_n`` terms with the largest summed TF-IDF weight.

    A ``TfidfVectorizer`` (English stop words removed, vocabulary capped at
    10000 features) is fitted on ``texts``. Each term's weights are summed
    over all documents and the terms are ranked by that sum.

    Args:
        texts: Iterable of document strings.
        top_n: Maximum number of terms to return.
        query: Unused; kept so existing callers keep working.

    Returns:
        A list of ``(term, summed_weight)`` tuples in descending weight order.
        Terms with equal weight keep the vectorizer's feature order. The list
        is empty when ``texts`` is empty.
    """
    texts = list(texts)
    if not texts:
        # Fitting the vectorizer on zero documents raises ValueError; with
        # nothing to rank, an empty result is the natural answer.
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()

    # Flatten the column sums so this works whether the sparse container
    # returns a 2-D matrix or a 1-D array.
    weights = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

    ranked = sorted(zip(feature_names, weights), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [4]:
# Shared SQLite connection; the cells below run their topic / qrel / document
# queries through it.
conn = sqlite3.connect("data/database.db")

In [5]:
# Sub-collections whose topics are matched against the current ones when
# looking for earlier relevance judgements.
history = ["t2", "t1", "t0"]
# Number of expansion terms appended to each query.
nterms = 20

# Name of the run file, later joined onto results_path. It must stay relative:
# os.path.join() discards every preceding component when a later one starts
# with "/", which would send the run file to the filesystem root.
# The history tag is built outside the f-string so the cell also parses on
# Python versions older than 3.12 (which reject reused quotes inside f-strings).
history_tag = "".join(history)
filename = f"CIR_BM25_D-t3_T-t3_rrrf{history_tag}-k{nterms}"

# Build the query map: pair every topic with the topics of the history
# sub-collections that share the same French query text (self-join of `topic`
# on text_fr). One "?" placeholder is generated per history entry.
history_placeholders = ",".join("?" * len(history))
query = """SELECT topic.queryid as qid_1, T2.queryid as qid_2 from topic
JOIN topic as T2
ON topic.text_fr = T2.text_fr
WHERE T2.sub_collection IN (%s)""" % history_placeholders

# One row per (qid_1, qid_2) match. No de-duplication happens here; the frame
# is consumed per topic by the expansion loop.
query_map = pd.read_sql_query(sql=query, con=conn, params=history)

# Split the topics into two groups:
#   * extended_topics -- dicts whose "query" is expanded with terms taken from
#     documents judged relevant for the same query text in the history
#     sub-collections (the unexpanded text is kept as "query_original");
#   * new_topics -- qids with no matching historical topic or no relevant
#     documents, which keep their original query.
new_topics = []
extended_topics = []

# Group the matched historical qids per topic once, instead of re-filtering the
# whole query_map frame for every topic. dict.fromkeys removes duplicates while
# keeping the original order.
similar_qids_by_topic = {
    qid: list(dict.fromkeys(matches.tolist()))
    for qid, matches in query_map.groupby("qid_1")["qid_2"]
}

for qid, query_text in zip(topics["qid"], topics["query"]):
    # Historical queries that may contribute expansion terms.
    queries_to_extend = similar_qids_by_topic.get(qid, [])

    if len(queries_to_extend) == 0:
        new_topics.append(qid)
        print("No similar topics found", qid, query_text)
        continue

    # Documents judged relevant for any of the matched historical queries.
    rel_docs_sql = """SELECT url, text_en
    FROM qrel
    JOIN document ON qrel.docid = document.docid
    WHERE queryid IN (%s)
    AND relevance > 0""" % ",".join("?" * len(queries_to_extend))
    rel_docs = pd.read_sql_query(rel_docs_sql, conn, params=queries_to_extend)

    # Drop rows without text first (a NULL text would reach the vectorizer as
    # NaN and make it fail), then keep one document per URL and flatten
    # newlines.
    texts = (
        rel_docs.dropna(subset=["text_en"])
        .drop_duplicates(subset="url")["text_en"]
        .str.replace("\n", " ", regex=False)
        .tolist()
    )
    if len(texts) == 0:
        new_topics.append(qid)
        print("No relevant docs found", qid, query_text)
        continue

    # Append the highest-weighted TF-IDF terms to the original query.
    extension_terms = [term for term, _ in extract_top_terms(texts, top_n=nterms)]
    extended_topics.append(
        {
            "qid": qid,
            "query": query_text + " " + " ".join(extension_terms),
            "query_original": query_text,
        }
    )
    

No similar topics found q012318 case over the border
No similar topics found q012396 water atlantic
No similar topics found q0123180 blanquette de veau recipe
No similar topics found q0123240 gift woman
No similar topics found q0123387 government
No similar topics found q0123626 turkic flag
No similar topics found q0123805 veal filet mignon
No similar topics found q0123836 potato patty
No similar topics found q0123854 gateau coing
No similar topics found q0123855 gateau creusois
No similar topics found q0123863 coconut gateau
No similar topics found q0123873 gateau semoule
No similar topics found q0123929 chocolate cake
No similar topics found q01231174 climate and resilience law
No similar topics found q01231206 veil law
No similar topics found q01231281 mackerel with white wine
No similar topics found q01231511 atopic skin
No similar topics found q01231718 recipe for veal
No similar topics found q01231727 apple gateau recipe
No similar topics found q01231739 recipe saut ed potato
No 

In [6]:
# Turn the list of unexpanded qids into the matching rows of `topics`.
# The name is rebound from a list to a DataFrame, so guard against re-running
# this cell: a second run would hand a DataFrame to isin() instead of the qid
# list and would no longer select the intended rows.
if not isinstance(new_topics, pd.DataFrame):
    new_topics = topics[topics["qid"].isin(new_topics)]

In [7]:
# Convert the collected topic dicts into a DataFrame (one row per expanded
# topic, columns taken from the dict keys) so it can be passed to the retriever.
extended_topics = pd.DataFrame(extended_topics)

In [8]:
# BM25 retriever over the loaded index (verbose=True shows a progress bar).
BM25 = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)
# Pseudo-relevance feedback pipeline: first-pass BM25 -> RM3 query rewriting -> second-pass BM25.
rm3_pipe = BM25 >> pt.rewrite.RM3(index) >> BM25


# Topics expanded from historical judgements are retrieved with plain BM25 ...
run_with_feedback = BM25.transform(extended_topics)
# ... while topics without usable history go through the RM3 pipeline instead.
run_with_pseudo_feedback = rm3_pipe.transform(new_topics)


BR(BM25): 100%|██████████| 193/193 [03:18<00:00,  1.03s/q]
BR(BM25): 100%|██████████| 406/406 [04:08<00:00,  1.63q/s]
BR(BM25): 100%|██████████| 405/405 [04:38<00:00,  1.45q/s]


In [9]:
# Stack both partial runs into one result frame. The two topic sets are
# disjoint by construction (expanded vs. not expanded); the row index is not reset.
merged_run = pd.concat([run_with_feedback, run_with_pseudo_feedback])

In [10]:
# Write the merged run to disk. The target directory is created first so that
# a missing results folder does not fail the cell after the long retrieval
# step above.
run_path = os.path.join(results_path, filename)
run_dir = os.path.dirname(run_path)
if run_dir:
    os.makedirs(run_dir, exist_ok=True)
pt.io.write_results(merged_run, run_path)

NameError: name 'filename' is not defined

In [None]:
# Relevance Feedback old
# Score the run file named by `filename` against the t3 test qrels and print
# one table row: history | nterms | P_10 | bpref | ndcg.
base_path = "data"
qrels_path = os.path.join(base_path, config["subcollections"]["t3"]["qrels"]["test"])
with open(qrels_path, "r") as f_qrels:
    qrels = pytrec_eval.parse_qrel(f_qrels)

# Evaluate every measure pytrec_eval supports; only three are reported below.
evaluator = pytrec_eval.RelevanceEvaluator(qrels, pytrec_eval.supported_measures)

with open(os.path.join(results_path, filename)) as run_reranked:
    run = pytrec_eval.parse_run(run_reranked)

scores = evaluator.evaluate(run)
# Aggregate the per-topic scores once (arp_scores comes from repro_eval;
# presumably it averages over topics) instead of once per reported measure.
mean_scores = arp_scores(scores)

# The nterms column previously carried a stray ", " prefix (", 20"); it now
# holds just the number.
row = [", ".join(history), str(nterms)] + [
    str(round(mean_scores[measure], 4)) for measure in ("P_10", "bpref", "ndcg")
]
print("| " + " | ".join(row) + " |")

| t2, t1, t0 | 0.1758 | 0.4819 | 0.3955 |


In [None]:
| t2, t1, t0 | 0.1756 | 0.4698 | 0.3899 |

In [16]:
# MonoT5
# Report P_10, bpref and ndcg of the BM25+monoT5 run as one table row. Relies
# on `evaluator`, `history` and `arp_scores` from earlier cells.
with open("CIR_BM25+monoT5_D-t3_T-t3") as run_reranked:
    run = pytrec_eval.parse_run(run_reranked)
    scores = evaluator.evaluate(run)
    row_cells = [", ".join(history)]
    for measure in ("P_10", "bpref", "ndcg"):
        row_cells.append(str(round(arp_scores(scores)[measure], 4)))
    print("| " + " | ".join(row_cells) + " |")

| t2, t1, t0 | 0.1776 | 0.4571 | 0.3839 |


In [19]:
# RF new
# Report P_10, bpref and ndcg of the relevance-feedback run as one table row.
# Relies on `evaluator`, `history`, `results_path` and `arp_scores` from
# earlier cells.
rf_run_path = results_path + "CIR_BM25_D-t3_T-t3_rf" + "".join(history)
with open(rf_run_path) as run_reranked:
    run = pytrec_eval.parse_run(run_reranked)
    scores = evaluator.evaluate(run)
    row_cells = [", ".join(history)]
    for measure in ("P_10", "bpref", "ndcg"):
        row_cells.append(str(round(arp_scores(scores)[measure], 4)))
    print("| " + " | ".join(row_cells) + " |")

| t2, t1, t0 | 0.1758 | 0.4819 | 0.3955 |
