In [11]:
import yaml
import os
import pyterrier as pt
import pandas as pd

In [12]:
BASE_PATH = "datasets/LongEval-Web"

with open(BASE_PATH + "/metadata.yml", "r") as yamlfile:
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)

In [13]:
dataset = "longeval-web"
language = "fr"
sub_collection = "2023-08"

In [14]:
index_path = os.path.join(".", BASE_PATH, f"index/{dataset}-{language}-{sub_collection}-pyterrier")
#topics_path = os.path.join(BASE_PATH, "release_2025_p1/French/queries.txt")
topics_path = os.path.join(BASE_PATH, "LongEval Test Collection/queries/2023-08_queries.txt")

In [15]:
#Preprocessing
topics = pd.read_csv("2023-08_categorized_queries.csv")
topics["qid"] = topics["qid"].astype(str)

topics["query"] = topics["query"].str.replace("'", "")
topics["query"] = topics["query"].str.replace("*", "")
topics["query"] = topics["query"].str.replace("/", "")
topics["query"] = topics["query"].str.replace(":", "")
topics["query"] = topics["query"].str.replace("?", "")
topics["query"] = topics["query"].str.replace(")", "")
topics["query"] = topics["query"].str.replace("(", "")
topics["query"] = topics["query"].str.replace("+", "")

spam = ["59769", "6060", "75200", "74351", "67599", "74238", "74207", "75100", "58130", "62893", "75177"]
topics = topics[~topics["qid"].isin(spam)]

#Save cleaned dataframe back to csv (overwrite)
topics.to_csv("2023-08_categorized_queries.csv", index=False)

In [10]:
topics

Unnamed: 0,qid,query,Answer
0,3,1ere guerre mondiale,event
1,8,4 mariages 1 enterrement,event
2,12,abri voiture,time-independent
3,19,activ crea pole emploi,time-independent
4,20,active projet pole emploi,timeliness
...,...,...,...
14529,75390,jean-luc moreau,time-independent
14530,75391,autrans-meaudre,time-independent
14531,75404,lcl - mon espace,time-independent
14532,75405,connexion-emploi,time-independent


In [16]:
import pandas as pd
import os
import pyterrier as pt
from sqlalchemy import create_engine

#Initialize pyterrier
if not pt.started():
    pt.init()

#Database connection
DATABASE = "longeval-web"
USER = "dis18"
HOST = "db"
PORT = "5432"
PASSWORD = "dis182425"

#Create engine for PostgreSQL
engine = create_engine(f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}")

#Build dynamic WHERE clause to include sub_collections <= current
sub_year, sub_month = map(int, sub_collection.split("-"))

where_clause = f"""
WHERE (substring(d.sub_collection from 1 for 4)::int < {sub_year}
       OR (substring(d.sub_collection from 1 for 4)::int = {sub_year}
           AND substring(d.sub_collection from 6 for 2)::int <= {sub_month}))
"""

# --- QUERY UNIQUE DATES FOR DOCUMENTS ---
query = f"""
SELECT d.docid, COUNT(DISTINCT j.date) AS unique_dates_count
FROM "Document" d,
LATERAL jsonb_array_elements_text(d.date::jsonb) AS j(date)
{where_clause}
GROUP BY d.docid;
"""

#Fetch result as DataFrame
df_dates = pd.read_sql(query, engine)
df_dates["docid"] = df_dates["docid"].astype(str)  # Ensure docid is string

#Load queries from processed_queries.csv
queries_path = "2023-08_categorized_queries.csv"

queries = pd.read_csv(queries_path, sep=",")  # Ensure proper separator

#Rename columns for compatibility
queries.rename(columns={"Prompt": "query", "Answer": "category"}, inplace=True)
queries["qid"] = queries["qid"].astype(str)  # Use existing qid values

#Setup Index and BM25
index_path = os.path.join(".", BASE_PATH, f"index/{dataset}-{language}-{sub_collection}-pyterrier")
index = pt.IndexFactory.of(index_path)
BM25 = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)

#BM25 run
#run = BM25.transform(queries.head(100))
run = BM25.transform(queries)

#Boost documents with unique dates <= 3
boosted_docs = set(df_dates[df_dates["unique_dates_count"] <= 3]["docid"])

#Merge query categories into run results
run = run.merge(queries[["qid", "category"]], on="qid", how="left")

#Rename and drop unnecessary columns
run = run.drop(columns=["category_x"]).rename(columns={"category_y": "category"})

  if not pt.started():


01:19:42.245 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 3 GiB of memory would be required.


  BM25 = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)
TerrierRetr(BM25): 100%|██████████| 12938/12938 [21:22<00:00, 10.09q/s]


In [18]:
run

Unnamed: 0,qid,docid,docno,rank,score,query,category
0,3,81943,684186,0,25.872507,1ere guerre mondiale,event
1,3,1773253,2911357,1,25.872507,1ere guerre mondiale,event
2,3,1841598,1686452,2,25.678216,1ere guerre mondiale,event
3,3,338058,430968,3,25.430486,1ere guerre mondiale,event
4,3,1115047,658038,4,25.393302,1ere guerre mondiale,event
...,...,...,...,...,...,...,...
12444280,75425,201725,1844980,995,10.460378,petit chauffe-eau,time-independent
12444281,75425,110879,1851209,996,10.459850,petit chauffe-eau,time-independent
12444282,75425,290936,460101,997,10.456947,petit chauffe-eau,time-independent
12444283,75425,1066583,2068794,998,10.453392,petit chauffe-eau,time-independent


In [17]:
import numpy as np

#Remove 'doc' prefix from boosted_docs docids
boosted_docs = {docid[3:] if docid.startswith("doc") else docid for docid in boosted_docs}

#Define fixed boost values per category
category_boosts = {
    "time-independent": 1.20,
    "explicit-time": 1.15,
    "event": 1.15,
    "timeliness": 1.20
}

#Make a copy to avoid modifying original
run_boosted = run.copy()

run_boosted['docid'] = run_boosted['docid'].astype(str)

def get_boost(row):
    if row['docid'] in boosted_docs:
        # Apply category-specific boost
        return category_boosts.get(row['category'], 1.0)
    else:
        # No boost if docid not in boosted_docs
        return 1.0

run_boosted['boost'] = run_boosted.apply(get_boost, axis=1)

#Update score
run_boosted['score'] = run_boosted['score'] * run_boosted['boost']

#Rerank based on updated scores
run_boosted = run_boosted.sort_values(by=["qid", "score"], ascending=[True, False])
run_boosted["rank"] = run_boosted.groupby("qid").cumcount()

In [19]:
run_boosted

Unnamed: 0,qid,docid,docno,rank,score,query,category,boost
45073,100,928122,2971472,0,31.325900,appli pole emploi,time-independent,1.2
45075,100,1402264,2945156,1,31.325900,appli pole emploi,time-independent,1.2
45072,100,1655099,1694624,2,28.205372,appli pole emploi,time-independent,1.0
45086,100,847023,8661,3,27.678768,appli pole emploi,time-independent,1.2
45087,100,1224416,306,4,27.670013,appli pole emploi,time-independent,1.2
...,...,...,...,...,...,...,...,...
504479,996,553493,2107905,995,8.215766,ensp police,time-independent,1.0
504480,996,212941,2129988,996,8.215341,ensp police,time-independent,1.0
504482,996,7308,2093867,997,8.214912,ensp police,time-independent,1.0
504483,996,197906,2952635,998,8.214633,ensp police,time-independent,1.0


In [20]:
#Add 'Q0' placeholder column
run_boosted['placeholder'] = 'Q0'

#Set run name for identification
run_boosted['name'] = sub_collection

#Select and reorder columns to match TREC format
trec_run = run_boosted[['qid', 'placeholder', 'docno', 'rank', 'score', 'name']]


In [21]:
trec_run

Unnamed: 0,qid,placeholder,docno,rank,score,name
45073,100,Q0,2971472,0,31.325900,2023-08
45075,100,Q0,2945156,1,31.325900,2023-08
45072,100,Q0,1694624,2,28.205372,2023-08
45086,100,Q0,8661,3,27.678768,2023-08
45087,100,Q0,306,4,27.670013,2023-08
...,...,...,...,...,...,...
504479,996,Q0,2107905,995,8.215766,2023-08
504480,996,Q0,2129988,996,8.215341,2023-08
504482,996,Q0,2093867,997,8.214912,2023-08
504483,996,Q0,2952635,998,8.214633,2023-08


In [22]:
#Make copy to avoid warning when casting
trec_run = trec_run.copy()

#Cast columns to sort
trec_run["qid"] = trec_run["qid"].astype(int)
trec_run["rank"] = trec_run["rank"].astype(int)

#Sort values
trec_run_sorted = trec_run.sort_values(by=["qid", "rank"])

#Save to gzip-compressed TREC file
trec_run_sorted.to_csv(
    "run.txt.gz",
    sep=" ",
    index=False,
    header=False,
    compression="gzip"
)

In [23]:
trec_run_sorted

Unnamed: 0,qid,placeholder,docno,rank,score,name
0,3,Q0,684186,0,29.753383,2023-08
6,3,Q0,160081,1,28.964729,2023-08
9,3,Q0,188336,2,28.909960,2023-08
19,3,Q0,1939002,3,28.709618,2023-08
24,3,Q0,2218042,4,28.666875,2023-08
...,...,...,...,...,...,...
12444278,75425,Q0,1858280,995,10.465972,2023-08
12444279,75425,Q0,819125,996,10.461711,2023-08
12444280,75425,Q0,1844980,997,10.460378,2023-08
12444281,75425,Q0,1851209,998,10.459850,2023-08
