# This part of the code extracts only the highly relevant query–document pairs (relevance = 2) from the original qrels files and stores them in a separate file.

- This part of the code should be run separately for each qrels file.
- After that you should save all the files obtained with this function in the qrels_r2 folder.

In [1]:
# Function to extract only highly relevant documents (relevance = 2) from the original qrels
def save_only_relevance_2(input_file, output_file='qrels_r2_23-02.txt'):
    """Copy every qrels line whose relevance label equals 2 into ``output_file``.

    The relevance label is taken to be the last whitespace-separated field of a
    line. Blank lines and lines whose last field is not an integer are skipped.

    Args:
        input_file: Path of the original qrels file (read as UTF-8).
        output_file: Path of the filtered file; it is created or overwritten.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            fields = line.split()
            if not fields:
                # Blank line: there is no last field to inspect
                continue

            try:
                label = int(fields[-1])
            except ValueError:
                # Malformed relevance label: skip the line
                continue

            if label == 2:
                # Write the line unchanged so the qrels format is preserved
                outfile.write(line)

# Usage of the function:
# NOTE: no output_file is passed, so the result goes to the default 'qrels_r2_23-02.txt'
# (matching this 2023-02 input). Pass output_file explicitly for the qrels of another
# period, otherwise the same file is overwritten.
save_only_relevance_2('datasets/LongEval-Web/release_2025_p1/French/LongEval Train Collection/qrels/2023-02_fr/qrels_processed.txt')

# This part of the code starts and loads the index

In [2]:
# Start index
import yaml
import pandas as pd
import os

import pyterrier as pt
# Start the Java side of PyTerrier unless it is already running
if not pt.java.started():
    pt.java.init()

# Path to Index
index_path = os.path.abspath("datasets/LongEval-Web/index/longeval-web-fr-2022-12-pyterrier")
index = pt.IndexFactory.of(index_path)

# Print the collection statistics: the "Number of documents" value in this output is the
# collection size that the IDF calculation needs
print(index.getCollectionStatistics().toString())

# Access to index components (module-level names used by the later cells)
lex = index.getLexicon()        # term <-> lexicon entry lookups
di = index.getDirectIndex()     # postings of a document
doi = index.getDocumentIndex()  # document entries by internal docid
inv = index.getInvertedIndex()  # postings of a term
meta = index.getMetaIndex()     # docno <-> internal docid lookups

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


07:07:34.666 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 4.4 GiB of memory would be required.
Number of documents: 2046080
Number of terms: 5355232
Number of postings: 672184530
Number of fields: 0
Number of tokens: 1693990317
Field names: []
Positions:   false



# Part 1: Query & Qrels Preprocessing Pipeline

- This script loads and preprocesses query and qrels data from the LongEval French collection.
- It merges queries with relevant document information and prepares them for retrieval experiments.
- The preprocessing includes cleaning query strings and mapping document IDs to internal index IDs using the index's metadata (the `meta` object loaded above).
- Parts 1 to 7 must be run separately for each time period.

In [3]:
import pandas as pd
import re
import numpy as np

# Load queries from a tab-separated .txt file into a DataFrame
def load_queries(
    queries_path="datasets/LongEval-Web/release_2025_p1/French/LongEval Train Collection/queries/2022-12_queries.txt",
):
    """Read a ``qid<TAB>query`` file and return it as a DataFrame.

    Args:
        queries_path: Path of the query file. Defaults to the 2022-12 training
            queries, so existing ``load_queries()`` calls keep working; pass
            another path when processing a different period.

    Returns:
        A DataFrame with the columns ``qid`` and ``query`` (both strings), one
        row per well-formed line in file order. Lines without a tab (e.g. blank
        lines) are skipped. The columns exist even if the file has no usable line.
    """
    rows = []
    # Read explicitly as UTF-8 (like the other text files in this notebook) instead of
    # relying on the platform's default encoding.
    with open(queries_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 2:
                # No query text on this line
                continue
            rows.append({"qid": fields[0], "query": fields[1]})
    return pd.DataFrame(rows, columns=["qid", "query"])

# Read the relevance-2 qrels file; the second column is not needed and is dropped right away
qrels_path = "qrels_r2/qrels_r2_22-12.txt"
qrels = pd.read_csv(
    qrels_path,
    sep=" ",
    names=["qid", "unused", "docid", "relevance"],
    dtype={"qid": str},
).drop(columns=["unused"])

# Query texts keyed by qid
queries = load_queries()

# Left join: every qrels row is kept and receives the query text with the same qid (if any)
merged_qrels = qrels.merge(queries, on="qid", how="left")

# Query cleaning: strip punctuation and other special characters
def clean_query(query):
    """Return ``str(query)`` without any character that is neither a word character nor whitespace."""
    text = str(query)  # non-string input is converted to its string form first
    special_chars = r"[^\w\s]"
    return re.sub(special_chars, "", text)

# Clean the query text. Missing queries (qrels rows without a matching query) stay NaN
# instead of being turned into the string "nan", so they can be dropped below without
# also dropping a genuine query whose text is "nan".
merged_qrels["query"] = merged_qrels["query"].map(clean_query, na_action="ignore")

# Add document ID prefix expected by the index
merged_qrels["docid_qrels"] = "doc" + merged_qrels["docid"].astype(str)

# Map document IDs to internal index IDs using a metadata object (assumed global)
def get_docid_index(docno):
    """Return the internal docid for ``docno``, or -1 when it cannot be resolved.

    A failing lookup is reported as -1 — the same value the filter below removes —
    rather than ``None``, which that filter would let through.
    """
    try:
        return meta.getDocument("docno", docno)
    except Exception:
        # Best effort: treat a failing lookup like an unknown document
        return -1

merged_qrels["docid_index"] = merged_qrels["docid_qrels"].apply(get_docid_index)

# Reorganize columns for clarity (copy, so later changes do not touch merged_qrels)
final_qrels = merged_qrels[["qid", "docid_qrels", "docid_index", "relevance", "query"]].copy()

# Clean: remove rows with missing queries or document index lookup failures
final_qrels_cleaned = final_qrels.dropna(subset=["query"])
final_qrels_cleaned = final_qrels_cleaned[final_qrels_cleaned["docid_index"] != -1]

# Reset index before further processing
final_qrels_cleaned = final_qrels_cleaned.reset_index(drop=True)

# Part 2: Term Frequency Calculation for Documents

- This part of the code computes the term frequency for each term in every document listed in the `final_qrels_cleaned` DataFrame.

In [4]:
# Function to compute normalized term frequencies for a given document
def total_terms_doc_berechnen(docid):
    """Return ``{term: tf}`` for the document with internal id ``docid``.

    ``tf`` is the frequency of the term in the document divided by the sum of
    all term frequencies of that document. An empty dict is returned when the
    document has no postings (total frequency 0).
    """
    # Read the document's postings from the direct index once. Plain (termid, frequency)
    # pairs are kept rather than the posting objects themselves, in case the iterator
    # reuses its posting instance — TODO confirm.
    postings = [
        (posting.getId(), posting.getFrequency())
        for posting in di.getPostings(doi.getDocumentEntry(docid))
    ]

    total_terms_doc = sum(freq for _, freq in postings)
    if total_terms_doc == 0:
        return {}  # No terms found

    # Resolve each term id to its term string and normalize its frequency
    return {
        lex.getLexiconEntry(termid).getKey(): freq / total_terms_doc
        for termid, freq in postings
    }

# Get only the column containing document index IDs
docid_column = final_qrels_cleaned[["docid_index"]]

# Dictionary to store TF results for each document
tf_results = {}

# Compute term frequencies once per document: a document listed for several queries
# would otherwise be read from the index again only to store the same result.
for docid in docid_column["docid_index"]:
    if docid not in tf_results:
        tf_results[docid] = total_terms_doc_berechnen(docid)

# Extracting Unique Terms

- This code computes term frequencies (TF) for all terms in a document collection and extracts a list of unique terms found across all documents.
- Terms are then filtered to remove stopwords, short words, and non-alphabetic tokens.
- This is a preparation step for future document frequency (DF) computation.

In [5]:
import nltk
# nltk.download('stopwords')  # Uncomment if running for the first time
from nltk.corpus import stopwords

# Load stopwords for both French and English
# (needs the NLTK 'stopwords' corpus; see the commented-out nltk.download call above)
french_stops = stopwords.words('french')
engl_stops = stopwords.words('english')

# Function to collect the terms (with a TF value) that occur in the documents of a DataFrame
def tf_werte_unique(dataframe):
    """Return a dict whose keys are all terms found in the listed documents.

    Args:
        dataframe: DataFrame with a ``docid_index`` column of internal docids.
            (Previously this argument was ignored and the global
            ``final_qrels_cleaned`` was used instead.)

    Returns:
        ``{term: tf}``. When a term occurs in several documents, the value is
        the normalized TF from the last such document in iteration order; the
        callers in this notebook only use the keys.
    """
    tf_results_unique = {}  # term -> TF value of the last document containing it

    for docid in dataframe["docid_index"]:
        # Read the postings once and keep plain (termid, frequency) pairs
        postings = [
            (posting.getId(), posting.getFrequency())
            for posting in di.getPostings(doi.getDocumentEntry(docid))
        ]
        total_terms_doc = sum(freq for _, freq in postings)

        if total_terms_doc > 0:
            for termid, frequency_term_in_doc in postings:
                term = lex.getLexiconEntry(termid).getKey()
                tf_results_unique[term] = frequency_term_in_doc / total_terms_doc

    return tf_results_unique

# Call to extract all unique terms from the documents in final_qrels_cleaned
tf_results_unique = tf_werte_unique(final_qrels_cleaned)

def extract(tf_results_unique):
    """Return the keys (terms) of ``tf_results_unique`` as a list, in dict order."""
    return list(tf_results_unique)

unique = extract(tf_results_unique)

# Apply filtering: keep only long, alphabetic, non-stopword terms.
# Both stopword lists are combined into one set so each check is a single hash lookup
# instead of two linear list scans per term.
stop_terms = set(french_stops) | set(engl_stops)
unique_filtered = [
    term for term in unique
    if term.isalpha()
    and len(term) >= 5
    and term.lower() not in stop_terms
]

# Part 3: Document Frequency Calculation for Terms

- This function computes the document frequency (DF) for a set of prefiltered unique terms.
- The function uses the inverted index to iterate over the postings list for each term and collects unique document IDs.
- It returns a dictionary mapping terms to their DF values.

In [6]:
def calculate_df_values(unique_terms_filtered):
    """Return ``{term: document frequency}`` for the given terms.

    The document frequency is read from the term's lexicon entry. The earlier
    version walked the whole postings list and looked up the docno of every
    posting in the meta index, only to count the documents.

    Terms without a lexicon entry, or whose lookup raises, are reported on
    stdout and left out of the result.
    """
    df_results = {}

    for term in unique_terms_filtered:
        try:
            # Get lexicon entry for the term
            lex_entry = lex.getLexiconEntry(term)
            if lex_entry is None:
                print(f"No lexicon entry for '{term}'. Skipping...")
                continue

            # Number of documents containing the term, as recorded in the lexicon
            df_results[term] = int(lex_entry.getDocumentFrequency())

        except Exception as e:
            # Best effort: report the problem and continue with the next term
            print(f"Error processing term '{term}': {e}")
            continue

    return df_results

# Execute DF calculation
df_results = calculate_df_values(unique_filtered)

# Part 4: IDF Calculation

- This code computes the inverse document frequency (IDF) for a set of terms, based on the number of documents in which each term appears (DF). 
- The calculation uses a logarithmic formula with base 10 and applies smoothing to avoid division by zero.

In [7]:
import math

# Total number of documents in the collection, read from the loaded index so that it
# always matches the period being processed (2046080 for the 2022-12 index).
N = int(index.getCollectionStatistics().getNumberOfDocuments())

# Compute IDF values for each term: idf = log10(N / (1 + df)).
# The +1 smoothing avoids a division by zero for df = 0.
idf_results = {
    term: math.log10(N / (1 + df_value))
    for term, df_value in df_results.items()
}

# Part 5: TF*IDF Calculation

- This code calculates TF-IDF scores for each term in each document.
- It is computed by multiplying the term frequency (TF) by the inverse document frequency (IDF) of that term.
- The output is a nested dictionary structured as {doc_id: {term: tf-idf}}.

In [8]:
def compute_tf_idf(tf_results, idf_results):
    """Multiply each document's TF values with the matching IDF values.

    Args:
        tf_results: ``{docid: {term: tf}}``.
        idf_results: ``{term: idf}``.

    Returns:
        ``{docid: {term: tf * idf}}``. Every docid of ``tf_results`` is kept;
        terms that have no IDF value are left out.
    """
    return {
        docid: {
            term: tf * idf_results[term]
            for term, tf in terms.items()
            if term in idf_results
        }
        for docid, terms in tf_results.items()
    }

# Compute TF-IDF values for all documents
# (result: {docid: {term: tf * idf}}; terms without an IDF value are left out)
tf_idf_results = compute_tf_idf(tf_results, idf_results)

# Part 6: Determine top 1 term per document

- This function extracts the top-1 most important term per document based on the previously calculated TF-IDF scores.
- For each document, it selects the term with the highest TF-IDF value and stores it as the representative or most significant term.
- The result is a dictionary mapping each document ID to its highest-scoring (term, tf-idf) pair.

In [9]:
def find_top_term_per_document(tf_idf_results):
    """Pick the highest-scoring term of every document.

    Args:
        tf_idf_results: ``{docid: {term: tf_idf}}``.

    Returns:
        ``{docid: (term, tf_idf)}``. Documents without any scored term are
        omitted; on equal scores the term that comes first in the dict wins.
    """
    return {
        docid: max(scores.items(), key=lambda pair: pair[1])
        for docid, scores in tf_idf_results.items()
        if scores
    }

# Get the top-1 term for each document
# (result: {docid: (term, tf_idf)}; documents without any scored term are omitted)
top_terms_results = find_top_term_per_document(tf_idf_results)

# Part 7: Create .csv file for each period

- This part of the pipeline creates a .csv file that stores the top TF-IDF term for each query ID (qid).
- For each qid, it collects top terms from the relevant documents, selects the highest-scoring term, and combines it with the original query for export.
- The goal is to create an interpretable mapping from query to its most significant expansion term.

In [10]:
# Rows of the final CSV, one per query ID
csv_rows = []

# Process the qrels one query ID (qid) at a time
for qid, group in final_qrels_cleaned.groupby('qid'):
    # Best TF-IDF score seen for each top term among the documents of this query
    term_score_dict = {}
    for docid in group['docid_index']:
        if docid not in top_terms_results:
            continue  # no scored term for this document
        term, score = top_terms_results[docid]
        if term not in term_score_dict or score > term_score_dict[term]:
            term_score_dict[term] = score

    # Highest-scoring term of the query; empty term with score 0.0 if there is none
    if term_score_dict:
        top_term, top_score = max(term_score_dict.items(), key=lambda x: x[1])
    else:
        top_term, top_score = "", 0.0

    # Query text of the group (identical for all of its rows); non-strings become ""
    original_query = group['query'].iloc[0]
    if not isinstance(original_query, str):
        original_query = ""

    csv_rows.append({
        "qid": qid,
        "query": original_query,
        "z_9_term": top_term,
        "z_9_tf_idf": top_score
    })

# Turn the collected rows into a table and export it
df_expanded = pd.DataFrame(csv_rows)
df_expanded.to_csv("top_term.csv", index=False, encoding="utf-8")

# Part 8: Merging Top Terms Across Time Periods

- This part of the code merges multiple CSV files, each containing the top TF-IDF term per query for a specific time period.
- The final output is a unified table where each row represents a query, and for each period, the top-scoring term is included.
- This is useful for tracking how term importance evolves over time for recurring queries.
- This part of the code should be run after the .csv files have been created for each time period.

In [None]:
import glob
from functools import reduce
import pandas as pd

import os

# Load all per-period CSV files from the folder, sorted by filename.
# Files named "mapping_*" are skipped: the query-expansion step reads its mapping file
# from this same folder, and such a file also has z_*_term columns, so a re-run would
# otherwise merge an earlier result into the new one.
dateien = sorted(
    datei for datei in glob.glob("top_term/*.csv")
    if not os.path.basename(datei).startswith("mapping_")
)
alle_dfs = []

# Iterate over each CSV file
for idx, datei in enumerate(dateien):
    df = pd.read_csv(datei)

    # Identify the relevant columns for term and tf-idf score
    term_col = next((col for col in df.columns if col.startswith("z_") and col.endswith("_term")), None)
    tfidf_col = next((col for col in df.columns if col.startswith("z_") and col.endswith("_tf_idf")), None)

    # Skip files that do not contain both required columns
    if not all([term_col, tfidf_col]):
        continue

    # Temporarily rename 'qid' to a unique identifier per file
    df = df.rename(columns={"qid": f"qid_{idx}"})
    df = df[[f"qid_{idx}", "query", term_col, tfidf_col]]  # Keep only necessary columns
    alle_dfs.append(df)

# Fail with a clear message instead of reduce()'s "empty iterable" TypeError
if not alle_dfs:
    raise FileNotFoundError(
        "No CSV file with z_*_term and z_*_tf_idf columns found in 'top_term/'."
    )

# Perform an outer join on the "query" column to merge all periods
merged_df = reduce(lambda left, right: pd.merge(left, right, on="query", how="outer"), alle_dfs)

# Create a unified 'qid' column by taking the first non-null qid across the merged columns
qid_cols = [col for col in merged_df.columns if col.startswith("qid_")]
merged_df["qid"] = merged_df[qid_cols].bfill(axis=1).iloc[:, 0]  # Use backfill to propagate the first available qid

# Convert 'qid' to integer format while preserving NaN values
merged_df["qid"] = merged_df["qid"].astype("Int64")

# Drop the temporary qid_* columns used during merging
merged_df = merged_df.drop(columns=qid_cols)

# Reorder columns: qid, query, then all other columns
cols = ["qid", "query"] + [col for col in merged_df.columns if col not in ["qid", "query"]]
merged_df = merged_df[cols]

# Sort the final DataFrame by qid
merged_df = merged_df.sort_values(by="qid", ascending=True)

# Save the resulting merged DataFrame to a CSV file
merged_df.to_csv("mapping_9z.csv", index=False)

# Part 9: Cleaning the Mapping Table

- The first part eliminates any completely identical rows that may have resulted from merging multiple CSVs.
- Since some queries (`qid`, `query`) may appear multiple times with different TF-IDF scores across time periods, the code calculates the average TF-IDF score per row and keeps only the highest-scoring row per `(qid, query)` pair.
- The cleaned and deduplicated mapping table is saved to new .csv file, sorted by `qid`.
- This step ensures data integrity and prepares the table for further analysis or visualization.

In [None]:
# Read the merged mapping table and discard rows that are identical in every column
df = pd.read_csv("mapping_9z.csv").drop_duplicates()

# All columns holding a TF-IDF score (one per time period)
tfidf_cols = [col for col in df.columns if "tf_idf" in col]

# Rank the rows by their mean TF-IDF score across periods, keep the best row of every
# (qid, query) pair, remove the helper column again and order the result by qid
df = (
    df.assign(tfidf_avg=df[tfidf_cols].mean(axis=1))
    .sort_values("tfidf_avg", ascending=False)
    .drop_duplicates(subset=["qid", "query"])
    .drop(columns="tfidf_avg")
    .sort_values(by="qid", ascending=True)
)

# Write the cleaned, deduplicated table
df.to_csv("mapping_9z_cleaned.csv", index=False)

# Part 10: Query Expansion with Top Terms

- This step enriches each original query with its most relevant terms based on TF-IDF scores across 9 distinct time periods.
- Each query is mapped to its top TF-IDF terms (`z_1` to `z_9`) from 9 temporal snapshots.
- Terms are deduplicated per `qid` by keeping only the instance with the highest TF-IDF score. Then they are sorted and the top 9 are retained.
- For each query, its top terms are appended to the original query text. If no top terms exist, the query remains unchanged.

In [None]:
import pandas as pd

# Load the original queries file (qid and query text).
# keep_default_na=False keeps query texts such as "null", "NA" or "nan" as strings;
# with the default NA parsing they become NaN and the concatenation below fails.
queries_df = pd.read_csv(
    "datasets/LongEval-Web/LongEval Test Collection/queries/2023-08_queries.txt",
    sep="\t", names=["qid", "query"], dtype={"qid": str},
    keep_default_na=False
)

# Load the cleaned TF-IDF file with top terms from 9 time periods
tfidf_df = pd.read_csv("top_term/mapping_9z_cleaned.csv", dtype={"qid": str})

# Best score per term, collected per qid across ALL mapping rows of that qid
# (the mapping table can hold several rows for one qid, one per distinct query text;
# previously a later row simply replaced the terms of an earlier one)
qid_term_scores = {}

# Iterate over each row in the TF-IDF mapping
for _, row in tfidf_df.iterrows():
    qid = row['qid']
    if not isinstance(qid, str):
        continue  # row without a qid cannot be matched to any query

    term_dict = qid_term_scores.setdefault(qid, {})

    # Collect all (term, score) pairs from z_1 to z_9, keeping the highest score per term
    for i in range(1, 10):
        term = row.get(f"z_{i}_term")
        score = row.get(f"z_{i}_tf_idf")

        if isinstance(term, str) and pd.notna(score):
            if term not in term_dict or score > term_dict[term]:
                term_dict[term] = score

# Dictionary to store the top N unique terms per qid
qid_best_terms = {}

for qid, term_dict in qid_term_scores.items():
    # Sort terms by TF-IDF score (descending) and select the top 9 terms
    unique_term_scores = sorted(term_dict.items(), key=lambda x: x[1], reverse=True)
    best_terms = [term for term, _ in unique_term_scores[:9]]

    # Save top terms for this query id
    if best_terms:
        qid_best_terms[qid] = best_terms

# Create new rows with expanded queries
expanded_rows = []

for _, row in queries_df.iterrows():
    qid = row['qid']
    query = row['query']

    # If expansion terms exist, append them to the original query
    if qid in qid_best_terms:
        expanded_query = query + " " + " ".join(qid_best_terms[qid])
    else:
        expanded_query = query  # No expansion available

    expanded_rows.append({"qid": qid, "query": expanded_query})

# Save the expanded queries to a .txt file; d = distinct; 9z = 9 time periods; 9t = 9 terms
with open("expanded_queries_23-08_d_9z_9t.txt", "w", encoding="utf-8") as f:
    for row in expanded_rows:
        f.write(f"{row['qid']}\t{row['query']}\n")


# Part 11: Retrieval and Evaluation using BM25 

- This final step assesses how effective the query expansion (via temporal top terms) was in improving retrieval performance on a standard test collection.


In [None]:
import yaml
import pyterrier as pt
import os
import pandas as pd

# Initialize the PyTerrier Java virtual machine if it's not already running
if not pt.java.started():
    pt.java.init()

# Define index path and load the index
index_path = os.path.abspath("datasets/LongEval-Web/index/longeval-web-fr-2023-08-pyterrier")
index = pt.IndexFactory.of(index_path)

# Load configuration from YAML
# NOTE(review): yaml.load with FullLoader is only appropriate for a trusted local file;
# consider yaml.safe_load. `config` is not used in the rest of this cell.
BASE_PATH = "datasets/LongEval-Web"
with open(BASE_PATH + "/metadata.yml", "r") as yamlfile:
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)

# Dataset identifiers (used to build the name of the run file)
dataset = "longeval-web"
language = "fr"
sub_collection = "2023-08"

# Load expanded queries from the specified file
topics_path = os.path.join("expended_queries/expanded_queries_23-08_d_9z_9t.txt")
topics = pd.read_csv(topics_path, sep="\t", names=["qid", "query"])
topics["qid"] = topics["qid"].astype(str)

# Clean queries by removing problematic characters.
# regex=False makes every replacement a literal one on all pandas versions; when the
# pattern is treated as a regular expression, "*", "?", "(", ")" and "+" are not valid
# patterns on their own.
for char in "'*/:?)(+":
    topics["query"] = topics["query"].str.replace(char, "", regex=False)

# Remove known spam queries by qid
spam = ["59769", "6060", "75200", "74351", "67599", "74238", "74207", "75100", "58130"]
topics = topics[~topics["qid"].isin(spam)]

# Instantiate BM25 retriever
BM25 = pt.terrier.Retriever(index, wmodel="BM25", verbose=True)

# Apply BM25 to get ranking results for the topics
run = BM25.transform(topics)

# Save the run file in TREC format (.gz compressed)
# (the closing quote of the file name was missing, which made the cell a SyntaxError)
pt.io.write_results(
    run,
    f"{dataset}-{language}-{sub_collection}-BM25-expanded.gz"
)