In [1]:
import json
import os
from urllib.parse import quote_plus

import pandas as pd
import pyterrier as pt
import yaml
from sqlalchemy import create_engine

In [2]:
# Root directory of the LongEval-Web dataset.
BASE_PATH = "/home/jovyan/work/datasets/LongEval-Web"

# Parse the dataset metadata file into `config`.
# NOTE(review): yaml.FullLoader can build more than plain data types; if
# metadata.yml only holds plain mappings/lists, yaml.safe_load would be the
# stricter choice — confirm before switching.
metadata_path = BASE_PATH + "/metadata.yml"
with open(metadata_path, "r") as metadata_stream:
    config = yaml.load(metadata_stream, Loader=yaml.FullLoader)

In [3]:
!pip install --upgrade typing_extensions
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util

Collecting sentence_transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Using cached torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting scikit-learn (from sentence_transformers)
  Using cached scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting huggingface-hub>=0.20.0 (from sentence_transformers)
  Using cached huggingface_hub-0.31.2-py3-none-any.whl.metadata (13 kB)
Collecting Pillow (from sentence_transformers)
  Using cached pillow-11.2.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (8.9 kB)
Collecting filelock (from huggingface-hub>=0.20.0->sentence_transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd
from sqlalchemy import create_engine
from sentence_transformers import SentenceTransformer, util

# Database connection settings.
# Every value can be overridden through an environment variable; the literals
# are the notebook's previous hard-coded values and act only as fallbacks, so
# the default behaviour is unchanged.
# NOTE(review): the password fallback is still stored in the notebook — set
# LONGEVAL_DB_PASSWORD in the environment and remove the literal before sharing.
DATABASE = os.environ.get("LONGEVAL_DB_NAME", "longeval-web")
USER = os.environ.get("LONGEVAL_DB_USER", "dis18")
HOST = os.environ.get("LONGEVAL_DB_HOST", "db")
PORT = os.environ.get("LONGEVAL_DB_PORT", "5432")
PASSWORD = os.environ.get("LONGEVAL_DB_PASSWORD", "dis182425")

# User and password are URL-escaped so that characters such as '@' or '/'
# cannot corrupt the connection URL.
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(USER)}:{quote_plus(PASSWORD)}@{HOST}:{PORT}/{DATABASE}"
)

# Load the Sentence-BERT model used for the text comparison below.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Step 1: load the qrels file (the query -> document mapping is built from it next).
# The file is space-separated without a header row, so column names are supplied here.
qrels_path = "/home/jovyan/work/datasets/LongEval-Web/release_2025_p1/French/LongEval Train Collection/qrels/2023-02_fr/qrels_processed.txt"
column_names = ["QueryID", "Spalte2", "DocID", "Relevanz"]
qrels1 = pd.read_csv(qrels_path, sep=" ", names=column_names, header=None)

# Build {query id (as string): ["doc<DocID>", ...]} from the qrels rows whose
# relevance is greater than 0.
#
# A single filter + groupby replaces the former loop that scanned the whole
# qrels frame once for each of 999,999 candidate ids. As a side effect, query
# ids outside that arbitrary 1..999,999 range are no longer dropped silently.
relevant_qrels = qrels1[qrels1["Relevanz"] > 0]

# Kept under its old name for any later use: the query ids that actually have
# at least one relevant document, in ascending order.
query_ids = sorted(relevant_qrels["QueryID"].unique().tolist())

# groupby(sort=True) yields the groups in ascending QueryID order, which matches
# the insertion order the previous loop produced; within a group the documents
# keep their order from the qrels file.
query_doc_map = {
    str(query_id): ["doc" + str(doc_id) for doc_id in group["DocID"].tolist()]
    for query_id, group in relevant_qrels.groupby("QueryID", sort=True)
}

# Step 2: compare document texts across two snapshots with Sentence-BERT
def compare_doc_texts_bert(docids, threshold=0.9, sub_collections=("2023-02", "2023-03")):
    """Compare each document's text between two snapshots via embedding similarity.

    Reads ``text_fr`` for the given documents from the ``"Document"`` table
    (through the module-level ``engine``), embeds both versions with the
    module-level ``model`` and computes the cosine similarity per document.

    Parameters
    ----------
    docids : iterable of str
        Document ids to look up. Any iterable is accepted (list, set,
        generator); duplicates are ignored.
    threshold : float, default 0.9
        A document counts as unchanged when its similarity is ``>= threshold``.
    sub_collections : pair of str, default ("2023-02", "2023-03")
        The two ``sub_collection`` values to compare.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame has one row per document that has a non-null text in both
        snapshots, with the columns ``docid``, the two snapshot texts,
        ``similarity`` and ``texts_similar``. The list holds the docids whose
        ``texts_similar`` is True. Both are empty when no document is present
        in both snapshots.

    Raises
    ------
    ValueError
        If ``docids`` is empty, or if the query returns no rows at all.
    """
    # Materialise once: makes the emptiness check reliable for generators and
    # drops duplicate ids while keeping their order.
    docids = list(dict.fromkeys(docids))
    if not docids:
        raise ValueError("Die Liste der docids ist leer.")

    first, second = sub_collections

    # Imported here so the function is self-contained; sqlalchemy is already a
    # dependency of this notebook (create_engine).
    from sqlalchemy import bindparam, text

    # Bound, expanding parameters instead of string interpolation: ids that
    # contain quotes can neither break nor alter the statement.
    query = text(
        """
        SELECT docid, sub_collection, text_fr
        FROM "Document"
        WHERE docid IN :docids
          AND sub_collection IN :sub_collections
        """
    ).bindparams(
        bindparam("docids", expanding=True),
        bindparam("sub_collections", expanding=True),
    )
    df = pd.read_sql(
        query,
        con=engine,
        params={"docids": docids, "sub_collections": [first, second]},
    )

    if df.empty:
        raise ValueError("Keine Daten gefunden.")

    # pivot() requires unique (docid, sub_collection) pairs.
    df = df.drop_duplicates(subset=["docid", "sub_collection"])

    # reindex() guarantees both snapshot columns exist even if one snapshot
    # returned no rows (previously a KeyError); dropna() then keeps only the
    # documents that have a text in both snapshots.
    df_pivot = (
        df.pivot(index="docid", columns="sub_collection", values="text_fr")
        .reindex(columns=[first, second])
        .dropna()
    )

    if df_pivot.empty:
        similarities = []
    else:
        embeddings_first = model.encode(df_pivot[first].tolist(), convert_to_tensor=True)
        embeddings_second = model.encode(df_pivot[second].tolist(), convert_to_tensor=True)
        # Row-wise similarity cos(a[i], b[i]); avoids building the full N x N
        # matrix only to read its diagonal.
        similarities = util.pairwise_cos_sim(embeddings_first, embeddings_second).tolist()

    df_pivot["similarity"] = pd.Series(similarities, index=df_pivot.index, dtype="float64")
    df_pivot["texts_similar"] = df_pivot["similarity"] >= threshold

    matching_docids = df_pivot[df_pivot["texts_similar"]].index.tolist()

    return df_pivot.reset_index(), matching_docids

# Step 3: run the BERT comparison once over every relevant document id
all_docids = set()
for docs in query_doc_map.values():
    all_docids.update(docs)

# Only the list of matching ids (second return value) is needed here.
matching_docids = compare_doc_texts_bert(all_docids, threshold=0.9)[1]
matching_docids_set = set(matching_docids)

# Step 4: per query, keep only the documents whose text stayed similar.
# Queries whose documents were all filtered out remain in the map with an empty list.
filtered_query_doc_map = {}
for qid, docs in query_doc_map.items():
    filtered_query_doc_map[qid] = [docid for docid in docs if docid in matching_docids_set]

# Print the filtered document ids for each query
for qid in filtered_query_doc_map:
    docs = filtered_query_doc_map[qid]
    print(f"Gefilterte DocIDs für Query {qid}: {docs}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Gefilterte DocIDs für Query 3: ['doc1646118']
Gefilterte DocIDs für Query 8: ['doc19754']
Gefilterte DocIDs für Query 12: ['doc22877']
Gefilterte DocIDs für Query 18: ['doc19416', 'doc18882']
Gefilterte DocIDs für Query 19: ['doc8323']
Gefilterte DocIDs für Query 20: ['doc1716', 'doc4311']
Gefilterte DocIDs für Query 23: []
Gefilterte DocIDs für Query 24: ['doc8592']
Gefilterte DocIDs für Query 26: []
Gefilterte DocIDs für Query 27: ['doc2588', 'doc23258']
Gefilterte DocIDs für Query 28: ['doc16256', 'doc13703']
Gefilterte DocIDs für Query 29: ['doc26747']
Gefilterte DocIDs für Query 30: ['doc2874670', 'doc1644643', 'doc955', 'doc23020', 'doc2909443', 'doc1706936']
Gefilterte DocIDs für Query 32: ['doc2904587']
Gefilterte DocIDs für Query 33: ['doc1684120']
Gefilterte DocIDs für Query 41: ['doc21116']
Gefilterte DocIDs für Query 42: ['doc21520']
Gefilterte DocIDs für Query 43: ['doc15389', 'doc1698']
Gefilterte DocIDs für Query 44: ['doc23717']
Gefilterte DocIDs für Query 45: ['doc2403

In [5]:
# Read the BM25 baseline run file into a DataFrame
run_file = "/home/jovyan/work/datasets/LongEval-Web/runs/longeval-web-fr-2023-03-BM25.gz"
run = pt.io.read_results(run_file)

# Show the first five rows as a quick sanity check
run_preview = run.head(5)
print(run_preview)

  qid    docno  rank      score       name
0   3  2214755     0  24.226631  pyterrier
1   3   684186     1  23.345397  pyterrier
2   3   637997     2  23.149936  pyterrier
3   3   430968     3  22.982027  pyterrier
4   3   160081     4  22.781866  pyterrier


In [6]:
# Factor by which the score of a known-relevant, still-similar document is multiplied.
BOOST_FACTOR = 2


def _doc_key(doc_id):
    """Return a comparison key for a document id: trimmed, lower-cased, without a leading "doc"."""
    key = str(doc_id).strip().lower()
    return key[3:] if key.startswith("doc") else key


# Work on a copy of the run and make sure qid and docno are strings
reranked_run = run.copy()
reranked_run['qid'] = reranked_run['qid'].astype(str)
reranked_run['docno'] = reranked_run['docno'].astype(str).str.strip().str.lower()

# Normalise the keys and values of the dictionary as well
filtered_query_doc_map = {
    str(qid): [doc.strip().lower() for doc in docs]
    for qid, docs in filtered_query_doc_map.items()
}

# The run lists bare document numbers (e.g. "2214755") while the map holds
# "doc"-prefixed ids (e.g. "doc1646118"). Compared literally the two never
# match and no score is ever boosted, so both sides are compared on a
# prefix-insensitive key instead. The docno values written to the run are
# not affected by this.
relevant_keys_by_qid = {
    qid: {_doc_key(doc) for doc in docs}
    for qid, docs in filtered_query_doc_map.items()
}
run_doc_keys = reranked_run['docno'].map(_doc_key)
boost_mask = [
    doc_key in relevant_keys_by_qid.get(qid, ())
    for qid, doc_key in zip(reranked_run['qid'], run_doc_keys)
]

# Apply the boost; every row is boosted at most once
reranked_run.loc[boost_mask, 'score'] = reranked_run.loc[boost_mask, 'score'] * BOOST_FACTOR

# Sort by score within each query and assign new ranks (starting at 1)
reranked_run = reranked_run.sort_values(['qid', 'score'], ascending=[True, False])
reranked_run['rank'] = reranked_run.groupby('qid').cumcount() + 1

# Add run metadata
reranked_run['iter'] = 0
reranked_run['name'] = "CIR-longeval-web-fr-2023-03-BM25"

# Columns in the desired output order
reranked_run = reranked_run[['qid', 'iter', 'docno', 'rank', 'score', 'name']]

# Preview of the first rows
print(reranked_run.head())

         qid  iter    docno  rank      score                              name
315404  1000     0    12254     1  40.173644  CIR-longeval-web-fr-2023-03-BM25
315405  1000     0  1641568     2  40.001489  CIR-longeval-web-fr-2023-03-BM25
315406  1000     0  3376776     3  38.737122  CIR-longeval-web-fr-2023-03-BM25
315407  1000     0  3391193     4  38.190770  CIR-longeval-web-fr-2023-03-BM25
315408  1000     0  3384961     5  38.071227  CIR-longeval-web-fr-2023-03-BM25


In [7]:
# Write the re-ranked run as a gzip-compressed, tab-separated file
# without a header row and without the DataFrame index.
output_path = "/home/jovyan/work/datasets/LongEval-Web/index/Gruppe_JMFT/2023-03/reranked_run_bert.gz"
reranked_run.to_csv(
    output_path,
    sep='\t',
    header=False,
    index=False,
    compression='gzip',
)