In [1]:
import yaml
import os
import pyterrier as pt
import pandas as pd
import json
from sqlalchemy import create_engine

In [2]:
# Root directory of the LongEval-Web dataset inside the container.
BASE_PATH = "/home/jovyan/work/datasets/LongEval-Web"

# Parse the dataset metadata into `config`.
# NOTE(review): yaml.FullLoader can construct more than plain data types;
# if metadata.yml only holds plain scalars/lists/dicts, yaml.safe_load would
# be the safer choice -- confirm before switching.
metadata_path = BASE_PATH + "/metadata.yml"
with open(metadata_path, mode="r") as metadata_file:
    config = yaml.load(metadata_file, Loader=yaml.FullLoader)

In [3]:
!pip install --upgrade typing_extensions
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd
from sqlalchemy import create_engine
from sentence_transformers import SentenceTransformer, util

# Configure the database connection.
# Every setting can be overridden through an environment variable so that
# credentials do not have to live in the notebook. The literals are only
# fallbacks that keep the notebook runnable in the original environment.
# NOTE(review): the fallback password is still committed in this file --
# rotate it and drop the default once LONGEVAL_DB_PASSWORD is provisioned.
DATABASE = os.environ.get("LONGEVAL_DB_NAME", "longeval-web")
USER = os.environ.get("LONGEVAL_DB_USER", "dis18")
HOST = os.environ.get("LONGEVAL_DB_HOST", "db")
PORT = os.environ.get("LONGEVAL_DB_PORT", "5432")
PASSWORD = os.environ.get("LONGEVAL_DB_PASSWORD", "dis182425")

engine = create_engine(f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}")

# Load the Sentence-BERT model used for the text comparison.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Step 1: load the qrels file (space separated, no header row).
column_names = ["QueryID", "Spalte2", "DocID", "Relevanz"]
qrels_path = "/home/jovyan/work/datasets/LongEval-Web/release_2025_p1/French/LongEval Train Collection/qrels/2023-02_fr/qrels_processed.txt"
qrels1 = pd.read_csv(qrels_path, sep=" ", names=column_names, header=None)

# Build the mapping "query id (str) -> list of relevant doc ids ('doc<id>')".
# Only judgements with a relevance above 0 count as relevant.
#
# A single groupby replaces the former loop over 999,999 candidate query IDs,
# each of which triggered a full boolean scan of the qrels frame. Groups are
# visited in ascending QueryID order, so the insertion order of the mapping is
# the same as before. IDs outside the old 1..999999 window are no longer
# dropped implicitly.
relevant_qrels = qrels1[qrels1["Relevanz"] > 0]

query_doc_map = {
    str(query_id): ["doc" + str(doc_id) for doc_id in doc_ids.tolist()]  # keys stored as strings
    for query_id, doc_ids in relevant_qrels.groupby("QueryID", sort=True)["DocID"]
}

# Query IDs that have at least one relevant document, in ascending order.
query_ids = relevant_qrels["QueryID"].drop_duplicates().sort_values().tolist()

# Function: Sentence-BERT similarity between two snapshots of a document
def compare_doc_texts_bert(docids):
    """Score how much each document's text changed between two snapshots.

    Reads ``text_fr`` for the given document IDs from the "Document" table for
    the sub-collections '2023-02' and '2023-03', keeps the documents that have
    a text in both of them, encodes both texts with the module-level ``model``
    and stores the cosine similarity of each pair.

    Uses the module-level names ``engine``, ``model``, ``util`` and ``pd``.

    Args:
        docids: Collection of document IDs as stored in the ``docid`` column.

    Returns:
        DataFrame indexed by ``docid`` with the columns '2023-02', '2023-03'
        (the two texts) and 'similarity' (cosine similarity, clamped to the
        range [-1.0, 1.0]).

    Raises:
        ValueError: If ``docids`` is empty, if the query returns no rows, or
            if no document has a text in both sub-collections.
    """
    sub_collections = ["2023-02", "2023-03"]

    if not docids:
        raise ValueError("Die Liste der docids ist leer.")

    # The IDs are spliced into the statement as SQL string literals, so any
    # embedded single quote is doubled to keep the statement well-formed.
    # NOTE(review): bound parameters would be the cleaner solution here.
    quoted_docids = ",".join(
        "'" + str(docid).replace("'", "''") + "'" for docid in docids
    )
    query = f"""
        SELECT docid, sub_collection, text_fr
        FROM "Document"
        WHERE docid IN ({quoted_docids})
          AND sub_collection IN ('2023-02', '2023-03')
    """
    df = pd.read_sql(query, con=engine)

    # pivot() below requires unique (docid, sub_collection) pairs.
    if df.duplicated(subset=["docid", "sub_collection"]).any():
        print("Warnung: Doppelte Einträge gefunden. Sie werden entfernt.")
        df = df.drop_duplicates(subset=["docid", "sub_collection"], keep="first")

    if df.empty:
        raise ValueError("Keine Daten für die angegebenen docids gefunden.")

    # reindex() guarantees that both snapshot columns exist even when one
    # sub-collection is completely absent from the result (formerly a
    # KeyError); dropna() then keeps only documents present in both.
    df_pivot = (
        df.pivot(index="docid", columns="sub_collection", values="text_fr")
        .reindex(columns=sub_collections)
        .dropna()
    )

    if df_pivot.empty:
        raise ValueError(
            "Kein Dokument besitzt Text in beiden Sub-Collections ('2023-02' und '2023-03')."
        )

    embeddings_old = model.encode(df_pivot["2023-02"].tolist(), convert_to_tensor=True)
    embeddings_new = model.encode(df_pivot["2023-03"].tolist(), convert_to_tensor=True)

    # Only the similarity of row i with row i is needed, so compute it
    # pairwise instead of building the full N x N matrix and reading its
    # diagonal.
    similarities = util.pairwise_cos_sim(embeddings_old, embeddings_new).tolist()

    # Floating-point rounding can push a cosine slightly outside [-1, 1];
    # clamp so that identical texts cannot score above 1.0.
    df_pivot["similarity"] = [max(-1.0, min(1.0, score)) for score in similarities]

    return df_pivot

# Step 2: score every relevant document once, regardless of how many queries
# reference it (the set removes repeated IDs).
all_docids = {docid for docs in query_doc_map.values() for docid in docs}
bert_df = compare_doc_texts_bert(all_docids)

# Step 3: build mutually exclusive similarity buckets.
# The similarities are floating-point cosine values, so identical texts can
# score marginally below or above 1.0 (e.g. 0.9999999 or 1.0000001). An exact
# `== 1.0` test drops such documents from the "100%" bucket, and values above
# 1.0 match no bucket at all. Everything within SIMILARITY_TOLERANCE of 1.0
# (or above) is therefore treated as identical.
SIMILARITY_TOLERANCE = 1e-6

similarity = bert_df["similarity"]
is_identical = similarity >= 1.0 - SIMILARITY_TOLERANCE


def _docids_between(lower, upper):
    """Return the doc IDs with lower <= similarity < upper that are not 'identical'."""
    in_range = (similarity >= lower) & (similarity < upper) & ~is_identical
    return bert_df[in_range].index.tolist()


exclusive_buckets = {
    "100%": bert_df[is_identical].index.tolist(),
    "95-99%": _docids_between(0.95, 1.0),
    "90-94%": _docids_between(0.90, 0.95),
    "85-89%": _docids_between(0.85, 0.90),
    "80-84%": _docids_between(0.80, 0.85),
}

# Step 4: for every bucket, restrict each query's relevant documents to the
# documents that fall into that bucket. Every query keeps an entry per bucket,
# possibly with an empty list.
bucket_docid_sets = {
    bucket_label: set(bucket_docids)
    for bucket_label, bucket_docids in exclusive_buckets.items()
}

filtered_query_doc_maps = {
    bucket_label: {
        query_id: [docid for docid in docids if docid in bucket_docid_set]
        for query_id, docids in query_doc_map.items()
    }
    for bucket_label, bucket_docid_set in bucket_docid_sets.items()
}

# Step 5: print the result for a manual sanity check.
for level in filtered_query_doc_maps:
    print(f"\n=== Ergebnisse für Bereich {level} ===")
    for qid, docs in filtered_query_doc_maps[level].items():
        print(f"Gefilterte DocIDs für Query {qid}: {docs}")


Warnung: Doppelte Einträge gefunden. Sie werden entfernt.

=== Ergebnisse für Bereich 100% ===
Gefilterte DocIDs für Query 3: []
Gefilterte DocIDs für Query 8: []
Gefilterte DocIDs für Query 12: []
Gefilterte DocIDs für Query 18: []
Gefilterte DocIDs für Query 19: ['doc8323']
Gefilterte DocIDs für Query 20: []
Gefilterte DocIDs für Query 23: []
Gefilterte DocIDs für Query 24: []
Gefilterte DocIDs für Query 26: []
Gefilterte DocIDs für Query 27: []
Gefilterte DocIDs für Query 28: []
Gefilterte DocIDs für Query 29: ['doc26747']
Gefilterte DocIDs für Query 30: ['doc2909443']
Gefilterte DocIDs für Query 32: []
Gefilterte DocIDs für Query 33: []
Gefilterte DocIDs für Query 41: []
Gefilterte DocIDs für Query 42: []
Gefilterte DocIDs für Query 43: []
Gefilterte DocIDs für Query 44: []
Gefilterte DocIDs für Query 45: []
Gefilterte DocIDs für Query 46: []
Gefilterte DocIDs für Query 49: []
Gefilterte DocIDs für Query 50: []
Gefilterte DocIDs für Query 51: []
Gefilterte DocIDs für Query 53: ['do

In [5]:
# Read the run file (results for the 2023-03 French collection, per the file name)
# into a DataFrame via PyTerrier's result reader.
run_file = "/home/jovyan/work/datasets/LongEval-Web/runs/longeval-web-fr-2023-03-BM25.gz"
run = pt.io.read_results(run_file)


# Quick look at the first rows to verify the columns were parsed as expected.
print(run.head())

  qid    docno  rank      score       name
0   3  2214755     0  24.226631  pyterrier
1   3   684186     1  23.345397  pyterrier
2   3   637997     2  23.149936  pyterrier
3   3   430968     3  22.982027  pyterrier
4   3   160081     4  22.781866  pyterrier


In [6]:
# Boost factor per similarity bucket -- adjust freely.
# The keys must match the bucket labels of `filtered_query_doc_maps` exactly:
# a level without an entry here falls back to a factor of 1.0, i.e. no boost.
# The former "80-89%" key matched neither "85-89%" nor "80-84%", so those two
# buckets were silently left unboosted.
boost_factors = {
    "100%": 2.0,
    "95-99%": 1.75,
    "90-94%": 1.5,
    "85-89%": 1.25,
    "80-84%": 1.25,
}

# Work on a copy of the run with string-typed, normalised keys.
reranked_run = run.copy()
reranked_run['qid'] = reranked_run['qid'].astype(str)
reranked_run['docno'] = reranked_run['docno'].astype(str).str.strip().str.lower()

# The bucket maps carry IDs of the form "doc<id>", while the run lists bare
# "<id>" docnos. Without aligning the two notations the later isin() lookup
# never matches and no document gets boosted. The prefix is only stripped when
# the run itself does not use it, so a run that already uses "doc<id>" keeps
# working unchanged.
run_uses_doc_prefix = bool(reranked_run['docno'].str.startswith("doc").any())


def _to_run_docno(doc):
    """Normalise a bucket-map doc ID to the notation used in the run's docno column."""
    normalized = str(doc).strip().lower()
    if not run_uses_doc_prefix and normalized.startswith("doc"):
        normalized = normalized[len("doc"):]
    return normalized


# Normalised filter maps: level -> query id (str) -> list of run-style docnos.
normalized_filtered_maps = {
    level: {
        str(qid): [_to_run_docno(doc) for doc in docs]
        for qid, docs in query_map.items()
    }
    for level, query_map in filtered_query_doc_maps.items()
}

# Start from the unmodified scores; boosted rows are overwritten below.
reranked_run['new_score'] = reranked_run['score']

# One sub-frame per query ID so each lookup only scans that query's rows.
qid_groups = {group_key: frame for group_key, frame in reranked_run.groupby('qid')}

# Apply the boost bucket by bucket and query by query.
for bucket, docs_by_query in normalized_filtered_maps.items():
    # Buckets without a configured factor are left unchanged (factor 1.0).
    factor = boost_factors.get(bucket, 1.0)

    for qid_key, bucket_docs in docs_by_query.items():
        query_rows = qid_groups.get(qid_key)
        if query_rows is None:
            # The query does not occur in the run.
            continue

        hit_index = query_rows.index[query_rows['docno'].isin(bucket_docs)]
        if hit_index.empty:
            continue

        # Always scale the original score, never an already boosted one.
        boosted_scores = reranked_run.loc[hit_index, 'score'] * factor
        reranked_run.loc[hit_index, 'new_score'] = boosted_scores

# Replace the score with the boosted score, drop the helper column and order
# each query's documents by descending score.
reranked_run = (
    reranked_run
    .assign(score=reranked_run['new_score'])
    .drop(columns=['new_score'])
    .sort_values(['qid', 'score'], ascending=[True, False])
)

# Ranks restart at 1 for every query.
reranked_run['rank'] = reranked_run.groupby('qid').cumcount() + 1

# Metadata columns of the run file.
reranked_run['iter'] = 0
reranked_run['name'] = "CIR-longeval-web-fr-2023-03-BM25"

# Final column order of the run file.
output_columns = ['qid', 'iter', 'docno', 'rank', 'score', 'name']
reranked_run = reranked_run[output_columns]

# Preview
print(reranked_run.head())

         qid  iter    docno  rank      score                              name
315404  1000     0    12254     1  40.173644  CIR-longeval-web-fr-2023-03-BM25
315405  1000     0  1641568     2  40.001489  CIR-longeval-web-fr-2023-03-BM25
315406  1000     0  3376776     3  38.737122  CIR-longeval-web-fr-2023-03-BM25
315407  1000     0  3391193     4  38.190770  CIR-longeval-web-fr-2023-03-BM25
315408  1000     0  3384961     5  38.071227  CIR-longeval-web-fr-2023-03-BM25


In [7]:
# Write the re-ranked run as a gzip-compressed, tab-separated file without a
# header row or index column (TREC run layout).
reranked_run.to_csv(
    "/home/jovyan/work/datasets/LongEval-Web/index/Gruppe_JMFT/2023-03/reranked_run_bert_lvl.gz",
    sep='\t',
    header=False,
    index=False,
    compression='gzip',
)