In [None]:
%load_ext autoreload
%autoreload 2

from utils.process_data import PubmedQueries

# Instantiate PubmedQueries with its defaults and run its main() entry point.
# NOTE(review): PubmedQueries lives in utils.process_data; what main() reads
# and writes is not visible from this cell — confirm against that module.
PubmedQueries().main()

# Disabled call, kept as-is. NOTE(review): process_TAR is not imported in this
# cell, so it would need an import before being re-enabled.
# process_TAR()

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd

# Convert the raw sysrev dump to the shared query schema: tag every row with
# its origin and rename title/query to nl_query/bool_query.
df = (
    pd.read_json('data/sysrev.jsonl', lines=True)
    .assign(source='sysrev')
    .rename(columns={'title': 'nl_query', 'query': 'bool_query'})
)

# One JSON object per line, mirroring the input layout.
df.to_json('data/sysrev_conv.jsonl', orient="records", lines=True)

In [None]:
import json
import pandas as pd
from pathlib import Path

# Directory whose immediate sub-directories combine_for_model() treats as one
# model each (it iterates BASE_DIR.iterdir() and uses the folder name as the
# model name).
BASE_DIR = Path("data") / "result_n_runs" / "seed_collection_reproduce"

def _read_generations(prompt_dirs, model_name):
    """Parse each prompt's ``generation_output/step_0.jsonl`` into flat records."""
    records = []
    for prompt_dir in prompt_dirs:
        step_path = prompt_dir / "generation_output" / "step_0.jsonl"
        if not step_path.exists():
            continue
        # Context manager closes the handle; UTF-8 matches the encoding used
        # when the combined file is written.
        with step_path.open(encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank / trailing empty lines in the JSONL.
                    continue
                obj = json.loads(line)
                obj["prompt_id"] = prompt_dir.name
                # The generated query is the second message of the "user" prompts.
                obj["generated_query"] = obj["existing_prompts"]["user"][1]["content"]
                obj["model"] = model_name
                del obj["existing_prompts"]
                records.append(obj)
    return records


def _read_metrics(prompt_dirs):
    """Collect each prompt's ``final_trec_result/results.rel`` into one frame."""
    names = ["id", "accuracy", "f1", "f3", "recall"]
    frames = []
    for prompt_dir in prompt_dirs:
        rel_path = prompt_dir / "final_trec_result" / "results.rel"
        if not rel_path.exists():
            continue
        frame = pd.read_csv(rel_path, sep=r"\s+", header=None, names=names)
        frames.append(frame.assign(prompt_id=prompt_dir.name))
    if not frames:
        # pd.concat raises ValueError on an empty sequence; return an empty
        # frame that still carries the merge keys so a left merge works.
        return pd.DataFrame(columns=[*names, "prompt_id"])
    metrics = pd.concat(frames, ignore_index=True)
    metrics["id"] = metrics["id"].astype(str)
    return metrics


def combine_for_model(out_base: Path, base_dir: "Path | None" = None):
    """Merge generation outputs and metric results of every model into one JSONL.

    Each sub-directory of ``base_dir`` is treated as one model. Inside it,
    every directory whose name starts with ``q`` (searched recursively) is a
    prompt directory that may hold ``generation_output/step_0.jsonl`` and
    ``final_trec_result/results.rel``. Generation records are left-merged with
    the metrics on ``(prompt_id, id)`` and written to
    ``<out_base>/combined_outputs.jsonl``.

    Args:
        out_base: Directory that receives ``combined_outputs.jsonl``; created
            if it does not exist.
        base_dir: Root directory holding one folder per model. Defaults to the
            module-level ``BASE_DIR``.

    Model folders whose name contains ``"Wang"`` are skipped, as are models
    without prompt directories or without any generation record. When nothing
    is collected an empty output file is written.
    """
    root = BASE_DIR if base_dir is None else Path(base_dir)
    out_base.mkdir(parents=True, exist_ok=True)
    out = out_base / "combined_outputs"

    per_model = []
    # Sorted so the row order of the output is reproducible across runs.
    for md in sorted(root.iterdir()):
        # Test the folder name only, so a parent path containing "Wang"
        # cannot silently exclude every model.
        if not md.is_dir() or "Wang" in md.name:
            continue
        print(f"Processing model: {md.name}")

        qs = sorted(d for d in md.rglob("q*") if d.is_dir())
        if not qs:
            continue

        gen = _read_generations(qs, md.name)
        if not gen:
            # No generation output for this model: nothing to merge.
            continue

        gdf = pd.DataFrame(gen).astype({"id": str})
        mdf = _read_metrics(qs)
        per_model.append(gdf.merge(mdf, on=["prompt_id", "id"], how="left"))

    # Write combined JSONL
    cols = ["id", "prompt_id", "accuracy", "f1", "f3", "recall", "model"]
    with out.with_suffix(".jsonl").open("w", encoding="utf-8") as f:
        if per_model:
            # Single concat instead of re-copying the growing frame per model.
            combined = pd.concat(per_model, ignore_index=True)
            ordered = cols + [c for c in combined.columns if c not in cols]
            # NOTE: rows without matching metrics carry NaN, which json.dumps
            # emits as the non-standard ``NaN`` token (unchanged behaviour).
            for rec in combined[ordered].to_dict(orient="records"):
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")


# Run the aggregation; combine_for_model() writes combined_outputs.jsonl
# inside this directory (creating the directory if needed).
OUTPUT_DIR = Path("data")
combine_for_model(OUTPUT_DIR)