# BibTeX Triage for ACM Exports

This notebook:
- Counts BibTeX entries per `.bib` file
- Flags entries missing DOIs
- Deduplicates (by DOI if present, else by title+year)
- Filters likely-relevant candidates by keywords
- Exports a reviewable CSV: `bib_candidates.csv`

In [None]:
from pathlib import Path
import re
import csv

# BibTeX exports to triage, one per venue. Paths are resolved relative to the
# notebook's working directory; edit the names here if your files differ.
BIB_FILES = ["CIKM.bib", "KDD.bib", "RecSys.bib", "TheWebConf.bib", "WSDM.bib"]

# Topic terms: an entry becomes a candidate only when its title or venue
# matches at least one of these (case-insensitive). Adjust them to your
# project framing.
INCLUDE_KEYWORDS = [
    "popularity", "engagement",
    "rating", "ratings",
    "vote", "votes",
    "attention",
    "prediction", "predict",
    "response", "collective",
]

# Veto terms: an entry matching any of these is dropped even if it also
# matches an include keyword. They target obviously off-topic domains
# (sensors, biomedical, etc.).
EXCLUDE_KEYWORDS = [
    "physiological", "thermal",
    "ecg", "eeg",
    "wearable", "biosignal",
    "medical", "diagnosis", "patient",
]

In [None]:
# Header line of an entry, e.g. "@inproceedings{10.1145/123.456,".
# Group 1 captures the citation key (everything up to the trailing comma).
ENTRY_START_RE = re.compile(r"^\s*@\w+\s*{\s*([^,]+)\s*,\s*$", re.IGNORECASE)

# One "name = value" field that fits on a single line.
#   group 1: field name
#   group 2: field value without its outer delimiters
# Two value forms are accepted:
#   * delimited:  title = {Some Title},   /   title = "Some Title",
#   * bare:       year = 2020,            /   month = jan,
# The bare form is valid BibTeX for numbers and macro names. The lookbehinds
# keep the two forms apart, so a field that merely *opens* a brace and
# continues on the next line (e.g. "abstract = {first line") is still not
# matched rather than being captured truncated.
FIELD_RE = re.compile(
    r"^\s*(\w+)\s*=\s*"
    r"[{\"]?"
    r"("
    r"(?<=[{\"]).+?(?=[}\"]\s*,?\s*$)"  # delimited value, shortest match to the final delimiter
    r"|"
    r"(?<![{\"])\w+"  # bare number or macro name
    r")"
    r"[}\"]?\s*,?\s*$",
    re.IGNORECASE,
)

# Entry types that do not describe a publication.
_NON_REFERENCE_TYPES = frozenset({"comment", "preamble", "string"})
# A line break directly followed by an (optionally indented) entry header
# such as "@inproceedings{". Requiring the "@type{" shape prevents a stray
# "@" at the start of a wrapped text line from being taken as a new entry.
_ENTRY_BOUNDARY_RE = re.compile(r"\r?\n(?=[ \t]*@\w+\s*[{(])")
_ENTRY_HEAD_RE = re.compile(r"@(\w+)\s*[{(]")

def split_entries(text: str):
    """
    Split the text of a .bib file into one string per bibliographic entry.

    An entry starts at a line whose first non-blank characters look like
    '@type{' (or '@type('); leading indentation is tolerated. Text before
    the first entry is discarded, as are '@comment', '@string' and
    '@preamble' blocks, so the length of the result is the number of
    publication records in the file.

    Returns a list of stripped entry strings, each starting with '@'.
    An empty or whitespace-only input yields an empty list.
    """
    entries = []
    for chunk in _ENTRY_BOUNDARY_RE.split(text.strip()):
        chunk = chunk.strip()
        head = _ENTRY_HEAD_RE.match(chunk)
        if head and head.group(1).lower() not in _NON_REFERENCE_TYPES:
            entries.append(chunk)
    return entries

def parse_entry(entry_text: str):
    """
    Parse one BibTeX entry into a dict of the fields used for triage.

    This is intentionally lightweight (no external packages): the first
    line is matched against ENTRY_START_RE to obtain the citation key, and
    every following line is matched against FIELD_RE. Lines that do not
    match (closing braces, continuation lines of multi-line values) are
    skipped. Field names are lower-cased; if a field occurs more than once
    the last occurrence wins.

    Returns a dict with the keys:
      key    - citation key, or "" if the header line is not recognised
      title  - value of the 'title' field, or ""
      year   - value of the 'year' field, or ""
      doi    - value of the 'doi' field, or ""
      venue  - 'journal' if non-empty, otherwise 'booktitle', otherwise ""
      raw    - the unmodified entry text

    Empty input returns the same dict shape with all values empty instead
    of raising.
    """
    lines = entry_text.splitlines()

    # "".splitlines() is [], so guard the header lookup.
    header = ENTRY_START_RE.match(lines[0]) if lines else None
    key = header.group(1).strip() if header else ""

    fields = {}
    for line in lines[1:]:
        fm = FIELD_RE.match(line)
        if fm:
            fields[fm.group(1).lower()] = fm.group(2).strip()

    return {
        "key": key,
        "title": fields.get("title", ""),
        "year": fields.get("year", ""),
        "doi": fields.get("doi", ""),
        # Journal articles take precedence over proceedings titles.
        "venue": fields.get("journal", "") or fields.get("booktitle", ""),
        "raw": entry_text,
    }

def has_any_keyword(text: str, keywords):
    """
    Return True if any keyword occurs in text at the start of a word.

    Matching is case-insensitive. A keyword must begin where a word begins,
    but may be followed by more letters, so "predict" matches "Predicting"
    and "rating" matches "re-rating", while "rating" no longer matches
    "Generating" and "vote" no longer matches "devoted". Keywords that
    start with a non-word character are matched as plain substrings.
    Empty keywords are ignored (they would otherwise match every text).

    text may be empty or None, in which case the result is False.
    """
    haystack = (text or "").lower()
    for keyword in keywords:
        needle = keyword.lower()
        if not needle:
            continue
        pattern = re.escape(needle)
        if needle[0].isalnum() or needle[0] == "_":
            # Reject matches that begin in the middle of a longer word.
            pattern = r"(?<!\w)" + pattern
        if re.search(pattern, haystack):
            return True
    return False

In [None]:
# Sanity check: report any configured .bib file that is not on disk.
missing = []
for f in BIB_FILES:
    if not Path(f).exists():
        missing.append(f)

if not missing:
    print("All bib files found.")
else:
    print("Missing files:")
    for f in missing:
        print("  -", f)

In [None]:
# Accumulators; `candidates` and `missing_doi_rows` are consumed by the
# export cells further down.
all_rows = []            # every parsed entry, duplicates included
missing_doi_rows = []    # entries without a usable DOI
candidates = []          # unique entries that pass the keyword filter

doi_seen = set()         # normalised DOIs encountered so far
title_year_seen = set()  # (title, year) keys for entries lacking a DOI

per_file_counts = {}

for fname in BIB_FILES:
    path = Path(fname)
    if not path.exists():
        # The check cell only warns about missing files, so a missing export
        # must not abort the whole run here either.
        print(f"Skipping missing file: {fname}")
        continue

    text = path.read_text(encoding="utf-8", errors="ignore")
    entries = split_entries(text)
    per_file_counts[fname] = len(entries)

    for e in entries:
        item = parse_entry(e)

        # Dedup keys. DOIs are compared case-insensitively and without a
        # resolver prefix, so "https://doi.org/10.1/X" equals "10.1/x".
        doi_norm = item["doi"].strip().lower()
        doi_norm = re.sub(r"^(?:https?://)?(?:dx\.)?doi\.org/", "", doi_norm)
        title_norm = re.sub(r"\s+", " ", item["title"].strip().lower())
        ty_norm = (title_norm, item["year"].strip())

        # Dedup logic: by DOI if present, else by title+year. An entry with
        # neither a DOI nor a parsed title has nothing reliable to compare
        # on and is never flagged as a duplicate.
        is_dup = False
        if doi_norm:
            is_dup = doi_norm in doi_seen
            doi_seen.add(doi_norm)
        elif title_norm:
            is_dup = ty_norm in title_year_seen
            title_year_seen.add(ty_norm)

        # Track missing DOI (duplicates are included in this list)
        if not doi_norm:
            missing_doi_rows.append(item)

        # Candidate filter: on-topic, not vetoed, first occurrence only
        blob = f"{item['title']} {item['venue']}"
        include = has_any_keyword(blob, INCLUDE_KEYWORDS)
        exclude = has_any_keyword(blob, EXCLUDE_KEYWORDS)

        if include and not exclude and not is_dup:
            candidates.append(item)

        all_rows.append(item)

print("Counts per file:")
for k, v in per_file_counts.items():
    print(f"  {k}: {v}")

print(f"\nTotal entries scanned (including duplicates across files): {len(all_rows)}")
print(f"Entries missing DOI: {len(missing_doi_rows)}")
print(f"Candidate count (post-dedup): {len(candidates)}")

In [None]:
# Export the candidates for manual review, ordered by year and then title.
out_csv = Path("bib_candidates.csv")
candidate_columns = ["year", "title", "venue", "doi", "key"]

with out_csv.open("w", newline="", encoding="utf-8") as f:
    # extrasaction="ignore" drops keys outside the column list (e.g. "raw");
    # missing keys are written as "" by DictWriter's default restval.
    w = csv.DictWriter(f, fieldnames=candidate_columns, extrasaction="ignore")
    w.writeheader()
    w.writerows(sorted(candidates, key=lambda x: (x["year"], x["title"])))

print("Wrote:", out_csv.resolve())

In [None]:
# Preview at most the first 25 candidates, one pipe-separated line each
for c in candidates[:25]:
    print(" | ".join(map(str, (c["year"], c["title"], c["venue"], c["doi"]))))

In [None]:
# Export the entries that lack a DOI, ordered by year and then title.
out_missing = Path("bib_missing_doi.csv")
missing_columns = ["year", "title", "venue", "key"]

with out_missing.open("w", newline="", encoding="utf-8") as f:
    # Keys outside the column list (e.g. "doi", "raw") are ignored on write.
    w = csv.DictWriter(f, fieldnames=missing_columns, extrasaction="ignore")
    w.writeheader()
    w.writerows(sorted(missing_doi_rows, key=lambda x: (x["year"], x["title"])))

print("Wrote:", out_missing.resolve())