In [1]:
import os
import sys
import time
import random
import re
import subprocess
from difflib import SequenceMatcher
from urllib.parse import urljoin
import importlib
import requests
from bs4 import BeautifulSoup
import bibtexparser
import unicodedata

In [2]:
#!/usr/bin/env python3
"""
update_references.py

Read references.bib in the current folder, try to obtain a BibTeX entry
from Google Scholar for each entry (fallback to Crossref if Scholar fails),
and overwrite references.bib keeping the original entry if no match is found.

Prints which conversions worked.

Usage: run this cell / run the script in the same folder as references.bib.
"""

# Ensure required packages are installed
def ensure_packages(mapping):
    """Install every package in `mapping` that cannot currently be imported.

    Args:
        mapping: Dict of ``{import_name: pip_name}``. The key is the module
            name passed to ``importlib.import_module``; the value is the
            distribution name handed to ``pip install``.

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
    """
    missing = []
    for import_name, pip_name in mapping.items():
        try:
            importlib.import_module(import_name)
        except ImportError:
            missing.append(pip_name)

    if not missing:
        return

    # Use the running interpreter's pip so packages land in this environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])
    # Modules installed after interpreter start-up may be invisible to the
    # import system until its finder caches are cleared.
    importlib.invalidate_caches()

# Executed at module top level: checks the third-party modules this script
# uses and pip-installs any that are missing.
# Keys are import names, values are the corresponding pip package names.
ensure_packages({
    "requests": "requests",
    "bs4": "beautifulsoup4",
    "bibtexparser": "bibtexparser"
})


# HTTP headers passed to every requests.get() call in this file
# (Google Scholar, Crossref and doi.org). The User-Agent is a desktop
# Chrome browser string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/115.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}


def clean_title(t):
    """Normalise a title into a canonical form for fuzzy comparison.

    The result is lower-case ASCII letters and digits separated by single
    spaces: BibTeX braces are removed, accented characters are folded to
    their base letter, and every other character becomes a word separator.

    Args:
        t: Title string; ``None`` or an empty string is accepted.

    Returns:
        The normalised title, or ``""`` when `t` is falsy.
    """
    if not t:
        return ""
    t = re.sub(r'[\{\}]', '', t)  # remove bib braces
    # NFKD splits "ö" into "o" + a combining diaeresis. The combining marks
    # must be dropped here; otherwise the punctuation filter below turns
    # them into spaces and splits the word ("schro dinger").
    decomposed = unicodedata.normalize('NFKD', t)
    t = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    t = t.lower()
    t = re.sub(r'[^a-z0-9\s]', ' ', t)
    return re.sub(r'\s+', ' ', t).strip()


def _scholar_bib_anchor(node):
    """Return the first BibTeX-looking ``<a href=...>`` below `node`, or None."""
    anchors = node.find_all("a", href=True)
    # Prefer a link whose target is the scholar.bib endpoint itself.
    for a in anchors:
        if "scholar.bib" in a["href"]:
            return a
    # Otherwise accept a link that is merely labelled as BibTeX.
    for a in anchors:
        label = a.get_text().lower()
        if "bibtex" in label or "scholar.bib" in label:
            return a
    return None


def _nearest_heading_text(anchor, max_levels=6):
    """Return the text of the first ``<h3>`` found in up to `max_levels` ancestors of `anchor`."""
    node = anchor
    for _ in range(max_levels):
        node = node.parent
        if node is None:
            break
        heading = node.find("h3")
        if heading is not None:
            return heading.get_text(" ", strip=True)
    return ""


def search_google_scholar(title, timeout=15, max_candidates=5, min_score=0.55):
    """Try to find a BibTeX record on Google Scholar for `title`.

    The result page is scanned for per-result BibTeX links; the link whose
    result title is most similar to `title` is downloaded.

    Args:
        title: Title to search for.
        timeout: Timeout in seconds for each HTTP request.
        max_candidates: Maximum number of search results to inspect.
        min_score: Minimum ``SequenceMatcher`` ratio (0..1) between the
            cleaned titles for a result to be accepted. Without a floor, any
            loosely related hit would overwrite the reference.

    Returns:
        ``(bibtex_text, reason)`` on success, ``(None, reason)`` otherwise.
        `reason` is a short human-readable description of the outcome.
    """
    wanted = clean_title(title)
    if not wanted:
        # An empty query would compare equal (ratio 1.0) to any result whose
        # title also cleans to "", so refuse to search at all.
        return None, "empty title"

    url = "https://scholar.google.com/scholar"
    try:
        resp = requests.get(url, params={"q": title}, headers=HEADERS, timeout=timeout)
    except requests.RequestException as e:
        return None, f"scholar HTTP error: {e}"
    if resp.status_code != 200:
        return None, f"scholar HTTP status {resp.status_code}"

    soup = BeautifulSoup(resp.text, "html.parser")

    best = None  # (anchor, score) of the best candidate seen so far

    # First pass: look for a BibTeX link inside each result container.
    for res in soup.find_all("div", class_="gs_ri")[:max_candidates]:
        anchor = _scholar_bib_anchor(res)
        if anchor is None:
            continue
        heading = res.find("h3", class_="gs_rt")
        if heading is None:
            heading = res.find("h3")
        result_title = heading.get_text(" ", strip=True) if heading is not None else ""
        score = SequenceMatcher(None, clean_title(result_title), wanted).ratio()
        if best is None or score > best[1]:
            best = (anchor, score)

    # Fallback: scan the whole page and pair each link with the nearest
    # enclosing heading. Links with no heading nearby score 0 and are
    # rejected by the threshold rather than paired with an unrelated title.
    if best is None:
        for a in soup.find_all("a", href=True):
            if "scholar.bib" in a["href"] or "bibtex" in a.get_text().lower():
                candidate_title = _nearest_heading_text(a)
                score = SequenceMatcher(None, clean_title(candidate_title), wanted).ratio()
                if best is None or score > best[1]:
                    best = (a, score)

    if best is None:
        return None, "no scholar bibtex link found"

    anchor, score = best
    if score < min_score:
        return None, f"no scholar match above threshold (best score={score:.2f})"

    bib_url = urljoin("https://scholar.google.com", anchor["href"])
    try:
        bib_resp = requests.get(bib_url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as e:
        return None, f"scholar fetch error: {e}"

    # A BibTeX record starts with "@type{"; merely containing "@" would also
    # accept HTML error or consent pages.
    body = bib_resp.text
    if bib_resp.status_code == 200 and body.lstrip("\ufeff \t\r\n").startswith("@"):
        return body, f"scholar (score={score:.2f})"
    return None, f"scholar returned non-bibtex (status {bib_resp.status_code})"


def fetch_bib_crossref(title, timeout=15, rows=5, min_score=0.55):
    """Fallback: use Crossref to find a DOI and request BibTeX via doi.org.

    Args:
        title: Title to search for.
        timeout: Timeout in seconds for each HTTP request.
        rows: Number of Crossref results to request.
        min_score: Minimum ``SequenceMatcher`` ratio (0..1) between the
            cleaned titles for a Crossref item to be accepted.

    Returns:
        ``(bibtex_text, reason)`` on success, ``(None, reason)`` otherwise.
        `reason` is a short human-readable description of the outcome.
    """
    wanted = clean_title(title)
    if not wanted:
        # SequenceMatcher rates "" vs "" as a perfect 1.0, so an empty query
        # would "match" any title-less Crossref item.
        return None, "empty title"

    api = "https://api.crossref.org/works"
    params = {"query.title": title, "rows": rows}
    try:
        r = requests.get(api, params=params, headers=HEADERS, timeout=timeout)
    except requests.RequestException as e:
        return None, f"crossref HTTP error: {e}"
    if r.status_code != 200:
        return None, f"crossref HTTP status {r.status_code}"
    try:
        payload = r.json()
    except ValueError as e:
        return None, f"crossref JSON parse error: {e}"

    # Tolerate unexpected payload shapes instead of raising AttributeError.
    message = payload.get("message") if isinstance(payload, dict) else None
    items = message.get("items") if isinstance(message, dict) else None
    if not isinstance(items, list):
        items = []

    best_doi = None
    best_score = 0.0
    for it in items:
        if not isinstance(it, dict):
            continue
        titles = it.get("title")
        doi = it.get("DOI")
        # Only items with both a title to compare and a DOI to resolve can
        # ever produce a BibTeX record.
        if not doi or not isinstance(titles, list) or not titles:
            continue
        score = SequenceMatcher(None, clean_title(titles[0]), wanted).ratio()
        if score > best_score:
            best_score = score
            best_doi = doi

    if best_doi is None or best_score < min_score:
        return None, "no crossref match"

    doi_url = "https://doi.org/" + best_doi
    headers = HEADERS.copy()
    # Content negotiation: ask the DOI resolver for BibTeX instead of HTML.
    headers["Accept"] = "application/x-bibtex; charset=utf-8"
    try:
        r2 = requests.get(doi_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        return None, f"doi.org fetch error: {e}"

    # A BibTeX record starts with "@type{"; merely containing "@" would also
    # accept an HTML landing page.
    body = r2.text
    if r2.status_code == 200 and body.lstrip("\ufeff \t\r\n").startswith("@"):
        return body, f"crossref doi {best_doi} (score={best_score:.2f})"
    return None, f"doi.org returned non-bibtex (status {r2.status_code})"


def parse_bibtex_string(bibtex_text):
    """Parse `bibtex_text` and return its first entry.

    Returns:
        The first entry produced by ``bibtexparser.loads``, or ``None`` when
        the text holds no entries or parsing raises.
    """
    try:
        entries = bibtexparser.loads(bibtex_text).entries
        first_entry = entries[0] if entries else None
    except Exception:
        # Best effort: any parsing failure is reported as "no entry".
        first_entry = None
    return first_entry


def main():
    """Refresh every entry of ``references.bib`` from online sources.

    For each entry the title is looked up on Google Scholar first and on
    Crossref second. The first source that yields parseable BibTeX replaces
    the entry, keeping the original citation key. Entries with no match, or
    with no title, are kept unchanged. The file is then rewritten and a
    per-entry summary is printed.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    infile = "references.bib"
    if not os.path.exists(infile):
        print("references.bib not found in current folder.")
        return

    with open(infile, "r", encoding="utf-8") as f:
        original_text = f.read()

    def load_database(common_strings):
        # A parser instance is created per attempt so no state is shared.
        # ignore_nonstandard_types=False keeps entry types such as @online,
        # which the default parser drops; since the file is overwritten
        # below, dropping them would lose data.
        parser = bibtexparser.bparser.BibTexParser(
            common_strings=common_strings,
            ignore_nonstandard_types=False,
        )
        return bibtexparser.loads(original_text, parser=parser)

    try:
        orig_db = load_database(common_strings=False)
    except Exception:
        # Retry with the common string macros predefined (presumably for
        # files that use bare macros such as month abbreviations).
        orig_db = load_database(common_strings=True)

    # Lookup order matters: the first source returning parseable BibTeX wins.
    sources = (
        ("scholar", "Google Scholar", search_google_scholar),
        ("crossref", "Crossref", fetch_bib_crossref),
    )

    new_entries = []
    conversions = []  # list of tuples: (orig_key, success_bool, reason)
    for entry in orig_db.entries:
        key = entry.get("ID") or entry.get("key") or "<no-id>"
        # Drop every BibTeX brace (not just an outer pair, which would turn
        # "{A} and {B}" into "A} and {B") and collapse whitespace/newlines.
        title_clean = re.sub(r'[{}]', '', entry.get("title", ""))
        title_clean = re.sub(r'\s+', ' ', title_clean).strip()
        print(f"Processing: {key}  --  {title_clean}")

        if not title_clean:
            # Nothing to search for; an empty query could only match noise.
            new_entries.append(entry)
            conversions.append((key, False, "no title to search for"))
            print("  Kept original entry (no title).")
            continue

        replacement = None
        failures = []
        for tag, label, lookup in sources:
            bib_text, reason = lookup(title_clean)
            parsed = parse_bibtex_string(bib_text) if bib_text else None
            if parsed:
                # preserve original key (ID) to avoid breaking references
                parsed["ID"] = key
                replacement = parsed
                conversions.append((key, True, reason))
                print(f"  Replaced by {label} result ({reason}).")
                break
            if bib_text:
                reason = f"{reason}, but the response was not parseable BibTeX"
            failures.append(f"{tag}: {reason}")

        if replacement is None:
            # every source failed: keep the original entry
            replacement = entry
            details = "; ".join(failures)
            conversions.append((key, False, f"no match ({details})"))
            print("  Kept original entry (no match).")

        new_entries.append(replacement)
        # Randomised pause between entries to stay polite to the services.
        time.sleep(random.uniform(1.2, 2.5))

    # write back to references.bib
    out_db = bibtexparser.bibdatabase.BibDatabase()
    out_db.entries = new_entries
    # Carry over non-entry content so rewriting the file does not discard it.
    out_db.comments = list(orig_db.comments)
    out_db.preambles = list(orig_db.preambles)
    writer = bibtexparser.bwriter.BibTexWriter()
    # prefer stable formatting
    writer.order_entries_by = None
    writer.indent = "  "

    # Write to a sibling temp file and swap it in, so a failure part-way
    # through cannot leave references.bib truncated.
    tmp_path = infile + ".tmp"
    try:
        rendered = writer.write(out_db)
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(rendered)
        os.replace(tmp_path, infile)
    except Exception as e:
        # Top-level boundary: report the problem and leave the original file.
        print(f"Failed to write {infile}: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return
    print("\nWrote updated entries to references.bib")

    # Print summary
    print("\nSummary (converted entries):")
    for key, ok, reason in conversions:
        status = "OK" if ok else "KEPT"
        print(f"  {key}: {status} ({reason})")


# Run the update only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Processing: Dav17  --  Probability and Braiding Statistics in Majorana Nanowires
  Replaced by Crossref result (crossref doi 10.1103/physrevb.95.155451 (score=1.00)).
Processing: jas24  --  Non-Abelian statistics and topological quantum information processing in 1D wire networks
  Replaced by Crossref result (crossref doi 10.1038/nphys1915 (score=1.00)).
Processing: kit00  --  Unpaired Majorana fermions in quantum wires
  Replaced by Crossref result (crossref doi 10.1070/1063-7869/44/10s/s29 (score=1.00)).
Processing: cas18  --  Majorana Braiding Dynamics on Nanowires
  Replaced by Crossref result (crossref doi 10.1103/physrevb.91.174305 (score=0.97)).
Processing: cwj20  --  Search for non-Abelian Majorana braiding statistics in superconductors
  Replaced by Crossref result (crossref doi 10.21468/scipostphyslectnotes.15 (score=1.00)).
Processing: steven21  --  Topological Quantum: Lecture Notes
  Replaced by Crossref result (crossref doi 10.1088/2053-2563/aaf3a3 (score=0.66)).
Processi