In [21]:
# ============================================================
# 018_seed_corpus_2023plus_discovery
# ============================================================
#
# Overview
# ----------------
# This notebook performs broad discovery of research papers published from 2023 onward
# related to venture capital, limited partners, government-backed VC, and entrepreneurship policy.
# It is designed as a weekly, mostly automated seed corpus builder for a PhD-style research agent.
#
# The goal is to explore the landscape widely, normalize metadata across multiple sources,
# estimate open-access PDF availability, and produce a prioritized candidate list
# for downstream ingestion, citation analysis, and research question (RQ) refinement.
#
# This notebook intentionally favors high recall over precision at the discovery stage.
# Later notebooks and human-in-the-loop review are responsible for filtering,
# refinement, and deeper analysis.
#
#
# Inputs / Outputs
# ----------------
# Inputs:
# - Configuration of target years, themes, and keyword sets
#   (VC / LP / government VC / entrepreneurship policy)
# - Search sources (e.g., OpenAlex, arXiv, DOI-to-OA resolution, optional PubMed)
#
# Outputs:
# - A normalized candidate paper list with the following fields:
#   title, venue, publication year, authors, abstract, DOI, source URLs,
#   open-access likelihood, PDF fetch hints, and priority scores
# - Review-friendly exports (Top-N CSV / Markdown) explicitly designed
#   for quick human inspection and feedback into RQ and keyword tuning
# - Artifacts saved as CSV / Parquet for reuse in later notebooks
# - Run-level metadata and summary metrics for reproducibility and monitoring
#
#
# Structure
# ----------------
# Cell 00: Notebook purpose, success criteria, and execution assumptions
# Cell 01: Imports and environment / version setup
# Cell 02: Global configuration (years, themes, scoring weights, output paths)
# Cell 03: Search source configuration and enable/disable switches
# Cell 04: Query generation across themes and publication years
# Cell 05: OpenAlex search and pagination
# Cell 06: arXiv search and metadata extraction
# Cell 07: DOI-to-open-access resolution (private addon interface)
# Cell 08: Source-wise execution and raw result collection
# Cell 09: Normalization into a unified paper schema
# Cell 10: Deduplication across sources (DOI / title-based)
# Cell 11: Heuristic estimation of free PDF availability
# Cell 12: Relevance and priority scoring for RQ-driven research
# Cell 13: Candidate list export for downstream ingestion
# Cell 14: Lightweight review view for top-ranked papers (human-in-the-loop checkpoint)
# Cell 15: Run metrics, logs, and metadata persistence
#
#
# Notes
# ----------------
# - This notebook prioritizes recall over precision and is intended to be rerun periodically.
# - Downstream notebooks handle PDF fetching, Drive / Notion ingestion,
#   and citation network expansion.
# - Private addons are abstracted behind interfaces to keep this notebook reproducible and shareable.
# - This notebook does NOT attempt full-text analysis or long-term PDF storage;
#   those concerns are handled explicitly downstream.
# - All outputs are designed to support human-in-the-loop review
#   within a daily ~2 hour research workflow.


In [2]:
# ------------------------------------------------------------
# Cell 01: Imports and environment / version setup
# ------------------------------------------------------------

# Standard library
import os
import sys
import json
import time
import math
import uuid
import logging
from datetime import datetime
from typing import List, Dict, Optional, Tuple

# Third-party libraries
import requests
import pandas as pd

import tqdm as tqdm_module
from tqdm.auto import tqdm

# Retry / robustness utilities
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type
)

# Optional: lightweight schema validation
try:
    from pydantic import BaseModel, Field
    import pydantic
    PYDANTIC_AVAILABLE = True
except ImportError:
    PYDANTIC_AVAILABLE = False

# ------------------------------------------------------------
# Global display and logging configuration
# ------------------------------------------------------------
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 120)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)

logger = logging.getLogger(__name__)

# ------------------------------------------------------------
# Environment sanity checks
# ------------------------------------------------------------
logger.info("Python version: %s", sys.version.replace("\n", " "))
logger.info("Execution time (UTC): %s", datetime.utcnow().isoformat())

# Basic dependency checks (module-level versions)
REQUIRED_LIBRARIES = {
    "requests": requests.__version__,
    "pandas": pd.__version__,
    "tqdm": getattr(tqdm_module, "__version__", "unknown"),
}

for lib, ver in REQUIRED_LIBRARIES.items():
    logger.info("Library loaded: %s==%s", lib, ver)

if PYDANTIC_AVAILABLE:
    logger.info("Library loaded: pydantic==%s", pydantic.__version__)
else:
    logger.warning("pydantic is not available; schema validation will be skipped.")

# ------------------------------------------------------------
# Randomness control (best-effort)
# ------------------------------------------------------------
# Note: API-driven discovery is not fully deterministic, but
# we fix local randomness for scoring and sampling where applicable.
import random
import numpy as np

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

logger.info("Random seed fixed at %d", RANDOM_SEED)

# ------------------------------------------------------------
# Runtime notes
# ------------------------------------------------------------
# - Network calls are rate-limited and retried in later cells.
# - Private addons (e.g., DOI-to-OA resolution) are imported lazily
#   to keep this notebook shareable.
# - All artifacts generated by this notebook are intended to be
#   consumed by downstream ingestion and citation-analysis notebooks.


2026-01-12 08:53:59,526 | INFO | Python version: 3.11.13 (main, Jun  5 2025, 08:14:07) [Clang 14.0.6 ]
2026-01-12 08:53:59,528 | INFO | Execution time (UTC): 2026-01-11T23:53:59.528038
2026-01-12 08:53:59,528 | INFO | Library loaded: requests==2.32.4
2026-01-12 08:53:59,529 | INFO | Library loaded: pandas==2.3.3
2026-01-12 08:53:59,530 | INFO | Library loaded: tqdm==4.67.1
2026-01-12 08:53:59,531 | INFO | Library loaded: pydantic==2.12.5
2026-01-12 08:53:59,534 | INFO | Random seed fixed at 42


In [3]:
# ------------------------------------------------------------
# Cell 02: Global configuration (years, themes, sources, outputs)
# ------------------------------------------------------------
# This cell defines all configurable parameters for discovery:
# - time window (e.g., 2023+)
# - keyword / theme sets
# - enabled sources
# - output artifact paths
# - scoring weights and thresholds
#
# The intent is to keep discovery logic stable while allowing easy tuning
# of scope and relevance over time (weekly operations).

from pathlib import Path

# -----------------------------
# Time window
# -----------------------------
YEAR_FROM = 2023
YEAR_TO = datetime.utcnow().year  # discovery up to current year

# -----------------------------
# Themes / keyword sets
# -----------------------------
# Note: keep these relatively broad for recall; downstream steps can filter harder.
THEMES: Dict[str, List[str]] = {
    "venture_capital": [
        "venture capital", "VC", "venture fund", "venture investing",
        "startup financing", "early-stage financing"
    ],
    "limited_partners": [
        "limited partner", "LP", "institutional investor",
        "fund of funds", "pension fund", "endowment"
    ],
    "government_vc": [
        "government venture capital", "public venture capital",
        "state-backed venture capital", "sovereign wealth fund",
        "innovation agency", "development finance institution"
    ],
    "entrepreneurship_policy": [
        "entrepreneurship policy", "innovation policy",
        "startup policy", "industrial policy",
        "regulation", "tax incentive", "public subsidy"
    ],
}

# Optional: negative keywords to reduce obvious noise (use sparingly)
NEGATIVE_KEYWORDS: List[str] = [
    # e.g., "venture philanthropy"  # only if it consistently pollutes results
]

# -----------------------------
# Source configuration (enable / disable)
# -----------------------------
SOURCES = {
    "openalex": True,
    "arxiv": True,
    "doi_oa_resolve": True,   # private addon interface (implemented elsewhere)
    "pubmed": False,          # enable only if needed for policy/health crossovers
}

# -----------------------------
# Operational limits / rate limits
# -----------------------------
LIMITS = {
    "openalex_per_query": 500,     # cap per query to avoid runaway runs
    "arxiv_per_query": 200,
    "max_total_candidates": 5000,  # global safety cap
    "request_timeout_sec": 30,
}

# -----------------------------
# Outputs (artifacts)
# -----------------------------
ARTIFACT_DIR = Path("./artifacts/018_seed_corpus_2023plus_discovery")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

OUT_CANDIDATES_CSV = ARTIFACT_DIR / "candidates.csv"
OUT_CANDIDATES_PARQUET = ARTIFACT_DIR / "candidates.parquet"
OUT_MINIMAL_PARQUET = ARTIFACT_DIR / "candidates_minimal.parquet"
OUT_RUN_METADATA_JSON = ARTIFACT_DIR / "run_metadata.json"

# -----------------------------
# Scoring weights (tune over time)
# -----------------------------
SCORING = {
    # RQ relevance signals
    "w_title_match": 3.0,
    "w_abstract_match": 1.5,

    # PDF availability heuristics
    "w_pdf_likelihood": 2.0,

    # Importance signals
    "w_citations": 0.5,      # cited_by_count (when available)
    "w_venue": 0.5,          # venue whitelist boosts (optional)
}

# Priority bucket thresholds (after computing total score)
PRIORITY_THRESHOLDS = {
    "A": 12.0,
    "B": 7.0,
    "C": 0.0,
}

# -----------------------------
# Venue boosts (optional; start minimal and expand)
# -----------------------------
VENUE_BOOST = {
    # Examples (fill in only if you have strong priors):
    # "research policy": 2.0,
    # "journal of business venturing": 2.0,
}

# -----------------------------
# Helpers for run metadata
# -----------------------------
RUN_CONTEXT = {
    "notebook": "018_seed_corpus_2023plus_discovery",
    "run_utc": datetime.utcnow().isoformat(),
    "year_from": YEAR_FROM,
    "year_to": YEAR_TO,
    "sources": SOURCES,
    "limits": LIMITS,
    "scoring": SCORING,
}

logger.info("Config loaded: YEAR_FROM=%s YEAR_TO=%s", YEAR_FROM, YEAR_TO)
logger.info("Sources enabled: %s", {k: v for k, v in SOURCES.items() if v})
logger.info("Artifact dir: %s", str(ARTIFACT_DIR.resolve()))


2026-01-12 08:54:40,287 | INFO | Config loaded: YEAR_FROM=2023 YEAR_TO=2026
2026-01-12 08:54:40,288 | INFO | Sources enabled: {'openalex': True, 'arxiv': True, 'doi_oa_resolve': True}
2026-01-12 08:54:40,289 | INFO | Artifact dir: /Users/yuetoya/Desktop/researchOS100-private/notebooks/artifacts/018_seed_corpus_2023plus_discovery


In [4]:
# ------------------------------------------------------------
# Cell 03: Search source configuration and enable/disable switches
# ------------------------------------------------------------
# This cell defines source-specific settings (endpoints, paging strategy,
# and field selection) and builds a single "active sources" registry.
#
# Design principles:
# - Keep provider-specific logic isolated behind thin adapters
# - Make it easy to turn sources on/off without editing downstream cells
# - Centralize rate-limit and retry defaults per source

from dataclasses import dataclass

# -----------------------------
# Source spec (provider config)
# -----------------------------
@dataclass(frozen=True)
class SourceSpec:
    name: str
    enabled: bool
    base_url: str
    paging: str                 # e.g., "cursor", "page", "start"
    per_page: int
    timeout_sec: int
    max_results_per_query: int
    user_agent: str

DEFAULT_USER_AGENT = (
    "researchOS/seed-corpus-discovery (contact: you@example.com) "
    "requests/{requests_ver}".format(requests_ver=requests.__version__)
)

SOURCE_SPECS: Dict[str, SourceSpec] = {
    "openalex": SourceSpec(
        name="openalex",
        enabled=bool(SOURCES.get("openalex")),
        base_url="https://api.openalex.org/works",
        paging="cursor",
        per_page=200,  # OpenAlex supports up to 200 per page
        timeout_sec=LIMITS["request_timeout_sec"],
        max_results_per_query=LIMITS["openalex_per_query"],
        user_agent=DEFAULT_USER_AGENT,
    ),
    "arxiv": SourceSpec(
        name="arxiv",
        enabled=bool(SOURCES.get("arxiv")),
        base_url="http://export.arxiv.org/api/query",
        paging="start",
        per_page=100,  # typical safe chunk size
        timeout_sec=LIMITS["request_timeout_sec"],
        max_results_per_query=LIMITS["arxiv_per_query"],
        user_agent=DEFAULT_USER_AGENT,
    ),
    "doi_oa_resolve": SourceSpec(
        name="doi_oa_resolve",
        enabled=bool(SOURCES.get("doi_oa_resolve")),
        base_url="(private_addon)",  # interface-only here; implemented elsewhere
        paging="n/a",
        per_page=0,
        timeout_sec=LIMITS["request_timeout_sec"],
        max_results_per_query=0,
        user_agent=DEFAULT_USER_AGENT,
    ),
    "pubmed": SourceSpec(
        name="pubmed",
        enabled=bool(SOURCES.get("pubmed")),
        base_url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/",
        paging="(esearch/retstart)",
        per_page=200,
        timeout_sec=LIMITS["request_timeout_sec"],
        max_results_per_query=500,
        user_agent=DEFAULT_USER_AGENT,
    ),
}

# -----------------------------
# Build active source registry
# -----------------------------
ACTIVE_SOURCES: Dict[str, SourceSpec] = {
    k: v for k, v in SOURCE_SPECS.items() if v.enabled
}

if not ACTIVE_SOURCES:
    raise ValueError("No sources enabled. Please set at least one source True in SOURCES.")

logger.info("Active sources: %s", list(ACTIVE_SOURCES.keys()))

# -----------------------------
# HTTP session defaults
# -----------------------------
# We use a single requests.Session to benefit from connection pooling.
SESSION = requests.Session()
SESSION.headers.update(
    {
        "User-Agent": DEFAULT_USER_AGENT,
        "Accept": "application/json, text/plain, */*",
    }
)

# -----------------------------
# Source-specific notes / switches
# -----------------------------
# OpenAlex:
# - Prefer filter=from_publication_date / to_publication_date
# - Use cursor pagination for large result sets
#
# arXiv:
# - Atom feed; paging via start/max_results
# - Queries are best kept broad; we rely on downstream scoring for relevance
#
# DOI OA resolve (private addon):
# - Interface-only: accepts DOI and returns best OA landing/PDF URL when possible
# - Downstream cells should treat it as an optional enrichment step

# Persist source config into run metadata (for auditing)
RUN_CONTEXT["active_sources"] = list(ACTIVE_SOURCES.keys())
RUN_CONTEXT["source_specs"] = {
    k: {
        "base_url": v.base_url,
        "paging": v.paging,
        "per_page": v.per_page,
        "timeout_sec": v.timeout_sec,
        "max_results_per_query": v.max_results_per_query,
    }
    for k, v in ACTIVE_SOURCES.items()
}


2026-01-12 08:55:53,446 | INFO | Active sources: ['openalex', 'arxiv', 'doi_oa_resolve']


In [5]:
# ------------------------------------------------------------
# Cell 04: Query generation across themes and publication years
# ------------------------------------------------------------
# This cell generates a query plan that can be executed across sources.
# We keep queries intentionally broad (high recall) and rely on downstream
# scoring + deduplication to refine precision.
#
# Outputs:
# - QUERY_PLAN: a list of query jobs with {source, query_string, year_range, theme_tags}
# - Helper functions to build OpenAlex and arXiv query strings consistently

import re
from itertools import product

# -----------------------------
# Text helpers
# -----------------------------
def _normalize_spaces(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def _quote_if_needed(term: str) -> str:
    # Quote multi-word phrases for safer search behavior
    t = term.strip()
    if " " in t and not (t.startswith('"') and t.endswith('"')):
        return f'"{t}"'
    return t

# -----------------------------
# Theme -> OR clause
# -----------------------------
def build_or_clause(terms: List[str]) -> str:
    """
    Convert a list of keywords into an OR clause:
    ["venture capital","VC"] -> ("venture capital" OR VC)
    """
    cleaned = [_quote_if_needed(t) for t in terms if t and t.strip()]
    if not cleaned:
        return ""
    if len(cleaned) == 1:
        return cleaned[0]
    return "(" + " OR ".join(cleaned) + ")"

# -----------------------------
# Source-specific query builders
# -----------------------------
def build_openalex_search_query(theme_clause: str, negative_terms: Optional[List[str]] = None) -> str:
    """
    OpenAlex supports free-text search via the `search` parameter.
    We keep a single string that combines positive and (optional) negative terms.
    """
    q = theme_clause
    if negative_terms:
        neg = " ".join([f'-{_quote_if_needed(t)}' for t in negative_terms if t and t.strip()])
        q = f"{q} {neg}".strip()
    return _normalize_spaces(q)

def build_arxiv_search_query(theme_clause: str, negative_terms: Optional[List[str]] = None) -> str:
    """
    arXiv Atom API uses a lucene-like syntax:
    e.g., (ti:"venture capital" OR abs:"venture capital") AND ...
    For recall, we search title OR abstract.
    """
    # For arXiv, we replicate the theme clause into ti:/abs: fields.
    # We keep it simple: apply to both title and abstract by OR-ing two clauses.
    base = theme_clause

    # Expand phrases in a minimal way for ti:/abs: searching.
    # Example: ("venture capital" OR VC) -> (ti:"venture capital" OR ti:VC OR abs:"venture capital" OR abs:VC)
    # This is heuristic but works reasonably for discovery.
    tokens = re.findall(r'"[^"]+"|\w+|\(|\)|OR', base)
    terms = [t for t in tokens if t not in {"(", ")", "OR"}]

    ti_terms = [f'ti:{t}' for t in terms]
    abs_terms = [f'abs:{t}' for t in terms]

    # If original clause had ORs, we keep OR behavior by joining each group with OR.
    ti_clause = "(" + " OR ".join(ti_terms) + ")"
    abs_clause = "(" + " OR ".join(abs_terms) + ")"

    q = f"({ti_clause} OR {abs_clause})"

    # Negative terms (optional; use sparingly)
    if negative_terms:
        neg = " AND ".join([f"NOT all:{_quote_if_needed(t)}" for t in negative_terms if t and t.strip()])
        q = f"{q} AND {neg}"

    return _normalize_spaces(q)

# -----------------------------
# Year buckets (execution plan)
# -----------------------------
# Rationale:
# - We often want per-year runs for better monitoring and easier backfills.
# - If you prefer a single range query, set YEAR_BUCKETS to [(YEAR_FROM, YEAR_TO)].
YEAR_BUCKETS: List[Tuple[int, int]] = [(y, y) for y in range(YEAR_FROM, YEAR_TO + 1)]

# -----------------------------
# Build query plan
# -----------------------------
QUERY_PLAN: List[Dict[str, object]] = []

for (theme_name, terms), (y0, y1) in product(THEMES.items(), YEAR_BUCKETS):
    clause = build_or_clause(terms)
    if not clause:
        continue

    # OpenAlex job
    if "openalex" in ACTIVE_SOURCES:
        QUERY_PLAN.append(
            {
                "source": "openalex",
                "theme": theme_name,
                "year_from": y0,
                "year_to": y1,
                "query": build_openalex_search_query(clause, NEGATIVE_KEYWORDS),
            }
        )

    # arXiv job
    if "arxiv" in ACTIVE_SOURCES:
        QUERY_PLAN.append(
            {
                "source": "arxiv",
                "theme": theme_name,
                "year_from": y0,
                "year_to": y1,
                "query": build_arxiv_search_query(clause, NEGATIVE_KEYWORDS),
            }
        )

# Global safety cap
if len(QUERY_PLAN) > 2000:
    logger.warning("QUERY_PLAN is very large (%d jobs). Consider narrowing keywords or year buckets.", len(QUERY_PLAN))

logger.info("Built QUERY_PLAN with %d jobs across %d themes and %d year buckets.",
            len(QUERY_PLAN), len(THEMES), len(YEAR_BUCKETS))

# Preview a few jobs
pd.DataFrame(QUERY_PLAN).head(10)


2026-01-12 08:58:46,502 | INFO | Built QUERY_PLAN with 32 jobs across 4 themes and 4 year buckets.


Unnamed: 0,source,theme,year_from,year_to,query
0,openalex,venture_capital,2023,2023,"(""venture capital"" OR VC OR ""venture fund"" OR ..."
1,arxiv,venture_capital,2023,2023,"((ti:""venture capital"" OR ti:VC OR ti:""venture..."
2,openalex,venture_capital,2024,2024,"(""venture capital"" OR VC OR ""venture fund"" OR ..."
3,arxiv,venture_capital,2024,2024,"((ti:""venture capital"" OR ti:VC OR ti:""venture..."
4,openalex,venture_capital,2025,2025,"(""venture capital"" OR VC OR ""venture fund"" OR ..."
5,arxiv,venture_capital,2025,2025,"((ti:""venture capital"" OR ti:VC OR ti:""venture..."
6,openalex,venture_capital,2026,2026,"(""venture capital"" OR VC OR ""venture fund"" OR ..."
7,arxiv,venture_capital,2026,2026,"((ti:""venture capital"" OR ti:VC OR ti:""venture..."
8,openalex,limited_partners,2023,2023,"(""limited partner"" OR LP OR ""institutional inv..."
9,arxiv,limited_partners,2023,2023,"((ti:""limited partner"" OR ti:LP OR ti:""institu..."


In [8]:
# ------------------------------------------------------------
# Cell 05: OpenAlex search and pagination
# ------------------------------------------------------------
# This cell implements OpenAlex retrieval with cursor pagination and retries.
# It converts each query job into a normalized "raw OpenAlex" record list,
# then returns a DataFrame with consistent fields for downstream normalization.
#
# References:
# - OpenAlex Works API: https://docs.openalex.org/api-entities/works
#
# Design choices:
# - Use cursor pagination for stability with large result sets
# - Enforce per-query caps (LIMITS["openalex_per_query"])
# - Keep raw payload fragments minimal (avoid bloating artifacts)
# - Normalize essential metadata (title, year, venue, abstract, DOI, OA flags, URLs)
import re
from requests.exceptions import ConnectionError, Timeout
from urllib.parse import urlencode
from requests.exceptions import HTTPError

OPENALEX_FIELDS = ",".join(
    [
        "id",
        "doi",
        "display_name",
        "publication_year",
        "publication_date",
        "type",
        "primary_location",
        "locations",
        "authorships",
        "cited_by_count",
        "open_access",
        "best_oa_location",
        "abstract_inverted_index",
    ]
)


def openalex_inverted_index_to_abstract(inv_idx: Optional[dict]) -> Optional[str]:
    """
    Convert OpenAlex `abstract_inverted_index` to plain abstract text.
    Returns None if missing.
    """
    if not inv_idx:
        return None
    # inv_idx: {word: [pos1,pos2,...], ...}
    positions = []
    for word, pos_list in inv_idx.items():
        for p in pos_list:
            positions.append((p, word))
    if not positions:
        return None
    positions.sort(key=lambda x: x[0])
    return " ".join([w for _, w in positions])

def _openalex_get(url: str, params: dict) -> dict:
    @retry(
        reraise=True,
        stop=stop_after_attempt(5),
        wait=wait_exponential(multiplier=1, min=1, max=30),
        retry=retry_if_exception_type((ConnectionError, Timeout)),
    )

    def _do():
        resp = SESSION.get(url, params=params, timeout=SOURCE_SPECS["openalex"].timeout_sec)
        if resp.status_code == 400:
            logger.error("OpenAlex 400 response text: %s", resp.text[:2000])
            logger.error("OpenAlex 400 params: %s", params)
        resp.raise_for_status()
        return resp.json()
    return _do()

def sanitize_openalex_search(q: str) -> str:
    """
    Best-effort sanitizer to avoid OpenAlex query_string parse errors.
    - remove outer parentheses
    - remove boolean operators
    - keep phrases but drop problematic characters if needed
    """
    s = q.strip()

    # Remove parentheses that often trigger parse issues
    s = s.replace("(", " ").replace(")", " ")

    # Remove boolean operators (OpenAlex will treat space-separated terms as AND)
    s = re.sub(r"\b(AND|OR|NOT)\b", " ", s, flags=re.IGNORECASE)

    # Collapse spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s

def build_openalex_params(search_query: str, year_from: int, year_to: int, per_page: int) -> dict:
    """
    Build OpenAlex query parameters.
    We use `search` + date filters for publication_date for consistent range selection.
    """
    # OpenAlex supports from_publication_date and to_publication_date (YYYY-MM-DD)
    from_date = f"{year_from}-01-01"
    to_date = f"{year_to}-12-31"
    params = {
        "search": search_query,
        "filter": f"from_publication_date:{from_date},to_publication_date:{to_date}",
        "per-page": per_page,
        "cursor": "*",
        "select": OPENALEX_FIELDS,
    }
    return params

def _extract_venue_name(w: dict) -> Optional[str]:
    """
    OpenAlex deprecated host_venue; use primary_location/locations instead.
    Try primary_location.source.display_name, then fall back to best_oa_location.source,
    then any locations[i].source.display_name.
    """
    def source_name(loc: Optional[dict]) -> Optional[str]:
        if not loc:
            return None
        src = loc.get("source") or {}
        return src.get("display_name")

    primary = w.get("primary_location") or {}
    name = source_name(primary)
    if name:
        return name

    best_oa = w.get("best_oa_location") or {}
    name = source_name(best_oa)
    if name:
        return name

    for loc in (w.get("locations") or []):
        name = source_name(loc)
        if name:
            return name

    return None

def parse_openalex_work(w: dict, theme: str) -> Dict[str, object]:
    """
    Extract a compact raw record from an OpenAlex work payload.
    """
    best_oa = w.get("best_oa_location") or {}
    oa = w.get("open_access") or {}

    venue_name = _extract_venue_name(w)

    primary_loc = w.get("primary_location") or {}
    landing_url = None
    if primary_loc.get("landing_page_url"):
        landing_url = primary_loc.get("landing_page_url")
    elif best_oa.get("landing_page_url"):
        landing_url = best_oa.get("landing_page_url")

    pdf_url = best_oa.get("pdf_url")

    abstract = openalex_inverted_index_to_abstract(w.get("abstract_inverted_index"))

    # Basic authors list (compact)
    authors = []
    for a in (w.get("authorships") or [])[:50]:
        author = (a.get("author") or {}).get("display_name")
        if author:
            authors.append(author)

    return {
        "source": "openalex",
        "theme": theme,
        "openalex_id": w.get("id"),
        "doi": (w.get("doi") or "").replace("https://doi.org/", "") if w.get("doi") else None,
        "title": w.get("display_name"),
        "publication_year": w.get("publication_year"),
        "publication_date": w.get("publication_date"),
        "type": w.get("type"),
        "venue": venue_name,
        "authors": authors,
        "cited_by_count": w.get("cited_by_count"),
        # OA signals
        "is_oa": oa.get("is_oa"),
        "oa_status": oa.get("oa_status"),
        "best_oa_landing_url": best_oa.get("landing_page_url"),
        "best_oa_pdf_url": pdf_url,
        # Useful URLs
        "landing_page_url": landing_url,
        "abstract": abstract,
    }

def run_openalex_query_job(job: Dict[str, object]) -> List[Dict[str, object]]:
    spec = SOURCE_SPECS["openalex"]

    original_q = str(job["query"])
    params = build_openalex_params(
        search_query=original_q,
        year_from=int(job["year_from"]),
        year_to=int(job["year_to"]),
        per_page=spec.per_page,
    )

    results: List[Dict[str, object]] = []
    total_fetched = 0
    cursor = "*"

    while True:
        params["cursor"] = cursor

        try:
            payload = _openalex_get(spec.base_url, params=params)

        except HTTPError as e:
            # 400 fallback: sanitize query and retry once per cursor step
            resp = getattr(e, "response", None)
            if resp is not None and resp.status_code == 400:
                safe_q = sanitize_openalex_search(original_q)
                logger.warning("OpenAlex 400. Falling back to sanitized search: %s", safe_q)

                params["search"] = safe_q
                payload = _openalex_get(spec.base_url, params=params)
            else:
                raise

        works = payload.get("results") or []
        if not works:
            break

        for w in works:
            results.append(parse_openalex_work(w, theme=str(job["theme"])))
            total_fetched += 1
            if total_fetched >= spec.max_results_per_query:
                break

        if total_fetched >= spec.max_results_per_query:
            break

        meta = payload.get("meta") or {}
        next_cursor = meta.get("next_cursor")
        if not next_cursor or next_cursor == cursor:
            break

        cursor = next_cursor

    return results
# -----------------------------
# Execute OpenAlex jobs in QUERY_PLAN
# -----------------------------
raw_openalex_records: List[Dict[str, object]] = []

openalex_jobs = [j for j in QUERY_PLAN if j["source"] == "openalex"]

logger.info("OpenAlex jobs: %d", len(openalex_jobs))

for job in tqdm(openalex_jobs, desc="OpenAlex queries"):
    try:
        recs = run_openalex_query_job(job)
        raw_openalex_records.extend(recs)
    except Exception as e:
        logger.exception("OpenAlex job failed: %s", job)
        # Record a minimal failure row so we can audit gaps later
        raw_openalex_records.append(
            {
                "source": "openalex",
                "theme": job.get("theme"),
                "openalex_id": None,
                "doi": None,
                "title": None,
                "publication_year": job.get("year_from"),
                "publication_date": None,
                "type": None,
                "venue": None,
                "authors": [],
                "cited_by_count": None,
                "is_oa": None,
                "oa_status": None,
                "best_oa_landing_url": None,
                "best_oa_pdf_url": None,
                "landing_page_url": None,
                "abstract": None,
                "error": str(e),
                "failed_job": dict(job),
            }
        )

raw_openalex_df = pd.DataFrame(raw_openalex_records)

logger.info("OpenAlex fetched rows: %d", len(raw_openalex_df))

# Quick preview
raw_openalex_df.head(5)


2026-01-12 09:14:14,327 | INFO | OpenAlex jobs: 16


OpenAlex queries:   0%|          | 0/16 [00:00<?, ?it/s]

2026-01-12 09:15:40,907 | INFO | OpenAlex fetched rows: 6848


Unnamed: 0,source,theme,openalex_id,doi,title,publication_year,publication_date,type,venue,authors,cited_by_count,is_oa,oa_status,best_oa_landing_url,best_oa_pdf_url,landing_page_url,abstract
0,openalex,venture_capital,https://openalex.org/W4320913257,10.1016/j.jcorpfin.2023.102361,Has persistence persisted in private equity? E...,2023,2023-02-15,article,Journal of Corporate Finance,"[Robert S. Harris, Tim Jenkinson, Steven N. Ka...",69,True,hybrid,https://doi.org/10.1016/j.jcorpfin.2023.102361,,https://doi.org/10.1016/j.jcorpfin.2023.102361,This paper presents new evidence on performanc...
1,openalex,venture_capital,https://openalex.org/W4384937699,10.1016/j.eneco.2023.106877,The role of venture capital and governments in...,2023,2023-07-19,article,Energy Economics,"[Matthias van den Heuvel, David Popp]",72,False,closed,,,https://doi.org/10.1016/j.eneco.2023.106877,
2,openalex,venture_capital,https://openalex.org/W4386554473,10.1093/rfs/hhad071,Common Venture Capital Investors and Startup G...,2023,2023-09-08,article,Review of Financial Studies,"[Ofer Eldar, Jillian Grennan]",39,True,hybrid,https://doi.org/10.1093/rfs/hhad071,https://academic.oup.com/rfs/advance-article-p...,https://doi.org/10.1093/rfs/hhad071,Abstract We exploit the staggered introduction...
3,openalex,venture_capital,https://openalex.org/W4321190526,10.1016/j.jclepro.2023.136489,Mapping the significance of green venture capi...,2023,2023-02-17,review,Journal of Cleaner Production,"[Karambir Singh Dhayal, Arun Kumar Giri, Luca ...",71,False,closed,,,https://doi.org/10.1016/j.jclepro.2023.136489,
4,openalex,venture_capital,https://openalex.org/W4220654757,10.1016/j.jbankfin.2022.106443,The Reallocation Effects of COVID-19: Evidence...,2023,2023-01-01,article,BOA (University of Milano-Bicocca),"[Andrea Bellucci, Alexander Borisov, Gianluca ...",48,True,green,https://hdl.handle.net/10281/415817,,https://hdl.handle.net/10281/415817,We examine possible reallocation effects gener...


In [9]:
# ------------------------------------------------------------
# Cell 06: arXiv search and metadata extraction
# ------------------------------------------------------------
# This cell retrieves candidate papers from arXiv using the Atom API.
# We execute the arXiv jobs in QUERY_PLAN, paginate via start/max_results,
# and normalize the essential metadata into a "raw arXiv" DataFrame.
#
# Notes:
# - arXiv results are typically preprints; metadata quality varies by category.
# - arXiv does not provide a perfect "publication year" filter in the query language,
#   so we fetch broadly per theme and filter by entry.published year client-side.
# - We keep this step high-recall and rely on downstream scoring/deduplication.

import feedparser
from urllib.parse import quote_plus

ARXIV_NS = {
    "arxiv": "http://arxiv.org/schemas/atom",
}

def _arxiv_get(params: dict) -> feedparser.FeedParserDict:
    """
    Low-level arXiv request with retry handling.
    Returns parsed Atom feed.
    """
    spec = SOURCE_SPECS["arxiv"]

    @retry(
        reraise=True,
        stop=stop_after_attempt(5),
        wait=wait_exponential(multiplier=1, min=1, max=30),
        retry=retry_if_exception_type((requests.exceptions.RequestException,)),
    )
    def _do():
        resp = SESSION.get(spec.base_url, params=params, timeout=spec.timeout_sec)
        resp.raise_for_status()
        return feedparser.parse(resp.text)

    return _do()

def build_arxiv_params(search_query: str, start: int, max_results: int) -> dict:
    """
    Build arXiv query parameters for the Atom API.
    """
    return {
        "search_query": search_query,
        "start": start,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

def parse_arxiv_entry(entry: feedparser.FeedParserDict, theme: str) -> Dict[str, object]:
    """
    Extract a compact raw record from an arXiv Atom entry.
    """
    arxiv_id = None
    if "id" in entry and entry.id:
        # Example: http://arxiv.org/abs/2301.01234v2
        arxiv_id = entry.id.split("/abs/")[-1] if "/abs/" in entry.id else entry.id

    title = (entry.get("title") or "").replace("\n", " ").strip()
    abstract = (entry.get("summary") or "").replace("\n", " ").strip()

    published = entry.get("published")  # ISO timestamp string
    updated = entry.get("updated")

    authors = []
    for a in entry.get("authors", [])[:50]:
        name = a.get("name")
        if name:
            authors.append(name)

    # Find PDF link
    pdf_url = None
    landing_url = None
    for link in entry.get("links", []):
        href = link.get("href")
        rel = link.get("rel")
        link_type = link.get("type")
        if rel == "alternate":
            landing_url = href
        if link_type == "application/pdf":
            pdf_url = href

    # arXiv categories
    categories = []
    for tag in entry.get("tags", []):
        term = tag.get("term")
        if term:
            categories.append(term)

    # DOI (sometimes present)
    doi = None
    if "arxiv_doi" in entry:
        doi = entry.arxiv_doi
    else:
        # feedparser sometimes stores arxiv:doi in a dict
        doi = entry.get("doi")

    return {
        "source": "arxiv",
        "theme": theme,
        "arxiv_id": arxiv_id,
        "title": title,
        "abstract": abstract,
        "published": published,
        "updated": updated,
        "authors": authors,
        "categories": categories,
        "doi": doi,
        "landing_page_url": landing_url,
        "pdf_url": pdf_url,
    }

def run_arxiv_query_job(job: Dict[str, object]) -> List[Dict[str, object]]:
    """
    Execute a single arXiv query job with start/max_results pagination.
    Filter client-side by YEAR_FROM/YEAR_TO.
    """
    spec = SOURCE_SPECS["arxiv"]
    q = str(job["query"])

    all_records: List[Dict[str, object]] = []
    fetched = 0
    start = 0

    while True:
        params = build_arxiv_params(
            search_query=q,
            start=start,
            max_results=min(spec.per_page, spec.max_results_per_query - fetched),
        )
        feed = _arxiv_get(params)
        entries = feed.get("entries") or []

        if not entries:
            break

        for entry in entries:
            rec = parse_arxiv_entry(entry, theme=str(job["theme"]))

            # Filter by year (published date)
            pub = rec.get("published") or ""
            year = None
            if len(pub) >= 4 and pub[:4].isdigit():
                year = int(pub[:4])

            if year is not None and (year < YEAR_FROM or year > YEAR_TO):
                continue

            rec["publication_year"] = year
            all_records.append(rec)
            fetched += 1

            if fetched >= spec.max_results_per_query:
                break

        if fetched >= spec.max_results_per_query:
            break

        start += spec.per_page

        # Stop if we've paged too far back in time for this job (heuristic)
        # Since we sort by submittedDate desc, once we see older than YEAR_FROM,
        # continued paging is unlikely to be useful.
        last_pub = (all_records[-1].get("publication_year") if all_records else None)
        if last_pub is not None and last_pub < YEAR_FROM:
            break

    return all_records

# -----------------------------
# Execute arXiv jobs in QUERY_PLAN
# -----------------------------
raw_arxiv_records: List[Dict[str, object]] = []

arxiv_jobs = [j for j in QUERY_PLAN if j["source"] == "arxiv"]
logger.info("arXiv jobs: %d", len(arxiv_jobs))

for job in tqdm(arxiv_jobs, desc="arXiv queries"):
    try:
        recs = run_arxiv_query_job(job)
        raw_arxiv_records.extend(recs)
    except Exception as e:
        logger.exception("arXiv job failed: %s", job)
        raw_arxiv_records.append(
            {
                "source": "arxiv",
                "theme": job.get("theme"),
                "arxiv_id": None,
                "title": None,
                "abstract": None,
                "published": None,
                "updated": None,
                "authors": [],
                "categories": [],
                "doi": None,
                "landing_page_url": None,
                "pdf_url": None,
                "publication_year": job.get("year_from"),
                "error": str(e),
                "failed_job": dict(job),
            }
        )

raw_arxiv_df = pd.DataFrame(raw_arxiv_records)
logger.info("arXiv fetched rows: %d", len(raw_arxiv_df))

raw_arxiv_df.head(5)


2026-01-12 09:16:55,682 | INFO | arXiv jobs: 16


arXiv queries:   0%|          | 0/16 [00:00<?, ?it/s]

2026-01-12 09:17:07,401 | INFO | arXiv fetched rows: 2408


Unnamed: 0,source,theme,arxiv_id,title,abstract,published,updated,authors,categories,doi,landing_page_url,pdf_url,publication_year
0,arxiv,venture_capital,2601.03198v1,Empowering Reliable Visual-Centric Instruction...,Evaluating the instruction-following (IF) capa...,2026-01-06T17:23:33Z,2026-01-06T17:23:33Z,"[Weilei He, Feng Ju, Zhiyuan Fan, Rui Min, Min...",[cs.LG],,https://arxiv.org/abs/2601.03198v1,https://arxiv.org/pdf/2601.03198v1,2026
1,arxiv,venture_capital,2601.03155v1,Vaught's Conjecture and Theories of Partial Or...,A complete theory ${\mathcal T}$ of partial or...,2026-01-06T16:31:13Z,2026-01-06T16:31:13Z,[Miloš S. Kurilić],[math.LO],,https://arxiv.org/abs/2601.03155v1,https://arxiv.org/pdf/2601.03155v1,2026
2,arxiv,venture_capital,2601.03031v1,FlexProofs: A Vector Commitment with Flexible ...,"In this paper, we introduce FlexProofs, a new ...",2026-01-06T14:05:16Z,2026-01-06T14:05:16Z,"[Jing Liu, Liang Feng Zhang]","[cs.CR, cs.LO]",,https://arxiv.org/abs/2601.03031v1,https://arxiv.org/pdf/2601.03031v1,2026
3,arxiv,venture_capital,2601.02721v1,Robust Mesh Saliency GT Acquisition in VR via ...,Reliable 3D mesh saliency ground truth (GT) is...,2026-01-06T05:20:12Z,2026-01-06T05:20:12Z,"[Guoquan Zheng, Jie Hao, Huiyu Duan, Yongming ...","[cs.CV, cs.MM]",,https://arxiv.org/abs/2601.02721v1,https://arxiv.org/pdf/2601.02721v1,2026
4,arxiv,venture_capital,2601.02601v1,State of the Quantum Software Engineering Ecos...,We study the current state of the Quantum Soft...,2026-01-05T23:34:51Z,2026-01-05T23:34:51Z,"[Nazanin Siavash, Armin Moin]",[cs.SE],,https://arxiv.org/abs/2601.02601v1,https://arxiv.org/pdf/2601.02601v1,2026


In [10]:
# ------------------------------------------------------------
# Cell 07: DOI-to-open-access resolution (private addon interface)
# ------------------------------------------------------------
# This cell defines an interface (thin wrapper) for resolving a DOI to
# open-access (OA) landing and PDF URLs. The implementation is intentionally
# kept behind a private addon so this notebook remains shareable and reproducible.
#
# Expected behavior:
# - Input: DOI (string, preferably normalized without "https://doi.org/")
# - Output: a dict with best candidate OA landing URL + PDF URL (if available),
#   plus lightweight evidence fields for debugging/auditing.
#
# Downstream usage:
# - Enrich OpenAlex/arXiv candidates with additional OA/PDF hints
# - Improve "free PDF likelihood" scoring and ingestion success rates

from dataclasses import dataclass

@dataclass
class OAResolveResult:
    doi: str
    is_oa: Optional[bool]
    best_landing_url: Optional[str]
    best_pdf_url: Optional[str]
    evidence: Dict[str, object]

def normalize_doi(doi: Optional[str]) -> Optional[str]:
    if not doi:
        return None
    d = doi.strip()
    d = d.replace("https://doi.org/", "").replace("http://doi.org/", "")
    d = d.replace("doi:", "").strip()
    return d or None

def resolve_doi_to_oa(doi: str) -> OAResolveResult:
    """
    Private addon hook. In a private environment, you should replace the body of this
    function with an import from your internal module (or a service call).
    """
    doi = normalize_doi(doi) or ""
    if not doi:
        return OAResolveResult(
            doi="",
            is_oa=None,
            best_landing_url=None,
            best_pdf_url=None,
            evidence={"error": "empty_doi"},
        )

    # ---- PRIVATE ADDON (placeholder) ----
    # Example expected return shape from private implementation:
    # {
    #   "is_oa": True/False,
    #   "best_landing_url": "https://...",
    #   "best_pdf_url": "https://...pdf",
    #   "evidence": {"provider": "...", "oa_status": "...", ...}
    # }
    #
    # Replace with:
    # from private_addons.oa_resolver import resolve
    # payload = resolve(doi)
    # -------------------------------------

    payload = {
        "is_oa": None,
        "best_landing_url": None,
        "best_pdf_url": None,
        "evidence": {"note": "private addon not installed in this environment"},
    }

    return OAResolveResult(
        doi=doi,
        is_oa=payload.get("is_oa"),
        best_landing_url=payload.get("best_landing_url"),
        best_pdf_url=payload.get("best_pdf_url"),
        evidence=payload.get("evidence") or {},
    )

# ------------------------------------------------------------
# Batch helper (optional): resolve a list of DOIs with caching
# ------------------------------------------------------------
_DOI_OA_CACHE: Dict[str, OAResolveResult] = {}

def resolve_dois_to_oa(dois: List[str], sleep_sec: float = 0.0) -> Dict[str, OAResolveResult]:
    """
    Resolve multiple DOIs with in-memory caching to avoid repeated calls.
    """
    out: Dict[str, OAResolveResult] = {}
    for d in tqdm(dois, desc="DOI->OA resolve"):
        nd = normalize_doi(d)
        if not nd:
            continue
        if nd in _DOI_OA_CACHE:
            out[nd] = _DOI_OA_CACHE[nd]
            continue

        res = resolve_doi_to_oa(nd)
        _DOI_OA_CACHE[nd] = res
        out[nd] = res

        if sleep_sec > 0:
            time.sleep(sleep_sec)

    return out

logger.info("DOI->OA resolver interface is ready (private implementation stubbed).")


2026-01-12 09:18:19,187 | INFO | DOI->OA resolver interface is ready (private implementation stubbed).


In [11]:
# ------------------------------------------------------------
# Cell 08: Source-wise execution and raw result collection
# ------------------------------------------------------------
# This cell orchestrates source-wise execution and collects raw results.
# By this point:
# - OpenAlex retrieval (Cell 05) should have produced `raw_openalex_df`
# - arXiv retrieval (Cell 06) should have produced `raw_arxiv_df`
# - DOI->OA resolver interface (Cell 07) is available for optional enrichment
#
# Goals:
# 1) Validate and snapshot raw source outputs
# 2) Optionally enrich OpenAlex rows with DOI->OA hints (private addon)
# 3) Persist raw artifacts for reproducibility and easier backfills
#
# Outputs:
# - raw_openalex_df (possibly enriched)
# - raw_arxiv_df
# - Raw artifact files saved into ARTIFACT_DIR

from pathlib import Path

# -----------------------------
# Basic validations
# -----------------------------
assert "raw_openalex_df" in globals(), "raw_openalex_df not found. Run Cell 05 first."
assert "raw_arxiv_df" in globals(), "raw_arxiv_df not found. Run Cell 06 first."

logger.info("Raw OpenAlex rows: %d", len(raw_openalex_df))
logger.info("Raw arXiv rows: %d", len(raw_arxiv_df))

# -----------------------------
# Optional DOI->OA enrichment (OpenAlex only)
# -----------------------------
ENRICH_DOI_OA = bool(SOURCES.get("doi_oa_resolve", False))

if ENRICH_DOI_OA:
    # Resolve only unique DOIs that appear in OpenAlex results
    doi_list = (
        raw_openalex_df["doi"]
        .dropna()
        .astype(str)
        .map(normalize_doi)
        .dropna()
        .unique()
        .tolist()
    )

    logger.info("Unique DOIs to resolve (OpenAlex): %d", len(doi_list))

    # NOTE: sleep_sec can be used if your resolver/provider enforces rate limits.
    doi_to_oa = resolve_dois_to_oa(doi_list, sleep_sec=0.0)

    # Convert mapping -> DataFrame for join
    doi_oa_df = pd.DataFrame(
        [
            {
                "doi": k,
                "doi_oa_is_oa": v.is_oa,
                "doi_oa_best_landing_url": v.best_landing_url,
                "doi_oa_best_pdf_url": v.best_pdf_url,
                "doi_oa_evidence": v.evidence,
            }
            for k, v in doi_to_oa.items()
        ]
    )

    # Join back
    raw_openalex_df = raw_openalex_df.copy()
    raw_openalex_df["doi_norm"] = raw_openalex_df["doi"].map(normalize_doi)

    raw_openalex_df = raw_openalex_df.merge(
        doi_oa_df,
        how="left",
        left_on="doi_norm",
        right_on="doi",
        suffixes=("", "_resolver"),
    )

    # Clean up join keys
    raw_openalex_df.drop(columns=["doi_resolver"], inplace=True, errors="ignore")

    logger.info("OpenAlex enriched with DOI->OA hints.")
else:
    logger.info("DOI->OA enrichment disabled (SOURCES['doi_oa_resolve']=False).")

# -----------------------------
# Snapshot raw artifacts
# -----------------------------
RAW_DIR = ARTIFACT_DIR / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

openalex_raw_path = RAW_DIR / "openalex_raw.parquet"
arxiv_raw_path = RAW_DIR / "arxiv_raw.parquet"

raw_openalex_df.to_parquet(openalex_raw_path, index=False)
raw_arxiv_df.to_parquet(arxiv_raw_path, index=False)

logger.info("Saved raw OpenAlex parquet: %s", openalex_raw_path)
logger.info("Saved raw arXiv parquet: %s", arxiv_raw_path)

# Optional: lightweight CSV snapshots for quick inspection
openalex_raw_csv = RAW_DIR / "openalex_raw_sample.csv"
arxiv_raw_csv = RAW_DIR / "arxiv_raw_sample.csv"

raw_openalex_df.head(200).to_csv(openalex_raw_csv, index=False)
raw_arxiv_df.head(200).to_csv(arxiv_raw_csv, index=False)

logger.info("Saved raw samples: %s / %s", openalex_raw_csv, arxiv_raw_csv)

# -----------------------------
# Update run metadata and persist
# -----------------------------
RUN_CONTEXT["raw_openalex_rows"] = int(len(raw_openalex_df))
RUN_CONTEXT["raw_arxiv_rows"] = int(len(raw_arxiv_df))
RUN_CONTEXT["raw_artifacts"] = {
    "openalex_raw_parquet": str(openalex_raw_path),
    "arxiv_raw_parquet": str(arxiv_raw_path),
    "openalex_raw_sample_csv": str(openalex_raw_csv),
    "arxiv_raw_sample_csv": str(arxiv_raw_csv),
}

with open(OUT_RUN_METADATA_JSON, "w") as f:
    json.dump(RUN_CONTEXT, f, indent=2)

logger.info("Updated run metadata: %s", OUT_RUN_METADATA_JSON)

# Quick sanity preview
display(raw_openalex_df.head(3))
display(raw_arxiv_df.head(3))


2026-01-12 09:21:02,165 | INFO | Raw OpenAlex rows: 6848
2026-01-12 09:21:02,166 | INFO | Raw arXiv rows: 2408
2026-01-12 09:21:02,184 | INFO | Unique DOIs to resolve (OpenAlex): 6650


DOI->OA resolve:   0%|          | 0/6650 [00:00<?, ?it/s]

2026-01-12 09:21:02,278 | INFO | OpenAlex enriched with DOI->OA hints.
2026-01-12 09:21:02,629 | INFO | Saved raw OpenAlex parquet: artifacts/018_seed_corpus_2023plus_discovery/raw/openalex_raw.parquet
2026-01-12 09:21:02,630 | INFO | Saved raw arXiv parquet: artifacts/018_seed_corpus_2023plus_discovery/raw/arxiv_raw.parquet
2026-01-12 09:21:02,660 | INFO | Saved raw samples: artifacts/018_seed_corpus_2023plus_discovery/raw/openalex_raw_sample.csv / artifacts/018_seed_corpus_2023plus_discovery/raw/arxiv_raw_sample.csv
2026-01-12 09:21:02,661 | INFO | Updated run metadata: artifacts/018_seed_corpus_2023plus_discovery/run_metadata.json


Unnamed: 0,source,theme,openalex_id,doi,title,publication_year,publication_date,type,venue,authors,cited_by_count,is_oa,oa_status,best_oa_landing_url,best_oa_pdf_url,landing_page_url,abstract,doi_norm,doi_oa_is_oa,doi_oa_best_landing_url,doi_oa_best_pdf_url,doi_oa_evidence
0,openalex,venture_capital,https://openalex.org/W4320913257,10.1016/j.jcorpfin.2023.102361,Has persistence persisted in private equity? E...,2023,2023-02-15,article,Journal of Corporate Finance,"[Robert S. Harris, Tim Jenkinson, Steven N. Ka...",69,True,hybrid,https://doi.org/10.1016/j.jcorpfin.2023.102361,,https://doi.org/10.1016/j.jcorpfin.2023.102361,This paper presents new evidence on performanc...,10.1016/j.jcorpfin.2023.102361,,,,{'note': 'private addon not installed in this ...
1,openalex,venture_capital,https://openalex.org/W4384937699,10.1016/j.eneco.2023.106877,The role of venture capital and governments in...,2023,2023-07-19,article,Energy Economics,"[Matthias van den Heuvel, David Popp]",72,False,closed,,,https://doi.org/10.1016/j.eneco.2023.106877,,10.1016/j.eneco.2023.106877,,,,{'note': 'private addon not installed in this ...
2,openalex,venture_capital,https://openalex.org/W4386554473,10.1093/rfs/hhad071,Common Venture Capital Investors and Startup G...,2023,2023-09-08,article,Review of Financial Studies,"[Ofer Eldar, Jillian Grennan]",39,True,hybrid,https://doi.org/10.1093/rfs/hhad071,https://academic.oup.com/rfs/advance-article-p...,https://doi.org/10.1093/rfs/hhad071,Abstract We exploit the staggered introduction...,10.1093/rfs/hhad071,,,,{'note': 'private addon not installed in this ...


Unnamed: 0,source,theme,arxiv_id,title,abstract,published,updated,authors,categories,doi,landing_page_url,pdf_url,publication_year
0,arxiv,venture_capital,2601.03198v1,Empowering Reliable Visual-Centric Instruction...,Evaluating the instruction-following (IF) capa...,2026-01-06T17:23:33Z,2026-01-06T17:23:33Z,"[Weilei He, Feng Ju, Zhiyuan Fan, Rui Min, Min...",[cs.LG],,https://arxiv.org/abs/2601.03198v1,https://arxiv.org/pdf/2601.03198v1,2026
1,arxiv,venture_capital,2601.03155v1,Vaught's Conjecture and Theories of Partial Or...,A complete theory ${\mathcal T}$ of partial or...,2026-01-06T16:31:13Z,2026-01-06T16:31:13Z,[Miloš S. Kurilić],[math.LO],,https://arxiv.org/abs/2601.03155v1,https://arxiv.org/pdf/2601.03155v1,2026
2,arxiv,venture_capital,2601.03031v1,FlexProofs: A Vector Commitment with Flexible ...,"In this paper, we introduce FlexProofs, a new ...",2026-01-06T14:05:16Z,2026-01-06T14:05:16Z,"[Jing Liu, Liang Feng Zhang]","[cs.CR, cs.LO]",,https://arxiv.org/abs/2601.03031v1,https://arxiv.org/pdf/2601.03031v1,2026


In [13]:
# ------------------------------------------------------------
# Cell 09: Normalization into a unified paper schema
# ------------------------------------------------------------
# This cell converts raw source outputs (OpenAlex + arXiv) into a single
# unified "paper schema" DataFrame for downstream deduplication and scoring.
#
# Goals:
# - Align field names and data types across sources
# - Create stable local identifiers (paper_id) for downstream linking
# - Normalize DOIs and URLs
# - Standardize text fields (title/abstract) for matching and ranking
#
# Output:
# - papers_raw_df: unified schema across sources (still pre-dedup)

import hashlib

# -----------------------------
# Text normalization helpers
# -----------------------------
def normalize_title(title: Optional[str]) -> Optional[str]:
    if not title:
        return None
    t = title.lower().strip()
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"[^\w\s]", " ", t)  # drop punctuation
    t = re.sub(r"\s+", " ", t).strip()
    return t or None

def normalize_url(url: Optional[object]) -> Optional[str]:
    if url is None:
        return None
    # pandas NaN (float) / NA 
    try:
        if pd.isna(url):
            return None
    except Exception:
        pass
    if not isinstance(url, str):
        url = str(url)

    u = url.strip()
    if not u:
        return None

    # Best-effort cleanup
    u = u.replace("http://", "https://")
    return u


def stable_hash_key(*parts: str) -> str:
    """
    Create a stable short hash for identifiers.
    """
    s = "||".join([p for p in parts if p is not None])
    return hashlib.sha1(s.encode("utf-8")).hexdigest()[:16]

# -----------------------------
# OpenAlex -> unified schema
# -----------------------------
def openalex_to_unified(df: pd.DataFrame) -> pd.DataFrame:
    out = pd.DataFrame()
    out["source"] = "openalex"
    out["source_id"] = df["openalex_id"]
    out["doi"] = df["doi"].map(normalize_doi)
    out["title"] = df["title"].astype("string")
    out["title_norm"] = df["title"].map(normalize_title)
    out["abstract"] = df["abstract"].astype("string")
    out["publication_year"] = pd.to_numeric(df["publication_year"], errors="coerce").astype("Int64")
    out["venue"] = df["venue"].astype("string")
    out["authors"] = df["authors"]
    out["cited_by_count"] = pd.to_numeric(df["cited_by_count"], errors="coerce").astype("Int64")
    out["theme"] = df["theme"].astype("string")

    # URLs
    out["landing_page_url"] = df["landing_page_url"].map(normalize_url)
    out["best_oa_landing_url"] = df["best_oa_landing_url"].map(normalize_url)
    out["best_oa_pdf_url"] = df["best_oa_pdf_url"].map(normalize_url)

    # OA signals
    out["is_oa"] = df["is_oa"]
    out["oa_status"] = df["oa_status"].astype("string")

    # Optional resolver enrichment
    if "doi_oa_best_pdf_url" in df.columns:
        out["resolver_best_pdf_url"] = df["doi_oa_best_pdf_url"].map(normalize_url)
        out["resolver_best_landing_url"] = df["doi_oa_best_landing_url"].map(normalize_url)
        out["resolver_is_oa"] = df["doi_oa_is_oa"]
    else:
        out["resolver_best_pdf_url"] = None
        out["resolver_best_landing_url"] = None
        out["resolver_is_oa"] = None

    return out

# -----------------------------
# arXiv -> unified schema
# -----------------------------
def arxiv_to_unified(df: pd.DataFrame) -> pd.DataFrame:
    out = pd.DataFrame()
    out["source"] = "arxiv"
    out["source_id"] = df["arxiv_id"]
    out["doi"] = df["doi"].map(normalize_doi)
    out["title"] = df["title"].astype("string")
    out["title_norm"] = df["title"].map(normalize_title)
    out["abstract"] = df["abstract"].astype("string")
    out["publication_year"] = pd.to_numeric(df["publication_year"], errors="coerce").astype("Int64")
    out["venue"] = "arXiv"
    out["authors"] = df["authors"]
    out["cited_by_count"] = pd.Series([pd.NA] * len(df), dtype="Int64")
    out["theme"] = df["theme"].astype("string")

    # URLs
    out["landing_page_url"] = df["landing_page_url"].map(normalize_url)
    out["best_oa_landing_url"] = df["landing_page_url"].map(normalize_url)
    out["best_oa_pdf_url"] = df["pdf_url"].map(normalize_url)

    # OA signals
    out["is_oa"] = True
    out["oa_status"] = "arxiv"

    # Resolver fields (not applicable)
    out["resolver_best_pdf_url"] = None
    out["resolver_best_landing_url"] = None
    out["resolver_is_oa"] = None

    return out

# -----------------------------
# Build unified table
# -----------------------------
openalex_unified = openalex_to_unified(raw_openalex_df)
arxiv_unified = arxiv_to_unified(raw_arxiv_df)

papers_raw_df = pd.concat([openalex_unified, arxiv_unified], ignore_index=True)

# Create stable local identifier
# Prefer DOI; else hash (title_norm + year + first author)
def make_paper_id(row: pd.Series) -> str:
    if pd.notna(row.get("doi")) and str(row["doi"]).strip():
        return f"doi::{row['doi']}"
    title_norm = row.get("title_norm") or ""
    year = str(row.get("publication_year") or "")
    first_author = ""
    authors = row.get("authors")
    if isinstance(authors, list) and authors:
        first_author = str(authors[0])
    hk = stable_hash_key(title_norm, year, first_author)
    return f"hash::{hk}"

papers_raw_df["paper_id"] = papers_raw_df.apply(make_paper_id, axis=1)

# Optional: keep "best PDF hint" consolidated for later steps
papers_raw_df["pdf_url_hint"] = (
    papers_raw_df["resolver_best_pdf_url"]
    .fillna(papers_raw_df["best_oa_pdf_url"])
    .fillna(papers_raw_df["best_oa_landing_url"])
    .fillna(papers_raw_df["landing_page_url"])
)

logger.info("Unified papers_raw_df rows: %d", len(papers_raw_df))
logger.info("Unique paper_id count (pre-dedup): %d", papers_raw_df["paper_id"].nunique())

papers_raw_df.head(5)


2026-01-12 09:25:16,011 | INFO | Unified papers_raw_df rows: 9256
2026-01-12 09:25:16,014 | INFO | Unique paper_id count (pre-dedup): 7325


Unnamed: 0,source,source_id,doi,title,title_norm,abstract,publication_year,venue,authors,cited_by_count,theme,landing_page_url,best_oa_landing_url,best_oa_pdf_url,is_oa,oa_status,resolver_best_pdf_url,resolver_best_landing_url,resolver_is_oa,paper_id,pdf_url_hint
0,,https://openalex.org/W4320913257,10.1016/j.jcorpfin.2023.102361,Has persistence persisted in private equity? E...,has persistence persisted in private equity ev...,This paper presents new evidence on performanc...,2023,Journal of Corporate Finance,"[Robert S. Harris, Tim Jenkinson, Steven N. Ka...",69,venture_capital,https://doi.org/10.1016/j.jcorpfin.2023.102361,https://doi.org/10.1016/j.jcorpfin.2023.102361,,True,hybrid,,,,doi::10.1016/j.jcorpfin.2023.102361,https://doi.org/10.1016/j.jcorpfin.2023.102361
1,,https://openalex.org/W4384937699,10.1016/j.eneco.2023.106877,The role of venture capital and governments in...,the role of venture capital and governments in...,,2023,Energy Economics,"[Matthias van den Heuvel, David Popp]",72,venture_capital,https://doi.org/10.1016/j.eneco.2023.106877,,,False,closed,,,,doi::10.1016/j.eneco.2023.106877,https://doi.org/10.1016/j.eneco.2023.106877
2,,https://openalex.org/W4386554473,10.1093/rfs/hhad071,Common Venture Capital Investors and Startup G...,common venture capital investors and startup g...,Abstract We exploit the staggered introduction...,2023,Review of Financial Studies,"[Ofer Eldar, Jillian Grennan]",39,venture_capital,https://doi.org/10.1093/rfs/hhad071,https://doi.org/10.1093/rfs/hhad071,https://academic.oup.com/rfs/advance-article-p...,True,hybrid,,,,doi::10.1093/rfs/hhad071,https://academic.oup.com/rfs/advance-article-p...
3,,https://openalex.org/W4321190526,10.1016/j.jclepro.2023.136489,Mapping the significance of green venture capi...,mapping the significance of green venture capi...,,2023,Journal of Cleaner Production,"[Karambir Singh Dhayal, Arun Kumar Giri, Luca ...",71,venture_capital,https://doi.org/10.1016/j.jclepro.2023.136489,,,False,closed,,,,doi::10.1016/j.jclepro.2023.136489,https://doi.org/10.1016/j.jclepro.2023.136489
4,,https://openalex.org/W4220654757,10.1016/j.jbankfin.2022.106443,The Reallocation Effects of COVID-19: Evidence...,the reallocation effects of covid 19 evidence ...,We examine possible reallocation effects gener...,2023,BOA (University of Milano-Bicocca),"[Andrea Bellucci, Alexander Borisov, Gianluca ...",48,venture_capital,https://hdl.handle.net/10281/415817,https://hdl.handle.net/10281/415817,,True,green,,,,doi::10.1016/j.jbankfin.2022.106443,https://hdl.handle.net/10281/415817


In [14]:
# ------------------------------------------------------------
# Cell 10: Deduplication across sources (DOI / title-based)
# ------------------------------------------------------------
# This cell deduplicates the unified papers table across sources.
#
# Why dedup?
# - OpenAlex and arXiv often overlap (preprint vs published version)
# - Within OpenAlex, multiple queries/themes can retrieve the same work
# - arXiv queries can also overlap due to broad keyword searches
#
# Strategy (practical, deterministic):
# 1) DOI-based dedup: if DOI exists, treat it as the primary key and keep 1 "best" row
# 2) Title-based dedup: for rows without DOI, use normalized title (+year) as a fallback key
#
# "Best row" heuristic:
# - Prefer OpenAlex over arXiv when DOI is present (richer metadata, citations)
# - Prefer rows with a PDF URL hint
# - Prefer higher cited_by_count (when available)
# - Prefer longer abstract (proxy for completeness)

from typing import Any

# -----------------------------
# Helpers: quality score for picking representative rows
# -----------------------------
def _has_pdf_hint(u: Any) -> int:
    return int(u is not None and isinstance(u, str) and len(u.strip()) > 0)

def _abstract_len(a: Any) -> int:
    if a is None or (isinstance(a, float) and pd.isna(a)):
        return 0
    if not isinstance(a, str):
        a = str(a)
    return len(a.strip())

def _cited_count(x: Any) -> int:
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return 0
    try:
        return int(x)
    except Exception:
        return 0

def row_quality_score(row: pd.Series) -> Tuple[int, int, int, int]:
    """
    Higher is better. We return a tuple so sorting is deterministic.
    Priority order:
    1) Source preference (OpenAlex > arXiv)
    2) Has PDF hint
    3) Cited-by count
    4) Abstract length
    """
    source_pref = 1 if row.get("source") == "openalex" else 0
    pdf_pref = _has_pdf_hint(row.get("pdf_url_hint"))
    cited = _cited_count(row.get("cited_by_count"))
    ablen = _abstract_len(row.get("abstract"))
    return (source_pref, pdf_pref, cited, ablen)

# -----------------------------
# Step 1: DOI-based dedup
# -----------------------------
df = papers_raw_df.copy()

# Normalize DOI again defensively
df["doi_norm"] = df["doi"].map(normalize_doi)

# Prepare a stable key for DOI rows
df["dedup_key_doi"] = df["doi_norm"].apply(lambda d: f"doi::{d}" if d else None)

# Compute quality score components for sorting
df["_source_pref"] = (df["source"] == "openalex").astype(int)
df["_pdf_pref"] = df["pdf_url_hint"].apply(_has_pdf_hint)
df["_cited"] = df["cited_by_count"].apply(_cited_count)
df["_ablen"] = df["abstract"].apply(_abstract_len)

# Sort so the "best" row comes first within each group
df_sorted = df.sort_values(
    by=["_source_pref", "_pdf_pref", "_cited", "_ablen"],
    ascending=[False, False, False, False],
).reset_index(drop=True)

# Split DOI vs non-DOI
df_with_doi = df_sorted[df_sorted["dedup_key_doi"].notna()].copy()
df_no_doi = df_sorted[df_sorted["dedup_key_doi"].isna()].copy()

doi_best = (
    df_with_doi
    .drop_duplicates(subset=["dedup_key_doi"], keep="first")
    .copy()
)

logger.info("DOI rows: %d -> %d (kept best per DOI)", len(df_with_doi), len(doi_best))

# -----------------------------
# Step 2: Title-based dedup for non-DOI rows
# -----------------------------
# Use title_norm + year bucket to reduce false merges across unrelated papers with same title.
df_no_doi["year_bucket"] = df_no_doi["publication_year"].astype("Int64")

# If year is missing, we still dedup by title_norm only (more aggressive)
def _title_key(row: pd.Series) -> str:
    t = row.get("title_norm") or ""
    y = row.get("year_bucket")
    if pd.isna(y):
        return f"title::{t}"
    return f"title::{t}::{int(y)}"

df_no_doi["dedup_key_title"] = df_no_doi.apply(_title_key, axis=1)

title_best = (
    df_no_doi
    .drop_duplicates(subset=["dedup_key_title"], keep="first")
    .copy()
)

logger.info("Non-DOI rows: %d -> %d (kept best per title key)", len(df_no_doi), len(title_best))

# -----------------------------
# Combine results
# -----------------------------
papers_dedup_df = pd.concat([doi_best, title_best], ignore_index=True)

# Clean up helper columns
papers_dedup_df.drop(
    columns=[
        "_source_pref", "_pdf_pref", "_cited", "_ablen",
        "doi_norm", "dedup_key_doi", "year_bucket", "dedup_key_title"
    ],
    inplace=True,
    errors="ignore",
)

logger.info("papers_dedup_df rows: %d", len(papers_dedup_df))
logger.info("paper_id unique after dedup: %d", papers_dedup_df["paper_id"].nunique())

# Quick sanity checks
dupe_doi_count = (papers_raw_df["doi"].map(normalize_doi).dropna().nunique()
                  - papers_dedup_df["doi"].map(normalize_doi).dropna().nunique())
logger.info("Removed duplicate DOI groups (approx): %d", dupe_doi_count)

papers_dedup_df.head(10)


2026-01-12 09:28:06,877 | INFO | DOI rows: 6852 -> 6672 (kept best per DOI)
2026-01-12 09:28:06,905 | INFO | Non-DOI rows: 2404 -> 648 (kept best per title key)
2026-01-12 09:28:06,916 | INFO | papers_dedup_df rows: 7320
2026-01-12 09:28:06,919 | INFO | paper_id unique after dedup: 7320
2026-01-12 09:28:06,930 | INFO | Removed duplicate DOI groups (approx): 0


Unnamed: 0,source,source_id,doi,title,title_norm,abstract,publication_year,venue,authors,cited_by_count,theme,landing_page_url,best_oa_landing_url,best_oa_pdf_url,is_oa,oa_status,resolver_best_pdf_url,resolver_best_landing_url,resolver_is_oa,paper_id,pdf_url_hint
0,,https://openalex.org/W4229933927,10.1073/pnas,Proceedings of the National Academy of Sciences,proceedings of the national academy of sciences,Colon carcinoma is one of the leading causes o...,2024,Warwick Research Archive Portal (University of...,"[Oliver J. Miller, Abdeslam El Harrak, Thomas ...",4072,entrepreneurship_policy,https://doi.org/10.1073/pnas,https://wrap.warwick.ac.uk/id/eprint/120002/8/...,https://wrap.warwick.ac.uk/120002/7/supplement...,True,green,,,,doi::10.1073/pnas,https://wrap.warwick.ac.uk/120002/7/supplement...
1,,https://openalex.org/W4394894573,10.1016/s0140-6736(24)00757-8,"Global incidence, prevalence, years lived with...",global incidence prevalence years lived with d...,,2024,The Lancet,"[Alize J Ferrari, Damian Santomauro, Amirali A...",3368,government_vc,https://doi.org/10.1016/s0140-6736(24)00757-8,https://doi.org/10.1016/s0140-6736(24)00757-8,https://www.thelancet.com/article/S01406736240...,True,hybrid,,,,doi::10.1016/s0140-6736(24)00757-8,https://www.thelancet.com/article/S01406736240...
2,,https://openalex.org/W4313488537,10.1038/s41580-022-00566-8,"Long non-coding RNAs: definitions, functions, ...",long non coding rnas definitions functions cha...,,2023,Nature Reviews Molecular Cell Biology,"[John S. Mattick, Paulo Amaral, Piero Carninci...",1901,entrepreneurship_policy,https://doi.org/10.1038/s41580-022-00566-8,https://doi.org/10.1038/s41580-022-00566-8,https://www.nature.com/articles/s41580-022-005...,True,bronze,,,,doi::10.1038/s41580-022-00566-8,https://www.nature.com/articles/s41580-022-005...
3,,https://openalex.org/W4368358469,10.1093/nar/gkad344,antiSMASH 7.0: new and improved predictions fo...,antismash 7 0 new and improved predictions for...,Abstract Microorganisms produce small bioactiv...,2023,Nucleic Acids Research,"[Kai Blin, Simon J. Shaw, Hannah E. Augustijn,...",1888,entrepreneurship_policy,https://doi.org/10.1093/nar/gkad344,https://doi.org/10.1093/nar/gkad344,https://academic.oup.com/nar/advance-article-p...,True,gold,,,,doi::10.1093/nar/gkad344,https://academic.oup.com/nar/advance-article-p...
4,,https://openalex.org/W4377206081,10.1038/s41392-023-01452-1,Macrophages in immunoregulation and therapeutics,macrophages in immunoregulation and therapeutics,,2023,Signal Transduction and Targeted Therapy,"[Shanze Chen, Abdullah F. U. H. Saeed, Quan Li...",1549,entrepreneurship_policy,https://doi.org/10.1038/s41392-023-01452-1,https://doi.org/10.1038/s41392-023-01452-1,https://www.nature.com/articles/s41392-023-014...,True,gold,,,,doi::10.1038/s41392-023-01452-1,https://www.nature.com/articles/s41392-023-014...
5,,https://openalex.org/W4392343318,10.1038/s41392-024-01757-9,NF-κB in biology and targeted therapy: new ins...,nf κb in biology and targeted therapy new insi...,,2024,Signal Transduction and Targeted Therapy,"[Qing Guo, Yizi Jin, Xinyu Chen, Xiaomin Ye, X...",1233,entrepreneurship_policy,https://doi.org/10.1038/s41392-024-01757-9,https://doi.org/10.1038/s41392-024-01757-9,https://www.nature.com/articles/s41392-024-017...,True,gold,,,,doi::10.1038/s41392-024-01757-9,https://www.nature.com/articles/s41392-024-017...
6,,https://openalex.org/W4400907212,10.4337/9781802208818.00008,The Fourth Industrial Revolution: what it mean...,the fourth industrial revolution what it means...,We stand on the brink of a technological revol...,2024,Edward Elgar Publishing eBooks,[Klaus Schwab],1229,entrepreneurship_policy,https://doi.org/10.4337/9781802208818.00008,,,False,closed,,,,doi::10.4337/9781802208818.00008,https://doi.org/10.4337/9781802208818.00008
7,,https://openalex.org/W4378217890,10.1038/s41392-023-01481-w,"The blood–brain barrier: Structure, regulation...",the blood brain barrier structure regulation a...,,2023,Signal Transduction and Targeted Therapy,"[Di Wu, Qi Chen, Xiaojie Chen, Feng Han, Zhong...",1194,entrepreneurship_policy,https://doi.org/10.1038/s41392-023-01481-w,https://doi.org/10.1038/s41392-023-01481-w,https://www.nature.com/articles/s41392-023-014...,True,gold,,,,doi::10.1038/s41392-023-01481-w,https://www.nature.com/articles/s41392-023-014...
8,,https://openalex.org/W4386151915,10.1093/eurheartj/ehad192,2023 ESC Guidelines for the management of card...,2023 esc guidelines for the management of card...,Guidelines evaluate and summarize available ev...,2023,European Heart Journal,"[Nikolaus Marx, Massimo Federici, Katharina Sc...",1165,entrepreneurship_policy,https://doi.org/10.1093/eurheartj/ehad192,https://doi.org/10.1093/eurheartj/ehad192,https://academic.oup.com/eurheartj/advance-art...,True,hybrid,,,,doi::10.1093/eurheartj/ehad192,https://academic.oup.com/eurheartj/advance-art...
9,,https://openalex.org/W4391878282,10.1038/s41580-024-00703-5,The cell biology of ferroptosis,the cell biology of ferroptosis,,2024,Nature Reviews Molecular Cell Biology,"[Scott J. Dixon, James A. Olzmann]",812,entrepreneurship_policy,https://doi.org/10.1038/s41580-024-00703-5,,,False,closed,,,,doi::10.1038/s41580-024-00703-5,https://doi.org/10.1038/s41580-024-00703-5


In [15]:
# ------------------------------------------------------------
# Cell 11: Heuristic estimation of free PDF availability
# ------------------------------------------------------------
# This cell assigns a best-effort "free PDF likelihood" for each deduplicated paper.
#
# Why heuristics?
# - OpenAlex OA flags are helpful but not sufficient for reliable automated download
# - Some hybrid/bronze articles have a landing page but no direct PDF URL
# - arXiv always provides a PDF URL, but non-arXiv sources vary widely
#
# Output columns (added to `papers_dedup_df`):
# - pdf_candidate_url: best URL to try first for downloading a PDF
# - free_pdf_score: numeric score (higher => more likely downloadable without paywall)
# - free_pdf_label: categorical label ("HIGH" / "MEDIUM" / "LOW")
# - free_pdf_reason: short, human-readable explanation for audit/debugging

from urllib.parse import urlparse

def _is_probably_pdf_url(u: Optional[str]) -> bool:
    if not u or not isinstance(u, str):
        return False
    u = u.lower().strip()
    return (u.endswith(".pdf") or "pdf" in u)

def _domain(u: Optional[str]) -> Optional[str]:
    if not u or not isinstance(u, str):
        return None
    try:
        return urlparse(u).netloc.lower()
    except Exception:
        return None

def choose_pdf_candidate_url(row: pd.Series) -> Optional[str]:
    """
    Choose the best first-try URL for PDF retrieval.
    Priority:
    1) resolver_best_pdf_url (private addon)
    2) best_oa_pdf_url (OpenAlex best OA)
    3) arXiv pdf_url_hint (already a PDF)
    4) best_oa_landing_url (sometimes landing is already a PDF)
    5) landing_page_url (fallback)
    """
    for col in [
        "resolver_best_pdf_url",
        "best_oa_pdf_url",
        "pdf_url_hint",
        "best_oa_landing_url",
        "landing_page_url",
    ]:
        u = row.get(col)
        if isinstance(u, str) and u.strip():
            return u.strip()
    return None

def estimate_free_pdf_score(row: pd.Series) -> Tuple[float, str]:
    """
    Heuristic scoring: returns (score, reason).
    """
    score = 0.0
    reasons = []

    src = row.get("source")
    oa_status = (row.get("oa_status") or "").lower()
    is_oa = row.get("is_oa")

    # Source baseline
    if src == "arxiv":
        score += 10.0
        reasons.append("arXiv source (PDF expected)")
    elif src == "openalex":
        score += 1.0
        reasons.append("OpenAlex source")

    # Direct PDF URL hints
    best_pdf = row.get("best_oa_pdf_url")
    resolver_pdf = row.get("resolver_best_pdf_url")
    hint = row.get("pdf_url_hint")

    if isinstance(resolver_pdf, str) and resolver_pdf.strip():
        score += 6.0
        reasons.append("Resolver returned direct PDF")
    if isinstance(best_pdf, str) and best_pdf.strip():
        score += 5.0
        reasons.append("OpenAlex best_oa_pdf_url present")
    if isinstance(hint, str) and hint.strip() and _is_probably_pdf_url(hint):
        score += 2.0
        reasons.append("PDF-like URL hint")

    # OA flags
    if is_oa is True:
        score += 2.0
        reasons.append("is_oa=True")
    if oa_status in {"gold", "green"}:
        score += 2.0
        reasons.append(f"oa_status={oa_status}")
    elif oa_status in {"hybrid", "bronze"}:
        score += 1.0
        reasons.append(f"oa_status={oa_status}")
    elif oa_status == "closed":
        score -= 2.0
        reasons.append("oa_status=closed")

    # Domain heuristics (optional, lightweight)
    cand = choose_pdf_candidate_url(row)
    dom = _domain(cand)
    if dom:
        if "arxiv.org" in dom:
            score += 2.0
            reasons.append("arxiv.org domain")
        if "doi.org" in dom:
            score -= 0.5
            reasons.append("doi.org is landing, not PDF")

    # Clamp to a sensible range
    score = max(-5.0, min(15.0, score))
    return score, "; ".join(reasons[:6])

def label_from_score(score: float) -> str:
    if score >= 9:
        return "HIGH"
    if score >= 5:
        return "MEDIUM"
    return "LOW"

papers_dedup_df = papers_dedup_df.copy()

papers_dedup_df["pdf_candidate_url"] = papers_dedup_df.apply(choose_pdf_candidate_url, axis=1)
scores_and_reasons = papers_dedup_df.apply(estimate_free_pdf_score, axis=1)

papers_dedup_df["free_pdf_score"] = [x[0] for x in scores_and_reasons]
papers_dedup_df["free_pdf_reason"] = [x[1] for x in scores_and_reasons]
papers_dedup_df["free_pdf_label"] = papers_dedup_df["free_pdf_score"].map(label_from_score)

logger.info("Free PDF label distribution:\n%s", papers_dedup_df["free_pdf_label"].value_counts())

papers_dedup_df[[
    "source", "doi", "title", "publication_year",
    "oa_status", "is_oa",
    "pdf_candidate_url", "free_pdf_score", "free_pdf_label", "free_pdf_reason"
]].head(10)


2026-01-12 09:30:29,714 | INFO | Free PDF label distribution:
free_pdf_label
LOW       3806
HIGH      2958
MEDIUM     556
Name: count, dtype: int64


Unnamed: 0,source,doi,title,publication_year,oa_status,is_oa,pdf_candidate_url,free_pdf_score,free_pdf_label,free_pdf_reason
0,,10.1073/pnas,Proceedings of the National Academy of Sciences,2024,green,True,https://wrap.warwick.ac.uk/120002/7/supplement...,11.0,HIGH,OpenAlex best_oa_pdf_url present; PDF-like URL...
1,,10.1016/s0140-6736(24)00757-8,"Global incidence, prevalence, years lived with...",2024,hybrid,True,https://www.thelancet.com/article/S01406736240...,10.0,HIGH,OpenAlex best_oa_pdf_url present; PDF-like URL...
2,,10.1038/s41580-022-00566-8,"Long non-coding RNAs: definitions, functions, ...",2023,bronze,True,https://www.nature.com/articles/s41580-022-005...,10.0,HIGH,OpenAlex best_oa_pdf_url present; PDF-like URL...
3,,10.1093/nar/gkad344,antiSMASH 7.0: new and improved predictions fo...,2023,gold,True,https://academic.oup.com/nar/advance-article-p...,11.0,HIGH,OpenAlex best_oa_pdf_url present; PDF-like URL...
4,,10.1038/s41392-023-01452-1,Macrophages in immunoregulation and therapeutics,2023,gold,True,https://www.nature.com/articles/s41392-023-014...,11.0,HIGH,OpenAlex best_oa_pdf_url present; PDF-like URL...
5,,10.1038/s41392-024-01757-9,NF-κB in biology and targeted therapy: new ins...,2024,gold,True,https://www.nature.com/articles/s41392-024-017...,11.0,HIGH,OpenAlex best_oa_pdf_url present; PDF-like URL...
6,,10.4337/9781802208818.00008,The Fourth Industrial Revolution: what it mean...,2024,closed,False,https://doi.org/10.4337/9781802208818.00008,-2.5,LOW,"oa_status=closed; doi.org is landing, not PDF"
7,,10.1038/s41392-023-01481-w,"The blood–brain barrier: Structure, regulation...",2023,gold,True,https://www.nature.com/articles/s41392-023-014...,11.0,HIGH,OpenAlex best_oa_pdf_url present; PDF-like URL...
8,,10.1093/eurheartj/ehad192,2023 ESC Guidelines for the management of card...,2023,hybrid,True,https://academic.oup.com/eurheartj/advance-art...,10.0,HIGH,OpenAlex best_oa_pdf_url present; PDF-like URL...
9,,10.1038/s41580-024-00703-5,The cell biology of ferroptosis,2024,closed,False,https://doi.org/10.1038/s41580-024-00703-5,-2.5,LOW,"oa_status=closed; doi.org is landing, not PDF"


In [16]:
# ------------------------------------------------------------
# Cell 12: Relevance and priority scoring for RQ-driven research
# ------------------------------------------------------------
# This cell scores each (deduplicated) paper for:
# 1) RQ relevance: how strongly the paper matches our research questions (RQ)
# 2) Priority: which papers we should read/ingest first, balancing relevance,
#    impact, recency, and "free PDF likelihood".
#
# Philosophy:
# - Keep this model simple, transparent, and editable in a notebook.
# - Prefer high recall earlier; use scoring to rank, not to filter too hard.
# - Persist intermediate signals so we can debug why a paper ranked high/low.
#
# Outputs (added to `papers_dedup_df`):
# - rq_relevance_score (0..100)
# - rq_relevance_label ("HIGH" / "MEDIUM" / "LOW")
# - priority_score (0..100)
# - priority_tier ("P0" / "P1" / "P2" / "P3")
# - score_breakdown (dict for audit/debugging)

import math

# -----------------------------
# Config: RQ keyword sets
# -----------------------------
# You should edit/extend these lists as your RQ evolves.
# Keep terms broad (recall) and add more precise terms as you converge.
RQ_KEYWORDS: Dict[str, List[str]] = {
    "venture_capital": [
        "venture capital", "vc", "venture fund", "early-stage", "seed", "series a",
        "syndication", "term sheet", "valuation", "exit", "ipo", "unicorn",
    ],
    "limited_partners": [
        "limited partner", "lp", "institutional investor", "endowment", "pension fund",
        "fund of funds", "private equity", "allocation", "commitment", "manager selection",
    ],
    "government_vc": [
        "government venture capital", "public venture capital", "state-backed",
        "sovereign wealth fund", "innovation agency", "development finance institution",
        "industrial policy", "public finance", "policy instrument",
    ],
    "entrepreneurship_policy": [
        "entrepreneurship policy", "startup policy", "innovation policy",
        "business dynamism", "firm entry", "regulation", "tax credit",
        "accelerator", "incubator", "grant", "subsidy",
    ],
}

# Negative terms help down-rank clearly off-topic results (optional)
NEGATIVE_KEYWORDS_RQ: List[str] = [
    "colon carcinoma", "macrophage", "ferroptosis", "cancer", "protein", "genome",
    "immunology", "cardiology", "neural network", "diffusion model",
]

# Weights (tune later)
W_RELEVANCE_TITLE = 0.55
W_RELEVANCE_ABSTRACT = 0.45

W_PRIORITY_RELEVANCE = 0.55
W_PRIORITY_IMPACT = 0.20
W_PRIORITY_RECENCY = 0.15
W_PRIORITY_FREEPDF = 0.10

# -----------------------------
# Helpers: keyword matching
# -----------------------------
def _safe_text(x: Any) -> str:
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return ""
    return str(x).lower()

def count_keyword_hits(text: str, keywords: List[str]) -> int:
    """
    Count distinct keyword hits (simple substring match).
    """
    hits = 0
    for kw in keywords:
        if kw.lower() in text:
            hits += 1
    return hits

def relevance_for_theme(row: pd.Series, theme: str) -> Tuple[float, Dict[str, Any]]:
    """
    Compute relevance score for a given theme.
    Returns (score_0_to_100, breakdown).
    """
    title = _safe_text(row.get("title"))
    abstract = _safe_text(row.get("abstract"))

    pos_kws = RQ_KEYWORDS.get(theme, [])
    neg_kws = NEGATIVE_KEYWORDS_RQ

    title_hits = count_keyword_hits(title, pos_kws)
    abs_hits = count_keyword_hits(abstract, pos_kws)

    neg_title_hits = count_keyword_hits(title, neg_kws)
    neg_abs_hits = count_keyword_hits(abstract, neg_kws)

    # Convert hits -> bounded scores (log-ish saturation)
    title_score = 1.0 - math.exp(-0.7 * title_hits)   # 0..~1
    abs_score = 1.0 - math.exp(-0.35 * abs_hits)      # 0..~1

    base = (W_RELEVANCE_TITLE * title_score) + (W_RELEVANCE_ABSTRACT * abs_score)

    # Negative penalty (stronger on title)
    penalty = min(0.6, 0.15 * neg_title_hits + 0.08 * neg_abs_hits)

    score = max(0.0, base - penalty) * 100.0

    breakdown = {
        "theme": theme,
        "title_hits": title_hits,
        "abstract_hits": abs_hits,
        "neg_title_hits": neg_title_hits,
        "neg_abstract_hits": neg_abs_hits,
        "title_score": round(title_score, 4),
        "abstract_score": round(abs_score, 4),
        "penalty": round(penalty, 4),
        "relevance": round(score, 2),
    }
    return score, breakdown

def choose_best_theme_and_relevance(row: pd.Series) -> Tuple[str, float, Dict[str, Any]]:
    """
    Some rows came from multiple themes/queries.
    We compute relevance for each theme and keep the best.
    """
    themes = []
    v = row.get("theme")

    if isinstance(v, str) and v.strip():
        themes = [t.strip() for t in v.split(",") if t.strip()]
    else:
        themes = list(RQ_KEYWORDS.keys())

    best_theme = None
    best_score = -1.0
    best_breakdown = {}

    for t in themes:
        score, bd = relevance_for_theme(row, t)
        if score > best_score:
            best_score = score
            best_theme = t
            best_breakdown = bd

    return best_theme or "unknown", float(best_score), best_breakdown

# -----------------------------
# Priority components
# -----------------------------
def impact_score(row: pd.Series) -> float:
    """
    Map cited_by_count -> 0..100 with saturation.
    """
    c = row.get("cited_by_count")
    if c is None or (isinstance(c, float) and pd.isna(c)):
        return 0.0
    try:
        c = max(0, int(c))
    except Exception:
        return 0.0
    # Saturating transform: 0->0, 10->~50, 50->~80, 200->~95
    return (1.0 - math.exp(-c / 20.0)) * 100.0

def recency_score(row: pd.Series) -> float:
    """
    Newer papers are prioritized (YEAR_TO is typically current year).
    """
    y = row.get("publication_year")
    if y is None or (isinstance(y, float) and pd.isna(y)):
        return 0.0
    try:
        y = int(y)
    except Exception:
        return 0.0

    # Linear decay over 6 years window
    age = max(0, YEAR_TO - y)
    return max(0.0, 100.0 * (1.0 - min(1.0, age / 6.0)))

def freepdf_score(row: pd.Series) -> float:
    """
    Map free_pdf_score (Cell 11) to 0..100.
    """
    s = row.get("free_pdf_score")
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return 0.0
    try:
        s = float(s)
    except Exception:
        return 0.0
    # free_pdf_score is roughly [-5, 15]
    s = max(-5.0, min(15.0, s))
    return (s + 5.0) / 20.0 * 100.0

def label_relevance(score: float) -> str:
    if score >= 70:
        return "HIGH"
    if score >= 35:
        return "MEDIUM"
    return "LOW"

def tier_priority(score: float) -> str:
    if score >= 80:
        return "P0"
    if score >= 65:
        return "P1"
    if score >= 45:
        return "P2"
    return "P3"

# -----------------------------
# Run scoring
# -----------------------------
papers_scored_df = papers_dedup_df.copy()

best_theme, best_rel, breakdowns = [], [], []
for _, row in papers_scored_df.iterrows():
    t, s, bd = choose_best_theme_and_relevance(row)
    best_theme.append(t)
    best_rel.append(s)
    breakdowns.append(bd)

papers_scored_df["rq_best_theme"] = best_theme
papers_scored_df["rq_relevance_score"] = best_rel
papers_scored_df["rq_relevance_label"] = papers_scored_df["rq_relevance_score"].map(label_relevance)

papers_scored_df["impact_component"] = papers_scored_df.apply(impact_score, axis=1)
papers_scored_df["recency_component"] = papers_scored_df.apply(recency_score, axis=1)
papers_scored_df["freepdf_component"] = papers_scored_df.apply(freepdf_score, axis=1)

papers_scored_df["priority_score"] = (
    W_PRIORITY_RELEVANCE * papers_scored_df["rq_relevance_score"]
    + W_PRIORITY_IMPACT * papers_scored_df["impact_component"]
    + W_PRIORITY_RECENCY * papers_scored_df["recency_component"]
    + W_PRIORITY_FREEPDF * papers_scored_df["freepdf_component"]
)

papers_scored_df["priority_tier"] = papers_scored_df["priority_score"].map(tier_priority)
papers_scored_df["score_breakdown"] = breakdowns

logger.info("Priority tier distribution:\n%s", papers_scored_df["priority_tier"].value_counts())
logger.info("Relevance label distribution:\n%s", papers_scored_df["rq_relevance_label"].value_counts())

# Show top-ranked candidates for reading/ingestion
cols = [
    "priority_tier", "priority_score",
    "rq_best_theme", "rq_relevance_score", "rq_relevance_label",
    "free_pdf_label", "free_pdf_score",
    "publication_year", "cited_by_count",
    "title", "venue", "doi", "pdf_candidate_url",
]

papers_scored_df.sort_values("priority_score", ascending=False)[cols].head(30)


2026-01-12 09:32:15,853 | INFO | Priority tier distribution:
priority_tier
P3    6083
P2    1231
P1       6
Name: count, dtype: int64
2026-01-12 09:32:15,856 | INFO | Relevance label distribution:
rq_relevance_label
LOW       5409
MEDIUM    1867
HIGH        44
Name: count, dtype: int64


Unnamed: 0,priority_tier,priority_score,rq_best_theme,rq_relevance_score,rq_relevance_label,free_pdf_label,free_pdf_score,publication_year,cited_by_count,title,venue,doi,pdf_candidate_url
2824,P1,70.401026,venture_capital,85.445069,HIGH,HIGH,10.0,2024,7.0,Venture capital exit after venture IPO,Strategic Entrepreneurship Journal,10.1002/sej.1515,https://onlinelibrary.wiley.com/doi/pdfdirect/...
1783,P1,70.175828,venture_capital,85.558583,HIGH,LOW,2.5,2023,18.0,"Scalability, venture capital availability, and...",Journal of Business Venturing,10.1016/j.jbusvent.2023.106345,https://doi.org/10.1016/j.jbusvent.2023.106345
3537,P1,68.120173,venture_capital,85.445069,HIGH,HIGH,10.0,2024,4.0,Post‐IPO lead venture capital firm involvement...,British Journal of Management,10.1111/1467-8551.12803,https://onlinelibrary.wiley.com/doi/pdfdirect/...
3298,P1,67.116403,venture_capital,82.168033,HIGH,HIGH,10.0,2024,5.0,"VC ownership post‐IPO: When, why, and how do V...",The Journal of Financial Research,10.1111/jfir.12412,https://onlinelibrary.wiley.com/doi/pdfdirect/...
7110,P1,66.692418,venture_capital,82.168033,HIGH,HIGH,13.0,2025,0.0,Can Large Language Models Improve Venture Capi...,arXiv (Cornell University),,https://arxiv.org/pdf/2601.00810
1071,P1,65.163665,entrepreneurship_policy,56.94061,MEDIUM,HIGH,10.0,2024,34.0,The role of policy and regulation in promoting...,World Journal of Advanced Research and Reviews,10.30574/wjarr.2024.22.1.1047,https://wjarr.com/sites/default/files/WJARR-20...
2987,P2,64.063118,venture_capital,70.689968,HIGH,HIGH,10.0,2025,6.0,The Inflation Reduction Act’s Impact Upon Earl...,Therapeutic Innovation & Regulatory Science,10.1007/s43441-025-00773-3,https://link.springer.com/content/pdf/10.1007/...
1538,P2,64.019968,venture_capital,64.867981,MEDIUM,HIGH,10.0,2023,22.0,IPOs and SPACs: Recent Developments,Annual Review of Financial Economics,10.1146/annurev-financial-111021-100657,https://www.annualreviews.org/doi/pdf/10.1146/...
476,P2,63.520343,entrepreneurship_policy,50.34147,MEDIUM,HIGH,13.0,2023,68.0,Frontier AI Regulation: Managing Emerging Risk...,arXiv (Cornell University),10.48550/arxiv.2307.03718,https://arxiv.org/pdf/2307.03718
1535,P2,63.472061,venture_capital,70.689968,HIGH,LOW,2.5,2023,22.0,"A definition, review, and extension of global ...",Journal of Business Research,10.1016/j.jbusres.2022.113605,https://doi.org/10.1016/j.jbusres.2022.113605


In [17]:
# ------------------------------------------------------------
# Cell 13: Candidate list export for downstream ingestion
# ------------------------------------------------------------
# This cell produces the main artifact of this notebook:
# a ranked candidate list of papers (post-dedup) for downstream PDF ingestion.
#
# Outputs:
# - candidates_full.parquet  : full table with scoring + debugging columns
# - candidates_full.csv      : full table (CSV)
# - candidates_topN.csv      : top N (compact) for quick review
# - candidates_topN.md       : top N as a lightweight markdown table (optional)
#
# Notes:
# - Keep artifacts deterministic (sorted, stable columns)
# - Include enough columns to debug ranking decisions
# - Keep a compact view for daily manual review

from pathlib import Path

EXPORT_DIR = ARTIFACT_DIR / "candidates"
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# -----------------------------
# Select and order columns
# -----------------------------
CORE_COLS = [
    "priority_tier",
    "priority_score",
    "rq_best_theme",
    "rq_relevance_score",
    "rq_relevance_label",
    "free_pdf_label",
    "free_pdf_score",
    "publication_year",
    "cited_by_count",
    "source",
    "source_id",
    "paper_id",
    "doi",
    "title",
    "venue",
    "authors",
    "pdf_candidate_url",
    "landing_page_url",
    "best_oa_landing_url",
    "best_oa_pdf_url",
]

DEBUG_COLS = [
    "oa_status",
    "is_oa",
    "pdf_url_hint",
    "free_pdf_reason",
    "score_breakdown",
]

# Ensure optional columns exist (so export doesn't fail)
for c in DEBUG_COLS:
    if c not in papers_scored_df.columns:
        papers_scored_df[c] = None

export_cols = CORE_COLS + DEBUG_COLS

# -----------------------------
# Sort and prepare export frame
# -----------------------------
candidates_df = papers_scored_df.copy()

# Stable sort:
# 1) priority_score desc
# 2) rq_relevance_score desc
# 3) cited_by_count desc
# 4) publication_year desc
candidates_df["_cited_sort"] = candidates_df["cited_by_count"].fillna(0).astype(int)
candidates_df["_year_sort"] = candidates_df["publication_year"].fillna(0).astype(int)

candidates_df = candidates_df.sort_values(
    by=["priority_score", "rq_relevance_score", "_cited_sort", "_year_sort"],
    ascending=[False, False, False, False],
).reset_index(drop=True)

# -----------------------------
# Export artifacts
# -----------------------------
out_parquet = EXPORT_DIR / "candidates_full.parquet"
out_csv = EXPORT_DIR / "candidates_full.csv"

candidates_df[export_cols].to_parquet(out_parquet, index=False)
candidates_df[export_cols].to_csv(out_csv, index=False)

logger.info("Saved candidates parquet: %s", out_parquet)
logger.info("Saved candidates CSV: %s", out_csv)

# -----------------------------
# Compact "top N" exports for quick review
# -----------------------------
TOP_N = 200

TOP_COLS = [
    "priority_tier",
    "priority_score",
    "rq_best_theme",
    "rq_relevance_label",
    "free_pdf_label",
    "publication_year",
    "cited_by_count",
    "title",
    "venue",
    "doi",
    "pdf_candidate_url",
]

top_df = candidates_df[TOP_COLS].head(TOP_N).copy()

out_top_csv = EXPORT_DIR / f"candidates_top{TOP_N}.csv"
top_df.to_csv(out_top_csv, index=False)
logger.info("Saved top-%d CSV: %s", TOP_N, out_top_csv)

# Optional: markdown table for Notion/manual copy-paste
out_top_md = EXPORT_DIR / f"candidates_top{TOP_N}.md"
try:
    top_df.to_markdown(out_top_md, index=False)
    logger.info("Saved top-%d Markdown: %s", TOP_N, out_top_md)
except Exception as e:
    logger.warning("Could not write markdown table (%s). Skipping.", e)

# -----------------------------
# Update run metadata
# -----------------------------
RUN_CONTEXT["exports"] = RUN_CONTEXT.get("exports", {})
RUN_CONTEXT["exports"].update(
    {
        "candidates_full_parquet": str(out_parquet),
        "candidates_full_csv": str(out_csv),
        "candidates_top_csv": str(out_top_csv),
        "candidates_top_md": str(out_top_md),
        "top_n": TOP_N,
    }
)

with open(OUT_RUN_METADATA_JSON, "w") as f:
    json.dump(RUN_CONTEXT, f, indent=2)

logger.info("Updated run metadata with exports: %s", OUT_RUN_METADATA_JSON)

# Quick preview
display(top_df.head(20))


2026-01-12 09:33:46,938 | INFO | Saved candidates parquet: artifacts/018_seed_corpus_2023plus_discovery/candidates/candidates_full.parquet
2026-01-12 09:33:46,938 | INFO | Saved candidates CSV: artifacts/018_seed_corpus_2023plus_discovery/candidates/candidates_full.csv
2026-01-12 09:33:46,945 | INFO | Saved top-200 CSV: artifacts/018_seed_corpus_2023plus_discovery/candidates/candidates_top200.csv
2026-01-12 09:33:47,022 | INFO | Saved top-200 Markdown: artifacts/018_seed_corpus_2023plus_discovery/candidates/candidates_top200.md
2026-01-12 09:33:47,024 | INFO | Updated run metadata with exports: artifacts/018_seed_corpus_2023plus_discovery/run_metadata.json


Unnamed: 0,priority_tier,priority_score,rq_best_theme,rq_relevance_label,free_pdf_label,publication_year,cited_by_count,title,venue,doi,pdf_candidate_url
0,P1,70.401026,venture_capital,HIGH,HIGH,2024,7.0,Venture capital exit after venture IPO,Strategic Entrepreneurship Journal,10.1002/sej.1515,https://onlinelibrary.wiley.com/doi/pdfdirect/...
1,P1,70.175828,venture_capital,HIGH,LOW,2023,18.0,"Scalability, venture capital availability, and...",Journal of Business Venturing,10.1016/j.jbusvent.2023.106345,https://doi.org/10.1016/j.jbusvent.2023.106345
2,P1,68.120173,venture_capital,HIGH,HIGH,2024,4.0,Post‐IPO lead venture capital firm involvement...,British Journal of Management,10.1111/1467-8551.12803,https://onlinelibrary.wiley.com/doi/pdfdirect/...
3,P1,67.116403,venture_capital,HIGH,HIGH,2024,5.0,"VC ownership post‐IPO: When, why, and how do V...",The Journal of Financial Research,10.1111/jfir.12412,https://onlinelibrary.wiley.com/doi/pdfdirect/...
4,P1,66.692418,venture_capital,HIGH,HIGH,2025,0.0,Can Large Language Models Improve Venture Capi...,arXiv (Cornell University),,https://arxiv.org/pdf/2601.00810
5,P1,65.163665,entrepreneurship_policy,MEDIUM,HIGH,2024,34.0,The role of policy and regulation in promoting...,World Journal of Advanced Research and Reviews,10.30574/wjarr.2024.22.1.1047,https://wjarr.com/sites/default/files/WJARR-20...
6,P2,64.063118,venture_capital,HIGH,HIGH,2025,6.0,The Inflation Reduction Act’s Impact Upon Earl...,Therapeutic Innovation & Regulatory Science,10.1007/s43441-025-00773-3,https://link.springer.com/content/pdf/10.1007/...
7,P2,64.019968,venture_capital,MEDIUM,HIGH,2023,22.0,IPOs and SPACs: Recent Developments,Annual Review of Financial Economics,10.1146/annurev-financial-111021-100657,https://www.annualreviews.org/doi/pdf/10.1146/...
8,P2,63.520343,entrepreneurship_policy,MEDIUM,HIGH,2023,68.0,Frontier AI Regulation: Managing Emerging Risk...,arXiv (Cornell University),10.48550/arxiv.2307.03718,https://arxiv.org/pdf/2307.03718
9,P2,63.472061,venture_capital,HIGH,LOW,2023,22.0,"A definition, review, and extension of global ...",Journal of Business Research,10.1016/j.jbusres.2022.113605,https://doi.org/10.1016/j.jbusres.2022.113605


In [20]:
# ------------------------------------------------------------
# Cell 14: Lightweight review view for top-ranked papers
# ------------------------------------------------------------
# This cell renders a human-friendly review table for quick daily inspection.
# The goal is to make a 5–10 minute "sanity pass" easy:
# - Are top-ranked items actually on-topic?
# - Do we have a plausible PDF candidate URL?
# - Are there obvious false positives we should down-rank (update RQ keywords)?
#
# Output:
# - review_df: compact DataFrame for on-screen review
# - (optional) a CSV export that can be opened quickly

from textwrap import shorten
from datetime import datetime

REVIEW_TOP_N = 100

def _short(x: Any, n: int = 140) -> str:
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return ""
    s = str(x).replace("\n", " ").strip()
    return shorten(s, width=n, placeholder="...")

def _fmt_authors(auths: Any, max_n: int = 3) -> str:
    if not isinstance(auths, list) or not auths:
        return ""
    head = auths[:max_n]
    suffix = " et al." if len(auths) > max_n else ""
    return ", ".join(head) + suffix

def _clickable(u: Any) -> str:
    # In many notebook UIs, raw URLs are automatically clickable.
    if u is None or (isinstance(u, float) and pd.isna(u)):
        return ""
    s = str(u).strip()
    return s

review_cols = [
    "priority_tier",
    "priority_score",
    "rq_best_theme",
    "rq_relevance_score",
    "rq_relevance_label",
    "free_pdf_label",
    "free_pdf_score",
    "publication_year",
    "cited_by_count",
    "title",
    "venue",
    "authors",
    "doi",
    "pdf_candidate_url",
    "landing_page_url",
]

review_df = (
    candidates_df
    .head(REVIEW_TOP_N)
    [review_cols]
    .copy()
)

# Format for readability
review_df["priority_score"] = review_df["priority_score"].round(2)
review_df["rq_relevance_score"] = review_df["rq_relevance_score"].round(1)
review_df["free_pdf_score"] = review_df["free_pdf_score"].round(1)

review_df["authors"] = review_df["authors"].apply(_fmt_authors)
review_df["title"] = review_df["title"].apply(lambda x: _short(x, 120))
review_df["venue"] = review_df["venue"].apply(lambda x: _short(x, 60))
review_df["pdf_candidate_url"] = review_df["pdf_candidate_url"].apply(_clickable)
review_df["landing_page_url"] = review_df["landing_page_url"].apply(_clickable)

logger.info("Review view ready: top %d rows.", REVIEW_TOP_N)

display(review_df)

# Optional quick export for sharing/review outside the notebook
run_ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")  # UTC timestamp

out_review_csv = EXPORT_DIR / f"review_top{REVIEW_TOP_N}_{run_ts}.csv"
review_df.to_csv(out_review_csv, index=False)

logger.info("Saved review CSV: %s", out_review_csv)


2026-01-12 14:02:31,456 | INFO | Review view ready: top 100 rows.


Unnamed: 0,priority_tier,priority_score,rq_best_theme,rq_relevance_score,rq_relevance_label,free_pdf_label,free_pdf_score,publication_year,cited_by_count,title,venue,authors,doi,pdf_candidate_url,landing_page_url
0,P1,70.40,venture_capital,85.4,HIGH,HIGH,10.0,2024,7,Venture capital exit after venture IPO,Strategic Entrepreneurship Journal,"Yong Li, Tailan Chi, Sai Lan et al.",10.1002/sej.1515,https://onlinelibrary.wiley.com/doi/pdfdirect/...,https://doi.org/10.1002/sej.1515
1,P1,70.18,venture_capital,85.6,HIGH,LOW,2.5,2023,18,"Scalability, venture capital availability, and...",Journal of Business Venturing,"Deepak Somaya, Jingya You",10.1016/j.jbusvent.2023.106345,https://doi.org/10.1016/j.jbusvent.2023.106345,https://doi.org/10.1016/j.jbusvent.2023.106345
2,P1,68.12,venture_capital,85.4,HIGH,HIGH,10.0,2024,4,Post‐IPO lead venture capital firm involvement...,British Journal of Management,"Anup Basnet, Thomas Walker",10.1111/1467-8551.12803,https://onlinelibrary.wiley.com/doi/pdfdirect/...,https://doi.org/10.1111/1467-8551.12803
3,P1,67.12,venture_capital,82.2,HIGH,HIGH,10.0,2024,5,"VC ownership post‐IPO: When, why, and how do V...",The Journal of Financial Research,"Anup Basnet, Kuntara Pukthuanthong, Harry J. T...",10.1111/jfir.12412,https://onlinelibrary.wiley.com/doi/pdfdirect/...,https://doi.org/10.1111/jfir.12412
4,P1,66.69,venture_capital,82.2,HIGH,HIGH,13.0,2025,0,Can Large Language Models Improve Venture Capi...,arXiv (Cornell University),Mohammadhossien Rashidi,,https://arxiv.org/pdf/2601.00810,https://arxiv.org/abs/2601.00810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,P2,57.54,limited_partners,41.0,MEDIUM,HIGH,10.0,2023,557,Climate Risk Disclosure and Institutional Inve...,Review of Financial Studies,"Emirhan Ilhan, Philipp Krueger, Zacharias Saut...",10.1093/rfs/hhad002,https://academic.oup.com/rfs/article-pdf/36/7/...,https://doi.org/10.1093/rfs/hhad002
96,P2,57.54,entrepreneurship_policy,41.0,MEDIUM,HIGH,10.0,2023,397,Better regulation for the green transition,Public governance policy papers,Yola Thürer,10.1787/c91a04bc-en,https://www.oecd-ilibrary.org/deliver/c91a04bc...,https://doi.org/10.1787/c91a04bc-en
97,P2,57.54,entrepreneurship_policy,41.0,MEDIUM,HIGH,10.0,2023,236,TGF-β Regulation of T Cells,Annual Review of Immunology,Wanjun Chen,10.1146/annurev-immunol-101921-045939,https://www.annualreviews.org/doi/pdf/10.1146/...,https://doi.org/10.1146/annurev-immunol-101921...
98,P2,57.54,entrepreneurship_policy,41.0,MEDIUM,HIGH,10.0,2023,189,Atomic‐Level Regulation of Cobalt Single‐Atom ...,Angewandte Chemie International Edition,"Yuanjun Chen, Bing Jiang, Haigang Hao et al.",10.1002/anie.202301879,https://onlinelibrary.wiley.com/doi/pdfdirect/...,https://doi.org/10.1002/anie.202301879


2026-01-12 14:02:31,539 | INFO | Saved review CSV: artifacts/018_seed_corpus_2023plus_discovery/candidates/review_top100_20260112_050231.csv


In [19]:
# ------------------------------------------------------------
# Cell 15: Run metrics, logs, and metadata persistence
# ------------------------------------------------------------
# This cell finalizes the notebook run by:
# - computing summary metrics (counts, label distributions, top themes)
# - persisting a single JSON "run summary" for easy tracking/debugging
# - writing a compact human-readable README-like text (optional)
#
# Outputs:
# - artifacts/.../run_summary.json
# - artifacts/.../run_summary.md (optional)

from datetime import datetime
from collections import defaultdict

RUN_SUMMARY_PATH = ARTIFACT_DIR / "run_summary.json"
RUN_SUMMARY_MD_PATH = ARTIFACT_DIR / "run_summary.md"

# -----------------------------
# Metrics helpers
# -----------------------------
def _vc(series: pd.Series) -> Dict[str, int]:
    """
    Value counts as a plain dict (JSON-friendly).
    """
    return series.value_counts(dropna=False).head(50).to_dict()

def _int(x: Any, default: int = 0) -> int:
    try:
        if x is None or (isinstance(x, float) and pd.isna(x)):
            return default
        return int(x)
    except Exception:
        return default

# -----------------------------
# Compute metrics
# -----------------------------
metrics = {
    "timestamp_utc": datetime.utcnow().isoformat(),
    "year_from": YEAR_FROM,
    "year_to": YEAR_TO,
    "sources_enabled": SOURCES,
    "artifact_dir": str(ARTIFACT_DIR),
    # Raw counts
    "raw_openalex_rows": _int(RUN_CONTEXT.get("raw_openalex_rows")),
    "raw_arxiv_rows": _int(RUN_CONTEXT.get("raw_arxiv_rows")),
    # Unified counts
    "unified_rows": _int(len(papers_raw_df)),
    "unified_unique_paper_id": _int(papers_raw_df["paper_id"].nunique()),
    # Dedup counts
    "dedup_rows": _int(len(papers_dedup_df)),
    "dedup_unique_paper_id": _int(papers_dedup_df["paper_id"].nunique()),
    # Scoring distributions
    "free_pdf_label_dist": _vc(papers_dedup_df["free_pdf_label"]) if "free_pdf_label" in papers_dedup_df.columns else {},
    "priority_tier_dist": _vc(papers_scored_df["priority_tier"]) if "priority_tier" in papers_scored_df.columns else {},
    "rq_relevance_label_dist": _vc(papers_scored_df["rq_relevance_label"]) if "rq_relevance_label" in papers_scored_df.columns else {},
    "rq_best_theme_dist": _vc(papers_scored_df["rq_best_theme"]) if "rq_best_theme" in papers_scored_df.columns else {},
    # Export paths (already written in Cell 13)
    "exports": RUN_CONTEXT.get("exports", {}),
}

# Useful sanity values
metrics["top_priority_examples"] = (
    papers_scored_df.sort_values("priority_score", ascending=False)[
        ["priority_tier", "priority_score", "rq_best_theme", "publication_year", "title", "doi", "pdf_candidate_url"]
    ]
    .head(10)
    .to_dict(orient="records")
)

# -----------------------------
# Persist summary JSON
# -----------------------------
with open(RUN_SUMMARY_PATH, "w") as f:
    json.dump(metrics, f, indent=2)

logger.info("Saved run summary JSON: %s", RUN_SUMMARY_PATH)

# -----------------------------
# Optional: Markdown summary for quick reading
# -----------------------------
try:
    lines = []
    lines.append(f"# Run Summary: 018_seed_corpus_2023plus_discovery")
    lines.append("")
    lines.append(f"- Timestamp (UTC): {metrics['timestamp_utc']}")
    lines.append(f"- Year range: {YEAR_FROM}–{YEAR_TO}")
    lines.append(f"- Artifact dir: `{metrics['artifact_dir']}`")
    lines.append("")
    lines.append("## Counts")
    lines.append(f"- Raw OpenAlex rows: {metrics['raw_openalex_rows']}")
    lines.append(f"- Raw arXiv rows: {metrics['raw_arxiv_rows']}")
    lines.append(f"- Unified rows: {metrics['unified_rows']} (unique paper_id: {metrics['unified_unique_paper_id']})")
    lines.append(f"- Dedup rows: {metrics['dedup_rows']} (unique paper_id: {metrics['dedup_unique_paper_id']})")
    lines.append("")
    lines.append("## Distributions")
    if metrics["free_pdf_label_dist"]:
        lines.append(f"- free_pdf_label: {metrics['free_pdf_label_dist']}")
    if metrics["rq_relevance_label_dist"]:
        lines.append(f"- rq_relevance_label: {metrics['rq_relevance_label_dist']}")
    if metrics["priority_tier_dist"]:
        lines.append(f"- priority_tier: {metrics['priority_tier_dist']}")
    if metrics["rq_best_theme_dist"]:
        lines.append(f"- rq_best_theme: {metrics['rq_best_theme_dist']}")
    lines.append("")
    lines.append("## Top examples (by priority)")
    for r in metrics["top_priority_examples"]:
        title = r.get("title", "")
        doi = r.get("doi", "")
        score = r.get("priority_score", "")
        tier = r.get("priority_tier", "")
        lines.append(f"- **{tier} {score:.2f}** | {title} | {doi}")
    lines.append("")

    RUN_SUMMARY_MD_PATH.write_text("\n".join(lines), encoding="utf-8")
    logger.info("Saved run summary Markdown: %s", RUN_SUMMARY_MD_PATH)
except Exception as e:
    logger.warning("Failed to write run summary Markdown (%s). Skipping.", e)

# -----------------------------
# Also update the existing run_metadata.json (small, non-breaking)
# -----------------------------
RUN_CONTEXT["run_summary_json"] = str(RUN_SUMMARY_PATH)
RUN_CONTEXT["run_summary_md"] = str(RUN_SUMMARY_MD_PATH)

with open(OUT_RUN_METADATA_JSON, "w") as f:
    json.dump(RUN_CONTEXT, f, indent=2)

logger.info("Updated run metadata with run summary paths: %s", OUT_RUN_METADATA_JSON)

# Preview key metrics
display(pd.DataFrame([{
    "raw_openalex_rows": metrics["raw_openalex_rows"],
    "raw_arxiv_rows": metrics["raw_arxiv_rows"],
    "unified_rows": metrics["unified_rows"],
    "dedup_rows": metrics["dedup_rows"],
    "high_relevance": metrics["rq_relevance_label_dist"].get("HIGH", 0),
    "p0_count": metrics["priority_tier_dist"].get("P0", 0),
    "p1_count": metrics["priority_tier_dist"].get("P1", 0),
    "top_n_export": metrics.get("exports", {}).get("top_n", None),
}]))


2026-01-12 09:37:35,077 | INFO | Saved run summary JSON: artifacts/018_seed_corpus_2023plus_discovery/run_summary.json
2026-01-12 09:37:35,093 | INFO | Saved run summary Markdown: artifacts/018_seed_corpus_2023plus_discovery/run_summary.md
2026-01-12 09:37:35,097 | INFO | Updated run metadata with run summary paths: artifacts/018_seed_corpus_2023plus_discovery/run_metadata.json


Unnamed: 0,raw_openalex_rows,raw_arxiv_rows,unified_rows,dedup_rows,high_relevance,p0_count,p1_count,top_n_export
0,6848,2408,9256,7320,44,0,6,200
