In [39]:
# ============================================================
# 007_Parametric_MOIC_Analysis_VC_Portfolios
# ============================================================
#
# Overview
# --------
# This notebook estimates a VC portfolio‚Äôs *approximate* MOIC by combining:
# (1) explicit, round-by-round investment assumptions (check size, implied multiple,
#     ownership/dilution proxies), and
# (2) lightweight, web-derived evidence of actual investments and exits.
#
# The workflow is intentionally pragmatic. It generates candidate startup lists
# using Google Custom Search Engine (CSE), validates investment and exit signals
# through high-relevance web sources (official press releases, reputable coverage),
# and applies conservative heuristics to translate partial evidence into
# deal-level outcomes.
#
# Exit verification follows a two-stage approach:
# (a) a fast, snippet-based classification using search result titles/snippets, and
# (b) optional deeper verification (e.g., page scraping) only when higher confidence
#     is required.
#
# This notebook is designed for scenario building and portfolio intelligence,
# not for audited performance measurement.
#
#
# Inputs / Outputs
# ----------------
# Inputs:
# - Target VC name(s) and target year range
# - Round-level assumptions (e.g., check size in USD, expected return multiple per round)
# - Scenario-level knobs (e.g., holding multiple proxies, fallback logic)
# - CSE configuration (API key / CX) and query templates
# - Scraping constraints (domains allowlist/denylist, rate limits, robots checks)
#
# Outputs:
# - Candidate startup list for a given VC & year (CSE-derived)
# - Filtered investment evidence set (investment confirmation + source URLs)
# - Exit evidence set (IPO / M&A signals with supporting sources)
# - Tagged dataset (round, year, exit type, confidence, notes)
# - Deal-level MOIC approximation
# - Portfolio-level summary statistics and scenario comparisons
#
#
# Structure
# ---------
# Cell 0 : Purpose, scope, and non-goals (approximation + evidence-based tagging)
# Cell 1 : Imports, configuration, and credentials (CSE / scraping settings)
# Cell 2 : Define round-level assumptions (check size, target multiples, heuristics)
# Cell 3 : Build candidate startup universe via CSE (VC √ó year ‚Üí startup candidates)
# Cell 4 : Rank/clean candidates (dedupe, normalize names, relevance scoring)
# Cell 5 : Robots.txt quick check + source selection (PR/news/company pages)
# Cell 6 : Investment verification (VC √ó startup √ó year) and evidence capture
# Cell 7 : Construct the investment table (one row per inferred investment event)
# Cell 8 : Exit discovery via CSE (IPO / M&A queries per startup)
# Cell 9 : Exit tagging (fast snippet-based classification; optional deeper verification)
# Cell 10: MOIC approximation logic (round assumptions √ó exit outcome)
# Cell 11: Portfolio aggregation (summary stats, distributions, scenario comparisons)
# Cell 12: Export artifacts (CSV/JSON) for downstream analysis/visualization
#
#
# Notes
# -----
# - Evidence is probabilistic: each inferred ‚Äúinvestment‚Äù and ‚Äúexit‚Äù carries a
#   confidence score and source URLs for traceability.
# - robots.txt checks are a lightweight technical safeguard, not a legal determination.
# - This pipeline works best with reputable sources (official PR, major media,
#   regulatory filings) and an explicit allowlist of domains.
# - The resulting MOIC is an approximation intended for comparative and exploratory
#   analysis rather than precise fund performance attribution.


In [1]:
# ============================================================
# Cell 1 : Imports, configuration, and credentials (CSE / scraping settings)
# ============================================================

# ----------------------------
# 1-0. Imports
# ----------------------------
import os
import re
import time
import json
import random
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import requests
from dotenv import load_dotenv

# ----------------------------
# 1-1. Load credentials (env.txt)
# ----------------------------
# Load env.txt explicitly (recommended for local + GitHub Actions parity)
load_dotenv("env.txt")

# --- OpenAI (optional; used for summarization / tagging if enabled) ---
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY:
    print("‚úÖ OPENAI_API_KEY loaded successfully")
else:
    print("‚ÑπÔ∏è OPENAI_API_KEY not found. OpenAI-based summarization/tagging will be disabled.")

# Only initialize the client if the key exists (prevents hard failure for data-only runs)
openai_client = None
if OPENAI_API_KEY:
    try:
        from openai import OpenAI
        openai_client = OpenAI(api_key=OPENAI_API_KEY)
    except Exception as e:
        print(f"‚ö†Ô∏è OpenAI client could not be initialized: {e}")
        openai_client = None

# --- Google Custom Search Engine (required for this notebook) ---
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")

if GOOGLE_API_KEY is None or GOOGLE_CSE_CX is None:
    raise ValueError("GOOGLE_API_KEY or GOOGLE_CSE_CX is missing in env.txt")
else:
    print("‚úÖ Google CSE credentials loaded successfully")

# ----------------------------
# 1-2. Global configuration
# ----------------------------
@dataclass
class CSEConfig:
    api_key: str
    cx: str
    endpoint: str = "https://www.googleapis.com/customsearch/v1"
    num: int = 10                 # results per request (max 10)
    lang: str = "lang_en"         # can be changed to "lang_ja" for Japanese sources
    safe: str = "off"             # "off" / "active"


@dataclass
class ScrapeConfig:
    user_agent: str = (
        "Mozilla/5.0 (compatible; researchOS/1.0; +https://example.com/bot)"
    )
    timeout_sec: int = 20
    max_bytes: int = 2_000_000     # hard cap on response size (2MB)
    min_delay_sec: float = 1.0     # base delay between requests
    max_delay_sec: float = 2.0     # jitter upper bound
    respect_robots: bool = True
    allowed_domains: Optional[List[str]] = None   # set to list for strict allowlist
    blocked_domains: Optional[List[str]] = None   # set to list for explicit denylist


cse_cfg = CSEConfig(api_key=GOOGLE_API_KEY, cx=GOOGLE_CSE_CX)
scrape_cfg = ScrapeConfig(
    allowed_domains=None,  # e.g., ["prnewswire.com", "businesswire.com", "reuters.com"]
    blocked_domains=[
        "facebook.com",
        "instagram.com",
        "linkedin.com",
        "twitter.com",
        "x.com",
        "tiktok.com",
    ],
)

print("‚úÖ Configuration objects initialized (cse_cfg, scrape_cfg)")

# ----------------------------
# 1-3. HTTP session (shared)
# ----------------------------
session = requests.Session()
session.headers.update({"User-Agent": scrape_cfg.user_agent})

# ----------------------------
# 1-4. Helper utilities (sleep jitter / URL normalization)
# ----------------------------
def sleep_with_jitter(min_s: float, max_s: float) -> None:
    """Sleep a random amount of time to reduce burstiness."""
    time.sleep(random.uniform(min_s, max_s))


def normalize_url(url: str) -> str:
    """Basic URL normalization for dedupe."""
    url = url.strip()
    url = re.sub(r"#.*$", "", url)  # remove fragments
    url = re.sub(r"\?$", "", url)
    return url


print("‚úÖ Session and helper utilities ready")


‚úÖ OPENAI_API_KEY loaded successfully
‚úÖ Google CSE credentials loaded successfully
‚úÖ Configuration objects initialized (cse_cfg, scrape_cfg)
‚úÖ Session and helper utilities ready


In [17]:
# ============================================================
# Cell 2 : Define round-level assumptions (check size, target multiples, heuristics)
# ============================================================

# ----------------------------
# 2-0. Purpose
# ----------------------------
# We model a VC portfolio with explicit, round-level assumptions.
# These assumptions are used later to compute an *approximate* deal-level MOIC
# once an exit signal (IPO / M&A / None) is detected and tagged.
#
# Notes:
# - This is scenario modeling, not audited performance measurement.
# - Keep the assumptions explicit and easy to edit.
# - Use the "scenario_name" to version and compare runs.

from dataclasses import dataclass
from typing import Dict, Optional

# ----------------------------
# 2-1. Scenario metadata
# ----------------------------
scenario_name = "baseline_v1"
target_currency = "USD"

# ----------------------------
# 2-2. Round-level assumption schema
# ----------------------------
@dataclass
class RoundAssumption:
    round_name: str
    check_size_usd: float                  # assumed investment amount per deal
    target_multiple_base: float            # base case exit multiple for this round (if successful)
    target_multiple_bull: float            # bull case multiple (for upside outcomes)
    loss_probability: float                # probability of total loss (0.0 - 1.0)
    partial_loss_multiple: float = 0.0     # multiple applied for "partial loss" (0 => wipeout)
    dilution_factor: float = 1.0           # heuristic factor to reflect dilution (<= 1.0)
    notes: str = ""

# ----------------------------
# 2-3. Define baseline assumptions (edit here)
# ----------------------------
# You can tune these based on your mental model:
# - Earlier rounds: smaller check sizes, higher dispersion / higher upside.
# - Later rounds: larger checks, lower upside, lower wipeout probability.
#
# dilution_factor is a simple proxy; if you later model ownership explicitly,
# you can replace this with a computed value.

ROUND_ASSUMPTIONS: Dict[str, RoundAssumption] = {
    "Pre-Seed": RoundAssumption(
        round_name="Pre-Seed",
        check_size_usd=250_000,
        target_multiple_base=8.0,
        target_multiple_bull=25.0,
        loss_probability=0.65,
        partial_loss_multiple=0.0,
        dilution_factor=0.85,
        notes="Highest uncertainty; heavy skew."
    ),
    "Seed": RoundAssumption(
        round_name="Seed",
        check_size_usd=500_000,
        target_multiple_base=6.0,
        target_multiple_bull=18.0,
        loss_probability=0.55,
        partial_loss_multiple=0.2,
        dilution_factor=0.88,
        notes="Still high dispersion; some partial returns."
    ),
    "Series A": RoundAssumption(
        round_name="Series A",
        check_size_usd=1_500_000,
        target_multiple_base=4.0,
        target_multiple_bull=10.0,
        loss_probability=0.40,
        partial_loss_multiple=0.5,
        dilution_factor=0.92,
        notes="Lower wipeout; moderate upside."
    ),
    "Series B+": RoundAssumption(
        round_name="Series B+",
        check_size_usd=3_000_000,
        target_multiple_base=2.5,
        target_multiple_bull=6.0,
        loss_probability=0.25,
        partial_loss_multiple=0.8,
        dilution_factor=0.95,
        notes="Later-stage; lower variance."
    ),
}

# ----------------------------
# Follow-on adjustment factors
# ----------------------------
# These factors down-adjust expected multiples and/or effective check size
# for follow-on investments, reflecting lower ownership expansion
# and lower convexity compared to initial checks.

FOLLOW_ON_ADJUSTMENT = {
    "Pre-Seed": {
        "multiple_factor": 0.75,
        "check_size_factor": 1.0,   # same check, just lower convexity
    },
    "Seed": {
        "multiple_factor": 0.70,
        "check_size_factor": 1.0,
    },
    "Series A": {
        "multiple_factor": 0.60,
        "check_size_factor": 1.0,
    },
    "Series B+": {
        "multiple_factor": 0.50,
        "check_size_factor": 1.0,
    },
}

DEFAULT_FOLLOW_ON_ADJUSTMENT = {
    "multiple_factor": 0.65,
    "check_size_factor": 1.0,
}
DEFAULT_ROUND = "Seed"  # used when round is unknown

# ----------------------------
# 2-4. Heuristics (mapping + fallbacks)
# ----------------------------
# If the scraped evidence does not explicitly name the round,
# we try to infer it from keywords. Keep it conservative.
ROUND_KEYWORD_MAP = {
    "Pre-Seed": ["pre-seed", "preseed"],
    "Seed": ["seed", "seed round"],
    "Series A": ["series a", "series-a"],
    "Series B+": ["series b", "series c", "series d", "growth", "late-stage", "series b+", "series e"],
}

def infer_round_from_text(text: str, default_round: str = DEFAULT_ROUND) -> str:
    """
    Infer a financing round label from raw text (title/snippet/body).
    Returns DEFAULT_ROUND if no strong signal is found.
    """
    if not text:
        return default_round
    t = text.lower()
    for round_name, keywords in ROUND_KEYWORD_MAP.items():
        for kw in keywords:
            if kw in t:
                return round_name
    return default_round

# ----------------------------
# 2-5. Helper: compute an approximate deal MOIC from a tagged outcome
# ----------------------------
def estimate_deal_moic(
    round_name: str,
    outcome: str,
    bull: bool = False,
) -> float:
    """
    Estimate deal-level MOIC using round assumptions and an outcome tag.

    outcome:
      - "EXIT"   : successful liquidity event (IPO/M&A confirmed)
      - "PARTIAL": partial return / soft landing
      - "LOSS"   : wipeout / shutdown
      - "UNKNOWN": insufficient info; treat as expected value proxy

    bull:
      - if True, use bull multiple for "EXIT"
    """
    ra = ROUND_ASSUMPTIONS.get(round_name, ROUND_ASSUMPTIONS[DEFAULT_ROUND])

    if outcome == "EXIT":
        multiple = ra.target_multiple_bull if bull else ra.target_multiple_base
        return multiple * ra.dilution_factor

    if outcome == "PARTIAL":
        return ra.partial_loss_multiple * ra.dilution_factor

    if outcome == "LOSS":
        return 0.0

    # UNKNOWN: expected-value proxy using loss_probability and base multiple
    exp_multiple = (1.0 - ra.loss_probability) * ra.target_multiple_base
    return exp_multiple * ra.dilution_factor

# ----------------------------
# 2-6. Quick sanity print
# ----------------------------
print(f"‚úÖ Scenario: {scenario_name} ({target_currency})")
for k, v in ROUND_ASSUMPTIONS.items():
    print(f" - {k}: check=${v.check_size_usd:,.0f}, base={v.target_multiple_base}x, "
          f"bull={v.target_multiple_bull}x, loss_p={v.loss_probability:.2f}, "
          f"dilution={v.dilution_factor:.2f}")
print(f"‚úÖ Default round fallback: {DEFAULT_ROUND}")


‚úÖ Scenario: baseline_v1 (USD)
 - Pre-Seed: check=$250,000, base=8.0x, bull=25.0x, loss_p=0.65, dilution=0.85
 - Seed: check=$500,000, base=6.0x, bull=18.0x, loss_p=0.55, dilution=0.88
 - Series A: check=$1,500,000, base=4.0x, bull=10.0x, loss_p=0.40, dilution=0.92
 - Series B+: check=$3,000,000, base=2.5x, bull=6.0x, loss_p=0.25, dilution=0.95
‚úÖ Default round fallback: Seed


In [9]:
# ============================================================
# Cell 3 : Build candidate startup universe via CSE (VC √ó year ‚Üí startup candidates)
#          (Interactive widgets + progress logging)
# ============================================================

# ----------------------------
# 3-0. Imports (cell-local)
# ----------------------------
import pandas as pd
import ipywidgets as widgets
from ipywidgets import Output
from IPython.display import display, clear_output

# ----------------------------
# 3-1. Widgets: select VC and target year
# ----------------------------
# Edit VC_OPTIONS as you expand coverage.
VC_OPTIONS = [
    "B Capital",
    "Sequoia Capital",
    "Andreessen Horowitz",
    "Accel",
    "Lightspeed Venture Partners",
]

YEAR_OPTIONS = list(range(2015, 2027))  # adjust as needed

vc_dropdown = widgets.Dropdown(
    options=VC_OPTIONS,
    value=VC_OPTIONS[0],
    description="VC:",
    layout=widgets.Layout(width="420px"),
)

year_dropdown = widgets.Dropdown(
    options=YEAR_OPTIONS,
    value=2023,
    description="Year:",
    layout=widgets.Layout(width="220px"),
)

max_pages_slider = widgets.IntSlider(
    value=3,
    min=1,
    max=10,
    step=1,
    description="Pages:",
    tooltip="Each page is up to 10 results; higher pages = more results + more API usage.",
    layout=widgets.Layout(width="520px"),
)

run_button = widgets.Button(
    description="Run CSE Search",
    button_style="primary",
    icon="search",
)

progress_out = Output()

display(widgets.HBox([vc_dropdown, year_dropdown]))
display(max_pages_slider)
display(run_button)
display(progress_out)

# ----------------------------
# 3-2. CSE query templates
# ----------------------------
# Keep these broad; we will clean / rank / verify downstream.
CSE_QUERY_TEMPLATES = [
    '"{VC}" {YEAR} invests in startup',
    '"{VC}" {YEAR} backed by',
    '"{VC}" {YEAR} seed round',
    '"{VC}" {YEAR} series a',
    '"{VC}" portfolio',
    '"{VC}" investment announcement {YEAR}',
]

# Optional domain targeting using CSE operators (site:).
# Keep empty to search broadly.
DOMAIN_HINTS = [
    # 'site:prnewswire.com',
    # 'site:businesswire.com',
    # 'site:techcrunch.com',
]

# ----------------------------
# 3-3. CSE fetch helpers
# ----------------------------
def cse_search(query: str, start_index: int = 1) -> dict:
    """
    Execute a single Google CSE request.
    start_index: 1, 11, 21, ... (Google CSE pagination uses 1-based indexing)
    """
    params = {
        "key": cse_cfg.api_key,
        "cx": cse_cfg.cx,
        "q": query,
        "num": cse_cfg.num,      # max 10
        "start": start_index,
        "safe": cse_cfg.safe,
    }
    r = session.get(cse_cfg.endpoint, params=params, timeout=scrape_cfg.timeout_sec)
    r.raise_for_status()
    return r.json()


def extract_rows_from_items(items: list) -> list:
    """Convert raw CSE items into normalized rows."""
    rows = []
    for it in items or []:
        rows.append({
            "title": it.get("title"),
            "snippet": it.get("snippet"),
            "link": normalize_url(it.get("link", "")),
            "displayLink": it.get("displayLink"),
        })
    return rows

# ----------------------------
# 3-4. Startup name heuristic (naive; refined later)
# ----------------------------
COMMON_TITLE_SPLITS = [" - ", " | ", " ‚Äî ", " ‚Äì "]

def guess_startup_name_from_title(title: str):
    """
    Naive heuristic to extract a likely entity name from a CSE title.
    (We will later improve this using dedupe + evidence-based verification.)
    """
    if not title:
        return None
    t = title.strip()

    for sep in COMMON_TITLE_SPLITS:
        if sep in t:
            t = t.split(sep)[0].strip()
            break

    t = re.sub(r"^(press release:|pr:|news:)\s*", "", t, flags=re.IGNORECASE)

    if len(t) < 2 or len(t) > 80:
        return None
    return t

# ----------------------------
# 3-5. Orchestrator: VC √ó year ‚Üí candidate universe
# ----------------------------
def build_candidate_universe(
    vc_name: str,
    year: int,
    max_pages: int = 3,
    progress_cb=None,
) -> pd.DataFrame:
    """
    Runs multiple query templates and collects a raw candidate universe
    for (vc_name, year). Returns a deduped DataFrame keyed by URL.

    progress_cb: callable(str) -> None
      Used to stream progress logs into the widget output.
    """
    all_rows = []

    # Determine total steps for friendly progress messages
    queries_to_run = []
    for tmpl in CSE_QUERY_TEMPLATES:
        base_q = tmpl.format(VC=vc_name, YEAR=year)
        if DOMAIN_HINTS:
            queries_to_run.extend([f"{dh} {base_q}" for dh in DOMAIN_HINTS])
        else:
            queries_to_run.append(base_q)

    total_steps = len(queries_to_run) * max_pages
    step = 0

    for q in queries_to_run:
        for page in range(max_pages):
            step += 1
            start_index = 1 + page * cse_cfg.num  # 1, 11, 21, ...

            if progress_cb:
                progress_cb(f"[{step}/{total_steps}] Query='{q}' | page={page+1}/{max_pages}")

            try:
                data = cse_search(q, start_index=start_index)
                items = data.get("items", [])
                all_rows.extend(extract_rows_from_items(items))

                if progress_cb:
                    progress_cb(f"    ‚Ü≥ items fetched: {len(items)} (start={start_index})")

                sleep_with_jitter(scrape_cfg.min_delay_sec, scrape_cfg.max_delay_sec)

            except Exception as e:
                if progress_cb:
                    progress_cb(f"‚ö†Ô∏è CSE request failed (start={start_index}): {e}")
                continue

    df = pd.DataFrame(all_rows)
    if df.empty:
        return df

    # Add metadata
    df["vc"] = vc_name
    df["year"] = year
    df["startup_guess"] = df["title"].apply(guess_startup_name_from_title)

    # Deduplicate by URL
    df = df.dropna(subset=["link"]).drop_duplicates(subset=["link"]).reset_index(drop=True)
    return df

# ----------------------------
# 3-6. Button handler: confirmation + progress streaming
# ----------------------------
candidate_df = pd.DataFrame()

def on_click_run(_):
    global candidate_df

    vc_name = vc_dropdown.value
    year = int(year_dropdown.value)
    max_pages = int(max_pages_slider.value)

    # Reset output area + show confirmation
    with progress_out:
        clear_output(wait=True)
        print("=" * 70)
        print("üîç Starting investigation with the following parameters:")
        print(f"  ‚Ä¢ VC        : {vc_name}")
        print(f"  ‚Ä¢ Year      : {year}")
        print(f"  ‚Ä¢ Max pages : {max_pages} (‚âà {max_pages * 10} results per query template)")
        print("=" * 70)

    # Stream progress messages into the same output widget
    def progress_cb(msg: str):
        with progress_out:
            print(msg)

    # Run
    candidate_df = build_candidate_universe(
        vc_name=vc_name,
        year=year,
        max_pages=max_pages,
        progress_cb=progress_cb,
    )

    # Final summary + preview
    with progress_out:
        print("-" * 70)
        print(f"‚úÖ Completed. Raw candidate URLs collected: {len(candidate_df):,}")
        if not candidate_df.empty:
            display(candidate_df.head(20))

run_button.on_click(on_click_run)


HBox(children=(Dropdown(description='VC:', layout=Layout(width='420px'), options=('B Capital', 'Sequoia Capita‚Ä¶

IntSlider(value=3, description='Pages:', layout=Layout(width='520px'), max=10, min=1, tooltip='Each page is up‚Ä¶

Button(button_style='primary', description='Run CSE Search', icon='search', style=ButtonStyle())

Output()

In [10]:
# ============================================================
# Cell 4 : Rank/clean candidates (dedupe, normalize names, relevance scoring)
# ============================================================

# This cell takes the raw CSE results (candidate_df) and produces a cleaner,
# more "startup-like" candidate list by:
# - normalizing company names
# - removing obvious non-startup / low-quality results
# - deduplicating by (normalized_name) and (domain)
# - scoring relevance to the selected (VC, year)
#
# Output:
#   ranked_candidates_df  (one row per candidate startup, with top evidence URLs)

import math
from urllib.parse import urlparse

# ----------------------------
# 4-0. Guardrails
# ----------------------------
if "candidate_df" not in globals() or candidate_df is None or candidate_df.empty:
    raise ValueError("candidate_df is empty. Run Cell 3 first to collect CSE results.")

# ----------------------------
# 4-1. Normalization helpers
# ----------------------------
STOPWORDS_COMPANY = {
    "inc", "inc.", "llc", "l.l.c", "ltd", "ltd.", "limited",
    "corp", "corp.", "corporation", "co", "co.", "company",
    "plc", "gmbh", "s.a.", "sa", "ag", "bv", "kk", "k.k.",
}

GENERIC_BAD_NAMES = {
    "home", "about", "careers", "blog", "news", "press", "press release",
    "portfolio", "investments", "companies", "announcements",
}

def normalize_startup_name(name: str) -> str:
    """
    Normalize a startup/company name for dedupe purposes.
    This is a heuristic; do not treat it as canonical identity resolution.
    """
    if not name:
        return ""

    s = name.strip().lower()

    # Remove punctuation except spaces
    s = re.sub(r"[^a-z0-9\s]", " ", s)

    # Collapse whitespace
    s = re.sub(r"\s+", " ", s).strip()

    # Remove common company suffixes at the end
    parts = s.split()
    while parts and parts[-1] in STOPWORDS_COMPANY:
        parts = parts[:-1]

    s = " ".join(parts).strip()
    return s

def get_domain(url: str) -> str:
    try:
        return urlparse(url).netloc.lower().replace("www.", "")
    except Exception:
        return ""

# ----------------------------
# 4-2. Relevance scoring
# ----------------------------
# Simple additive scoring based on text signals:
# - mentions VC name
# - mentions the target year
# - includes funding keywords
# - penalize obvious noise (jobs/careers, directory pages, social links)
FUNDING_KEYWORDS = [
    "raises", "raised", "funding", "financing", "investment", "invests", "backed by",
    "seed", "series a", "series b", "series c", "round", "led by", "co-led", "participated",
    "announces", "announcement",
]

NOISE_KEYWORDS = [
    "careers", "jobs", "hiring", "apply", "glassdoor",
    "wikipedia", "crunchbase", "linkedin", "facebook", "twitter", "x.com",
    "directory", "list of", "top", "rankings",
]

def compute_relevance_score(title: str, snippet: str, vc_name: str, year: int) -> float:
    text = f"{title or ''} {snippet or ''}".lower()
    score = 0.0

    # VC mention
    if vc_name.lower() in text:
        score += 3.0

    # Year mention
    if str(year) in text:
        score += 1.5

    # Funding signals
    for kw in FUNDING_KEYWORDS:
        if kw in text:
            score += 0.6

    # Noise penalties
    for kw in NOISE_KEYWORDS:
        if kw in text:
            score -= 1.0

    return score

# ----------------------------
# 4-3. Build a clean candidate table
# ----------------------------
vc_selected = candidate_df["vc"].iloc[0]
year_selected = int(candidate_df["year"].iloc[0])

work_df = candidate_df.copy()
work_df["domain"] = work_df["link"].apply(get_domain)
work_df["startup_guess"] = work_df["startup_guess"].fillna("")
work_df["startup_norm"] = work_df["startup_guess"].apply(normalize_startup_name)

# Drop rows with no inferred name
work_df = work_df[work_df["startup_norm"] != ""].copy()

# Drop obvious generic names
work_df = work_df[~work_df["startup_norm"].isin(GENERIC_BAD_NAMES)].copy()

# Drop blocked domains (from scrape_cfg) if present
if getattr(scrape_cfg, "blocked_domains", None):
    blocked = set([d.lower().replace("www.", "") for d in (scrape_cfg.blocked_domains or [])])
    work_df = work_df[~work_df["domain"].isin(blocked)].copy()

# Score relevance
work_df["relevance_score"] = work_df.apply(
    lambda r: compute_relevance_score(r.get("title"), r.get("snippet"), vc_selected, year_selected),
    axis=1
)

# ----------------------------
# 4-4. Deduplicate + keep best evidence URL per candidate
# ----------------------------
# Strategy:
# 1) For each normalized startup name, keep the highest scoring row as "best evidence"
# 2) Also keep a few additional evidence URLs per name (top_k)
TOP_EVIDENCE_PER_NAME = 3

work_df = work_df.sort_values(["startup_norm", "relevance_score"], ascending=[True, False])

grouped = []
for name, g in work_df.groupby("startup_norm", sort=False):
    g2 = g.sort_values("relevance_score", ascending=False).head(TOP_EVIDENCE_PER_NAME).copy()
    best = g2.iloc[0].to_dict()

    best["evidence_urls"] = list(g2["link"].values)
    best["evidence_titles"] = list(g2["title"].fillna("").values)
    best["evidence_snippets"] = list(g2["snippet"].fillna("").values)
    best["evidence_domains"] = list(g2["domain"].fillna("").values)
    best["evidence_count"] = len(g2)

    grouped.append(best)

ranked_candidates_df = pd.DataFrame(grouped)

# Final ranking across candidates
ranked_candidates_df = ranked_candidates_df.sort_values(
    "relevance_score", ascending=False
).reset_index(drop=True)

# ----------------------------
# 4-5. Output preview
# ----------------------------
print(f"‚úÖ Ranked candidate startups for VC='{vc_selected}', Year={year_selected}")
print(f" - Raw CSE rows          : {len(candidate_df):,}")
print(f" - Unique candidates     : {len(ranked_candidates_df):,}")
print(f" - Evidence per candidate: top {TOP_EVIDENCE_PER_NAME}")

display_cols = [
    "startup_guess", "startup_norm", "relevance_score",
    "domain", "link", "title"
]
display(ranked_candidates_df[display_cols].head(30))


‚úÖ Ranked candidate startups for VC='Sequoia Capital', Year=2015
 - Raw CSE rows          : 136
 - Unique candidates     : 123
 - Evidence per candidate: top 3


Unnamed: 0,startup_guess,startup_norm,relevance_score,domain,link,title
0,Sensors Data Completes Series C Financing of U...,sensors data completes series c financing of u...,8.1,warburgpincus.com,https://warburgpincus.com/wp-content/uploads/2...,Sensors Data Completes Series C Financing of U...
1,Clutter Raises $20M Series B Funding from Sequ...,clutter raises 20m series b funding from sequo...,7.5,prnewswire.com,https://www.prnewswire.com/news-releases/clutt...,Clutter Raises $20M Series B Funding from Sequ...
2,How Much Did Lemonade Raise? Funding & Key Inv...,how much did lemonade raise funding key investors,7.5,texau.com,https://www.texau.com/profiles/lemonade,How Much Did Lemonade Raise? Funding & Key Inv...
3,Canva Pitch Deck to Raise Seed Round Capital I...,canva pitch deck to raise seed round capital i...,7.5,alexanderjarvis.com,https://www.alexanderjarvis.com/canva-pitch-de...,Canva Pitch Deck to Raise Seed Round Capital I...
4,Sequoia Invests $13 Million In A Seed Round Fo...,sequoia invests 13 million in a seed round for...,6.9,techcrunch.com,https://techcrunch.com/2015/12/08/sequoia-inve...,Sequoia Invests $13 Million In A Seed Round Fo...
5,Ascend pitch deck to raise $15m series-a round,ascend pitch deck to raise 15m series a round,6.9,alexanderjarvis.com,https://www.alexanderjarvis.com/ascend-pitch-d...,Ascend pitch deck to raise $15m series-a round
6,GitHub Raises $250M Series B Round Led By Sequ...,github raises 250m series b round led by sequo...,6.9,techcrunch.com,https://techcrunch.com/2015/07/29/github-raise...,GitHub Raises $250M Series B Round Led By Sequ...
7,EdiGene Raises Approximately USD 67 Million in...,edigene raises approximately usd 67 million in...,6.9,firstwordpharma.com,https://firstwordpharma.com/story/5126421,EdiGene Raises Approximately USD 67 Million in...
8,What is Brief History of Lemonade Company?,what is brief history of lemonade,6.9,matrixbcg.com,https://matrixbcg.com/blogs/brief-history/lemo...,What is Brief History of Lemonade Company? ‚Äì M...
9,How Lemonade's founders raised a massive seed ...,how lemonade s founders raised a massive seed ...,6.9,businessinsider.com,https://www.businessinsider.com/lemonade-danie...,How Lemonade's founders raised a massive seed ...


In [11]:
# ============================================================
# Cell 5 : Robots.txt quick check + source selection (PR/news/company pages)
#          (OpenAI-assisted classification + relevance scoring)
# ============================================================

from urllib.parse import urlparse, urljoin
import urllib.robotparser as robotparser

# ----------------------------
# 5-0. Guardrails
# ----------------------------
if "ranked_candidates_df" not in globals() or ranked_candidates_df is None or ranked_candidates_df.empty:
    raise ValueError("ranked_candidates_df is empty. Run Cell 4 first.")

if openai_client is None:
    raise ValueError("openai_client is not initialized. Set OPENAI_API_KEY in env.txt and rerun Cell 1.")

# ----------------------------
# 5-1. Robots.txt checker (domain-level cache)
# ----------------------------
_robots_cache = {}

def robots_allowed(url: str, user_agent: str):
    """
    Lightweight robots check using urllib.robotparser.
    Returns:
      True  -> allowed
      False -> disallowed
      None  -> could not be determined (robots unavailable / error)
    """
    try:
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            return None

        base = f"{parsed.scheme}://{parsed.netloc}"
        robots_url = urljoin(base, "/robots.txt")

        if robots_url not in _robots_cache:
            rp = robotparser.RobotFileParser()
            rp.set_url(robots_url)
            rp.read()
            _robots_cache[robots_url] = rp

        rp = _robots_cache[robots_url]
        return rp.can_fetch(user_agent, url)
    except Exception:
        return None

# ----------------------------
# 5-2. Expand evidence URLs into a flat table
# ----------------------------
rows = []
for _, r in ranked_candidates_df.iterrows():
    startup = r.get("startup_guess") or r.get("startup_norm")
    startup_norm = r.get("startup_norm")
    vc = r.get("vc")
    year = int(r.get("year"))

    evidence_urls = r.get("evidence_urls") or []
    for u in evidence_urls:
        u = normalize_url(u)
        if not u:
            continue
        rows.append({
            "vc": vc,
            "year": year,
            "startup": startup,
            "startup_norm": startup_norm,
            "url": u,
            "domain": get_domain(u),
            "title": None,   # (optional) fill later if you want
            "snippet": None, # (optional) fill later if you want
            "seed_relevance_score": float(r.get("relevance_score", 0.0)),
        })

sources_df = pd.DataFrame(rows)
if sources_df.empty:
    raise ValueError("No evidence URLs found to check. (sources_df is empty)")

sources_df = sources_df.drop_duplicates(subset=["startup_norm", "url"]).reset_index(drop=True)

# ----------------------------
# 5-3. Apply allow/block filters (optional)
# ----------------------------
if scrape_cfg.allowed_domains:
    allow = set([d.lower().replace("www.", "") for d in scrape_cfg.allowed_domains])
    sources_df = sources_df[sources_df["domain"].isin(allow)].copy()

if scrape_cfg.blocked_domains:
    block = set([d.lower().replace("www.", "") for d in scrape_cfg.blocked_domains])
    sources_df = sources_df[~sources_df["domain"].isin(block)].copy()

sources_df = sources_df.reset_index(drop=True)

# ----------------------------
# 5-4. Robots.txt quick check (optional but recommended)
# ----------------------------
if scrape_cfg.respect_robots:
    print("ü§ñ Running robots.txt checks (domain-level cached)...")
    ua = scrape_cfg.user_agent
    sources_df["robots_allowed"] = sources_df["url"].apply(lambda u: robots_allowed(u, ua))
else:
    print("‚ÑπÔ∏è robots.txt checks are disabled (scrape_cfg.respect_robots=False).")
    sources_df["robots_allowed"] = None

sources_df["robots_blocked"] = sources_df["robots_allowed"].apply(lambda x: True if x is False else False)

# ----------------------------
# 5-5. OpenAI-assisted URL classification
# ----------------------------
# We ask the model to judge:
# - source_type: PR / NEWS / COMPANY / OTHER / SKIP
# - relevance_to_investment: 0..100 (how likely this URL is useful to confirm VC investment in the target year)
# - should_scrape: boolean (whether we should try to fetch the page later)
# - rationale: short reason

import json

OPENAI_MODEL_FOR_JUDGING = "gpt-4.1-mini"  # change if you prefer
MAX_URLS_TO_JUDGE = 250                    # cost guard; adjust as needed

def judge_url_with_openai(vc: str, startup: str, year: int, url: str, domain: str):
    system = (
        "You are a strict web-source triage assistant for VC investment verification.\n"
        "Your job is to classify a URL and estimate how useful it is for confirming whether "
        "a specific VC invested in a specific startup in a specific year.\n"
        "Return ONLY valid JSON with the required keys."
    )

    user = {
        "task": "classify_url_for_vc_investment_verification",
        "vc": vc,
        "startup": startup,
        "year": year,
        "url": url,
        "domain": domain,
        "rules": {
            "source_type_options": ["PR", "NEWS", "COMPANY", "OTHER", "SKIP"],
            "prefer": [
                "official press releases about funding rounds",
                "reputable news articles about the round",
                "company newsroom/blog post about funding"
            ],
            "deprioritize_or_skip": [
                "social media profiles",
                "directory pages (e.g., Crunchbase, Wikipedia, generic lists)",
                "jobs/careers pages",
                "login-walled pages if likely un-scrapable"
            ],
            "output_json_schema": {
                "source_type": "one of PR/NEWS/COMPANY/OTHER/SKIP",
                "relevance_to_investment": "integer 0-100",
                "should_scrape": "boolean",
                "rationale": "short string"
            }
        }
    }

    resp = openai_client.chat.completions.create(
        model=OPENAI_MODEL_FOR_JUDGING,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(user, ensure_ascii=False)},
        ],
        temperature=0,
    )

    text = resp.choices[0].message.content.strip()

    # Defensive parse
    try:
        obj = json.loads(text)
    except Exception:
        # fallback when model returns non-JSON (should be rare)
        obj = {
            "source_type": "OTHER",
            "relevance_to_investment": 0,
            "should_scrape": False,
            "rationale": "Failed to parse JSON output."
        }

    # Normalize
    if obj.get("source_type") not in {"PR", "NEWS", "COMPANY", "OTHER", "SKIP"}:
        obj["source_type"] = "OTHER"
    try:
        obj["relevance_to_investment"] = int(obj.get("relevance_to_investment", 0))
    except Exception:
        obj["relevance_to_investment"] = 0
    obj["relevance_to_investment"] = max(0, min(100, obj["relevance_to_investment"]))
    obj["should_scrape"] = bool(obj.get("should_scrape", False))
    obj["rationale"] = str(obj.get("rationale", ""))[:300]

    return obj

# Apply with a cost guard
judge_n = min(len(sources_df), MAX_URLS_TO_JUDGE)
print(f"üß† Judging URLs with OpenAI: {judge_n}/{len(sources_df)} (cap={MAX_URLS_TO_JUDGE})")

judged = []
for i, row in sources_df.head(judge_n).iterrows():
    if row["robots_blocked"]:
        judged.append({
            "source_type_llm": "SKIP",
            "relevance_llm": 0,
            "should_scrape_llm": False,
            "rationale_llm": "Blocked by robots.txt"
        })
        continue

    j = judge_url_with_openai(
        vc=row["vc"],
        startup=row["startup"],
        year=int(row["year"]),
        url=row["url"],
        domain=row["domain"],
    )
    judged.append({
        "source_type_llm": j["source_type"],
        "relevance_llm": j["relevance_to_investment"],
        "should_scrape_llm": j["should_scrape"],
        "rationale_llm": j["rationale"],
    })

judged_df = pd.DataFrame(judged)
sources_df = pd.concat([sources_df.head(judge_n).reset_index(drop=True), judged_df], axis=1)

# If there are more than cap, keep the rest as unjudged (optional)
if len(sources_df) < len(rows):
    # Keep only judged portion for downstream steps (recommended to avoid mixed states)
    pass

# ----------------------------
# 5-6. Source selection policy (prefer high LLM relevance + PR/NEWS)
# ----------------------------
MAX_SOURCES_PER_STARTUP = 5
SOURCE_TYPE_PRIORITY = {"PR": 0, "NEWS": 1, "COMPANY": 2, "OTHER": 3, "SKIP": 9}

sources_df["source_priority"] = sources_df["source_type_llm"].map(SOURCE_TYPE_PRIORITY).fillna(9).astype(int)

# Selection score: prioritize LLM relevance, then seed relevance, then PR/NEWS
sources_df["selection_score"] = (
    sources_df["relevance_llm"].fillna(0) * 10.0
    + sources_df["seed_relevance_score"].fillna(0)
    - sources_df["source_priority"] * 2.0
)

# Filter: should scrape + not SKIP
filtered = sources_df[
    (sources_df["should_scrape_llm"] == True) &
    (sources_df["source_type_llm"] != "SKIP")
].copy()

filtered = filtered.sort_values(
    ["startup_norm", "selection_score"],
    ascending=[True, False]
)

selected_rows = []
for name, g in filtered.groupby("startup_norm", sort=False):
    selected_rows.append(g.head(MAX_SOURCES_PER_STARTUP))

selected_sources_df = pd.concat(selected_rows, ignore_index=True) if selected_rows else filtered

# ----------------------------
# 5-7. Output
# ----------------------------
print("‚úÖ OpenAI-assisted source selection completed.")
print(f" - Candidate URLs judged: {len(sources_df):,}")
print(f" - Selected sources      : {len(selected_sources_df):,}")
print(f" - Robots respected      : {scrape_cfg.respect_robots}")

display_cols = [
    "startup", "startup_norm", "domain",
    "source_type_llm", "relevance_llm", "should_scrape_llm",
    "robots_allowed", "url", "rationale_llm"
]
display(selected_sources_df[display_cols].head(60))


ü§ñ Running robots.txt checks (domain-level cached)...
üß† Judging URLs with OpenAI: 125/125 (cap=250)
‚úÖ OpenAI-assisted source selection completed.
 - Candidate URLs judged: 125
 - Selected sources      : 52
 - Robots respected      : True


Unnamed: 0,startup,startup_norm,domain,source_type_llm,relevance_llm,should_scrape_llm,robots_allowed,url,rationale_llm
0,"10 Years, 60 Startups, $105 Million",10 years 60 startups 105 million,today.duke.edu,NEWS,70,True,True,https://today.duke.edu/2025/03/10-years-60-sta...,Reputable university news site covering fundin...
1,2015: A Transformative Year for LendInvest,2015 a transformative year for lendinvest,lendinvest.com,COMPANY,60,True,True,https://www.lendinvest.com/blog/2015-a-transfo...,Company blog post likely discusses key events ...
2,$950M Sequoia Capital Fund Aims To Back Next A...,950m sequoia capital fund aims to back next am...,finance.yahoo.com,NEWS,40,True,True,https://finance.yahoo.com/news/950m-sequoia-ca...,Article discusses a Sequoia Capital fund but d...
3,A history lesson with Tom McMurray ¬∑ Collabora...,a history lesson with tom mcmurray collaborati...,collabfund.com,COMPANY,30,True,True,https://collabfund.com/blog/a-history-lesson-w...,Company blog post about a former Sequoia Capit...
4,A review of the Indian private equity sector a...,a review of the indian private equity sector a...,assets.kpmg.com,OTHER,20,True,True,https://assets.kpmg.com/content/dam/kpmg/in/pd...,The PDF is a sector review report from KPMG co...
5,Ascend pitch deck to raise $15m series-a round,ascend pitch deck to raise 15m series a round,alexanderjarvis.com,OTHER,40,True,True,https://www.alexanderjarvis.com/ascend-pitch-d...,The URL appears to be a blog post about the As...
6,Atlas,atlas,sequoiacap.com,PR,85,True,True,https://www.sequoiacap.com/wp-content/uploads/...,Official Sequoia Capital PDF likely detailing ...
7,"Autodesk Acquires Google, Sequoia-Backed PlanGrid",autodesk acquires google sequoia backed plangrid,businessinsider.com,NEWS,40,True,True,https://www.businessinsider.com/autodesk-acqui...,Reputable news article about acquisition menti...
8,Bubble talk,bubble talk,blog.samaltman.com,COMPANY,40,True,True,https://blog.samaltman.com/bubble-talk,Company blog post may mention funding but is n...
9,Carbon3D Closes $100 Million Series C Investme...,carbon3d closes 100 million series c investmen...,carbon3d.com,PR,80,True,True,https://www.carbon3d.com/news/press-releases/c...,Official press release on company website abou...


In [18]:
# ============================================================
# Cell 6 : Investment verification (VC √ó startup √ó year) and evidence capture
#          (Fetch selected sources + OpenAI-assisted verification)
# ============================================================

# This cell verifies whether:
#   "VC invested in Startup around the target year"
# using selected URLs from Cell 5, and captures structured evidence:
# - startup_name_on_page
# - announcement_date / announcement_year
# - main_investors
# - investment_type (initial / follow_on / unknown)
#
# Output:
#   investment_evidence_df : row-level evidence per URL
#   confirmed_df           : high-confidence confirmed rows
#   investment_events_df   : aggregated events (one row per VC√óstartup_norm√ótarget_year)

import re
import html
import json
import pandas as pd
from bs4 import BeautifulSoup  # pip install beautifulsoup4 if missing

# ----------------------------
# 6-0. Guardrails
# ----------------------------
if "selected_sources_df" not in globals() or selected_sources_df is None or selected_sources_df.empty:
    raise ValueError("selected_sources_df is empty. Run Cell 5 first.")

if openai_client is None:
    raise ValueError("openai_client is not initialized. Set OPENAI_API_KEY in env.txt and rerun Cell 1.")

# ----------------------------
# 6-1. Fetch + text extraction
# ----------------------------
def fetch_url_text(url: str) -> dict:
    """
    Fetch a URL and return a dict with status + extracted text (truncated).
    Applies size limits and basic content-type checks.
    """
    try:
        r = session.get(url, timeout=scrape_cfg.timeout_sec, stream=True)
        content_type = (r.headers.get("Content-Type") or "").lower()

        if ("text/html" not in content_type) and ("text/plain" not in content_type):
            return {
                "ok": False,
                "url": url,
                "status": r.status_code,
                "error": f"Unsupported content-type: {content_type}",
            }

        raw = b""
        for chunk in r.iter_content(chunk_size=64_000):
            raw += chunk
            if len(raw) > scrape_cfg.max_bytes:
                return {
                    "ok": False,
                    "url": url,
                    "status": r.status_code,
                    "error": "Response exceeded max_bytes cap",
                }

        text = raw.decode(r.encoding or "utf-8", errors="ignore")

        if "text/html" in content_type:
            soup = BeautifulSoup(text, "html.parser")
            for tag in soup(["script", "style", "noscript"]):
                tag.decompose()
            visible_text = soup.get_text(separator="\n")
        else:
            visible_text = text

        visible_text = html.unescape(visible_text)
        visible_text = re.sub(r"\n{3,}", "\n\n", visible_text)
        visible_text = re.sub(r"[ \t]{2,}", " ", visible_text).strip()

        MAX_CHARS = 14_000  # keep token costs bounded
        if len(visible_text) > MAX_CHARS:
            visible_text = visible_text[:MAX_CHARS]

        return {"ok": True, "url": url, "status": r.status_code, "text": visible_text}

    except Exception as e:
        return {"ok": False, "url": url, "status": None, "error": str(e)}

# ----------------------------
# 6-2. OpenAI verification prompt
# ----------------------------
OPENAI_MODEL_FOR_VERIFICATION = "gpt-4.1-mini"  # adjust if desired

def verify_investment_with_openai(
    vc: str,
    startup_guess: str,
    year: int,
    url: str,
    page_text: str
) -> dict:
    """
    Determine whether the content supports:
      'VC invested in Startup in (or around) target Year'
    and extract announcement date + investors if available.
    Returns structured JSON.
    """
    system = (
        "You are a strict fact-checking analyst for venture capital investment verification.\n"
        "Given a URL and extracted page text, determine whether it supports the claim:\n"
        "  'VC invested in Startup in the target year'.\n"
        "Be conservative: if evidence is weak or ambiguous, mark as not confirmed.\n"
        "Return ONLY valid JSON."
    )

    user = {
        "task": "verify_vc_investment_and_extract_evidence",
        "claim": {
            "vc": vc,
            "startup_guess": startup_guess,
            "target_year": year
        },
        "url": url,
        "page_text_excerpt": page_text,
        "output_json_schema": {
            "investment_confirmed": "boolean",
            "confidence": "number 0-1",
            "startup_name_on_page": "string or null",
            "announcement_date": "string or null (ISO preferred: YYYY-MM-DD; else raw date text)",
            "announcement_year": "integer or null",
            "round": "string or null (e.g., Pre-Seed/Seed/Series A/Series B+)",
            "amount": "string or null (raw text if found)",
            "main_investors": "array of strings (lead + notable participants, if disclosed)",
            "investment_type": "string: one of ['initial', 'follow_on', 'unknown']",
            "vc_role": "string or null (e.g., 'lead', 'co-lead', 'participant', 'unspecified')",
            "evidence_quotes": "array of short quotes (<= 20 words each)",
            "rationale": "short string"
        },
        "rules": [
            "Confirm only if the text explicitly indicates VC participation (e.g., 'led by', 'participated', 'backed by')",
            "If the page clearly names the startup, set startup_name_on_page",
            "Extract the announcement date if the page provides it (press release date or stated date)",
            "announcement_year should be derived from the announcement_date if possible",
            "If investors are listed, extract a concise list of main_investors (avoid very long lists)",
            "Keep quotes short and directly supporting the VC participation and/or the round/date",
            "If this appears to be the VC's first investment in the startup, set investment_type='initial'",
            "If the text suggests a later participation (e.g., 'also participated', 'follow-on round', 'existing investor'), set investment_type='follow_on'",
            "If unclear, set investment_type='unknown'"
        ]
    }

    resp = openai_client.chat.completions.create(
        model=OPENAI_MODEL_FOR_VERIFICATION,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(user, ensure_ascii=False)},
        ],
        temperature=0,
    )

    text = resp.choices[0].message.content.strip()

    # Defensive parse
    try:
        obj = json.loads(text)
    except Exception:
        obj = {
            "investment_confirmed": False,
            "confidence": 0.0,
            "startup_name_on_page": None,
            "announcement_date": None,
            "announcement_year": None,
            "round": None,
            "amount": None,
            "main_investors": [],
            "investment_type": "unknown",
            "vc_role": None,
            "evidence_quotes": [],
            "rationale": "Failed to parse JSON output."
        }

    # Normalize booleans / numerics
    obj["investment_confirmed"] = bool(obj.get("investment_confirmed", False))

    try:
        obj["confidence"] = float(obj.get("confidence", 0.0))
    except Exception:
        obj["confidence"] = 0.0
    obj["confidence"] = max(0.0, min(1.0, obj["confidence"]))

    # Normalize strings
    obj["startup_name_on_page"] = obj.get("startup_name_on_page") or None
    obj["announcement_date"] = obj.get("announcement_date") or None

    # Normalize year
    ann_year = obj.get("announcement_year", None)
    if ann_year is not None:
        try:
            obj["announcement_year"] = int(ann_year)
        except Exception:
            obj["announcement_year"] = None
    else:
        obj["announcement_year"] = None

    # If announcement_year missing but date present, try deriving from date text
    if obj["announcement_year"] is None and obj["announcement_date"]:
        m = re.search(r"(\d{4})", obj["announcement_date"])
        if m:
            obj["announcement_year"] = int(m.group(1))

    obj["round"] = obj.get("round") or None
    obj["amount"] = obj.get("amount") or None
    obj["vc_role"] = obj.get("vc_role") or None

    # Normalize investment_type
    inv_type = obj.get("investment_type", "unknown")
    if inv_type not in {"initial", "follow_on", "unknown"}:
        inv_type = "unknown"
    obj["investment_type"] = inv_type

    # Normalize arrays
    obj["main_investors"] = obj.get("main_investors") or []
    if not isinstance(obj["main_investors"], list):
        obj["main_investors"] = []

    obj["evidence_quotes"] = obj.get("evidence_quotes") or []
    if not isinstance(obj["evidence_quotes"], list):
        obj["evidence_quotes"] = []

    obj["rationale"] = str(obj.get("rationale", ""))[:500]
    return obj

# ----------------------------
# 6-3. Run verification over selected sources
# ----------------------------
MAX_PAGES_TO_FETCH = 80          # cost guard
MIN_CONFIDENCE_TO_KEEP = 0.55    # tune

to_process = selected_sources_df.copy()

# Respect robots where available
if "robots_blocked" in to_process.columns:
    to_process = to_process[to_process["robots_blocked"] == False].copy()

to_process = to_process.head(MAX_PAGES_TO_FETCH).reset_index(drop=True)

print(f"üìÑ Pages to fetch+verify: {len(to_process):,} (cap={MAX_PAGES_TO_FETCH})")

verification_rows = []

for i, row in to_process.iterrows():
    vc = row.get("vc")
    year = int(row.get("year"))
    startup_guess = row.get("startup")
    startup_norm = row.get("startup_norm")
    url = row.get("url")

    print(f"[{i+1}/{len(to_process)}] Fetching: {startup_guess} | {url}")

    fetched = fetch_url_text(url)
    if not fetched["ok"]:
        verification_rows.append({
            "vc": vc,
            "target_year": year,
            "startup_guess": startup_guess,
            "startup_norm": startup_norm,
            "url": url,
            "fetch_ok": False,
            "fetch_error": fetched.get("error"),
            "investment_confirmed": False,
            "confidence": 0.0,
            "startup_name_on_page": None,
            "announcement_date": None,
            "announcement_year": None,
            "round": None,
            "amount": None,
            "main_investors": [],
            "investment_type": "unknown",
            "vc_role": None,
            "evidence_quotes": [],
            "rationale": "Fetch failed"
        })
        continue

    verdict = verify_investment_with_openai(
        vc=vc,
        startup_guess=startup_guess,
        year=year,
        url=url,
        page_text=fetched["text"],
    )

    verification_rows.append({
        "vc": vc,
        "target_year": year,
        "startup_guess": startup_guess,
        "startup_norm": startup_norm,
        "url": url,
        "fetch_ok": True,
        "fetch_error": None,
        **verdict,
    })

    sleep_with_jitter(scrape_cfg.min_delay_sec, scrape_cfg.max_delay_sec)

investment_evidence_df = pd.DataFrame(verification_rows)

# ----------------------------
# 6-4. Keep high-confidence confirmed evidence
# ----------------------------
confirmed_df = investment_evidence_df[
    (investment_evidence_df["investment_confirmed"] == True) &
    (investment_evidence_df["confidence"] >= MIN_CONFIDENCE_TO_KEEP)
].copy()

print("‚úÖ Verification completed.")
print(f" - Evidence rows evaluated                 : {len(investment_evidence_df):,}")
print(f" - Confirmed (investment_confirmed=True)   : {(investment_evidence_df['investment_confirmed'] == True).sum():,}")
print(f" - Confirmed (conf‚â•{MIN_CONFIDENCE_TO_KEEP})         : {len(confirmed_df):,}")

display_cols = [
    "vc", "startup_name_on_page", "startup_guess",
    "investment_type", "round", "amount",
    "announcement_date", "announcement_year",
    "vc_role", "confidence", "url"
]
display(confirmed_df[display_cols].sort_values("confidence", ascending=False).head(50))

# ----------------------------
# 6-5. Aggregate to one investment event per (vc, startup_norm, target_year)
# ----------------------------
# First pass: take the highest-confidence evidence per tuple.
if confirmed_df.empty:
    investment_events_df = pd.DataFrame()
else:
    agg_rows = []
    for (vc, startup_norm, target_year), g in confirmed_df.groupby(["vc", "startup_norm", "target_year"]):
        g = g.sort_values("confidence", ascending=False)
        top = g.iloc[0]

        agg_rows.append({
            "vc": vc,
            "target_year": int(target_year),

            # Keep both: guessed name + extracted "name on page"
            "startup_norm": startup_norm,
            "startup_guess": top.get("startup_guess"),
            "startup_name_on_page": top.get("startup_name_on_page"),

            # Key extracted fields
            "announcement_date": top.get("announcement_date"),
            "announcement_year": top.get("announcement_year"),
            "round": top.get("round"),
            "amount": top.get("amount"),
            "investment_type": top.get("investment_type", "unknown"),
            "vc_role": top.get("vc_role"),
            "main_investors": top.get("main_investors"),

            # Evidence + traceability
            "confidence": float(top.get("confidence", 0.0)),
            "evidence_url": top.get("url"),
            "evidence_quotes": top.get("evidence_quotes"),
            "notes": top.get("rationale"),
        })

    investment_events_df = pd.DataFrame(agg_rows).reset_index(drop=True)

print(f"üìå Investment events (aggregated): {len(investment_events_df):,}")
display(investment_events_df.head(30))


üìÑ Pages to fetch+verify: 52 (cap=80)
[1/52] Fetching: 10 Years, 60 Startups, $105 Million | https://today.duke.edu/2025/03/10-years-60-startups-105-million-and-counting-duke-capital-partners-leads-way-alumni
[2/52] Fetching: 2015: A Transformative Year for LendInvest | https://www.lendinvest.com/blog/2015-a-transformative-year-for-lendinvest/
[3/52] Fetching: $950M Sequoia Capital Fund Aims To Back Next Amazon Of AI Era ... | https://finance.yahoo.com/news/950m-sequoia-capital-fund-aims-143114265.html
[4/52] Fetching: A history lesson with Tom McMurray ¬∑ Collaborative Fund | https://collabfund.com/blog/a-history-lesson-with-collaborative-fund-investor-and-former-sequoia-capital-general-partner-tom-mcmurray/
[5/52] Fetching: A review of the Indian private equity sector and developments in 2016 | https://assets.kpmg.com/content/dam/kpmg/in/pdf/2017/02/Private-Equity-Report-review-2016.pdf
[6/52] Fetching: Ascend pitch deck to raise $15m series-a round | https://www.alexanderjarvis.co

Unnamed: 0,vc,startup_name_on_page,startup_guess,investment_type,round,amount,announcement_date,announcement_year,vc_role,confidence,url
18,Sequoia Capital,Percolate,How Much Did Percolate Raise? Funding & Key In...,follow_on,Series C,$40M,2015-05-15,2015.0,unspecified,1.0,https://www.texau.com/profiles/percolate
9,Sequoia Capital,Carbon3D,Carbon3D Closes $100 Million Series C Investme...,follow_on,Series C,$100 million,2015-08-20,2015.0,participant,1.0,https://www.carbon3d.com/news/press-releases/c...
48,Sequoia Capital,SpaceX,Top 5 Venture Capital Firms Backing SpaceX,initial,Series F,$1B,,2015.0,participant,1.0,https://spacexstock.com/venture-capital-firms-...
15,Sequoia Capital,GitHub,GitHub Raises $250M Series B Round Led By Sequ...,follow_on,Series B,$250 million,2015-07-29,2015.0,lead,1.0,https://techcrunch.com/2015/07/29/github-raise...
23,Sequoia Capital,MedGenome Labs Pvt Ltd,MedGenome Labs raises $20M in Series B funding...,initial,Series B,$20 million,2015-07-23,2015.0,lead,1.0,https://www.vccircle.com/medgenome-labs-raises...
24,Sequoia Capital,Noom,Noom zooms to $540m series F -,follow_on,Series F,$540m,2021-05-26,2021.0,participant,0.95,https://globalventuring.com/blog/2021/05/26/no...
28,Sequoia Capital,Dia&Co,Plus-Size Fashion Is a $100 Billion Opportunit...,follow_on,Series C,$40 million,2018-11-15,2018.0,participant,0.95,https://fortune.com/2018/11/15/dia-co-series-c...
27,Sequoia Capital,Remix,Partnering with Remix: The Opportunity in Vert...,initial,Series A,,2017-05-09,2017.0,lead,0.95,https://sequoiacap.com/article/remix-and-the-o...
7,Sequoia Capital,PlanGrid,"Autodesk Acquires Google, Sequoia-Backed PlanGrid",initial,Series B,$69 million,2018-11-20,2018.0,unspecified,0.95,https://www.businessinsider.com/autodesk-acqui...
17,Sequoia Capital,Lemonade,How Much Did Lemonade Raise? Funding & Key Inv...,initial,Seed,$13M,2015-12,2015.0,lead,0.95,https://www.texau.com/profiles/lemonade


üìå Investment events (aggregated): 22


Unnamed: 0,vc,target_year,startup_norm,startup_guess,startup_name_on_page,announcement_date,announcement_year,round,amount,investment_type,vc_role,main_investors,confidence,evidence_url,evidence_quotes,notes
0,Sequoia Capital,2015,ascend pitch deck to raise 15m series a round,Ascend pitch deck to raise $15m series-a round,Ascend,2015-12-15,2015.0,Seed,$4M,initial,participant,"[Lightspeed Venture Partners, Sequoia Capital]",0.9,https://www.alexanderjarvis.com/ascend-pitch-d...,[After the company raised a seed round in 2015...,The text explicitly states Sequoia Capital par...
1,Sequoia Capital,2015,autodesk acquires google sequoia backed plangrid,"Autodesk Acquires Google, Sequoia-Backed PlanGrid",PlanGrid,2018-11-20,2018.0,Series B,$69 million,initial,unspecified,"[GV (formerly Google Ventures), Sequoia Capita...",0.95,https://www.businessinsider.com/autodesk-acqui...,[PlanGrid was last valued at $419 million in a...,The text explicitly states Sequoia Capital bac...
2,Sequoia Capital,2015,carbon3d closes 100 million series c investmen...,Carbon3D Closes $100 Million Series C Investme...,Carbon3D,2015-08-20,2015.0,Series C,$100 million,follow_on,participant,"[Google Ventures, Sequoia Capital, Silver Lake...",1.0,https://www.carbon3d.com/news/press-releases/c...,[Carbon3D Closes $100 Million Series C Investm...,The text explicitly states Sequoia Capital par...
3,Sequoia Capital,2015,carbon3d introduces clip breakthrough technolo...,"Carbon3D introduces CLIP, breakthrough technol...",Carbon3D,2015-03-16,2015.0,Series A and Series B,$41 million,initial,lead,"[Sequoia Capital, Silver Lake Kraftwerk, North...",0.95,https://www.carbon3d.com/news/press-releases/c...,[partnered with Sequoia Capital to lead the co...,The text explicitly states Sequoia Capital led...
4,Sequoia Capital,2015,chillr secures 6m in series a from sequoia cap...,Chillr secures $6M in Series A from Sequoia Ca...,Chillr,2015-10-05,2015.0,Series A,$6 million,initial,unspecified,[Sequoia Capital],0.95,https://www.medianama.com/2015/10/223-chillr-r...,[Chillr has raised $6 million in a Series A ro...,The text explicitly states that Chillr raised ...
5,Sequoia Capital,2015,cohesity emerges from stealth mode with 70 mil...,Cohesity Emerges from Stealth Mode with $70 Mi...,Cohesity,2015-06-17,2015.0,Series B,$70 million,follow_on,participant,"[Sequoia Capital, Google Ventures, ARTIS Ventu...",0.95,https://www.cohesity.com/newsroom/press/cohesi...,"[Strong Backing From Sequoia Capital, Google V...",The text explicitly states Sequoia Capital led...
6,Sequoia Capital,2015,github raises 250m series b round led by sequo...,GitHub Raises $250M Series B Round Led By Sequ...,GitHub,2015-07-29,2015.0,Series B,$250 million,follow_on,lead,"[Sequoia Capital, Andreessen Horowitz, Thrive ...",1.0,https://techcrunch.com/2015/07/29/github-raise...,[GitHub ... announced that it has raised a $25...,The text explicitly states Sequoia Capital led...
7,Sequoia Capital,2015,how lemonade s founders raised a massive seed ...,How Lemonade's founders raised a massive seed ...,Lemonade,2015-12,2015.0,Seed,$13 million,initial,lead,"[Sequoia Capital, Aleph]",0.95,https://www.businessinsider.com/lemonade-danie...,[Lemonade landed in the public eye in December...,The text explicitly states Sequoia Capital led...
8,Sequoia Capital,2015,how much did lemonade raise funding key investors,How Much Did Lemonade Raise? Funding & Key Inv...,Lemonade,2015-12,2015.0,Seed,$13M,initial,lead,"[Aleph, Sequoia Capital Israel]",0.95,https://www.texau.com/profiles/lemonade,[a $13M seed round in 2015 led by Aleph and Se...,The text explicitly states that Sequoia Capita...
9,Sequoia Capital,2015,how much did percolate raise funding key inves...,How Much Did Percolate Raise? Funding & Key In...,Percolate,2015-05-15,2015.0,Series C,$40M,follow_on,unspecified,"[Lightspeed Venture Partners, First Round Capi...",1.0,https://www.texau.com/profiles/percolate,"[a $40 million Series C in 2015, backed by Lig...",The text explicitly states Sequoia Capital par...


In [19]:
# ============================================================
# Cell 7 : Construct the investment table (one row per inferred investment event)
#          + Apply strict filters:
#            - Drop rows where startup_name_on_page is missing/blank
#            - Drop rows where announcement_year is missing
#            - Drop rows where announcement_year != target_year
# ============================================================

# This cell converts the aggregated investment_events_df (from Cell 6)
# into a clean, analysis-ready investment table.
#
# Output:
#   investments_df  (one row per inferred investment event)

import pandas as pd

# ----------------------------
# 7-0. Guardrails
# ----------------------------
if "investment_events_df" not in globals() or investment_events_df is None or investment_events_df.empty:
    raise ValueError("investment_events_df is empty. Run Cell 6 first.")

inv = investment_events_df.copy()

# Ensure required columns exist (helps debugging if upstream schema changes)
required_cols = [
    "vc", "target_year", "startup_name_on_page", "startup_norm",
    "announcement_year", "announcement_date",
    "investment_type", "round", "amount",
    "vc_role", "main_investors",
    "confidence", "evidence_url", "evidence_quotes", "notes",
]
missing = [c for c in required_cols if c not in inv.columns]
if missing:
    raise ValueError(f"investment_events_df is missing required columns: {missing}")

# ----------------------------
# 7-1. Standardize / coerce types
# ----------------------------
inv["target_year"] = pd.to_numeric(inv["target_year"], errors="coerce")
inv["announcement_year"] = pd.to_numeric(inv["announcement_year"], errors="coerce")

# Normalize startup_name_on_page for filtering
inv["startup_name_on_page"] = inv["startup_name_on_page"].astype("string")
inv["startup_name_on_page_stripped"] = inv["startup_name_on_page"].fillna("").str.strip()

# Normalize investment_type (defensive)
inv["investment_type"] = inv["investment_type"].fillna("unknown")
inv.loc[~inv["investment_type"].isin(["initial", "follow_on", "unknown"]), "investment_type"] = "unknown"

# ----------------------------
# 7-2. Strict filters (requested)
# ----------------------------
before_n = len(inv)

# 1) Drop missing/blank startup_name_on_page
inv = inv[inv["startup_name_on_page_stripped"] != ""].copy()

# 2) Drop where announcement_year is missing or does not match target_year
inv = inv[inv["announcement_year"].notna() & inv["target_year"].notna()].copy()
inv = inv[inv["announcement_year"].astype(int) == inv["target_year"].astype(int)].copy()

after_n = len(inv)
dropped_n = before_n - after_n

print("‚úÖ Filters applied.")
print(f" - Rows before: {before_n:,}")
print(f" - Rows after : {after_n:,}")
print(f" - Dropped    : {dropped_n:,}")

# ----------------------------
# 7-3. Select + rename columns for the canonical investment table
# ----------------------------
investments_df = inv[[
    "vc",
    "target_year",
    "startup_name_on_page",
    "startup_norm",
    "investment_type",
    "round",
    "amount",
    "announcement_date",
    "announcement_year",
    "vc_role",
    "main_investors",
    "confidence",
    "evidence_url",
    "evidence_quotes",
    "notes",
]].copy()

# Canonical column names
investments_df = investments_df.rename(columns={
    "target_year": "year",
    "startup_name_on_page": "startup",
})

# ----------------------------
# 7-4. Final cleanup
# ----------------------------
# Deduplicate (keep the highest-confidence row per VC√óStartup√óYear)
investments_df = investments_df.sort_values("confidence", ascending=False)
investments_df = investments_df.drop_duplicates(subset=["vc", "startup", "year"], keep="first")

# Sort for readability
investments_df = investments_df.sort_values(["vc", "year", "startup"]).reset_index(drop=True)

# Drop helper column if present
if "startup_name_on_page_stripped" in investments_df.columns:
    investments_df = investments_df.drop(columns=["startup_name_on_page_stripped"], errors="ignore")

print(f"üìå Final investments table rows: {len(investments_df):,}")
display(investments_df.head(50))

# Optional: quick summary
summary_df = investments_df.groupby(["vc", "year"]).size().reset_index(name="n_investments")
display(summary_df.sort_values(["vc", "year"], ascending=[True, False]).head(30))


‚úÖ Filters applied.
 - Rows before: 22
 - Rows after : 16
 - Dropped    : 6
üìå Final investments table rows: 13


Unnamed: 0,vc,year,startup,startup_norm,investment_type,round,amount,announcement_date,announcement_year,vc_role,main_investors,confidence,evidence_url,evidence_quotes,notes
0,Sequoia Capital,2015,Ascend,ascend pitch deck to raise 15m series a round,initial,Seed,$4M,2015-12-15,2015.0,participant,"[Lightspeed Venture Partners, Sequoia Capital]",0.9,https://www.alexanderjarvis.com/ascend-pitch-d...,[After the company raised a seed round in 2015...,The text explicitly states Sequoia Capital par...
1,Sequoia Capital,2015,Carbon3D,carbon3d closes 100 million series c investmen...,follow_on,Series C,$100 million,2015-08-20,2015.0,participant,"[Google Ventures, Sequoia Capital, Silver Lake...",1.0,https://www.carbon3d.com/news/press-releases/c...,[Carbon3D Closes $100 Million Series C Investm...,The text explicitly states Sequoia Capital par...
2,Sequoia Capital,2015,Chillr,chillr secures 6m in series a from sequoia cap...,initial,Series A,$6 million,2015-10-05,2015.0,unspecified,[Sequoia Capital],0.95,https://www.medianama.com/2015/10/223-chillr-r...,[Chillr has raised $6 million in a Series A ro...,The text explicitly states that Chillr raised ...
3,Sequoia Capital,2015,Cohesity,cohesity emerges from stealth mode with 70 mil...,follow_on,Series B,$70 million,2015-06-17,2015.0,participant,"[Sequoia Capital, Google Ventures, ARTIS Ventu...",0.95,https://www.cohesity.com/newsroom/press/cohesi...,"[Strong Backing From Sequoia Capital, Google V...",The text explicitly states Sequoia Capital led...
4,Sequoia Capital,2015,Coupang,softbank to invest 1 billion in coupang korea ...,unknown,unknown,$1 billion,2015-06-03,2015.0,participant,"[SoftBank, Sequoia Capital Global Equities, Se...",0.95,https://group.softbank/en/news/press/20150603_0,"[SoftBank will invest $1 billion in Coupang, C...",The page explicitly states that Sequoia Capita...
5,Sequoia Capital,2015,GitHub,github raises 250m series b round led by sequo...,follow_on,Series B,$250 million,2015-07-29,2015.0,lead,"[Sequoia Capital, Andreessen Horowitz, Thrive ...",1.0,https://techcrunch.com/2015/07/29/github-raise...,[GitHub ... announced that it has raised a $25...,The text explicitly states Sequoia Capital led...
6,Sequoia Capital,2015,Gong,the full list of 127 unicorn startups backed b...,unknown,,$583M,,2015.0,unspecified,[Sequoia Capital],0.9,https://www.failory.com/startups/sequoia-capit...,"[Gong AI 2015 $583M $7.25B, Top Investors ... ...",The page explicitly lists Gong as a unicorn st...
7,Sequoia Capital,2015,Lemonade,how lemonade s founders raised a massive seed ...,initial,Seed,$13 million,2015-12,2015.0,lead,"[Sequoia Capital, Aleph]",0.95,https://www.businessinsider.com/lemonade-danie...,[Lemonade landed in the public eye in December...,The text explicitly states Sequoia Capital led...
8,Sequoia Capital,2015,MedGenome Labs Pvt Ltd,medgenome labs raises 20m in series b funding ...,initial,Series B,$20 million,2015-07-23,2015.0,lead,[Sequoia Capital],1.0,https://www.vccircle.com/medgenome-labs-raises...,[MedGenome Labs raises $20 million in its Seri...,The text explicitly states that Sequoia Capita...
9,Sequoia Capital,2015,Percolate,how much did percolate raise funding key inves...,follow_on,Series C,$40M,2015-05-15,2015.0,unspecified,"[Lightspeed Venture Partners, First Round Capi...",1.0,https://www.texau.com/profiles/percolate,"[a $40 million Series C in 2015, backed by Lig...",The text explicitly states Sequoia Capital par...


Unnamed: 0,vc,year,n_investments
0,Sequoia Capital,2015,13


In [20]:
# ============================================================
# Cell 8 : Exit discovery via CSE (IPO / M&A queries per startup)
# ============================================================

# This cell discovers potential exit events (IPO / M&A) for each startup in investments_df
# using Google Custom Search (CSE).
#
# Workflow:
# 1) Take the cleaned investment table (investments_df) from Cell 7
# 2) For each startup, run a small set of exit-focused CSE queries:
#    - "<startup> IPO"
#    - "<startup> acquired by"
#    - "<startup> acquisition"
#    - "<startup> merger"
#    - "<startup> went public"
# 3) Collect URL evidence (title/snippet/link/domain) into exit_candidates_df
#
# Output:
#   exit_candidates_df  (one row per startup √ó exit-evidence URL)

import pandas as pd
import ipywidgets as widgets
from ipywidgets import Output
from IPython.display import display, clear_output

# ----------------------------
# 8-0. Guardrails
# ----------------------------
if "investments_df" not in globals() or investments_df is None or investments_df.empty:
    raise ValueError("investments_df is empty. Run Cell 7 first.")

# ----------------------------
# 8-1. Widgets (controls)
# ----------------------------
max_startups_slider = widgets.IntSlider(
    value=min(50, len(investments_df)),
    min=1,
    max=max(1, min(300, len(investments_df))),
    step=1,
    description="Startups:",
    tooltip="How many startups to search exits for (cost guard).",
    layout=widgets.Layout(width="520px"),
)

max_pages_slider_exit = widgets.IntSlider(
    value=2,
    min=1,
    max=5,
    step=1,
    description="Pages:",
    tooltip="Each page is up to 10 results.",
    layout=widgets.Layout(width="520px"),
)

run_exit_button = widgets.Button(
    description="Run Exit Discovery",
    button_style="primary",
    icon="search",
)

exit_progress_out = Output()

display(max_startups_slider)
display(max_pages_slider_exit)
display(run_exit_button)
display(exit_progress_out)

# ----------------------------
# 8-2. Exit query templates
# ----------------------------
EXIT_QUERY_TEMPLATES = [
    '"{SU}" IPO',
    '"{SU}" went public',
    '"{SU}" filed for IPO',
    '"{SU}" acquired by',
    '"{SU}" acquisition',
    '"{SU}" acquired',
    '"{SU}" merger',
    '"{SU}" bought by',
]

# Optional domain hints. Keep empty for broad search, or bias to major sources:
EXIT_DOMAIN_HINTS = [
    # 'site:sec.gov',             # US filings (IPO-related)
    # 'site:reuters.com',
    # 'site:bloomberg.com',
    # 'site:prnewswire.com',
    # 'site:businesswire.com',
]

# ----------------------------
# 8-3. Run CSE for exits
# ----------------------------
def build_exit_candidates(
    startups: list,
    max_pages: int = 2,
    progress_cb=None,
) -> pd.DataFrame:
    all_rows = []
    total_steps = len(startups) * len(EXIT_QUERY_TEMPLATES) * max_pages
    step = 0

    for su in startups:
        su_clean = str(su).strip()
        if not su_clean:
            continue

        for tmpl in EXIT_QUERY_TEMPLATES:
            base_q = tmpl.format(SU=su_clean)

            queries = [base_q] if not EXIT_DOMAIN_HINTS else [f"{dh} {base_q}" for dh in EXIT_DOMAIN_HINTS]

            for q in queries:
                for page in range(max_pages):
                    step += 1
                    start_index = 1 + page * cse_cfg.num

                    if progress_cb:
                        progress_cb(f"[{step}/{total_steps}] Startup='{su_clean}' | query='{q}' | page={page+1}/{max_pages}")

                    try:
                        data = cse_search(q, start_index=start_index)
                        items = data.get("items", [])
                        rows = extract_rows_from_items(items)

                        for r in rows:
                            r["startup"] = su_clean
                            r["query"] = q
                        all_rows.extend(rows)

                        if progress_cb:
                            progress_cb(f"    ‚Ü≥ items fetched: {len(items)}")

                        sleep_with_jitter(scrape_cfg.min_delay_sec, scrape_cfg.max_delay_sec)

                    except Exception as e:
                        if progress_cb:
                            progress_cb(f"‚ö†Ô∏è CSE request failed: {e}")
                        continue

    df = pd.DataFrame(all_rows)
    if df.empty:
        return df

    df["domain"] = df["link"].apply(get_domain)
    df = df.dropna(subset=["link"]).drop_duplicates(subset=["startup", "link"]).reset_index(drop=True)
    return df

# ----------------------------
# 8-4. Button handler
# ----------------------------
exit_candidates_df = pd.DataFrame()

def on_click_run_exit(_):
    global exit_candidates_df

    # Determine which startups to process
    startups = investments_df["startup"].dropna().astype(str).str.strip().unique().tolist()
    startups = [s for s in startups if s]
    startups = startups[: int(max_startups_slider.value)]

    max_pages = int(max_pages_slider_exit.value)

    with exit_progress_out:
        clear_output(wait=True)
        print("=" * 70)
        print("üîç Starting exit discovery with the following parameters:")
        print(f"  ‚Ä¢ Startups  : {len(startups):,}")
        print(f"  ‚Ä¢ Pages     : {max_pages} (‚âà {max_pages * 10} results per query template)")
        print(f"  ‚Ä¢ Templates : {len(EXIT_QUERY_TEMPLATES)}")
        print("=" * 70)

    def progress_cb(msg: str):
        with exit_progress_out:
            print(msg)

    exit_candidates_df = build_exit_candidates(
        startups=startups,
        max_pages=max_pages,
        progress_cb=progress_cb,
    )

    with exit_progress_out:
        print("-" * 70)
        print(f"‚úÖ Exit discovery completed. Evidence URLs collected: {len(exit_candidates_df):,}")
        if not exit_candidates_df.empty:
            display(exit_candidates_df.head(30))

run_exit_button.on_click(on_click_run_exit)


IntSlider(value=13, description='Startups:', layout=Layout(width='520px'), max=13, min=1, tooltip='How many st‚Ä¶

IntSlider(value=2, description='Pages:', layout=Layout(width='520px'), max=5, min=1, tooltip='Each page is up ‚Ä¶

Button(button_style='primary', description='Run Exit Discovery', icon='search', style=ButtonStyle())

Output()

In [32]:
# ============================================================
# Cell 9 (FAST) : Exit tagging using CSE title/snippet only
#                (Aligned to exit_sources_df schema: uses `link`)
# ============================================================

import json
import pandas as pd

# ----------------------------
# 9F-0. Guardrails
# ----------------------------
if openai_client is None:
    raise ValueError("openai_client is not initialized. Set OPENAI_API_KEY in env.txt and rerun Cell 1.")

if "exit_sources_df" not in globals() or exit_sources_df is None or exit_sources_df.empty:
    raise ValueError("exit_sources_df is empty. Run Cell 8 first (Exit discovery via CSE).")

required_cols = ["title", "snippet", "link", "startup"]
missing = [c for c in required_cols if c not in exit_sources_df.columns]
if missing:
    raise ValueError(f"exit_sources_df is missing required columns: {missing}")

OPENAI_MODEL_FOR_EXIT_TAGGING_FAST = "gpt-4.1-mini"
MAX_URLS_PER_STARTUP_FAST = 5     # cost control
MAX_STARTUPS_FAST = 200           # cost control

# ----------------------------
# 9F-1. OpenAI tagger (snippet-based)
# ----------------------------
EXIT_SCHEMA_FAST = {
    "exit_type": "one of ['IPO','M&A','NONE','UNKNOWN']",
    "exit_date": "ISO 'YYYY-MM-DD' if explicitly present in snippet/title, else null",
    "exit_year": "integer year if explicitly present, else null",
    "acquirer": "string if M&A is clearly indicated, else null",
    "ticker": "string if IPO is clearly indicated, else null",
    "confidence": "number 0.0-1.0",
    "rationale": "short string (<= 160 chars)",
}

def tag_exit_from_snippet_with_openai(startup: str, url: str, title: str, snippet: str) -> dict:
    system = (
        "You are a strict exit-verification assistant.\n"
        "Classify whether a company had an exit event (IPO or M&A) using ONLY the provided URL, title, and snippet.\n"
        "Be conservative: if evidence is weak or ambiguous, choose UNKNOWN or NONE and lower confidence.\n"
        "Return ONLY valid JSON."
    )

    user_payload = {
        "task": "exit_tagging_from_search_snippet",
        "startup": startup,
        "url": url,
        "title": title or "",
        "snippet": snippet or "",
        "required_output_schema": EXIT_SCHEMA_FAST,
        "instructions": [
            "Use IPO when title/snippet clearly indicates IPO, listing, went public, ticker, or exchange.",
            "Use M&A when it clearly indicates acquired by / acquisition / merger and mentions an acquirer.",
            "Use NONE only when title/snippet clearly indicates no exit (rare). Otherwise use UNKNOWN.",
            "Extract exit_date or exit_year ONLY if explicitly stated; do not guess.",
            "Confidence should reflect how explicit the title/snippet is."
        ],
    }

    resp = openai_client.chat.completions.create(
        model=OPENAI_MODEL_FOR_EXIT_TAGGING_FAST,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(user_payload, ensure_ascii=False)},
        ],
        temperature=0,
    )

    raw = (resp.choices[0].message.content or "").strip()
    try:
        obj = json.loads(raw)
    except Exception:
        obj = {
            "exit_type": "UNKNOWN",
            "exit_date": None,
            "exit_year": None,
            "acquirer": None,
            "ticker": None,
            "confidence": 0.0,
            "rationale": "Failed to parse JSON output.",
        }

    # Normalize defensively
    if obj.get("exit_type") not in {"IPO", "M&A", "NONE", "UNKNOWN"}:
        obj["exit_type"] = "UNKNOWN"

    try:
        obj["confidence"] = float(obj.get("confidence", 0.0))
    except Exception:
        obj["confidence"] = 0.0
    obj["confidence"] = max(0.0, min(1.0, obj["confidence"]))

    # Infer exit_year from exit_date if parseable
    if obj.get("exit_year") is None and obj.get("exit_date"):
        try:
            obj["exit_year"] = int(str(obj["exit_date"])[:4])
        except Exception:
            obj["exit_year"] = None

    # Trim strings
    for k in ["acquirer", "ticker", "rationale"]:
        if obj.get(k) is not None:
            obj[k] = str(obj[k])[:300]
    if obj.get("rationale") is not None:
        obj["rationale"] = str(obj["rationale"])[:160]

    return obj

# ----------------------------
# 9F-2. Prepare input table (use `link` as URL)
# ----------------------------
work = exit_sources_df.copy()
work["link"] = work["link"].astype(str).apply(normalize_url)
work = work.dropna(subset=["link"]).drop_duplicates(subset=["startup", "link"]).reset_index(drop=True)

# Optional: ensure `domain` exists (you already have it, but safe fallback)
if "domain" not in work.columns:
    work["domain"] = work["link"].apply(get_domain)

# Cost control: limit startups
startup_list = list(work["startup"].dropna().unique())[:MAX_STARTUPS_FAST]
work = work[work["startup"].isin(startup_list)].copy()

print(f"‚ö° Fast exit tagging (snippet-only): {len(startup_list):,} startups, {len(work):,} rows")

# ----------------------------
# 9F-3. Run tagging per startup (top-N rows per startup)
# ----------------------------
EXIT_TYPE_PRIORITY = {"IPO": 0, "M&A": 1, "NONE": 2, "UNKNOWN": 3}

rows = []
for startup, g in work.groupby("startup", sort=False):
    g = g.head(MAX_URLS_PER_STARTUP_FAST).copy()

    for _, r in g.iterrows():
        tag = tag_exit_from_snippet_with_openai(
            startup=startup,
            url=r["link"],
            title=r.get("title", ""),
            snippet=r.get("snippet", ""),
        )

        rows.append({
            "startup": startup,
            "query": r.get("query", ""),
            "domain": r.get("domain", ""),
            "link": r["link"],
            "title": r.get("title", ""),
            "snippet": r.get("snippet", ""),
            "exit_type": tag["exit_type"],
            "exit_date": tag["exit_date"],
            "exit_year": tag["exit_year"],
            "acquirer": tag["acquirer"],
            "ticker": tag["ticker"],
            "confidence": tag["confidence"],
            "rationale": tag["rationale"],
        })

exit_evals_fast_df = pd.DataFrame(rows)

# ----------------------------
# 9F-4. Pick best tag per startup
# ----------------------------
exit_evals_fast_df["exit_priority"] = exit_evals_fast_df["exit_type"].map(EXIT_TYPE_PRIORITY).fillna(9).astype(int)

best_rows = []
for startup, g in exit_evals_fast_df.groupby("startup", sort=False):
    # best: IPO/M&A first, then highest confidence
    best = g.sort_values(["exit_priority", "confidence"], ascending=[True, False]).iloc[0].to_dict()
    best["rows_considered"] = len(g)
    best_rows.append(best)

exit_tags_fast_df = pd.DataFrame(best_rows)

# ----------------------------
# 9F-5. Output
# ----------------------------
print("‚úÖ Fast exit tagging completed (snippet-only).")
print(exit_tags_fast_df["exit_type"].value_counts(dropna=False))

display_cols = [
    "startup", "exit_type", "exit_date", "exit_year",
    "acquirer", "ticker", "confidence", "link", "rationale"
]
display(exit_tags_fast_df[display_cols].head(50))


‚ö° Fast exit tagging (snippet-only): 13 startups, 1,226 rows
‚úÖ Fast exit tagging completed (snippet-only).
exit_type
IPO        6
UNKNOWN    3
NONE       3
M&A        1
Name: count, dtype: int64


Unnamed: 0,startup,exit_type,exit_date,exit_year,acquirer,ticker,confidence,link,rationale
0,Ascend,IPO,,2021.0,,,0.9,https://www.forbes.com/sites/javierhasse/2021/...,Title and snippet clearly indicate Ascend Well...
1,Carbon3D,UNKNOWN,,,,Carbon Stock,0.4,https://notice.co/c/carbon3d,"Title mentions 'How to Buy, Valuation, Stock P..."
2,Chillr,M&A,,,Truecaller,,0.9,https://www.fintechfutures.com/paytech/truecal...,"Title clearly states 'Truecaller buys Chillr',..."
3,Cohesity,IPO,,2025.0,,,0.8,https://finance.yahoo.com/news/cohesity-preps-...,Title states 'Cohesity Preps For IPO' and snip...
4,Coupang,IPO,2021-03-11,2021.0,,CPNG,1.0,https://www.cnbc.com/2021/03/11/coupang-ipo-cp...,Title and snippet explicitly state Coupang IPO...
5,GitHub,NONE,,,,,1.0,https://github.com/lmsdss/IPO,Title and snippet refer to an IPO project name...
6,Gong,NONE,,,,,0.9,https://www.reddit.com/r/sales/comments/1ab9e2...,Snippet discusses funding rounds and valuation...
7,Lemonade,IPO,2020-07-02,2020.0,,LMND,1.0,https://www.lemonade.com/blog/lemonade-turns-ten/,Snippet explicitly states Lemonade IPO'd on Ju...
8,MedGenome Labs Pvt Ltd,IPO,,,,,0.7,https://www.grantthornton.in/globalassets/1.-m...,Snippet includes 'IPO' and MedGenome Labs Pvt ...
9,Percolate,UNKNOWN,,,,,0.4,https://equityzen.com/company/percolate/,Mentions pre-IPO stock availability but no exp...


In [36]:
# ============================================================
# Cell 10 (REPLACEMENT) : MOIC approximation logic (round assumptions √ó exit outcome)
#   - Aligned to exit_tags_fast_df schema you showed:
#       startup, exit_type, exit_date, exit_year, acquirer, ticker, confidence, link, rationale
#   - Fixes common issues:
#       (A) Join mismatch ‚Üí adds diagnostics + fallback join
#       (B) Round labels like "Series B", "Series C", "unknown" ‚Üí mapped into your RoundAssumption buckets
#       (C) IPO that is clearly "future / planned" (e.g., SpaceX 2026) ‚Üí treated as NO_EXIT_YET
# ============================================================

import numpy as np
import pandas as pd
from datetime import date

# ----------------------------
# 10-0. Guardrails
# ----------------------------
if "investments_df" not in globals() or investments_df is None or investments_df.empty:
    raise ValueError("investments_df is empty. Create it in Cell 6/7 before running Cell 10.")

if "exit_tags_fast_df" not in globals() or exit_tags_fast_df is None or exit_tags_fast_df.empty:
    raise ValueError("exit_tags_fast_df is empty. Run Cell 9 (FAST snippet-based) before running Cell 10.")

# ----------------------------
# 10-1. Config knobs
# ----------------------------
HOLDING_MULTIPLE_PROXY = 1.30           # used for NO_EXIT_YET / UNKNOWN fallback
HOLDING_DILUTION_APPLIES = True
UNKNOWN_FALLBACK_MODE = "HOLDING"       # "HOLDING" or "EXPECTED_VALUE"
MIN_EXIT_CONFIDENCE = 0.60              # if exit_confidence below this, treat as UNKNOWN (optional)
TODAY = date.today()                    # notebook runtime date

# ----------------------------
# 10-2. Round normalization (critical)
# ----------------------------
# Your RoundAssumption keys (Cell 2) are likely:
#   "Pre-Seed", "Seed", "Series A", "Series B+"
# but investments_df may contain: "Series B", "Series C", "Series F", "unknown", etc.
def canonicalize_round(raw_round: str, default_round: str = DEFAULT_ROUND) -> str:
    if raw_round is None or (isinstance(raw_round, float) and np.isnan(raw_round)):
        return default_round
    s = str(raw_round).strip().lower()

    if s in ("", "unknown", "unk", "na", "n/a", "none", "null"):
        return default_round

    # common variants
    if "pre" in s and "seed" in s:
        return "Pre-Seed"
    if "seed" in s and "series" not in s:
        return "Seed"
    if "series a" in s or s == "a":
        return "Series A"
    if "series" in s:
        # any Series B/C/D/E/F/G... => Series B+
        if "series a" in s:
            return "Series A"
        return "Series B+"
    if "growth" in s or "late" in s:
        return "Series B+"

    return default_round

def dilution_factor_for_round(round_name: str) -> float:
    ra = ROUND_ASSUMPTIONS.get(round_name, ROUND_ASSUMPTIONS[DEFAULT_ROUND])
    return float(getattr(ra, "dilution_factor", 1.0))

def assumed_check_size_usd(round_name: str) -> float:
    ra = ROUND_ASSUMPTIONS.get(round_name, ROUND_ASSUMPTIONS[DEFAULT_ROUND])
    return float(ra.check_size_usd)

# ----------------------------
# 10-3. Prepare investments table
# ----------------------------
work_inv = investments_df.copy()

if "startup" not in work_inv.columns:
    raise ValueError("investments_df must include a 'startup' column.")

if "round" not in work_inv.columns:
    work_inv["round"] = DEFAULT_ROUND

# keep original round for inspection, plus canonical round for assumptions lookup
work_inv["round_raw"] = work_inv["round"]
work_inv["round_canon"] = work_inv["round_raw"].apply(lambda x: canonicalize_round(x, DEFAULT_ROUND))

# normalize startup keys
if "startup_norm" not in work_inv.columns:
    work_inv["startup_norm"] = work_inv["startup"].fillna("").apply(normalize_startup_name)
else:
    work_inv["startup_norm"] = work_inv["startup_norm"].fillna("").astype(str)

# year best-effort
if "year" in work_inv.columns:
    work_inv["year"] = pd.to_numeric(work_inv["year"], errors="coerce").astype("Int64")

# investment confidence best-effort
if "investment_confidence" in work_inv.columns:
    work_inv["investment_confidence"] = pd.to_numeric(work_inv["investment_confidence"], errors="coerce").fillna(0.0)
else:
    work_inv["investment_confidence"] = 0.5

# ----------------------------
# 10-4. Prepare exit tags table (aligned to your schema)
# ----------------------------
work_exit = exit_tags_fast_df.copy()

# required columns
for col in ["startup", "exit_type", "confidence", "link"]:
    if col not in work_exit.columns:
        raise ValueError(f"exit_tags_fast_df is missing required column: '{col}'")

# normalize startup keys
if "startup_norm" not in work_exit.columns:
    work_exit["startup_norm"] = work_exit["startup"].fillna("").apply(normalize_startup_name)
else:
    work_exit["startup_norm"] = work_exit["startup_norm"].fillna("").astype(str)

work_exit["exit_confidence"] = pd.to_numeric(work_exit["confidence"], errors="coerce").fillna(0.0)
work_exit = work_exit.rename(columns={"link": "exit_evidence_url"})

# optional: parse exit_date safely
if "exit_date" in work_exit.columns:
    # keep original, plus parsed date
    work_exit["exit_date_parsed"] = pd.to_datetime(work_exit["exit_date"], errors="coerce").dt.date
else:
    work_exit["exit_date"] = None
    work_exit["exit_date_parsed"] = pd.NaT

# ensure exit_year numeric (best effort)
if "exit_year" in work_exit.columns:
    work_exit["exit_year"] = pd.to_numeric(work_exit["exit_year"], errors="coerce")
else:
    work_exit["exit_year"] = np.nan

# dedupe: keep highest confidence per startup_norm
work_exit = (
    work_exit.sort_values(["startup_norm", "exit_confidence"], ascending=[True, False])
    .drop_duplicates(subset=["startup_norm"])
    .reset_index(drop=True)
)

exit_keep = [
    "startup", "startup_norm",
    "exit_type", "exit_date", "exit_date_parsed", "exit_year",
    "acquirer", "ticker",
    "exit_confidence", "exit_evidence_url", "rationale"
]
exit_keep = [c for c in exit_keep if c in work_exit.columns]
work_exit = work_exit[exit_keep].copy()

# ----------------------------
# 10-5. Join investments √ó exit tags (robust + diagnostics)
# ----------------------------
deal_moic_df = work_inv.merge(work_exit, on="startup_norm", how="left", suffixes=("", "_exit"))

# fallback join on raw startup (case-insensitive) for rows not matched by startup_norm
missing_exit = deal_moic_df["exit_type"].isna()
if missing_exit.any():
    exit_by_startup = work_exit.copy()
    exit_by_startup["startup_lc"] = exit_by_startup["startup"].fillna("").str.lower().str.strip()

    tmp = deal_moic_df.loc[missing_exit, ["startup"]].copy()
    tmp["startup_lc"] = tmp["startup"].fillna("").str.lower().str.strip()

    tmp = tmp.merge(exit_by_startup.drop(columns=["startup_norm"], errors="ignore"), on="startup_lc", how="left")

    for col in ["exit_type", "exit_date", "exit_date_parsed", "exit_year", "acquirer", "ticker",
                "exit_confidence", "exit_evidence_url", "rationale"]:
        if col in tmp.columns:
            deal_moic_df.loc[missing_exit, col] = tmp[col].values

# fill exit fields
deal_moic_df["exit_type"] = deal_moic_df["exit_type"].fillna("UNKNOWN")
deal_moic_df["exit_confidence"] = pd.to_numeric(deal_moic_df["exit_confidence"], errors="coerce").fillna(0.0)

# Diagnostics: merge hit rate
hit_rate = (deal_moic_df["exit_type"] != "UNKNOWN").mean()
print(f"üîé Exit tag merge hit-rate (non-UNKNOWN after join): {hit_rate:.2%}")
if hit_rate < 0.5:
    print("‚ö†Ô∏è Low hit-rate. Likely cause: startup name mismatch between investments_df and exit_tags_fast_df.")
    print("   Consider enforcing the same startup naming upstream, or use a fuzzy match step.")

# ----------------------------
# 10-6. Fix: treat future IPO as NO_EXIT_YET
# ----------------------------
def is_future_exit(exit_type: str, exit_date_parsed, exit_year) -> bool:
    et = str(exit_type).strip().upper()
    if et != "IPO":
        return False

    # --- date-based check (safe) ---
    if exit_date_parsed is not None and not pd.isna(exit_date_parsed):
        try:
            return exit_date_parsed > TODAY
        except Exception:
            pass

    # --- year-based check (fallback) ---
    try:
        if exit_year is not None and not pd.isna(exit_year):
            return int(exit_year) > TODAY.year
    except Exception:
        pass

    return False

deal_moic_df["is_future_ipo_signal"] = deal_moic_df.apply(
    lambda r: is_future_exit(r.get("exit_type"), r.get("exit_date_parsed"), r.get("exit_year")),
    axis=1
)

# Optional: treat very low-confidence EXIT tags as UNKNOWN
def normalize_exit_type(exit_type: str, conf: float, future_ipo: bool) -> str:
    et = str(exit_type).strip().upper()
    if future_ipo and et == "IPO":
        return "UNKNOWN"
    if et in ("IPO", "M&A") and conf < MIN_EXIT_CONFIDENCE:
        return "UNKNOWN"
    if et == "M&A":
        return "M&A"
    if et == "IPO":
        return "IPO"
    if et == "NONE":
        return "NONE"
    return "UNKNOWN"

deal_moic_df["exit_type_norm"] = deal_moic_df.apply(
    lambda r: normalize_exit_type(r["exit_type"], float(r["exit_confidence"]), bool(r["is_future_ipo_signal"])),
    axis=1
)

# ----------------------------
# 10-7. Outcome buckets (avoid "everything becomes UNKNOWN")
# ----------------------------
# EXIT        : IPO / M&A confirmed
# NO_EXIT_YET : explicit NONE, or Unknown fallback
# UNKNOWN     : unknown when you want to keep it separate (we still map to holding/EV downstream)
def exit_type_to_outcome(exit_type_norm: str) -> str:
    et = str(exit_type_norm).strip().upper()
    if et in ("IPO", "M&A"):
        return "EXIT"
    if et == "NONE":
        return "NO_EXIT_YET"
    return "UNKNOWN"

deal_moic_df["outcome_bucket"] = deal_moic_df["exit_type_norm"].apply(exit_type_to_outcome)

# ----------------------------
# 10-8. Revised multiple logic
# ----------------------------
def estimate_multiple_revised(round_canon: str, outcome: str) -> float:
    r = str(round_canon)
    o = str(outcome)

    if o == "EXIT":
        return float(estimate_deal_moic(round_name=r, outcome="EXIT", bull=False))

    if o == "NO_EXIT_YET":
        mult = float(HOLDING_MULTIPLE_PROXY)
        if HOLDING_DILUTION_APPLIES:
            mult *= dilution_factor_for_round(r)
        return mult

    # UNKNOWN fallback
    if UNKNOWN_FALLBACK_MODE.upper() == "EXPECTED_VALUE":
        return float(estimate_deal_moic(round_name=r, outcome="UNKNOWN", bull=False))
    else:
        mult = float(HOLDING_MULTIPLE_PROXY)
        if HOLDING_DILUTION_APPLIES:
            mult *= dilution_factor_for_round(r)
        return mult

deal_moic_df["estimated_multiple"] = deal_moic_df.apply(
    lambda r: estimate_multiple_revised(round_canon=r["round_canon"], outcome=r["outcome_bucket"]),
    axis=1
)

deal_moic_df["assumed_check_usd"] = deal_moic_df["round_canon"].apply(assumed_check_size_usd)
deal_moic_df["assumed_value_usd"] = deal_moic_df["assumed_check_usd"] * deal_moic_df["estimated_multiple"]

deal_moic_df["deal_moic"] = np.where(
    deal_moic_df["assumed_check_usd"] > 0,
    deal_moic_df["assumed_value_usd"] / deal_moic_df["assumed_check_usd"],
    np.nan
)

# ----------------------------
# 10-9. Confidence-weighted MOIC (ranking heuristic)
# ----------------------------
deal_moic_df["investment_confidence"] = pd.to_numeric(deal_moic_df["investment_confidence"], errors="coerce").fillna(0.0)
deal_moic_df["confidence_weight"] = (
    deal_moic_df["investment_confidence"].clip(0, 1) * deal_moic_df["exit_confidence"].clip(0, 1)
)
deal_moic_df["weighted_deal_moic"] = deal_moic_df["deal_moic"] * deal_moic_df["confidence_weight"]

# ----------------------------
# 10-10. Output + key diagnostics
# ----------------------------
print("\n‚úÖ Deal-level MOIC approximation computed (replacement).")
print(f" - Deals: {len(deal_moic_df):,}")

print("\nExit type (raw from tags):")
print(deal_moic_df["exit_type"].value_counts(dropna=False))

print("\nExit type (normalized, future IPO downgraded, low-confidence EXIT -> UNKNOWN):")
print(deal_moic_df["exit_type_norm"].value_counts(dropna=False))

print("\nOutcome bucket:")
print(deal_moic_df["outcome_bucket"].value_counts(dropna=False))

print("\nRound canonicalization (so checks/multiples vary by bucket):")
print(deal_moic_df["round_canon"].value_counts(dropna=False))

# Show rows that still have no evidence URL after join (likely name mismatch)
no_evidence = deal_moic_df["exit_evidence_url"].isna() if "exit_evidence_url" in deal_moic_df.columns else pd.Series(False, index=deal_moic_df.index)
if no_evidence.any():
    print("\n‚ö†Ô∏è Rows with missing exit_evidence_url after join (possible name mismatch):")
    display(deal_moic_df.loc[no_evidence, ["startup", "startup_norm", "round_raw", "round_canon"]].head(20))

display_cols = [
    "vc", "year", "startup",
    "round_raw", "round_canon", "assumed_check_usd",
    "exit_type", "exit_type_norm", "exit_date", "exit_year", "exit_confidence",
    "outcome_bucket",
    "estimated_multiple", "deal_moic",
    "confidence_weight", "weighted_deal_moic",
    "exit_evidence_url",
]
display_cols = [c for c in display_cols if c in deal_moic_df.columns]
display(deal_moic_df[display_cols].head(50))


üîé Exit tag merge hit-rate (non-UNKNOWN after join): 76.92%

‚úÖ Deal-level MOIC approximation computed (replacement).
 - Deals: 13

Exit type (raw from tags):
exit_type
IPO        6
UNKNOWN    3
NONE       3
M&A        1
Name: count, dtype: int64

Exit type (normalized, future IPO downgraded, low-confidence EXIT -> UNKNOWN):
exit_type_norm
IPO        6
UNKNOWN    3
NONE       3
M&A        1
Name: count, dtype: int64

Outcome bucket:
outcome_bucket
EXIT           7
UNKNOWN        3
NO_EXIT_YET    3
Name: count, dtype: int64

Round canonicalization (so checks/multiples vary by bucket):
round_canon
Seed         6
Series B+    6
Series A     1
Name: count, dtype: int64


Unnamed: 0,vc,year,startup,round_raw,round_canon,assumed_check_usd,exit_type,exit_type_norm,exit_date,exit_year,exit_confidence,outcome_bucket,estimated_multiple,deal_moic,confidence_weight,weighted_deal_moic,exit_evidence_url
0,Sequoia Capital,2015,Ascend,Seed,Seed,500000.0,IPO,IPO,,2021.0,0.9,EXIT,5.28,5.28,0.45,2.376,https://www.forbes.com/sites/javierhasse/2021/...
1,Sequoia Capital,2015,Carbon3D,Series C,Series B+,3000000.0,UNKNOWN,UNKNOWN,,,0.4,UNKNOWN,1.235,1.235,0.2,0.247,https://notice.co/c/carbon3d
2,Sequoia Capital,2015,Chillr,Series A,Series A,1500000.0,M&A,M&A,,,0.9,EXIT,3.68,3.68,0.45,1.656,https://www.fintechfutures.com/paytech/truecal...
3,Sequoia Capital,2015,Cohesity,Series B,Series B+,3000000.0,IPO,IPO,,2025.0,0.8,EXIT,2.375,2.375,0.4,0.95,https://finance.yahoo.com/news/cohesity-preps-...
4,Sequoia Capital,2015,Coupang,unknown,Seed,500000.0,IPO,IPO,2021-03-11,2021.0,1.0,EXIT,5.28,5.28,0.5,2.64,https://www.cnbc.com/2021/03/11/coupang-ipo-cp...
5,Sequoia Capital,2015,GitHub,Series B,Series B+,3000000.0,NONE,NONE,,,1.0,NO_EXIT_YET,1.235,1.235,0.5,0.6175,https://github.com/lmsdss/IPO
6,Sequoia Capital,2015,Gong,,Seed,500000.0,NONE,NONE,,,0.9,NO_EXIT_YET,1.144,1.144,0.45,0.5148,https://www.reddit.com/r/sales/comments/1ab9e2...
7,Sequoia Capital,2015,Lemonade,Seed,Seed,500000.0,IPO,IPO,2020-07-02,2020.0,1.0,EXIT,5.28,5.28,0.5,2.64,https://www.lemonade.com/blog/lemonade-turns-ten/
8,Sequoia Capital,2015,MedGenome Labs Pvt Ltd,Series B,Series B+,3000000.0,IPO,IPO,,,0.7,EXIT,2.375,2.375,0.35,0.83125,https://www.grantthornton.in/globalassets/1.-m...
9,Sequoia Capital,2015,Percolate,Series C,Series B+,3000000.0,UNKNOWN,UNKNOWN,,,0.4,UNKNOWN,1.235,1.235,0.2,0.247,https://equityzen.com/company/percolate/


In [37]:
# ============================================================
# Cell 11 : Portfolio aggregation
#           (summary stats, distributions, scenario comparisons)
# ============================================================

# This cell aggregates deal-level outputs from Cell 10 (deal_moic_df) into
# portfolio-level metrics, including:
# - total invested (assumed)
# - total value (assumed)
# - portfolio MOIC
# - confidence-weighted portfolio MOIC (ranking heuristic)
# - summary stats and distributions by round / exit_type
#
# It also supports lightweight "scenario comparisons" by allowing you to
# run Cell 10 under different knobs (e.g., HOLDING_MULTIPLE_PROXY) and then
# store scenario outputs in a dict for side-by-side comparison.

import numpy as np
import pandas as pd

# ----------------------------
# 11-0. Guardrails
# ----------------------------
if "deal_moic_df" not in globals() or deal_moic_df is None or deal_moic_df.empty:
    raise ValueError("deal_moic_df is empty. Run Cell 10 before running Cell 11.")

# ----------------------------
# 11-1. Core portfolio aggregation
# ----------------------------
agg_keys = ["vc"]
if "year" in deal_moic_df.columns:
    agg_keys.append("year")

work = deal_moic_df.copy()

# Ensure numeric fields
for c in ["assumed_check_usd", "assumed_value_usd", "deal_moic", "weighted_deal_moic", "confidence_weight"]:
    if c in work.columns:
        work[c] = pd.to_numeric(work[c], errors="coerce")

work["assumed_check_usd"] = work["assumed_check_usd"].fillna(0.0)
work["assumed_value_usd"] = work["assumed_value_usd"].fillna(0.0)
work["confidence_weight"] = work["confidence_weight"].fillna(0.0)

# Portfolio totals
portfolio_totals = (
    work.groupby(agg_keys, dropna=False)
        .agg(
            deals=("startup", "count"),
            invested_usd=("assumed_check_usd", "sum"),
            value_usd=("assumed_value_usd", "sum"),
            avg_deal_moic=("deal_moic", "mean"),
            median_deal_moic=("deal_moic", "median"),
        )
        .reset_index()
)

portfolio_totals["portfolio_moic"] = np.where(
    portfolio_totals["invested_usd"] > 0,
    portfolio_totals["value_usd"] / portfolio_totals["invested_usd"],
    np.nan
)

# Confidence-weighted portfolio MOIC (heuristic)
# Weighted value = sum(value * weight) / sum(invested * weight)
work["weighted_value_usd"] = work["assumed_value_usd"] * work["confidence_weight"]
work["weighted_invested_usd"] = work["assumed_check_usd"] * work["confidence_weight"]

portfolio_weighted = (
    work.groupby(agg_keys, dropna=False)
        .agg(
            weighted_value_usd=("weighted_value_usd", "sum"),
            weighted_invested_usd=("weighted_invested_usd", "sum"),
            avg_confidence=("confidence_weight", "mean"),
        )
        .reset_index()
)

portfolio_summary = portfolio_totals.merge(portfolio_weighted, on=agg_keys, how="left")
portfolio_summary["portfolio_moic_weighted"] = np.where(
    portfolio_summary["weighted_invested_usd"] > 0,
    portfolio_summary["weighted_value_usd"] / portfolio_summary["weighted_invested_usd"],
    np.nan
)

# ----------------------------
# 11-2. Breakdown tables (exit_type / outcome / round)
# ----------------------------
def pct(x):
    s = x.sum()
    return (x / s) if s else x * 0

# Exit type distribution by portfolio
exit_dist = None
if "exit_type_norm" in work.columns:
    exit_dist = (
        work.groupby(agg_keys + ["exit_type_norm"], dropna=False)
            .size()
            .rename("count")
            .reset_index()
    )
elif "exit_type" in work.columns:
    exit_dist = (
        work.groupby(agg_keys + ["exit_type"], dropna=False)
            .size()
            .rename("count")
            .reset_index()
    )

# Round-level stats
round_stats = None
round_col = None
if "round_canon" in work.columns:
    round_col = "round_canon"
elif "round" in work.columns:
    round_col = "round"

if round_col:
    round_stats = (
        work.groupby(agg_keys + [round_col], dropna=False)
            .agg(
                deals=("startup", "count"),
                invested_usd=("assumed_check_usd", "sum"),
                value_usd=("assumed_value_usd", "sum"),
                portfolio_moic=("assumed_value_usd", lambda s: s.sum()),
            )
            .reset_index()
    )
    # compute moic for each group
    round_stats["portfolio_moic"] = np.where(
        round_stats["invested_usd"] > 0,
        round_stats["value_usd"] / round_stats["invested_usd"],
        np.nan
    )

# ----------------------------
# 11-3. Distribution summary (percentiles)
# ----------------------------
percentiles = [0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95]

def percentile_series(x: pd.Series) -> pd.Series:
    x = pd.to_numeric(x, errors="coerce").dropna()
    if x.empty:
        return pd.Series({f"p{int(p*100)}": np.nan for p in percentiles})
    return pd.Series({f"p{int(p*100)}": float(x.quantile(p)) for p in percentiles})

deal_dist = (
    work.groupby(agg_keys, dropna=False)["deal_moic"]
        .apply(percentile_series)
        .reset_index()
)

# ----------------------------
# 11-4. Scenario comparisons (optional)
# ----------------------------
# If you run Cell 10 multiple times with different assumptions, store results like:
#   scenario_results["baseline_v1"] = portfolio_summary
#   scenario_results["holding_1_1x"] = portfolio_summary
#
# This helper will assemble a comparison table.
if "scenario_results" not in globals():
    scenario_results = {}

def register_scenario_result(name: str, portfolio_summary_df: pd.DataFrame):
    """Store a scenario summary for later comparison."""
    scenario_results[name] = portfolio_summary_df.copy()
    print(f"‚úÖ Registered scenario: {name}")

def build_scenario_comparison_table() -> pd.DataFrame:
    """Concatenate registered scenarios for side-by-side comparison."""
    if not scenario_results:
        print("‚ÑπÔ∏è No scenarios registered yet.")
        return pd.DataFrame()

    frames = []
    for name, df in scenario_results.items():
        tmp = df.copy()
        tmp["scenario"] = name
        frames.append(tmp)

    out = pd.concat(frames, ignore_index=True)
    # reorder columns
    front = ["scenario"] + agg_keys
    rest = [c for c in out.columns if c not in front]
    out = out[front + rest]
    return out

# ----------------------------
# 11-5. Output
# ----------------------------
print("‚úÖ Portfolio aggregation completed.")

print("\n--- Portfolio Summary (MOIC) ---")
display_cols = [
    *agg_keys,
    "deals", "invested_usd", "value_usd",
    "portfolio_moic", "portfolio_moic_weighted",
    "avg_deal_moic", "median_deal_moic",
]
display_cols = [c for c in display_cols if c in portfolio_summary.columns]
display(portfolio_summary[display_cols].sort_values(agg_keys).head(50))

print("\n--- Deal MOIC Distribution (percentiles) ---")
display(deal_dist.sort_values(agg_keys).head(50))

if exit_dist is not None:
    print("\n--- Exit Type Distribution ---")
    display(exit_dist.sort_values(agg_keys + [exit_dist.columns[-2]]).head(100))

if round_stats is not None:
    print("\n--- Round-Level Portfolio Stats ---")
    display(round_stats.sort_values(agg_keys + [round_col]).head(100))

# Optional: register this run under scenario_name if available
if "scenario_name" in globals() and isinstance(scenario_name, str) and scenario_name:
    register_scenario_result(scenario_name, portfolio_summary)

print("\n‚ÑπÔ∏è To compare scenarios, run:")
print("   comparison_df = build_scenario_comparison_table()")
print("   display(comparison_df.sort_values(['scenario'] + agg_keys).head(50))")


‚úÖ Portfolio aggregation completed.

--- Portfolio Summary (MOIC) ---


Unnamed: 0,vc,year,deals,invested_usd,value_usd,portfolio_moic,portfolio_moic_weighted,avg_deal_moic,median_deal_moic
0,Sequoia Capital,2015,13,22500000.0,47646000.0,2.1176,2.285362,2.598615,2.375



--- Deal MOIC Distribution (percentiles) ---


Unnamed: 0,vc,year,level_2,deal_moic
0,Sequoia Capital,2015,p5,1.144
1,Sequoia Capital,2015,p10,1.144
2,Sequoia Capital,2015,p25,1.235
3,Sequoia Capital,2015,p50,2.375
4,Sequoia Capital,2015,p75,3.68
5,Sequoia Capital,2015,p90,5.28
6,Sequoia Capital,2015,p95,5.28



--- Exit Type Distribution ---


Unnamed: 0,vc,year,exit_type_norm,count
0,Sequoia Capital,2015,IPO,6
1,Sequoia Capital,2015,M&A,1
2,Sequoia Capital,2015,NONE,3
3,Sequoia Capital,2015,UNKNOWN,3



--- Round-Level Portfolio Stats ---


Unnamed: 0,vc,year,round_canon,deals,invested_usd,value_usd,portfolio_moic
0,Sequoia Capital,2015,Seed,6,3000000.0,9636000.0,3.212
1,Sequoia Capital,2015,Series A,1,1500000.0,5520000.0,3.68
2,Sequoia Capital,2015,Series B+,6,18000000.0,32490000.0,1.805


‚úÖ Registered scenario: baseline_v1

‚ÑπÔ∏è To compare scenarios, run:
   comparison_df = build_scenario_comparison_table()
   display(comparison_df.sort_values(['scenario'] + agg_keys).head(50))


In [38]:
# ============================================================
# Cell 12 : Export artifacts (CSV/JSON) for downstream analysis/visualization
# ============================================================

# This cell exports the key intermediate and final artifacts produced by the notebook.
# Recommended outputs:
# - candidate_df              (raw CSE hits)
# - ranked_candidates_df      (deduped + scored startups with evidence URLs)
# - selected_sources_df       (robots-aware / LLM-triaged sources for verification)
# - investments_df            (inferred investments table)
# - exit_sources_df           (CSE exit discovery results)
# - exit_tags_fast_df         (snippet-based exit tags)
# - deal_moic_df              (deal-level MOIC table)
# - portfolio_summary         (portfolio-level summary metrics)
#
# Exports:
# - CSV for tables
# - JSON for list-like / nested columns (or when preserving structure matters)

import os
import json
from datetime import datetime
import pandas as pd

# ----------------------------
# 12-0. Export configuration
# ----------------------------
EXPORT_DIR = "artifacts"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Use scenario_name if available to keep runs organized
scenario_label = ""
if "scenario_name" in globals() and isinstance(scenario_name, str) and scenario_name:
    scenario_label = scenario_name.replace(" ", "_")

run_id = f"{timestamp}" + (f"__{scenario_label}" if scenario_label else "")
out_dir = os.path.join(EXPORT_DIR, run_id)

os.makedirs(out_dir, exist_ok=True)
print(f"‚úÖ Export directory: {out_dir}")

# ----------------------------
# 12-1. Helpers
# ----------------------------
def safe_filename(name: str) -> str:
    name = name.strip().replace(" ", "_")
    name = "".join(ch for ch in name if ch.isalnum() or ch in ("_", "-", "."))
    return name

def export_df_csv(df: pd.DataFrame, name: str) -> str:
    path = os.path.join(out_dir, safe_filename(name) + ".csv")
    df.to_csv(path, index=False)
    print(f"üìÑ CSV saved: {path}  ({len(df):,} rows)")
    return path

def export_df_json(df: pd.DataFrame, name: str) -> str:
    """
    Export a dataframe as JSON records, preserving nested/list fields where possible.
    """
    path = os.path.join(out_dir, safe_filename(name) + ".json")
    df.to_json(path, orient="records", force_ascii=False, indent=2)
    print(f"üßæ JSON saved: {path}  ({len(df):,} records)")
    return path

def export_metadata(meta: dict, name: str = "run_metadata") -> str:
    path = os.path.join(out_dir, safe_filename(name) + ".json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    print(f"üßæ Metadata saved: {path}")
    return path

def df_exists(name: str) -> bool:
    return name in globals() and globals()[name] is not None and hasattr(globals()[name], "empty") and not globals()[name].empty

# ----------------------------
# 12-2. Build run metadata
# ----------------------------
meta = {
    "run_id": run_id,
    "created_at": timestamp,
    "scenario_name": scenario_label or None,
    "notes": "Exports from 007_Parametric_MOIC_Analysis_VC_Portfolios notebook.",
}

# Capture key knobs if present
for k in ["HOLDING_MULTIPLE_PROXY", "UNKNOWN_FALLBACK_MODE", "MIN_EXIT_CONFIDENCE"]:
    if k in globals():
        try:
            meta[k] = globals()[k]
        except Exception:
            pass

# Capture VC/year if present from widgets
if "vc_dropdown" in globals():
    meta["selected_vc"] = getattr(vc_dropdown, "value", None)
if "year_dropdown" in globals():
    meta["selected_year"] = getattr(year_dropdown, "value", None)

export_metadata(meta)

# ----------------------------
# 12-3. Export available artifacts
# ----------------------------
exports = {}

# Raw candidates
if df_exists("candidate_df"):
    exports["candidate_df_csv"] = export_df_csv(candidate_df, "candidate_df")
    exports["candidate_df_json"] = export_df_json(candidate_df, "candidate_df")

# Ranked candidates
if df_exists("ranked_candidates_df"):
    exports["ranked_candidates_df_csv"] = export_df_csv(ranked_candidates_df, "ranked_candidates_df")
    exports["ranked_candidates_df_json"] = export_df_json(ranked_candidates_df, "ranked_candidates_df")

# Sources for verification
if df_exists("selected_sources_df"):
    exports["selected_sources_df_csv"] = export_df_csv(selected_sources_df, "selected_sources_df")
    exports["selected_sources_df_json"] = export_df_json(selected_sources_df, "selected_sources_df")

# Investments table
if df_exists("investments_df"):
    exports["investments_df_csv"] = export_df_csv(investments_df, "investments_df")
    exports["investments_df_json"] = export_df_json(investments_df, "investments_df")

# Exit discovery results
if df_exists("exit_sources_df"):
    exports["exit_sources_df_csv"] = export_df_csv(exit_sources_df, "exit_sources_df")
    exports["exit_sources_df_json"] = export_df_json(exit_sources_df, "exit_sources_df")

# Exit tags (fast)
if df_exists("exit_tags_fast_df"):
    exports["exit_tags_fast_df_csv"] = export_df_csv(exit_tags_fast_df, "exit_tags_fast_df")
    exports["exit_tags_fast_df_json"] = export_df_json(exit_tags_fast_df, "exit_tags_fast_df")

# Deal-level MOIC
if df_exists("deal_moic_df"):
    exports["deal_moic_df_csv"] = export_df_csv(deal_moic_df, "deal_moic_df")
    exports["deal_moic_df_json"] = export_df_json(deal_moic_df, "deal_moic_df")

# Portfolio summary
if df_exists("portfolio_summary"):
    exports["portfolio_summary_csv"] = export_df_csv(portfolio_summary, "portfolio_summary")
    exports["portfolio_summary_json"] = export_df_json(portfolio_summary, "portfolio_summary")

# Scenario comparison table (optional)
if "scenario_results" in globals() and isinstance(scenario_results, dict) and len(scenario_results) > 0:
    try:
        comparison_df = build_scenario_comparison_table()
        if comparison_df is not None and not comparison_df.empty:
            exports["scenario_comparison_csv"] = export_df_csv(comparison_df, "scenario_comparison")
            exports["scenario_comparison_json"] = export_df_json(comparison_df, "scenario_comparison")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not export scenario comparison: {e}")

# ----------------------------
# 12-4. Save export index
# ----------------------------
export_index_path = os.path.join(out_dir, "export_index.json")
with open(export_index_path, "w", encoding="utf-8") as f:
    json.dump(exports, f, ensure_ascii=False, indent=2)

print(f"\n‚úÖ Export index saved: {export_index_path}")
print("Done.")


‚úÖ Export directory: artifacts/20260106_093006__baseline_v1
üßæ Metadata saved: artifacts/20260106_093006__baseline_v1/run_metadata.json
üìÑ CSV saved: artifacts/20260106_093006__baseline_v1/candidate_df.csv  (136 rows)
üßæ JSON saved: artifacts/20260106_093006__baseline_v1/candidate_df.json  (136 records)
üìÑ CSV saved: artifacts/20260106_093006__baseline_v1/ranked_candidates_df.csv  (123 rows)
üßæ JSON saved: artifacts/20260106_093006__baseline_v1/ranked_candidates_df.json  (123 records)
üìÑ CSV saved: artifacts/20260106_093006__baseline_v1/selected_sources_df.csv  (52 rows)
üßæ JSON saved: artifacts/20260106_093006__baseline_v1/selected_sources_df.json  (52 records)
üìÑ CSV saved: artifacts/20260106_093006__baseline_v1/investments_df.csv  (13 rows)
üßæ JSON saved: artifacts/20260106_093006__baseline_v1/investments_df.json  (13 records)
üìÑ CSV saved: artifacts/20260106_093006__baseline_v1/exit_sources_df.csv  (1,226 rows)
üßæ JSON saved: artifacts/20260106_093006__baseli