In [12]:
# ============================================================
# 010 Startup Initial Meeting Deep Dive
# ============================================================
#
# Overview
# ----------------
# This notebook implements a structured deep-dive workflow following an
# initial meeting with a startup.
#
# Starting from entity identification and factual grounding, it progressively
# organizes public information into explicit structures, layers hypotheses
# where information is incomplete, and culminates in concrete meeting questions
# and actionable outputs.
#
# The objective is to convert a first, often qualitative interaction into a
# traceable analytical artifact that clearly separates facts, hypotheses, and
# actions, enabling more effective follow-up meetings and internal
# decision-making.
#
# Wherever possible, factual statements are grounded in publicly available
# sources and preserved with evidence links to maintain transparency and
# traceability.
#
#
# Inputs / Outputs
# ----------------
# Inputs:
# - Startup identifiers and public references (website, articles, announcements)
# - Initial meeting participant(s), roles, and meeting context
# - Your organization’s perspective (investment, partnership, research, etc.)
#
# Outputs:
# - Fact-grounded startup and people profiles
# - Explicit hypotheses on business, market, and competitive positioning
# - Structured meeting questions, NG questions, and watchouts
# - One-page, meeting-ready summary in Markdown
# - Reusable artifacts (JSON / CSV / Markdown) for comparison and downstream workflows
#
#
# Structure
# ----------------
# 0. Notebook Metadata
#    - Run ID, timestamps, and global configuration
#
# 1. Input Widget
#    - Startup name, meeting participants, meeting context, and your org context
#
# 2. Startup Identification
#    - Entity resolution and canonical identification using official sources and CSE
#
# 3. Company Basics
#    - Core facts on business, product, customers, market, and funding signals
#
# 4. Key People Extraction
#    - Identification of founders, executives, and key team members
#
# 5. Meeting Person Deep Dive
#    - Background, prior roles, public statements, and inferred focus areas
#
# 6. Business & Product Understanding
#    - Product scope, technical approach, value proposition, and differentiation
#
# 7. Customer & Market Structure
#    - Target customers, use cases, market size signals, and adoption patterns
#
# 8. Competitive Landscape
#    - Direct and indirect competitors, substitutes, and positioning signals
#
# 9. Funding & Cap Table Signals
#    - Fundraising history, investors, and capital structure indicators
#
# 10. Recent Changes Timeline
#     - Key developments, announcements, and shifts over recent months
#
# 11. Integrated Insights            (LLM synthesis)
#     - Cross-sectional synthesis of facts, hypotheses, risks, and contradictions
#
# 12. Meeting Context Framing        (LLM synthesis)
#     - Meeting objectives, narrative framing, agenda, and role allocation
#
# 13. Top 5 Meeting Questions        (LLM synthesis)
#     - Priority questions linked to hypotheses, risks, and evidence
#
# 14. NG Questions & Watchouts       (LLM synthesis)
#     - Questions to avoid and signals to watch for during the meeting
#
# 15. One-Page Summary Generation    (LLM synthesis)
#     - Scannable, evidence-linked Markdown summary for internal sharing
#
# 16. Export & Save Artifacts
#     - Run-level export and manifest for reuse and downstream workflows
#
#
# Notes
# ----------------
# - Facts, hypotheses, and actions are intentionally separated.
# - Hypotheses should be revisited and updated as new information emerges.
# - Missing or uncertain information should be made explicit (e.g., UNKNOWN).
# - This notebook is designed for iterative reuse across multiple startups
#   and for comparison across runs within researchOS.


In [13]:
# ============================================================
# 1. Input Widget
# ============================================================
# Purpose:
# - Capture the minimum required context for the deep dive.
# - Keep inputs structured so downstream cells can be deterministic.
# - Separate "startup context", "meeting participant", "meeting context",
#   and "your org context" to avoid mixing assumptions later.

from datetime import datetime, timezone
import json
from pathlib import Path

import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output

# ---- UI Components ----
startup_name = widgets.Text(
    value="",
    description="Startup:",
    placeholder="e.g., Acme AI",
    layout=widgets.Layout(width="600px"),
)

startup_website = widgets.Text(
    value="",
    description="Website:",
    placeholder="e.g., https://acme.ai",
    layout=widgets.Layout(width="600px"),
)

startup_country = widgets.Text(
    value="",
    description="Country:",
    placeholder="e.g., Japan",
    layout=widgets.Layout(width="300px"),
)

startup_sector = widgets.Text(
    value="",
    description="Sector:",
    placeholder="e.g., FinTech, AI Infra, Climate",
    layout=widgets.Layout(width="600px"),
)

meeting_person_name = widgets.Text(
    value="",
    description="Person:",
    placeholder="e.g., Jane Doe",
    layout=widgets.Layout(width="600px"),
)

meeting_person_title = widgets.Text(
    value="",
    description="Title:",
    placeholder="e.g., CEO / Head of Sales / BizDev",
    layout=widgets.Layout(width="600px"),
)

meeting_person_email = widgets.Text(
    value="",
    description="Email:",
    placeholder="(optional)",
    layout=widgets.Layout(width="600px"),
)

meeting_date = widgets.DatePicker(
    description="Date:",
)

meeting_type = widgets.Dropdown(
    options=["Intro", "Follow-up", "Partnering", "Investment", "Hiring", "Other"],
    value="Intro",
    description="Type:",
)

meeting_goal = widgets.Textarea(
    value="",
    description="Goal:",
    placeholder="What do you want to achieve from this meeting / deep dive?",
    layout=widgets.Layout(width="800px", height="80px"),
)

meeting_context = widgets.Textarea(
    value="",
    description="Context:",
    placeholder="Any useful context from the initial meeting (notes, key claims, open questions).",
    layout=widgets.Layout(width="800px", height="120px"),
)

your_org_name = widgets.Text(
    value="",
    description="Your org:",
    placeholder="e.g., XYZ Ventures",
    layout=widgets.Layout(width="600px"),
)

your_org_role = widgets.Dropdown(
    options=["VC", "CVC", "PE", "Corporate", "Accelerator", "Research", "Other"],
    value="VC",
    description="Role:",
)

your_org_thesis = widgets.Textarea(
    value="",
    description="Thesis:",
    placeholder="Your org’s perspective (investment thesis, partnership angle, strategic intent).",
    layout=widgets.Layout(width="800px", height="100px"),
)

confidentiality = widgets.Dropdown(
    options=["Public-only", "Public + meeting notes", "Includes sensitive info"],
    value="Public + meeting notes",
    description="Data:",
)

analyst_name = widgets.Text(
    value="",
    description="Analyst:",
    placeholder="(optional) Your name",
    layout=widgets.Layout(width="400px"),
)

run_id = widgets.Text(
    value=datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"),
    description="Run ID:",
    layout=widgets.Layout(width="300px"),
)

save_button = widgets.Button(
    description="Save Inputs",
    button_style="primary",
    tooltip="Validate and save inputs to /artifacts",
)

status = widgets.Output()

# ---- Layout ----
display(Markdown("## 1. Input Widget"))
display(Markdown("Fill in the fields below. Click **Save Inputs** to persist them for downstream cells."))

section_startup = widgets.VBox([
    widgets.HTML("<b>Startup Context</b>"),
    startup_name, startup_website,
    widgets.HBox([startup_country, widgets.Label("")]),
    startup_sector,
])

section_person = widgets.VBox([
    widgets.HTML("<b>Meeting Participant</b>"),
    meeting_person_name, meeting_person_title, meeting_person_email,
])

section_meeting = widgets.VBox([
    widgets.HTML("<b>Meeting Context</b>"),
    widgets.HBox([meeting_date, meeting_type, run_id]),
    meeting_goal,
    meeting_context,
])

section_org = widgets.VBox([
    widgets.HTML("<b>Your Organization Context</b>"),
    your_org_name, your_org_role, analyst_name,
    your_org_thesis,
    confidentiality,
])

ui = widgets.VBox([
    section_startup,
    widgets.HTML("<hr>"),
    section_person,
    widgets.HTML("<hr>"),
    section_meeting,
    widgets.HTML("<hr>"),
    section_org,
    widgets.HTML("<br>"),
    save_button,
    status,
])

display(ui)

# ---- Helpers ----
def _validate_inputs():
    errors = []
    if not startup_name.value.strip():
        errors.append("Startup name is required.")
    if not meeting_person_name.value.strip():
        errors.append("Meeting participant name is required.")
    if not meeting_goal.value.strip():
        errors.append("Meeting goal is required.")
    if not your_org_role.value.strip():
        errors.append("Your org role is required.")
    return errors

def _build_inputs_dict():
    return {
        "meta": {
            "run_id": run_id.value.strip(),
            "created_at_utc": datetime.now(timezone.utc).isoformat(),
            "analyst_name": analyst_name.value.strip() or None,
            "data_classification": confidentiality.value,
        },
        "startup": {
            "name": startup_name.value.strip(),
            "website": startup_website.value.strip() or None,
            "country": startup_country.value.strip() or None,
            "sector": startup_sector.value.strip() or None,
        },
        "meeting_person": {
            "name": meeting_person_name.value.strip(),
            "title": meeting_person_title.value.strip() or None,
            "email": meeting_person_email.value.strip() or None,
        },
        "meeting": {
            "date": str(meeting_date.value) if meeting_date.value else None,
            "type": meeting_type.value,
            "goal": meeting_goal.value.strip(),
            "context_notes": meeting_context.value.strip() or None,
        },
        "your_org": {
            "name": your_org_name.value.strip() or None,
            "role": your_org_role.value,
            "thesis": your_org_thesis.value.strip() or None,
        },
    }

def _save_inputs(payload: dict, out_dir: Path):
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"inputs_{payload['meta']['run_id']}.json"
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    return out_path

# ---- Callbacks ----
def on_save_clicked(_):
    with status:
        clear_output()
        errors = _validate_inputs()
        if errors:
            display(Markdown("### ❌ Validation errors"))
            for e in errors:
                display(Markdown(f"- {e}"))
            return

        payload = _build_inputs_dict()
        out_path = _save_inputs(payload, Path("artifacts") / "meeting_deep_dive")
        display(Markdown("### ✅ Saved"))
        display(Markdown(f"- Run ID: `{payload['meta']['run_id']}`"))
        display(Markdown(f"- Output: `{out_path.as_posix()}`"))

save_button.on_click(on_save_clicked)

# ---- Final note ----
# Downstream cells should only depend on the persisted JSON (not widget state),
# to keep the workflow reproducible.


## 1. Input Widget

Fill in the fields below. Click **Save Inputs** to persist them for downstream cells.

VBox(children=(VBox(children=(HTML(value='<b>Startup Context</b>'), Text(value='', description='Startup:', lay…

In [7]:
# ============================================================
# 2. Startup Identification (Entity Resolution)
# ============================================================
# Goal:
# - Confirm that the "startup" we analyze is the correct real-world entity.
# - Use the provided URL (if any) as the primary anchor.
# - Use Google CSE as a secondary check to reduce mis-identification risk.
# - Produce a canonical entity record: official website + key reference URLs.
#
# Output (saved):
# - artifacts/meeting_deep_dive/entity_<run_id>.json
#
# Notes:
# - This cell does NOT attempt deep research; it only resolves identity.
# - Downstream cells should rely on the resolved entity record, not raw inputs.

import os
import re
import json
import time
import hashlib
import requests
from urllib.parse import urlparse, urlunparse
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv

# ------------------------------------------------------------
# Global configuration (as provided)
# ------------------------------------------------------------
# Load env.txt explicitly (recommended for local + GitHub Actions parity)
load_dotenv("env.txt")

# --- OpenAI API (required) ---
import openai

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None:
    raise EnvironmentError("OPENAI_API_KEY is not set in the environment variables.")
openai.api_key = OPENAI_API_KEY

# --- Google Custom Search Engine (required for this notebook) ---
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")

if GOOGLE_API_KEY is None or GOOGLE_CSE_CX is None:
    raise EnvironmentError(
        "GOOGLE_API_KEY and GOOGLE_CSE_CX must be set in the environment variables."
    )

# ------------------------------------------------------------
# Load inputs (from cell #1 artifacts)
# ------------------------------------------------------------
ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# If you already have `inputs` dict in memory, this will use it.
# Otherwise, load the latest inputs_*.json from artifacts.
def load_latest_inputs(art_dir: Path) -> dict:
    candidates = sorted(art_dir.glob("inputs_*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
    if not candidates:
        raise FileNotFoundError("No inputs_*.json found. Please run cell #1 and save inputs first.")
    return json.loads(candidates[0].read_text(encoding="utf-8"))

try:
    inputs  # noqa: F401
except NameError:
    inputs = load_latest_inputs(ART_DIR)

run_id = inputs["meta"]["run_id"]
startup_name = (inputs.get("startup", {}).get("name") or "").strip()
provided_website = (inputs.get("startup", {}).get("website") or "").strip()

if not startup_name:
    raise ValueError("Startup name is missing in inputs. Please fill it in the Input Widget.")


# ------------------------------------------------------------
# Utility helpers
# ------------------------------------------------------------
def normalize_url(url: str) -> str:
    """
    Normalize URL to reduce trivial mismatches.
    - Force scheme to https when missing
    - Remove URL fragments
    - Remove trailing slash (except root)
    - Lowercase hostname
    """
    if not url:
        return ""
    url = url.strip()
    if not re.match(r"^https?://", url, flags=re.IGNORECASE):
        url = "https://" + url

    p = urlparse(url)
    netloc = p.netloc.lower()
    path = p.path or "/"
    # strip fragment + query for identity checks (keep query only for some sites; but default strip)
    norm = p._replace(scheme="https", netloc=netloc, query="", fragment="")

    # tidy path
    if path != "/" and path.endswith("/"):
        path = path[:-1]
    norm = norm._replace(path=path)

    return urlunparse(norm)

def domain_of(url: str) -> str:
    if not url:
        return ""
    p = urlparse(normalize_url(url))
    host = p.netloc
    # drop common prefixes
    host = re.sub(r"^(www\.)", "", host)
    return host

def soft_domain_match(a: str, b: str) -> bool:
    """
    A conservative match:
    - exact domain match OR one is a subdomain of the other
    """
    if not a or not b:
        return False
    a = a.lower().strip()
    b = b.lower().strip()
    return (a == b) or a.endswith("." + b) or b.endswith("." + a)

def stable_hash(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()[:12]


# ------------------------------------------------------------
# Google CSE
# ------------------------------------------------------------
def google_cse_search(query: str, num: int = 5) -> list[dict]:
    """
    Returns a list of items with keys:
      - title, link, snippet, displayLink
    """
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_CSE_CX,
        "q": query,
        "num": min(max(num, 1), 10),
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    return data.get("items", []) or []

def cse_candidate_urls(startup_name: str, provided_website: str) -> dict:
    """
    Run a few searches to collect candidate official URLs and reference pages.
    """
    queries = []
    if provided_website:
        # confirm the provided domain is actually associated with the startup name
        queries.append(f'"{startup_name}" {domain_of(provided_website)}')
    queries.extend([
        f'"{startup_name}" official website',
        f'{startup_name} company',
        f'{startup_name} Crunchbase',
        f'{startup_name} LinkedIn',
    ])

    results = []
    for q in queries:
        try:
            items = google_cse_search(q, num=5)
            for it in items:
                results.append({
                    "query": q,
                    "title": it.get("title"),
                    "link": it.get("link"),
                    "snippet": it.get("snippet"),
                    "displayLink": it.get("displayLink"),
                })
            time.sleep(0.2)  # gentle pacing
        except Exception as e:
            results.append({"query": q, "error": str(e)})

    # Extract and group links
    links = [r["link"] for r in results if isinstance(r, dict) and r.get("link")]
    # keep unique order
    seen = set()
    uniq_links = []
    for l in links:
        nl = normalize_url(l)
        if nl and nl not in seen:
            seen.add(nl)
            uniq_links.append(nl)

    return {"queries": queries, "raw_results": results, "unique_links": uniq_links}


# ------------------------------------------------------------
# Candidate selection heuristics (deterministic, lightweight)
# ------------------------------------------------------------
KNOWN_REFERENCE_DOMAINS = {
    "crunchbase.com": "crunchbase",
    "linkedin.com": "linkedin",
    "pitchbook.com": "pitchbook",
    "bloomberg.com": "bloomberg",
    "prtimes.jp": "prtimes",
    "note.com": "note",
    "medium.com": "medium",
    "x.com": "x",
    "twitter.com": "x",
    "facebook.com": "facebook",
    "github.com": "github",
}

def classify_link(url: str) -> str:
    d = domain_of(url)
    for dom, label in KNOWN_REFERENCE_DOMAINS.items():
        if soft_domain_match(d, dom) or d.endswith("." + dom):
            return label
    return "other"

def score_official_site_candidate(url: str, startup_name: str, provided_website: str) -> float:
    """
    Heuristic scoring for "official website" candidates.
    We prefer:
    - match with provided domain (if provided)
    - non-reference domains (not Crunchbase/LinkedIn/etc.)
    - shorter URLs (homepages)
    """
    url_n = normalize_url(url)
    d = domain_of(url_n)

    score = 0.0
    # Provided URL anchor
    if provided_website:
        if soft_domain_match(d, domain_of(provided_website)):
            score += 5.0

    # Penalize known reference aggregators for "official website" selection
    ref = classify_link(url_n)
    if ref != "other":
        score -= 3.0

    # Prefer homepage-ish paths
    p = urlparse(url_n).path or "/"
    if p in ["/", ""]:
        score += 1.5
    elif len(p) <= 12:
        score += 0.5
    else:
        score -= 0.3

    # Light keyword bias (domain contains a token from the name)
    tokens = re.findall(r"[a-z0-9]+", startup_name.lower())
    if tokens:
        for t in tokens[:2]:
            if t and t in d:
                score += 0.8
                break

    return score


def pick_top_official_site(links: list[str], startup_name: str, provided_website: str) -> dict:
    scored = []
    for u in links:
        scored.append({
            "url": u,
            "domain": domain_of(u),
            "class": classify_link(u),
            "score": score_official_site_candidate(u, startup_name, provided_website),
        })
    scored.sort(key=lambda x: x["score"], reverse=True)

    top = scored[0] if scored else None
    top_official = top["url"] if top and top["score"] >= 0.5 else (normalize_url(provided_website) if provided_website else None)

    return {
        "top_official_site": top_official,
        "ranked_candidates": scored[:15],
    }


# ------------------------------------------------------------
# Optional: Use OpenAI to finalize entity resolution (conservative)
# ------------------------------------------------------------
def openai_finalize_entity(startup_name: str, provided_website: str, ranked_candidates: list[dict], raw_results: list[dict]) -> dict:
    """
    Use OpenAI to:
    - choose canonical official website (or keep provided)
    - extract key reference URLs (Crunchbase / LinkedIn / X / etc.)
    - output a structured entity record with confidence + rationale

    This is intentionally conservative: it should prefer 'provided_website'
    when it matches the candidates.
    """
    # Keep prompt compact; include only top candidates and a small sample of results
    top_candidates = ranked_candidates[:8]
    sample_results = [r for r in raw_results if r.get("link")][:10]

    system = (
        "You are an entity-resolution assistant. "
        "Given a startup name, an optionally provided website, and Google CSE results, "
        "select the most likely canonical official website and key reference pages. "
        "Be conservative: if the provided website domain appears consistent, keep it. "
        "Return JSON only, no extra text."
    )

    user = {
        "startup_name": startup_name,
        "provided_website": provided_website or None,
        "top_candidates": top_candidates,
        "sample_results": sample_results,
        "required_output_schema": {
            "canonical_name": "string",
            "official_website": "string|null",
            "official_website_confidence": "0..1",
            "official_website_rationale": "string",
            "reference_urls": {
                "crunchbase": "string|null",
                "linkedin": "string|null",
                "x": "string|null",
                "github": "string|null",
                "other": "array of {label, url}"
            },
            "flags": {
                "possible_name_collision": "boolean",
                "needs_manual_review": "boolean"
            }
        }
    }

    # Compatible with older openai.ChatCompletion style
    resp = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(user, ensure_ascii=False)},
        ],
        temperature=0.0,
    )
    content = resp["choices"][0]["message"]["content"]
    return json.loads(content)


# ------------------------------------------------------------
# Run: collect candidates and resolve
# ------------------------------------------------------------
provided_website_n = normalize_url(provided_website) if provided_website else ""
cse_pack = cse_candidate_urls(startup_name=startup_name, provided_website=provided_website_n)

picked = pick_top_official_site(
    links=cse_pack["unique_links"],
    startup_name=startup_name,
    provided_website=provided_website_n
)

# Try OpenAI finalization; if it fails, fall back to heuristic result
try:
    entity = openai_finalize_entity(
        startup_name=startup_name,
        provided_website=provided_website_n,
        ranked_candidates=picked["ranked_candidates"],
        raw_results=cse_pack["raw_results"],
    )
    entity["resolution_method"] = "openai+heuristics"
except Exception as e:
    entity = {
        "canonical_name": startup_name,
        "official_website": picked["top_official_site"],
        "official_website_confidence": 0.55 if picked["top_official_site"] else 0.2,
        "official_website_rationale": f"Fallback heuristic selection (OpenAI step failed): {str(e)}",
        "reference_urls": {
            "crunchbase": None,
            "linkedin": None,
            "x": None,
            "github": None,
            "other": []
        },
        "flags": {
            "possible_name_collision": True,
            "needs_manual_review": True
        },
        "resolution_method": "heuristics_only",
    }

# Attach evidence (lightweight)
entity["evidence"] = {
    "provided_website": provided_website_n or None,
    "cse_queries": cse_pack["queries"],
    "top_ranked_candidates": picked["ranked_candidates"][:10],
    "result_sample_hash": stable_hash(json.dumps(cse_pack["raw_results"][:20], ensure_ascii=False)),
}

# Persist
out_path = ART_DIR / f"entity_{run_id}.json"
out_path.write_text(json.dumps(entity, ensure_ascii=False, indent=2), encoding="utf-8")

print("✅ Startup Identification complete")
print(f"- Startup: {startup_name}")
print(f"- Official website: {entity.get('official_website')}")
print(f"- Method: {entity.get('resolution_method')}")
print(f"- Saved: {out_path.as_posix()}")


✅ Startup Identification complete
- Startup: Sakana AI
- Official website: https://sakana.ai/series-a
- Method: heuristics_only
- Saved: artifacts/meeting_deep_dive/entity_20260107_225628.json


In [6]:
# ============================================================
# 3. Company Basics (Expanded version)
# ============================================================
# What this cell does (expanded, still lightweight & compliant):
# 1) Official-site first: collect richer facts from key pages + internal links + sitemap hints (robots-respecting).
# 2) CSE enrichment: collect more third-party references (robots-respecting per-domain).
# 3) Store raw page text as JSONL for traceability/reproducibility.
# 4) Use OpenAI (SDK v1+) to:
#    - generate a 1-row "company basics" table
#    - generate a "claims" long-form table with evidence_url + confidence
#
# Outputs (saved under artifacts/meeting_deep_dive/):
# - raw_pages_<run_id>.jsonl
# - company_basics_<run_id>.csv
# - company_claims_<run_id>.csv
# - company_extraction_<run_id>.json     (full model output)
#
# Requirements:
# - pip install beautifulsoup4 lxml pandas python-dotenv requests openai

import os
import re
import json
import time
import hashlib
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone
from urllib.robotparser import RobotFileParser
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
# Load env.txt explicitly (recommended for local + GitHub Actions parity)
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None:
    raise EnvironmentError("OPENAI_API_KEY is not set in the environment variables.")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")
if GOOGLE_API_KEY is None or GOOGLE_CSE_CX is None:
    raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CSE_CX must be set in the environment variables.")

client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Load resolved entity (from cell #2)
# ------------------------------------------------------------
def load_latest_entity(art_dir: Path) -> tuple[dict, Path]:
    files = sorted(art_dir.glob("entity_*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError("No entity_*.json found. Please run cell #2 first.")
    p = files[0]
    return json.loads(p.read_text(encoding="utf-8")), p

entity, entity_path = load_latest_entity(ART_DIR)

startup_name = (entity.get("canonical_name") or "").strip() or "Unknown Startup"
official_website = (entity.get("official_website") or "").strip()
if not official_website:
    raise ValueError("official_website is missing in entity. Please resolve identity in cell #2.")

# Try to align run_id with inputs if present; else derive from time
try:
    run_id = inputs["meta"]["run_id"]  # from cell #1
except Exception:
    run_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

# ------------------------------------------------------------
# Knobs (expanded but still safe)
# ------------------------------------------------------------
USER_AGENT = "researchOSv2-bot/0.2 (+contact: internal-research)"
TIMEOUT = 25
SLEEP_SEC = 0.25

MAX_OFFICIAL_PAGES = 30         # expanded from ~12
MAX_SITEMAP_URLS = 250          # cap sitemap candidates
MAX_CSE_ARTICLES = 25           # expanded from ~12
MAX_TEXT_CHARS_STORE = 60000    # expanded raw text cap
MAX_TEXT_CHARS_PROMPT = 9000    # per-page cap in LLM prompt
MAX_PAGES_TO_LLM = 18           # keep prompt bounded; prioritize official pages

EXPLICIT_PATHS = [
    "/about", "/company", "/mission", "/vision", "/team",
    "/product", "/products", "/service", "/services", "/solutions",
    "/customers", "/case", "/cases", "/case-studies", "/usecase", "/use-cases",
    "/pricing",
    "/news", "/press", "/blog", "/media",
    "/careers", "/jobs", "/recruit",
    "/investor", "/investors", "/funding", "/ir"
]

OFFICIAL_KEYWORDS = [
    "about", "company", "mission", "vision", "team",
    "product", "products", "service", "services", "solutions",
    "customer", "customers", "case", "cases", "case-study", "usecase", "use-case",
    "pricing", "security", "privacy", "compliance",
    "news", "press", "blog", "media",
    "careers", "jobs", "recruit",
    "investor", "investors", "funding", "ir"
]

# ------------------------------------------------------------
# URL / domain helpers
# ------------------------------------------------------------
def normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url, flags=re.I):
        url = "https://" + url
    p = urlparse(url)
    # normalize host and remove fragment
    p = p._replace(netloc=p.netloc.lower(), fragment="")
    return p.geturl()

def get_domain(url: str) -> str:
    u = normalize_url(url)
    return re.sub(r"^www\.", "", urlparse(u).netloc)

def same_domain(a: str, b: str) -> bool:
    da, db = get_domain(a), get_domain(b)
    return da == db or da.endswith("." + db) or db.endswith("." + da)

def is_asset_url(url: str) -> bool:
    path = (urlparse(url).path or "").lower()
    return bool(re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|zip|mp4|mov)$", path))

def stable_hash(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()[:12]

# ------------------------------------------------------------
# Robots helpers (conservative by design)
# ------------------------------------------------------------
def build_robot_parser(base_url: str) -> RobotFileParser | None:
    base = normalize_url(base_url)
    rp = RobotFileParser()
    robots_url = urljoin(base, "/robots.txt")
    rp.set_url(robots_url)
    try:
        rp.read()
        return rp
    except Exception:
        # If robots can't be fetched, be conservative later
        return None

def robots_allows(rp: RobotFileParser | None, url: str, user_agent: str = USER_AGENT) -> bool:
    if rp is None:
        # Conservative fallback: allow only homepage & obvious public info paths
        p = (urlparse(url).path or "/").lower()
        return p in ["/", ""] or any(k in p for k in [
            "about", "company", "product", "service", "solutions",
            "customers", "case", "pricing", "news", "press", "blog",
            "careers", "jobs", "recruit", "investor", "funding", "ir"
        ])
    try:
        return rp.can_fetch(user_agent, url)
    except Exception:
        return False

# ------------------------------------------------------------
# Fetch + extraction helpers
# ------------------------------------------------------------
def fetch_html(url: str) -> tuple[int, str]:
    headers = {"User-Agent": USER_AGENT}
    r = requests.get(url, headers=headers, timeout=TIMEOUT, allow_redirects=True)
    return r.status_code, (r.text if r.ok else "")

def html_to_text(html: str) -> str:
    """
    Higher-density extraction:
    - Remove scripts/styles
    - Prefer <main> or <article> when present to reduce nav/footer noise
    """
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()

    node = soup.find("main") or soup.find("article") or (soup.body if soup.body else soup)
    text = node.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def extract_links(base_url: str, html: str) -> list[str]:
    soup = BeautifulSoup(html, "lxml")
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("#") or href.lower().startswith("mailto:") or href.lower().startswith("tel:"):
            continue
        abs_url = normalize_url(urljoin(base_url, href))
        if abs_url.startswith(("http://", "https://")):
            links.append(abs_url)
    # de-dup preserve order
    seen, out = set(), []
    for l in links:
        if l not in seen:
            seen.add(l)
            out.append(l)
    return out

def score_official_link(url: str) -> float:
    """
    Prefer likely "company basics" pages; de-prioritize assets/long tracking URLs.
    """
    u = normalize_url(url)
    p = (urlparse(u).path or "/").lower()

    score = 0.0
    if p in ["/", ""]:
        score += 2.0

    for k in OFFICIAL_KEYWORDS:
        if k in p:
            score += 1.0

    if is_asset_url(u):
        score -= 4.0

    # penalize extremely deep paths
    if p.count("/") >= 4:
        score -= 0.5

    # mild penalty for long query strings
    if len(urlparse(u).query) > 30:
        score -= 0.4

    return score

# ------------------------------------------------------------
# Sitemap discovery (lightweight)
# ------------------------------------------------------------
def discover_from_sitemap(site_url: str, rp: RobotFileParser | None, limit: int = MAX_SITEMAP_URLS) -> list[str]:
    import xml.etree.ElementTree as ET

    root = normalize_url(site_url)
    sm_url = urljoin(root, "/sitemap.xml")
    if not robots_allows(rp, sm_url):
        return []

    try:
        r = requests.get(sm_url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT)
        if not r.ok:
            return []
        tree = ET.fromstring(r.text)
    except Exception:
        return []

    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    urls = []

    # sitemap index?
    sitemap_locs = [loc.text for loc in tree.findall(".//sm:sitemap/sm:loc", ns) if loc.text]
    if sitemap_locs:
        for loc in sitemap_locs[:5]:
            try:
                if not robots_allows(rp, loc):
                    continue
                r2 = requests.get(loc, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT)
                if not r2.ok:
                    continue
                t2 = ET.fromstring(r2.text)
                locs2 = [x.text for x in t2.findall(".//sm:url/sm:loc", ns) if x.text]
                urls.extend(locs2)
                if len(urls) >= limit:
                    break
            except Exception:
                continue
    else:
        locs = [loc.text for loc in tree.findall(".//sm:url/sm:loc", ns) if loc.text]
        urls.extend(locs)

    base_dom = get_domain(root)
    out, seen = [], set()
    for u in urls:
        nu = normalize_url(u)
        if not nu:
            continue
        if not same_domain(nu, root):
            continue
        if nu not in seen:
            seen.add(nu)
            out.append(nu)
        if len(out) >= limit:
            break
    return out

# ------------------------------------------------------------
# Google CSE enrichment
# ------------------------------------------------------------
def google_cse_search(query: str, num: int = 5) -> list[dict]:
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": GOOGLE_API_KEY, "cx": GOOGLE_CSE_CX, "q": query, "num": min(max(num, 1), 10)}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    items = data.get("items", []) or []
    out = []
    for it in items:
        out.append({
            "title": it.get("title"),
            "link": normalize_url(it.get("link") or ""),
            "snippet": it.get("snippet"),
            "displayLink": it.get("displayLink"),
        })
    return out

# ------------------------------------------------------------
# Build official targets (homepage + explicit paths + internal links + sitemap)
# ------------------------------------------------------------
official_home = normalize_url(official_website)
official_domain = get_domain(official_home)

rp_official = build_robot_parser(official_home)
print(f"Startup: {startup_name}")
print(f"Official: {official_home}")
print(f"Entity file: {entity_path.name}")

# Fetch homepage
home_status, home_html = fetch_html(official_home)
time.sleep(SLEEP_SEC)
if home_status >= 400 or not home_html:
    raise RuntimeError(f"Failed to fetch official homepage: HTTP {home_status}")

home_links = extract_links(official_home, home_html)
internal_links = [l for l in home_links if same_domain(l, official_home) and not is_asset_url(l)]

explicit_candidates = []
for p in EXPLICIT_PATHS:
    u = normalize_url(urljoin(official_home, p))
    if not is_asset_url(u) and robots_allows(rp_official, u):
        explicit_candidates.append(u)

sitemap_links = discover_from_sitemap(official_home, rp_official, limit=MAX_SITEMAP_URLS)

# Merge candidates
official_candidates = list(dict.fromkeys([official_home] + explicit_candidates + internal_links + sitemap_links))

# Score + pick
scored = [{"url": u, "score": score_official_link(u)} for u in official_candidates]
scored.sort(key=lambda x: x["score"], reverse=True)

official_targets = []
for it in scored:
    if len(official_targets) >= MAX_OFFICIAL_PAGES:
        break
    u = it["url"]
    if it["score"] < 0.2:
        continue
    if robots_allows(rp_official, u):
        official_targets.append(u)

official_targets = list(dict.fromkeys(official_targets))
print(f"Official targets (robots-allowed, capped): {len(official_targets)}")

# ------------------------------------------------------------
# Build CSE targets (expanded query set, bilingual-friendly)
# ------------------------------------------------------------
queries = [
    f'"{startup_name}" {official_domain}',
    f'"{startup_name}" funding OR raised OR Series OR seed OR 資金調達',
    f'"{startup_name}" customer OR case study OR 導入事例 OR 事例',
    f'"{startup_name}" partnership OR 提携',
    f'"{startup_name}" pricing OR 料金',
    f'"{startup_name}" competitor OR alternative OR 競合',
    f'"{startup_name}" market OR TAM OR 市場',
    f'"{startup_name}" ARR OR revenue OR 売上',
    f'"{startup_name}" security OR SOC2 OR ISO27001 OR セキュリティ',
    f'"{startup_name}" hiring OR careers OR 採用 OR 募集',
]

cse_items = []
for q in queries:
    try:
        cse_items.extend(google_cse_search(q, num=5))
        time.sleep(0.2)
    except Exception as e:
        cse_items.append({"title": None, "link": None, "snippet": f"CSE error: {e}", "displayLink": None})

# Dedup + cap
seen = set()
cse_links = []
for it in cse_items:
    link = it.get("link")
    if link and link not in seen and link.startswith(("http://", "https://")):
        seen.add(link)
        cse_links.append(link)
cse_links = cse_links[:MAX_CSE_ARTICLES]
print(f"CSE targets (capped): {len(cse_links)}")

# ------------------------------------------------------------
# Robots cache for third-party domains
# ------------------------------------------------------------
_robot_cache: dict[str, RobotFileParser | None] = {}

def get_rp_for_url(url: str) -> RobotFileParser | None:
    dom = get_domain(url)
    if dom in _robot_cache:
        return _robot_cache[dom]
    rp = build_robot_parser(url)
    _robot_cache[dom] = rp
    return rp

# ------------------------------------------------------------
# Fetch + store raw pages (JSONL)
# ------------------------------------------------------------
RAW_PATH = ART_DIR / f"raw_pages_{run_id}.jsonl"

def append_jsonl(path: Path, obj: dict):
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

RAW_PATH.write_text("", encoding="utf-8")

def collect_pages(urls: list[str], source_type: str):
    for u in urls:
        u = normalize_url(u)
        if not u or is_asset_url(u):
            continue

        rp = rp_official if source_type == "official" else get_rp_for_url(u)
        allowed = robots_allows(rp, u)

        if not allowed:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": False,
                "text": "",
                "notes": "Skipped due to robots.txt (or conservative fallback).",
            })
            continue

        try:
            code, html = fetch_html(u)
            time.sleep(SLEEP_SEC)
            text = html_to_text(html) if html else ""
            text = text[:MAX_TEXT_CHARS_STORE]

            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": code,
                "robots_allowed": True,
                "text": text,
                "text_char_len": len(text),
            })
        except Exception as e:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": True,
                "text": "",
                "notes": f"Fetch error: {str(e)}",
            })

collect_pages(official_targets, "official")
collect_pages(cse_links, "cse")

print(f"✅ Raw pages saved: {RAW_PATH.as_posix()}")

# ------------------------------------------------------------
# Load raw pages + prepare shortlist for LLM (prioritize official)
# ------------------------------------------------------------
def load_raw_pages(path: Path) -> list[dict]:
    rows = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))
    return rows

raw_pages = load_raw_pages(RAW_PATH)

usable_pages = [
    r for r in raw_pages
    if r.get("robots_allowed")
    and isinstance(r.get("http_status"), int) and r["http_status"] < 400
    and (r.get("text") or "").strip()
]

official_usable = [r for r in usable_pages if r.get("source_type") == "official"]
cse_usable = [r for r in usable_pages if r.get("source_type") == "cse"]

# Shortlist: official first, then CSE
shortlist = (official_usable[:12] + cse_usable[:10])[:MAX_PAGES_TO_LLM]
print(f"Usable pages: {len(usable_pages)} | Shortlist to LLM: {len(shortlist)}")

# ------------------------------------------------------------
# OpenAI extraction (SDK v1+): summary row + evidence-linked claims
# ------------------------------------------------------------
def openai_extract_company_basics_and_claims(
    startup_name: str,
    official_website: str,
    pages: list[dict]
) -> dict:
    """
    Return JSON only. Includes:
    - company_summary (single object)
    - claims (list; each claim has evidence_url and confidence)
    """
    system = (
        "You are a research analyst assistant. "
        "Given raw text from multiple webpages about a startup, extract factual company basics. "
        "Prefer official sources when conflicts exist. "
        "Separate facts from hypotheses. "
        "For claims, ALWAYS attach an evidence_url from the provided pages; "
        "if you cannot support it, omit it. Return JSON only."
    )

    schema = {
        "company_summary": {
            "name": "string",
            "official_website": "string|null",
            "one_liner": "string|null",
            "business_description": "string|null",
            "products_services": "array of strings",
            "customer_segments": "array of strings",
            "customers_named": "array of strings",
            "use_cases": "array of strings",
            "industries_hiring_for": "array of strings",
            "target_market": "string|null",
            "geography_focus": "string|null",
            "competitors": "array of strings",
            "funding_signals": {
                "funding_stage_hint": "string|null",
                "investors_mentioned": "array of strings",
                "funding_amounts_mentioned": "array of strings",
                "funding_dates_mentioned": "array of strings"
            }
        },
        "claims": [
            {
                "category": "product|customer|competitor|market|funding|hiring|other",
                "claim_type": "fact|hypothesis",
                "claim": "string",
                "evidence_url": "string",
                "source_type": "official|cse",
                "confidence": "0..1"
            }
        ],
        "notes": {
            "conflicts_or_ambiguities": "array of strings",
            "data_gaps": "array of strings"
        },
        "confidence": {
            "overall": "0..1",
            "rationale": "string"
        }
    }

    bundle = {
        "startup_name": startup_name,
        "official_website": official_website,
        "pages": [
            {
                "url": p["url"],
                "source_type": p["source_type"],
                "domain": p.get("domain"),
                "text": (p.get("text") or "")[:MAX_TEXT_CHARS_PROMPT],
            } for p in pages
        ],
        "schema": schema
    }

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
        ],
        temperature=0.0,
    )

    content = resp.choices[0].message.content
    return json.loads(content)

extracted = openai_extract_company_basics_and_claims(startup_name, official_home, shortlist)

# Save full extraction JSON
EXTRACT_PATH = ART_DIR / f"company_extraction_{run_id}.json"
EXTRACT_PATH.write_text(json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8")

# ------------------------------------------------------------
# Build DataFrames
# ------------------------------------------------------------
summary = extracted.get("company_summary", {}) or {}
funding = summary.get("funding_signals", {}) if isinstance(summary.get("funding_signals"), dict) else {}

df_company = pd.DataFrame([{
    "run_id": run_id,
    "company_name": summary.get("name") or startup_name,
    "official_website": summary.get("official_website") or official_home,
    "one_liner": summary.get("one_liner"),
    "business_description": summary.get("business_description"),
    "products_services": summary.get("products_services", []),
    "customer_segments": summary.get("customer_segments", []),
    "customers_named": summary.get("customers_named", []),
    "use_cases": summary.get("use_cases", []),
    "industries_hiring_for": summary.get("industries_hiring_for", []),
    "target_market": summary.get("target_market"),
    "geography_focus": summary.get("geography_focus"),
    "competitors": summary.get("competitors", []),
    "funding_stage_hint": funding.get("funding_stage_hint"),
    "investors_mentioned": funding.get("investors_mentioned", []),
    "funding_amounts_mentioned": funding.get("funding_amounts_mentioned", []),
    "funding_dates_mentioned": funding.get("funding_dates_mentioned", []),
    "confidence_overall": (extracted.get("confidence", {}) or {}).get("overall"),
    "confidence_rationale": (extracted.get("confidence", {}) or {}).get("rationale"),
    "entity_source": entity_path.name,
}])

claims = extracted.get("claims", []) or []
df_claims = pd.DataFrame(claims)
if not df_claims.empty:
    df_claims.insert(0, "run_id", run_id)
    df_claims.insert(1, "company_name", df_company.loc[0, "company_name"])

# ------------------------------------------------------------
# Save artifacts
# ------------------------------------------------------------
COMPANY_CSV = ART_DIR / f"company_basics_{run_id}.csv"
CLAIMS_CSV = ART_DIR / f"company_claims_{run_id}.csv"

df_company.to_csv(COMPANY_CSV, index=False)
df_claims.to_csv(CLAIMS_CSV, index=False)

print("✅ Company basics (expanded) complete")
print(f"- Raw pages: {RAW_PATH.as_posix()}")
print(f"- Extraction JSON: {EXTRACT_PATH.as_posix()}")
print(f"- Company DF: {COMPANY_CSV.as_posix()}")
print(f"- Claims DF: {CLAIMS_CSV.as_posix()}")

# Show previews
display(df_company)
display(df_claims.head(20) if not df_claims.empty else df_claims)


Startup: Sakana AI
Official: https://sakana.ai/series-a
Entity file: entity_20260107_225628.json
Official targets (robots-allowed, capped): 30
CSE targets (capped): 25
✅ Raw pages saved: artifacts/meeting_deep_dive/raw_pages_20260107_225628.jsonl
Usable pages: 22 | Shortlist to LLM: 14
✅ Company basics (expanded) complete
- Raw pages: artifacts/meeting_deep_dive/raw_pages_20260107_225628.jsonl
- Extraction JSON: artifacts/meeting_deep_dive/company_extraction_20260107_225628.json
- Company DF: artifacts/meeting_deep_dive/company_basics_20260107_225628.csv
- Claims DF: artifacts/meeting_deep_dive/company_claims_20260107_225628.csv


Unnamed: 0,run_id,company_name,official_website,one_liner,business_description,products_services,customer_segments,customers_named,use_cases,industries_hiring_for,target_market,geography_focus,competitors,funding_stage_hint,investors_mentioned,funding_amounts_mentioned,funding_dates_mentioned,confidence_overall,confidence_rationale,entity_source
0,20260107_225628,Sakana AI,https://sakana.ai/series-a,"A new AI R&D company based in Tokyo, Japan, fo...",Sakana AI aims to develop transformative AI th...,"[AI Scientist, ShinkaEvolve, Evolutionary Mode...","[Enterprises, Finance, Manufacturing, Government]","[Mitsubishi UFJ Financial Group, Daiwa Securit...","[Automated scientific discovery, AI model opti...","[AI Research, Software Engineering, Cybersecur...",Japan,"Tokyo, Japan","[OpenAI, Google, Anthropic]",Series B,"[New Enterprise Associates, Khosla Ventures, L...","[$30M, $200M, $135M]","[January 16, 2024, September 04, 2024, Novembe...",1.0,The information is sourced from official compa...,entity_20260107_225628.json


Unnamed: 0,run_id,company_name,category,claim_type,claim,evidence_url,source_type,confidence
0,20260107_225628,Sakana AI,funding,fact,Sakana AI raised approximately $200M in its Se...,https://sakana.ai/series-a/,official,1.0
1,20260107_225628,Sakana AI,funding,fact,Sakana AI raised $135M in its Series B funding...,https://siliconangle.com/2025/11/17/sakana-ai-...,cse,1.0
2,20260107_225628,Sakana AI,product,fact,"Sakana AI developed the AI Scientist, a system...",https://sakana.ai/ai-scientist/,cse,1.0
3,20260107_225628,Sakana AI,product,fact,Sakana AI's ShinkaEvolve framework optimizes c...,https://sakana.ai/icfp-2025/,cse,1.0
4,20260107_225628,Sakana AI,market,fact,Sakana AI focuses on developing AI models tail...,https://siliconangle.com/2025/11/17/sakana-ai-...,cse,1.0


In [9]:
# ============================================================
# 4. Key People Extraction (expanded, evidence-linked)
# ============================================================
# Goal:
# - Identify key people at the startup:
#   (a) the meeting participant (from inputs), and
#   (b) other major members (founders, executives, leadership, board, etc.)
# - Use the same pattern as Company Basics:
#   official pages first + CSE enrichment, lightweight robots check,
#   store raw pages, then use OpenAI to extract + normalize.
#
# Outputs (saved under artifacts/meeting_deep_dive/):
# - people_raw_pages_<run_id>.jsonl
# - people_directory_<run_id>.csv        (people table)
# - people_extraction_<run_id>.json      (full model output)
#
# Notes:
# - This cell is conservative: it only includes people supported by evidence URLs.
# - It also tries to link the meeting participant to a matching person profile (if found).

import os
import re
import json
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone
from urllib.robotparser import RobotFileParser
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")
if not GOOGLE_API_KEY or not GOOGLE_CSE_CX:
    raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CSE_CX must be set.")

client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Load inputs + entity (from prior cells)
# ------------------------------------------------------------
def load_latest_json(pattern: str) -> dict:
    files = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError(f"No {pattern} found. Please run the previous cells.")
    return json.loads(files[0].read_text(encoding="utf-8"))

try:
    run_id = inputs["meta"]["run_id"]
    meeting_person_name = (inputs.get("meeting_person", {}).get("name") or "").strip()
    meeting_person_title = (inputs.get("meeting_person", {}).get("title") or "").strip()
except Exception:
    # fallback to latest inputs file
    latest_inputs = load_latest_json("inputs_*.json")
    run_id = latest_inputs.get("meta", {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    meeting_person_name = (latest_inputs.get("meeting_person", {}).get("name") or "").strip()
    meeting_person_title = (latest_inputs.get("meeting_person", {}).get("title") or "").strip()

entity = load_latest_json("entity_*.json")
startup_name = (entity.get("canonical_name") or "Unknown Startup").strip()
official_website = (entity.get("official_website") or "").strip()
if not official_website:
    raise ValueError("official_website is missing in entity. Run cell #2.")

# ------------------------------------------------------------
# Knobs
# ------------------------------------------------------------
USER_AGENT = "researchOSv2-bot/0.2 (+contact: internal-research)"
TIMEOUT = 25
SLEEP_SEC = 0.25

MAX_OFFICIAL_PAGES = 25
MAX_CSE_PAGES = 20
MAX_TEXT_CHARS_STORE = 60000
MAX_TEXT_CHARS_PROMPT = 9000
MAX_PAGES_TO_LLM = 18

PEOPLE_KEY_PATHS = [
    "/team", "/about", "/company", "/leadership", "/management",
    "/members", "/people", "/board", "/governance",
    "/careers", "/jobs", "/recruit",
    "/news", "/press", "/blog"
]

# ------------------------------------------------------------
# URL / domain / robots helpers
# ------------------------------------------------------------
def normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url, flags=re.I):
        url = "https://" + url
    p = urlparse(url)
    return p._replace(netloc=p.netloc.lower(), fragment="").geturl()

def get_domain(url: str) -> str:
    return re.sub(r"^www\.", "", urlparse(normalize_url(url)).netloc)

def same_domain(a: str, b: str) -> bool:
    da, db = get_domain(a), get_domain(b)
    return da == db or da.endswith("." + db) or db.endswith("." + da)

def is_asset_url(url: str) -> bool:
    path = (urlparse(url).path or "").lower()
    return bool(re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|zip|mp4|mov)$", path))

def build_robot_parser(base_url: str) -> RobotFileParser | None:
    rp = RobotFileParser()
    rp.set_url(urljoin(normalize_url(base_url), "/robots.txt"))
    try:
        rp.read()
        return rp
    except Exception:
        return None

def robots_allows(rp: RobotFileParser | None, url: str) -> bool:
    if rp is None:
        # conservative fallback
        p = (urlparse(url).path or "/").lower()
        return p in ["/", ""] or any(k in p for k in ["team", "about", "company", "leadership", "people", "board", "news", "press", "blog"])
    try:
        return rp.can_fetch(USER_AGENT, url)
    except Exception:
        return False

# ------------------------------------------------------------
# HTML extraction helpers
# ------------------------------------------------------------
def fetch_html(url: str) -> tuple[int, str]:
    r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT, allow_redirects=True)
    return r.status_code, (r.text if r.ok else "")

def html_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()
    node = soup.find("main") or soup.find("article") or (soup.body if soup.body else soup)
    text = node.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def extract_links(base_url: str, html: str) -> list[str]:
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("#") or href.lower().startswith("mailto:") or href.lower().startswith("tel:"):
            continue
        u = normalize_url(urljoin(base_url, href))
        if u.startswith(("http://", "https://")):
            out.append(u)
    # unique preserve order
    seen, uniq = set(), []
    for u in out:
        if u not in seen:
            seen.add(u)
            uniq.append(u)
    return uniq

def score_people_link(url: str) -> float:
    p = (urlparse(url).path or "/").lower()
    score = 0.0
    if p in ["/team", "/people", "/leadership", "/management", "/board"]:
        score += 3.0
    for k in ["team", "people", "leadership", "management", "board", "company", "about"]:
        if k in p:
            score += 1.0
    if is_asset_url(url):
        score -= 4.0
    if p.count("/") >= 4:
        score -= 0.4
    return score

# ------------------------------------------------------------
# Google CSE
# ------------------------------------------------------------
def google_cse_search(query: str, num: int = 5) -> list[dict]:
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": GOOGLE_API_KEY, "cx": GOOGLE_CSE_CX, "q": query, "num": min(max(num, 1), 10)}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    items = data.get("items", []) or []
    return [{
        "title": it.get("title"),
        "link": normalize_url(it.get("link") or ""),
        "snippet": it.get("snippet"),
        "displayLink": it.get("displayLink"),
    } for it in items]

# ------------------------------------------------------------
# Build targets (official + CSE)
# ------------------------------------------------------------
official_home = normalize_url(official_website)
rp_official = build_robot_parser(official_home)

# homepage links
status, html = fetch_html(official_home)
time.sleep(SLEEP_SEC)
if status >= 400 or not html:
    raise RuntimeError(f"Failed to fetch official homepage: HTTP {status}")

home_links = extract_links(official_home, html)
internal_links = [l for l in home_links if same_domain(l, official_home) and not is_asset_url(l)]

# explicit paths
explicit = []
for p in PEOPLE_KEY_PATHS:
    u = normalize_url(urljoin(official_home, p))
    if robots_allows(rp_official, u) and not is_asset_url(u):
        explicit.append(u)

official_candidates = list(dict.fromkeys([official_home] + explicit + internal_links))
scored = [{"url": u, "score": score_people_link(u)} for u in official_candidates]
scored.sort(key=lambda x: x["score"], reverse=True)

official_targets = []
for it in scored:
    if len(official_targets) >= MAX_OFFICIAL_PAGES:
        break
    if it["score"] < 0.2:
        continue
    u = it["url"]
    if robots_allows(rp_official, u):
        official_targets.append(u)

official_targets = list(dict.fromkeys(official_targets))
print(f"Official people targets: {len(official_targets)}")

# CSE queries (people-focused)
official_domain = get_domain(official_home)
queries = [
    f'"{startup_name}" founder',
    f'"{startup_name}" co-founder',
    f'"{startup_name}" CEO',
    f'"{startup_name}" leadership team',
    f'"{startup_name}" management team',
    f'"{startup_name}" board member',
    f'"{startup_name}" {meeting_person_name}' if meeting_person_name else None,
    f'"{startup_name}" 役員',
    f'"{startup_name}" 創業者',
    f'"{startup_name}" 経営陣',
    f'"{startup_name}" {official_domain}',
]
queries = [q for q in queries if q]

cse_items = []
for q in queries:
    try:
        cse_items.extend(google_cse_search(q, num=5))
        time.sleep(0.2)
    except Exception:
        pass

# Dedup and cap
seen = set()
cse_links = []
for it in cse_items:
    link = it.get("link")
    if link and link not in seen and link.startswith(("http://", "https://")):
        seen.add(link)
        cse_links.append(link)
cse_links = cse_links[:MAX_CSE_PAGES]
print(f"CSE people targets: {len(cse_links)}")

# Robots cache for third-party domains
_robot_cache: dict[str, RobotFileParser | None] = {}

def get_rp_for_url(url: str) -> RobotFileParser | None:
    dom = get_domain(url)
    if dom in _robot_cache:
        return _robot_cache[dom]
    rp = build_robot_parser(url)
    _robot_cache[dom] = rp
    return rp

# ------------------------------------------------------------
# Fetch + store raw pages (JSONL)
# ------------------------------------------------------------
RAW_PATH = ART_DIR / f"people_raw_pages_{run_id}.jsonl"
RAW_PATH.write_text("", encoding="utf-8")

def append_jsonl(path: Path, obj: dict):
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def collect_pages(urls: list[str], source_type: str):
    for u in urls:
        u = normalize_url(u)
        if not u or is_asset_url(u):
            continue

        rp = rp_official if source_type == "official" else get_rp_for_url(u)
        allowed = robots_allows(rp, u)

        if not allowed:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": False,
                "text": "",
                "notes": "Skipped due to robots.txt (or conservative fallback).",
            })
            continue

        try:
            code, html = fetch_html(u)
            time.sleep(SLEEP_SEC)
            text = html_to_text(html) if html else ""
            text = text[:MAX_TEXT_CHARS_STORE]

            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": code,
                "robots_allowed": True,
                "text": text,
                "text_char_len": len(text),
            })
        except Exception as e:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": True,
                "text": "",
                "notes": f"Fetch error: {str(e)}",
            })

collect_pages(official_targets, "official")
collect_pages(cse_links, "cse")

print(f"✅ People raw pages saved: {RAW_PATH.as_posix()}")

# ------------------------------------------------------------
# Load + shortlist to LLM (official first)
# ------------------------------------------------------------
def load_raw_pages(path: Path) -> list[dict]:
    rows = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))
    return rows

raw_pages = load_raw_pages(RAW_PATH)

usable_pages = [
    r for r in raw_pages
    if r.get("robots_allowed")
    and isinstance(r.get("http_status"), int) and r["http_status"] < 400
    and (r.get("text") or "").strip()
]

official_usable = [r for r in usable_pages if r.get("source_type") == "official"]
cse_usable = [r for r in usable_pages if r.get("source_type") == "cse"]

shortlist = (official_usable[:12] + cse_usable[:10])[:MAX_PAGES_TO_LLM]
print(f"Usable pages: {len(usable_pages)} | Shortlist to LLM: {len(shortlist)}")

# ------------------------------------------------------------
# OpenAI extraction: people directory + meeting-person match
# ------------------------------------------------------------
# ------------------------------------------------------------
# OpenAI extraction (robust JSON handling)
# ------------------------------------------------------------
import json
import re

def _extract_json_object(text: str) -> str:
    """
    Best-effort extraction of a JSON object from arbitrary text.
    Handles:
    - ```json ... ```
    - leading/trailing commentary
    - extra whitespace
    """
    if not text:
        return ""
    t = text.strip()

    # Remove fenced code blocks (```json ... ```)
    fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if fence:
        return fence.group(1).strip()

    # Otherwise: find the first "{" and last "}" and slice
    start = t.find("{")
    end = t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()

    return ""


def openai_extract_people_directory(
    startup_name: str,
    official_website: str,
    meeting_person_name: str,
    meeting_person_title: str,
    pages: list[dict]
) -> dict:
    """
    Robust extractor:
    - First try "strict JSON mode" via response_format (if supported).
    - If that still fails, attempt to extract JSON from the text.
    - If still failing, raise a clear error and print the raw output.
    """
    system = (
        "You are a research analyst assistant focused on extracting people/leadership information. "
        "From the provided webpage text, identify key people at the startup "
        "(founders, executives, leadership, board). "
        "Be conservative: only include a person if supported by an evidence_url from the provided pages. "
        "Also try to match the meeting participant to one of the extracted people. "
        "Return JSON only."
    )

    schema = {
        "company": {"name": "string", "official_website": "string|null"},
        "meeting_participant": {
            "input_name": "string|null",
            "input_title": "string|null",
            "matched_person_id": "string|null",
            "match_confidence": "0..1",
            "match_rationale": "string"
        },
        "people": [
            {
                "person_id": "string (stable id, e.g., slug)",
                "full_name": "string",
                "role_title": "string|null",
                "category": "founder|executive|leadership|board|other",
                "bio_summary": "string|null",
                "signals": "array of strings (e.g., 'ex-Google', 'PhD', etc.)",
                "links": {
                    "company_profile": "string|null",
                    "linkedin": "string|null",
                    "x": "string|null",
                    "other": "array of strings"
                },
                "evidence": {
                    "evidence_url": "string",
                    "extracted_points": "array of strings"
                },
                "confidence": "0..1"
            }
        ],
        "notes": {
            "name_collisions": "array of strings",
            "data_gaps": "array of strings"
        }
    }

    bundle = {
        "startup_name": startup_name,
        "official_website": official_website,
        "meeting_participant_input": {
            "name": meeting_person_name or None,
            "title": meeting_person_title or None
        },
        "pages": [
            {
                "url": p["url"],
                "source_type": p["source_type"],
                "domain": p.get("domain"),
                "text": (p.get("text") or "")[:MAX_TEXT_CHARS_PROMPT],
            } for p in pages
        ],
        "schema": schema
    }

    # 1) Try strict JSON mode (preferred)
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
            response_format={"type": "json_object"},  # <-- key change
        )
        content = (resp.choices[0].message.content or "").strip()
        if content:
            return json.loads(content)
    except Exception as e:
        # We'll fallback to lenient parsing below
        last_error = e

    # 2) Fallback: request again with explicit "no markdown" + then extract JSON
    resp2 = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system + " Output must be a single JSON object. No markdown. No commentary."},
            {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
        ],
        temperature=0.0,
    )
    raw = (resp2.choices[0].message.content or "")

    candidate = _extract_json_object(raw)
    if candidate:
        try:
            return json.loads(candidate)
        except Exception as e:
            # Debug-friendly error
            print("---- Raw model output (truncated) ----")
            print(raw[:2000])
            print("---- Extracted JSON candidate (truncated) ----")
            print(candidate[:2000])
            raise RuntimeError(f"Failed to parse extracted JSON candidate: {e}") from e

    # 3) If we still cannot recover, raise a clear error
    print("---- Raw model output (truncated) ----")
    print(raw[:2000])
    raise RuntimeError(
        "Model did not return a JSON object. "
        "Consider reducing pages, reducing MAX_TEXT_CHARS_PROMPT, or switching the model."
    )


extracted = openai_extract_people_directory(
    startup_name=startup_name,
    official_website=official_home,
    meeting_person_name=meeting_person_name,
    meeting_person_title=meeting_person_title,
    pages=shortlist
)

# Save extraction JSON
EXTRACT_PATH = ART_DIR / f"people_extraction_{run_id}.json"
EXTRACT_PATH.write_text(json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8")

# Build people DF
people = extracted.get("people", []) or []
df_people = pd.DataFrame(people)

if not df_people.empty:
    # flatten some nested fields for CSV friendliness
    def _get(d, *keys):
        cur = d
        for k in keys:
            if not isinstance(cur, dict):
                return None
            cur = cur.get(k)
        return cur

    df_people["run_id"] = run_id
    df_people["company_name"] = startup_name
    df_people["evidence_url"] = df_people["evidence"].apply(lambda x: _get(x, "evidence_url"))
    df_people["evidence_points"] = df_people["evidence"].apply(lambda x: _get(x, "extracted_points"))
    df_people["company_profile_url"] = df_people["links"].apply(lambda x: _get(x, "company_profile"))
    df_people["linkedin_url"] = df_people["links"].apply(lambda x: _get(x, "linkedin"))
    df_people["x_url"] = df_people["links"].apply(lambda x: _get(x, "x"))
    df_people["other_links"] = df_people["links"].apply(lambda x: _get(x, "other"))

    # keep a readable column order
    cols = [
        "run_id", "company_name",
        "person_id", "full_name", "role_title", "category",
        "bio_summary", "signals",
        "company_profile_url", "linkedin_url", "x_url", "other_links",
        "evidence_url", "evidence_points",
        "confidence"
    ]
    cols = [c for c in cols if c in df_people.columns]
    df_people = df_people[cols]

# Save people directory
PEOPLE_CSV = ART_DIR / f"people_directory_{run_id}.csv"
df_people.to_csv(PEOPLE_CSV, index=False)

# Meeting participant matching info (print)
mp = extracted.get("meeting_participant", {}) or {}
print("✅ Key People Extraction complete")
print(f"- Raw pages: {RAW_PATH.as_posix()}")
print(f"- Extraction JSON: {EXTRACT_PATH.as_posix()}")
print(f"- People DF: {PEOPLE_CSV.as_posix()}")
print()
print("Meeting participant match:")
print(f"- input: {mp.get('input_name')} / {mp.get('input_title')}")
print(f"- matched_person_id: {mp.get('matched_person_id')}")
print(f"- confidence: {mp.get('match_confidence')}")
print(f"- rationale: {mp.get('match_rationale')}")

display(df_people)


Official people targets: 7
CSE people targets: 20
✅ People raw pages saved: artifacts/meeting_deep_dive/people_raw_pages_20260107_225628.jsonl
Usable pages: 12 | Shortlist to LLM: 10
✅ Key People Extraction complete
- Raw pages: artifacts/meeting_deep_dive/people_raw_pages_20260107_225628.jsonl
- Extraction JSON: artifacts/meeting_deep_dive/people_extraction_20260107_225628.json
- People DF: artifacts/meeting_deep_dive/people_directory_20260107_225628.csv

Meeting participant match:
- input: David Ha / 代表取締役
- matched_person_id: david-ha
- confidence: 1
- rationale: David Ha is identified as the CEO and co-founder of Sakana AI, matching the title of 代表取締役.


Unnamed: 0,run_id,company_name,person_id,full_name,role_title,category,bio_summary,signals,company_profile_url,linkedin_url,x_url,other_links,evidence_url,evidence_points,confidence
0,20260107_225628,Sakana AI,david-ha,David Ha,CEO,founder,David Ha is the co-founder and CEO of Sakana A...,"[ex-Google, PhD]",https://sakana.ai/series-a,,,[],https://time.com/collections/time100-ai-2025/7...,"[Co-founder and CEO of Sakana AI, Developed AI...",1
1,20260107_225628,Sakana AI,llion-jones,Llion Jones,CTO,founder,Llion Jones is the co-founder and CTO of Sakan...,"[ex-Google, PhD]",https://sakana.ai/series-a,,,[],https://sakana.ai/seed-round/,"[Co-founder and CTO of Sakana AI, Co-author of...",1
2,20260107_225628,Sakana AI,ren-ito,Ren Ito,COO,founder,Ren Ito is the co-founder and COO of Sakana AI...,"[ex-Mercari, diplomat]",https://sakana.ai/mufg-bank/,,,[],https://sakana.ai/mufg-bank/,"[Co-founder and COO of Sakana AI, Former CEO o...",1


In [13]:
# ============================================================
# 5. Meeting Person Deep Dive (Facts → Hypotheses)
# ============================================================
# Goal:
# - Deep dive on:
#   (a) the meeting participant, and
#   (b) other major members (founders/executives) if needed
# - Collect: background/career, public posts, interviews, articles, talks, and notable quotes.
# - Use the same pattern:
#   official first + CSE enrichment, robots check, store raw pages,
#   then OpenAI normalizes into:
#     1) PeopleProfile table (one row per person)
#     2) ContentIndex table (one row per content item, evidence URL + type + confidence)
#
# Outputs:
# - meeting_deep_dive_raw_pages_<run_id>.jsonl
# - people_profiles_<run_id>.csv
# - people_content_index_<run_id>.csv
# - meeting_person_deep_dive_<run_id>.json
#
# Notes:
# - Conservative: only include claims supported by evidence URLs.
# - The meeting participant is the priority; others are "optional add-ons."
# - If LinkedIn is blocked by robots / login wall, we keep only metadata from snippets.

import os
import re
import json
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone
from urllib.robotparser import RobotFileParser
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")
if not GOOGLE_API_KEY or not GOOGLE_CSE_CX:
    raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CSE_CX must be set.")

client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Load inputs + entity + people directory
# ------------------------------------------------------------
def load_latest_json(pattern: str) -> tuple[dict, Path]:
    files = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError(f"No {pattern} found. Please run previous cells.")
    p = files[0]
    return json.loads(p.read_text(encoding="utf-8")), p

# Inputs (meeting person)
try:
    run_id = inputs["meta"]["run_id"]
    meeting_person_name = (inputs.get("meeting_person", {}).get("name") or "").strip()
    meeting_person_title = (inputs.get("meeting_person", {}).get("title") or "").strip()
except Exception:
    inp, _ = load_latest_json("inputs_*.json")
    run_id = inp.get("meta", {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    meeting_person_name = (inp.get("meeting_person", {}).get("name") or "").strip()
    meeting_person_title = (inp.get("meeting_person", {}).get("title") or "").strip()

entity, _ = load_latest_json("entity_*.json")
startup_name = (entity.get("canonical_name") or "Unknown Startup").strip()
official_website = (entity.get("official_website") or "").strip()
if not official_website:
    raise ValueError("official_website is missing in entity. Run cell #2.")

# People directory (from cell #4), optional but recommended
people_df_path = sorted(ART_DIR.glob("people_directory_*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
df_people = pd.read_csv(people_df_path[0]) if people_df_path else pd.DataFrame()

# ------------------------------------------------------------
# Knobs
# ------------------------------------------------------------
USER_AGENT = "researchOSv2-bot/0.3 (+contact: internal-research)"
TIMEOUT = 25
SLEEP_SEC = 0.25

MAX_PERSONS = 6                # meeting person + top executives
MAX_CSE_LINKS_PER_PERSON = 18
MAX_TEXT_CHARS_STORE = 60000
MAX_TEXT_CHARS_PROMPT = 8500
MAX_PAGES_TO_LLM = 18

# ------------------------------------------------------------
# URL / domain / robots helpers (same as previous cells)
# ------------------------------------------------------------
def normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url, flags=re.I):
        url = "https://" + url
    p = urlparse(url)
    return p._replace(netloc=p.netloc.lower(), fragment="").geturl()

def get_domain(url: str) -> str:
    return re.sub(r"^www\.", "", urlparse(normalize_url(url)).netloc)

def is_asset_url(url: str) -> bool:
    path = (urlparse(url).path or "").lower()
    return bool(re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|zip|mp4|mov)$", path))

def build_robot_parser(base_url: str) -> RobotFileParser | None:
    rp = RobotFileParser()
    rp.set_url(urljoin(normalize_url(base_url), "/robots.txt"))
    try:
        rp.read()
        return rp
    except Exception:
        return None

def robots_allows(rp: RobotFileParser | None, url: str) -> bool:
    if rp is None:
        # conservative fallback
        p = (urlparse(url).path or "/").lower()
        return p in ["/", ""] or any(k in p for k in ["about", "company", "team", "people", "blog", "news", "press"])
    try:
        return rp.can_fetch(USER_AGENT, url)
    except Exception:
        return False

# robots cache (3rd-party)
_robot_cache: dict[str, RobotFileParser | None] = {}

def get_rp_for_url(url: str) -> RobotFileParser | None:
    dom = get_domain(url)
    if dom in _robot_cache:
        return _robot_cache[dom]
    rp = build_robot_parser(url)
    _robot_cache[dom] = rp
    return rp

# ------------------------------------------------------------
# Fetch + text extraction
# ------------------------------------------------------------
def fetch_html(url: str) -> tuple[int, str]:
    r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT, allow_redirects=True)
    return r.status_code, (r.text if r.ok else "")

def html_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()
    node = soup.find("main") or soup.find("article") or (soup.body if soup.body else soup)
    text = node.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

# ------------------------------------------------------------
# Google CSE
# ------------------------------------------------------------
def google_cse_search(query: str, num: int = 5) -> list[dict]:
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": GOOGLE_API_KEY, "cx": GOOGLE_CSE_CX, "q": query, "num": min(max(num, 1), 10)}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    items = data.get("items", []) or []
    out = []
    for it in items:
        out.append({
            "title": it.get("title"),
            "link": normalize_url(it.get("link") or ""),
            "snippet": it.get("snippet"),
            "displayLink": it.get("displayLink"),
        })
    return out

# ------------------------------------------------------------
# Select target people (meeting person first, then top members)
# ------------------------------------------------------------
targets = []

if meeting_person_name:
    targets.append({"person_id": "meeting_person", "full_name": meeting_person_name, "role_title": meeting_person_title or None})

# Add a few key executives from df_people (if available)
if not df_people.empty:
    # prefer founders/executives/leadership, higher confidence if present
    def _cat_rank(c):
        return {"founder": 0, "executive": 1, "leadership": 2, "board": 3}.get(str(c), 9)

    # if df has confidence column, use it
    if "confidence" in df_people.columns:
        ranked = df_people.copy()
        ranked["cat_rank"] = ranked["category"].apply(_cat_rank) if "category" in ranked.columns else 9
        ranked["conf"] = pd.to_numeric(ranked["confidence"], errors="coerce").fillna(0.0)
        ranked = ranked.sort_values(["cat_rank", "conf"], ascending=[True, False])
    else:
        ranked = df_people

    for _, r in ranked.head(MAX_PERSONS - len(targets)).iterrows():
        targets.append({
            "person_id": str(r.get("person_id") or r.get("full_name") or f"person_{len(targets)}"),
            "full_name": str(r.get("full_name") or "").strip(),
            "role_title": (str(r.get("role_title")) if pd.notna(r.get("role_title")) else None),
        })

# Dedup by name
seen_names = set()
dedup = []
for t in targets:
    name = (t.get("full_name") or "").strip().lower()
    if name and name not in seen_names:
        seen_names.add(name)
        dedup.append(t)
targets = dedup[:MAX_PERSONS]

print("Target people:")
for t in targets:
    print("-", t["full_name"], "|", t.get("role_title"))

# ------------------------------------------------------------
# Build CSE queries per person to find profiles, posts, interviews, articles
# ------------------------------------------------------------
def build_person_queries(startup: str, person: str) -> list[str]:
    # bilingual-friendly; keep it practical
    return [
        f'"{person}" "{startup}"',
        f'"{person}" "{startup}" interview OR インタビュー',
        f'"{person}" "{startup}" podcast OR 登壇 OR talk OR keynote',
        f'"{person}" "{startup}" LinkedIn',
        f'"{person}" "{startup}" X OR Twitter',
        f'"{person}" "{startup}" note.com OR Medium',
        f'"{person}" "{startup}" press release OR プレスリリース',
        f'"{person}" "{startup}" funding OR 資金調達',
    ]

# ------------------------------------------------------------
# Fetch + store raw pages for deep dive
# ------------------------------------------------------------
RAW_PATH = ART_DIR / f"meeting_deep_dive_raw_pages_{run_id}.jsonl"
RAW_PATH.write_text("", encoding="utf-8")

def append_jsonl(path: Path, obj: dict):
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def collect_url(url: str, source_type: str, person_name: str, query: str | None = None):
    url = normalize_url(url)
    if not url or is_asset_url(url):
        return

    rp = get_rp_for_url(url)
    allowed = robots_allows(rp, url)

    if not allowed:
        append_jsonl(RAW_PATH, {
            "url": url,
            "source_type": source_type,   # cse
            "person_name": person_name,
            "query": query,
            "domain": get_domain(url),
            "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
            "http_status": None,
            "robots_allowed": False,
            "text": "",
            "notes": "Skipped due to robots.txt (or conservative fallback).",
        })
        return

    try:
        code, html = fetch_html(url)
        time.sleep(SLEEP_SEC)
        text = html_to_text(html) if html else ""
        text = text[:MAX_TEXT_CHARS_STORE]

        append_jsonl(RAW_PATH, {
            "url": url,
            "source_type": source_type,
            "person_name": person_name,
            "query": query,
            "domain": get_domain(url),
            "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
            "http_status": code,
            "robots_allowed": True,
            "text": text,
            "text_char_len": len(text),
        })
    except Exception as e:
        append_jsonl(RAW_PATH, {
            "url": url,
            "source_type": source_type,
            "person_name": person_name,
            "query": query,
            "domain": get_domain(url),
            "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
            "http_status": None,
            "robots_allowed": True,
            "text": "",
            "notes": f"Fetch error: {str(e)}",
        })

# Gather CSE targets
all_links = []
for t in targets:
    person = t["full_name"]
    person_queries = build_person_queries(startup_name, person)
    person_links = []

    for q in person_queries:
        try:
            items = google_cse_search(q, num=5)
            for it in items:
                link = it.get("link")
                if link:
                    person_links.append((link, q))
            time.sleep(0.2)
        except Exception:
            continue

    # dedup + cap per person
    seen = set()
    for link, q in person_links:
        n = normalize_url(link)
        if n and n not in seen:
            seen.add(n)
            all_links.append({"url": n, "query": q, "person_name": person})

# Per-person cap
per_person_count = {}
filtered = []
for x in all_links:
    k = x["person_name"]
    per_person_count.setdefault(k, 0)
    if per_person_count[k] < MAX_CSE_LINKS_PER_PERSON:
        filtered.append(x)
        per_person_count[k] += 1

print("Fetching pages (robots-respecting)...")
for x in filtered:
    collect_url(x["url"], "cse", x["person_name"], x["query"])

print(f"✅ Deep dive raw pages saved: {RAW_PATH.as_posix()}")

# ------------------------------------------------------------
# Load raw pages + shortlist to LLM (prioritize meeting person)
# ------------------------------------------------------------
def load_raw_pages(path: Path) -> list[dict]:
    rows = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))
    return rows

raw_pages = load_raw_pages(RAW_PATH)
usable = [
    r for r in raw_pages
    if r.get("robots_allowed")
    and isinstance(r.get("http_status"), int) and r["http_status"] < 400
    and (r.get("text") or "").strip()
]

# prioritize meeting person pages in the prompt
def _priority(r):
    is_meeting = (meeting_person_name or "").strip().lower() == (r.get("person_name") or "").strip().lower()
    # prefer interview-ish URLs
    p = (urlparse(r["url"]).path or "").lower()
    bonus = 0
    for k in ["interview", "podcast", "talk", "note", "blog", "press", "media"]:
        if k in p:
            bonus += 1
    return (1 if is_meeting else 0, bonus, -len(r.get("text") or ""))

def score_profile_like(url: str) -> int:
    p = (urlparse(url).path or "").lower()
    keys = ["speaker", "speakers", "bio", "profile", "team", "people", "leadership", "about", "faculty", "news"]
    return sum(k in p for k in keys)

usable_sorted = sorted(
    usable,
    key=lambda r: (score_profile_like(r["url"]), (r.get("person_name") or "").lower() == (meeting_person_name or "").lower(), len(r.get("text") or "")),
    reverse=True
)
shortlist = usable_sorted[:MAX_PAGES_TO_LLM]

print(f"Usable pages: {len(usable)} | Shortlist to LLM: {len(shortlist)}")

# ------------------------------------------------------------
# OpenAI: Deep dive with "strict + loose" outputs (anti-empty)
# ------------------------------------------------------------
import json
import re

def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if fence:
        return fence.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

def openai_deep_dive_people_and_content(
    startup_name: str,
    official_website: str,
    meeting_person_name: str,
    targets: list[dict],
    pages: list[dict]
) -> dict:
    system = (
        "You are a research analyst assistant. "
        "Build (1) an evidence-linked content index and (2) best-effort people profiles "
        "for the meeting participant and key company members, from the provided webpage text.\n\n"
        "CRITICAL RULES:\n"
        "- Always prioritize producing a non-empty content_index if there is any relevant material.\n"
        "- For strict profiles, include only items clearly supported by evidence URLs.\n"
        "- If strict evidence is insufficient, still produce candidate profiles in people_profiles_loose "
        "with lower confidence and clear rationale.\n"
        "- Do not invent education/employment. Only include what is in the provided texts.\n"
        "- Return a single JSON object only (no markdown, no commentary)."
    )

    schema = {
        "people_profiles_strict": [
            {
                "person_id": "string",
                "full_name": "string",
                "role_title": "string|null",
                "current_company": "string|null",
                "bio_summary": "string|null",
                "career_facts": "array of strings",
                "public_presence_urls": "array of strings",
                "confidence": "0..1",
                "evidence_urls": "array of strings"
            }
        ],
        "people_profiles_loose": [
            {
                "person_id": "string",
                "full_name": "string",
                "role_title": "string|null",
                "candidate_summary": "string (best-effort, may be incomplete)",
                "why_this_is_likely_the_same_person": "string",
                "confidence": "0..1",
                "supporting_urls": "array of strings"
            }
        ],
        "content_index": [
            {
                "person_name": "string",
                "content_type": "profile|interview|article|podcast|talk|press|post|other",
                "title": "string|null",
                "publisher_or_platform": "string|null",
                "date": "string|null",
                "url": "string",
                "what_it_contains": "string (short)",
                "notable_points": "array of strings",
                "relevance": "0..1",
                "confidence": "0..1"
            }
        ],
        "meeting_person_insights": {
            "person_name": "string|null",
            "facts": "array of strings",
            "hypotheses": "array of strings",
            "open_questions": "array of strings"
        },
        "notes": {
            "name_collisions": "array of strings",
            "data_gaps": "array of strings",
            "conflicts": "array of strings"
        }
    }

    bundle = {
        "startup_name": startup_name,
        "official_website": official_website,
        "meeting_person_name": meeting_person_name or None,
        "target_people": targets,
        "pages": [
            {
                "url": p["url"],
                "person_name": p.get("person_name"),
                "source_type": p.get("source_type"),
                "domain": p.get("domain"),
                "text": (p.get("text") or "")[:MAX_TEXT_CHARS_PROMPT],
            } for p in pages
        ],
        "schema": schema,
        "extraction_priorities": [
            "First, produce content_index rows for each target person (at least 1 if possible).",
            "Then, attempt people_profiles_strict if the text contains biography-like facts.",
            "If strict is empty, populate people_profiles_loose using best-effort matching and supporting_urls."
        ]
    }

    # Prefer strict JSON mode if supported
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
            response_format={"type": "json_object"},
        )
        return json.loads((resp.choices[0].message.content or "").strip())
    except Exception:
        resp2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
        )
        raw = resp2.choices[0].message.content or ""
        candidate = _extract_json_object(raw)
        if candidate:
            return json.loads(candidate)
        raise RuntimeError("Model did not return a JSON object.")


deep_dive = openai_deep_dive_people_and_content(
    startup_name=startup_name,
    official_website=official_website,
    meeting_person_name=meeting_person_name,
    targets=targets,
    pages=shortlist
)

# ------------------------------------------------------------
# Save JSON + DataFrames
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"meeting_person_deep_dive_{run_id}.json"
JSON_PATH.write_text(json.dumps(deep_dive, ensure_ascii=False, indent=2), encoding="utf-8")

profiles = deep_dive.get("people_profiles", []) or []
content_index = deep_dive.get("content_index", []) or []

df_profiles = pd.DataFrame(profiles)
df_content = pd.DataFrame(content_index)

# Light post-processing for CSV friendliness
if not df_profiles.empty:
    df_profiles.insert(0, "run_id", run_id)
    df_profiles.insert(1, "startup_name", startup_name)

if not df_content.empty:
    df_content.insert(0, "run_id", run_id)
    df_content.insert(1, "startup_name", startup_name)

PROFILES_CSV = ART_DIR / f"people_profiles_{run_id}.csv"
CONTENT_CSV = ART_DIR / f"people_content_index_{run_id}.csv"

df_profiles.to_csv(PROFILES_CSV, index=False)
df_content.to_csv(CONTENT_CSV, index=False)

print("✅ Meeting Person Deep Dive complete")
print(f"- Raw pages: {RAW_PATH.as_posix()}")
print(f"- JSON: {JSON_PATH.as_posix()}")
print(f"- Profiles CSV: {PROFILES_CSV.as_posix()}")
print(f"- Content CSV: {CONTENT_CSV.as_posix()}")

display(df_profiles)
display(df_content.head(30) if not df_content.empty else df_content)


Target people:
- David Ha | 代表取締役
- Llion Jones | CTO
- Ren Ito | COO
Fetching pages (robots-respecting)...
✅ Deep dive raw pages saved: artifacts/meeting_deep_dive/meeting_deep_dive_raw_pages_20260107_225628.jsonl
Usable pages: 21 | Shortlist to LLM: 18
✅ Meeting Person Deep Dive complete
- Raw pages: artifacts/meeting_deep_dive/meeting_deep_dive_raw_pages_20260107_225628.jsonl
- JSON: artifacts/meeting_deep_dive/meeting_person_deep_dive_20260107_225628.json
- Profiles CSV: artifacts/meeting_deep_dive/people_profiles_20260107_225628.csv
- Content CSV: artifacts/meeting_deep_dive/people_content_index_20260107_225628.csv


Unnamed: 0,run_id,startup_name,person_name,content_type,title,publisher_or_platform,date,url,what_it_contains,notable_points,relevance,confidence
0,20260107_225628,Sakana AI,David Ha,profile,Sakana AI CEO,Sakana AI,,https://sakana.ai/seed-round/,Overview of David Ha's role and background.,"[Co-founder and CEO of Sakana AI., Former mana...",1,1
1,20260107_225628,Sakana AI,Llion Jones,profile,CTO of Sakana AI,Sakana AI,,https://sakana.ai/seed-round/,Overview of Llion Jones's role and background.,"[Co-founder and CTO of Sakana AI., Formerly wo...",1,1
2,20260107_225628,Sakana AI,Ren Ito,profile,COO of Sakana AI,Sakana AI,,https://sakana.ai/seed-round/,Overview of Ren Ito's role and background.,"[Co-founder and COO of Sakana AI., Former exec...",1,1
3,20260107_225628,Sakana AI,David Ha,interview,Research Retrospectives: An interview with Dav...,yeschat.ai,2024-03-05,https://www.yeschat.ai/blog-Research-Retrospec...,Discussion of David Ha's journey and insights ...,"[Transitioned from finance to AI research., Em...",1,1
4,20260107_225628,Sakana AI,Ren Ito,article,MUFG enters multiyear AI partnership with Saka...,retailbankerinternational.com,2025-05-20,https://www.retailbankerinternational.com/news...,Details on the partnership between MUFG and Sa...,"[Ren Ito named AI Advisor to MUFG., Focus on i...",1,1
5,20260107_225628,Sakana AI,David Ha,article,Top Japan startup Sakana AI touts nature-inspi...,Japan Times,2025-09-10,https://www.japantimes.co.jp/business/2025/09/...,Interview discussing Sakana AI's approach and ...,"[Sakana AI became Japan's fastest unicorn., Fo...",1,1


In [14]:
# ============================================================
# 6. Business & Product Understanding (Facts-first + Content Index)
# ============================================================
# Goal:
# - Build an evidence-linked understanding of the startup's business & product:
#   - What the product is, how it works (as described), key differentiators
#   - Use cases, customer segments, pricing/packaging signals (if any)
#   - Public narratives: blog posts, interviews, articles, press releases
# - Use the same pattern as prior cells:
#   official pages first + CSE enrichment, robots check,
#   store raw pages as JSONL, then OpenAI normalizes into:
#     1) business_product_summary (single object)
#     2) product_claims (long-form table with evidence_url + confidence)
#     3) product_content_index (content list: posts/interviews/articles/press)
#
# Outputs:
# - business_product_raw_pages_<run_id>.jsonl
# - business_product_extraction_<run_id>.json
# - business_product_summary_<run_id>.csv
# - business_product_claims_<run_id>.csv
# - business_product_content_index_<run_id>.csv
#
# Notes:
# - Conservative: only include items supported by evidence URLs in fetched pages.
# - If official pages are JS-rendered and thin, rely more on CSE articles and press.

import os
import re
import json
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone
from urllib.robotparser import RobotFileParser
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")
if not GOOGLE_API_KEY or not GOOGLE_CSE_CX:
    raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CSE_CX must be set.")

client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Load inputs + entity (from prior cells)
# ------------------------------------------------------------
def load_latest_json(pattern: str) -> tuple[dict, Path]:
    files = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError(f"No {pattern} found. Please run previous cells.")
    p = files[0]
    return json.loads(p.read_text(encoding="utf-8")), p

try:
    run_id = inputs["meta"]["run_id"]
except Exception:
    inp, _ = load_latest_json("inputs_*.json")
    run_id = inp.get("meta", {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

entity, _ = load_latest_json("entity_*.json")
startup_name = (entity.get("canonical_name") or "Unknown Startup").strip()
official_website = (entity.get("official_website") or "").strip()
if not official_website:
    raise ValueError("official_website is missing. Run cell #2 first.")

# ------------------------------------------------------------
# Knobs
# ------------------------------------------------------------
USER_AGENT = "researchOSv2-bot/0.3 (+contact: internal-research)"
TIMEOUT = 25
SLEEP_SEC = 0.25

MAX_OFFICIAL_PAGES = 35
MAX_CSE_PAGES = 30
MAX_TEXT_CHARS_STORE = 70000
MAX_TEXT_CHARS_PROMPT = 9000
MAX_PAGES_TO_LLM = 20

PRODUCT_KEY_PATHS = [
    "/product", "/products", "/service", "/services", "/solutions",
    "/research", "/blog", "/news", "/press", "/updates",
    "/customers", "/case", "/cases", "/case-studies", "/usecase", "/use-cases",
    "/pricing", "/docs"
]

PRODUCT_KEYWORDS = [
    "product", "products", "service", "services", "solutions",
    "usecase", "use-case", "usecases", "case-study", "case", "customers",
    "pricing", "docs", "api", "research", "blog", "press", "news"
]

# ------------------------------------------------------------
# URL / domain / robots helpers
# ------------------------------------------------------------
def normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url, flags=re.I):
        url = "https://" + url
    p = urlparse(url)
    return p._replace(netloc=p.netloc.lower(), fragment="").geturl()

def get_domain(url: str) -> str:
    return re.sub(r"^www\.", "", urlparse(normalize_url(url)).netloc)

def same_domain(a: str, b: str) -> bool:
    da, db = get_domain(a), get_domain(b)
    return da == db or da.endswith("." + db) or db.endswith("." + da)

def is_asset_url(url: str) -> bool:
    path = (urlparse(url).path or "").lower()
    return bool(re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|zip|mp4|mov)$", path))

def build_robot_parser(base_url: str) -> RobotFileParser | None:
    rp = RobotFileParser()
    rp.set_url(urljoin(normalize_url(base_url), "/robots.txt"))
    try:
        rp.read()
        return rp
    except Exception:
        return None

def robots_allows(rp: RobotFileParser | None, url: str) -> bool:
    if rp is None:
        p = (urlparse(url).path or "/").lower()
        return p in ["/", ""] or any(k in p for k in PRODUCT_KEYWORDS)
    try:
        return rp.can_fetch(USER_AGENT, url)
    except Exception:
        return False

# robots cache for 3rd-party
_robot_cache: dict[str, RobotFileParser | None] = {}

def get_rp_for_url(url: str) -> RobotFileParser | None:
    dom = get_domain(url)
    if dom in _robot_cache:
        return _robot_cache[dom]
    rp = build_robot_parser(url)
    _robot_cache[dom] = rp
    return rp

# ------------------------------------------------------------
# Fetch + text extraction
# ------------------------------------------------------------
def fetch_html(url: str) -> tuple[int, str]:
    r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT, allow_redirects=True)
    return r.status_code, (r.text if r.ok else "")

def html_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()
    node = soup.find("main") or soup.find("article") or (soup.body if soup.body else soup)
    text = node.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def extract_links(base_url: str, html: str) -> list[str]:
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("#") or href.lower().startswith("mailto:") or href.lower().startswith("tel:"):
            continue
        u = normalize_url(urljoin(base_url, href))
        if u.startswith(("http://", "https://")):
            out.append(u)
    # unique preserve order
    seen, uniq = set(), []
    for u in out:
        if u not in seen:
            seen.add(u)
            uniq.append(u)
    return uniq

def score_product_link(url: str) -> float:
    p = (urlparse(url).path or "/").lower()
    score = 0.0
    if p in ["/product", "/products", "/service", "/services", "/solutions"]:
        score += 3.0
    for k in PRODUCT_KEYWORDS:
        if k in p:
            score += 1.0
    if is_asset_url(url):
        score -= 4.0
    if p.count("/") >= 4:
        score -= 0.4
    return score

# ------------------------------------------------------------
# Google CSE
# ------------------------------------------------------------
def google_cse_search(query: str, num: int = 5) -> list[dict]:
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": GOOGLE_API_KEY, "cx": GOOGLE_CSE_CX, "q": query, "num": min(max(num, 1), 10)}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    items = data.get("items", []) or []
    return [{
        "title": it.get("title"),
        "link": normalize_url(it.get("link") or ""),
        "snippet": it.get("snippet"),
        "displayLink": it.get("displayLink"),
    } for it in items]

# ------------------------------------------------------------
# Build official targets: homepage + explicit paths + internal links
# ------------------------------------------------------------
official_home = normalize_url(official_website)
rp_official = build_robot_parser(official_home)

status, html = fetch_html(official_home)
time.sleep(SLEEP_SEC)
if status >= 400 or not html:
    raise RuntimeError(f"Failed to fetch official homepage: HTTP {status}")

home_links = extract_links(official_home, html)
internal_links = [l for l in home_links if same_domain(l, official_home) and not is_asset_url(l)]

explicit = []
for p in PRODUCT_KEY_PATHS:
    u = normalize_url(urljoin(official_home, p))
    if robots_allows(rp_official, u) and not is_asset_url(u):
        explicit.append(u)

candidates = list(dict.fromkeys([official_home] + explicit + internal_links))
scored = [{"url": u, "score": score_product_link(u)} for u in candidates]
scored.sort(key=lambda x: x["score"], reverse=True)

official_targets = []
for it in scored:
    if len(official_targets) >= MAX_OFFICIAL_PAGES:
        break
    if it["score"] < 0.2:
        continue
    if robots_allows(rp_official, it["url"]):
        official_targets.append(it["url"])

official_targets = list(dict.fromkeys(official_targets))
print(f"Official product targets: {len(official_targets)}")

# ------------------------------------------------------------
# Build CSE targets (business/product focused, bilingual-friendly)
# ------------------------------------------------------------
official_domain = get_domain(official_home)
queries = [
    f'"{startup_name}" {official_domain}',
    f'"{startup_name}" product OR technology OR 技術',
    f'"{startup_name}" use case OR 導入事例 OR 活用事例',
    f'"{startup_name}" customers OR 顧客 OR 導入',
    f'"{startup_name}" pricing OR 料金',
    f'"{startup_name}" API OR docs OR documentation',
    f'"{startup_name}" blog OR research OR paper OR 研究',
    f'"{startup_name}" interview OR インタビュー product',
    f'"{startup_name}" press release OR プレスリリース',
    f'"{startup_name}" partnership OR 提携',
]
cse_items = []
for q in queries:
    try:
        cse_items.extend(google_cse_search(q, num=5))
        time.sleep(0.2)
    except Exception:
        pass

seen = set()
cse_links = []
for it in cse_items:
    link = it.get("link")
    if link and link not in seen and link.startswith(("http://", "https://")) and not is_asset_url(link):
        seen.add(link)
        cse_links.append(link)
cse_links = cse_links[:MAX_CSE_PAGES]
print(f"CSE product targets: {len(cse_links)}")

# ------------------------------------------------------------
# Fetch + store raw pages (JSONL)
# ------------------------------------------------------------
RAW_PATH = ART_DIR / f"business_product_raw_pages_{run_id}.jsonl"
RAW_PATH.write_text("", encoding="utf-8")

def append_jsonl(path: Path, obj: dict):
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def collect_pages(urls: list[str], source_type: str):
    for u in urls:
        u = normalize_url(u)
        if not u or is_asset_url(u):
            continue

        rp = rp_official if source_type == "official" else get_rp_for_url(u)
        allowed = robots_allows(rp, u)

        if not allowed:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": False,
                "text": "",
                "notes": "Skipped due to robots.txt (or conservative fallback).",
            })
            continue

        try:
            code, html = fetch_html(u)
            time.sleep(SLEEP_SEC)
            text = html_to_text(html) if html else ""
            text = text[:MAX_TEXT_CHARS_STORE]

            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": code,
                "robots_allowed": True,
                "text": text,
                "text_char_len": len(text),
            })
        except Exception as e:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": True,
                "text": "",
                "notes": f"Fetch error: {str(e)}",
            })

collect_pages(official_targets, "official")
collect_pages(cse_links, "cse")

print(f"✅ Business/Product raw pages saved: {RAW_PATH.as_posix()}")

# ------------------------------------------------------------
# Load raw pages + shortlist to LLM (product-heavy URLs first)
# ------------------------------------------------------------
def load_raw_pages(path: Path) -> list[dict]:
    rows = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))
    return rows

raw_pages = load_raw_pages(RAW_PATH)

usable = [
    r for r in raw_pages
    if r.get("robots_allowed")
    and isinstance(r.get("http_status"), int) and r["http_status"] < 400
    and (r.get("text") or "").strip()
]

def score_page_for_product(r):
    url = r.get("url") or ""
    text = r.get("text") or ""
    p = (urlparse(url).path or "").lower()
    s = 0
    for k in PRODUCT_KEYWORDS:
        if k in p:
            s += 2
    s += min(len(text), 20000) / 6000
    if r.get("source_type") == "official":
        s += 2
    return s

usable_sorted = sorted(usable, key=score_page_for_product, reverse=True)
shortlist = usable_sorted[:MAX_PAGES_TO_LLM]
print(f"Usable pages: {len(usable)} | Shortlist to LLM: {len(shortlist)}")

# ------------------------------------------------------------
# Robust JSON parsing helper
# ------------------------------------------------------------
def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

# ------------------------------------------------------------
# OpenAI: business/product summary + claims + content index (anti-empty)
# ------------------------------------------------------------
def openai_extract_business_product(pages: list[dict]) -> dict:
    system = (
        "You are a research analyst assistant. "
        "From the provided webpage texts, extract an evidence-linked understanding of the startup's business and product. "
        "CRITICAL: Always produce a non-empty content_index if there are any relevant pages. "
        "Only include claims that can be supported by an evidence_url from the provided pages. "
        "Return a single JSON object only (no markdown)."
    )

    schema = {
        "business_product_summary": {
            "company_name": "string",
            "one_liner": "string|null",
            "product_overview": "string|null",
            "how_it_works_as_described": "array of strings",
            "key_differentiators": "array of strings",
            "primary_use_cases": "array of strings",
            "target_customers": "array of strings",
            "pricing_packaging_signals": "array of strings",
            "positioning_tags": "array of strings"
        },
        "product_claims": [
            {
                "category": "product|tech|use_case|customer|pricing|positioning|other",
                "claim_type": "fact|hypothesis",
                "claim": "string",
                "evidence_url": "string",
                "source_type": "official|cse",
                "confidence": "0..1"
            }
        ],
        "content_index": [
            {
                "content_type": "product_page|blog_post|research_post|press|interview|article|docs|case_study|other",
                "title": "string|null",
                "publisher_or_platform": "string|null",
                "date": "string|null",
                "url": "string",
                "what_it_contains": "string",
                "relevance": "0..1",
                "confidence": "0..1"
            }
        ],
        "notes": {
            "data_gaps": "array of strings",
            "conflicts": "array of strings"
        },
        "confidence": {"overall": "0..1", "rationale": "string"}
    }

    bundle = {
        "startup_name": startup_name,
        "official_website": official_home,
        "pages": [
            {
                "url": p["url"],
                "source_type": p.get("source_type"),
                "domain": p.get("domain"),
                "text": (p.get("text") or "")[:MAX_TEXT_CHARS_PROMPT],
            } for p in pages
        ],
        "schema": schema,
        "instructions": [
            "Prefer official sources where possible.",
            "For product_claims, attach evidence_url to each claim.",
            "For content_index, include at least 5 items if available."
        ]
    }

    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
            response_format={"type": "json_object"},
        )
        return json.loads((resp.choices[0].message.content or "").strip())
    except Exception:
        resp2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
        )
        raw = resp2.choices[0].message.content or ""
        candidate = _extract_json_object(raw)
        if candidate:
            return json.loads(candidate)
        raise RuntimeError("Model did not return a JSON object.")

extracted = openai_extract_business_product(shortlist)

# ------------------------------------------------------------
# Save JSON + build DataFrames
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"business_product_extraction_{run_id}.json"
JSON_PATH.write_text(json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8")

summary = extracted.get("business_product_summary", {}) or {}
claims = extracted.get("product_claims", []) or []
content = extracted.get("content_index", []) or []

df_summary = pd.DataFrame([{"run_id": run_id, **summary}])
df_claims = pd.DataFrame(claims)
df_content = pd.DataFrame(content)

if not df_claims.empty:
    df_claims.insert(0, "run_id", run_id)
if not df_content.empty:
    df_content.insert(0, "run_id", run_id)

SUMMARY_CSV = ART_DIR / f"business_product_summary_{run_id}.csv"
CLAIMS_CSV = ART_DIR / f"business_product_claims_{run_id}.csv"
CONTENT_CSV = ART_DIR / f"business_product_content_index_{run_id}.csv"

df_summary.to_csv(SUMMARY_CSV, index=False)
df_claims.to_csv(CLAIMS_CSV, index=False)
df_content.to_csv(CONTENT_CSV, index=False)

print("✅ Business & Product Understanding complete")
print(f"- Raw pages: {RAW_PATH.as_posix()}")
print(f"- JSON: {JSON_PATH.as_posix()}")
print(f"- Summary CSV: {SUMMARY_CSV.as_posix()}")
print(f"- Claims CSV: {CLAIMS_CSV.as_posix()}")
print(f"- Content CSV: {CONTENT_CSV.as_posix()}")

display(df_summary)
display(df_claims.head(30) if not df_claims.empty else df_claims)
display(df_content.head(30) if not df_content.empty else df_content)


Official product targets: 17
CSE product targets: 30
✅ Business/Product raw pages saved: artifacts/meeting_deep_dive/business_product_raw_pages_20260107_225628.jsonl
Usable pages: 25 | Shortlist to LLM: 20
✅ Business & Product Understanding complete
- Raw pages: artifacts/meeting_deep_dive/business_product_raw_pages_20260107_225628.jsonl
- JSON: artifacts/meeting_deep_dive/business_product_extraction_20260107_225628.json
- Summary CSV: artifacts/meeting_deep_dive/business_product_summary_20260107_225628.csv
- Claims CSV: artifacts/meeting_deep_dive/business_product_claims_20260107_225628.csv
- Content CSV: artifacts/meeting_deep_dive/business_product_content_index_20260107_225628.csv


Unnamed: 0,run_id,company_name,one_liner,product_overview,how_it_works_as_described,key_differentiators,primary_use_cases,target_customers,pricing_packaging_signals,positioning_tags
0,20260107_225628,Sakana AI,A Tokyo-based AI startup focused on developing...,Sakana AI develops advanced AI systems that le...,[Utilizes evolutionary algorithms to merge exi...,"[Focus on nature-inspired AI development., Inn...",[Automating scientific research and discovery....,"[Financial institutions., Government agencies....",[Cost-efficient AI model generation at approxi...,"[Nature-inspired AI, Generative AI, AI for sci..."


Unnamed: 0,run_id,category,claim_type,claim,evidence_url,source_type,confidence
0,20260107_225628,product,fact,Sakana AI's AI Scientist can autonomously cond...,https://siliconangle.com/2024/08/13/sakana-ai-...,cse,0.9
1,20260107_225628,tech,fact,The AI Scientist has produced papers that pass...,https://sakana.ai/ai-scientist-first-publication/,cse,0.85
2,20260107_225628,use_case,fact,Sakana AI's technology is being applied in the...,https://sakana.ai/mufg-bank/,cse,0.8
3,20260107_225628,customer,fact,Sakana AI has partnered with MUFG Bank to deve...,https://sakana.ai/mufg-bank/,cse,0.9
4,20260107_225628,other,fact,Sakana AI raised $135 million in a Series B fu...,https://siliconangle.com/2025/11/17/sakana-ai-...,cse,0.95


Unnamed: 0,run_id,content_type,title,publisher_or_platform,date,url,what_it_contains,relevance,confidence
0,20260107_225628,blog_post,Sakana AI creates an ‘AI Scientist’ to automat...,SiliconANGLE,"August 13, 2024",https://siliconangle.com/2024/08/13/sakana-ai-...,Details on the AI Scientist's capabilities and...,1,0.9
1,20260107_225628,blog_post,The AI Scientist Generates its First Peer-Revi...,Sakana AI,"March 12, 2025",https://sakana.ai/ai-scientist-first-publication/,Announcement of the AI Scientist's first peer-...,1,0.85
2,20260107_225628,press,Announcing a Multiyear Partnership between Sak...,Sakana AI,"May 19, 2025",https://sakana.ai/mufg-bank/,Details on the partnership with MUFG Bank to d...,1,0.9
3,20260107_225628,article,Sakana AI lands $135M on $2.635B valuation to ...,SiliconANGLE,"November 17, 2025",https://siliconangle.com/2025/11/17/sakana-ai-...,Information about the funding round and its im...,1,0.95
4,20260107_225628,blog_post,Sakana AI Agent Wins AtCoder Heuristic Contest...,Sakana AI,"January 05, 2026",https://sakana.ai/ahc058/,Details on the AI agent's victory in a competi...,1,0.9


In [15]:
# ============================================================
# 7. Customer & Market Structure (Facts-first + Evidence-linked)
# ============================================================
# Goal:
# - Build an evidence-linked view of:
#   - Customer segments and buyer personas
#   - Use cases / workflows / jobs-to-be-done
#   - Market structure: TAM/SAM/SOM signals, vertical focus, geography, trends
#   - GTM signals: channels, partnerships, sales motion, pricing signals (as mentioned)
# - Identify relevant public narratives: posts, interviews, articles, reports.
# - Use the same pattern:
#   official pages first + CSE enrichment, robots check,
#   store raw pages as JSONL, then OpenAI normalizes into:
#     1) market_summary (single object)
#     2) market_claims (long-form with evidence_url + confidence)
#     3) market_content_index (content list with URLs)
#
# Outputs:
# - market_raw_pages_<run_id>.jsonl
# - market_extraction_<run_id>.json
# - market_summary_<run_id>.csv
# - market_claims_<run_id>.csv
# - market_content_index_<run_id>.csv

import os
import re
import json
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone
from urllib.robotparser import RobotFileParser
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")
if not GOOGLE_API_KEY or not GOOGLE_CSE_CX:
    raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CSE_CX must be set.")

client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Load inputs + entity
# ------------------------------------------------------------
def load_latest_json(pattern: str) -> tuple[dict, Path]:
    files = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError(f"No {pattern} found. Please run previous cells.")
    p = files[0]
    return json.loads(p.read_text(encoding="utf-8")), p

try:
    run_id = inputs["meta"]["run_id"]
except Exception:
    inp, _ = load_latest_json("inputs_*.json")
    run_id = inp.get("meta", {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

entity, _ = load_latest_json("entity_*.json")
startup_name = (entity.get("canonical_name") or "Unknown Startup").strip()
official_website = (entity.get("official_website") or "").strip()
if not official_website:
    raise ValueError("official_website is missing. Run cell #2 first.")

# ------------------------------------------------------------
# Knobs
# ------------------------------------------------------------
USER_AGENT = "researchOSv2-bot/0.3 (+contact: internal-research)"
TIMEOUT = 25
SLEEP_SEC = 0.25

MAX_OFFICIAL_PAGES = 35
MAX_CSE_PAGES = 35
MAX_TEXT_CHARS_STORE = 70000
MAX_TEXT_CHARS_PROMPT = 9000
MAX_PAGES_TO_LLM = 22

MARKET_KEY_PATHS = [
    "/customers", "/case", "/cases", "/case-studies", "/usecase", "/use-cases",
    "/solutions", "/industries",
    "/pricing",
    "/blog", "/news", "/press", "/research", "/updates",
]
MARKET_KEYWORDS = [
    "customer", "customers", "case", "case-study", "usecase", "use-case", "industries",
    "solutions", "pricing",
    "market", "enterprise", "partner", "partnership",
    "blog", "news", "press", "research"
]

# ------------------------------------------------------------
# URL / domain / robots helpers
# ------------------------------------------------------------
def normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url, flags=re.I):
        url = "https://" + url
    p = urlparse(url)
    return p._replace(netloc=p.netloc.lower(), fragment="").geturl()

def get_domain(url: str) -> str:
    return re.sub(r"^www\.", "", urlparse(normalize_url(url)).netloc)

def same_domain(a: str, b: str) -> bool:
    da, db = get_domain(a), get_domain(b)
    return da == db or da.endswith("." + db) or db.endswith("." + da)

def is_asset_url(url: str) -> bool:
    path = (urlparse(url).path or "").lower()
    return bool(re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|zip|mp4|mov)$", path))

def build_robot_parser(base_url: str) -> RobotFileParser | None:
    rp = RobotFileParser()
    rp.set_url(urljoin(normalize_url(base_url), "/robots.txt"))
    try:
        rp.read()
        return rp
    except Exception:
        return None

def robots_allows(rp: RobotFileParser | None, url: str) -> bool:
    if rp is None:
        p = (urlparse(url).path or "/").lower()
        return p in ["/", ""] or any(k in p for k in MARKET_KEYWORDS)
    try:
        return rp.can_fetch(USER_AGENT, url)
    except Exception:
        return False

_robot_cache: dict[str, RobotFileParser | None] = {}

def get_rp_for_url(url: str) -> RobotFileParser | None:
    dom = get_domain(url)
    if dom in _robot_cache:
        return _robot_cache[dom]
    rp = build_robot_parser(url)
    _robot_cache[dom] = rp
    return rp

# ------------------------------------------------------------
# Fetch + text extraction
# ------------------------------------------------------------
def fetch_html(url: str) -> tuple[int, str]:
    r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT, allow_redirects=True)
    return r.status_code, (r.text if r.ok else "")

def html_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()
    node = soup.find("main") or soup.find("article") or (soup.body if soup.body else soup)
    text = node.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def extract_links(base_url: str, html: str) -> list[str]:
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("#") or href.lower().startswith("mailto:") or href.lower().startswith("tel:"):
            continue
        u = normalize_url(urljoin(base_url, href))
        if u.startswith(("http://", "https://")):
            out.append(u)
    seen, uniq = set(), []
    for u in out:
        if u not in seen:
            seen.add(u)
            uniq.append(u)
    return uniq

def score_market_link(url: str) -> float:
    p = (urlparse(url).path or "/").lower()
    score = 0.0
    for k in MARKET_KEYWORDS:
        if k in p:
            score += 1.0
    if "case" in p or "customer" in p:
        score += 1.5
    if "pricing" in p:
        score += 1.0
    if is_asset_url(url):
        score -= 4.0
    if p.count("/") >= 4:
        score -= 0.4
    return score

# ------------------------------------------------------------
# Google CSE
# ------------------------------------------------------------
def google_cse_search(query: str, num: int = 5) -> list[dict]:
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": GOOGLE_API_KEY, "cx": GOOGLE_CSE_CX, "q": query, "num": min(max(num, 1), 10)}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    items = data.get("items", []) or []
    return [{
        "title": it.get("title"),
        "link": normalize_url(it.get("link") or ""),
        "snippet": it.get("snippet"),
        "displayLink": it.get("displayLink"),
    } for it in items]

# ------------------------------------------------------------
# Official targets
# ------------------------------------------------------------
official_home = normalize_url(official_website)
rp_official = build_robot_parser(official_home)

status, html = fetch_html(official_home)
time.sleep(SLEEP_SEC)
if status >= 400 or not html:
    raise RuntimeError(f"Failed to fetch official homepage: HTTP {status}")

home_links = extract_links(official_home, html)
internal_links = [l for l in home_links if same_domain(l, official_home) and not is_asset_url(l)]

explicit = []
for p in MARKET_KEY_PATHS:
    u = normalize_url(urljoin(official_home, p))
    if robots_allows(rp_official, u) and not is_asset_url(u):
        explicit.append(u)

candidates = list(dict.fromkeys([official_home] + explicit + internal_links))
scored = [{"url": u, "score": score_market_link(u)} for u in candidates]
scored.sort(key=lambda x: x["score"], reverse=True)

official_targets = []
for it in scored:
    if len(official_targets) >= MAX_OFFICIAL_PAGES:
        break
    if it["score"] < 0.2:
        continue
    if robots_allows(rp_official, it["url"]):
        official_targets.append(it["url"])

official_targets = list(dict.fromkeys(official_targets))
print(f"Official market targets: {len(official_targets)}")

# ------------------------------------------------------------
# CSE targets (customers + market structure focused)
# ------------------------------------------------------------
official_domain = get_domain(official_home)
queries = [
    f'"{startup_name}" {official_domain}',
    f'"{startup_name}" customer OR customers OR 導入 OR 顧客',
    f'"{startup_name}" case study OR 導入事例 OR 事例',
    f'"{startup_name}" target market OR 市場 OR TAM OR SAM OR SOM',
    f'"{startup_name}" pricing OR 料金 OR enterprise plan',
    f'"{startup_name}" go-to-market OR GTM OR sales OR 営業',
    f'"{startup_name}" partnership OR partner OR 提携',
    f'"{startup_name}" industry OR industries OR 業界',
    f'"{startup_name}" procurement OR RFP OR 導入',
    f'"{startup_name}" interview market OR インタビュー 市場',
]

cse_items = []
for q in queries:
    try:
        cse_items.extend(google_cse_search(q, num=5))
        time.sleep(0.2)
    except Exception:
        pass

seen = set()
cse_links = []
for it in cse_items:
    link = it.get("link")
    if link and link not in seen and link.startswith(("http://", "https://")) and not is_asset_url(link):
        seen.add(link)
        cse_links.append(link)

cse_links = cse_links[:MAX_CSE_PAGES]
print(f"CSE market targets: {len(cse_links)}")

# ------------------------------------------------------------
# Fetch + store raw pages (JSONL)
# ------------------------------------------------------------
RAW_PATH = ART_DIR / f"market_raw_pages_{run_id}.jsonl"
RAW_PATH.write_text("", encoding="utf-8")

def append_jsonl(path: Path, obj: dict):
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def collect_pages(urls: list[str], source_type: str):
    for u in urls:
        u = normalize_url(u)
        if not u or is_asset_url(u):
            continue

        rp = rp_official if source_type == "official" else get_rp_for_url(u)
        allowed = robots_allows(rp, u)

        if not allowed:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": False,
                "text": "",
                "notes": "Skipped due to robots.txt (or conservative fallback).",
            })
            continue

        try:
            code, html = fetch_html(u)
            time.sleep(SLEEP_SEC)
            text = html_to_text(html) if html else ""
            text = text[:MAX_TEXT_CHARS_STORE]

            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": code,
                "robots_allowed": True,
                "text": text,
                "text_char_len": len(text),
            })
        except Exception as e:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": True,
                "text": "",
                "notes": f"Fetch error: {str(e)}",
            })

collect_pages(official_targets, "official")
collect_pages(cse_links, "cse")

print(f"✅ Market raw pages saved: {RAW_PATH.as_posix()}")

# ------------------------------------------------------------
# Load raw pages + shortlist to LLM (market-heavy URLs first)
# ------------------------------------------------------------
def load_raw_pages(path: Path) -> list[dict]:
    rows = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))
    return rows

raw_pages = load_raw_pages(RAW_PATH)

usable = [
    r for r in raw_pages
    if r.get("robots_allowed")
    and isinstance(r.get("http_status"), int) and r["http_status"] < 400
    and (r.get("text") or "").strip()
]

def score_page_for_market(r):
    url = r.get("url") or ""
    text = r.get("text") or ""
    p = (urlparse(url).path or "").lower()
    s = 0
    for k in MARKET_KEYWORDS:
        if k in p:
            s += 2
    # prefer pages that mention customer words in the text
    if re.search(r"\b(customer|customers|client|導入|顧客|事例|利用)\b", text, flags=re.IGNORECASE):
        s += 2
    s += min(len(text), 20000) / 7000
    if r.get("source_type") == "official":
        s += 2
    return s

usable_sorted = sorted(usable, key=score_page_for_market, reverse=True)
shortlist = usable_sorted[:MAX_PAGES_TO_LLM]
print(f"Usable pages: {len(usable)} | Shortlist to LLM: {len(shortlist)}")

# ------------------------------------------------------------
# Robust JSON parsing helper
# ------------------------------------------------------------
def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

# ------------------------------------------------------------
# OpenAI: market summary + claims + content index
# ------------------------------------------------------------
def openai_extract_market(pages: list[dict]) -> dict:
    system = (
        "You are a research analyst assistant. "
        "From the provided webpage texts, extract an evidence-linked understanding of customer segments and market structure. "
        "CRITICAL: Always produce a non-empty content_index if there are relevant pages. "
        "Only include claims that can be supported by an evidence_url from the provided pages. "
        "Return a single JSON object only (no markdown)."
    )

    schema = {
        "market_summary": {
            "company_name": "string",
            "customer_segments": "array of strings",
            "buyer_personas": "array of strings",
            "primary_use_cases": "array of strings",
            "market_definition": "string|null",
            "market_structure_notes": "array of strings",
            "tam_sam_som_signals": "array of strings",
            "geography_focus": "string|null",
            "gtm_signals": "array of strings",
            "pricing_signals": "array of strings",
            "partnership_signals": "array of strings"
        },
        "market_claims": [
            {
                "category": "customer_segment|buyer|use_case|market|tam|gtm|pricing|partnership|other",
                "claim_type": "fact|hypothesis",
                "claim": "string",
                "evidence_url": "string",
                "source_type": "official|cse",
                "confidence": "0..1"
            }
        ],
        "content_index": [
            {
                "content_type": "case_study|customer_story|interview|article|press|blog|report|other",
                "title": "string|null",
                "publisher_or_platform": "string|null",
                "date": "string|null",
                "url": "string",
                "what_it_contains": "string",
                "relevance": "0..1",
                "confidence": "0..1"
            }
        ],
        "notes": {
            "data_gaps": "array of strings",
            "conflicts": "array of strings"
        },
        "confidence": {"overall": "0..1", "rationale": "string"}
    }

    bundle = {
        "startup_name": startup_name,
        "official_website": official_home,
        "pages": [
            {
                "url": p["url"],
                "source_type": p.get("source_type"),
                "domain": p.get("domain"),
                "text": (p.get("text") or "")[:MAX_TEXT_CHARS_PROMPT],
            } for p in pages
        ],
        "schema": schema,
        "instructions": [
            "Prefer official sources where possible.",
            "Attach evidence_url to each claim.",
            "For content_index, include at least 8 items if available."
        ]
    }

    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
            response_format={"type": "json_object"},
        )
        return json.loads((resp.choices[0].message.content or "").strip())
    except Exception:
        resp2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
        )
        raw = resp2.choices[0].message.content or ""
        candidate = _extract_json_object(raw)
        if candidate:
            return json.loads(candidate)
        raise RuntimeError("Model did not return a JSON object.")

extracted = openai_extract_market(shortlist)

# ------------------------------------------------------------
# Save JSON + build DataFrames
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"market_extraction_{run_id}.json"
JSON_PATH.write_text(json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8")

summary = extracted.get("market_summary", {}) or {}
claims = extracted.get("market_claims", []) or []
content = extracted.get("content_index", []) or []

df_summary = pd.DataFrame([{"run_id": run_id, **summary}])
df_claims = pd.DataFrame(claims)
df_content = pd.DataFrame(content)

if not df_claims.empty:
    df_claims.insert(0, "run_id", run_id)
if not df_content.empty:
    df_content.insert(0, "run_id", run_id)

SUMMARY_CSV = ART_DIR / f"market_summary_{run_id}.csv"
CLAIMS_CSV = ART_DIR / f"market_claims_{run_id}.csv"
CONTENT_CSV = ART_DIR / f"market_content_index_{run_id}.csv"

df_summary.to_csv(SUMMARY_CSV, index=False)
df_claims.to_csv(CLAIMS_CSV, index=False)
df_content.to_csv(CONTENT_CSV, index=False)

print("✅ Customer & Market Structure complete")
print(f"- Raw pages: {RAW_PATH.as_posix()}")
print(f"- JSON: {JSON_PATH.as_posix()}")
print(f"- Summary CSV: {SUMMARY_CSV.as_posix()}")
print(f"- Claims CSV: {CLAIMS_CSV.as_posix()}")
print(f"- Content CSV: {CONTENT_CSV.as_posix()}")

display(df_summary)
display(df_claims.head(30) if not df_claims.empty else df_claims)
display(df_content.head(30) if not df_content.empty else df_content)


Official market targets: 13
CSE market targets: 35
✅ Market raw pages saved: artifacts/meeting_deep_dive/market_raw_pages_20260107_225628.jsonl
Usable pages: 28 | Shortlist to LLM: 22
✅ Customer & Market Structure complete
- Raw pages: artifacts/meeting_deep_dive/market_raw_pages_20260107_225628.jsonl
- JSON: artifacts/meeting_deep_dive/market_extraction_20260107_225628.json
- Summary CSV: artifacts/meeting_deep_dive/market_summary_20260107_225628.csv
- Claims CSV: artifacts/meeting_deep_dive/market_claims_20260107_225628.csv
- Content CSV: artifacts/meeting_deep_dive/market_content_index_20260107_225628.csv


Unnamed: 0,run_id,company_name,customer_segments,buyer_personas,primary_use_cases,market_definition,market_structure_notes,tam_sam_som_signals,geography_focus,gtm_signals,pricing_signals,partnership_signals
0,20260107_225628,Sakana AI,"[Financial institutions, Government agencies, ...","[CIOs and CTOs of financial institutions, Data...","[Automating banking operations, Developing AI ...",Sakana AI operates within the AI research and ...,[Sakana AI is positioned as a leader in Japan'...,[Japan's venture capital investment in startup...,Japan,[Partnerships with major financial institution...,[Sakana AI's AI Scientist can generate researc...,[Collaboration with NTT Group for R&D of AI co...


Unnamed: 0,run_id,category,claim_type,claim,evidence_url,source_type,confidence
0,20260107_225628,customer_segment,fact,Sakana AI primarily serves large enterprises a...,https://promptloop.com/directory/what-does-sak...,cse,1
1,20260107_225628,use_case,fact,Sakana AI is developing AI specialized for ban...,https://sakana.ai/mufg-bank/,cse,1
2,20260107_225628,partnership,fact,Sakana AI has established a comprehensive mult...,https://sakana.ai/mufg-bank/,cse,1
3,20260107_225628,market,fact,Sakana AI is positioned as a leader in Japan's...,https://www.japantimes.co.jp/business/2024/04/...,cse,1
4,20260107_225628,gtm,fact,Sakana AI's collaboration with NVIDIA aims to ...,https://sakana.ai/series-a/,cse,1
5,20260107_225628,pricing,fact,Sakana AI's AI Scientist can generate research...,https://sakana.ai/ai-scientist/,cse,1
6,20260107_225628,customer_segment,fact,Sakana AI is targeting government agency work ...,https://www.japantimes.co.jp/business/2024/04/...,cse,1
7,20260107_225628,market,fact,Japan's venture capital investment in startups...,https://nea.com/blog/our-investment-in-sakana-...,cse,1


Unnamed: 0,run_id,content_type,title,publisher_or_platform,date,url,what_it_contains,relevance,confidence
0,20260107_225628,press,MUFG enters multiyear AI partnership with Saka...,Retail Banker International,"May 20, 2025",https://www.retailbankerinternational.com/news...,Details on the partnership between MUFG and Sa...,1,1
1,20260107_225628,press,NTT and Sakana AI sign a collaboration agreement,NTT Group,"November 13, 2023",https://group.ntt/en/newsrelease/2023/11/13/23...,Collaboration agreement for R&D of AI constell...,1,1
2,20260107_225628,blog,Our Investment in Japan’s Sakana AI,NEA,"September 04, 2024",https://www.nea.com/blog/our-investment-in-sak...,Insights into NEA's investment in Sakana AI an...,1,1
3,20260107_225628,press,Sakana AI raises $135M Series B,Yahoo Finance,"November 17, 2025",https://finance.yahoo.com/news/sakana-ai-raise...,Details on Sakana AI's Series B funding and it...,1,1
4,20260107_225628,press,Sakana AI and Daiwa Securities Group to Develo...,Sakana AI,"October 03, 2025",https://sakana.ai/daiwa-securities/,Announcement of a partnership to innovate Japa...,1,1
5,20260107_225628,press,Sakana AI raises over $100M in Series A,The SaaS News,"September 04, 2024",https://thesaasnews.com/news/sakana-ai-raises-...,Overview of Sakana AI's Series A funding and i...,1,1
6,20260107_225628,press,Sakana AI and ANA HOLDINGS' Investment,ANA Holdings,"November 05, 2024",https://www.anahd.co.jp/group/en/pr/202411/202...,Details on ANA Holdings' investment in Sakana ...,1,1
7,20260107_225628,press,Sakana AI lands $135M on $2.635B valuation,SiliconANGLE,"November 17, 2025",https://siliconangle.com/2025/11/17/sakana-ai-...,Information on Sakana AI's funding and its foc...,1,1


In [16]:
# ============================================================
# 8. Competitive Landscape (Evidence-linked + Content Index)
# ============================================================
# Goal:
# - Build an evidence-linked view of the competitive landscape:
#   - Direct competitors (same category / same buyer)
#   - Indirect alternatives (different approach / adjacent category)
#   - Positioning comparisons (how the company differentiates, as stated)
#   - Category map (where the startup sits relative to known players)
# - Identify public narratives: comparisons, analyst notes, interviews, articles.
# - Use the same pattern:
#   official pages first + CSE enrichment, robots check,
#   store raw pages as JSONL, then OpenAI normalizes into:
#     1) competitive_summary (single object)
#     2) competitor_table (one row per competitor)
#     3) competitive_claims (long-form evidence-linked claims)
#     4) competitive_content_index (posts/interviews/articles with URLs)
#
# Outputs:
# - competitive_raw_pages_<run_id>.jsonl
# - competitive_extraction_<run_id>.json
# - competitive_summary_<run_id>.csv
# - competitor_table_<run_id>.csv
# - competitive_claims_<run_id>.csv
# - competitive_content_index_<run_id>.csv

import os
import re
import json
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone
from urllib.robotparser import RobotFileParser
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")
if not GOOGLE_API_KEY or not GOOGLE_CSE_CX:
    raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CSE_CX must be set.")

client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Load inputs + entity + (optional) product/market artifacts
# ------------------------------------------------------------
def load_latest_json(pattern: str) -> tuple[dict, Path]:
    files = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError(f"No {pattern} found. Please run previous cells.")
    p = files[0]
    return json.loads(p.read_text(encoding="utf-8")), p

try:
    run_id = inputs["meta"]["run_id"]
except Exception:
    inp, _ = load_latest_json("inputs_*.json")
    run_id = inp.get("meta", {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

entity, _ = load_latest_json("entity_*.json")
startup_name = (entity.get("canonical_name") or "Unknown Startup").strip()
official_website = (entity.get("official_website") or "").strip()
if not official_website:
    raise ValueError("official_website is missing. Run cell #2 first.")

# Optional: use known positioning tags from cell #6/#7 if present
positioning_tags = []
try:
    # business_product_summary_*.csv contains positioning_tags column (list-like)
    bp_paths = sorted(ART_DIR.glob("business_product_summary_*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
    if bp_paths:
        bp = pd.read_csv(bp_paths[0])
        if "positioning_tags" in bp.columns:
            positioning_tags = [str(bp.loc[0, "positioning_tags"])]
except Exception:
    pass

# ------------------------------------------------------------
# Knobs
# ------------------------------------------------------------
USER_AGENT = "researchOSv2-bot/0.3 (+contact: internal-research)"
TIMEOUT = 25
SLEEP_SEC = 0.25

MAX_OFFICIAL_PAGES = 25
MAX_CSE_PAGES = 45
MAX_TEXT_CHARS_STORE = 70000
MAX_TEXT_CHARS_PROMPT = 9000
MAX_PAGES_TO_LLM = 24

COMP_KEY_PATHS = [
    "/blog", "/news", "/press", "/research", "/updates",
    "/pricing", "/docs", "/product", "/products", "/solutions"
]
COMP_KEYWORDS = [
    "compare", "comparison", "vs", "alternative", "competitor", "competition",
    "pricing", "docs", "api", "blog", "press", "news", "research", "product", "solutions"
]

# ------------------------------------------------------------
# URL / domain / robots helpers
# ------------------------------------------------------------
def normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url, flags=re.I):
        url = "https://" + url
    p = urlparse(url)
    return p._replace(netloc=p.netloc.lower(), fragment="").geturl()

def get_domain(url: str) -> str:
    return re.sub(r"^www\.", "", urlparse(normalize_url(url)).netloc)

def same_domain(a: str, b: str) -> bool:
    da, db = get_domain(a), get_domain(b)
    return da == db or da.endswith("." + db) or db.endswith("." + da)

def is_asset_url(url: str) -> bool:
    path = (urlparse(url).path or "").lower()
    return bool(re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|zip|mp4|mov)$", path))

def build_robot_parser(base_url: str) -> RobotFileParser | None:
    rp = RobotFileParser()
    rp.set_url(urljoin(normalize_url(base_url), "/robots.txt"))
    try:
        rp.read()
        return rp
    except Exception:
        return None

def robots_allows(rp: RobotFileParser | None, url: str) -> bool:
    if rp is None:
        p = (urlparse(url).path or "/").lower()
        return p in ["/", ""] or any(k in p for k in ["blog", "news", "press", "research", "product", "pricing", "docs"])
    try:
        return rp.can_fetch(USER_AGENT, url)
    except Exception:
        return False

_robot_cache: dict[str, RobotFileParser | None] = {}

def get_rp_for_url(url: str) -> RobotFileParser | None:
    dom = get_domain(url)
    if dom in _robot_cache:
        return _robot_cache[dom]
    rp = build_robot_parser(url)
    _robot_cache[dom] = rp
    return rp

# ------------------------------------------------------------
# Fetch + text extraction
# ------------------------------------------------------------
def fetch_html(url: str) -> tuple[int, str]:
    r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT, allow_redirects=True)
    return r.status_code, (r.text if r.ok else "")

def html_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()
    node = soup.find("main") or soup.find("article") or (soup.body if soup.body else soup)
    text = node.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def extract_links(base_url: str, html: str) -> list[str]:
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("#") or href.lower().startswith("mailto:") or href.lower().startswith("tel:"):
            continue
        u = normalize_url(urljoin(base_url, href))
        if u.startswith(("http://", "https://")):
            out.append(u)
    seen, uniq = set(), []
    for u in out:
        if u not in seen:
            seen.add(u)
            uniq.append(u)
    return uniq

def score_competitive_link(url: str) -> float:
    """
    Prefer pages that might contain comparisons, alternatives, or category statements.
    """
    u = normalize_url(url)
    p = (urlparse(u).path or "/").lower()
    score = 0.0
    # strong signals
    if any(x in p for x in ["compare", "comparison", "vs", "alternative", "competitor"]):
        score += 4.0
    # medium signals
    for k in ["pricing", "docs", "api", "blog", "press", "news", "research", "product", "solutions"]:
        if k in p:
            score += 1.0
    if is_asset_url(u):
        score -= 4.0
    if p.count("/") >= 4:
        score -= 0.4
    return score

# ------------------------------------------------------------
# Google CSE
# ------------------------------------------------------------
def google_cse_search(query: str, num: int = 5) -> list[dict]:
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": GOOGLE_API_KEY, "cx": GOOGLE_CSE_CX, "q": query, "num": min(max(num, 1), 10)}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    items = data.get("items", []) or []
    return [{
        "title": it.get("title"),
        "link": normalize_url(it.get("link") or ""),
        "snippet": it.get("snippet"),
        "displayLink": it.get("displayLink"),
    } for it in items]

# ------------------------------------------------------------
# Build official targets (light)
# ------------------------------------------------------------
official_home = normalize_url(official_website)
rp_official = build_robot_parser(official_home)

status, html = fetch_html(official_home)
time.sleep(SLEEP_SEC)
if status >= 400 or not html:
    raise RuntimeError(f"Failed to fetch official homepage: HTTP {status}")

home_links = extract_links(official_home, html)
internal_links = [l for l in home_links if same_domain(l, official_home) and not is_asset_url(l)]

explicit = []
for p in COMP_KEY_PATHS:
    u = normalize_url(urljoin(official_home, p))
    if robots_allows(rp_official, u) and not is_asset_url(u):
        explicit.append(u)

candidates = list(dict.fromkeys([official_home] + explicit + internal_links))
scored = [{"url": u, "score": score_competitive_link(u)} for u in candidates]
scored.sort(key=lambda x: x["score"], reverse=True)

official_targets = []
for it in scored:
    if len(official_targets) >= MAX_OFFICIAL_PAGES:
        break
    if it["score"] < 0.2:
        continue
    if robots_allows(rp_official, it["url"]):
        official_targets.append(it["url"])

official_targets = list(dict.fromkeys(official_targets))
print(f"Official competitive targets: {len(official_targets)}")

# ------------------------------------------------------------
# Build CSE targets (competitors + alternatives + comparisons)
# ------------------------------------------------------------
tag_hint = " ".join(positioning_tags) if positioning_tags else ""
queries = [
    f'"{startup_name}" competitor OR competitors OR 競合',
    f'"{startup_name}" alternative OR alternatives OR 代替',
    f'"{startup_name}" vs',
    f'"{startup_name}" comparison OR compare',
    f'"{startup_name}" competitive landscape',
    f'"{startup_name}" market category {tag_hint}'.strip(),
    f'"{startup_name}" pricing compare',
    f'"{startup_name}" feature comparison',
    f'"{startup_name}" interview competitor',
    f'"{startup_name}" 評判 OR レビュー OR 比較',
]

# Optional: if you know the product category keywords, add them here
# e.g., queries.append(f'"{startup_name}" "agentic AI" competitor')

cse_items = []
for q in queries:
    try:
        cse_items.extend(google_cse_search(q, num=5))
        time.sleep(0.2)
    except Exception:
        pass

seen = set()
cse_links = []
for it in cse_items:
    link = it.get("link")
    if link and link not in seen and link.startswith(("http://", "https://")) and not is_asset_url(link):
        seen.add(link)
        cse_links.append(link)

cse_links = cse_links[:MAX_CSE_PAGES]
print(f"CSE competitive targets: {len(cse_links)}")

# ------------------------------------------------------------
# Fetch + store raw pages (JSONL)
# ------------------------------------------------------------
RAW_PATH = ART_DIR / f"competitive_raw_pages_{run_id}.jsonl"
RAW_PATH.write_text("", encoding="utf-8")

def append_jsonl(path: Path, obj: dict):
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def collect_pages(urls: list[str], source_type: str):
    for u in urls:
        u = normalize_url(u)
        if not u or is_asset_url(u):
            continue

        rp = rp_official if source_type == "official" else get_rp_for_url(u)
        allowed = robots_allows(rp, u)

        if not allowed:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": False,
                "text": "",
                "notes": "Skipped due to robots.txt (or conservative fallback).",
            })
            continue

        try:
            code, html = fetch_html(u)
            time.sleep(SLEEP_SEC)
            text = html_to_text(html) if html else ""
            text = text[:MAX_TEXT_CHARS_STORE]

            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": code,
                "robots_allowed": True,
                "text": text,
                "text_char_len": len(text),
            })
        except Exception as e:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": True,
                "text": "",
                "notes": f"Fetch error: {str(e)}",
            })

collect_pages(official_targets, "official")
collect_pages(cse_links, "cse")

print(f"✅ Competitive raw pages saved: {RAW_PATH.as_posix()}")

# ------------------------------------------------------------
# Load raw pages + shortlist to LLM (prefer compare-ish URLs)
# ------------------------------------------------------------
def load_raw_pages(path: Path) -> list[dict]:
    rows = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))
    return rows

raw_pages = load_raw_pages(RAW_PATH)

usable = [
    r for r in raw_pages
    if r.get("robots_allowed")
    and isinstance(r.get("http_status"), int) and r["http_status"] < 400
    and (r.get("text") or "").strip()
]

def score_page_for_competition(r):
    url = r.get("url") or ""
    text = r.get("text") or ""
    p = (urlparse(url).path or "").lower()
    s = 0
    for k in ["compare", "comparison", "vs", "alternative", "competitor", "pricing"]:
        if k in p:
            s += 3
    if re.search(r"\b(vs\.?|versus|alternative|competitor|competition|rival)\b", text, flags=re.IGNORECASE):
        s += 2
    s += min(len(text), 20000) / 7000
    if r.get("source_type") == "official":
        s += 2
    return s

usable_sorted = sorted(usable, key=score_page_for_competition, reverse=True)
shortlist = usable_sorted[:MAX_PAGES_TO_LLM]
print(f"Usable pages: {len(usable)} | Shortlist to LLM: {len(shortlist)}")

# ------------------------------------------------------------
# Robust JSON parsing helper
# ------------------------------------------------------------
def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

# ------------------------------------------------------------
# OpenAI: competitive summary + competitor table + claims + content index
# ------------------------------------------------------------
def openai_extract_competitive(pages: list[dict]) -> dict:
    system = (
        "You are a research analyst assistant. "
        "From the provided webpage texts, extract an evidence-linked competitive landscape. "
        "CRITICAL: Always produce a non-empty content_index if there are relevant pages. "
        "Only include competitors/alternatives if supported by an evidence_url from the provided pages. "
        "Return a single JSON object only (no markdown)."
    )

    schema = {
        "competitive_summary": {
            "company_name": "string",
            "category_description": "string|null",
            "positioning_statements": "array of strings",
            "differentiation_points": "array of strings",
            "competitive_dynamics_notes": "array of strings"
        },
        "competitors": [
            {
                "name": "string",
                "type": "direct|indirect|adjacent|incumbent|open_source|other",
                "why_it_is_a_competitor": "string",
                "comparison_points": "array of strings",
                "evidence_url": "string",
                "confidence": "0..1"
            }
        ],
        "competitive_claims": [
            {
                "category": "competitor|alternative|positioning|differentiation|category|other",
                "claim_type": "fact|hypothesis",
                "claim": "string",
                "evidence_url": "string",
                "source_type": "official|cse",
                "confidence": "0..1"
            }
        ],
        "content_index": [
            {
                "content_type": "article|interview|blog|press|comparison|review|other",
                "title": "string|null",
                "publisher_or_platform": "string|null",
                "date": "string|null",
                "url": "string",
                "what_it_contains": "string",
                "relevance": "0..1",
                "confidence": "0..1"
            }
        ],
        "notes": {"data_gaps": "array of strings", "conflicts": "array of strings"},
        "confidence": {"overall": "0..1", "rationale": "string"}
    }

    bundle = {
        "startup_name": startup_name,
        "official_website": official_home,
        "pages": [
            {
                "url": p["url"],
                "source_type": p.get("source_type"),
                "domain": p.get("domain"),
                "text": (p.get("text") or "")[:MAX_TEXT_CHARS_PROMPT],
            } for p in pages
        ],
        "schema": schema,
        "instructions": [
            "Prefer official sources where possible for positioning statements.",
            "For competitors, include evidence_url that actually names the competitor or implies it clearly.",
            "For content_index, include at least 8 items if available."
        ]
    }

    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
            response_format={"type": "json_object"},
        )
        return json.loads((resp.choices[0].message.content or "").strip())
    except Exception:
        resp2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
        )
        raw = resp2.choices[0].message.content or ""
        candidate = _extract_json_object(raw)
        if candidate:
            return json.loads(candidate)
        raise RuntimeError("Model did not return a JSON object.")

extracted = openai_extract_competitive(shortlist)

# ------------------------------------------------------------
# Save JSON + build DataFrames
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"competitive_extraction_{run_id}.json"
JSON_PATH.write_text(json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8")

summary = extracted.get("competitive_summary", {}) or {}
competitors = extracted.get("competitors", []) or []
claims = extracted.get("competitive_claims", []) or []
content = extracted.get("content_index", []) or []

df_summary = pd.DataFrame([{"run_id": run_id, **summary}])
df_competitors = pd.DataFrame(competitors)
df_claims = pd.DataFrame(claims)
df_content = pd.DataFrame(content)

for df in [df_competitors, df_claims, df_content]:
    if not df.empty:
        df.insert(0, "run_id", run_id)

SUMMARY_CSV = ART_DIR / f"competitive_summary_{run_id}.csv"
COMP_CSV = ART_DIR / f"competitor_table_{run_id}.csv"
CLAIMS_CSV = ART_DIR / f"competitive_claims_{run_id}.csv"
CONTENT_CSV = ART_DIR / f"competitive_content_index_{run_id}.csv"

df_summary.to_csv(SUMMARY_CSV, index=False)
df_competitors.to_csv(COMP_CSV, index=False)
df_claims.to_csv(CLAIMS_CSV, index=False)
df_content.to_csv(CONTENT_CSV, index=False)

print("✅ Competitive Landscape complete")
print(f"- Raw pages: {RAW_PATH.as_posix()}")
print(f"- JSON: {JSON_PATH.as_posix()}")
print(f"- Summary CSV: {SUMMARY_CSV.as_posix()}")
print(f"- Competitors CSV: {COMP_CSV.as_posix()}")
print(f"- Claims CSV: {CLAIMS_CSV.as_posix()}")
print(f"- Content CSV: {CONTENT_CSV.as_posix()}")

display(df_summary)
display(df_competitors.head(30) if not df_competitors.empty else df_competitors)
display(df_claims.head(30) if not df_claims.empty else df_claims)
display(df_content.head(30) if not df_content.empty else df_content)


Official competitive targets: 9
CSE competitive targets: 33
✅ Competitive raw pages saved: artifacts/meeting_deep_dive/competitive_raw_pages_20260107_225628.jsonl
Usable pages: 19 | Shortlist to LLM: 19
✅ Competitive Landscape complete
- Raw pages: artifacts/meeting_deep_dive/competitive_raw_pages_20260107_225628.jsonl
- JSON: artifacts/meeting_deep_dive/competitive_extraction_20260107_225628.json
- Summary CSV: artifacts/meeting_deep_dive/competitive_summary_20260107_225628.csv
- Competitors CSV: artifacts/meeting_deep_dive/competitor_table_20260107_225628.csv
- Claims CSV: artifacts/meeting_deep_dive/competitive_claims_20260107_225628.csv
- Content CSV: artifacts/meeting_deep_dive/competitive_content_index_20260107_225628.csv


Unnamed: 0,run_id,company_name,category_description,positioning_statements,differentiation_points,competitive_dynamics_notes
0,20260107_225628,Sakana AI,AI research and development focused on nature-...,[Sakana AI aims to develop transformative AI t...,[Nature-inspired intelligence and evolutionary...,[Sakana AI is positioned uniquely in the Japan...


Unnamed: 0,run_id,name,type,why_it_is_a_competitor,comparison_points,evidence_url,confidence
0,20260107_225628,OpenAI,direct,OpenAI develops advanced AI models and technol...,[Both companies focus on developing large-scal...,https://www.promptloop.com/directory/what-does...,0.9
1,20260107_225628,Google DeepMind,direct,DeepMind is a leader in AI research and develo...,[Both companies are involved in cutting-edge A...,https://www.promptloop.com/directory/what-does...,0.85
2,20260107_225628,Anthropic,direct,Anthropic focuses on AI safety and developing ...,[Both companies are innovating in the AI space...,https://www.promptloop.com/directory/what-does...,0.8


Unnamed: 0,run_id,category,claim_type,claim,evidence_url,source_type,confidence
0,20260107_225628,positioning,fact,"Sakana AI is focused on developing efficient, ...",https://siliconangle.com/2025/11/17/sakana-ai-...,cse,0.95
1,20260107_225628,differentiation,fact,Sakana AI's approach emphasizes collective int...,https://www.promptloop.com/directory/what-does...,cse,0.9


Unnamed: 0,run_id,content_type,title,publisher_or_platform,date,url,what_it_contains,relevance,confidence
0,20260107_225628,article,Sakana AI lands $135M on $2.635B valuation to ...,SiliconANGLE,"November 17, 2025",https://siliconangle.com/2025/11/17/sakana-ai-...,Details on Sakana AI's funding and strategic f...,1,0.95
1,20260107_225628,blog,What Does Sakana AI Do? - Company Overview,PromptLoop,January 2025,https://www.promptloop.com/directory/what-does...,"Overview of Sakana AI's mission, products, and...",1,0.9
2,20260107_225628,press,Announcing Our Series A,Sakana AI,"September 04, 2024",https://sakana.ai/series-a/,Details on Sakana AI's Series A funding round ...,1,0.9
3,20260107_225628,article,Sakana AI Agent Wins AtCoder Heuristic Contest...,Sakana AI,"January 05, 2026",https://sakana.ai/blog,Announcement of Sakana AI's agent winning a co...,1,0.9
4,20260107_225628,article,EDINET-Bench: Evaluating LLMs on Complex Finan...,Sakana AI,"June 09, 2025",https://sakana.ai/edinet-bench/,Introduction of a benchmark for evaluating LLM...,1,0.85
5,20260107_225628,article,Population-based Model Merging via Quality Div...,Sakana AI,"December 03, 2024",https://sakana.ai/cycleqd/,Research on evolving AI models through populat...,1,0.85
6,20260107_225628,article,The AI Scientist: Towards Fully Automated Open...,Sakana AI,"August 13, 2024",https://sakana.ai/ai-scientist/,Overview of Sakana AI's AI Scientist project f...,1,0.85
7,20260107_225628,article,An Evolved Universal Transformer Memory,Sakana AI,"December 10, 2024",https://sakana.ai/namm/,Introduction of a new memory system for transf...,1,0.85


In [17]:
# ============================================================
# 9. Funding & Cap Table Signals (Evidence-linked + Content Index)
# ============================================================
# Goal:
# - Build an evidence-linked view of funding & cap table signals:
#   - Funding events (round type, amount, date, lead/participants) when mentioned
#   - Investor names and roles (lead/participant/strategic) when mentioned
#   - Cap table signals (ownership hints, board seats, strategic partners) when mentioned
#   - Hiring / runway / burn / revenue signals when mentioned (as weak proxies)
# - Identify public narratives: funding announcements, interviews, articles, filings.
# - Use the same pattern:
#   official pages first + CSE enrichment, robots check,
#   store raw pages as JSONL, then OpenAI normalizes into:
#     1) funding_summary (single object)
#     2) funding_events (table)
#     3) investor_mentions (table)
#     4) cap_table_signals (table)
#     5) funding_claims (evidence-linked)
#     6) funding_content_index (posts/interviews/articles with URLs)
#
# Outputs:
# - funding_raw_pages_<run_id>.jsonl
# - funding_extraction_<run_id>.json
# - funding_summary_<run_id>.csv
# - funding_events_<run_id>.csv
# - investor_mentions_<run_id>.csv
# - cap_table_signals_<run_id>.csv
# - funding_claims_<run_id>.csv
# - funding_content_index_<run_id>.csv

import os
import re
import json
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone
from urllib.robotparser import RobotFileParser
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")
if not GOOGLE_API_KEY or not GOOGLE_CSE_CX:
    raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CSE_CX must be set in env.txt.")

client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Load inputs + entity
# ------------------------------------------------------------
def load_latest_json(pattern: str) -> tuple[dict, Path]:
    files = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError(f"No {pattern} found. Please run previous cells.")
    p = files[0]
    return json.loads(p.read_text(encoding="utf-8")), p

try:
    run_id = inputs["meta"]["run_id"]
except Exception:
    inp, _ = load_latest_json("inputs_*.json")
    run_id = inp.get("meta", {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

entity, _ = load_latest_json("entity_*.json")
startup_name = (entity.get("canonical_name") or "Unknown Startup").strip()
official_website = (entity.get("official_website") or "").strip()
if not official_website:
    raise ValueError("official_website is missing. Run cell #2 first.")

# Optional: use Company Basics extraction as hints
funding_hint_urls = []
try:
    cb_paths = sorted(ART_DIR.glob("company_claims_*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
    if cb_paths:
        df_cb = pd.read_csv(cb_paths[0])
        # Keep only funding-related claims evidence URLs
        if "category" in df_cb.columns and "evidence_url" in df_cb.columns:
            funding_hint_urls = df_cb[df_cb["category"].astype(str).str.contains("funding", case=False, na=False)]["evidence_url"].dropna().unique().tolist()
except Exception:
    pass

# ------------------------------------------------------------
# Knobs
# ------------------------------------------------------------
USER_AGENT = "researchOSv2-bot/0.3 (+contact: internal-research)"
TIMEOUT = 25
SLEEP_SEC = 0.25

MAX_OFFICIAL_PAGES = 25
MAX_CSE_PAGES = 50
MAX_TEXT_CHARS_STORE = 70000
MAX_TEXT_CHARS_PROMPT = 9000
MAX_PAGES_TO_LLM = 26

FUND_KEY_PATHS = [
    "/press", "/news", "/blog", "/updates",
    "/investor", "/investors", "/funding", "/ir",
]
FUND_KEYWORDS = [
    "seed", "series", "funding", "raised", "round", "investment", "investor",
    "press", "news", "blog", "ir", "investor"
]

# ------------------------------------------------------------
# URL / domain / robots helpers
# ------------------------------------------------------------
def normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url, flags=re.I):
        url = "https://" + url
    p = urlparse(url)
    return p._replace(netloc=p.netloc.lower(), fragment="").geturl()

def get_domain(url: str) -> str:
    return re.sub(r"^www\.", "", urlparse(normalize_url(url)).netloc)

def same_domain(a: str, b: str) -> bool:
    da, db = get_domain(a), get_domain(b)
    return da == db or da.endswith("." + db) or db.endswith("." + da)

def is_asset_url(url: str) -> bool:
    path = (urlparse(url).path or "").lower()
    return bool(re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|zip|mp4|mov)$", path))

def build_robot_parser(base_url: str) -> RobotFileParser | None:
    rp = RobotFileParser()
    rp.set_url(urljoin(normalize_url(base_url), "/robots.txt"))
    try:
        rp.read()
        return rp
    except Exception:
        return None

def robots_allows(rp: RobotFileParser | None, url: str) -> bool:
    if rp is None:
        p = (urlparse(url).path or "/").lower()
        return p in ["/", ""] or any(k in p for k in FUND_KEYWORDS)
    try:
        return rp.can_fetch(USER_AGENT, url)
    except Exception:
        return False

_robot_cache: dict[str, RobotFileParser | None] = {}

def get_rp_for_url(url: str) -> RobotFileParser | None:
    dom = get_domain(url)
    if dom in _robot_cache:
        return _robot_cache[dom]
    rp = build_robot_parser(url)
    _robot_cache[dom] = rp
    return rp

# ------------------------------------------------------------
# Fetch + text extraction
# ------------------------------------------------------------
def fetch_html(url: str) -> tuple[int, str]:
    r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT, allow_redirects=True)
    return r.status_code, (r.text if r.ok else "")

def html_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()
    node = soup.find("main") or soup.find("article") or (soup.body if soup.body else soup)
    text = node.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def extract_links(base_url: str, html: str) -> list[str]:
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("#") or href.lower().startswith("mailto:") or href.lower().startswith("tel:"):
            continue
        u = normalize_url(urljoin(base_url, href))
        if u.startswith(("http://", "https://")):
            out.append(u)
    seen, uniq = set(), []
    for u in out:
        if u not in seen:
            seen.add(u)
            uniq.append(u)
    return uniq

def score_funding_link(url: str) -> float:
    p = (urlparse(url).path or "/").lower()
    score = 0.0
    # funding-ish
    if any(k in p for k in ["funding", "invest", "investor", "ir", "seed", "series", "round"]):
        score += 3.0
    # press/news
    for k in ["press", "news", "blog", "updates"]:
        if k in p:
            score += 1.0
    if is_asset_url(url):
        score -= 4.0
    if p.count("/") >= 4:
        score -= 0.4
    return score

# ------------------------------------------------------------
# Google CSE
# ------------------------------------------------------------
def google_cse_search(query: str, num: int = 5) -> list[dict]:
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": GOOGLE_API_KEY, "cx": GOOGLE_CSE_CX, "q": query, "num": min(max(num, 1), 10)}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    items = data.get("items", []) or []
    return [{
        "title": it.get("title"),
        "link": normalize_url(it.get("link") or ""),
        "snippet": it.get("snippet"),
        "displayLink": it.get("displayLink"),
    } for it in items]

# ------------------------------------------------------------
# Official targets
# ------------------------------------------------------------
official_home = normalize_url(official_website)
rp_official = build_robot_parser(official_home)

status, html = fetch_html(official_home)
time.sleep(SLEEP_SEC)
if status >= 400 or not html:
    raise RuntimeError(f"Failed to fetch official homepage: HTTP {status}")

home_links = extract_links(official_home, html)
internal_links = [l for l in home_links if same_domain(l, official_home) and not is_asset_url(l)]

explicit = []
for p in FUND_KEY_PATHS:
    u = normalize_url(urljoin(official_home, p))
    if robots_allows(rp_official, u) and not is_asset_url(u):
        explicit.append(u)

# Add hint URLs (from Company Basics) if any
hint_urls = [normalize_url(u) for u in funding_hint_urls if same_domain(u, official_home)]
candidates = list(dict.fromkeys([official_home] + explicit + hint_urls + internal_links))

scored = [{"url": u, "score": score_funding_link(u)} for u in candidates]
scored.sort(key=lambda x: x["score"], reverse=True)

official_targets = []
for it in scored:
    if len(official_targets) >= MAX_OFFICIAL_PAGES:
        break
    if it["score"] < 0.2:
        continue
    if robots_allows(rp_official, it["url"]):
        official_targets.append(it["url"])

official_targets = list(dict.fromkeys(official_targets))
print(f"Official funding targets: {len(official_targets)}")

# ------------------------------------------------------------
# CSE targets (funding + investors + cap table hints)
# ------------------------------------------------------------
queries = [
    f'"{startup_name}" raised OR raises OR funding OR 資金調達',
    f'"{startup_name}" seed round OR Series A OR Series B OR シード',
    f'"{startup_name}" investors OR investor OR VC OR venture capital',
    f'"{startup_name}" cap table OR ownership OR board seat',
    f'"{startup_name}" valuation OR post-money OR pre-money OR 評価額',
    f'"{startup_name}" lead investor OR led by',
    f'"{startup_name}" strategic investment OR corporate venture',
    f'"{startup_name}" press release funding OR プレスリリース 資金調達',
    f'"{startup_name}" Crunchbase OR PitchBook',
]

cse_items = []
for q in queries:
    try:
        cse_items.extend(google_cse_search(q, num=5))
        time.sleep(0.2)
    except Exception:
        pass

seen = set()
cse_links = []
for it in cse_items:
    link = it.get("link")
    if link and link not in seen and link.startswith(("http://", "https://")) and not is_asset_url(link):
        seen.add(link)
        cse_links.append(link)

cse_links = cse_links[:MAX_CSE_PAGES]
print(f"CSE funding targets: {len(cse_links)}")

# ------------------------------------------------------------
# Fetch + store raw pages (JSONL)
# ------------------------------------------------------------
RAW_PATH = ART_DIR / f"funding_raw_pages_{run_id}.jsonl"
RAW_PATH.write_text("", encoding="utf-8")

def append_jsonl(path: Path, obj: dict):
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def collect_pages(urls: list[str], source_type: str):
    for u in urls:
        u = normalize_url(u)
        if not u or is_asset_url(u):
            continue

        rp = rp_official if source_type == "official" else get_rp_for_url(u)
        allowed = robots_allows(rp, u)

        if not allowed:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": False,
                "text": "",
                "notes": "Skipped due to robots.txt (or conservative fallback).",
            })
            continue

        try:
            code, html = fetch_html(u)
            time.sleep(SLEEP_SEC)
            text = html_to_text(html) if html else ""
            text = text[:MAX_TEXT_CHARS_STORE]

            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": code,
                "robots_allowed": True,
                "text": text,
                "text_char_len": len(text),
            })
        except Exception as e:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": True,
                "text": "",
                "notes": f"Fetch error: {str(e)}",
            })

collect_pages(official_targets, "official")
collect_pages(cse_links, "cse")

print(f"✅ Funding raw pages saved: {RAW_PATH.as_posix()}")

# ------------------------------------------------------------
# Load raw pages + shortlist to LLM (funding-heavy URLs first)
# ------------------------------------------------------------
def load_raw_pages(path: Path) -> list[dict]:
    rows = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))
    return rows

raw_pages = load_raw_pages(RAW_PATH)

usable = [
    r for r in raw_pages
    if r.get("robots_allowed")
    and isinstance(r.get("http_status"), int) and r["http_status"] < 400
    and (r.get("text") or "").strip()
]

def score_page_for_funding(r):
    url = r.get("url") or ""
    text = r.get("text") or ""
    p = (urlparse(url).path or "").lower()
    s = 0
    for k in ["seed", "series", "fund", "invest", "investor", "round", "valuation", "cap", "board"]:
        if k in p:
            s += 2
    if re.search(r"\b(seed|series|raised|funding|investor|valuation|cap table|board)\b", text, flags=re.IGNORECASE):
        s += 2
    if re.search(r"(資金調達|ラウンド|投資家|評価額|株主|持分|取締役)", text):
        s += 2
    s += min(len(text), 20000) / 7000
    if r.get("source_type") == "official":
        s += 2
    return s

usable_sorted = sorted(usable, key=score_page_for_funding, reverse=True)
shortlist = usable_sorted[:MAX_PAGES_TO_LLM]
print(f"Usable pages: {len(usable)} | Shortlist to LLM: {len(shortlist)}")

# ------------------------------------------------------------
# Robust JSON parsing helper
# ------------------------------------------------------------
def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

# ------------------------------------------------------------
# OpenAI: funding summary + events + investors + cap signals + content index
# ------------------------------------------------------------
def openai_extract_funding(pages: list[dict]) -> dict:
    system = (
        "You are a research analyst assistant. "
        "From the provided webpage texts, extract evidence-linked funding and cap table signals. "
        "CRITICAL: Always produce a non-empty content_index if there are relevant pages. "
        "Do not guess missing numbers/dates—only extract what is stated. "
        "Return a single JSON object only (no markdown)."
    )

    schema = {
        "funding_summary": {
            "company_name": "string",
            "latest_round_hint": "string|null",
            "funding_stage_hint": "string|null",
            "total_funding_mentioned": "array of strings",
            "valuation_signals": "array of strings",
            "runway_burn_signals": "array of strings",
            "notable_investors": "array of strings"
        },
        "funding_events": [
            {
                "round_label": "string|null",
                "date": "string|null",
                "amount": "string|null",
                "lead_investor": "string|null",
                "investors_participants": "array of strings",
                "source_url": "string",
                "confidence": "0..1"
            }
        ],
        "investor_mentions": [
            {
                "investor_name": "string",
                "mention_context": "string",
                "type_hint": "lead|participant|strategic|other",
                "source_url": "string",
                "confidence": "0..1"
            }
        ],
        "cap_table_signals": [
            {
                "signal_type": "board_seat|ownership_hint|strategic_partner|secondary|other",
                "signal": "string",
                "source_url": "string",
                "confidence": "0..1"
            }
        ],
        "funding_claims": [
            {
                "category": "round|investor|amount|valuation|cap_table|other",
                "claim_type": "fact|hypothesis",
                "claim": "string",
                "evidence_url": "string",
                "source_type": "official|cse",
                "confidence": "0..1"
            }
        ],
        "content_index": [
            {
                "content_type": "press_release|article|interview|database|blog|other",
                "title": "string|null",
                "publisher_or_platform": "string|null",
                "date": "string|null",
                "url": "string",
                "what_it_contains": "string",
                "relevance": "0..1",
                "confidence": "0..1"
            }
        ],
        "notes": {"data_gaps": "array of strings", "conflicts": "array of strings"},
        "confidence": {"overall": "0..1", "rationale": "string"}
    }

    bundle = {
        "startup_name": startup_name,
        "official_website": official_home,
        "pages": [
            {
                "url": p["url"],
                "source_type": p.get("source_type"),
                "domain": p.get("domain"),
                "text": (p.get("text") or "")[:MAX_TEXT_CHARS_PROMPT],
            } for p in pages
        ],
        "schema": schema,
        "instructions": [
            "Prefer official press releases for funding events.",
            "Extract investors as they appear; do not deduplicate aggressively if contexts differ.",
            "If Crunchbase/PitchBook is paywalled, still capture what is visible (title/snippet-level hints) as low confidence."
        ]
    }

    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
            response_format={"type": "json_object"},
        )
        return json.loads((resp.choices[0].message.content or "").strip())
    except Exception:
        resp2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
        )
        raw = resp2.choices[0].message.content or ""
        candidate = _extract_json_object(raw)
        if candidate:
            return json.loads(candidate)
        raise RuntimeError("Model did not return a JSON object.")

extracted = openai_extract_funding(shortlist)

# ------------------------------------------------------------
# Save JSON + build DataFrames
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"funding_extraction_{run_id}.json"
JSON_PATH.write_text(json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8")

summary = extracted.get("funding_summary", {}) or {}
events = extracted.get("funding_events", []) or []
investors = extracted.get("investor_mentions", []) or []
cap_signals = extracted.get("cap_table_signals", []) or []
claims = extracted.get("funding_claims", []) or []
content = extracted.get("content_index", []) or []

df_summary = pd.DataFrame([{"run_id": run_id, **summary}])
df_events = pd.DataFrame(events)
df_investors = pd.DataFrame(investors)
df_cap = pd.DataFrame(cap_signals)
df_claims = pd.DataFrame(claims)
df_content = pd.DataFrame(content)

for df in [df_events, df_investors, df_cap, df_claims, df_content]:
    if not df.empty:
        df.insert(0, "run_id", run_id)

SUMMARY_CSV = ART_DIR / f"funding_summary_{run_id}.csv"
EVENTS_CSV = ART_DIR / f"funding_events_{run_id}.csv"
INV_CSV = ART_DIR / f"investor_mentions_{run_id}.csv"
CAP_CSV = ART_DIR / f"cap_table_signals_{run_id}.csv"
CLAIMS_CSV = ART_DIR / f"funding_claims_{run_id}.csv"
CONTENT_CSV = ART_DIR / f"funding_content_index_{run_id}.csv"

df_summary.to_csv(SUMMARY_CSV, index=False)
df_events.to_csv(EVENTS_CSV, index=False)
df_investors.to_csv(INV_CSV, index=False)
df_cap.to_csv(CAP_CSV, index=False)
df_claims.to_csv(CLAIMS_CSV, index=False)
df_content.to_csv(CONTENT_CSV, index=False)

print("✅ Funding & Cap Table Signals complete")
print(f"- Raw pages: {RAW_PATH.as_posix()}")
print(f"- JSON: {JSON_PATH.as_posix()}")
print(f"- Summary CSV: {SUMMARY_CSV.as_posix()}")
print(f"- Events CSV: {EVENTS_CSV.as_posix()}")
print(f"- Investor mentions CSV: {INV_CSV.as_posix()}")
print(f"- Cap table signals CSV: {CAP_CSV.as_posix()}")
print(f"- Claims CSV: {CLAIMS_CSV.as_posix()}")
print(f"- Content CSV: {CONTENT_CSV.as_posix()}")

display(df_summary)
display(df_events.head(30) if not df_events.empty else df_events)
display(df_investors.head(30) if not df_investors.empty else df_investors)
display(df_cap.head(30) if not df_cap.empty else df_cap)
display(df_claims.head(30) if not df_claims.empty else df_claims)
display(df_content.head(30) if not df_content.empty else df_content)


Official funding targets: 10
CSE funding targets: 31
✅ Funding raw pages saved: artifacts/meeting_deep_dive/funding_raw_pages_20260107_225628.jsonl
Usable pages: 23 | Shortlist to LLM: 23
✅ Funding & Cap Table Signals complete
- Raw pages: artifacts/meeting_deep_dive/funding_raw_pages_20260107_225628.jsonl
- JSON: artifacts/meeting_deep_dive/funding_extraction_20260107_225628.json
- Summary CSV: artifacts/meeting_deep_dive/funding_summary_20260107_225628.csv
- Events CSV: artifacts/meeting_deep_dive/funding_events_20260107_225628.csv
- Investor mentions CSV: artifacts/meeting_deep_dive/investor_mentions_20260107_225628.csv
- Cap table signals CSV: artifacts/meeting_deep_dive/cap_table_signals_20260107_225628.csv
- Claims CSV: artifacts/meeting_deep_dive/funding_claims_20260107_225628.csv
- Content CSV: artifacts/meeting_deep_dive/funding_content_index_20260107_225628.csv


Unnamed: 0,run_id,company_name,latest_round_hint,funding_stage_hint,total_funding_mentioned,valuation_signals,runway_burn_signals,notable_investors
0,20260107_225628,Sakana AI,Series B,Growth,"[¥52 billion (approximately $347 million), ¥20...",[Post-money valuation of approximately ¥400 bi...,[],"[Mitsubishi UFJ Financial Group (MUFG), Khosla..."


Unnamed: 0,run_id,round_label,date,amount,lead_investor,investors_participants,source_url,confidence
0,20260107_225628,Series B,"November 17, 2025",¥20 billion (approximately $135 million),,"[Mitsubishi UFJ Financial Group (MUFG), Khosla...",https://techcrunch.com/2025/11/17/sakana-ai-ra...,1
1,20260107_225628,Series A,"September 4, 2024",¥30 billion (approximately $214 million),New Enterprise Associates,"[Khosla Ventures, Lux Capital, NVIDIA, Mitsubi...",https://sakana.ai/series-a/,1
2,20260107_225628,Seed Round,"January 16, 2024",¥4.5 billion (approximately $30 million),Lux Capital,"[Khosla Ventures, NTT Group, KDDI CVC, Sony Gr...",https://sakana.ai/seed-round/,1


Unnamed: 0,run_id,investor_name,mention_context,type_hint,source_url,confidence
0,20260107_225628,Mitsubishi UFJ Financial Group (MUFG),Participated in Series B and Series A funding ...,strategic,https://techcrunch.com/2025/11/17/sakana-ai-ra...,1
1,20260107_225628,Khosla Ventures,"Participated in Series B, Series A, and Seed R...",lead,https://techcrunch.com/2025/11/17/sakana-ai-ra...,1
2,20260107_225628,New Enterprise Associates (NEA),Lead investor in Series A.,lead,https://sakana.ai/series-a/,1
3,20260107_225628,Lux Capital,Lead investor in Seed Round and participated i...,lead,https://sakana.ai/seed-round/,1
4,20260107_225628,In-Q-Tel,Participated in Series B.,strategic,https://techcrunch.com/2025/11/17/sakana-ai-ra...,1
5,20260107_225628,Factorial Funds,Participated in Series B.,participant,https://thesaasnews.com/news/sakana-ai-secures...,1
6,20260107_225628,Macquarie Capital,Participated in Series B.,participant,https://thesaasnews.com/news/sakana-ai-secures...,1
7,20260107_225628,Mouro Capital,Participated in Series B.,participant,https://thesaasnews.com/news/sakana-ai-secures...,1
8,20260107_225628,Geodesic Capital,Participated in Series B.,participant,https://thesaasnews.com/news/sakana-ai-secures...,1
9,20260107_225628,Ora Global,Participated in Series B.,participant,https://thesaasnews.com/news/sakana-ai-secures...,1


Unnamed: 0,run_id,signal_type,signal,source_url,confidence
0,20260107_225628,ownership_hint,NTT Group became the largest shareholder in Sa...,https://nttdocomo-v.com/en/news/ibcqisb5bp/,1


Unnamed: 0,run_id,content_type,title,publisher_or_platform,date,url,what_it_contains,relevance,confidence
0,20260107_225628,article,Sakana AI Raises $135M as Japan Pushes Soverei...,The Corporate World,"December 4, 2025",https://www.thecorporate.world/post/sakana-ai-...,Details on Sakana AI's Series B funding and it...,1,1
1,20260107_225628,article,Sakana AI lands $135M on $2.635B valuation to ...,SiliconANGLE,"November 17, 2025",https://siliconangle.com/2025/11/17/sakana-ai-...,Information about Sakana AI's funding round an...,1,1
2,20260107_225628,article,Sakana AI secures $100m in Series A funding round,Yahoo Finance,"September 5, 2024",https://finance.yahoo.com/news/japan-sakana-ai...,Overview of Sakana AI's Series A funding and s...,1,1
3,20260107_225628,article,Sakana AI announces Series A funding,Sakana AI Official,"September 4, 2024",https://sakana.ai/series-a/,Details on the Series A funding round and inve...,1,1
4,20260107_225628,article,Sakana AI raises $30M to develop nature-inspir...,Sakana AI Official,"January 16, 2024",https://sakana.ai/seed-round/,Information on the seed funding round and init...,1,1


In [19]:
# ============================================================
# 10. Recent Changes Timeline (Last N months, Evidence-linked)
# ============================================================
# Goal:
# - Identify meaningful "recent changes" in the last few months:
#   - product launches / releases / research posts
#   - major partnerships / customers / deployments
#   - funding announcements / investor updates
#   - leadership changes / hiring / org changes
# - Build a timeline with:
#   - date (or best-effort month)
#   - event_type + short description
#   - evidence_url + source_type + confidence
#
# Key fixes vs. earlier draft:
# - Progress logging per URL (so it never "looks stuck")
# - Hard per-request timeouts (connect/read)
# - Domain cap + staged CSE fetch (avoid a single slow domain dominating)
# - Safer cutoff computation without pandas DateOffset dependency
#
# Outputs:
# - recent_raw_pages_<run_id>.jsonl
# - recent_timeline_extraction_<run_id>.json
# - recent_timeline_<run_id>.csv
# - recent_content_index_<run_id>.csv

import os
import re
import json
import time
import socket
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone, timedelta
from urllib.robotparser import RobotFileParser
from collections import defaultdict
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")
if not GOOGLE_API_KEY or not GOOGLE_CSE_CX:
    raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CSE_CX must be set.")

client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Load inputs + entity
# ------------------------------------------------------------
def load_latest_json(pattern: str) -> tuple[dict, Path]:
    files = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError(f"No {pattern} found. Please run previous cells.")
    p = files[0]
    return json.loads(p.read_text(encoding="utf-8")), p

try:
    run_id = inputs["meta"]["run_id"]
except Exception:
    inp, _ = load_latest_json("inputs_*.json")
    run_id = inp.get("meta", {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

entity, _ = load_latest_json("entity_*.json")
startup_name = (entity.get("canonical_name") or "Unknown Startup").strip()
official_website = (entity.get("official_website") or "").strip()
if not official_website:
    raise ValueError("official_website is missing. Run cell #2 first.")

# ------------------------------------------------------------
# Time window config (edit as needed)
# ------------------------------------------------------------
MONTHS_BACK = 4
NOW_UTC = datetime.now(timezone.utc)

# Approximate cutoff (30 days per month is enough for a "recent scan")
CUTOFF_UTC = NOW_UTC - timedelta(days=30 * MONTHS_BACK)
CUTOFF_STR = CUTOFF_UTC.strftime("%Y-%m-%d")
print(f"Recent window: last {MONTHS_BACK} months (cutoff ~ {CUTOFF_STR} UTC)")

# ------------------------------------------------------------
# Knobs
# ------------------------------------------------------------
USER_AGENT = "researchOSv2-bot/0.3 (+contact: internal-research)"
TIMEOUT_CONNECT = 8
TIMEOUT_READ = 18
SLEEP_SEC = 0.20

MAX_OFFICIAL_PAGES = 40
MAX_CSE_PAGES = 60
MAX_TEXT_CHARS_STORE = 70000
MAX_TEXT_CHARS_PROMPT = 9000
MAX_PAGES_TO_LLM = 26

# Stage fetch (prevents long hangs / makes debugging easier)
CSE_FETCH_LIMIT_STAGE1 = 20   # first pass
CSE_FETCH_LIMIT_STAGE2 = 50   # optional second pass

RECENT_KEY_PATHS = ["/news", "/press", "/blog", "/updates", "/research"]
RECENT_KEYWORDS = ["news", "press", "blog", "update", "updates", "research", "announcement", "release"]

# Global hard timeout safety net
socket.setdefaulttimeout(20)

# ------------------------------------------------------------
# URL / domain / robots helpers
# ------------------------------------------------------------
def normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url, flags=re.I):
        url = "https://" + url
    p = urlparse(url)
    return p._replace(netloc=p.netloc.lower(), fragment="").geturl()

def get_domain(url: str) -> str:
    return re.sub(r"^www\.", "", urlparse(normalize_url(url)).netloc)

def same_domain(a: str, b: str) -> bool:
    da, db = get_domain(a), get_domain(b)
    return da == db or da.endswith("." + db) or db.endswith("." + da)

def is_asset_url(url: str) -> bool:
    path = (urlparse(url).path or "").lower()
    return bool(re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|zip|mp4|mov)$", path))

def build_robot_parser(base_url: str) -> RobotFileParser | None:
    rp = RobotFileParser()
    rp.set_url(urljoin(normalize_url(base_url), "/robots.txt"))
    try:
        rp.read()
        return rp
    except Exception:
        return None

def robots_allows(rp: RobotFileParser | None, url: str) -> bool:
    if rp is None:
        p = (urlparse(url).path or "/").lower()
        return p in ["/", ""] or any(k in p for k in RECENT_KEYWORDS)
    try:
        return rp.can_fetch(USER_AGENT, url)
    except Exception:
        return False

_robot_cache: dict[str, RobotFileParser | None] = {}

def get_rp_for_url(url: str) -> RobotFileParser | None:
    dom = get_domain(url)
    if dom in _robot_cache:
        return _robot_cache[dom]
    rp = build_robot_parser(url)
    _robot_cache[dom] = rp
    return rp

# ------------------------------------------------------------
# Fetch + text extraction
# ------------------------------------------------------------
def fetch_html(url: str) -> tuple[int, str]:
    r = requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=(TIMEOUT_CONNECT, TIMEOUT_READ),
        allow_redirects=True
    )
    return r.status_code, (r.text if r.ok else "")

def html_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()
    node = soup.find("main") or soup.find("article") or (soup.body if soup.body else soup)
    text = node.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def extract_links(base_url: str, html: str) -> list[str]:
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("#") or href.lower().startswith("mailto:") or href.lower().startswith("tel:"):
            continue
        u = normalize_url(urljoin(base_url, href))
        if u.startswith(("http://", "https://")):
            out.append(u)
    seen, uniq = set(), []
    for u in out:
        if u not in seen:
            seen.add(u)
            uniq.append(u)
    return uniq

def score_recent_link(url: str) -> float:
    p = (urlparse(url).path or "/").lower()
    score = 0.0
    for k in RECENT_KEYWORDS:
        if k in p:
            score += 1.0
    if re.search(r"/20\d{2}/\d{1,2}/", p) or re.search(r"/20\d{2}/", p):
        score += 1.0
    if is_asset_url(url):
        score -= 4.0
    if p.count("/") >= 5:
        score -= 0.3
    return score

# ------------------------------------------------------------
# Google CSE
# ------------------------------------------------------------
def google_cse_search(query: str, num: int = 5) -> list[dict]:
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": GOOGLE_API_KEY, "cx": GOOGLE_CSE_CX, "q": query, "num": min(max(num, 1), 10)}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    items = data.get("items", []) or []
    return [{
        "title": it.get("title"),
        "link": normalize_url(it.get("link") or ""),
        "snippet": it.get("snippet"),
        "displayLink": it.get("displayLink"),
    } for it in items]

def cap_by_domain(urls: list[str], cap: int = 4) -> list[str]:
    buckets = defaultdict(int)
    out = []
    for u in urls:
        d = get_domain(u)
        if buckets[d] < cap:
            out.append(u)
            buckets[d] += 1
    return out

def prefer_dated_urls(urls: list[str], top_n: int = 40) -> list[str]:
    def score(u):
        p = (urlparse(u).path or "").lower()
        s = 0
        if re.search(r"/20\d{2}/\d{1,2}/", p): s += 3
        if re.search(r"/20\d{2}/", p): s += 1
        return s
    return sorted(urls, key=score, reverse=True)[:top_n]

# ------------------------------------------------------------
# Official targets (news/press/blog hubs + internal links)
# ------------------------------------------------------------
official_home = normalize_url(official_website)
rp_official = build_robot_parser(official_home)

status, html = fetch_html(official_home)
time.sleep(SLEEP_SEC)
if status >= 400 or not html:
    raise RuntimeError(f"Failed to fetch official homepage: HTTP {status}")

home_links = extract_links(official_home, html)
internal_links = [l for l in home_links if same_domain(l, official_home) and not is_asset_url(l)]

explicit = []
for p in RECENT_KEY_PATHS:
    u = normalize_url(urljoin(official_home, p))
    if robots_allows(rp_official, u) and not is_asset_url(u):
        explicit.append(u)

candidates = list(dict.fromkeys([official_home] + explicit + internal_links))
scored = [{"url": u, "score": score_recent_link(u)} for u in candidates]
scored.sort(key=lambda x: x["score"], reverse=True)

official_targets = []
for it in scored:
    if len(official_targets) >= MAX_OFFICIAL_PAGES:
        break
    if it["score"] < 0.2:
        continue
    if robots_allows(rp_official, it["url"]):
        official_targets.append(it["url"])

official_targets = list(dict.fromkeys(official_targets))
print(f"Official recent targets: {len(official_targets)}")

# ------------------------------------------------------------
# CSE targets (recent-focused queries)
# ------------------------------------------------------------
# Google CSE doesn't guarantee strict date filters; we bias recall with month tokens
month_tokens = []
for i in range(MONTHS_BACK + 1):
    d = NOW_UTC - timedelta(days=30 * i)
    month_tokens.append(d.strftime("%Y-%m"))

official_domain = get_domain(official_home)

queries = [
    f'"{startup_name}" site:{official_domain} (news OR press OR blog OR update OR research)',
    f'"{startup_name}" (press release OR announcement OR launched OR partnership OR funding) {month_tokens[0]}',
    f'"{startup_name}" (プレスリリース OR 発表 OR リリース OR 提携 OR 資金調達) {month_tokens[0]}',
]
for t in month_tokens:
    queries.append(f'"{startup_name}" {t} (announcement OR press OR blog OR funding OR partnership OR research)')
    queries.append(f'"{startup_name}" {t} (プレスリリース OR 発表 OR リリース OR 提携 OR 資金調達 OR 研究)')

cse_items = []
for q in queries:
    try:
        cse_items.extend(google_cse_search(q, num=5))
        time.sleep(0.2)
    except Exception:
        pass

seen = set()
cse_links = []
for it in cse_items:
    link = it.get("link")
    if link and link not in seen and link.startswith(("http://", "https://")) and not is_asset_url(link):
        seen.add(link)
        cse_links.append(link)

# De-bias: cap domains + prefer dated URLs
cse_links = cap_by_domain(cse_links, cap=4)
cse_links = prefer_dated_urls(cse_links, top_n=MAX_CSE_PAGES)
print(f"CSE recent targets: {len(cse_links)}")

# ------------------------------------------------------------
# Fetch + store raw pages (JSONL) with progress + timeouts
# ------------------------------------------------------------
RAW_PATH = ART_DIR / f"recent_raw_pages_{run_id}.jsonl"
RAW_PATH.write_text("", encoding="utf-8")

def append_jsonl(path: Path, obj: dict):
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def collect_pages(urls: list[str], source_type: str, limit: int | None = None):
    fetched = 0
    urls2 = urls[:limit] if limit else urls

    for i, u in enumerate(urls2, start=1):
        u = normalize_url(u)
        if not u or is_asset_url(u):
            continue

        print(f"[{source_type}] {i}/{len(urls2)} fetching: {u}")

        rp = rp_official if source_type == "official" else get_rp_for_url(u)
        allowed = robots_allows(rp, u)

        if not allowed:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": False,
                "text": "",
                "notes": "Skipped due to robots.txt (or conservative fallback).",
            })
            continue

        try:
            code, html = fetch_html(u)
            time.sleep(SLEEP_SEC)
            text = html_to_text(html) if html else ""
            text = text[:MAX_TEXT_CHARS_STORE]

            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": code,
                "robots_allowed": True,
                "text": text,
                "text_char_len": len(text),
            })
            fetched += 1

        except Exception as e:
            append_jsonl(RAW_PATH, {
                "url": u,
                "source_type": source_type,
                "domain": get_domain(u),
                "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
                "http_status": None,
                "robots_allowed": True,
                "text": "",
                "notes": f"Fetch error: {type(e).__name__}: {str(e)}",
            })
            print(f"  -> skip (error): {type(e).__name__}: {e}")

    print(f"Done fetching {source_type}: saved {fetched} pages (errors/blocked recorded).")

# Stage fetching (so it never "hangs" silently)
collect_pages(official_targets, "official")
collect_pages(cse_links, "cse", limit=CSE_FETCH_LIMIT_STAGE1)

print(f"✅ Recent raw pages saved: {RAW_PATH.as_posix()}")

# ------------------------------------------------------------
# Load raw pages + shortlist to LLM (favor pages that look recent)
# ------------------------------------------------------------
def load_raw_pages(path: Path) -> list[dict]:
    rows = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))
    return rows

raw_pages = load_raw_pages(RAW_PATH)

usable = [
    r for r in raw_pages
    if r.get("robots_allowed")
    and isinstance(r.get("http_status"), int) and r["http_status"] < 400
    and (r.get("text") or "").strip()
]

def looks_recent(text: str) -> bool:
    if not text:
        return False
    # weak heuristics; LLM will do actual date extraction
    return bool(re.search(r"\b(2026|2025)\b", text))

def score_page_for_recent(r):
    url = r.get("url") or ""
    text = r.get("text") or ""
    p = (urlparse(url).path or "").lower()
    s = 0
    for k in RECENT_KEYWORDS:
        if k in p:
            s += 2
    if re.search(r"/20\d{2}/\d{1,2}/", p) or re.search(r"/20\d{2}/", p):
        s += 2
    if looks_recent(text):
        s += 1
    s += min(len(text), 20000) / 8000
    if r.get("source_type") == "official":
        s += 2
    return s

usable_sorted = sorted(usable, key=score_page_for_recent, reverse=True)
shortlist = usable_sorted[:MAX_PAGES_TO_LLM]
print(f"Usable pages: {len(usable)} | Shortlist to LLM: {len(shortlist)}")

# ------------------------------------------------------------
# Robust JSON parsing helper
# ------------------------------------------------------------
def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

# ------------------------------------------------------------
# OpenAI: timeline extraction (events + content index)
# ------------------------------------------------------------
def openai_extract_recent_timeline(pages: list[dict], cutoff_date: str) -> dict:
    system = (
        "You are a research analyst assistant. "
        "From the provided webpage texts, extract a timeline of recent changes in the last few months. "
        "CRITICAL: Always produce a non-empty content_index if there are relevant pages. "
        "Prefer events on/after the cutoff date, but if date is missing, keep as month-level or unknown date with lower confidence. "
        "Do not invent dates. Use only what appears in the text. "
        "Return a single JSON object only (no markdown)."
    )

    schema = {
        "cutoff_date": "string (YYYY-MM-DD)",
        "timeline_events": [
            {
                "date": "string|null (YYYY-MM-DD if known, else YYYY-MM or null)",
                "event_type": "product|research|partnership|customer|funding|hiring|org_change|policy|other",
                "headline": "string",
                "details": "string",
                "evidence_url": "string",
                "source_type": "official|cse",
                "confidence": "0..1"
            }
        ],
        "content_index": [
            {
                "content_type": "press|blog|research|article|interview|event_page|other",
                "title": "string|null",
                "publisher_or_platform": "string|null",
                "date": "string|null",
                "url": "string",
                "what_it_contains": "string",
                "relevance": "0..1",
                "confidence": "0..1"
            }
        ],
        "notes": {"data_gaps": "array of strings", "conflicts": "array of strings"},
        "confidence": {"overall": "0..1", "rationale": "string"}
    }

    bundle = {
        "startup_name": startup_name,
        "official_website": official_home,
        "cutoff_date": cutoff_date,
        "pages": [
            {
                "url": p["url"],
                "source_type": p.get("source_type"),
                "domain": p.get("domain"),
                "text": (p.get("text") or "")[:MAX_TEXT_CHARS_PROMPT],
            } for p in pages
        ],
        "schema": schema,
        "instructions": [
            "Extract events that are likely in the last few months.",
            "If a page is older but referenced as context, keep it out of timeline_events (or mark low relevance).",
            "For content_index, include at least 10 items if available."
        ]
    }

    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
            response_format={"type": "json_object"},
        )
        return json.loads((resp.choices[0].message.content or "").strip())
    except Exception:
        resp2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
            ],
            temperature=0.0,
        )
        raw = resp2.choices[0].message.content or ""
        candidate = _extract_json_object(raw)
        if candidate:
            return json.loads(candidate)
        raise RuntimeError("Model did not return a JSON object.")

extracted = openai_extract_recent_timeline(shortlist, cutoff_date=CUTOFF_STR)

# ------------------------------------------------------------
# Save JSON + DataFrames
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"recent_timeline_extraction_{run_id}.json"
JSON_PATH.write_text(json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8")

events = extracted.get("timeline_events", []) or []
content = extracted.get("content_index", []) or []

df_events = pd.DataFrame(events)
df_content = pd.DataFrame(content)

if not df_events.empty:
    df_events.insert(0, "run_id", run_id)
if not df_content.empty:
    df_content.insert(0, "run_id", run_id)

EVENTS_CSV = ART_DIR / f"recent_timeline_{run_id}.csv"
CONTENT_CSV = ART_DIR / f"recent_content_index_{run_id}.csv"

df_events.to_csv(EVENTS_CSV, index=False)
df_content.to_csv(CONTENT_CSV, index=False)

print("✅ Recent Changes Timeline complete")
print(f"- Raw pages: {RAW_PATH.as_posix()}")
print(f"- JSON: {JSON_PATH.as_posix()}")
print(f"- Timeline CSV: {EVENTS_CSV.as_posix()}")
print(f"- Content CSV: {CONTENT_CSV.as_posix()}")

# Optional: sort events by date (best-effort)
if not df_events.empty and "date" in df_events.columns:
    df_events_sorted = df_events.copy()
    df_events_sorted["date_sort"] = df_events_sorted["date"].fillna("").astype(str)
    df_events_sorted = df_events_sorted.sort_values("date_sort", ascending=False).drop(columns=["date_sort"])
    display(df_events_sorted.head(50))
else:
    display(df_events)

display(df_content.head(30) if not df_content.empty else df_content)


Recent window: last 4 months (cutoff ~ 2025-09-10 UTC)
Official recent targets: 5
CSE recent targets: 48
[official] 1/5 fetching: https://sakana.ai/updates
[official] 2/5 fetching: https://sakana.ai/news
[official] 3/5 fetching: https://sakana.ai/press
[official] 4/5 fetching: https://sakana.ai/blog
[official] 5/5 fetching: https://sakana.ai/research
Done fetching official: saved 5 pages (errors/blocked recorded).
[cse] 1/20 fetching: https://theaiinsider.tech/2025/11/17/sakana-ai-secures-135m-series-b-at-a-2-65b-valuation-to-advance-japan-optimized-ai-models/
[cse] 2/20 fetching: https://fortune.com/2025/02/09/linkedin-cofounder-reid-hoffman-hugging-face-ceo-clemen-delangue-letter-ai-public-goods-current-ai-action-summit/
[cse] 3/20 fetching: https://www.japantimes.co.jp/business/2025/05/13/companies/sakana-ai-defense/
[cse] 4/20 fetching: https://techcrunch.com/2026/01/02/nvidias-ai-empire-a-look-at-its-top-startup-investments/
[cse] 5/20 fetching: https://www.technologyreview.com/20

Unnamed: 0,run_id,date,event_type,headline,details,evidence_url,source_type,confidence
2,20260107_225628,2025-11-21,product,Sakana AI takes crown as Japan’s most valuable...,Sakana AI has been recognized as Japan's most ...,https://ethanbholland.com/2025/11/21/sakana-ai...,cse,0.8
0,20260107_225628,2025-11-17,funding,Sakana AI raises $135M Series B at a $2.65B va...,Sakana AI has closed a ¥20 billion (approximat...,https://techcrunch.com/2025/11/17/sakana-ai-ra...,cse,0.9
1,20260107_225628,2025-11-17,partnership,Sakana AI targets defense and banking markets ...,Sakana AI is now eyeing expansion into defense...,https://www.japantimes.co.jp/business/2025/11/...,cse,0.8


Unnamed: 0,run_id,content_type,title,publisher_or_platform,date,url,what_it_contains,relevance,confidence
0,20260107_225628,article,Sakana AI raises $135M Series B at a $2.65B va...,TechCrunch,2025-11-17,https://techcrunch.com/2025/11/17/sakana-ai-ra...,Details about Sakana AI's Series B funding rou...,1.0,0.9
1,20260107_225628,article,Sakana AI targets defense and banking markets ...,Japan Times,2025-11-17,https://www.japantimes.co.jp/business/2025/11/...,Discussion on Sakana AI's expansion plans into...,1.0,0.8
2,20260107_225628,article,Sakana AI takes crown as Japan’s most valuable...,Ethan B Holland,2025-11-21,https://ethanbholland.com/2025/11/21/sakana-ai...,Recognition of Sakana AI as Japan's most valua...,1.0,0.8
3,20260107_225628,article,Sakana AI lands $135M on $2.635B valuation to ...,SiliconANGLE,2025-11-17,https://siliconangle.com/2025/11/17/sakana-ai-...,Details on the funding round and its implicati...,1.0,0.9
4,20260107_225628,article,"Sakana AI, Japan’s answer to OpenAI, in talks ...",TechStartups,2025-10-21,https://techstartups.com/2025/10/21/sakana-ai-...,Discussion on Sakana AI's funding talks and it...,0.8,0.7
5,20260107_225628,article,Sakana AI Agent Wins AtCoder Heuristic Contest...,Sakana AI Blog,2026-01-05,https://sakana.ai/blog/,Announcement of Sakana AI's agent winning a co...,0.8,0.6
6,20260107_225628,article,Sakana AI unveils the future: Fully automated ...,Times of AI,2024-08-13,https://www.timesofai.com/news/sakana-ai-unvei...,Overview of Sakana AI's AI Scientist project a...,0.7,0.5
7,20260107_225628,article,Sakana AI’s AI Scientist generates its first p...,Sakana AI,2025-03-12,https://sakana.ai/ai-scientist-first-publication/,Details on the AI Scientist's achievement in g...,0.7,0.5
8,20260107_225628,article,Japan should produce its own AI defense soluti...,Japan Times,2025-05-13,https://www.japantimes.co.jp/business/2025/05/...,Comments from Sakana AI's CEO on the need for ...,0.6,0.5
9,20260107_225628,article,Sakana AI’s AI Scientist: Towards Fully Automa...,Sakana AI,2024-08-13,https://sakana.ai/ai-scientist/,Introduction to the AI Scientist and its capab...,0.6,0.5


In [20]:
# ============================================================
# 11. Integrated Insights (Synthesis across all sections)
# ============================================================
# Goal:
# - Integrate outputs from sections #3–#10 into a single, inspection-ready insight pack:
#   - What is confidently true (facts with evidence)
#   - What is likely (high-confidence hypotheses)
#   - What is uncertain / missing (gaps + conflicting signals)
#   - Implications for the meeting (what to probe, what to validate)
#
# Approach:
# - Load latest artifacts (JSON/CSV) from prior sections (if available)
# - Build a compact "evidence bundle" to send to OpenAI
# - Ask the model to produce:
#   1) integrated_summary (1–2 paragraphs)
#   2) key_takeaways (bullets)
#   3) strengths / risks (bullets)
#   4) hypotheses_to_test (bullets)
#   5) due_diligence_priorities (ranked)
#   6) contradictions & data gaps
#   7) evidence_map (top URLs by theme)
#
# Outputs:
# - integrated_insights_<run_id>.json
# - integrated_insights_<run_id>.md (optional human-readable)
# - integrated_evidence_map_<run_id>.csv

import os
import re
import json
import glob
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")

client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Helpers: find latest artifacts
# ------------------------------------------------------------
def latest_path(pattern: str) -> Path | None:
    paths = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    return paths[0] if paths else None

def safe_read_json(path: Path | None) -> dict | None:
    if not path or not path.exists():
        return None
    return json.loads(path.read_text(encoding="utf-8"))

def safe_read_csv(path: Path | None) -> pd.DataFrame | None:
    if not path or not path.exists():
        return None
    try:
        return pd.read_csv(path)
    except Exception:
        return None

def df_head_records(df: pd.DataFrame | None, n: int = 30) -> list[dict]:
    if df is None or df.empty:
        return []
    return df.head(n).to_dict(orient="records")

def normalize_url(u: str) -> str:
    u = (u or "").strip()
    u = re.sub(r"#.*$", "", u)
    return u

# ------------------------------------------------------------
# Determine run_id + core entity
# ------------------------------------------------------------
inputs_path = latest_path("inputs_*.json")
entity_path = latest_path("entity_*.json")

inputs = safe_read_json(inputs_path) or {}
entity = safe_read_json(entity_path) or {}

run_id = (inputs.get("meta", {}) or {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
startup_name = (entity.get("canonical_name") or inputs.get("startup_name") or "Unknown Startup").strip()
official_website = (entity.get("official_website") or "").strip()

print("run_id:", run_id)
print("startup_name:", startup_name)
print("official_website:", official_website)

# ------------------------------------------------------------
# Load prior artifacts (best-effort)
# ------------------------------------------------------------
# 3) Company Basics
company_extraction = safe_read_json(latest_path("company_extraction_*.json")) or {}
df_company_claims = safe_read_csv(latest_path("company_claims_*.csv"))
df_company_content = safe_read_csv(latest_path("company_content_index_*.csv"))

# 4) Key People
people_extraction = safe_read_json(latest_path("people_extraction_*.json")) or {}
df_people_directory = safe_read_csv(latest_path("people_directory_*.csv"))
# 5) Meeting Person Deep Dive
deep_dive = safe_read_json(latest_path("meeting_person_deep_dive_*.json")) or {}
df_people_profiles = safe_read_csv(latest_path("people_profiles_*.csv"))
df_people_content = safe_read_csv(latest_path("people_content_index_*.csv"))

# 6) Business & Product
bp = safe_read_json(latest_path("business_product_extraction_*.json")) or {}
df_bp_claims = safe_read_csv(latest_path("business_product_claims_*.csv"))
df_bp_content = safe_read_csv(latest_path("business_product_content_index_*.csv"))

# 7) Customer & Market
mk = safe_read_json(latest_path("market_extraction_*.json")) or {}
df_mk_claims = safe_read_csv(latest_path("market_claims_*.csv"))
df_mk_content = safe_read_csv(latest_path("market_content_index_*.csv"))

# 8) Competitive
cp = safe_read_json(latest_path("competitive_extraction_*.json")) or {}
df_cp_competitors = safe_read_csv(latest_path("competitor_table_*.csv"))
df_cp_claims = safe_read_csv(latest_path("competitive_claims_*.csv"))
df_cp_content = safe_read_csv(latest_path("competitive_content_index_*.csv"))

# 9) Funding
fd = safe_read_json(latest_path("funding_extraction_*.json")) or {}
df_fd_events = safe_read_csv(latest_path("funding_events_*.csv"))
df_fd_claims = safe_read_csv(latest_path("funding_claims_*.csv"))
df_fd_content = safe_read_csv(latest_path("funding_content_index_*.csv"))

# 10) Recent Timeline
recent = safe_read_json(latest_path("recent_timeline_extraction_*.json")) or {}
df_recent_timeline = safe_read_csv(latest_path("recent_timeline_*.csv"))
df_recent_content = safe_read_csv(latest_path("recent_content_index_*.csv"))

# ------------------------------------------------------------
# Build an "evidence map" (top URLs grouped by theme)
# ------------------------------------------------------------
def collect_urls(df: pd.DataFrame | None, cols: list[str], limit: int = 50) -> list[str]:
    if df is None or df.empty:
        return []
    urls = []
    for c in cols:
        if c in df.columns:
            urls.extend([normalize_url(x) for x in df[c].dropna().astype(str).tolist()])
    # unique preserve order
    seen, out = set(), []
    for u in urls:
        if u and u.startswith("http") and u not in seen:
            seen.add(u)
            out.append(u)
        if len(out) >= limit:
            break
    return out

evidence_map = {
    "company_basics": collect_urls(df_company_claims, ["evidence_url"], 30) + collect_urls(df_company_content, ["url"], 30),
    "people": collect_urls(df_people_content, ["url"], 30),
    "business_product": collect_urls(df_bp_claims, ["evidence_url"], 30) + collect_urls(df_bp_content, ["url"], 30),
    "market": collect_urls(df_mk_claims, ["evidence_url"], 30) + collect_urls(df_mk_content, ["url"], 30),
    "competition": collect_urls(df_cp_claims, ["evidence_url"], 30) + collect_urls(df_cp_content, ["url"], 30),
    "funding": collect_urls(df_fd_claims, ["evidence_url"], 30) + collect_urls(df_fd_content, ["url"], 30),
    "recent_changes": collect_urls(df_recent_timeline, ["evidence_url"], 30) + collect_urls(df_recent_content, ["url"], 30),
}
# dedupe within each theme
for k, v in evidence_map.items():
    seen, out = set(), []
    for u in v:
        if u not in seen:
            seen.add(u)
            out.append(u)
    evidence_map[k] = out[:40]

df_evidence_map = pd.DataFrame(
    [{"theme": theme, "url": url} for theme, urls in evidence_map.items() for url in urls]
)

EVIDENCE_CSV = ART_DIR / f"integrated_evidence_map_{run_id}.csv"
df_evidence_map.to_csv(EVIDENCE_CSV, index=False)
print("✅ Evidence map saved:", EVIDENCE_CSV.as_posix())

# ------------------------------------------------------------
# Build the LLM bundle (keep compact to avoid token blowups)
# ------------------------------------------------------------
bundle = {
    "startup_name": startup_name,
    "official_website": official_website,
    "inputs": {
        "meeting_person_name": (inputs.get("meeting_person_name") if isinstance(inputs, dict) else None),
        "meeting_context": (inputs.get("meeting_context") if isinstance(inputs, dict) else None),
        "your_org_context": (inputs.get("your_org_context") if isinstance(inputs, dict) else None),
    },
    "section_snapshots": {
        "company_basics": {
            "summary": company_extraction.get("company_summary") if isinstance(company_extraction, dict) else None,
            "claims_head": df_head_records(df_company_claims, 25),
        },
        "people": {
            "meeting_deep_dive": deep_dive.get("meeting_person_insights") if isinstance(deep_dive, dict) else None,
            "profiles_head": df_head_records(df_people_profiles, 12),
            "content_head": df_head_records(df_people_content, 15),
        },
        "business_product": {
            "summary": (bp.get("business_product_summary") if isinstance(bp, dict) else None),
            "claims_head": df_head_records(df_bp_claims, 25),
        },
        "market": {
            "summary": (mk.get("market_summary") if isinstance(mk, dict) else None),
            "claims_head": df_head_records(df_mk_claims, 25),
        },
        "competition": {
            "summary": (cp.get("competitive_summary") if isinstance(cp, dict) else None),
            "competitors_head": df_head_records(df_cp_competitors, 20),
            "claims_head": df_head_records(df_cp_claims, 20),
        },
        "funding": {
            "summary": (fd.get("funding_summary") if isinstance(fd, dict) else None),
            "events_head": df_head_records(df_fd_events, 15),
            "claims_head": df_head_records(df_fd_claims, 20),
        },
        "recent_changes": {
            "cutoff_date": recent.get("cutoff_date") if isinstance(recent, dict) else None,
            "timeline_head": df_head_records(df_recent_timeline, 25),
        },
    },
    "evidence_map": evidence_map,
}

# ------------------------------------------------------------
# OpenAI: Integrated synthesis (facts vs hypotheses)
# ------------------------------------------------------------
def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

def openai_integrated_insights(bundle: dict) -> dict:
    system = (
        "You are a diligence analyst assistant. "
        "Synthesize the provided multi-section evidence bundle into integrated insights for a startup meeting.\n\n"
        "CRITICAL RULES:\n"
        "- Separate facts (with URLs) from hypotheses.\n"
        "- Never invent details (no made-up customers, funding numbers, or competitive claims).\n"
        "- If evidence is thin or contradictory, call it out.\n"
        "- Prefer concise, decision-useful writing.\n"
        "Return a single JSON object only (no markdown)."
    )

    schema = {
        "integrated_summary": "string (1-2 short paragraphs)",
        "key_takeaways": ["array of strings (facts-first)"],
        "strengths": ["array of strings"],
        "risks_watchouts": ["array of strings"],
        "hypotheses_to_test": ["array of strings"],
        "due_diligence_priorities": [
            {"priority": "1..10", "item": "string", "why": "string", "evidence_urls": "array of strings"}
        ],
        "contradictions": [
            {"topic": "string", "conflicting_points": "array of strings", "evidence_urls": "array of strings"}
        ],
        "data_gaps": [
            {"topic": "string", "what_is_missing": "string", "how_to_verify": "string"}
        ],
        "meeting_implications": {
            "what_to_probe": ["array of strings"],
            "what_to_validate_fast": ["array of strings"],
            "what_to_defer": ["array of strings"]
        },
        "evidence_map_top": [
            {"theme": "string", "top_urls": "array of strings", "why_these": "string"}
        ],
        "confidence": {"overall": "0..1", "rationale": "string"}
    }

    payload = {"schema": schema, "bundle": bundle}

    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
            ],
            temperature=0.2,
            response_format={"type": "json_object"},
        )
        return json.loads((resp.choices[0].message.content or "").strip())
    except Exception:
        resp2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
            ],
            temperature=0.2,
        )
        raw = resp2.choices[0].message.content or ""
        candidate = _extract_json_object(raw)
        if candidate:
            return json.loads(candidate)
        raise RuntimeError("Model did not return a JSON object.")

integrated = openai_integrated_insights(bundle)

# ------------------------------------------------------------
# Save outputs
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"integrated_insights_{run_id}.json"
JSON_PATH.write_text(json.dumps(integrated, ensure_ascii=False, indent=2), encoding="utf-8")
print("✅ Integrated insights JSON saved:", JSON_PATH.as_posix())

# Optional: human-readable markdown
def to_md(x: dict) -> str:
    lines = []
    lines.append(f"# Integrated Insights: {startup_name}")
    lines.append("")
    lines.append("## Integrated Summary")
    lines.append(x.get("integrated_summary", ""))
    lines.append("")
    lines.append("## Key Takeaways")
    for b in x.get("key_takeaways", [])[:12]:
        lines.append(f"- {b}")
    lines.append("")
    lines.append("## Strengths")
    for b in x.get("strengths", [])[:12]:
        lines.append(f"- {b}")
    lines.append("")
    lines.append("## Risks / Watchouts")
    for b in x.get("risks_watchouts", [])[:12]:
        lines.append(f"- {b}")
    lines.append("")
    lines.append("## Hypotheses to Test")
    for b in x.get("hypotheses_to_test", [])[:12]:
        lines.append(f"- {b}")
    lines.append("")
    lines.append("## Due Diligence Priorities")
    for r in x.get("due_diligence_priorities", [])[:10]:
        lines.append(f"- **{r.get('priority')}** {r.get('item')} — {r.get('why')}")
        urls = r.get("evidence_urls") or []
        if urls:
            for u in urls[:3]:
                lines.append(f"  - {u}")
    lines.append("")
    lines.append("## Contradictions")
    for c in x.get("contradictions", [])[:10]:
        lines.append(f"- **{c.get('topic')}**")
        for p in c.get("conflicting_points", [])[:4]:
            lines.append(f"  - {p}")
    lines.append("")
    lines.append("## Data Gaps")
    for g in x.get("data_gaps", [])[:12]:
        lines.append(f"- **{g.get('topic')}**: {g.get('what_is_missing')}")
        lines.append(f"  - Verify: {g.get('how_to_verify')}")
    lines.append("")
    return "\n".join(lines)

MD_PATH = ART_DIR / f"integrated_insights_{run_id}.md"
MD_PATH.write_text(to_md(integrated), encoding="utf-8")
print("✅ Integrated insights MD saved:", MD_PATH.as_posix())

# ------------------------------------------------------------
# Display
# ------------------------------------------------------------
display(pd.DataFrame([{
    "startup_name": startup_name,
    "confidence_overall": (integrated.get("confidence", {}) or {}).get("overall"),
    "summary": integrated.get("integrated_summary", "")[:240] + ("..." if len(integrated.get("integrated_summary", "")) > 240 else "")
}]))

print("\nTop takeaways:")
for b in integrated.get("key_takeaways", [])[:8]:
    print("-", b)

print("\nTop diligence priorities:")
for r in integrated.get("due_diligence_priorities", [])[:6]:
    print(f"- {r.get('priority')}. {r.get('item')}")


run_id: 20260107_225628
startup_name: Sakana AI
official_website: https://sakana.ai/series-a
✅ Evidence map saved: artifacts/meeting_deep_dive/integrated_evidence_map_20260107_225628.csv
✅ Integrated insights JSON saved: artifacts/meeting_deep_dive/integrated_insights_20260107_225628.json
✅ Integrated insights MD saved: artifacts/meeting_deep_dive/integrated_insights_20260107_225628.md


Unnamed: 0,startup_name,confidence_overall,summary
0,Sakana AI,0.85,Sakana AI is a Tokyo-based startup focused on ...



Top takeaways:
- Sakana AI raised approximately $200M in its Series A funding round (https://sakana.ai/series-a/).
- Sakana AI raised $135M in its Series B funding round at a valuation of $2.635 billion (https://siliconangle.com/2025/11/17/sakana-ai-lands-135m-2-635b-valuation-accelerate-frontier-research-applied-ai-japan/).
- The company has developed the AI Scientist, capable of fully automated scientific discovery (https://sakana.ai/ai-scientist/).
- Sakana AI's technology is being applied in the banking sector to automate decision-making processes (https://sakana.ai/mufg-bank/).

Top diligence priorities:
- 1. Validate the performance and adoption of the AI Scientist in real-world applications.
- 2. Assess the competitive landscape and Sakana AI's differentiation strategy.


In [1]:
# ============================================================
# 12. Meeting Context Framing (Fixed: save the MODEL OUTPUT, not the payload)
# ============================================================

import os
import re
import json
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")
client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def latest_path(pattern: str) -> Path | None:
    paths = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    return paths[0] if paths else None

def safe_read_json(path: Path | None) -> dict:
    if not path or not path.exists():
        return {}
    return json.loads(path.read_text(encoding="utf-8"))

def safe_read_text(path: Path | None) -> str:
    if not path or not path.exists():
        return ""
    return path.read_text(encoding="utf-8")

def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

def validate_framing(obj: dict):
    """
    Fail fast if we accidentally saved payload, or model returned wrong structure.
    """
    if not isinstance(obj, dict):
        raise ValueError("Framing output is not a dict.")
    if "meeting_frame" not in obj or "narrative" not in obj or "suggested_agenda" not in obj:
        raise ValueError(
            "Framing output missing required keys. "
            "Likely you saved payload (schema/bundle) instead of model output, "
            "or the model returned an unexpected structure."
        )
    mf = obj.get("meeting_frame") or {}
    nar = obj.get("narrative") or {}
    if not (mf.get("meeting_objective") or "").strip():
        raise ValueError("meeting_frame.meeting_objective is empty.")
    if not (nar.get("thirty_second") or "").strip():
        raise ValueError("narrative.thirty_second is empty.")
    if len(obj.get("suggested_agenda") or []) < 3:
        raise ValueError("suggested_agenda has < 3 items.")

# ------------------------------------------------------------
# Load inputs / entity / integrated insights
# ------------------------------------------------------------
inputs = safe_read_json(latest_path("inputs_*.json"))
entity = safe_read_json(latest_path("entity_*.json"))

run_id = (inputs.get("meta", {}) or {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
startup_name = (entity.get("canonical_name") or inputs.get("startup_name") or "Unknown Startup").strip()

meeting_person_name = inputs.get("meeting_person_name")
meeting_person_title = inputs.get("meeting_person_title")
meeting_context = inputs.get("meeting_context") or ""
your_org_context = inputs.get("your_org_context") or ""

integrated = safe_read_json(latest_path("integrated_insights_*.json"))
integrated_md = safe_read_text(latest_path("integrated_insights_*.md"))

# Evidence map -> top URLs
top_urls = []
ev_path = latest_path("integrated_evidence_map_*.csv")
if ev_path and ev_path.exists():
    try:
        df_ev = pd.read_csv(ev_path)
        if "url" in df_ev.columns:
            top_urls = df_ev["url"].dropna().astype(str).head(15).tolist()
    except Exception:
        pass

bundle = {
    "startup_name": startup_name,
    "meeting_person": {"name": meeting_person_name, "title": meeting_person_title},
    "meeting_context": meeting_context,
    "your_org_context": your_org_context,
    "integrated_insights": integrated,
    "integrated_insights_md_excerpt": integrated_md[:6000],
    "top_pre_read_urls": top_urls
}

print("run_id:", run_id)
print("startup_name:", startup_name)
print("integrated_summary_len:", len((integrated.get("integrated_summary") or "")))
print("takeaways_len:", len(integrated.get("key_takeaways", []) or []))
print("dd_priorities_len:", len(integrated.get("due_diligence_priorities", []) or []))
print("top_pre_read_urls:", len(top_urls))

# ------------------------------------------------------------
# OpenAI: Meeting framing
# ------------------------------------------------------------
def openai_meeting_context_framing(bundle: dict, retry_mode: bool = False) -> dict:
    system = (
        "You are a venture diligence assistant. Create a meeting-ready context framing using the provided insights.\n\n"
        "NON-NEGOTIABLE RULES:\n"
        "- Do NOT return empty strings. If unknown, write 'UNKNOWN' and list assumptions.\n"
        "- Always produce: meeting_objective, 30-sec narrative, and at least 5 agenda items.\n"
        "- Separate facts vs hypotheses.\n"
        "- Do not invent numbers/customers/investors.\n"
        "Return a single JSON object only (no markdown)."
    )
    if retry_mode:
        system += (
            "\n\nRETRY MODE: previous output was invalid/empty. "
            "You MUST fill every field with useful content. "
            "If context is missing, assume a generic first meeting and label assumptions."
        )

    schema = {
        "meeting_frame": {
            "meeting_objective": "string",
            "why_now": "string",
            "success_criteria": ["array of strings (>=3)"],
            "key_themes": ["array of strings (>=4)"],
            "facts_to_anchor": ["array of strings (>=4)"],
            "hypotheses_to_validate": ["array of strings (>=4)"],
            "risks_to_probe": ["array of strings (>=4)"]
        },
        "narrative": {
            "thirty_second": "string",
            "two_minute": "string",
            "one_slide_version": ["array of bullet strings (>=5)"]
        },
        "suggested_agenda": [
            {
                "minutes": "integer",
                "topic": "string",
                "what_to_ask": ["array of strings (>=3)"],
                "what_to_listen_for": ["array of strings (>=2)"]
            }
        ],
        "role_plan": [
            {"role": "string", "focus": "string", "top_questions": ["array of strings (>=3)"]}
        ],
        "pre_reads": [
            {"title": "string|null", "url": "string", "why_read": "string"}
        ],
        "notes": {"assumptions": ["array of strings"], "data_gaps": ["array of strings"]},
        "confidence": {"overall": "0..1", "rationale": "string"}
    }

    payload = {"schema": schema, "bundle": bundle}

    # NOTE: We RETURN the model output, NOT the payload.
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
            ],
            temperature=0.3,
            response_format={"type": "json_object"},
        )
        return json.loads((resp.choices[0].message.content or "").strip())
    except Exception:
        resp2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
            ],
            temperature=0.3,
        )
        raw = resp2.choices[0].message.content or ""
        candidate = _extract_json_object(raw)
        if candidate:
            return json.loads(candidate)
        raise RuntimeError("Model did not return a JSON object.")

# 1st try
framing = openai_meeting_context_framing(bundle, retry_mode=False)

# Validate; if invalid, retry once with stronger instruction
try:
    validate_framing(framing)
except Exception as e:
    print("⚠️ Invalid framing output:", e)
    print("Retrying once with stronger constraints...")
    framing = openai_meeting_context_framing(bundle, retry_mode=True)
    validate_framing(framing)

# ------------------------------------------------------------
# Save outputs (MODEL OUTPUT)
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"meeting_context_framing_{run_id}.json"
JSON_PATH.write_text(json.dumps(framing, ensure_ascii=False, indent=2), encoding="utf-8")
print("✅ Meeting context framing JSON saved:", JSON_PATH.as_posix())

# Optional MD
def to_md(x: dict) -> str:
    mf = x.get("meeting_frame", {}) or {}
    nar = x.get("narrative", {}) or {}
    agenda = x.get("suggested_agenda", []) or []
    roles = x.get("role_plan", []) or []
    pre = x.get("pre_reads", []) or []
    notes = x.get("notes", {}) or {}

    lines = []
    lines.append(f"# Meeting Context Framing: {startup_name}\n")
    lines.append("## Objective")
    lines.append(mf.get("meeting_objective", "UNKNOWN"))
    lines.append("\n## Talk Track (30s)")
    lines.append(nar.get("thirty_second", "UNKNOWN"))
    lines.append("\n## Talk Track (2m)")
    lines.append(nar.get("two_minute", "UNKNOWN"))

    lines.append("\n## Agenda")
    for a in agenda:
        lines.append(f"- **{a.get('minutes', 0)} min** — {a.get('topic','')}")
        for q in (a.get("what_to_ask") or [])[:4]:
            lines.append(f"  - Ask: {q}")
        for l in (a.get("what_to_listen_for") or [])[:3]:
            lines.append(f"  - Listen for: {l}")

    lines.append("\n## Role Plan")
    for r in roles:
        lines.append(f"- **{r.get('role','')}**: {r.get('focus','')}")
        for q in (r.get("top_questions") or [])[:3]:
            lines.append(f"  - {q}")

    lines.append("\n## Pre-Reads")
    for p in pre[:12]:
        lines.append(f"- {p.get('title') or 'Pre-read'}")
        lines.append(f"  - {p.get('url')}")
        lines.append(f"  - Why: {p.get('why_read')}")

    if notes.get("assumptions"):
        lines.append("\n## Assumptions")
        for a in notes.get("assumptions", [])[:10]:
            lines.append(f"- {a}")

    if notes.get("data_gaps"):
        lines.append("\n## Data Gaps")
        for g in notes.get("data_gaps", [])[:12]:
            lines.append(f"- {g}")

    return "\n".join(lines)

MD_PATH = ART_DIR / f"meeting_context_framing_{run_id}.md"
MD_PATH.write_text(to_md(framing), encoding="utf-8")
print("✅ Meeting context framing MD saved:", MD_PATH.as_posix())

# Quick display
mf = framing.get("meeting_frame", {}) or {}
print("\nobjective:", mf.get("meeting_objective"))
print("30-sec:", (framing.get("narrative", {}) or {}).get("thirty_second"))
print("agenda items:", len(framing.get("suggested_agenda", []) or []))


run_id: 20260107_225628
startup_name: Sakana AI
integrated_summary_len: 364
takeaways_len: 4
dd_priorities_len: 2
top_pre_read_urls: 15
⚠️ Invalid framing output: Framing output missing required keys. Likely you saved payload (schema/bundle) instead of model output, or the model returned an unexpected structure.
Retrying once with stronger constraints...
✅ Meeting context framing JSON saved: artifacts/meeting_deep_dive/meeting_context_framing_20260107_225628.json
✅ Meeting context framing MD saved: artifacts/meeting_deep_dive/meeting_context_framing_20260107_225628.md

objective: Evaluate the potential of Sakana AI for investment and strategic partnership opportunities.
30-sec: Sakana AI, a leading Tokyo-based startup, is pioneering nature-inspired AI technologies with significant funding backing. With its innovative AI Scientist and applications in sectors like banking, it represents a compelling investment opportunity. This meeting aims to evaluate its market potential and strategic 

In [2]:
# ============================================================
# 12. Meeting Context Framing (Fixed: save the MODEL OUTPUT, not the payload)
# ============================================================

import os
import re
import json
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")
client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def latest_path(pattern: str) -> Path | None:
    paths = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    return paths[0] if paths else None

def safe_read_json(path: Path | None) -> dict:
    if not path or not path.exists():
        return {}
    return json.loads(path.read_text(encoding="utf-8"))

def safe_read_text(path: Path | None) -> str:
    if not path or not path.exists():
        return ""
    return path.read_text(encoding="utf-8")

def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

def validate_framing(obj: dict):
    """
    Fail fast if we accidentally saved payload, or model returned wrong structure.
    """
    if not isinstance(obj, dict):
        raise ValueError("Framing output is not a dict.")
    if "meeting_frame" not in obj or "narrative" not in obj or "suggested_agenda" not in obj:
        raise ValueError(
            "Framing output missing required keys. "
            "Likely you saved payload (schema/bundle) instead of model output, "
            "or the model returned an unexpected structure."
        )
    mf = obj.get("meeting_frame") or {}
    nar = obj.get("narrative") or {}
    if not (mf.get("meeting_objective") or "").strip():
        raise ValueError("meeting_frame.meeting_objective is empty.")
    if not (nar.get("thirty_second") or "").strip():
        raise ValueError("narrative.thirty_second is empty.")
    if len(obj.get("suggested_agenda") or []) < 3:
        raise ValueError("suggested_agenda has < 3 items.")

# ------------------------------------------------------------
# Load inputs / entity / integrated insights
# ------------------------------------------------------------
inputs = safe_read_json(latest_path("inputs_*.json"))
entity = safe_read_json(latest_path("entity_*.json"))

run_id = (inputs.get("meta", {}) or {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
startup_name = (entity.get("canonical_name") or inputs.get("startup_name") or "Unknown Startup").strip()

meeting_person_name = inputs.get("meeting_person_name")
meeting_person_title = inputs.get("meeting_person_title")
meeting_context = inputs.get("meeting_context") or ""
your_org_context = inputs.get("your_org_context") or ""

integrated = safe_read_json(latest_path("integrated_insights_*.json"))
integrated_md = safe_read_text(latest_path("integrated_insights_*.md"))

# Evidence map -> top URLs
top_urls = []
ev_path = latest_path("integrated_evidence_map_*.csv")
if ev_path and ev_path.exists():
    try:
        df_ev = pd.read_csv(ev_path)
        if "url" in df_ev.columns:
            top_urls = df_ev["url"].dropna().astype(str).head(15).tolist()
    except Exception:
        pass

bundle = {
    "startup_name": startup_name,
    "meeting_person": {"name": meeting_person_name, "title": meeting_person_title},
    "meeting_context": meeting_context,
    "your_org_context": your_org_context,
    "integrated_insights": integrated,
    "integrated_insights_md_excerpt": integrated_md[:6000],
    "top_pre_read_urls": top_urls
}

print("run_id:", run_id)
print("startup_name:", startup_name)
print("integrated_summary_len:", len((integrated.get("integrated_summary") or "")))
print("takeaways_len:", len(integrated.get("key_takeaways", []) or []))
print("dd_priorities_len:", len(integrated.get("due_diligence_priorities", []) or []))
print("top_pre_read_urls:", len(top_urls))

# ------------------------------------------------------------
# OpenAI: Meeting framing
# ------------------------------------------------------------
def openai_meeting_context_framing(bundle: dict, retry_mode: bool = False) -> dict:
    system = (
        "You are a venture diligence assistant. Create a meeting-ready context framing using the provided insights.\n\n"
        "NON-NEGOTIABLE RULES:\n"
        "- Do NOT return empty strings. If unknown, write 'UNKNOWN' and list assumptions.\n"
        "- Always produce: meeting_objective, 30-sec narrative, and at least 5 agenda items.\n"
        "- Separate facts vs hypotheses.\n"
        "- Do not invent numbers/customers/investors.\n"
        "Return a single JSON object only (no markdown)."
    )
    if retry_mode:
        system += (
            "\n\nRETRY MODE: previous output was invalid/empty. "
            "You MUST fill every field with useful content. "
            "If context is missing, assume a generic first meeting and label assumptions."
        )

    schema = {
        "meeting_frame": {
            "meeting_objective": "string",
            "why_now": "string",
            "success_criteria": ["array of strings (>=3)"],
            "key_themes": ["array of strings (>=4)"],
            "facts_to_anchor": ["array of strings (>=4)"],
            "hypotheses_to_validate": ["array of strings (>=4)"],
            "risks_to_probe": ["array of strings (>=4)"]
        },
        "narrative": {
            "thirty_second": "string",
            "two_minute": "string",
            "one_slide_version": ["array of bullet strings (>=5)"]
        },
        "suggested_agenda": [
            {
                "minutes": "integer",
                "topic": "string",
                "what_to_ask": ["array of strings (>=3)"],
                "what_to_listen_for": ["array of strings (>=2)"]
            }
        ],
        "role_plan": [
            {"role": "string", "focus": "string", "top_questions": ["array of strings (>=3)"]}
        ],
        "pre_reads": [
            {"title": "string|null", "url": "string", "why_read": "string"}
        ],
        "notes": {"assumptions": ["array of strings"], "data_gaps": ["array of strings"]},
        "confidence": {"overall": "0..1", "rationale": "string"}
    }

    payload = {"schema": schema, "bundle": bundle}

    # NOTE: We RETURN the model output, NOT the payload.
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
            ],
            temperature=0.3,
            response_format={"type": "json_object"},
        )
        return json.loads((resp.choices[0].message.content or "").strip())
    except Exception:
        resp2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
            ],
            temperature=0.3,
        )
        raw = resp2.choices[0].message.content or ""
        candidate = _extract_json_object(raw)
        if candidate:
            return json.loads(candidate)
        raise RuntimeError("Model did not return a JSON object.")

# 1st try
framing = openai_meeting_context_framing(bundle, retry_mode=False)

# Validate; if invalid, retry once with stronger instruction
try:
    validate_framing(framing)
except Exception as e:
    print("⚠️ Invalid framing output:", e)
    print("Retrying once with stronger constraints...")
    framing = openai_meeting_context_framing(bundle, retry_mode=True)
    validate_framing(framing)

# ------------------------------------------------------------
# Save outputs (MODEL OUTPUT)
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"meeting_context_framing_{run_id}.json"
JSON_PATH.write_text(json.dumps(framing, ensure_ascii=False, indent=2), encoding="utf-8")
print("✅ Meeting context framing JSON saved:", JSON_PATH.as_posix())

# Optional MD
def to_md(x: dict) -> str:
    mf = x.get("meeting_frame", {}) or {}
    nar = x.get("narrative", {}) or {}
    agenda = x.get("suggested_agenda", []) or []
    roles = x.get("role_plan", []) or []
    pre = x.get("pre_reads", []) or []
    notes = x.get("notes", {}) or {}

    lines = []
    lines.append(f"# Meeting Context Framing: {startup_name}\n")
    lines.append("## Objective")
    lines.append(mf.get("meeting_objective", "UNKNOWN"))
    lines.append("\n## Talk Track (30s)")
    lines.append(nar.get("thirty_second", "UNKNOWN"))
    lines.append("\n## Talk Track (2m)")
    lines.append(nar.get("two_minute", "UNKNOWN"))

    lines.append("\n## Agenda")
    for a in agenda:
        lines.append(f"- **{a.get('minutes', 0)} min** — {a.get('topic','')}")
        for q in (a.get("what_to_ask") or [])[:4]:
            lines.append(f"  - Ask: {q}")
        for l in (a.get("what_to_listen_for") or [])[:3]:
            lines.append(f"  - Listen for: {l}")

    lines.append("\n## Role Plan")
    for r in roles:
        lines.append(f"- **{r.get('role','')}**: {r.get('focus','')}")
        for q in (r.get("top_questions") or [])[:3]:
            lines.append(f"  - {q}")

    lines.append("\n## Pre-Reads")
    for p in pre[:12]:
        lines.append(f"- {p.get('title') or 'Pre-read'}")
        lines.append(f"  - {p.get('url')}")
        lines.append(f"  - Why: {p.get('why_read')}")

    if notes.get("assumptions"):
        lines.append("\n## Assumptions")
        for a in notes.get("assumptions", [])[:10]:
            lines.append(f"- {a}")

    if notes.get("data_gaps"):
        lines.append("\n## Data Gaps")
        for g in notes.get("data_gaps", [])[:12]:
            lines.append(f"- {g}")

    return "\n".join(lines)

MD_PATH = ART_DIR / f"meeting_context_framing_{run_id}.md"
MD_PATH.write_text(to_md(framing), encoding="utf-8")
print("✅ Meeting context framing MD saved:", MD_PATH.as_posix())

# Quick display
mf = framing.get("meeting_frame", {}) or {}
print("\nobjective:", mf.get("meeting_objective"))
print("30-sec:", (framing.get("narrative", {}) or {}).get("thirty_second"))
print("agenda items:", len(framing.get("suggested_agenda", []) or []))


run_id: 20260107_225628
startup_name: Sakana AI
integrated_summary_len: 364
takeaways_len: 4
dd_priorities_len: 2
top_pre_read_urls: 15
⚠️ Invalid framing output: Framing output missing required keys. Likely you saved payload (schema/bundle) instead of model output, or the model returned an unexpected structure.
Retrying once with stronger constraints...
✅ Meeting context framing JSON saved: artifacts/meeting_deep_dive/meeting_context_framing_20260107_225628.json
✅ Meeting context framing MD saved: artifacts/meeting_deep_dive/meeting_context_framing_20260107_225628.md

objective: Evaluate the potential of Sakana AI for investment and partnership opportunities.
30-sec: Sakana AI is a Tokyo-based startup revolutionizing AI with nature-inspired technologies, having raised significant funding and developed innovative solutions like the AI Scientist. This meeting aims to evaluate its market potential and investment opportunities.
agenda items: 5


In [5]:
# ============================================================
# 13. Top 5 Meeting Questions (Robust + Evidence-linked)
# ============================================================
# Fixes:
# - Do NOT send {"schema":..., "bundle":...} as user JSON (model may echo it back)
# - Send only the bundle, with a strict system schema description
# - Save raw model response for debugging
# - Retry cascade: gpt-4o-mini -> (stronger prompt) -> gpt-4o
# - Key salvage: accept alternative keys like "questions" and normalize to "top_questions"
#
# Outputs:
# - top_meeting_questions_<run_id>.json
# - top_meeting_questions_<run_id>.md
# - top_meeting_questions_<run_id>.csv
# - top_meeting_questions_raw_<run_id>.txt (debug)

import os
import re
import json
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")
client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def latest_path(pattern: str) -> Path | None:
    paths = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    return paths[0] if paths else None

def safe_read_json(path: Path | None) -> dict:
    if not path or not path.exists():
        return {}
    return json.loads(path.read_text(encoding="utf-8"))

def safe_read_text(path: Path | None) -> str:
    if not path or not path.exists():
        return ""
    return path.read_text(encoding="utf-8")

def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

def normalize_questions_obj(obj: dict) -> dict:
    """
    Normalize common variants into {"top_questions": [...], "notes": {...}, "confidence": {...}}
    """
    if not isinstance(obj, dict):
        return {}

    # If model echoed input
    if set(obj.keys()) == {"schema", "bundle"}:
        return {}  # treat as invalid

    if "top_questions" in obj and isinstance(obj["top_questions"], list):
        return obj

    # Salvage alternative keys
    for k in ["questions", "meeting_questions", "top5_questions", "top_5_questions"]:
        if k in obj and isinstance(obj[k], list):
            obj["top_questions"] = obj[k]
            break

    # Sometimes nested
    if "result" in obj and isinstance(obj["result"], dict):
        inner = obj["result"]
        if "top_questions" in inner and isinstance(inner["top_questions"], list):
            return inner
        for k in ["questions", "meeting_questions"]:
            if k in inner and isinstance(inner[k], list):
                inner["top_questions"] = inner[k]
                return inner

    return obj if "top_questions" in obj else {}

def validate_questions(obj: dict):
    if "top_questions" not in obj:
        raise ValueError("Missing top_questions in output.")
    qs = obj.get("top_questions") or []
    if len(qs) != 5:
        raise ValueError(f"Expected exactly 5 questions, got {len(qs)}")
    for i, q in enumerate(qs, start=1):
        if not (q.get("question") or "").strip():
            raise ValueError(f"Question {i} missing question text.")
        if not (q.get("why_it_matters") or "").strip():
            raise ValueError(f"Question {i} missing why_it_matters.")
        if not (q.get("strong_answer_signals") or []):
            raise ValueError(f"Question {i} missing strong_answer_signals.")
        if not (q.get("red_flags") or []):
            raise ValueError(f"Question {i} missing red_flags.")
        ev = q.get("evidence_urls") or []
        if not isinstance(ev, list) or len(ev) < 1:
            raise ValueError(f"Question {i} missing evidence_urls (need >=1).")

# ------------------------------------------------------------
# Load inputs / integrated / framing / evidence
# ------------------------------------------------------------
inputs = safe_read_json(latest_path("inputs_*.json"))
entity = safe_read_json(latest_path("entity_*.json"))
integrated = safe_read_json(latest_path("integrated_insights_*.json"))
framing = safe_read_json(latest_path("meeting_context_framing_*.json"))

integrated_md = safe_read_text(latest_path("integrated_insights_*.md"))
framing_md = safe_read_text(latest_path("meeting_context_framing_*.md"))

run_id = (inputs.get("meta", {}) or {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
startup_name = (entity.get("canonical_name") or inputs.get("startup_name") or "Unknown Startup").strip()

# Evidence URLs
top_urls = []
ev_path = latest_path("integrated_evidence_map_*.csv")
if ev_path and ev_path.exists():
    try:
        df_ev = pd.read_csv(ev_path)
        if "url" in df_ev.columns:
            top_urls = df_ev["url"].dropna().astype(str).head(25).tolist()
    except Exception:
        pass

bundle = {
    "startup_name": startup_name,
    "meeting_person": {"name": inputs.get("meeting_person_name"), "title": inputs.get("meeting_person_title")},
    "meeting_context": inputs.get("meeting_context"),
    "your_org_context": inputs.get("your_org_context"),
    "integrated_insights_json": integrated,
    "meeting_context_framing_json": framing,
    "integrated_md_excerpt": integrated_md[:5000],
    "framing_md_excerpt": framing_md[:3500],
    "allowed_evidence_urls": top_urls[:25],   # IMPORTANT: model must only use these URLs
}

print("run_id:", run_id)
print("startup_name:", startup_name)
print("evidence_urls:", len(top_urls))

# ------------------------------------------------------------
# OpenAI: Top 5 questions
# ------------------------------------------------------------
SYSTEM_BASE = """
You are a venture diligence assistant.

TASK:
Generate the Top 5 meeting questions for an initial meeting, using the provided bundle (integrated insights + meeting framing).

OUTPUT FORMAT (STRICT JSON):
Return a single JSON object with EXACTLY these top-level keys:
{
  "top_questions": [ ... exactly 5 items ... ],
  "notes": { "assumptions": [...], "open_points": [...] },
  "confidence": { "overall": 0..1, "rationale": "..." }
}

Each item in "top_questions" MUST have:
{
  "rank": 1..5,
  "theme": "product|market|gtm|moat|team|funding|other",
  "question": "...",
  "why_it_matters": "...",
  "what_a_strong_answer_looks_like": "...",
  "strong_answer_signals": ["...", "...", "...", "..."],
  "red_flags": ["...", "...", "...", "..."],
  "suggested_followups": ["...", "...", "..."],
  "evidence_urls": ["<URL from allowed_evidence_urls>", "..."]
}

NON-NEGOTIABLE RULES:
- Produce exactly 5 questions ranked 1..5.
- DO NOT invent facts or URLs.
- evidence_urls must be chosen ONLY from bundle.allowed_evidence_urls.
- Do not include any keys other than: top_questions, notes, confidence.
- Avoid vague questions; each should be askable in the room.
""".strip()

SYSTEM_STRONG = (SYSTEM_BASE + """

RETRY MODE:
- Your previous output was invalid.
- You MUST comply with the exact JSON schema above.
- If information is missing, write assumptions in notes.assumptions instead of leaving blanks.
""").strip()

def call_model(bundle: dict, model: str, strong: bool = False) -> tuple[dict, str]:
    system = SYSTEM_STRONG if strong else SYSTEM_BASE
    user = json.dumps(bundle, ensure_ascii=False)

    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "system", "content": system},
                      {"role": "user", "content": user}],
            temperature=0.15,
            response_format={"type": "json_object"},
        )
        raw = (resp.choices[0].message.content or "").strip()
        out = json.loads(raw) if raw else {}
        return out, raw
    except Exception:
        # fallback: best-effort extraction
        resp2 = client.chat.completions.create(
            model=model,
            messages=[{"role": "system", "content": system},
                      {"role": "user", "content": user}],
            temperature=0.15,
        )
        raw = (resp2.choices[0].message.content or "").strip()
        candidate = _extract_json_object(raw)
        out = json.loads(candidate) if candidate else {}
        return out, raw

RAW_TXT = ART_DIR / f"top_meeting_questions_raw_{run_id}.txt"

# Try 1: mini
out, raw = call_model(bundle, model="gpt-4o-mini", strong=False)
RAW_TXT.write_text(raw, encoding="utf-8")
out = normalize_questions_obj(out)

# Try 2: mini (strong)
if not out:
    print("⚠️ Invalid output (no top_questions) -> retry with stronger prompt...")
    out2, raw2 = call_model(bundle, model="gpt-4o-mini", strong=True)
    RAW_TXT.write_text(raw2, encoding="utf-8")
    out = normalize_questions_obj(out2)

# Try 3: gpt-4o (strong) if still invalid
if not out:
    print("⚠️ Still invalid -> retry with gpt-4o (strong)...")
    out3, raw3 = call_model(bundle, model="gpt-4o", strong=True)
    RAW_TXT.write_text(raw3, encoding="utf-8")
    out = normalize_questions_obj(out3)

# Final validation
validate_questions(out)

# ------------------------------------------------------------
# Save outputs
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"top_meeting_questions_{run_id}.json"
JSON_PATH.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
print("✅ Top meeting questions JSON saved:", JSON_PATH.as_posix())
print("✅ Raw model response saved:", RAW_TXT.as_posix())

# Markdown
def to_md(x: dict) -> str:
    qs = x.get("top_questions") or []
    notes = x.get("notes", {}) or {}
    lines = []
    lines.append(f"# Top 5 Meeting Questions: {startup_name}\n")
    for q in sorted(qs, key=lambda z: int(z.get("rank", 999))):
        lines.append(f"## {q.get('rank')}. {q.get('theme')}")
        lines.append(f"**Q:** {q.get('question')}\n")
        lines.append(f"**Why it matters:** {q.get('why_it_matters')}\n")
        lines.append(f"**What a strong answer looks like:** {q.get('what_a_strong_answer_looks_like')}\n")
        lines.append("**Strong answer signals:**")
        for s in (q.get("strong_answer_signals") or [])[:8]:
            lines.append(f"- {s}")
        lines.append("\n**Red flags:**")
        for s in (q.get("red_flags") or [])[:8]:
            lines.append(f"- {s}")
        lines.append("\n**Suggested follow-ups:**")
        for s in (q.get("suggested_followups") or [])[:6]:
            lines.append(f"- {s}")
        lines.append("\n**Evidence URLs:**")
        for u in (q.get("evidence_urls") or [])[:5]:
            lines.append(f"- {u}")
        lines.append("\n---\n")

    if notes.get("assumptions"):
        lines.append("## Assumptions")
        for a in notes.get("assumptions", [])[:12]:
            lines.append(f"- {a}")
        lines.append("")
    if notes.get("open_points"):
        lines.append("## Open Points")
        for a in notes.get("open_points", [])[:12]:
            lines.append(f"- {a}")
        lines.append("")
    return "\n".join(lines)

MD_PATH = ART_DIR / f"top_meeting_questions_{run_id}.md"
MD_PATH.write_text(to_md(out), encoding="utf-8")
print("✅ Top meeting questions MD saved:", MD_PATH.as_posix())

# CSV
rows = []
for q in out.get("top_questions", []) or []:
    rows.append({
        "run_id": run_id,
        "rank": q.get("rank"),
        "theme": q.get("theme"),
        "question": q.get("question"),
        "why_it_matters": q.get("why_it_matters"),
        "what_a_strong_answer_looks_like": q.get("what_a_strong_answer_looks_like"),
        "strong_answer_signals": " | ".join(q.get("strong_answer_signals") or []),
        "red_flags": " | ".join(q.get("red_flags") or []),
        "suggested_followups": " | ".join(q.get("suggested_followups") or []),
        "evidence_urls": " | ".join(q.get("evidence_urls") or []),
    })

df_q = pd.DataFrame(rows).sort_values("rank")
CSV_PATH = ART_DIR / f"top_meeting_questions_{run_id}.csv"
df_q.to_csv(CSV_PATH, index=False)
print("✅ Top meeting questions CSV saved:", CSV_PATH.as_posix())

display(df_q)


run_id: 20260107_225628
startup_name: Sakana AI
evidence_urls: 25
✅ Top meeting questions JSON saved: artifacts/meeting_deep_dive/top_meeting_questions_20260107_225628.json
✅ Raw model response saved: artifacts/meeting_deep_dive/top_meeting_questions_raw_20260107_225628.txt
✅ Top meeting questions MD saved: artifacts/meeting_deep_dive/top_meeting_questions_20260107_225628.md
✅ Top meeting questions CSV saved: artifacts/meeting_deep_dive/top_meeting_questions_20260107_225628.csv


Unnamed: 0,run_id,rank,theme,question,why_it_matters,what_a_strong_answer_looks_like,strong_answer_signals,red_flags,suggested_followups,evidence_urls
0,20260107_225628,1,product,Can you provide specific metrics on the perfor...,Understanding the effectiveness and real-world...,"Detailed metrics showing user adoption rates, ...",High user adoption rates | Positive customer f...,Lack of data on performance | Negative custome...,What specific sectors have seen the most succe...,https://sakana.ai/ai-scientist/
1,20260107_225628,2,market,How do you differentiate Sakana AI from establ...,Identifying Sakana AI's unique value propositi...,"Clear articulation of unique features, advanta...",Unique technology features | Clear differentia...,Vague differentiation | Inability to articulat...,What specific features do customers prefer? | ...,https://promptloop.com/directory/what-does-sak...
2,20260107_225628,3,team,What is the background and expertise of the te...,The team's expertise and experience are critic...,"A diverse team with relevant experience in AI,...",Experienced leadership | Diverse skill sets | ...,Lack of relevant experience | High turnover in...,What are the team's previous successes? | How ...,https://www.japantimes.co.jp/business/2025/09/...
3,20260107_225628,4,funding,How do you plan to utilize the recent $135 mil...,Understanding the allocation of funds will pro...,A clear plan detailing how funds will be used ...,Specific allocation of funds | Clear growth st...,Vague plans for fund utilization | Lack of str...,What are the key milestones you aim to achieve...,https://siliconangle.com/2025/11/17/sakana-ai-...
4,20260107_225628,5,risks,What are the primary risks you foresee in scal...,Identifying potential risks will help assess t...,"A comprehensive understanding of market risks,...",Clear risk identification | Proactive risk man...,Lack of risk awareness | No contingency plans ...,How do you plan to address these risks? | What...,https://www.retailbankerinternational.com/news...


In [6]:
# ============================================================
# 14. NG Questions & Watchouts (Do/Don't list + Risk cues)
# ============================================================
# Goal:
# - Generate "NG questions" (questions to avoid) and "watchouts" for the meeting, based on:
#   - Integrated Insights (#11)
#   - Meeting Context Framing (#12)
#   - Top 5 Meeting Questions (#13)
# - Output should help the team avoid:
#   - asking poorly framed / biased / overly leading questions
#   - triggering defensiveness or confidentiality issues
#   - missing key risk cues (watchouts) during the conversation
#
# Deliverables:
# 1) NG Questions (bad questions) with "why bad" + "better alternative"
# 2) Watchouts (things to listen for) with "what it might indicate" + "follow-up"
# 3) Tone / posture guidance (how to ask, sequencing, pacing)
#
# Outputs:
# - ng_questions_watchouts_<run_id>.json
# - ng_questions_watchouts_<run_id>.md
# - ng_questions_watchouts_<run_id>.csv (optional)
# - ng_questions_watchouts_raw_<run_id>.txt (debug)

import os
import re
import json
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv
from openai import OpenAI

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
load_dotenv("env.txt")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")
client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def latest_path(pattern: str) -> Path | None:
    paths = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    return paths[0] if paths else None

def safe_read_json(path: Path | None) -> dict:
    if not path or not path.exists():
        return {}
    return json.loads(path.read_text(encoding="utf-8"))

def safe_read_text(path: Path | None) -> str:
    if not path or not path.exists():
        return ""
    return path.read_text(encoding="utf-8")

def _extract_json_object(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", t, flags=re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip()
    start, end = t.find("{"), t.rfind("}")
    if start != -1 and end != -1 and end > start:
        return t[start:end+1].strip()
    return ""

def normalize_obj(obj: dict) -> dict:
    """
    Normalize common failure mode where model echoes input or uses alternate keys.
    """
    if not isinstance(obj, dict):
        return {}
    if set(obj.keys()) == {"schema", "bundle"}:
        return {}
    # Alternate top-level
    if "ng_questions" not in obj:
        for k in ["bad_questions", "do_not_ask", "dont_ask_questions"]:
            if k in obj and isinstance(obj[k], list):
                obj["ng_questions"] = obj[k]
                break
    if "watchouts" not in obj:
        for k in ["red_flags", "risk_cues", "things_to_watch"]:
            if k in obj and isinstance(obj[k], list):
                obj["watchouts"] = obj[k]
                break
    return obj

def validate_out(obj: dict):
    if "ng_questions" not in obj or "watchouts" not in obj:
        raise ValueError("Missing ng_questions or watchouts.")
    if len(obj.get("ng_questions") or []) < 8:
        raise ValueError("Need at least 8 NG questions.")
    if len(obj.get("watchouts") or []) < 10:
        raise ValueError("Need at least 10 watchouts.")
    if "tone_guidance" not in obj:
        raise ValueError("Missing tone_guidance.")

# ------------------------------------------------------------
# Load inputs / integrated / framing / top questions / evidence
# ------------------------------------------------------------
inputs = safe_read_json(latest_path("inputs_*.json"))
entity = safe_read_json(latest_path("entity_*.json"))
integrated = safe_read_json(latest_path("integrated_insights_*.json"))
framing = safe_read_json(latest_path("meeting_context_framing_*.json"))
topq = safe_read_json(latest_path("top_meeting_questions_*.json"))

integrated_md = safe_read_text(latest_path("integrated_insights_*.md"))
framing_md = safe_read_text(latest_path("meeting_context_framing_*.md"))
topq_md = safe_read_text(latest_path("top_meeting_questions_*.md"))

run_id = (inputs.get("meta", {}) or {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
startup_name = (entity.get("canonical_name") or inputs.get("startup_name") or "Unknown Startup").strip()

top_urls = []
ev_path = latest_path("integrated_evidence_map_*.csv")
if ev_path and ev_path.exists():
    try:
        df_ev = pd.read_csv(ev_path)
        if "url" in df_ev.columns:
            top_urls = df_ev["url"].dropna().astype(str).head(30).tolist()
    except Exception:
        pass

bundle = {
    "startup_name": startup_name,
    "meeting_person": {"name": inputs.get("meeting_person_name"), "title": inputs.get("meeting_person_title")},
    "meeting_context": inputs.get("meeting_context"),
    "your_org_context": inputs.get("your_org_context"),
    "integrated_insights_json": integrated,
    "meeting_context_framing_json": framing,
    "top_meeting_questions_json": topq,
    "integrated_md_excerpt": integrated_md[:4500],
    "framing_md_excerpt": framing_md[:3500],
    "top_questions_md_excerpt": topq_md[:3000],
    "allowed_evidence_urls": top_urls[:30],
}

print("run_id:", run_id)
print("startup_name:", startup_name)

# ------------------------------------------------------------
# OpenAI: NG questions + watchouts
# ------------------------------------------------------------
SYSTEM_BASE = """
You are a venture diligence assistant.

TASK:
Using the provided bundle (integrated insights + meeting framing + top questions),
produce (A) NG questions to avoid and (B) watchouts to listen for during the meeting.

OUTPUT FORMAT (STRICT JSON):
Return a single JSON object with EXACTLY these top-level keys:
{
  "ng_questions": [...],
  "watchouts": [...],
  "tone_guidance": {...},
  "confidence": {...}
}

NG QUESTIONS REQUIREMENTS:
- Provide 10 items.
- Each item:
{
  "category": "bias/leading|too_broad|too_sensitive|confidentiality|premature_pricing|other",
  "ng_question": "string",
  "why_bad": "string",
  "better_alternative": "string",
  "when_to_use_instead": "string"
}

WATCHOUTS REQUIREMENTS:
- Provide 12 items.
- Each item:
{
  "signal": "string (what you might hear/see)",
  "why_it_matters": "string",
  "what_it_might_indicate": "string",
  "follow_up_question": "string",
  "severity": "low|medium|high"
}

TONE GUIDANCE REQUIREMENTS:
{
  "posture": ["array of strings"],
  "sequencing": ["array of strings (how to order topics)"],
  "phrasing_patterns": ["array of strings (templates)"],
  "things_to_avoid": ["array of strings"]
}

NON-NEGOTIABLE RULES:
- Do not invent facts.
- Keep it practical and meeting-usable.
- Do not include any other top-level keys.
""".strip()

SYSTEM_STRONG = (SYSTEM_BASE + """

RETRY MODE:
- Your previous output was invalid.
- You MUST comply with the exact JSON schema above.
- Do NOT output "schema" or "bundle".
""").strip()

def call_model(bundle: dict, model: str, strong: bool = False) -> tuple[dict, str]:
    system = SYSTEM_STRONG if strong else SYSTEM_BASE
    user = json.dumps(bundle, ensure_ascii=False)
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "system", "content": system},
                      {"role": "user", "content": user}],
            temperature=0.2,
            response_format={"type": "json_object"},
        )
        raw = (resp.choices[0].message.content or "").strip()
        out = json.loads(raw) if raw else {}
        return out, raw
    except Exception:
        resp2 = client.chat.completions.create(
            model=model,
            messages=[{"role": "system", "content": system},
                      {"role": "user", "content": user}],
            temperature=0.2,
        )
        raw = (resp2.choices[0].message.content or "").strip()
        candidate = _extract_json_object(raw)
        out = json.loads(candidate) if candidate else {}
        return out, raw

RAW_TXT = ART_DIR / f"ng_questions_watchouts_raw_{run_id}.txt"

# Try 1: mini
out, raw = call_model(bundle, model="gpt-4o-mini", strong=False)
RAW_TXT.write_text(raw, encoding="utf-8")
out = normalize_obj(out)

# Try 2: mini (strong)
if not out or "ng_questions" not in out:
    print("⚠️ Invalid output -> retry with stronger prompt...")
    out2, raw2 = call_model(bundle, model="gpt-4o-mini", strong=True)
    RAW_TXT.write_text(raw2, encoding="utf-8")
    out = normalize_obj(out2)

# Try 3: gpt-4o (strong) if still invalid
if not out or "ng_questions" not in out:
    print("⚠️ Still invalid -> retry with gpt-4o (strong)...")
    out3, raw3 = call_model(bundle, model="gpt-4o", strong=True)
    RAW_TXT.write_text(raw3, encoding="utf-8")
    out = normalize_obj(out3)

validate_out(out)

# ------------------------------------------------------------
# Save outputs
# ------------------------------------------------------------
JSON_PATH = ART_DIR / f"ng_questions_watchouts_{run_id}.json"
JSON_PATH.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
print("✅ NG questions & watchouts JSON saved:", JSON_PATH.as_posix())
print("✅ Raw model response saved:", RAW_TXT.as_posix())

# Markdown
def to_md(x: dict) -> str:
    lines = []
    lines.append(f"# NG Questions & Watchouts: {startup_name}\n")

    lines.append("## NG Questions (Do not ask like this)\n")
    for i, q in enumerate((x.get("ng_questions") or [])[:10], start=1):
        lines.append(f"### {i}. {q.get('category')}")
        lines.append(f"- **NG:** {q.get('ng_question')}")
        lines.append(f"- **Why bad:** {q.get('why_bad')}")
        lines.append(f"- **Better:** {q.get('better_alternative')}")
        lines.append(f"- **When to use instead:** {q.get('when_to_use_instead')}")
        lines.append("")

    lines.append("## Watchouts (Listen for these)\n")
    for i, w in enumerate((x.get("watchouts") or [])[:12], start=1):
        lines.append(f"### {i}. ({w.get('severity')}) {w.get('signal')}")
        lines.append(f"- **Why it matters:** {w.get('why_it_matters')}")
        lines.append(f"- **What it might indicate:** {w.get('what_it_might_indicate')}")
        lines.append(f"- **Follow-up:** {w.get('follow_up_question')}")
        lines.append("")

    tg = x.get("tone_guidance", {}) or {}
    lines.append("## Tone / Posture Guidance\n")
    lines.append("**Posture**")
    for b in (tg.get("posture") or [])[:12]:
        lines.append(f"- {b}")
    lines.append("\n**Sequencing**")
    for b in (tg.get("sequencing") or [])[:12]:
        lines.append(f"- {b}")
    lines.append("\n**Phrasing patterns**")
    for b in (tg.get("phrasing_patterns") or [])[:12]:
        lines.append(f"- {b}")
    lines.append("\n**Things to avoid**")
    for b in (tg.get("things_to_avoid") or [])[:12]:
        lines.append(f"- {b}")

    lines.append("\n")
    return "\n".join(lines)

MD_PATH = ART_DIR / f"ng_questions_watchouts_{run_id}.md"
MD_PATH.write_text(to_md(out), encoding="utf-8")
print("✅ NG questions & watchouts MD saved:", MD_PATH.as_posix())

# Optional CSV exports
df_ng = pd.DataFrame(out.get("ng_questions") or [])
df_w = pd.DataFrame(out.get("watchouts") or [])

if not df_ng.empty:
    df_ng.insert(0, "run_id", run_id)
if not df_w.empty:
    df_w.insert(0, "run_id", run_id)

CSV_NG = ART_DIR / f"ng_questions_{run_id}.csv"
CSV_W = ART_DIR / f"watchouts_{run_id}.csv"

df_ng.to_csv(CSV_NG, index=False)
df_w.to_csv(CSV_W, index=False)

print("✅ CSV saved:", CSV_NG.as_posix(), "and", CSV_W.as_posix())

display(df_ng.head(12))
display(df_w.head(12))


run_id: 20260107_225628
startup_name: Sakana AI
✅ NG questions & watchouts JSON saved: artifacts/meeting_deep_dive/ng_questions_watchouts_20260107_225628.json
✅ Raw model response saved: artifacts/meeting_deep_dive/ng_questions_watchouts_raw_20260107_225628.txt
✅ NG questions & watchouts MD saved: artifacts/meeting_deep_dive/ng_questions_watchouts_20260107_225628.md
✅ CSV saved: artifacts/meeting_deep_dive/ng_questions_20260107_225628.csv and artifacts/meeting_deep_dive/watchouts_20260107_225628.csv


Unnamed: 0,run_id,category,ng_question,why_bad,better_alternative,when_to_use_instead
0,20260107_225628,bias/leading,Isn't Sakana AI the best AI company in Japan?,This question is leading and biases the respon...,How do you perceive Sakana AI's position in th...,Use when seeking an unbiased perspective on ma...
1,20260107_225628,too_broad,Can you tell us everything about your technology?,This question is too broad and may overwhelm t...,Can you explain the key features of the AI Sci...,Use when you want focused information on speci...
2,20260107_225628,too_sensitive,What are your company's weaknesses?,This question may put the respondent on the de...,What challenges have you faced in your growth ...,Use when you want to understand challenges wit...
3,20260107_225628,confidentiality,Can you disclose your client list?,This question may breach confidentiality and t...,Can you share examples of industries you are w...,Use when seeking to understand market applicat...
4,20260107_225628,premature_pricing,What is the price of your product?,This question may be premature if the value pr...,How do you determine the value of your product...,Use when you want to understand pricing strate...
5,20260107_225628,other,Why should we invest in you?,This question may put pressure on the responde...,What unique value does Sakana AI bring to pote...,Use when you want to elicit a thoughtful respo...
6,20260107_225628,too_broad,What do you think about AI?,This question is too broad and lacks focus.,What trends do you see in the AI industry that...,Use when seeking insights on industry trends r...
7,20260107_225628,bias/leading,Isn't your technology superior to competitors?,This question is leading and may not elicit ho...,How do you compare your technology to that of ...,Use when seeking a balanced comparison.
8,20260107_225628,too_sensitive,Are you worried about your competition?,This question may make the respondent uncomfor...,What strategies do you have in place to addres...,Use when you want to discuss competition in a ...
9,20260107_225628,premature_pricing,What is your projected revenue for next year?,This question may be too early in the discussi...,What are your growth expectations for the upco...,Use when discussing future plans without focus...


Unnamed: 0,run_id,signal,why_it_matters,what_it_might_indicate,follow_up_question,severity
0,20260107_225628,Vague responses about technology capabilities.,Indicates a lack of clarity or confidence in t...,Potential weaknesses in the technology or its ...,Can you provide more specific examples of how ...,high
1,20260107_225628,Overemphasis on funding without clear growth s...,May suggest reliance on funding rather than su...,Lack of a solid plan for utilizing funds effec...,How do you plan to allocate the recent funding...,medium
2,20260107_225628,Mention of challenges in customer acquisition.,Highlights potential barriers to growth and ma...,Possible issues with product-market fit or com...,What specific challenges have you encountered ...,high
3,20260107_225628,Lack of competitive differentiation.,Indicates potential vulnerability in a crowded...,Unclear value proposition compared to competit...,What specific features set your product apart ...,high
4,20260107_225628,Heavy reliance on the Japanese market.,Limits growth potential and exposes the compan...,Lack of international expansion strategy.,What are your plans for expanding beyond the J...,medium
5,20260107_225628,Unclear metrics on customer satisfaction.,Lack of data can hinder understanding of produ...,Potential issues with product adoption or perf...,How do you measure customer satisfaction with ...,high
6,20260107_225628,Defensive tone when discussing competition.,May indicate insecurity about their market pos...,Concerns about competitive threats and market ...,How do you plan to address competitive threats...,medium
7,20260107_225628,Lack of clarity on regulatory challenges.,Regulatory hurdles can significantly impact bu...,Unpreparedness for compliance and operational ...,What regulatory challenges do you anticipate i...,high
8,20260107_225628,Overly optimistic projections without backing ...,Can indicate a disconnect between expectations...,Potential issues with planning and forecasting.,What data supports your growth projections?,medium
9,20260107_225628,Inconsistent information about funding history.,Inconsistencies can raise red flags about tran...,Possible mismanagement or misunderstanding of ...,Can you clarify the details of your funding ro...,high


In [10]:
# ============================================================
# 15. One-Page Summary Generation (Integrated Narrative Memo)
# ============================================================
# Goal:
# - Generate a narrative-style, one-page internal memo for an initial meeting.
# - Use paragraph-based prose (no bullet points).
# - Integrate findings from sections #3–#14.
# - Clearly explain:
#   - What the company is
#   - Who is behind it
#   - What it is building
#   - Who it sells to
#   - How it competes
#   - How it is funded and what recently changed
#
# Output:
# - one_pager_<run_id>.md
# - Rendered Markdown in the notebook

import os
import json
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display

# ------------------------------------------------------------
# Config
# ------------------------------------------------------------
load_dotenv("env.txt")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY is not set.")
client = OpenAI(api_key=OPENAI_API_KEY)

ART_DIR = Path("artifacts") / "meeting_deep_dive"
ART_DIR.mkdir(parents=True, exist_ok=True)

def latest_path(pattern: str):
    paths = sorted(ART_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
    return paths[0] if paths else None

def safe_read_json(path):
    if not path or not path.exists():
        return {}
    return json.loads(path.read_text(encoding="utf-8"))

# ------------------------------------------------------------
# Load artifacts (best-effort)
# ------------------------------------------------------------
inputs = safe_read_json(latest_path("inputs_*.json"))
entity = safe_read_json(latest_path("entity_*.json"))

bundle = {
    "startup_name": entity.get("canonical_name") or inputs.get("startup_name"),
    "official_website": entity.get("official_website"),
    "meeting_person": {
        "name": inputs.get("meeting_person_name"),
        "title": inputs.get("meeting_person_title"),
    },
    "company_basics": safe_read_json(latest_path("company_basics_*.json")),
    "key_people": safe_read_json(latest_path("people_extraction_*.json")),
    "meeting_person_deep_dive": safe_read_json(latest_path("meeting_person_deep_dive_*.json")),
    "business_product": safe_read_json(latest_path("business_product_extraction_*.json")),
    "customer_market": safe_read_json(latest_path("market_extraction_*.json")),
    "competitive": safe_read_json(latest_path("competitive_extraction_*.json")),
    "funding": safe_read_json(latest_path("funding_extraction_*.json")),
    "recent_changes": safe_read_json(latest_path("recent_timeline_extraction_*.json")),
    "integrated_insights": safe_read_json(latest_path("integrated_insights_*.json")),
}

run_id = (inputs.get("meta", {}) or {}).get("run_id") or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

# ------------------------------------------------------------
# OpenAI: narrative one-pager
# ------------------------------------------------------------
SYSTEM_PROMPT = """
You are a venture capital research associate preparing a one-page internal memo
for an initial meeting with a startup.

Write in clear, professional prose.
Do NOT use bullet points.
Do NOT use labels such as "UNKNOWN" or "TODO".

Structure the memo EXACTLY as follows, using Markdown headings:

# {startup_name} — Initial Meeting Brief

## Company Overview
Explain what the company is, where it is based, what problem it is tackling,
and why it is attracting attention now.

## Team & Leadership
Describe the founding and leadership team, their backgrounds, and why they are
credible for this problem.

## Business, Product, and Technology
Explain what the company is building, how the technology works at a high level,
and what makes it distinctive.

## Customers, Market, and Go-To-Market
Describe the target customers, market context, and how the company appears to
approach adoption and distribution.

## Competitive Landscape
Explain who the company competes with and how it positions itself relative to
large global players.

## Funding and Recent Developments
Summarize funding history, notable investors, and recent developments or
strategic shifts.

Writing guidelines:
- Paragraphs only (no lists).
- Be factual where possible, but naturally note areas where clarity is still emerging.
- Maintain a neutral, analytical investment tone.
- Length: roughly 900–1300 words.

Output Markdown only.
""".strip()

resp = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT.format(
            startup_name=bundle["startup_name"] or "The Company"
        )},
        {"role": "user", "content": json.dumps(bundle, ensure_ascii=False)},
    ],
    temperature=0.25,
)

md = (resp.choices[0].message.content or "").strip()

# ------------------------------------------------------------
# Save & display
# ------------------------------------------------------------
MD_PATH = ART_DIR / f"one_pager_{run_id}.md"
MD_PATH.write_text(md, encoding="utf-8")
print("✅ One-pager saved:", MD_PATH.as_posix())

display(Markdown(md))


✅ One-pager saved: artifacts/meeting_deep_dive/one_pager_20260107_225628.md


# Sakana AI — Initial Meeting Brief

## Company Overview
Sakana AI is a burgeoning artificial intelligence startup headquartered in Tokyo, Japan. The company is gaining significant attention for its innovative approach to AI development, which is inspired by natural processes. Sakana AI focuses on creating AI systems that mimic evolutionary and collective intelligence principles, aiming to develop more efficient and adaptive models. This approach is particularly timely as the demand for AI solutions tailored to specific cultural and linguistic contexts grows, especially in Japan. The company's recent success in securing substantial funding underscores its potential to lead in the AI sector, particularly within the Japanese market.

## Team & Leadership
The leadership team at Sakana AI is composed of highly credible figures in the AI and technology sectors. David Ha, the co-founder and CEO, brings a wealth of experience from his previous roles as a research scientist at Google Brain and a managing director at Goldman Sachs. His transition from finance to AI research highlights a unique blend of skills that are instrumental in steering Sakana AI's strategic direction. Llion Jones, the CTO, is renowned for his contributions to the transformer architecture, having co-authored the influential "Attention is All You Need" paper. His expertise in large-scale language models is pivotal to the company's technological advancements. Ren Ito, the COO, complements the team with his background in e-commerce and diplomacy, having served as the CEO of Mercari Europe and worked within Japan's Ministry of Foreign Affairs. This diverse leadership team is well-equipped to address the complex challenges of developing and deploying AI technologies.

## Business, Product, and Technology
Sakana AI is at the forefront of developing AI technologies that draw inspiration from natural phenomena. The company's flagship product, the AI Scientist, is designed to autonomously conduct scientific research, generate hypotheses, and produce peer-reviewed papers. This system leverages evolutionary algorithms to merge existing models, creating new, optimized AI systems that are both cost-effective and efficient. Additionally, Sakana AI employs the Continuous Thought Machine (CTM) to enhance reasoning capabilities through synchronized neuron dynamics. These technologies are distinctive due to their focus on reducing computational costs and their ability to autonomously generate scientific insights, positioning Sakana AI as a leader in nature-inspired AI development.

## Customers, Market, and Go-To-Market
Sakana AI targets a diverse range of customers, including financial institutions, government agencies, and manufacturing companies. The company's AI solutions are particularly appealing to sectors that require advanced decision-making capabilities and tailored AI models. In the Japanese market, Sakana AI is positioned as a leader in the emerging AI ecosystem, collaborating with major corporations and government entities. The company has established strategic partnerships with financial giants like MUFG Bank and Daiwa Securities, which facilitate the integration of AI into banking operations and asset management. Sakana AI's go-to-market strategy emphasizes collaboration with local enterprises and leveraging its nature-inspired technology to meet specific market needs.

## Competitive Landscape
In the competitive landscape, Sakana AI positions itself against global AI leaders such as OpenAI and Google DeepMind. While these companies have a broader global reach and established products, Sakana AI differentiates itself through its focus on nature-inspired intelligence and evolutionary algorithms. The company's emphasis on developing AI models tailored to Japan's linguistic and cultural environment provides a unique competitive edge. Additionally, Sakana AI's approach to AI safety and sustainability aligns with the growing demand for ethical AI solutions, positioning it favorably against competitors like Anthropic, which also prioritize AI safety.

## Funding and Recent Developments
Sakana AI has demonstrated robust financial growth, having raised approximately $347 million across multiple funding rounds. The company secured $135 million in its recent Series B round, achieving a post-money valuation of approximately $2.635 billion. Notable investors include Mitsubishi UFJ Financial Group, Khosla Ventures, and New Enterprise Associates, highlighting strong backing from both strategic and venture capital investors. Recent developments include Sakana AI's recognition as Japan's most valuable unicorn and its strategic shift towards expanding into defense and manufacturing sectors. These developments underscore the company's commitment to leveraging its innovative AI solutions to address diverse market needs while maintaining a strong focus on the Japanese market.

In [11]:
# ============================================================
# 16. Export & Save Artifacts (Run-level manifest)
# ============================================================
# Purpose:
# - Collect all key artifacts generated in this run
# - Create a simple manifest for reuse (Notion / Drive / IC memo)
# - No new analysis is performed in this step

import json
from pathlib import Path
from datetime import datetime, timezone

ART_DIR = Path("artifacts") / "meeting_deep_dive"
EXPORT_DIR = ART_DIR / "exports"
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

run_id = "20260107_225628"  # or read from inputs if you prefer

# ------------------------------------------------------------
# Helper: collect existing artifacts
# ------------------------------------------------------------
def collect(patterns):
    files = []
    for pat in patterns:
        files.extend(sorted(ART_DIR.glob(pat)))
    return [p.as_posix() for p in files]

manifest = {
    "run_id": run_id,
    "exported_at_utc": datetime.now(timezone.utc).isoformat(),
    "artifacts": {
        "entity": collect(["entity_*.json"]),
        "company_basics": collect(["company_*.*"]),
        "people": collect(["people_*.*"]),
        "meeting_person_deep_dive": collect(["meeting_person_deep_dive_*.*"]),
        "business_product": collect(["business_product_*.*"]),
        "market": collect(["market_*.*"]),
        "competitive": collect(["competitive_*.*"]),
        "funding": collect(["funding_*.*"]),
        "recent_timeline": collect(["recent_*.*"]),
        "integrated_insights": collect(["integrated_insights_*.*"]),
        "meeting_context_framing": collect(["meeting_context_framing_*.*"]),
        "top_meeting_questions": collect(["top_meeting_questions_*.*"]),
        "ng_questions_watchouts": collect(["ng_questions_watchouts_*.*"]),
        "one_pager": collect(["one_pager_*.*"]),
    }
}

MANIFEST_PATH = EXPORT_DIR / f"manifest_{run_id}.json"
MANIFEST_PATH.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")

print("✅ Export manifest saved:", MANIFEST_PATH.as_posix())

# Optional: quick view
for k, v in manifest["artifacts"].items():
    if v:
        print(f"- {k}: {len(v)} files")


✅ Export manifest saved: artifacts/meeting_deep_dive/exports/manifest_20260107_225628.json
- entity: 1 files
- company_basics: 5 files
- people: 5 files
- meeting_person_deep_dive: 1 files
- business_product: 5 files
- market: 5 files
- competitive: 5 files
- funding: 6 files
- recent_timeline: 4 files
- integrated_insights: 2 files
- meeting_context_framing: 2 files
- top_meeting_questions: 4 files
- ng_questions_watchouts: 3 files
- one_pager: 3 files
