In [24]:
# ============================================================
# 008_Startup_Profile_Reconstruction
# ============================================================
#
# Overview
# ----------------
# This notebook reconstructs structured startup profiles from publicly
# available web information.
#
# Starting from a list of target companies, it collects raw textual signals
# from seed URLs and search-based discovery (e.g., official websites,
# announcements, and other public pages), applies LLM-based information
# extraction and normalization, and produces analysis-ready,
# startup-level profiles.
#
# The goal is to transform unstructured public information into a
# comparable, inspectable data structure with explicit evidence and
# confidence signals, enabling reuse across downstream portfolio analysis,
# VC comparison, and research workflows.
#
#
# Inputs / Outputs
# ----------------
# Inputs:
# - List of target startups (names and optional seed URLs)
# - Google Custom Search Engine (CSE) configuration for public web discovery
# - Environment variables for API access (search and LLM)
# - Configuration parameters for scraping, filtering, and summarization
#
# Outputs:
# - Structured startup profiles (JSON / pandas DataFrame)
# - Field-level attributes with evidence URLs and confidence scores
# - Human-readable narrative reports for inspection and sharing
#
#
# Structure
# ----------------
# Cell 0 : Notebook purpose and design assumptions
# Cell 1 : Imports and global configuration
# Cell 2 : Load input startup list and metadata
# Cell 3 : Fetch public web content for each startup (seed + CSE, robots-aware)
# Cell 4 : Clean and normalize raw textual data
# Cell 5 : LLM-based profile reconstruction (structured attributes with evidence)
# Cell 6 : Assemble structured startup profiles (field-level and flat tables)
# Cell 7 : Export artifacts and narrative reports for downstream analysis
#
#
# Notes
# ----------------
# - This notebook focuses on profile reconstruction, not evaluation or scoring.
# - All assumptions, filters, and heuristics are kept explicit for traceability.
# - LLM outputs are treated as extractive, not authoritative; evidence URLs
#   and confidence scores are preserved for auditability.
# - The resulting artifacts are designed to be reusable across multiple
#   research, portfolio, and investment analysis contexts.


In [2]:
# ============================================================
# Cell 1 : Imports and Global Configuration
# ============================================================

# Standard library imports
import os
import json
import time
from typing import List, Dict, Any
from dotenv import load_dotenv
# Third-party imports
import requests
import pandas as pd
from bs4 import BeautifulSoup

# (Optional) progress visualization
from tqdm import tqdm

# LLM / API related imports
import openai

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
# Load env.txt explicitly (recommended for local + GitHub Actions parity)
load_dotenv("env.txt")

# --- OpenAI API (required) ---
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None:
    raise EnvironmentError("OPENAI_API_KEY is not set in the environment variables.")

openai.api_key = OPENAI_API_KEY

# --- Google Custom Search Engine (required for this notebook) ---
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_CX = os.getenv("GOOGLE_CSE_CX")

if GOOGLE_API_KEY is None or GOOGLE_CSE_CX is None:
    raise EnvironmentError(
        "GOOGLE_API_KEY and GOOGLE_CSE_CX must be set in the environment variables."
    )

# Request configuration
REQUEST_TIMEOUT = 30          # seconds
REQUEST_SLEEP_SEC = 1.0       # polite delay between requests

# LLM configuration
LLM_MODEL_NAME = "gpt-4.1-mini"   # adjust as needed
LLM_TEMPERATURE = 0.2             # low temperature for stable summaries
LLM_MAX_TOKENS = 800

# Text processing configuration
MAX_TEXT_LENGTH = 6000            # truncate long web text before LLM input

# Output configuration
OUTPUT_DIR = "artifacts"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Imports and global configuration loaded successfully.")

Imports and global configuration loaded successfully.


In [4]:
# ============================================================
# Cell 2 : Load Input Startup List and Metadata
# ============================================================

import re
from datetime import datetime

import ipywidgets as widgets
from IPython.display import display, Markdown

display(Markdown("### Input: Startup List"))
display(Markdown(
    "Enter one startup per line. You can optionally add a website URL after a comma.\n\n"
    "**Examples**:\n"
    "- Eureka Robotics, https://eurekarobotics.com/\n"
    "- Notion, https://www.notion.so\n"
    "- SmartHR, https://smarthr.jp"
))

# Widget for user input
startup_input = widgets.Textarea(
    value="",
    placeholder="One startup per line. Optionally: Name, https://example.com",
    description="Startups:",
    layout=widgets.Layout(width="100%", height="180px"),
)

run_button = widgets.Button(
    description="Load list",
    button_style="primary",
    tooltip="Parse the input into a structured table",
)

output = widgets.Output()

display(startup_input, run_button, output)

def _normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url):
        url = "https://" + url
    return url

def _parse_startup_lines(text: str):
    rows = []
    for raw_line in (text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Format: "Name, URL" (URL optional)
        if "," in line:
            name, url = line.split(",", 1)
            name = name.strip()
            url = _normalize_url(url)
        else:
            name = line
            url = ""

        rows.append({"startup_name": name, "seed_url": url})
    return rows

def on_click(_):
    with output:
        output.clear_output()

        rows = _parse_startup_lines(startup_input.value)
        if not rows:
            print("No startups provided. Please enter at least one startup name.")
            return

        df_startups = pd.DataFrame(rows)

        # Add lightweight metadata (traceability)
        df_startups["input_source"] = "manual_widget"
        df_startups["created_at"] = datetime.utcnow().isoformat(timespec="seconds") + "Z"
        df_startups["startup_id"] = (
            df_startups["startup_name"].str.lower().str.strip()
            .str.replace(r"\s+", "-", regex=True)
            .str.replace(r"[^a-z0-9\-]", "", regex=True)
        )

        # Basic de-duplication by normalized name
        df_startups = df_startups.drop_duplicates(subset=["startup_id"]).reset_index(drop=True)

        print(f"Loaded {len(df_startups)} unique startups.")
        display(df_startups)

        # Expose as a global variable for downstream cells
        globals()["df_startups"] = df_startups

run_button.on_click(on_click)


### Input: Startup List

Enter one startup per line. You can optionally add a website URL after a comma.

**Examples**:
- Eureka Robotics, https://eurekarobotics.com/
- Notion, https://www.notion.so
- SmartHR, https://smarthr.jp

Textarea(value='', description='Startups:', layout=Layout(height='180px', width='100%'), placeholder='One star…

Button(button_style='primary', description='Load list', style=ButtonStyle(), tooltip='Parse the input into a s…

Output()

In [6]:
# ============================================================
# Cell 3 : Fetch Public Web Content for Each Startup
# ============================================================
#
# What this cell does
# - Always runs Google CSE queries (even if a seed URL exists)
# - Builds a small set of candidate URLs per startup (seed + CSE)
# - Checks robots.txt (best-effort) and fetches only allowed pages
# - Extracts lightweight plain text for downstream LLM processing
# - Outputs df_pages (page-level dataset) for subsequent reconstruction

import re
import time
import urllib.parse
import urllib.robotparser
from datetime import datetime
from urllib.parse import urlparse

from bs4 import BeautifulSoup


# ------------------------------------------------------------
# Preconditions
# ------------------------------------------------------------
if "df_startups" not in globals():
    raise ValueError("df_startups is not defined. Please run Cell 2 first.")

if GOOGLE_API_KEY is None or GOOGLE_CSE_CX is None:
    raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CSE_CX are required for Cell 3.")

# Customize the user agent if you want a stable identifier.
USER_AGENT = "Mozilla/5.0 (researchOS; startup-profile-reconstruction)"


# ------------------------------------------------------------
# Config knobs
# ------------------------------------------------------------
CSE_RESULTS_PER_QUERY = 3                 # keep it light
MAX_CANDIDATE_URLS_PER_STARTUP = 12       # scrape only a small shortlist
MAX_URLS_PER_QUERY_TO_KEEP = 3            # cap per-query contribution
REQUEST_SLEEP_SEC = float(REQUEST_SLEEP_SEC)  # ensure numeric

SKIP_FILE_EXTENSIONS = {
    ".pdf", ".zip", ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg",
    ".mp4", ".mov", ".avi", ".mp3", ".wav", ".json", ".xml"
}

# If you want to constrain third-party sources, add domains here.
# Leave empty list to allow any domain (robots permitting).
ALLOWLIST_DOMAINS = []  # e.g., ["eurekarobotics.com", "crunchbase.com", "linkedin.com"]


# ------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------
def _normalize_url(url: str) -> str:
    url = (url or "").strip()
    if not url:
        return ""
    if not re.match(r"^https?://", url):
        url = "https://" + url
    return url


def _is_probably_html_url(url: str) -> bool:
    u = (url or "").strip().lower()
    path = urlparse(u).path
    for ext in SKIP_FILE_EXTENSIONS:
        if path.endswith(ext):
            return False
    return True


def _domain_allowed(url: str) -> bool:
    if not ALLOWLIST_DOMAINS:
        return True
    dom = urlparse(url).netloc.lower()
    return any(dom == d or dom.endswith("." + d) for d in [x.lower() for x in ALLOWLIST_DOMAINS])


def google_cse_search(query: str, api_key: str, cx: str, num: int = 5) -> list:
    endpoint = "https://www.googleapis.com/customsearch/v1"
    params = {"key": api_key, "cx": cx, "q": query, "num": num}
    r = requests.get(endpoint, params=params, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    data = r.json()
    return data.get("items", [])


_robots_cache = {}  # robots_url -> RobotFileParser

def is_allowed_by_robots(url: str, user_agent: str = USER_AGENT) -> bool:
    """
    Best-effort robots.txt check.
    If robots.txt cannot be fetched or parsed, we default to NOT allowed (safer).
    """
    try:
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            return False

        base = f"{parsed.scheme}://{parsed.netloc}"
        robots_url = urllib.parse.urljoin(base, "/robots.txt")

        if robots_url not in _robots_cache:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(robots_url)
            rp.read()
            _robots_cache[robots_url] = rp

        rp = _robots_cache[robots_url]
        return rp.can_fetch(user_agent, url)
    except Exception:
        return False


def fetch_html(url: str, user_agent: str = USER_AGENT) -> tuple[str, int, str]:
    """
    Fetch HTML and return (html_text, status_code, content_type).
    Raises for non-2xx responses.
    """
    headers = {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9,ja;q=0.8",
    }
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT, allow_redirects=True)
    r.raise_for_status()
    content_type = (r.headers.get("Content-Type") or "").lower()
    return r.text, r.status_code, content_type


def extract_main_text(html: str) -> str:
    """
    Very lightweight HTML-to-text.
    (You can later replace with readability-lxml or trafilatura if desired.)
    """
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text).strip()

    if len(text) > MAX_TEXT_LENGTH:
        text = text[:MAX_TEXT_LENGTH] + " ...[TRUNCATED]"
    return text


def build_cse_probes(startup_name: str, seed_url: str = "") -> list[dict]:
    """
    Build probes for multiple 'facets' we care about.
    We generate two query sets:
      A) site-restricted (official domain) if seed_url exists
      B) open web (for funding/press/third-party)
    Returns a list of dicts: {facet, query, scope}
    """
    base = startup_name.strip()

    facets = [
        ("business", "business overview OR company overview OR what we do"),
        ("product", "product OR service OR solution"),
        ("founders", "founder OR co-founder"),
        ("team", "team OR leadership OR management"),
        ("advantage", "competitive advantage OR differentiation"),
        ("market", "market OR industry OR use cases"),
        ("customers", "customer OR clients OR case study OR testimonial"),
        ("funding", "funding OR raised OR seed round OR Series A OR investment"),
        ("press", "press OR news OR announcement"),
    ]

    probes = []

    # A) Site-restricted probes (if seed provided)
    if seed_url:
        domain = urlparse(seed_url).netloc
        if domain:
            for facet, phrase in facets:
                probes.append(
                    {"facet": facet, "scope": "official_site", "query": f"{base} {phrase} site:{domain}"}
                )

    # B) Open web probes (always)
    for facet, phrase in facets:
        probes.append({"facet": facet, "scope": "open_web", "query": f"{base} {phrase}"})

    return probes


def collect_cse_urls(probes: list[dict]) -> list[dict]:
    """
    Run probes and collect URL candidates with provenance.
    Returns a list of dicts: {url, title, snippet, facet, scope, query}
    """
    out = []
    for p in probes:
        q = p["query"]
        facet = p["facet"]
        scope = p["scope"]

        items = google_cse_search(q, api_key=GOOGLE_API_KEY, cx=GOOGLE_CSE_CX, num=CSE_RESULTS_PER_QUERY)

        kept = 0
        for it in items:
            link = (it.get("link") or "").strip()
            if not link:
                continue
            out.append(
                {
                    "url": link,
                    "cse_title": (it.get("title") or "").strip(),
                    "cse_snippet": (it.get("snippet") or "").strip(),
                    "cse_facet": facet,
                    "cse_scope": scope,
                    "cse_query": q,
                }
            )
            kept += 1
            if kept >= MAX_URLS_PER_QUERY_TO_KEEP:
                break

        time.sleep(REQUEST_SLEEP_SEC)
    return out


def merge_seed_and_cse(seed_url: str, cse_url_rows: list[dict], limit: int) -> list[dict]:
    """
    Merge seed URL + CSE URLs, dedupe by URL while preserving order.
    Seed is tagged as facet='seed' for provenance.
    """
    merged = []
    if seed_url:
        merged.append(
            {
                "url": seed_url,
                "cse_title": "",
                "cse_snippet": "",
                "cse_facet": "seed",
                "cse_scope": "seed",
                "cse_query": "",
            }
        )

    merged.extend(cse_url_rows)

    seen = set()
    deduped = []
    for r in merged:
        u = r["url"]
        if u in seen:
            continue
        seen.add(u)
        deduped.append(r)

    # Prefer seed-domain URLs early (if seed exists), but keep others too
    if seed_url:
        seed_domain = urlparse(seed_url).netloc
        same = [r for r in deduped if urlparse(r["url"]).netloc == seed_domain]
        other = [r for r in deduped if urlparse(r["url"]).netloc != seed_domain]
        deduped = same + other

    return deduped[:limit]


# ------------------------------------------------------------
# Main execution
# ------------------------------------------------------------
records = []
run_ts = datetime.utcnow().isoformat(timespec="seconds") + "Z"

for _, row in tqdm(df_startups.iterrows(), total=len(df_startups)):
    startup_id = row["startup_id"]
    startup_name = row["startup_name"]
    seed_url = _normalize_url(row.get("seed_url", "") or "")

    # --- Always run CSE (required) ---
    probes = build_cse_probes(startup_name, seed_url=seed_url)

    try:
        cse_url_rows = collect_cse_urls(probes)
    except Exception as e:
        # Since CSE is required, fail fast
        raise RuntimeError(
            f"Google CSE failed for '{startup_name}'. "
            f"Check GOOGLE_API_KEY / GOOGLE_CSE_CX / quota. Error: {e}"
        )

    if len(cse_url_rows) == 0:
        # Since CSE is required, fail fast (you can relax this if desired)
        raise RuntimeError(
            f"Google CSE returned no URLs for '{startup_name}'. "
            "CSE is required in this notebook. Consider adjusting probes or checking quota."
        )

    # Merge seed + CSE URLs
    candidates = merge_seed_and_cse(
        seed_url=seed_url,
        cse_url_rows=cse_url_rows,
        limit=MAX_CANDIDATE_URLS_PER_STARTUP,
    )

    # --- Scrape candidates (robots + allowlist + html-ish) ---
    for c in candidates:
        url = _normalize_url(c["url"])
        dom = urlparse(url).netloc

        # quick filters
        if not _is_probably_html_url(url):
            records.append(
                {
                    "run_ts": run_ts,
                    "startup_id": startup_id,
                    "startup_name": startup_name,
                    "seed_url": seed_url,
                    "url": url,
                    "domain": dom,
                    "cse_facet": c["cse_facet"],
                    "cse_scope": c["cse_scope"],
                    "cse_query": c["cse_query"],
                    "cse_title": c["cse_title"],
                    "cse_snippet": c["cse_snippet"],
                    "robots_allowed": None,
                    "http_status": None,
                    "content_type": None,
                    "title": None,
                    "text": "",
                    "error": "Skipped non-HTML-like URL (by extension filter).",
                }
            )
            continue

        if not _domain_allowed(url):
            records.append(
                {
                    "run_ts": run_ts,
                    "startup_id": startup_id,
                    "startup_name": startup_name,
                    "seed_url": seed_url,
                    "url": url,
                    "domain": dom,
                    "cse_facet": c["cse_facet"],
                    "cse_scope": c["cse_scope"],
                    "cse_query": c["cse_query"],
                    "cse_title": c["cse_title"],
                    "cse_snippet": c["cse_snippet"],
                    "robots_allowed": None,
                    "http_status": None,
                    "content_type": None,
                    "title": None,
                    "text": "",
                    "error": "Skipped by domain allowlist.",
                }
            )
            continue

        robots_ok = is_allowed_by_robots(url, user_agent=USER_AGENT)
        if not robots_ok:
            records.append(
                {
                    "run_ts": run_ts,
                    "startup_id": startup_id,
                    "startup_name": startup_name,
                    "seed_url": seed_url,
                    "url": url,
                    "domain": dom,
                    "cse_facet": c["cse_facet"],
                    "cse_scope": c["cse_scope"],
                    "cse_query": c["cse_query"],
                    "cse_title": c["cse_title"],
                    "cse_snippet": c["cse_snippet"],
                    "robots_allowed": False,
                    "http_status": None,
                    "content_type": None,
                    "title": None,
                    "text": "",
                    "error": "Blocked by robots.txt (best-effort check).",
                }
            )
            continue

        try:
            html, status, content_type = fetch_html(url, user_agent=USER_AGENT)

            # Light content-type guard (avoid downloading unexpected binaries)
            if "text/html" not in content_type and "application/xhtml+xml" not in content_type:
                records.append(
                    {
                        "run_ts": run_ts,
                        "startup_id": startup_id,
                        "startup_name": startup_name,
                        "seed_url": seed_url,
                        "url": url,
                        "domain": dom,
                        "cse_facet": c["cse_facet"],
                        "cse_scope": c["cse_scope"],
                        "cse_query": c["cse_query"],
                        "cse_title": c["cse_title"],
                        "cse_snippet": c["cse_snippet"],
                        "robots_allowed": True,
                        "http_status": status,
                        "content_type": content_type,
                        "title": None,
                        "text": "",
                        "error": f"Skipped non-HTML content-type: {content_type}",
                    }
                )
                continue

            soup = BeautifulSoup(html, "html.parser")
            page_title = (soup.title.string.strip() if soup.title and soup.title.string else None)
            text = extract_main_text(html)

            records.append(
                {
                    "run_ts": run_ts,
                    "startup_id": startup_id,
                    "startup_name": startup_name,
                    "seed_url": seed_url,
                    "url": url,
                    "domain": dom,
                    "cse_facet": c["cse_facet"],
                    "cse_scope": c["cse_scope"],
                    "cse_query": c["cse_query"],
                    "cse_title": c["cse_title"],
                    "cse_snippet": c["cse_snippet"],
                    "robots_allowed": True,
                    "http_status": status,
                    "content_type": content_type,
                    "title": page_title,
                    "text": text,
                    "error": "",
                }
            )

            time.sleep(REQUEST_SLEEP_SEC)

        except requests.HTTPError as e:
            status = getattr(e.response, "status_code", None)
            records.append(
                {
                    "run_ts": run_ts,
                    "startup_id": startup_id,
                    "startup_name": startup_name,
                    "seed_url": seed_url,
                    "url": url,
                    "domain": dom,
                    "cse_facet": c["cse_facet"],
                    "cse_scope": c["cse_scope"],
                    "cse_query": c["cse_query"],
                    "cse_title": c["cse_title"],
                    "cse_snippet": c["cse_snippet"],
                    "robots_allowed": True,
                    "http_status": status,
                    "content_type": None,
                    "title": None,
                    "text": "",
                    "error": f"HTTPError: {str(e)}",
                }
            )

        except Exception as e:
            records.append(
                {
                    "run_ts": run_ts,
                    "startup_id": startup_id,
                    "startup_name": startup_name,
                    "seed_url": seed_url,
                    "url": url,
                    "domain": dom,
                    "cse_facet": c["cse_facet"],
                    "cse_scope": c["cse_scope"],
                    "cse_query": c["cse_query"],
                    "cse_title": c["cse_title"],
                    "cse_snippet": c["cse_snippet"],
                    "robots_allowed": True,
                    "http_status": None,
                    "content_type": None,
                    "title": None,
                    "text": "",
                    "error": f"Error: {str(e)}",
                }
            )

df_pages = pd.DataFrame(records)

print(
    f"Collected {len(df_pages)} page records across "
    f"{df_pages['startup_id'].nunique()} startups."
)
display(df_pages.head(10))

# Expose for downstream cells
globals()["df_pages"] = df_pages


100%|█████████████████████████████████████████████| 1/1 [00:42<00:00, 42.78s/it]

Collected 12 page records across 1 startups.





Unnamed: 0,run_ts,startup_id,startup_name,seed_url,url,domain,cse_facet,cse_scope,cse_query,cse_title,cse_snippet,robots_allowed,http_status,content_type,title,text,error
0,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/,eurekarobotics.com,seed,seed,,,,True,200,text/html,Precision 3D Vision Systems | Eureka Robotics,Precision 3D Vision Systems | Eureka Robotics ...,
1,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/applications,eurekarobotics.com,product,official_site,Eureka Robotics product OR service OR solution...,Applications | Eureka Robotics,Discover robotic vision applications across au...,True,200,text/html,Applications | Eureka Robotics,Applications | Eureka Robotics English æ¥æ¬è...,
2,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/resources/announcement,eurekarobotics.com,founders,official_site,Eureka Robotics founder OR co-founder site:eur...,Press Release: Eureka Robotics Refreshes Brand...,"Oct 7, 2025 ... ... robotic solutions in our c...",True,200,text/html,Press Release: Eureka Robotics Refreshes Brand...,Press Release: Eureka Robotics Refreshes Brand...,
3,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/about-us,eurekarobotics.com,team,official_site,Eureka Robotics team OR leadership OR manageme...,About | Eureka Robotics,We're building the robotic nervous system of t...,True,200,text/html,About | Eureka Robotics,About | Eureka Robotics English æ¥æ¬èª +1 (...,
4,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/eureka-controller,eurekarobotics.com,market,official_site,Eureka Robotics market OR industry OR use case...,Eureka Controller | Vision-Guided Robotics,The Eureka Controller is the brain behind our ...,True,200,text/html,Eureka Controller | Vision-Guided Robotics,Eureka Controller | Vision-Guided Robotics Eng...,
5,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/resources/press-rel...,eurekarobotics.com,press,official_site,Eureka Robotics press OR news OR announcement ...,Press Release: Eureka Robotics to Exhibit at t...,"Marking its third appearance at iREX, Eureka R...",True,200,text/html,Press Release: Eureka Robotics to Exhibit at t...,Press Release: Eureka Robotics to Exhibit at t...,
6,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://www.eurekarobotics.com/resources/eurek...,www.eurekarobotics.com,business,official_site,Eureka Robotics business overview OR company o...,"Eureka Robotics Opens U.S. Office in Atlanta, ...","[Atlanta, GA – Jun 3, 2025] Eureka Robotics, a...",True,200,text/html,"Eureka Robotics Opens U.S. Office in Atlanta, ...","Eureka Robotics Opens U.S. Office in Atlanta, ...",
7,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://www.eurekarobotics.com/resources/eurek...,www.eurekarobotics.com,product,official_site,Eureka Robotics product OR service OR solution...,Eureka Robotics Raises USD 10.5 Million Series A,"Dec 12, 2024 ... ... service of deep tech entr...",True,200,text/html,Eureka Robotics Raises USD 10.5 Million Series A,Eureka Robotics Raises USD 10.5 Million Series...,
8,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://www.eurekarobotics.com/resources,www.eurekarobotics.com,press,official_site,Eureka Robotics press OR news OR announcement ...,Resources | Eureka Robotics,Press Release: Eureka Robotics to Exhibit at t...,True,200,text/html,Resources | Eureka Robotics,Resources | Eureka Robotics English æ¥æ¬èª ...,
9,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://b.capital/why-we-invested/why-we-inves...,b.capital,business,open_web,Eureka Robotics business overview OR company o...,Why We Invested: Eureka Robotics - B Capital,"Jan 9, 2025 ... B Capital is pleased to partne...",True,200,text/html; charset=utf-8,Why We Invested: Eureka Robotics - B Capital,Why We Invested: Eureka Robotics - B Capital A...,


In [14]:
# ============================================================
# Cell 4 : Clean and Normalize Raw Textual Data
# ============================================================
#
# What this cell does
# - Filters page records to keep only scrapeable / usable text
# - Cleans boilerplate-ish noise (best-effort)
# - Normalizes whitespace + de-duplicates near-identical pages
# - Produces df_pages_clean for downstream profile reconstruction

import hashlib


# ------------------------------------------------------------
# Preconditions
# ------------------------------------------------------------
if "df_pages" not in globals():
    raise ValueError("df_pages is not defined. Please run Cell 3 first.")


# ------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------
def _stable_text_hash(text: str) -> str:
    """
    Stable hash for de-duplication.
    """
    t = (text or "").strip().encode("utf-8", errors="ignore")
    return hashlib.sha256(t).hexdigest()


def clean_text(text: str) -> str:
    """
    Best-effort text cleaning for web pages.
    Keeps it conservative: normalize whitespace, remove obvious nav/footer fragments,
    and strip common cookie/privacy boilerplate.
    """
    if not text:
        return ""

    t = text

    # Normalize whitespace
    t = re.sub(r"\s+", " ", t).strip()

    # Remove common boilerplate phrases (best-effort; safe to keep small)
    boilerplate_patterns = [
        r"\baccept cookies\b.*?$",
        r"\bcookie policy\b.*?$",
        r"\bprivacy policy\b.*?$",
        r"\bterms of service\b.*?$",
        r"\ball rights reserved\b.*?$",
        r"\bsubscribe\b.*?$",
        r"\bnewsletter\b.*?$",
    ]
    for pat in boilerplate_patterns:
        t = re.sub(pat, "", t, flags=re.IGNORECASE)

    # Remove excessive repeated separators
    t = re.sub(r"( \| ){2,}", " | ", t).strip()

    # Final whitespace normalize
    t = re.sub(r"\s+", " ", t).strip()

    return t


def is_valid_page_text(text: str, min_chars: int = 200) -> bool:
    """
    Basic quality filter.
    """
    if not text:
        return False
    return len(text.strip()) >= min_chars


# ------------------------------------------------------------
# Cleaning pipeline
# ------------------------------------------------------------
df_pages_work = df_pages.copy()

# Keep only pages that were allowed and fetched successfully
df_pages_work = df_pages_work[
    (df_pages_work["robots_allowed"] == True) &
    (df_pages_work["http_status"] == 200) &
    (df_pages_work["error"].fillna("") == "")
].copy()

# Clean text
df_pages_work["text_clean"] = df_pages_work["text"].fillna("").apply(clean_text)

# Filter very short pages
df_pages_work["is_valid_text"] = df_pages_work["text_clean"].apply(is_valid_page_text)
df_pages_work = df_pages_work[df_pages_work["is_valid_text"]].copy()

# Create text hash for de-duplication
df_pages_work["text_hash"] = df_pages_work["text_clean"].apply(_stable_text_hash)

# Dedupe identical texts per startup
df_pages_work = (
    df_pages_work
    .drop_duplicates(subset=["startup_id", "text_hash"])
    .reset_index(drop=True)
)

# Light ranking (optional): prioritize official_site and seed over open_web
scope_rank = {"seed": 0, "official_site": 1, "open_web": 2}
df_pages_work["scope_rank"] = df_pages_work["cse_scope"].map(scope_rank).fillna(9).astype(int)

df_pages_work = df_pages_work.sort_values(
    by=["startup_id", "scope_rank", "cse_facet", "url"],
    ascending=[True, True, True, True],
).reset_index(drop=True)

df_pages_clean = df_pages_work

print(
    f"After cleaning: {len(df_pages_clean)} usable page records across "
    f"{df_pages_clean['startup_id'].nunique()} startups."
)
display(df_pages_clean.head(10))

# Expose for downstream cells
globals()["df_pages_clean"] = df_pages_clean


After cleaning: 12 usable page records across 1 startups.


Unnamed: 0,run_ts,startup_id,startup_name,seed_url,url,domain,cse_facet,cse_scope,cse_query,cse_title,...,robots_allowed,http_status,content_type,title,text,error,text_clean,is_valid_text,text_hash,scope_rank
0,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/,eurekarobotics.com,seed,seed,,,...,True,200,text/html,Precision 3D Vision Systems | Eureka Robotics,Precision 3D Vision Systems | Eureka Robotics ...,,Precision 3D Vision Systems | Eureka Robotics ...,True,fdb50a436327b4b335b87797ea629e3dbccb0fbcab84ce...,0
1,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://www.eurekarobotics.com/resources/eurek...,www.eurekarobotics.com,business,official_site,Eureka Robotics business overview OR company o...,"Eureka Robotics Opens U.S. Office in Atlanta, ...",...,True,200,text/html,"Eureka Robotics Opens U.S. Office in Atlanta, ...","Eureka Robotics Opens U.S. Office in Atlanta, ...",,"Eureka Robotics Opens U.S. Office in Atlanta, ...",True,9526dc0be03364a1a959e1af1d4d4cfbb6f3b116076281...,1
2,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/resources/announcement,eurekarobotics.com,founders,official_site,Eureka Robotics founder OR co-founder site:eur...,Press Release: Eureka Robotics Refreshes Brand...,...,True,200,text/html,Press Release: Eureka Robotics Refreshes Brand...,Press Release: Eureka Robotics Refreshes Brand...,,Press Release: Eureka Robotics Refreshes Brand...,True,a34f0ce986f1bb1d02a950c5aaeaed4b4f7650d34f54e4...,1
3,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/eureka-controller,eurekarobotics.com,market,official_site,Eureka Robotics market OR industry OR use case...,Eureka Controller | Vision-Guided Robotics,...,True,200,text/html,Eureka Controller | Vision-Guided Robotics,Eureka Controller | Vision-Guided Robotics Eng...,,Eureka Controller | Vision-Guided Robotics Eng...,True,2235e68f1bc606c35095ff4db76e2119882511c01b0e7c...,1
4,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/resources/press-rel...,eurekarobotics.com,press,official_site,Eureka Robotics press OR news OR announcement ...,Press Release: Eureka Robotics to Exhibit at t...,...,True,200,text/html,Press Release: Eureka Robotics to Exhibit at t...,Press Release: Eureka Robotics to Exhibit at t...,,Press Release: Eureka Robotics to Exhibit at t...,True,2392cb91208220386dc1a0a4852f4d0a962ae8d4813586...,1
5,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://www.eurekarobotics.com/resources,www.eurekarobotics.com,press,official_site,Eureka Robotics press OR news OR announcement ...,Resources | Eureka Robotics,...,True,200,text/html,Resources | Eureka Robotics,Resources | Eureka Robotics English æ¥æ¬èª ...,,Resources | Eureka Robotics English æ¥æ¬èª ...,True,f3fd33bbdfb749d7d7f57318579e653cb6d5113b40c02c...,1
6,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/applications,eurekarobotics.com,product,official_site,Eureka Robotics product OR service OR solution...,Applications | Eureka Robotics,...,True,200,text/html,Applications | Eureka Robotics,Applications | Eureka Robotics English æ¥æ¬è...,,Applications | Eureka Robotics English æ¥æ¬è...,True,1255246679809a83d58b7f0fb4101ea343fe3a6af39c8a...,1
7,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://www.eurekarobotics.com/resources/eurek...,www.eurekarobotics.com,product,official_site,Eureka Robotics product OR service OR solution...,Eureka Robotics Raises USD 10.5 Million Series A,...,True,200,text/html,Eureka Robotics Raises USD 10.5 Million Series A,Eureka Robotics Raises USD 10.5 Million Series...,,Eureka Robotics Raises USD 10.5 Million Series...,True,62e62884c09a0493fff084ff0013a8156b39d17dc53ed9...,1
8,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://eurekarobotics.com/about-us,eurekarobotics.com,team,official_site,Eureka Robotics team OR leadership OR manageme...,About | Eureka Robotics,...,True,200,text/html,About | Eureka Robotics,About | Eureka Robotics English æ¥æ¬èª +1 (...,,About | Eureka Robotics English æ¥æ¬èª +1 (...,True,f2b5c3e81dacfb8f4b3f9f1b914b23d54e331311066b02...,1
9,2026-01-07T01:52:55Z,eureka-robotics,Eureka Robotics,https://eurekarobotics.com/,https://b.capital/why-we-invested/why-we-inves...,b.capital,business,open_web,Eureka Robotics business overview OR company o...,Why We Invested: Eureka Robotics - B Capital,...,True,200,text/html; charset=utf-8,Why We Invested: Eureka Robotics - B Capital,Why We Invested: Eureka Robotics - B Capital A...,,Why We Invested: Eureka Robotics - B Capital A...,True,d1cc011863fff127b0810f3c6a2a53460918f776f2b331...,2


In [20]:
# ============================================================
# Cell 5 : LLM-based Profile Reconstruction (Summary, Attributes)
# (Debug-heavy version: detailed logs + optional artifact dumps)
# ============================================================

import json
import time
import traceback
from datetime import datetime

from openai import OpenAI


# ------------------------------------------------------------
# Preconditions
# ------------------------------------------------------------
if "df_pages_clean" not in globals():
    raise ValueError("df_pages_clean is not defined. Please run Cell 4 first.")

if OPENAI_API_KEY is None:
    raise EnvironmentError("OPENAI_API_KEY is required for Cell 5.")


# ------------------------------------------------------------
# Debug knobs
# ------------------------------------------------------------
DEBUG = True
SAVE_DEBUG_ARTIFACTS = True
DEBUG_DIR = "artifacts/debug_cell5"
os.makedirs(DEBUG_DIR, exist_ok=True)

def log(msg: str):
    if DEBUG:
        print(msg)

def dump_text(path: str, text: str):
    if not SAVE_DEBUG_ARTIFACTS:
        return
    with open(path, "w", encoding="utf-8") as f:
        f.write(text or "")

def dump_json(path: str, obj):
    if not SAVE_DEBUG_ARTIFACTS:
        return
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)


# ------------------------------------------------------------
# Config knobs
# ------------------------------------------------------------
MAX_PAGES_PER_STARTUP = 6
MAX_CHARS_PER_PAGE = 1600
MAX_TOTAL_CHARS = 6500

LLM_MODEL_NAME = LLM_MODEL_NAME
LLM_TEMPERATURE = 0.0
MAX_TOKENS = int(LLM_MAX_TOKENS)

API_SLEEP_SEC = 0.4


# ------------------------------------------------------------
# OpenAI client (SDK v2.x)
# ------------------------------------------------------------
client = OpenAI(api_key=OPENAI_API_KEY)


# ------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------
def _clip(s: str, n: int) -> str:
    s = (s or "").strip()
    return s if len(s) <= n else s[:n] + " ...[TRUNCATED]"


def build_startup_context(df_pages_for_one: pd.DataFrame) -> tuple[str, list[dict]]:
    chosen = df_pages_for_one.head(MAX_PAGES_PER_STARTUP).copy()

    chunks = []
    sources = []
    total = 0

    for _, r in chosen.iterrows():
        text_snip = _clip(r.get("text_clean", ""), MAX_CHARS_PER_PAGE)

        block = (
            f"[SOURCE {len(sources)+1}]\n"
            f"URL: {r.get('url','')}\n"
            f"TITLE: {r.get('title') or ''}\n"
            f"FACET: {r.get('cse_facet') or ''} | SCOPE: {r.get('cse_scope') or ''}\n"
            f"TEXT: {text_snip}\n"
        )

        if total + len(block) > MAX_TOTAL_CHARS:
            break

        chunks.append(block)
        sources.append(
            {
                "source_idx": len(sources) + 1,
                "url": r.get("url", ""),
                "title": r.get("title") or "",
                "cse_facet": r.get("cse_facet") or "",
                "cse_scope": r.get("cse_scope") or "",
                "text_len": len(r.get("text_clean", "") or ""),
            }
        )
        total += len(block)

    return "\n".join(chunks), sources


def build_profile_prompt(startup_name: str, context_text: str) -> str:
    return f"""
Reconstruct a startup profile ONLY from the sources below.

Startup: {startup_name}

Hard rules:
- Use ONLY the provided sources. Do not invent facts.
- Evidence URLs MUST be copied from the "URL:" lines in the sources.
- evidence_urls must always be an array (use [] if none).
- If unsupported, set value = "unknown" and confidence <= 0.3.

Return your result by calling the provided tool.

Sources:
{context_text}
""".strip()


# ------------------------------------------------------------
# Tool schema (Function Calling)
# ------------------------------------------------------------
PROFILE_FIELDS = [
    "company_summary",
    "business_description",
    "products_services",
    "founders",
    "team_members",
    "competitive_advantage",
    "target_market",
    "target_customers",
    "funding_raised",
    "funding_notes",
]

def _field_schema(value_type: str):
    if value_type == "array":
        value_schema = {"type": "array", "items": {"type": "string"}}
    else:
        value_schema = {"type": "string"}

    return {
        "type": "object",
        "properties": {
            "value": value_schema,
            "evidence_urls": {"type": "array", "items": {"type": "string"}},
            "confidence": {"type": "number"},
        },
        "required": ["value", "evidence_urls", "confidence"],
        "additionalProperties": False,
    }

tools = [
    {
        "type": "function",
        "function": {
            "name": "submit_startup_profile",
            "description": "Submit a structured startup profile reconstructed only from the provided sources.",
            "parameters": {
                "type": "object",
                "properties": {
                    "startup_name": {"type": "string"},
                    "extracted_at_utc": {"type": "string"},
                    "profile": {
                        "type": "object",
                        "properties": {
                            "company_summary": _field_schema("string"),
                            "business_description": _field_schema("string"),
                            "products_services": _field_schema("array"),
                            "founders": _field_schema("array"),
                            "team_members": _field_schema("array"),
                            "competitive_advantage": _field_schema("string"),
                            "target_market": _field_schema("string"),
                            "target_customers": _field_schema("string"),
                            "funding_raised": _field_schema("string"),
                            "funding_notes": _field_schema("string"),
                        },
                        "required": PROFILE_FIELDS,
                        "additionalProperties": False,
                    },
                },
                "required": ["startup_name", "extracted_at_utc", "profile"],
                "additionalProperties": False,
            },
        },
    }
]


def call_llm_profile_via_tool(prompt: str) -> tuple[dict, dict]:
    """
    Returns (parsed_args_dict, raw_debug_payload_dict)
    """
    resp = client.chat.completions.create(
        model=LLM_MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are a careful information extraction assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=LLM_TEMPERATURE,
        max_tokens=MAX_TOKENS,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "submit_startup_profile"}},
    )

    msg = resp.choices[0].message

    debug_payload = {
        "model": LLM_MODEL_NAME,
        "finish_reason": resp.choices[0].finish_reason,
        "has_tool_calls": bool(getattr(msg, "tool_calls", None)),
        "content_preview": (msg.content[:400] if isinstance(msg.content, str) else str(msg.content)[:400]),
    }

    if not msg.tool_calls:
        raise RuntimeError(f"No tool_call returned. Message content preview: {debug_payload['content_preview']}")

    tc = msg.tool_calls[0]
    args_text = tc.function.arguments

    debug_payload.update(
        {
            "tool_name": tc.function.name,
            "arguments_len": len(args_text or ""),
            "arguments_head_800": (args_text[:800] if args_text else ""),
            "arguments_tail_800": (args_text[-800:] if args_text and len(args_text) > 800 else args_text or ""),
        }
    )

    # Parse tool arguments (JSON)
    try:
        parsed = json.loads(args_text)
        return parsed, debug_payload
    except json.JSONDecodeError as e:
        pos = e.pos
        snippet = args_text[max(0, pos - 220): pos + 220]
        debug_payload.update(
            {
                "json_error": str(e),
                "json_error_pos": pos,
                "json_error_snippet": snippet,
            }
        )
        raise


# ------------------------------------------------------------
# Main execution
# ------------------------------------------------------------
profiles = []
source_rows = []
run_ts = datetime.utcnow().isoformat(timespec="seconds") + "Z"

n_startups = df_pages_clean["startup_id"].nunique()
log(f"[Cell5] Starting profile reconstruction for {n_startups} startup(s)")
log(f"[Cell5] Model={LLM_MODEL_NAME} | temperature={LLM_TEMPERATURE} | max_tokens={MAX_TOKENS}")
log(f"[Cell5] Context caps: pages={MAX_PAGES_PER_STARTUP}, per_page_chars={MAX_CHARS_PER_PAGE}, total_chars={MAX_TOTAL_CHARS}")
log(f"[Cell5] Debug artifacts: {SAVE_DEBUG_ARTIFACTS} -> {DEBUG_DIR}")

for startup_id, g in tqdm(df_pages_clean.groupby("startup_id"), total=n_startups):
    startup_name = g["startup_name"].iloc[0]

    log("\n" + "=" * 80)
    log(f"[Startup] {startup_name} ({startup_id})")

    # Build context
    context_text, sources = build_startup_context(g)
    log(f"[Context] sources_selected={len(sources)} | context_chars={len(context_text)}")
    if len(sources) > 0:
        log("[Context] Source URLs:")
        for s in sources:
            log(f"  - #{s['source_idx']} {s['url']} (facet={s['cse_facet']}, scope={s['cse_scope']}, text_len={s['text_len']})")

    if SAVE_DEBUG_ARTIFACTS:
        dump_text(f"{DEBUG_DIR}/{startup_id}_context.txt", context_text)
        dump_json(f"{DEBUG_DIR}/{startup_id}_sources.json", sources)

    if not context_text.strip():
        err = "No usable context after cleaning."
        log(f"[Skip] {err}")
        profiles.append(
            {
                "startup_id": startup_id,
                "startup_name": startup_name,
                "extracted_at_utc": run_ts,
                "error": err,
                "raw_profile_json": None,
            }
        )
        continue

    prompt = build_profile_prompt(startup_name, context_text)
    log(f"[Prompt] prompt_chars={len(prompt)}")
    if SAVE_DEBUG_ARTIFACTS:
        dump_text(f"{DEBUG_DIR}/{startup_id}_prompt.txt", prompt)

    try:
        log("[LLM] Calling tool...")

        result, dbg = call_llm_profile_via_tool(prompt)

        log("[LLM] Tool call returned successfully.")
        log(f"[LLM] finish_reason={dbg.get('finish_reason')} | arguments_len={dbg.get('arguments_len')}")
        log("[LLM] arguments_head_800:")
        log(dbg.get("arguments_head_800", ""))

        if SAVE_DEBUG_ARTIFACTS:
            dump_json(f"{DEBUG_DIR}/{startup_id}_llm_debug.json", dbg)
            dump_json(f"{DEBUG_DIR}/{startup_id}_profile.json", result)

        # Ensure timestamps
        if isinstance(result, dict):
            result.setdefault("startup_name", startup_name)
            result.setdefault("extracted_at_utc", run_ts)

        profiles.append(
            {
                "startup_id": startup_id,
                "startup_name": startup_name,
                "extracted_at_utc": run_ts,
                "error": "",
                "raw_profile_json": result,
            }
        )

        for s in sources:
            source_rows.append(
                {"startup_id": startup_id, "startup_name": startup_name, **s, "run_ts": run_ts}
            )

        time.sleep(API_SLEEP_SEC)

    except Exception as e:
        log("[ERROR] LLM call/parsing failed.")
        log(f"[ERROR] {type(e).__name__}: {str(e)}")
        log("[TRACEBACK]")
        log(traceback.format_exc())

        # If we had a JSON parsing issue, try to write additional context (best-effort)
        # Note: args_text may not be in scope if tool_call didn't return.
        if SAVE_DEBUG_ARTIFACTS:
            dump_text(f"{DEBUG_DIR}/{startup_id}_error.txt", traceback.format_exc())

        profiles.append(
            {
                "startup_id": startup_id,
                "startup_name": startup_name,
                "extracted_at_utc": run_ts,
                "error": f"LLM call/parsing failed: {str(e)}",
                "raw_profile_json": None,
            }
        )

df_profiles_raw = pd.DataFrame(profiles)
df_profile_sources = pd.DataFrame(source_rows)

log("\n" + "=" * 80)
log("[Cell5] Done.")
print(f"Generated profiles for {df_profiles_raw['startup_id'].nunique()} startups.")
display(df_profiles_raw[["startup_id", "startup_name", "extracted_at_utc", "error"]].head(10))


# ------------------------------------------------------------
# Optional: Flatten selected fields into an analysis-friendly table
# ------------------------------------------------------------
def _safe_get(d: dict, path: list, default=None):
    cur = d
    for p in path:
        if not isinstance(cur, dict) or p not in cur:
            return default
        cur = cur[p]
    return cur

flat_rows = []
for _, r in df_profiles_raw.iterrows():
    rp = r.get("raw_profile_json")
    if not isinstance(rp, dict):
        flat_rows.append({"startup_id": r["startup_id"], "startup_name": r["startup_name"], "error": r["error"]})
        continue

    prof = rp.get("profile", {})
    flat_rows.append(
        {
            "startup_id": r["startup_id"],
            "startup_name": r["startup_name"],
            "company_summary": _safe_get(prof, ["company_summary", "value"], ""),
            "business_description": _safe_get(prof, ["business_description", "value"], ""),
            "products_services": _safe_get(prof, ["products_services", "value"], []),
            "founders": _safe_get(prof, ["founders", "value"], []),
            "team_members": _safe_get(prof, ["team_members", "value"], []),
            "competitive_advantage": _safe_get(prof, ["competitive_advantage", "value"], ""),
            "target_market": _safe_get(prof, ["target_market", "value"], ""),
            "target_customers": _safe_get(prof, ["target_customers", "value"], ""),
            "funding_raised": _safe_get(prof, ["funding_raised", "value"], ""),
            "funding_notes": _safe_get(prof, ["funding_notes", "value"], ""),
            "error": r["error"],
        }
    )

df_profiles = pd.DataFrame(flat_rows)
print("Flat profile table preview:")
display(df_profiles.head(10))

globals()["df_profiles_raw"] = df_profiles_raw
globals()["df_profile_sources"] = df_profile_sources
globals()["df_profiles"] = df_profiles


[Cell5] Starting profile reconstruction for 1 startup(s)
[Cell5] Model=gpt-4.1-mini | temperature=0.0 | max_tokens=800
[Cell5] Context caps: pages=6, per_page_chars=1600, total_chars=6500
[Cell5] Debug artifacts: True -> artifacts/debug_cell5


  0%|                                                     | 0/1 [00:00<?, ?it/s]


[Startup] Eureka Robotics (eureka-robotics)
[Context] sources_selected=3 | context_chars=5368
[Context] Source URLs:
  - #1 https://eurekarobotics.com/ (facet=seed, scope=seed, text_len=3063)
  - #2 https://www.eurekarobotics.com/resources/eureka-robotics-opens-u-s-office-in-atlanta-georgia (facet=business, scope=official_site, text_len=3218)
  - #3 https://eurekarobotics.com/resources/announcement (facet=founders, scope=official_site, text_len=2809)
[Prompt] prompt_chars=5769
[LLM] Calling tool...
[LLM] Tool call returned successfully.
[LLM] finish_reason=stop | arguments_len=3604
[LLM] arguments_head_800:
{"startup_name":"Eureka Robotics","extracted_at_utc":"2024-06-01T00:00:00Z","profile":{"company_summary":{"value":"Eureka Robotics is a technology company specializing in AI-driven 3D vision systems for robotics, founded in 2018 as a spin-off from Nanyang Technological University in Singapore. The company develops advanced AI vision systems that enable automation of complex process

100%|█████████████████████████████████████████████| 1/1 [00:15<00:00, 15.44s/it]


[Cell5] Done.
Generated profiles for 1 startups.





Unnamed: 0,startup_id,startup_name,extracted_at_utc,error
0,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,


Flat profile table preview:


Unnamed: 0,startup_id,startup_name,company_summary,business_description,products_services,founders,team_members,competitive_advantage,target_market,target_customers,funding_raised,funding_notes,error
0,eureka-robotics,Eureka Robotics,Eureka Robotics is a technology company specia...,Eureka Robotics develops and provides advanced...,"[Eureka AI Vision System, Eureka 3D Camera, Eu...",[Pham Quang-Cuong],[Pham Quang Cuong],Eureka Robotics' competitive advantage lies in...,High-precision manufacturing industries includ...,"Manufacturers and companies in logistics, aero...",Series A fundraising completed in December 2024.,"Following the Series A fundraising, Eureka Rob...",


In [21]:
# ============================================================
# Cell 6 : Assemble Structured Startup Profiles
# ============================================================
#
# What this cell does
# - Validates the LLM output schema (best-effort)
# - Extracts field-level values / evidence URLs / confidences into tidy tables
# - Produces:
#   1) df_profiles_final      : one row per startup (flat, analysis-ready)
#   2) df_profile_fields      : one row per (startup, field) with evidence+confidence
#   3) df_profile_evidence    : one row per (startup, field, evidence_url)
#
# Notes
# - We do not "trust" the LLM output blindly. We keep:
#   - field confidence
#   - evidence URLs
#   - raw JSON (from Cell 5)
# - Unknown values are preserved as "unknown" with low confidence.

import math


# ------------------------------------------------------------
# Preconditions
# ------------------------------------------------------------
if "df_profiles_raw" not in globals():
    raise ValueError("df_profiles_raw is not defined. Please run Cell 5 first.")

# If you used the flat table in Cell 5, it may already exist; not required here.
# if "df_profiles" not in globals(): ...


# ------------------------------------------------------------
# Config knobs
# ------------------------------------------------------------
EXPECTED_FIELDS = [
    "company_summary",
    "business_description",
    "products_services",
    "founders",
    "team_members",
    "competitive_advantage",
    "target_market",
    "target_customers",
    "funding_raised",
    "funding_notes",
]

UNKNOWN_TOKENS = {"unknown", "n/a", "na", "", None}

# Optional: if confidence is missing, use this default
DEFAULT_CONFIDENCE = 0.2


# ------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------
def _safe_dict(x):
    return x if isinstance(x, dict) else {}

def _safe_list(x):
    return x if isinstance(x, list) else []

def _to_str(x):
    if x is None:
        return ""
    if isinstance(x, (str, int, float, bool)):
        return str(x)
    return json.dumps(x, ensure_ascii=False)

def _normalize_conf(x):
    try:
        v = float(x)
        if math.isnan(v):
            return DEFAULT_CONFIDENCE
        return max(0.0, min(1.0, v))
    except Exception:
        return DEFAULT_CONFIDENCE

def _normalize_value(val):
    """
    Normalize values so the final table is consistent.
    Lists stay lists; scalars become strings.
    """
    if isinstance(val, list):
        # strip items, drop empty
        out = []
        for it in val:
            s = str(it).strip()
            if s and s.lower() not in UNKNOWN_TOKENS:
                out.append(s)
        return out
    if val is None:
        return "unknown"
    s = str(val).strip()
    if s.lower() in UNKNOWN_TOKENS:
        return "unknown"
    return s

def _normalize_evidence(urls):
    """
    Ensure evidence_urls is a list[str], deduped, order-preserving.
    """
    if urls is None:
        return []
    if isinstance(urls, str):
        urls = [urls]
    if not isinstance(urls, list):
        return []
    out, seen = [], set()
    for u in urls:
        u = str(u).strip()
        if not u:
            continue
        if u in seen:
            continue
        seen.add(u)
        out.append(u)
    return out

def _extract_profile_object(raw_profile_json):
    """
    Returns the 'profile' dict inside raw JSON if present.
    """
    d = _safe_dict(raw_profile_json)
    return _safe_dict(d.get("profile"))


# ------------------------------------------------------------
# Build field-level table
# ------------------------------------------------------------
field_rows = []

for _, r in df_profiles_raw.iterrows():
    startup_id = r["startup_id"]
    startup_name = r["startup_name"]
    extracted_at_utc = r.get("extracted_at_utc", "")
    err = r.get("error", "")
    raw = r.get("raw_profile_json", None)

    prof = _extract_profile_object(raw)

    # If LLM failed or profile missing, keep an error record per field
    if err or not prof:
        for f in EXPECTED_FIELDS:
            field_rows.append(
                {
                    "startup_id": startup_id,
                    "startup_name": startup_name,
                    "extracted_at_utc": extracted_at_utc,
                    "field": f,
                    "value": "unknown",
                    "confidence": 0.0,
                    "evidence_urls": [],
                    "error": err or "Missing profile object.",
                }
            )
        continue

    for f in EXPECTED_FIELDS:
        obj = _safe_dict(prof.get(f))
        val = _normalize_value(obj.get("value", "unknown"))
        conf = _normalize_conf(obj.get("confidence", DEFAULT_CONFIDENCE))
        ev = _normalize_evidence(obj.get("evidence_urls", []))

        # If value is unknown, force conservative confidence
        if (isinstance(val, str) and val.lower() == "unknown") or (isinstance(val, list) and len(val) == 0):
            conf = min(conf, 0.3)

        field_rows.append(
            {
                "startup_id": startup_id,
                "startup_name": startup_name,
                "extracted_at_utc": extracted_at_utc,
                "field": f,
                "value": val,
                "confidence": conf,
                "evidence_urls": ev,
                "error": "",
            }
        )

df_profile_fields = pd.DataFrame(field_rows)

print("Field-level profile table preview:")
display(df_profile_fields.head(12))


# ------------------------------------------------------------
# Evidence table: one row per (startup, field, evidence_url)
# ------------------------------------------------------------
evidence_rows = []
for _, r in df_profile_fields.iterrows():
    for u in _safe_list(r["evidence_urls"]):
        evidence_rows.append(
            {
                "startup_id": r["startup_id"],
                "startup_name": r["startup_name"],
                "extracted_at_utc": r["extracted_at_utc"],
                "field": r["field"],
                "evidence_url": u,
                "confidence": r["confidence"],
            }
        )

df_profile_evidence = pd.DataFrame(evidence_rows)

print("Evidence table preview:")
display(df_profile_evidence.head(12))


# ------------------------------------------------------------
# Final flat table: one row per startup (analysis-ready)
# ------------------------------------------------------------
def _get_field(df_fields_one: pd.DataFrame, field_name: str):
    row = df_fields_one[df_fields_one["field"] == field_name]
    if row.empty:
        return "unknown", 0.0, []
    row = row.iloc[0]
    return row["value"], row["confidence"], row["evidence_urls"]

final_rows = []

for startup_id, g in df_profile_fields.groupby("startup_id"):
    startup_name = g["startup_name"].iloc[0]
    extracted_at_utc = g["extracted_at_utc"].iloc[0]
    any_error = ""
    if (g["error"].fillna("") != "").any():
        any_error = "; ".join(sorted(set(g["error"].fillna("").tolist())))

    row_out = {
        "startup_id": startup_id,
        "startup_name": startup_name,
        "extracted_at_utc": extracted_at_utc,
        "error": any_error,
    }

    # Add each field + confidence (and optionally evidence count)
    for f in EXPECTED_FIELDS:
        v, c, ev = _get_field(g, f)
        row_out[f] = v
        row_out[f + "_confidence"] = c
        row_out[f + "_evidence_count"] = len(ev)

    # Optional: overall confidence (simple average of key fields)
    key_fields = ["business_description", "products_services", "founders", "target_market", "funding_raised"]
    confs = [row_out[k + "_confidence"] for k in key_fields if isinstance(row_out.get(k + "_confidence"), (int, float))]
    row_out["overall_confidence"] = float(sum(confs) / len(confs)) if confs else 0.0

    final_rows.append(row_out)

df_profiles_final = pd.DataFrame(final_rows)

print("Final startup-level profile table preview:")
display(df_profiles_final.head(10))


# ------------------------------------------------------------
# Expose for downstream cells
# ------------------------------------------------------------
globals()["df_profile_fields"] = df_profile_fields
globals()["df_profile_evidence"] = df_profile_evidence
globals()["df_profiles_final"] = df_profiles_final


Field-level profile table preview:


Unnamed: 0,startup_id,startup_name,extracted_at_utc,field,value,confidence,evidence_urls,error
0,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,company_summary,Eureka Robotics is a technology company specia...,0.95,[https://www.eurekarobotics.com/resources/eure...,
1,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,business_description,Eureka Robotics develops and provides advanced...,0.95,"[https://eurekarobotics.com/, https://www.eure...",
2,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,products_services,"[Eureka AI Vision System, Eureka 3D Camera, Eu...",0.95,[https://eurekarobotics.com/],
3,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,founders,[Pham Quang-Cuong],0.9,[https://www.eurekarobotics.com/resources/eure...,
4,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,team_members,[Pham Quang Cuong],0.9,[https://eurekarobotics.com/resources/announce...,
5,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,competitive_advantage,Eureka Robotics' competitive advantage lies in...,0.95,[https://eurekarobotics.com/],
6,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,target_market,High-precision manufacturing industries includ...,0.9,[https://www.eurekarobotics.com/resources/eure...,
7,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,target_customers,"Manufacturers and companies in logistics, aero...",0.9,[https://www.eurekarobotics.com/resources/eure...,
8,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,funding_raised,Series A fundraising completed in December 2024.,0.9,[https://eurekarobotics.com/resources/announce...,
9,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,funding_notes,"Following the Series A fundraising, Eureka Rob...",0.9,[https://eurekarobotics.com/resources/announce...,


Evidence table preview:


Unnamed: 0,startup_id,startup_name,extracted_at_utc,field,evidence_url,confidence
0,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,company_summary,https://www.eurekarobotics.com/resources/eurek...,0.95
1,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,company_summary,https://eurekarobotics.com/resources/announcement,0.95
2,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,business_description,https://eurekarobotics.com/,0.95
3,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,business_description,https://www.eurekarobotics.com/resources/eurek...,0.95
4,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,products_services,https://eurekarobotics.com/,0.95
5,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,founders,https://www.eurekarobotics.com/resources/eurek...,0.9
6,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,founders,https://eurekarobotics.com/resources/announcement,0.9
7,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,team_members,https://eurekarobotics.com/resources/announcement,0.9
8,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,competitive_advantage,https://eurekarobotics.com/,0.95
9,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,target_market,https://www.eurekarobotics.com/resources/eurek...,0.9


Final startup-level profile table preview:


Unnamed: 0,startup_id,startup_name,extracted_at_utc,error,company_summary,company_summary_confidence,company_summary_evidence_count,business_description,business_description_confidence,business_description_evidence_count,...,target_customers,target_customers_confidence,target_customers_evidence_count,funding_raised,funding_raised_confidence,funding_raised_evidence_count,funding_notes,funding_notes_confidence,funding_notes_evidence_count,overall_confidence
0,eureka-robotics,Eureka Robotics,2026-01-07T03:21:32Z,,Eureka Robotics is a technology company specia...,0.95,2,Eureka Robotics develops and provides advanced...,0.95,2,...,"Manufacturers and companies in logistics, aero...",0.9,1,Series A fundraising completed in December 2024.,0.9,1,"Following the Series A fundraising, Eureka Rob...",0.9,1,0.92


In [23]:
# ============================================================
# Cell 7 : Export Artifacts for Downstream Analysis (with Report)
# (Narrative Report–oriented version)
# ============================================================

import json
from pathlib import Path
from datetime import datetime
from IPython.display import display, Markdown


# ------------------------------------------------------------
# Preconditions
# ------------------------------------------------------------
if "df_profiles_raw" not in globals():
    raise ValueError("df_profiles_raw is not defined. Please run Cell 5 first.")

HAS_CELL6 = all(name in globals() for name in ["df_profile_fields", "df_profile_evidence", "df_profiles_final"])


# ------------------------------------------------------------
# Output directory
# ------------------------------------------------------------
ARTIFACTS_DIR = Path("artifacts")
EXPORT_DIR = ARTIFACTS_DIR / "008_startup_profile_reconstruction"
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

run_ts = datetime.utcnow().isoformat(timespec="seconds") + "Z"
run_tag = run_ts.replace(":", "").replace("-", "")

print(f"Export directory: {EXPORT_DIR}")
print(f"Run timestamp: {run_ts}")


# ------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------
def as_list(x):
    return x if isinstance(x, list) else []

def as_dict(x):
    return x if isinstance(x, dict) else {}

def fmt_text(x):
    if isinstance(x, list):
        return ", ".join([str(v) for v in x])
    return str(x).strip()

def fmt_conf(c):
    if c is None:
        return ""
    try:
        return f"(confidence: {float(c):.2f})"
    except Exception:
        return ""

def write_json(path: Path, obj):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def write_text(path: Path, text: str):
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

def safe_csv(df, path: Path):
    df.to_csv(path, index=False)


# ------------------------------------------------------------
# Build Narrative Report (Markdown)
# ------------------------------------------------------------
lines = []

lines.append("# Startup Profile Reconstruction Report")
lines.append("")
lines.append(f"- Notebook: **008_Startup_Profile_Reconstruction**")
lines.append(f"- Run timestamp (UTC): **{run_ts}**")
lines.append(f"- Startups processed: **{df_profiles_raw['startup_id'].nunique()}**")
lines.append(f"- Successful profiles: **{(df_profiles_raw['error'].fillna('') == '').sum()}**")
lines.append(f"- Failed profiles: **{(df_profiles_raw['error'].fillna('') != '').sum()}**")
lines.append("")

# Corpus summary
lines.append("## Corpus Summary")
if "df_pages_clean" in globals():
    df_pages_clean = globals()["df_pages_clean"]
    lines.append(f"- Clean page records: **{len(df_pages_clean)}**")
    if "domain" in df_pages_clean.columns:
        lines.append(f"- Domains covered: **{df_pages_clean['domain'].nunique()}**")
else:
    lines.append("- Clean page records: *(df_pages_clean not available)*")
lines.append("")

# Profiles
lines.append("## Reconstructed Startup Profiles")
lines.append("")

for _, r in df_profiles_raw.iterrows():
    startup_id = r["startup_id"]
    startup_name = r["startup_name"]
    err = (r.get("error") or "").strip()
    raw = as_dict(r.get("raw_profile_json"))

    lines.append(f"### {startup_name} (`{startup_id}`)")
    lines.append("")

    if err:
        lines.append("**Status:** ❌ Failed")
        lines.append(f"**Error:** `{err}`")
        lines.append("")
        continue

    prof = as_dict(raw.get("profile"))

    def val(k): return as_dict(prof.get(k)).get("value", "unknown")
    def conf(k): return as_dict(prof.get(k)).get("confidence", None)
    def ev(k): return as_list(as_dict(prof.get(k)).get("evidence_urls", []))

    lines.append("**Company Overview**")
    lines.append(fmt_text(val("company_summary")))
    lines.append("")

    lines.append("**Business & Products**")
    lines.append(
        fmt_text(val("business_description"))
        + " Key products and services include "
        + fmt_text(val("products_services"))
        + ". "
        + fmt_conf(conf("business_description"))
    )
    lines.append("")

    lines.append("**Founding Team & Organization**")
    lines.append(
        "Founder(s): "
        + fmt_text(val("founders"))
        + ". Other notable team members include "
        + fmt_text(val("team_members"))
        + ". "
        + fmt_conf(conf("founders"))
    )
    lines.append("")

    lines.append("**Competitive Advantage**")
    lines.append(fmt_text(val("competitive_advantage")) + " " + fmt_conf(conf("competitive_advantage")))
    lines.append("")

    lines.append("**Market & Customers**")
    lines.append(
        "The company targets "
        + fmt_text(val("target_market"))
        + ", serving customers such as "
        + fmt_text(val("target_customers"))
        + ". "
        + fmt_conf(conf("target_market"))
    )
    lines.append("")

    lines.append("**Funding & Recent Developments**")
    lines.append(
        fmt_text(val("funding_raised"))
        + " "
        + fmt_text(val("funding_notes"))
        + " "
        + fmt_conf(conf("funding_raised"))
    )
    lines.append("")

    # Evidence URLs
    all_ev = sorted({u for k in prof for u in ev(k)})
    if all_ev:
        lines.append("**Evidence URLs**")
        for u in all_ev:
            lines.append(f"- {u}")
        lines.append("")

# Finalize report
report_md = "\n".join(lines)
report_path = EXPORT_DIR / f"report_{run_tag}.md"
write_text(report_path, report_md)

display(Markdown(report_md))


# ------------------------------------------------------------
# Export Data Artifacts
# ------------------------------------------------------------
if "df_pages_clean" in globals():
    safe_csv(globals()["df_pages_clean"], EXPORT_DIR / f"df_pages_clean_{run_tag}.csv")

# Raw profiles (JSONL + JSON)
jsonl_path = EXPORT_DIR / f"profiles_raw_{run_tag}.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for _, r in df_profiles_raw.iterrows():
        f.write(json.dumps({
            "startup_id": r["startup_id"],
            "startup_name": r["startup_name"],
            "extracted_at_utc": r.get("extracted_at_utc", ""),
            "error": r.get("error", ""),
            "raw_profile_json": r.get("raw_profile_json"),
        }, ensure_ascii=False) + "\n")

write_json(EXPORT_DIR / f"profiles_raw_{run_tag}.json", df_profiles_raw.to_dict(orient="records"))

if "df_profile_sources" in globals():
    safe_csv(globals()["df_profile_sources"], EXPORT_DIR / f"df_profile_sources_{run_tag}.csv")

if HAS_CELL6:
    safe_csv(globals()["df_profile_fields"], EXPORT_DIR / f"df_profile_fields_{run_tag}.csv")
    safe_csv(globals()["df_profile_evidence"], EXPORT_DIR / f"df_profile_evidence_{run_tag}.csv")
    safe_csv(globals()["df_profiles_final"], EXPORT_DIR / f"df_profiles_final_{run_tag}.csv")

print("\nExport completed.")
print(f"- Report: {report_path}")
print(f"- JSONL : {jsonl_path}")


Export directory: artifacts/008_startup_profile_reconstruction
Run timestamp: 2026-01-07T04:05:36Z


# Startup Profile Reconstruction Report

- Notebook: **008_Startup_Profile_Reconstruction**
- Run timestamp (UTC): **2026-01-07T04:05:36Z**
- Startups processed: **1**
- Successful profiles: **1**
- Failed profiles: **0**

## Corpus Summary
- Clean page records: **12**
- Domains covered: **5**

## Reconstructed Startup Profiles

### Eureka Robotics (`eureka-robotics`)

**Company Overview**
Eureka Robotics is a technology company specializing in AI-driven 3D vision systems for robotics, founded in 2018 as a spin-off from Nanyang Technological University in Singapore. The company develops advanced AI vision systems that enable automation of complex processes such as high-precision picking and inspection in industries including automotive, electronics, aerospace, and logistics. Eureka Robotics has a strong presence in Asia and has expanded to North America with a U.S. office in Atlanta, Georgia.

**Business & Products**
Eureka Robotics develops and provides advanced AI vision systems that integrate precision 3D cameras and controllers to enable high-precision pick and place robotic applications. Their technology features quick calibration, sub-millimeter picking accuracy, and seamless integration with major robot and PLC brands. The company serves industries such as logistics, aerospace, automotive, electronics, and high-precision manufacturing. Key products and services include Eureka AI Vision System, Eureka 3D Camera, Eureka Controller. (confidence: 0.95)

**Founding Team & Organization**
Founder(s): Pham Quang-Cuong. Other notable team members include Pham Quang Cuong. (confidence: 0.90)

**Competitive Advantage**
Eureka Robotics' competitive advantage lies in its AI-driven 3D vision systems that offer high-precision, sub-millimeter picking accuracy with minimal setup time (30 minutes calibration), no need for CAD or training, and seamless plug & play integration with major robot and PLC brands. This enables faster and easier deployment of precision pick and place robotic systems. (confidence: 0.95)

**Market & Customers**
The company targets High-precision manufacturing industries including logistics, aerospace, automotive, electronics, and other sectors requiring advanced robotic automation solutions., serving customers such as Manufacturers and companies in logistics, aerospace, automotive, electronics, and high-precision manufacturing sectors seeking advanced AI-driven robotic vision systems for automation.. (confidence: 0.90)

**Funding & Recent Developments**
Series A fundraising completed in December 2024. Following the Series A fundraising, Eureka Robotics strengthened its team and organizational foundation to support accelerated global growth and expanded its presence by opening a U.S. office in Atlanta in May 2025. (confidence: 0.90)

**Evidence URLs**
- https://eurekarobotics.com/
- https://eurekarobotics.com/resources/announcement
- https://www.eurekarobotics.com/resources/eureka-robotics-opens-u-s-office-in-atlanta-georgia



Export completed.
- Report: artifacts/008_startup_profile_reconstruction/report_20260107T040536Z.md
- JSONL : artifacts/008_startup_profile_reconstruction/profiles_raw_20260107T040536Z.jsonl
