In [12]:
# ============================================================
# 005_Startup Portfolio Intelligence Pipeline
# ============================================================
#
# Overview
# ----------------
# This notebook implements a lightweight intelligence pipeline for analyzing
# a portfolio of startups from an investor or research perspective.
#
# The goal is not to build a fully automated scoring system, but to construct
# a reproducible workflow that aggregates heterogeneous signals
# (company metadata, funding history, textual descriptions, external signals)
# and transforms them into structured, comparable representations.
#
# The pipeline is designed to support:
# - Portfolio-level overview and comparison
# - Company-level deep dives
# - Downstream qualitative analysis and hypothesis generation
#
#
# Inputs / Outputs
# ----------------
# Inputs:
# - A portfolio universe definition (e.g., scraped public portfolio pages or curated lists)
# - Optional external data sources (APIs, CSVs, or cached JSON files)
#
# Outputs:
# - Normalized tabular datasets (DataFrame / CSV) at the company and portfolio level
# - Enriched company profiles with derived attributes
# - Canonical "latest" artifacts and timestamped snapshots for reproducibility
# - Intermediate artifacts intended for downstream visualization or reporting
#
#
# Structure
# ----------------
# Cell 0 : Notebook purpose, scope, and design principles
# Cell 1 : Imports and global configuration
# Cell 2 : Load and validate startup portfolio input data (scraping + normalization)
# Cell 3 : Data enrichment (metadata, funding, textual signals)
# Cell 4 : Feature normalization and basic transformations
# Cell 5 : Portfolio-level aggregation and comparison
# Cell 6 : Company-level summaries and diagnostics
# Cell 7 : Export of structured outputs for downstream use
#
#
# Notes
# ----------------
# - This notebook prioritizes clarity and reproducibility over model complexity.
# - Each step is intentionally modular to allow partial re-runs and extensions.
# - External data enrichment is designed to be fail-soft to avoid breaking the pipeline.
# - Outputs are designed to be consumed by separate visualization or analysis notebooks.
# - API behavior and data availability may change; cache intermediate results when possible.
#


In [2]:
# ============================================================
# Cell 1 : Imports and Global Configuration
# ============================================================
#
# This cell defines all core imports and global settings used
# throughout the notebook.
#
# The configuration here is intentionally lightweight:
# - Prefer standard scientific Python libraries
# - Avoid hidden side effects or environment-specific assumptions
# - Centralize constants and runtime parameters for reproducibility
#
# Any changes to data sources, display behavior, or runtime options
# should be made here to ensure consistent downstream execution.
#

# --- Standard library imports ---
import os
import sys
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional

# --- Third-party imports ---
import pandas as pd
import numpy as np

# --- Display and formatting ---
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)

# --- Project paths ---
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT / "data"
OUTPUT_DIR = PROJECT_ROOT / "outputs"

DATA_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Global runtime parameters ---
RUN_TIMESTAMP = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")

# Default behavior flags (override as needed in downstream cells)
ENABLE_CACHING = True
VERBOSE_LOGGING = True

# --- Random seed (for reproducibility where applicable) ---
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# --- Environment sanity check ---
print(f"Project root : {PROJECT_ROOT}")
print(f"Data dir     : {DATA_DIR}")
print(f"Output dir   : {OUTPUT_DIR}")
print(f"Run timestamp: {RUN_TIMESTAMP}")


Project root : /Users/yuetoya/Desktop/researchOS100-private/notebooks
Data dir     : /Users/yuetoya/Desktop/researchOS100-private/notebooks/data
Output dir   : /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs
Run timestamp: 2026-01-05_02-42-59


In [3]:
# ============================================================
# Cell 2 : Load and Validate Startup Portfolio Input Data
# (Scrape a public portfolio page as the input source)
# ============================================================
#
# This cell builds the portfolio "input dataset" by scraping a public
# portfolio page (B Capital) and transforming it into a normalized table.
#
# Why here:
# - The scraped portfolio list becomes the canonical entry point for the pipeline
# - Downstream enrichment can assume a consistent schema
#
# Outputs:
# - portfolio_df (DataFrame)
# - A timestamped CSV snapshot under outputs/
#

import re
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime

# --- Target portfolio page (public) ---
PORTFOLIO_URL = "https://b.capital/portfolio/"

# --- Output snapshot path ---
SNAPSHOT_DIR = OUTPUT_DIR / "portfolio_snapshots"
SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True)
RUN_TIMESTAMP = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")
SNAPSHOT_PATH = SNAPSHOT_DIR / f"bcap_portfolio_{RUN_TIMESTAMP}.csv"

# --- Expected schema (minimum contract for downstream cells) ---
REQUIRED_COLUMNS = ["name", "industry", "region", "stage", "url"]


def extract_company_name_from_domain(url: str) -> str:
    """
    Fallback utility: infer a company-like name from a domain when HTML
    does not explicitly include the company name.
    Example: https://www.perplexity.ai -> Perplexity
    """
    try:
        parsed = urlparse(url)
        host = (parsed.netloc or "").lower()
        if host.startswith("www."):
            host = host[4:]

        parts = host.split(".")
        core = parts[-2] if len(parts) >= 2 else host  # perplexity.ai -> perplexity
        core = re.sub(r"[-_]+", " ", core).strip()
        return core.title() if core else "Unknown"
    except Exception:
        return "Unknown"


def scrape_bcap_portfolio(portfolio_url: str = PORTFOLIO_URL, timeout: int = 20) -> list[dict]:
    """
    Scrape B Capital portfolio list and return a list of dict records.

    Notes:
    - Relies on current page structure (e.g., 'ul.logos-grid').
    - If the site layout changes, update the selectors below.
    """
    res = requests.get(portfolio_url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
    res.raise_for_status()

    soup = BeautifulSoup(res.text, "html.parser")

    ul = soup.find("ul", class_="logos-grid")
    if not ul:
        raise RuntimeError("Could not find 'ul.logos-grid' in the HTML. The page structure may have changed.")

    companies: list[dict] = []

    for li in ul.find_all("li"):
        a = li.find("a", href=True)
        if not a:
            continue

        company_url = a["href"].strip()

        # Industry / Region / Stage (often inside <span class="h5"> with <br> separators)
        span = a.find("span", class_="h5")
        lines = []
        if span:
            lines = [
                line.strip()
                for line in span.get_text(separator="\n").splitlines()
                if line.strip()
            ]

        industry = lines[0] if len(lines) > 0 else ""
        region   = lines[1] if len(lines) > 1 else ""
        stage    = lines[2] if len(lines) > 2 else ""

        # Company name: <img alt="..."> preferred, fallback to inferred domain name
        img = a.find("img")
        if img and img.get("alt"):
            company_name = img["alt"].strip()
        else:
            company_name = extract_company_name_from_domain(company_url)

        companies.append(
            {
                "name": company_name,
                "industry": industry,
                "region": region,
                "stage": stage,
                "url": company_url,
                "source": "b.capital/portfolio",
                "scraped_at_utc": RUN_TIMESTAMP,
            }
        )

    return companies


# --- Run scraping ---
records = scrape_bcap_portfolio()
print(f"Number of companies scraped: {len(records)}")

portfolio_df = pd.DataFrame(records)

# --- Validation: columns and basic sanity checks ---
missing_cols = set(REQUIRED_COLUMNS) - set(portfolio_df.columns)
if missing_cols:
    raise ValueError(f"Missing required columns after scraping: {missing_cols}")

# Drop empty names / urls (defensive)
portfolio_df["name"] = portfolio_df["name"].astype(str).str.strip()
portfolio_df["url"] = portfolio_df["url"].astype(str).str.strip()
portfolio_df = portfolio_df[(portfolio_df["name"] != "") & (portfolio_df["url"] != "")]

# De-duplicate by (name, url)
before = len(portfolio_df)
portfolio_df = portfolio_df.drop_duplicates(subset=["name", "url"]).reset_index(drop=True)
after = len(portfolio_df)

if before != after:
    print(f"Removed {before - after} duplicate rows.")

# --- Save a snapshot for reproducibility ---
portfolio_df.to_csv(SNAPSHOT_PATH, index=False)
print(f"Saved snapshot: {SNAPSHOT_PATH}")

display(portfolio_df.head(10))


Number of companies scraped: 140
Saved snapshot: /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/portfolio_snapshots/bcap_portfolio_2026-01-05_02-45-24.csv


Unnamed: 0,name,industry,region,stage,url,source,scraped_at_utc
0,6Sense,Technology & AI,North America,Growth Stage,https://6sense.com,b.capital/portfolio,2026-01-05_02-45-24
1,Accacia,ClimateOpportunistic,India,Early Stage,https://accacia.ai/,b.capital/portfolio,2026-01-05_02-45-24
2,Aetion,Healthcare,North America,Growth Stage,https://aetion.com/,b.capital/portfolio,2026-01-05_02-45-24
3,Aimotive,Opportunistic,Europe,Growth Stage,https://aimotive.com/,b.capital/portfolio,2026-01-05_02-45-24
4,Ansiblehealth,Healthcare,North America,Early Stage,https://www.ansiblehealth.com,b.capital/portfolio,2026-01-05_02-45-24
5,Apptronik,Technology & AI,North America,Growth Stage,https://apptronik.com/,b.capital/portfolio,2026-01-05_02-45-24
6,Yaocheng,Healthcare,China,Early Stage,http://yaocheng.cn/,b.capital/portfolio,2026-01-05_02-45-24
7,Baichuan Ai,Technology & AI,China,Growth Stage,https://www.baichuan-ai.com,b.capital/portfolio,2026-01-05_02-45-24
8,Bhanzu,Opportunistic,India,Early Stage,https://bhanzu.com/,b.capital/portfolio,2026-01-05_02-45-24
9,Blackbuck,Opportunistic,India,Growth Stage,https://blackbuck.com/,b.capital/portfolio,2026-01-05_02-45-24


In [4]:
# ============================================================
# Cell 3 : Data Enrichment (metadata, funding, textual signals)
# ============================================================
#
# This cell enriches the base portfolio table with additional signals.
# The goal is to transform a simple portfolio list (name / url / tags)
# into a richer "company intelligence" table that can support:
# - Portfolio comparison
# - Company-level deep dives
# - Downstream scoring / clustering / visualization
#
# Enrichment philosophy:
# - Prefer lightweight, high-signal fields first (domain, short description)
# - Cache raw responses aggressively to keep the pipeline reproducible
# - Keep each enrichment module optional and fail-soft (do not break the run)
#
# Notes:
# - Funding data typically requires a paid provider (Crunchbase, PitchBook, Tracxn, etc.)
#   or a custom internal dataset. This notebook therefore includes a plug-in style stub
#   for funding enrichment rather than assuming a specific provider.
#

import time
import hashlib
from typing import Any, Tuple

# --- Enrichment output / cache directories ---
CACHE_DIR = DATA_DIR / "cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

ENRICH_DIR = OUTPUT_DIR / "enriched"
ENRICH_DIR.mkdir(parents=True, exist_ok=True)

ENRICH_SNAPSHOT_PATH = ENRICH_DIR / f"portfolio_enriched_{RUN_TIMESTAMP}.csv"

# --- Utility helpers ---
def safe_get(url: str, timeout: int = 15) -> Optional[str]:
    """
    Fetch a URL and return HTML text. Fail-soft: returns None on any error.
    """
    try:
        res = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        if res.status_code != 200:
            return None
        return res.text
    except Exception:
        return None

def cache_key(prefix: str, value: str) -> str:
    """
    Create a stable cache key for a given (prefix, value) pair.
    """
    h = hashlib.sha256(value.encode("utf-8")).hexdigest()[:16]
    return f"{prefix}_{h}.json"

def load_cache(path: Path) -> Optional[dict]:
    if not path.exists():
        return None
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        return None

def save_cache(path: Path, payload: dict) -> None:
    path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")


# ============================================================
# (A) Basic metadata enrichment from the company website
# ============================================================
#
# We attempt to extract:
# - domain / hostname
# - page title
# - meta description (often a useful 1-line company summary)
#

def parse_basic_metadata(company_url: str) -> dict:
    """
    Extract lightweight metadata (title, meta description) from the company website.
    Fail-soft: returns empty strings if parsing fails.
    """
    html = safe_get(company_url)
    if not html:
        return {"domain": "", "page_title": "", "meta_description": ""}

    soup = BeautifulSoup(html, "html.parser")

    # Domain
    parsed = urlparse(company_url)
    domain = (parsed.netloc or "").lower().replace("www.", "").strip()

    # Title
    title = soup.title.get_text(strip=True) if soup.title else ""

    # Meta description
    desc = ""
    tag = soup.find("meta", attrs={"name": re.compile("^description$", re.I)})
    if tag and tag.get("content"):
        desc = tag["content"].strip()

    return {"domain": domain, "page_title": title, "meta_description": desc}


# ============================================================
# (B) Funding enrichment (stub / plugin point)
# ============================================================
#
# Funding data is typically provider-specific. We keep a function stub that:
# - returns empty fields by default
# - can be replaced with a provider connector later
#
# Expected output fields:
# - total_funding_usd
# - last_round_date
# - last_round_type
#

def enrich_funding_stub(company_name: str, domain: str) -> dict:
    """
    Placeholder funding enrichment.
    Replace this with a real connector (Crunchbase / PitchBook / internal DB).
    """
    return {
        "total_funding_usd": None,
        "last_round_date": "",
        "last_round_type": "",
        "funding_source": "",
    }


# ============================================================
# (C) Lightweight textual signals
# ============================================================
#
# We can optionally derive a compact "text signal" field for downstream use.
# For now, we use:
# - meta_description as the primary short text
# - fallback to page_title when meta description is missing
#

def derive_text_signal(meta_description: str, page_title: str) -> str:
    text = (meta_description or "").strip()
    if not text:
        text = (page_title or "").strip()
    return text


# ============================================================
# Enrichment Execution
# ============================================================

enriched_rows = []

for _, row in portfolio_df.iterrows():
    name = str(row.get("name", "")).strip()
    url = str(row.get("url", "")).strip()

    if not name or not url:
        continue

    # --- Cache lookup ---
    ck = cache_key("site_meta", url)
    cache_path = CACHE_DIR / ck
    cached = load_cache(cache_path)

    if cached is None:
        meta = parse_basic_metadata(url)
        save_cache(cache_path, meta)
        if VERBOSE_LOGGING:
            print(f"[meta] fetched: {name}")
        # polite delay to reduce load on target sites
        time.sleep(0.7)
    else:
        meta = cached
        if VERBOSE_LOGGING:
            print(f"[meta] cache hit: {name}")

    domain = meta.get("domain", "")
    page_title = meta.get("page_title", "")
    meta_description = meta.get("meta_description", "")

    # --- Funding enrichment (stub) ---
    funding = enrich_funding_stub(name, domain)

    # --- Derived text signal ---
    text_signal = derive_text_signal(meta_description, page_title)

    enriched_rows.append(
        {
            **row.to_dict(),
            "domain": domain,
            "page_title": page_title,
            "meta_description": meta_description,
            "text_signal": text_signal,
            **funding,
        }
    )

portfolio_enriched_df = pd.DataFrame(enriched_rows)

print(f"Enriched companies: {len(portfolio_enriched_df)}")

# --- Basic QA checks ---
# Ensure no duplicate primary keys (name+url)
dup_count = portfolio_enriched_df.duplicated(subset=["name", "url"]).sum()
if dup_count > 0:
    print(f"Warning: found {dup_count} duplicate (name, url) rows after enrichment.")

display(portfolio_enriched_df.head(10))

# --- Save enriched snapshot ---
portfolio_enriched_df.to_csv(ENRICH_SNAPSHOT_PATH, index=False)
print(f"Saved enriched snapshot: {ENRICH_SNAPSHOT_PATH}")


[meta] fetched: 6Sense
[meta] fetched: Accacia
[meta] fetched: Aetion
[meta] fetched: Aimotive
[meta] fetched: Ansiblehealth
[meta] fetched: Apptronik
[meta] fetched: Yaocheng
[meta] fetched: Baichuan Ai
[meta] fetched: Bhanzu
[meta] fetched: Blackbuck
[meta] fetched: Branch
[meta] fetched: Brik
[meta] fetched: Buildops
[meta] fetched: Cap Rx
[meta] fetched: Carlsmed
[meta] fetched: Carricktherapeutics
[meta] fetched: Carro
[meta] fetched: Centivo
[meta] fetched: Certn
[meta] fetched: Clari
[meta] fetched: Cloudwise
[meta] fetched: Coindcx
[meta] fetched: Comparably
[meta] fetched: Curbwaste
[meta] fetched: Dailyhunt
[meta] fetched: Datasutram
[meta] fetched: Datarobot
[meta] fetched: Deferred
[meta] fetched: Eeroq
[meta] fetched: Fi
[meta] fetched: Eurekarobotics
[meta] fetched: Evenuplaw
[meta] fetched: Evidation
[meta] fetched: Evommune
[meta] fetched: Paywithextend
[meta] fetched: Fabric
[meta] fetched: Falconx
[meta] fetched: Fancraze
[meta] fetched: Figment
[meta] fetched: Finku


Unnamed: 0,name,industry,region,stage,url,source,scraped_at_utc,domain,page_title,meta_description,text_signal,total_funding_usd,last_round_date,last_round_type,funding_source
0,6Sense,Technology & AI,North America,Growth Stage,https://6sense.com,b.capital/portfolio,2026-01-05_02-45-24,,,,,,,,
1,Accacia,ClimateOpportunistic,India,Early Stage,https://accacia.ai/,b.capital/portfolio,2026-01-05_02-45-24,accacia.ai,Pioneering Real Estate Decarbonization | Accac...,Accacia empowers real estate with AI-driven to...,Accacia empowers real estate with AI-driven to...,,,,
2,Aetion,Healthcare,North America,Growth Stage,https://aetion.com/,b.capital/portfolio,2026-01-05_02-45-24,aetion.com,Real-World Evidence Solutions | RWE Analytics ...,Aetion is a health care technology company tha...,Aetion is a health care technology company tha...,,,,
3,Aimotive,Opportunistic,Europe,Growth Stage,https://aimotive.com/,b.capital/portfolio,2026-01-05_02-45-24,aimotive.com,Automated Driving Tooling and Embedded Solutio...,"With our tools and solutions, automotive partn...","With our tools and solutions, automotive partn...",,,,
4,Ansiblehealth,Healthcare,North America,Early Stage,https://www.ansiblehealth.com,b.capital/portfolio,2026-01-05_02-45-24,ansiblehealth.com,"AnsibleHealth, Pulmonary Doctors | COPD Treatment",AnsibleHealth is a virtual pulmonary rehab tre...,AnsibleHealth is a virtual pulmonary rehab tre...,,,,
5,Apptronik,Technology & AI,North America,Growth Stage,https://apptronik.com/,b.capital/portfolio,2026-01-05_02-45-24,apptronik.com,Apptronik,,Apptronik,,,,
6,Yaocheng,Healthcare,China,Early Stage,http://yaocheng.cn/,b.capital/portfolio,2026-01-05_02-45-24,yaocheng.cn,AI Writing Solution for Life Sciences - AlphaL...,Streamline R&D and automate clinical trials wi...,Streamline R&D and automate clinical trials wi...,,,,
7,Baichuan Ai,Technology & AI,China,Growth Stage,https://www.baichuan-ai.com,b.capital/portfolio,2026-01-05_02-45-24,baichuan-ai.com,百川大模型-汇聚世界知识 创作妙笔生花-百川智能,百川智能以帮助大众轻松、普惠地获取世界知识和专业服务为使命，致力于通过语言AI的突破，构建中...,百川智能以帮助大众轻松、普惠地获取世界知识和专业服务为使命，致力于通过语言AI的突破，构建中...,,,,
8,Bhanzu,Opportunistic,India,Early Stage,https://bhanzu.com/,b.capital/portfolio,2026-01-05_02-45-24,bhanzu.com,Online Math Courses With Personalized Guidance...,Learning mathematics should be fun & easy. Cho...,Learning mathematics should be fun & easy. Cho...,,,,
9,Blackbuck,Opportunistic,India,Growth Stage,https://blackbuck.com/,b.capital/portfolio,2026-01-05_02-45-24,blackbuck.com,Home - BlackBuck,India's Largest Trucking Platform,India's Largest Trucking Platform,,,,


Saved enriched snapshot: /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/enriched/portfolio_enriched_2026-01-05_02-45-24.csv


In [5]:
# ============================================================
# Cell 4 : Feature Normalization and Basic Transformations
# ============================================================
#
# This cell standardizes and lightly transforms enriched fields into
# analysis-ready features.
#
# Goals:
# - Normalize categorical fields (industry / region / stage) for consistency
# - Clean and standardize URLs / domains
# - Create compact derived features useful for grouping and comparison
# - Keep transformations transparent and reversible
#
# Outputs:
# - features_df : a cleaned, feature-ready DataFrame
# - (optional) a CSV snapshot for downstream portfolio-level aggregation
#

from urllib.parse import urlunparse

FEATURES_DIR = OUTPUT_DIR / "features"
FEATURES_DIR.mkdir(parents=True, exist_ok=True)
FEATURES_SNAPSHOT_PATH = FEATURES_DIR / f"portfolio_features_{RUN_TIMESTAMP}.csv"

# --- Helper functions ---
def normalize_text(x: Any) -> str:
    """Normalize free-text / categorical strings to a consistent representation."""
    if x is None:
        return ""
    s = str(x).strip()
    s = re.sub(r"\s+", " ", s)         # collapse whitespace
    s = s.replace("\u00a0", " ")       # non-breaking space
    return s

def normalize_category(x: Any) -> str:
    """
    Normalize a categorical label:
    - trim
    - collapse whitespace
    - title-case (conservative default)
    """
    s = normalize_text(x)
    return s.title() if s else ""

def normalize_domain(x: Any) -> str:
    """Lowercase domain and remove leading 'www.' when present."""
    s = normalize_text(x).lower()
    if s.startswith("www."):
        s = s[4:]
    return s

def normalize_url(x: Any) -> str:
    """
    Normalize URLs to reduce duplicates:
    - remove fragments
    - keep scheme + netloc + path (no query normalization here)
    - strip trailing slash (except root)
    """
    s = normalize_text(x)
    if not s:
        return ""
    try:
        p = urlparse(s)
        scheme = p.scheme or "https"
        netloc = p.netloc.lower()
        path = p.path or "/"
        # strip fragment
        normalized = urlunparse((scheme, netloc, path, "", p.query or "", ""))
        # strip trailing slash for non-root
        if normalized.endswith("/") and path != "/":
            normalized = normalized[:-1]
        return normalized
    except Exception:
        return s

def shorten_text(x: Any, max_len: int = 220) -> str:
    """Create a compact snippet from longer text fields (for display or export)."""
    s = normalize_text(x)
    if len(s) <= max_len:
        return s
    return s[: max_len - 3].rstrip() + "..."

def classify_stage(stage: str) -> str:
    """
    Optional: map messy stage labels into a smaller controlled vocabulary.
    This is conservative and can be expanded later.
    """
    s = normalize_text(stage).lower()
    if not s:
        return ""
    if "seed" in s:
        return "Seed"
    if "series a" in s or "a round" in s:
        return "Series A"
    if "series b" in s:
        return "Series B"
    if "series c" in s:
        return "Series C+"
    if "growth" in s:
        return "Growth"
    if "public" in s or "ipo" in s:
        return "Public"
    return stage.title()

# --- Start from enriched dataframe ---
features_df = portfolio_enriched_df.copy()

# --- Normalize core identifiers ---
features_df["name"] = features_df["name"].apply(normalize_text)
features_df["url"] = features_df["url"].apply(normalize_url)
features_df["domain"] = features_df["domain"].apply(normalize_domain)

# --- Normalize categories ---
features_df["industry_raw"] = features_df["industry"]
features_df["region_raw"] = features_df["region"]
features_df["stage_raw"] = features_df["stage"]

features_df["industry"] = features_df["industry"].apply(normalize_category)
features_df["region"] = features_df["region"].apply(normalize_category)
features_df["stage"] = features_df["stage"].apply(normalize_text)
features_df["stage_bucket"] = features_df["stage"].apply(classify_stage)

# --- Normalize text features ---
features_df["meta_description"] = features_df["meta_description"].fillna("").apply(normalize_text)
features_df["text_signal"] = features_df["text_signal"].fillna("").apply(normalize_text)
features_df["text_signal_short"] = features_df["text_signal"].apply(shorten_text)

# --- Numeric normalization (funding, if available) ---
# Keep as float where possible; failures become NaN
if "total_funding_usd" in features_df.columns:
    features_df["total_funding_usd"] = pd.to_numeric(
        features_df["total_funding_usd"], errors="coerce"
    )

# --- Simple derived features ---
features_df["has_text_signal"] = features_df["text_signal"].apply(lambda x: 1 if x else 0)
features_df["has_funding_data"] = (
    features_df["total_funding_usd"].notna().astype(int)
    if "total_funding_usd" in features_df.columns
    else 0
)

# --- Lightweight QA checks ---
# Ensure unique (name, url) pairs after normalization
dup_pairs = features_df.duplicated(subset=["name", "url"]).sum()
if dup_pairs > 0:
    print(f"Warning: {dup_pairs} duplicate (name, url) pairs after normalization.")

# Preview
display(features_df.head(10))

# Save snapshot for downstream steps
features_df.to_csv(FEATURES_SNAPSHOT_PATH, index=False)
print(f"Saved features snapshot: {FEATURES_SNAPSHOT_PATH}")


Unnamed: 0,name,industry,region,stage,url,source,scraped_at_utc,domain,page_title,meta_description,text_signal,total_funding_usd,last_round_date,last_round_type,funding_source,industry_raw,region_raw,stage_raw,stage_bucket,text_signal_short,has_text_signal,has_funding_data
0,6Sense,Technology & Ai,North America,Growth Stage,https://6sense.com/,b.capital/portfolio,2026-01-05_02-45-24,,,,,,,,,Technology & AI,North America,Growth Stage,Growth,,0,0
1,Accacia,Climateopportunistic,India,Early Stage,https://accacia.ai/,b.capital/portfolio,2026-01-05_02-45-24,accacia.ai,Pioneering Real Estate Decarbonization | Accac...,Accacia empowers real estate with AI-driven to...,Accacia empowers real estate with AI-driven to...,,,,,ClimateOpportunistic,India,Early Stage,Early Stage,Accacia empowers real estate with AI-driven to...,1,0
2,Aetion,Healthcare,North America,Growth Stage,https://aetion.com/,b.capital/portfolio,2026-01-05_02-45-24,aetion.com,Real-World Evidence Solutions | RWE Analytics ...,Aetion is a health care technology company tha...,Aetion is a health care technology company tha...,,,,,Healthcare,North America,Growth Stage,Growth,Aetion is a health care technology company tha...,1,0
3,Aimotive,Opportunistic,Europe,Growth Stage,https://aimotive.com/,b.capital/portfolio,2026-01-05_02-45-24,aimotive.com,Automated Driving Tooling and Embedded Solutio...,"With our tools and solutions, automotive partn...","With our tools and solutions, automotive partn...",,,,,Opportunistic,Europe,Growth Stage,Growth,"With our tools and solutions, automotive partn...",1,0
4,Ansiblehealth,Healthcare,North America,Early Stage,https://www.ansiblehealth.com/,b.capital/portfolio,2026-01-05_02-45-24,ansiblehealth.com,"AnsibleHealth, Pulmonary Doctors | COPD Treatment",AnsibleHealth is a virtual pulmonary rehab tre...,AnsibleHealth is a virtual pulmonary rehab tre...,,,,,Healthcare,North America,Early Stage,Early Stage,AnsibleHealth is a virtual pulmonary rehab tre...,1,0
5,Apptronik,Technology & Ai,North America,Growth Stage,https://apptronik.com/,b.capital/portfolio,2026-01-05_02-45-24,apptronik.com,Apptronik,,Apptronik,,,,,Technology & AI,North America,Growth Stage,Growth,Apptronik,1,0
6,Yaocheng,Healthcare,China,Early Stage,http://yaocheng.cn/,b.capital/portfolio,2026-01-05_02-45-24,yaocheng.cn,AI Writing Solution for Life Sciences - AlphaL...,Streamline R&D and automate clinical trials wi...,Streamline R&D and automate clinical trials wi...,,,,,Healthcare,China,Early Stage,Early Stage,Streamline R&D and automate clinical trials wi...,1,0
7,Baichuan Ai,Technology & Ai,China,Growth Stage,https://www.baichuan-ai.com/,b.capital/portfolio,2026-01-05_02-45-24,baichuan-ai.com,百川大模型-汇聚世界知识 创作妙笔生花-百川智能,百川智能以帮助大众轻松、普惠地获取世界知识和专业服务为使命，致力于通过语言AI的突破，构建中...,百川智能以帮助大众轻松、普惠地获取世界知识和专业服务为使命，致力于通过语言AI的突破，构建中...,,,,,Technology & AI,China,Growth Stage,Growth,百川智能以帮助大众轻松、普惠地获取世界知识和专业服务为使命，致力于通过语言AI的突破，构建中...,1,0
8,Bhanzu,Opportunistic,India,Early Stage,https://bhanzu.com/,b.capital/portfolio,2026-01-05_02-45-24,bhanzu.com,Online Math Courses With Personalized Guidance...,Learning mathematics should be fun & easy. Cho...,Learning mathematics should be fun & easy. Cho...,,,,,Opportunistic,India,Early Stage,Early Stage,Learning mathematics should be fun & easy. Cho...,1,0
9,Blackbuck,Opportunistic,India,Growth Stage,https://blackbuck.com/,b.capital/portfolio,2026-01-05_02-45-24,blackbuck.com,Home - BlackBuck,India's Largest Trucking Platform,India's Largest Trucking Platform,,,,,Opportunistic,India,Growth Stage,Growth,India's Largest Trucking Platform,1,0


Saved features snapshot: /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/features/portfolio_features_2026-01-05_02-45-24.csv


In [9]:
# ============================================================
# Cell 5 : Portfolio-level Aggregation and Comparison
# ============================================================
#
# This cell aggregates company-level features into portfolio-level views
# that make it easy to compare patterns across categories.
#
# Typical questions this cell helps answer:
# - What industries / regions dominate the portfolio?
# - How is the portfolio distributed by stage buckets?
# - Which segments have stronger text coverage or funding coverage?
# - What does a "representative" subset look like for quick review?
#
# Outputs:
# - summary tables (industry / region / stage distributions)
# - a compact "portfolio snapshot" table for reporting
# - optional exports for downstream visualization
#

AGG_DIR = OUTPUT_DIR / "portfolio_aggregations"
AGG_DIR.mkdir(parents=True, exist_ok=True)

AGG_SNAPSHOT_PREFIX = AGG_DIR / f"portfolio_agg_{RUN_TIMESTAMP}"

# Use the normalized feature table from Cell 4
df = features_df.copy()

# --- Core counts ---
total_companies = len(df)
print(f"Total companies in portfolio: {total_companies}")

# ============================================================
# (A) Distribution by category
# ============================================================

def top_k_distribution(
    df: pd.DataFrame,
    col: str,
    k: int = 20,
    min_count: int = 1
) -> pd.DataFrame:
    """
    Create a top-k distribution table for a given categorical column.
    """
    tmp = (
        df[col]
        .fillna("")
        .replace("", "Unknown")
        .value_counts(dropna=False)
        .reset_index()
    )
    tmp.columns = [col, "count"]
    tmp = tmp[tmp["count"] >= min_count].copy()
    tmp["share"] = (tmp["count"] / max(len(df), 1)).round(4)
    return tmp.head(k)

industry_dist = top_k_distribution(df, "industry", k=30)
region_dist   = top_k_distribution(df, "region", k=30)
stage_dist    = top_k_distribution(df, "stage_bucket", k=30)

print("\n--- Industry distribution (top) ---")
display(industry_dist)

print("\n--- Region distribution (top) ---")
display(region_dist)

print("\n--- Stage distribution (top) ---")
display(stage_dist)

# Save distributions
industry_dist.to_csv(f"{AGG_SNAPSHOT_PREFIX}_industry.csv", index=False)
region_dist.to_csv(f"{AGG_SNAPSHOT_PREFIX}_region.csv", index=False)
stage_dist.to_csv(f"{AGG_SNAPSHOT_PREFIX}_stage.csv", index=False)

# ============================================================
# (B) Coverage metrics (text / funding) by segment
# ============================================================

def coverage_by_segment(df: pd.DataFrame, segment_col: str) -> pd.DataFrame:
    """
    Compute simple coverage metrics grouped by a segment column.
    Coverage here indicates whether a company has:
    - non-empty text_signal
    - non-null funding data (if available)
    """
    tmp = df.copy()
    tmp[segment_col] = tmp[segment_col].replace("", "Unknown").fillna("Unknown")

    group = tmp.groupby(segment_col, dropna=False)

    out = group.agg(
        companies=("name", "count"),
        text_coverage=("has_text_signal", "mean"),
        funding_coverage=("has_funding_data", "mean") if "has_funding_data" in tmp.columns else ("name", lambda x: 0.0),
    ).reset_index()

    out["share"] = (out["companies"] / max(len(tmp), 1)).round(4)
    out["text_coverage"] = out["text_coverage"].round(4)
    out["funding_coverage"] = out["funding_coverage"].round(4)

    out = out.sort_values("companies", ascending=False).reset_index(drop=True)
    return out

industry_cov = coverage_by_segment(df, "industry")
region_cov   = coverage_by_segment(df, "region")
stage_cov    = coverage_by_segment(df, "stage_bucket")

print("\n--- Coverage by industry ---")
display(industry_cov.head(25))

print("\n--- Coverage by region ---")
display(region_cov.head(25))

print("\n--- Coverage by stage ---")
display(stage_cov.head(25))

industry_cov.to_csv(f"{AGG_SNAPSHOT_PREFIX}_industry_coverage.csv", index=False)
region_cov.to_csv(f"{AGG_SNAPSHOT_PREFIX}_region_coverage.csv", index=False)
stage_cov.to_csv(f"{AGG_SNAPSHOT_PREFIX}_stage_coverage.csv", index=False)

# ============================================================
# (C) Quick “portfolio snapshot” table
# ============================================================
#
# A compact table that is easy to scan and can be exported to a memo.
#

snapshot_cols = [
    "name",
    "industry",
    "region",
    "stage_bucket",
    "domain",
    "url",
    "text_signal_short",
    "total_funding_usd",
    "last_round_date",
    "last_round_type",
]

# Keep only columns that exist (funding fields may be absent)
snapshot_cols = [c for c in snapshot_cols if c in df.columns]

portfolio_snapshot_df = df[snapshot_cols].copy()
portfolio_snapshot_df = portfolio_snapshot_df.sort_values(
    ["industry", "region", "name"]
).reset_index(drop=True)

print("\n--- Portfolio snapshot (preview) ---")
display(portfolio_snapshot_df.head(20))

portfolio_snapshot_path = f"{AGG_SNAPSHOT_PREFIX}_snapshot.csv"
portfolio_snapshot_df.to_csv(portfolio_snapshot_path, index=False)
print(f"Saved portfolio snapshot: {portfolio_snapshot_path}")

# ============================================================
# (D) Representative sampling (optional)
# ============================================================
#
# Select a small sample per segment to quickly review the diversity
# of the portfolio without scanning everything.
#

def sample_per_group(
    df: pd.DataFrame,
    group_col: str,
    n: int = 3,
    seed: int = 42
) -> pd.DataFrame:
    tmp = df.copy()
    tmp[group_col] = tmp[group_col].replace("", "Unknown").fillna("Unknown")

    return (
        tmp
        .assign(_rand=np.random.RandomState(seed).rand(len(tmp)))
        .sort_values("_rand")
        .groupby(group_col, as_index=False)
        .head(n)
        .drop(columns="_rand")
        .reset_index(drop=True)
    )



industry_samples = sample_per_group(df, "industry", n=3, seed=RANDOM_SEED)
sample_cols = ["name", "industry", "region", "stage_bucket", "url", "text_signal_short"]
sample_cols = [c for c in sample_cols if c in industry_samples.columns]

print("\n--- Representative samples by industry ---")
display(industry_samples[sample_cols].sort_values(["industry", "name"]).head(60))

industry_samples_path = f"{AGG_SNAPSHOT_PREFIX}_industry_samples.csv"
industry_samples.to_csv(industry_samples_path, index=False)
print(f"Saved industry samples: {industry_samples_path}")


Total companies in portfolio: 140

--- Industry distribution (top) ---


Unnamed: 0,industry,count,share
0,Technology & Ai,70,0.5
1,Healthcare,38,0.2714
2,Opportunistic,25,0.1786
3,Climate,3,0.0214
4,Climateopportunistic,2,0.0143
5,Opportunistictechnology & Ai,1,0.0071
6,Climatetechnology & Ai,1,0.0071



--- Region distribution (top) ---


Unnamed: 0,region,count,share
0,North America,90,0.6429
1,China,18,0.1286
2,India,17,0.1214
3,Southeast Asia,11,0.0786
4,Europe,3,0.0214
5,Africa,1,0.0071



--- Stage distribution (top) ---


Unnamed: 0,stage_bucket,count,share
0,Growth,88,0.6286
1,Early Stage,52,0.3714



--- Coverage by industry ---


Unnamed: 0,industry,companies,text_coverage,funding_coverage,share
0,Technology & Ai,70,0.9,0.0,0.5
1,Healthcare,38,0.9474,0.0,0.2714
2,Opportunistic,25,0.8,0.0,0.1786
3,Climate,3,1.0,0.0,0.0214
4,Climateopportunistic,2,1.0,0.0,0.0143
5,Climatetechnology & Ai,1,1.0,0.0,0.0071
6,Opportunistictechnology & Ai,1,1.0,0.0,0.0071



--- Coverage by region ---


Unnamed: 0,region,companies,text_coverage,funding_coverage,share
0,North America,90,0.9667,0.0,0.6429
1,China,18,0.8333,0.0,0.1286
2,India,17,0.7059,0.0,0.1214
3,Southeast Asia,11,0.7273,0.0,0.0786
4,Europe,3,1.0,0.0,0.0214
5,Africa,1,1.0,0.0,0.0071



--- Coverage by stage ---


Unnamed: 0,stage_bucket,companies,text_coverage,funding_coverage,share
0,Growth,88,0.875,0.0,0.6286
1,Early Stage,52,0.9423,0.0,0.3714



--- Portfolio snapshot (preview) ---


Unnamed: 0,name,industry,region,stage_bucket,domain,url,text_signal_short,total_funding_usd,last_round_date,last_round_type
0,Miotech,Climate,China,Growth,miotech.com,https://www.miotech.com/,创新永续未来 - 妙盈科技,,,
1,Leveltenenergy,Climate,North America,Growth,leveltenenergy.com,https://www.leveltenenergy.com/,LevelTen Energy provides transaction infrastru...,,,
2,Omnidian,Climate,North America,Growth,omnidian.com,https://www.omnidian.com/,Our mission is simple. Protect and accelerate ...,,,
3,Accacia,Climateopportunistic,India,Early Stage,accacia.ai,https://accacia.ai/,Accacia empowers real estate with AI-driven to...,,,
4,Patch,Climateopportunistic,North America,Growth,patch.io,https://www.patch.io/,Buy high-integrity carbon credits from our glo...,,,
5,Overstory,Climatetechnology & Ai,Europe,Early Stage,overstory.com,https://www.overstory.com/,"We help electric utilities optimize resources,...",,,
6,Grit Bio,Healthcare,China,Early Stage,grit-bio.com,https://www.grit-bio.com/,沙砾生物GRIT Biotechnology是一家聚焦世界尖端生物科技，专注于肿瘤免疫治疗领...,,,
7,Hifibio,Healthcare,China,Growth,hifibio.com,https://hifibio.com/,Home - HiFiBiO Therapeutics,,,
8,Insilico,Healthcare,China,Growth,insilico.com,https://insilico.com/,Generative AI and Automation for Longevity and...,,,
9,Meditrusthealth,Healthcare,China,Growth,meditrusthealth.com,https://www.meditrusthealth.com/,,,,


Saved portfolio snapshot: /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/portfolio_aggregations/portfolio_agg_2026-01-05_02-45-24_snapshot.csv

--- Representative samples by industry ---


Unnamed: 0,name,industry,region,stage_bucket,url,text_signal_short
14,Leveltenenergy,Climate,North America,Growth,https://www.leveltenenergy.com/,LevelTen Energy provides transaction infrastru...
8,Miotech,Climate,China,Growth,https://www.miotech.com/,创新永续未来 - 妙盈科技
11,Omnidian,Climate,North America,Growth,https://www.omnidian.com/,Our mission is simple. Protect and accelerate ...
15,Accacia,Climateopportunistic,India,Early Stage,https://accacia.ai/,Accacia empowers real estate with AI-driven to...
12,Patch,Climateopportunistic,North America,Growth,https://www.patch.io/,Buy high-integrity carbon credits from our glo...
13,Overstory,Climatetechnology & Ai,Europe,Early Stage,https://www.overstory.com/,"We help electric utilities optimize resources,..."
0,Livelyme,Healthcare,North America,Growth,https://livelyme.com/,Best Health Savings Account and flexible benef...
4,Picnichealth,Healthcare,North America,Growth,https://picnichealth.com/,Healthcare simplified with the Unified Patient...
7,Yaocheng,Healthcare,China,Early Stage,http://yaocheng.cn/,Streamline R&D and automate clinical trials wi...
1,Com,Opportunistic,Southeast Asia,Growth,https://vng.com.vn/,Trang chủ VNG (tiền thân là VinaGame) - Kỳ lân...


Saved industry samples: /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/portfolio_aggregations/portfolio_agg_2026-01-05_02-45-24_industry_samples.csv


In [10]:
# ============================================================
# Cell 6 : Company-level Summaries and Diagnostics
# ============================================================
#
# This cell produces company-level "one-pagers" and basic diagnostics
# to support fast review and troubleshooting.
#
# Goals:
# - Provide a clean, human-readable summary view per company
# - Surface missingness / coverage issues (e.g., no description, no domain)
# - Flag potential data quality problems (duplicates, malformed URLs, etc.)
#
# Outputs:
# - company_summary_df : compact per-company table for review/export
# - diagnostics tables for data quality and enrichment coverage
#

DIAG_DIR = OUTPUT_DIR / "company_diagnostics"
DIAG_DIR.mkdir(parents=True, exist_ok=True)

SUMMARY_PATH = DIAG_DIR / f"company_summaries_{RUN_TIMESTAMP}.csv"
MISSINGNESS_PATH = DIAG_DIR / f"missingness_report_{RUN_TIMESTAMP}.csv"
URL_ISSUES_PATH = DIAG_DIR / f"url_issues_{RUN_TIMESTAMP}.csv"

df = features_df.copy()

# ============================================================
# (A) Compact company summary table
# ============================================================

summary_cols = [
    "name",
    "industry",
    "region",
    "stage_bucket",
    "domain",
    "url",
    "text_signal_short",
    "meta_description",
    "page_title",
    "total_funding_usd",
    "last_round_date",
    "last_round_type",
    "source",
    "scraped_at_utc",
]

# Keep only columns that exist in this run
summary_cols = [c for c in summary_cols if c in df.columns]

company_summary_df = df[summary_cols].copy()
company_summary_df = company_summary_df.sort_values(["name"]).reset_index(drop=True)

print("--- Company summary preview ---")
display(company_summary_df.head(20))

company_summary_df.to_csv(SUMMARY_PATH, index=False)
print(f"Saved company summaries: {SUMMARY_PATH}")

# ============================================================
# (B) Missingness / coverage diagnostics
# ============================================================

diagnostic_fields = [
    "industry",
    "region",
    "stage_bucket",
    "domain",
    "text_signal",
    "meta_description",
]

# Filter to fields that exist
diagnostic_fields = [c for c in diagnostic_fields if c in df.columns]

missingness_rows = []
for col in diagnostic_fields:
    series = df[col]
    # treat empty strings as missing for categorical/text columns
    missing = series.isna() | (series.astype(str).str.strip() == "")
    missing_count = int(missing.sum())
    missing_share = missing_count / max(len(df), 1)
    missingness_rows.append(
        {"field": col, "missing_count": missing_count, "missing_share": round(missing_share, 4)}
    )

missingness_df = pd.DataFrame(missingness_rows).sort_values("missing_count", ascending=False)

print("\n--- Missingness report ---")
display(missingness_df)

missingness_df.to_csv(MISSINGNESS_PATH, index=False)
print(f"Saved missingness report: {MISSINGNESS_PATH}")

# ============================================================
# (C) URL / domain sanity checks
# ============================================================

def is_probably_valid_url(u: str) -> bool:
    if not u:
        return False
    try:
        p = urlparse(u)
        return bool(p.scheme) and bool(p.netloc)
    except Exception:
        return False

url_issues = df.copy()
url_issues["url_valid"] = url_issues["url"].fillna("").apply(lambda x: is_probably_valid_url(str(x).strip()))
url_issues["domain_missing"] = url_issues["domain"].fillna("").astype(str).str.strip().eq("")

issues_df = url_issues[(~url_issues["url_valid"]) | (url_issues["domain_missing"])].copy()

issue_cols = ["name", "url", "domain", "url_valid", "domain_missing"]
issue_cols = [c for c in issue_cols if c in issues_df.columns]

print("\n--- URL / domain issues (if any) ---")
display(issues_df[issue_cols].head(50))

issues_df[issue_cols].to_csv(URL_ISSUES_PATH, index=False)
print(f"Saved URL issue report: {URL_ISSUES_PATH}")

# ============================================================
# (D) Duplicate checks (post-normalization)
# ============================================================

dup_by_name = df.duplicated(subset=["name"]).sum() if "name" in df.columns else 0
dup_by_name_url = df.duplicated(subset=["name", "url"]).sum() if {"name","url"}.issubset(df.columns) else 0

print("\n--- Duplicate checks ---")
print(f"Duplicate by name:     {dup_by_name}")
print(f"Duplicate by name+url: {dup_by_name_url}")

# Optional: show duplicates
if dup_by_name_url > 0:
    dup_rows = df[df.duplicated(subset=["name", "url"], keep=False)].sort_values(["name", "url"])
    display(dup_rows.head(50))


--- Company summary preview ---


Unnamed: 0,name,industry,region,stage_bucket,domain,url,text_signal_short,meta_description,page_title,total_funding_usd,last_round_date,last_round_type,source,scraped_at_utc
0,6Sense,Technology & Ai,North America,Growth,,https://6sense.com/,,,,,,,b.capital/portfolio,2026-01-05_02-45-24
1,Accacia,Climateopportunistic,India,Early Stage,accacia.ai,https://accacia.ai/,Accacia empowers real estate with AI-driven to...,Accacia empowers real estate with AI-driven to...,Pioneering Real Estate Decarbonization | Accac...,,,,b.capital/portfolio,2026-01-05_02-45-24
2,Aetion,Healthcare,North America,Growth,aetion.com,https://aetion.com/,Aetion is a health care technology company tha...,Aetion is a health care technology company tha...,Real-World Evidence Solutions | RWE Analytics ...,,,,b.capital/portfolio,2026-01-05_02-45-24
3,Aimotive,Opportunistic,Europe,Growth,aimotive.com,https://aimotive.com/,"With our tools and solutions, automotive partn...","With our tools and solutions, automotive partn...",Automated Driving Tooling and Embedded Solutio...,,,,b.capital/portfolio,2026-01-05_02-45-24
4,Ansiblehealth,Healthcare,North America,Early Stage,ansiblehealth.com,https://www.ansiblehealth.com/,AnsibleHealth is a virtual pulmonary rehab tre...,AnsibleHealth is a virtual pulmonary rehab tre...,"AnsibleHealth, Pulmonary Doctors | COPD Treatment",,,,b.capital/portfolio,2026-01-05_02-45-24
5,Apptronik,Technology & Ai,North America,Growth,apptronik.com,https://apptronik.com/,Apptronik,,Apptronik,,,,b.capital/portfolio,2026-01-05_02-45-24
6,Baichuan Ai,Technology & Ai,China,Growth,baichuan-ai.com,https://www.baichuan-ai.com/,百川智能以帮助大众轻松、普惠地获取世界知识和专业服务为使命，致力于通过语言AI的突破，构建中...,百川智能以帮助大众轻松、普惠地获取世界知识和专业服务为使命，致力于通过语言AI的突破，构建中...,百川大模型-汇聚世界知识 创作妙笔生花-百川智能,,,,b.capital/portfolio,2026-01-05_02-45-24
7,Bhanzu,Opportunistic,India,Early Stage,bhanzu.com,https://bhanzu.com/,Learning mathematics should be fun & easy. Cho...,Learning mathematics should be fun & easy. Cho...,Online Math Courses With Personalized Guidance...,,,,b.capital/portfolio,2026-01-05_02-45-24
8,Blackbuck,Opportunistic,India,Growth,blackbuck.com,https://blackbuck.com/,India's Largest Trucking Platform,India's Largest Trucking Platform,Home - BlackBuck,,,,b.capital/portfolio,2026-01-05_02-45-24
9,Branch,Technology & Ai,North America,Growth,branch.co,https://branch.co/,Branch is the world's leading personal finance...,Branch is the world's leading personal finance...,Branch International | Home,,,,b.capital/portfolio,2026-01-05_02-45-24


Saved company summaries: /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/company_diagnostics/company_summaries_2026-01-05_02-45-24.csv

--- Missingness report ---


Unnamed: 0,field,missing_count,missing_share
5,meta_description,24,0.1714
4,text_signal,14,0.1
3,domain,12,0.0857
0,industry,0,0.0
1,region,0,0.0
2,stage_bucket,0,0.0


Saved missingness report: /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/company_diagnostics/missingness_report_2026-01-05_02-45-24.csv

--- URL / domain issues (if any) ---


Unnamed: 0,name,url,domain,url_valid,domain_missing
0,6Sense,https://6sense.com/,,True,True
11,Brik,https://www.brik.id/,,True,True
20,Cloudwise,https://www.cloudwise.com/en,,True,True
21,Coindcx,http://coindcx.com/,,True,True
22,Comparably,https://www.comparably.com/,,True,True
74,Mannjal,https://mannjal.com/,,True,True
76,Meesho,https://meesho.com/,,True,True
94,Payfazz,https://www.payfazz.com/,,True,True
98,Perplexity,http://www.perplexity.ai/,,True,True
104,Profoundbiosz,http://www.profoundbiosz.cn/,,True,True


Saved URL issue report: /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/company_diagnostics/url_issues_2026-01-05_02-45-24.csv

--- Duplicate checks ---
Duplicate by name:     1
Duplicate by name+url: 0


In [11]:
# ============================================================
# Cell 7 : Export of Structured Outputs for Downstream Use
# ============================================================
#
# This cell exports the key structured artifacts produced by the pipeline.
#
# Design principles:
# - Export "canonical" tables with stable filenames for downstream notebooks
# - Also export timestamped snapshots for reproducibility / versioning
# - Keep exports lightweight and interoperable (CSV/JSON)
#
# Outputs (typical):
# - outputs/latest/portfolio_features_latest.csv
# - outputs/latest/portfolio_snapshot_latest.csv
# - outputs/latest/distributions/*.csv
# - outputs/latest/company_summaries_latest.csv
# - outputs/snapshots/... (timestamped)
#

LATEST_DIR = OUTPUT_DIR / "latest"
LATEST_DIR.mkdir(parents=True, exist_ok=True)

SNAPSHOT_EXPORT_DIR = OUTPUT_DIR / "snapshots" / RUN_TIMESTAMP
SNAPSHOT_EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Helper: safe export
# ------------------------------------------------------------
def export_csv(df: pd.DataFrame, path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)

def export_json(df: pd.DataFrame, path: Path, orient: str = "records") -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_json(path, orient=orient, force_ascii=False, indent=2)

# ------------------------------------------------------------
# Choose canonical artifacts to export
# ------------------------------------------------------------
# From earlier cells:
# - features_df              (Cell 4)
# - portfolio_snapshot_df    (Cell 5)
# - company_summary_df       (Cell 6)
# - distribution tables      (Cell 5): industry_dist, region_dist, stage_dist
#
# Not all may exist depending on optional branches; export only what exists.

exports = []

# 1) Features (analysis-ready canonical table)
if "features_df" in globals():
    exports.append(("portfolio_features", features_df))

# 2) Portfolio snapshot (compact memo-friendly table)
if "portfolio_snapshot_df" in globals():
    exports.append(("portfolio_snapshot", portfolio_snapshot_df))

# 3) Company summaries (review/debug table)
if "company_summary_df" in globals():
    exports.append(("company_summaries", company_summary_df))

# 4) Distributions (segment tables)
dist_tables = []
if "industry_dist" in globals():
    dist_tables.append(("industry_distribution", industry_dist))
if "region_dist" in globals():
    dist_tables.append(("region_distribution", region_dist))
if "stage_dist" in globals():
    dist_tables.append(("stage_distribution", stage_dist))

# ------------------------------------------------------------
# Export "latest" artifacts (stable filenames)
# ------------------------------------------------------------
for name, df in exports:
    export_csv(df, LATEST_DIR / f"{name}_latest.csv")
    export_json(df, LATEST_DIR / f"{name}_latest.json")

    # Also export timestamped snapshots
    export_csv(df, SNAPSHOT_EXPORT_DIR / f"{name}.csv")
    export_json(df, SNAPSHOT_EXPORT_DIR / f"{name}.json")

    print(f"[exported] {name} -> latest/ + snapshots/{RUN_TIMESTAMP}/")

# Distributions under a subfolder
if dist_tables:
    dist_latest_dir = LATEST_DIR / "distributions"
    dist_snapshot_dir = SNAPSHOT_EXPORT_DIR / "distributions"

    for name, df in dist_tables:
        export_csv(df, dist_latest_dir / f"{name}_latest.csv")
        export_csv(df, dist_snapshot_dir / f"{name}.csv")

        print(f"[exported] {name} -> latest/distributions + snapshots/{RUN_TIMESTAMP}/distributions")

# ------------------------------------------------------------
# Optional: Export a minimal JSON for UI / downstream apps
# ------------------------------------------------------------
# A compact, app-friendly artifact that focuses on the "portfolio snapshot".
# Useful if you want to render a simple table or build a small dashboard.

if "portfolio_snapshot_df" in globals():
    app_json_path = LATEST_DIR / "portfolio_app_payload_latest.json"
    payload = {
        "generated_at_utc": RUN_TIMESTAMP,
        "source": PORTFOLIO_URL if "PORTFOLIO_URL" in globals() else "",
        "n_companies": int(len(portfolio_snapshot_df)),
        "records": portfolio_snapshot_df.to_dict(orient="records"),
    }
    app_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    app_snapshot_path = SNAPSHOT_EXPORT_DIR / "portfolio_app_payload.json"
    app_snapshot_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"[exported] portfolio_app_payload -> {app_json_path} (+ snapshot)")

print("\nExport completed.")
print(f"Latest dir   : {LATEST_DIR}")
print(f"Snapshot dir : {SNAPSHOT_EXPORT_DIR}")


[exported] portfolio_features -> latest/ + snapshots/2026-01-05_02-45-24/
[exported] portfolio_snapshot -> latest/ + snapshots/2026-01-05_02-45-24/
[exported] company_summaries -> latest/ + snapshots/2026-01-05_02-45-24/
[exported] industry_distribution -> latest/distributions + snapshots/2026-01-05_02-45-24/distributions
[exported] region_distribution -> latest/distributions + snapshots/2026-01-05_02-45-24/distributions
[exported] stage_distribution -> latest/distributions + snapshots/2026-01-05_02-45-24/distributions
[exported] portfolio_app_payload -> /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/latest/portfolio_app_payload_latest.json (+ snapshot)

Export completed.
Latest dir   : /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/latest
Snapshot dir : /Users/yuetoya/Desktop/researchOS100-private/notebooks/outputs/snapshots/2026-01-05_02-45-24
