In [1]:
from typing import Any, Dict, Optional, List


# Lighthouse audit ids that matter for SEO / Core Web Vitals reporting.
# The list is assembled from three thematic groups; the resulting order is
# performance first, then technical SEO, then diagnostics.
IMPORTANT_AUDITS = (
    # Core Web Vitals / performance
    [
        "first-contentful-paint",
        "largest-contentful-paint",
        "speed-index",
        "total-blocking-time",
        "cumulative-layout-shift",
        "max-potential-fid",
        "interactive",  # TTI if present
        "server-response-time",
        "uses-http2",
        "redirects",
    ]
    # Technical SEO
    + [
        "is-on-https",
        "viewport",
        "http-status-code",
        "meta-description",
        "font-size",
        "link-text",
        "crawlable-anchors",
        "is-crawlable",
        "robots-txt",
        "hreflang",
        "canonical",
        "structured-data",
    ]
    # Diagnostics / quality signals
    + [
        "errors-in-console",
    ]
)


def _extract_audit(audits: Dict[str, Any], audit_id: str) -> Optional[Dict[str, Any]]:
    """
    Safely extract a compact view of a single Lighthouse audit.
    Returns None if the audit is missing.
    """
    raw = audits.get(audit_id)
    if not isinstance(raw, dict):
        return None

    return {
        "id": raw.get("id", audit_id),
        "title": raw.get("title"),
        "score": raw.get("score"),
        "score_display_mode": raw.get("scoreDisplayMode"),
        "display_value": raw.get("displayValue"),
        "numeric_value": raw.get("numericValue"),
        "numeric_unit": raw.get("numericUnit"),
    }


def extract_lighthouse_seo_summary(payload: Dict[str, Any]) -> Dict[str, Any]:
    """
    Take the raw on_page_lighthouse JSON and return a small, SEO-focused summary.

    This is designed so your SEO analysis can rely ONLY on this optimized JSON.

    The function is tolerant of incomplete reports: any section that is
    missing, explicitly ``null`` or of an unexpected type (``data``, ``items``,
    ``audits``, ``categories``, an individual audit, console-error details) is
    treated as empty, so the corresponding summary values come back as
    ``None`` / ``[]`` instead of raising.

    Args:
        payload: Parsed tool response. The first element of
            ``payload["data"]["items"]`` is read as the Lighthouse result.

    Returns:
        A dict with the keys ``meta``, ``scores``, ``core_web_vitals``,
        ``technical_seo``, ``important_audits`` and ``console_errors``
        (inserted in that order).
    """

    def as_dict(value: Any) -> Dict[str, Any]:
        # dict.get(key, {}) does not protect against an explicit null value,
        # so every nested lookup goes through this normaliser instead.
        return value if isinstance(value, dict) else {}

    def as_list(value: Any) -> List[Any]:
        return value if isinstance(value, list) else []

    payload = as_dict(payload)
    data = as_dict(payload.get("data"))
    items = as_list(data.get("items"))
    item = as_dict(items[0]) if items else {}

    audits: Dict[str, Any] = as_dict(item.get("audits"))
    categories: Dict[str, Any] = as_dict(item.get("categories"))

    def audit(aid: str) -> Dict[str, Any]:
        # Single, uniformly safe accessor for one audit entry.
        return as_dict(audits.get(aid))

    # --- Meta info about the run ---
    meta = {
        "tool_name": payload.get("tool_name"),
        "timestamp": payload.get("timestamp"),
        "lighthouse_version": item.get("lighthouseVersion"),
        "requested_url": item.get("requestedUrl"),
        "final_url": item.get("finalUrl") or item.get("finalDisplayedUrl"),
        "fetch_time": item.get("fetchTime"),
        "user_agent": item.get("userAgent"),
        # Always a list, even when the report carries an explicit null.
        "run_warnings": item.get("runWarnings") or [],
    }

    # --- High-level category scores (0–1) ---
    def cat_score(cat_key: str) -> Optional[float]:
        return as_dict(categories.get(cat_key)).get("score")

    scores = {
        "performance": cat_score("performance"),
        "seo": cat_score("seo"),
        "accessibility": cat_score("accessibility"),
        "best_practices": cat_score("best-practices"),
        "pwa": cat_score("pwa"),
    }

    # --- Core Web Vitals & timing metrics ---
    def num(aid: str) -> Optional[float]:
        return audit(aid).get("numericValue")

    core_web_vitals = {
        "fcp_ms": num("first-contentful-paint"),
        "lcp_ms": num("largest-contentful-paint"),
        "speed_index_ms": num("speed-index"),
        "tbt_ms": num("total-blocking-time"),
        "cls": num("cumulative-layout-shift"),
        "tti_ms": num("interactive"),
        "max_potential_fid_ms": num("max-potential-fid"),
        "server_response_time_ms": num("server-response-time"),
    }

    # --- Technical SEO flags / booleans from audit scores ---
    def passed(aid: str) -> Optional[bool]:
        score = audit(aid).get("score")
        if score is None:
            return None
        # Lighthouse uses 0/1 for binary, sometimes decimals; treat ≥0.9 as pass.
        try:
            return float(score) >= 0.9
        except (TypeError, ValueError):
            # A score that cannot be read as a number is "unknown", not a crash.
            return None

    technical_seo = {
        "https": passed("is-on-https"),
        "viewport_meta": passed("viewport"),
        "http_status_ok": passed("http-status-code"),
        "has_meta_description": passed("meta-description"),
        "font_size_ok": passed("font-size"),
        "link_text_ok": passed("link-text"),
        "crawlable_anchors_ok": passed("crawlable-anchors"),
        "page_crawlable": passed("is-crawlable"),
        "robots_txt_valid": passed("robots-txt"),
        "hreflang_valid": passed("hreflang"),
        # canonical/structured-data can be N/A/manual; keep as raw scoreDisplayMode info
        "canonical_status": audit("canonical").get("scoreDisplayMode"),
        "structured_data_status": audit("structured-data").get("scoreDisplayMode"),
    }

    # --- Compact list of important audits (sorted by id) ---
    important_audits: List[Dict[str, Any]] = [
        compact
        for compact in (_extract_audit(audits, aid) for aid in sorted(IMPORTANT_AUDITS))
        if compact is not None
    ]

    # --- Console errors (just messages, trimmed) ---
    error_details = as_dict(audit("errors-in-console").get("details"))
    console_errors = [
        {
            "source": entry.get("source"),
            "description": entry.get("description"),
            "url": as_dict(entry.get("sourceLocation")).get("url"),
        }
        for entry in as_list(error_details.get("items"))
        if isinstance(entry, dict)  # skip malformed rows instead of failing
    ]

    # Final optimized JSON – keys are ordered intentionally
    return {
        "meta": meta,
        "scores": scores,
        "core_web_vitals": core_web_vitals,
        "technical_seo": technical_seo,
        "important_audits": important_audits,
        "console_errors": console_errors,
    }


In [3]:
import json

# Load the raw Lighthouse tool response. The encoding is pinned to UTF-8 so
# the read does not depend on the platform's locale default.
with open("logs/on_page_lighthouse.json", "r", encoding="utf-8") as f:
    raw = json.load(f)

# Reduce the full report to the compact SEO-focused structure.
summary = extract_lighthouse_seo_summary(raw)

# write the summary to a file
with open("logs/on_page_lighthouse_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

# Echo the same JSON into the cell output for a quick visual check.
print(json.dumps(summary, indent=2))

{
  "meta": {
    "tool_name": "on_page_lighthouse",
    "timestamp": "2025-11-21 16:17:49",
    "lighthouse_version": "12.2.0",
    "requested_url": "https://strique.io/",
    "final_url": "https://www.strique.io/",
    "fetch_time": "2025-11-21T10:47:20.868Z",
    "user_agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/132.0.0.0 Safari/537.36",
    "run_warnings": [
      "The page may not be loading as expected because your test URL (https://strique.io/) was redirected to https://www.strique.io/. Try testing the second URL directly."
    ]
  },
  "scores": {
    "performance": 0.46,
    "seo": 1,
    "accessibility": 0.96,
    "best_practices": 0.7,
    "pwa": null
  },
  "core_web_vitals": {
    "fcp_ms": 1821.317,
    "lcp_ms": 2284.16,
    "speed_index_ms": 3037.974,
    "tbt_ms": 966,
    "cls": 0.01,
    "tti_ms": 4605.053,
    "max_potential_fid_ms": 495,
    "server_response_time_ms": 238.093
  },
  "technical_seo": {
    "https": true,
    "viewport_me

In [5]:
from src.utils.token_counter import count_tokens

# Count the tokens of the compact summary with the project helper
# `count_tokens` (imported from src.utils.token_counter).
# NOTE(review): which tokenizer is used, and how the dict is serialized before
# counting, is not visible in this notebook — confirm in the helper.
token_count = count_tokens(summary)
# Report the count in the cell output.
print(f"Token count: {token_count}")


Token count: 1808
