In [None]:
# ============================================================
# 016 fetch_newsapi_weekly
# ============================================================
#
# Overview
# --------
# This notebook fetches news articles from NewsAPI.org on a weekly basis
# for a fixed set of 15 investment-relevant technology keywords.
#
# It is designed for recurring, automated execution (e.g., via GitHub Actions)
# and produces raw, week-partitioned article datasets that serve as the
# foundation for downstream aggregation and trend analysis.
#
# The notebook focuses exclusively on data collection and normalization.
# Aggregation, normalization across weeks, and visualization are handled
# in a separate downstream notebook.
#
#
# Structure
# ---------
# Cell 0 : Purpose / Weekly Run Specification
# Cell 1 : Imports & Global Configuration
# Cell 2 : Target Week Calculation (automatic weekly time window)
# Cell 3 : Load Keyword Dictionary (15 predefined keywords)
# Cell 4 : Build NewsAPI Query Parameters
# Cell 5 : Fetch Loop (pagination, retry, rate-limit handling)
# Cell 6 : Normalize Raw API Responses
# Cell 7 : Deduplicate Articles (URL-based)
# Cell 8 : Save Raw Outputs (week-partitioned)
# Cell 9 : Weekly Fetch Summary (lightweight report)
#
#
# Notes
# -----
# - The execution target is the most recent *completed* week to ensure
#   temporal consistency in weekly trend analysis.
# - API credentials must be provided via environment variables
#   (e.g., NEWSAPI_KEY); no secrets should be hard-coded.
# - The output of this notebook is intentionally kept "raw":
#   minimal transformation, maximal traceability to the original API responses.
# - All outputs are partitioned by ISO year-week (YYYY-WW) to support
#   incremental updates and reproducible backfills.
# - When using the NewsAPI Developer plan, the /v2/everything endpoint is
#   capped at 100 retrievable results per query; this notebook is designed
#   to operate safely under that constraint.
# - This notebook is safe to rerun for the same week; downstream logic
#   should handle idempotency and deduplication.


In [3]:
# ============================================================
# Cell 1: Imports & Global Config
# ============================================================

# --- Standard library imports ---
import os
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, List, Any

# --- Third-party imports ---
import requests
import pandas as pd

# --- Environment variables ---
from dotenv import load_dotenv

# Explicitly load env.txt (instead of default .env)
load_dotenv("env.txt")
NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
if NEWSAPI_KEY is None:
    raise RuntimeError(
        "NEWSAPI_KEY is not set. Please provide it via an environment variable."
    )

# --- NewsAPI base configuration ---
NEWSAPI_ENDPOINT = "https://newsapi.org/v2/everything"
DEFAULT_LANGUAGE = "en"
DEFAULT_PAGE_SIZE = 100          # Max allowed by NewsAPI
DEFAULT_SORT_BY = "publishedAt"  # Ensures chronological ordering

# --- Rate limit & retry configuration ---
REQUEST_TIMEOUT_SEC = 30
MAX_RETRIES = 3
RETRY_SLEEP_SEC = 5              # Backoff for transient failures
RATE_LIMIT_SLEEP_SEC = 1         # Small delay between requests (safety)

# --- Project paths ---
PROJECT_ROOT = Path(".").resolve()
DATA_DIR = PROJECT_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw" / "newsapi"

# Ensure output directories exist
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

# --- Logging / display options ---
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)

print("Configuration loaded successfully.")
print(f"Raw data directory: {RAW_DATA_DIR}")


Configuration loaded successfully.
Raw data directory: /Users/yuetoya/Desktop/researchOS100-private/notebooks/data/raw/newsapi


In [4]:
# ============================================================
# Cell 2: Target Week (Automatic Weekly Time Window)
# ============================================================

# This notebook targets the most recent *completed* ISO week.
# This avoids partial-week data and ensures consistency for
# weekly trend aggregation.

# Current time (UTC is used for consistency across environments)
now_utc = datetime.now(timezone.utc)

# Determine the start of the current ISO week (Monday 00:00 UTC)
current_week_start = now_utc - timedelta(
    days=now_utc.isoweekday() - 1,
    hours=now_utc.hour,
    minutes=now_utc.minute,
    seconds=now_utc.second,
    microseconds=now_utc.microsecond,
)

# The target week is the week immediately preceding the current week
target_week_start = current_week_start - timedelta(weeks=1)
target_week_end = current_week_start - timedelta(seconds=1)

# ISO year-week for partitioning and logging
iso_year, iso_week, _ = target_week_start.isocalendar()
week_label = f"{iso_year}-W{iso_week:02d}"

# Format dates for NewsAPI (ISO 8601)
newsapi_from = target_week_start.strftime("%Y-%m-%dT%H:%M:%SZ")
newsapi_to = target_week_end.strftime("%Y-%m-%dT%H:%M:%SZ")

print("Target week resolved successfully.")
print(f"ISO week        : {week_label}")
print(f"Week start (UTC): {target_week_start.isoformat()}")
print(f"Week end   (UTC): {target_week_end.isoformat()}")
print(f"NewsAPI from    : {newsapi_from}")
print(f"NewsAPI to      : {newsapi_to}")


Target week resolved successfully.
ISO week        : 2025-W52
Week start (UTC): 2025-12-22T00:00:00+00:00
Week end   (UTC): 2025-12-28T23:59:59+00:00
NewsAPI from    : 2025-12-22T00:00:00Z
NewsAPI to      : 2025-12-28T23:59:59Z


In [5]:
# ============================================================
# Cell 3: Load Keyword Dictionary (15 keywords with editable aliases)
# ============================================================

import ipywidgets as widgets
from IPython.display import display

# ------------------------------------------------------------------
# Default keyword definitions (canonical + category)
# ------------------------------------------------------------------
# NOTE:
# - Canonical set is fixed (15 keywords).
# - Aliases are editable via widgets (comma-separated).
# - A NewsAPI query string is auto-generated from aliases.
# - "Foundation Models" is replaced with "Robotics" per the latest decision.

DEFAULT_KEYWORDS = [
    # AI / Robotics (4)
    {"keyword_id": "generative_ai", "canonical": "Generative AI", "category": "AI"},
    {"keyword_id": "ai_agents", "canonical": "AI Agents", "category": "AI"},
    {"keyword_id": "robotics", "canonical": "Robotics", "category": "AI"},
    {"keyword_id": "edge_ai", "canonical": "Edge AI", "category": "AI"},

    # Healthtech (4)
    {"keyword_id": "digital_health", "canonical": "Digital Health", "category": "Healthtech"},
    {"keyword_id": "ai_healthcare", "canonical": "AI in Healthcare", "category": "Healthtech"},
    {"keyword_id": "precision_medicine", "canonical": "Precision Medicine", "category": "Healthtech"},
    {"keyword_id": "clinical_ai", "canonical": "Clinical AI", "category": "Healthtech"},

    # Climate (3)
    {"keyword_id": "climate_tech", "canonical": "Climate Tech", "category": "Climate"},
    {"keyword_id": "carbon_capture", "canonical": "Carbon Capture", "category": "Climate"},
    {"keyword_id": "energy_storage", "canonical": "Energy Storage", "category": "Climate"},

    # Deeptech / Frontier (4)
    {"keyword_id": "quantum_computing", "canonical": "Quantum Computing", "category": "Deeptech"},
    {"keyword_id": "advanced_materials", "canonical": "Advanced Materials", "category": "Deeptech"},
    {"keyword_id": "nuclear_fusion", "canonical": "Nuclear Fusion", "category": "Deeptech"},
    {"keyword_id": "spacetech", "canonical": "SpaceTech", "category": "Deeptech"},
]

keywords_df = pd.DataFrame(DEFAULT_KEYWORDS)

# ------------------------------------------------------------------
# Default aliases (editable)
# ------------------------------------------------------------------
# Guidance:
# - Keep aliases as common surface forms that appear in headlines.
# - Avoid overly broad terms that create excessive noise.
# - Add/remove aliases over time as the market vocabulary evolves.

DEFAULT_ALIASES = {
    # AI / Robotics
    "generative_ai": "generative AI, GenAI",
    "ai_agents": "AI agent, autonomous agent",
    "robotics": "robotics, autonomous robot, industrial robot, humanoid robot",
    "edge_ai": "edge AI, on-device AI, on device AI",

    # Healthtech
    "digital_health": "digital health, health IT, healthcare IT",
    "ai_healthcare": "healthcare AI, medical AI, AI in healthcare",
    "precision_medicine": "precision medicine, personalized medicine",
    "clinical_ai": "clinical AI, clinical decision support, CDS",

    # Climate
    "climate_tech": "climate tech, climate technology, cleantech",
    "carbon_capture": "carbon capture, CCS, carbon removal",
    "energy_storage": "energy storage, battery storage, grid storage",

    # Deeptech / Frontier
    "quantum_computing": "quantum computing, quantum technology",
    "advanced_materials": "advanced materials, material science, materials science",
    "nuclear_fusion": "nuclear fusion, fusion energy",
    "spacetech": "space technology, space startup, satellite, space launch",
}

# Optional global exclusions to reduce common noise (can be extended later).
# These are applied in later cells when building request params.
GLOBAL_EXCLUSIONS = [
    "job", "jobs", "hiring", "recruiting", "careers"
]

# ------------------------------------------------------------------
# Widgets for alias editing
# ------------------------------------------------------------------
alias_widgets = {}

print("Edit aliases if needed (comma-separated):\n")

for row in keywords_df.itertuples(index=False):
    w = widgets.Text(
        value=DEFAULT_ALIASES.get(row.keyword_id, ""),
        description=row.canonical,
        layout=widgets.Layout(width="85%"),
        style={"description_width": "220px"},
        placeholder="comma-separated aliases (e.g., term1, term2, term3)"
    )
    alias_widgets[row.keyword_id] = w
    display(w)

# ------------------------------------------------------------------
# Build final keyword table (canonical + aliases + NewsAPI query)
# ------------------------------------------------------------------

def _build_or_query_from_aliases(aliases_csv: str) -> str:
    """
    Convert a comma-separated alias string into a NewsAPI-friendly OR query:
      term1, term2 -> "term1" OR "term2"

    Returns an empty string if no aliases are provided.
    """
    if aliases_csv is None:
        return ""
    terms = [t.strip() for t in str(aliases_csv).split(",") if t.strip()]
    if not terms:
        return ""
    return " OR ".join([f'"{t}"' for t in terms])

def build_keyword_table() -> pd.DataFrame:
    records = []
    for row in keywords_df.itertuples(index=False):
        aliases = alias_widgets[row.keyword_id].value
        query = _build_or_query_from_aliases(aliases)

        records.append({
            "keyword_id": row.keyword_id,
            "canonical": row.canonical,
            "category": row.category,
            "aliases": aliases,
            "query": query
        })
    return pd.DataFrame(records)

keywords_df_final = build_keyword_table()

# Validate
if keywords_df_final["query"].isna().any() or (keywords_df_final["query"].str.len() == 0).any():
    empty = keywords_df_final[keywords_df_final["query"].str.len() == 0][["keyword_id", "canonical"]]
    raise ValueError(
        "Some keywords have empty queries (aliases missing). "
        "Please fill aliases for all keywords.\n"
        f"{empty}"
    )

print("\nFinal keyword dictionary (preview):")
display(
    keywords_df_final[["keyword_id", "canonical", "category", "aliases", "query"]]
    .sort_values(["category", "canonical"])
    .reset_index(drop=True)
)

print("\nGlobal exclusions (to be applied when building request params):")
print(GLOBAL_EXCLUSIONS)


Edit aliases if needed (comma-separated):



Text(value='generative AI, GenAI', description='Generative AI', layout=Layout(width='85%'), placeholder='comma…

Text(value='AI agent, autonomous agent', description='AI Agents', layout=Layout(width='85%'), placeholder='com…

Text(value='robotics, autonomous robot, industrial robot, humanoid robot', description='Robotics', layout=Layo…

Text(value='edge AI, on-device AI, on device AI', description='Edge AI', layout=Layout(width='85%'), placehold…

Text(value='digital health, health IT, healthcare IT', description='Digital Health', layout=Layout(width='85%'…

Text(value='healthcare AI, medical AI, AI in healthcare', description='AI in Healthcare', layout=Layout(width=…

Text(value='precision medicine, personalized medicine', description='Precision Medicine', layout=Layout(width=…

Text(value='clinical AI, clinical decision support, CDS', description='Clinical AI', layout=Layout(width='85%'…

Text(value='climate tech, climate technology, cleantech', description='Climate Tech', layout=Layout(width='85%…

Text(value='carbon capture, CCS, carbon removal', description='Carbon Capture', layout=Layout(width='85%'), pl…

Text(value='energy storage, battery storage, grid storage', description='Energy Storage', layout=Layout(width=…

Text(value='quantum computing, quantum technology', description='Quantum Computing', layout=Layout(width='85%'…

Text(value='advanced materials, material science, materials science', description='Advanced Materials', layout…

Text(value='nuclear fusion, fusion energy', description='Nuclear Fusion', layout=Layout(width='85%'), placehol…

Text(value='space technology, space startup, satellite, space launch', description='SpaceTech', layout=Layout(…


Final keyword dictionary (preview):


Unnamed: 0,keyword_id,canonical,category,aliases,query
0,ai_agents,AI Agents,AI,"AI agent, autonomous agent","""AI agent"" OR ""autonomous agent"""
1,edge_ai,Edge AI,AI,"edge AI, on-device AI, on device AI","""edge AI"" OR ""on-device AI"" OR ""on device AI"""
2,generative_ai,Generative AI,AI,"generative AI, GenAI","""generative AI"" OR ""GenAI"""
3,robotics,Robotics,AI,"robotics, autonomous robot, industrial robot, ...","""robotics"" OR ""autonomous robot"" OR ""industria..."
4,carbon_capture,Carbon Capture,Climate,"carbon capture, CCS, carbon removal","""carbon capture"" OR ""CCS"" OR ""carbon removal"""
5,climate_tech,Climate Tech,Climate,"climate tech, climate technology, cleantech","""climate tech"" OR ""climate technology"" OR ""cle..."
6,energy_storage,Energy Storage,Climate,"energy storage, battery storage, grid storage","""energy storage"" OR ""battery storage"" OR ""grid..."
7,advanced_materials,Advanced Materials,Deeptech,"advanced materials, material science, material...","""advanced materials"" OR ""material science"" OR ..."
8,nuclear_fusion,Nuclear Fusion,Deeptech,"nuclear fusion, fusion energy","""nuclear fusion"" OR ""fusion energy"""
9,quantum_computing,Quantum Computing,Deeptech,"quantum computing, quantum technology","""quantum computing"" OR ""quantum technology"""



Global exclusions (to be applied when building request params):
['job', 'jobs', 'hiring', 'recruiting', 'careers']


In [6]:
# ============================================================
# Cell 4: Build Query Params
# ============================================================

# This cell constructs NewsAPI request parameter dictionaries
# for each keyword and the target weekly time window.
#
# The output of this cell is a list of request-ready parameter
# objects that can be consumed directly by the fetch loop.

from urllib.parse import urlencode

# ------------------------------------------------------------------
# Helper: build full query string with global exclusions
# ------------------------------------------------------------------

def build_full_query(base_query: str, exclusions: list) -> str:
    """
    Combine a base OR-query with global exclusion terms.

    Example:
      base_query = '"robotics" OR "autonomous robot"'
      exclusions = ["job", "hiring"]

      -> ('"robotics" OR "autonomous robot"') AND NOT (job OR hiring)
    """
    if not exclusions:
        return base_query

    exclusion_clause = " OR ".join(exclusions)
    return f"({base_query}) AND NOT ({exclusion_clause})"

# ------------------------------------------------------------------
# Build parameter sets for each keyword
# ------------------------------------------------------------------

query_params_list = []

for row in keywords_df_final.itertuples(index=False):
    if not row.query:
        continue

    full_query = build_full_query(
        base_query=row.query,
        exclusions=GLOBAL_EXCLUSIONS
    )

    params = {
        "q": full_query,
        "from": newsapi_from,
        "to": newsapi_to,
        "language": DEFAULT_LANGUAGE,
        "sortBy": DEFAULT_SORT_BY,
        "pageSize": DEFAULT_PAGE_SIZE,
        # "page" will be added dynamically in the fetch loop
    }

    query_params_list.append({
        "keyword_id": row.keyword_id,
        "canonical": row.canonical,
        "category": row.category,
        "params": params
    })

# ------------------------------------------------------------------
# Sanity check / preview
# ------------------------------------------------------------------

print(f"Built query parameters for {len(query_params_list)} keywords.\n")

for item in query_params_list:
    print(f"[{item['canonical']}]")
    print(f"Query : {item['params']['q']}")
    print(f"From  : {item['params']['from']}")
    print(f"To    : {item['params']['to']}")
    print("-" * 80)


Built query parameters for 15 keywords.

[Generative AI]
Query : ("generative AI" OR "GenAI") AND NOT (job OR jobs OR hiring OR recruiting OR careers)
From  : 2025-12-22T00:00:00Z
To    : 2025-12-28T23:59:59Z
--------------------------------------------------------------------------------
[AI Agents]
Query : ("AI agent" OR "autonomous agent") AND NOT (job OR jobs OR hiring OR recruiting OR careers)
From  : 2025-12-22T00:00:00Z
To    : 2025-12-28T23:59:59Z
--------------------------------------------------------------------------------
[Robotics]
Query : ("robotics" OR "autonomous robot" OR "industrial robot" OR "humanoid robot") AND NOT (job OR jobs OR hiring OR recruiting OR careers)
From  : 2025-12-22T00:00:00Z
To    : 2025-12-28T23:59:59Z
--------------------------------------------------------------------------------
[Edge AI]
Query : ("edge AI" OR "on-device AI" OR "on device AI") AND NOT (job OR jobs OR hiring OR recruiting OR careers)
From  : 2025-12-22T00:00:00Z
To    : 2025-12

In [8]:
# ============================================================
# Cell 5: Fetch Loop (Pagination + Retry)
# ============================================================

# This cell executes the weekly fetch against NewsAPI for each keyword.
# Key behaviors:
# - Pagination: iterate page=1..N where N is derived from totalResults
# - Retry: retry transient failures (timeouts, 5xx) with backoff
# - Rate-limit handling: if 429 is returned, sleep and retry
#
# Output:
# - raw_records: list[dict] of article records enriched with keyword metadata
# - fetch_log_df: per-request log for debugging and auditing

# Developer plan cap: NewsAPI "everything" supports only up to 100 results in total.
# If you are on a paid plan, set this to None.
DEVELOPER_MAX_RESULTS = 100

raw_records: List[Dict[str, Any]] = []
request_logs: List[Dict[str, Any]] = []


def _request_with_retry(
    endpoint: str,
    params: Dict[str, Any],
    headers: Dict[str, str],
    keyword_id: str,
    canonical: str,
    category: str,
) -> Dict[str, Any]:
    """
    Make a single NewsAPI request with retry logic.
    Returns parsed JSON on success; raises RuntimeError on repeated failure.
    """
    last_err = None

    for attempt in range(1, MAX_RETRIES + 1):
        t0 = time.time()
        try:
            resp = requests.get(
                endpoint,
                params=params,
                headers=headers,
                timeout=REQUEST_TIMEOUT_SEC,
            )
            elapsed = time.time() - t0

            log_base = {
                "ts_utc": datetime.now(timezone.utc).isoformat(),
                "keyword_id": keyword_id,
                "canonical": canonical,
                "category": category,
                "attempt": attempt,
                "elapsed_sec": round(elapsed, 3),
                "status_code": resp.status_code,
                "page": params.get("page"),
                "pageSize": params.get("pageSize"),
            }

            # Rate limit
            if resp.status_code == 429:
                request_logs.append({**log_base, "event": "rate_limited"})
                time.sleep(RETRY_SLEEP_SEC * attempt)
                continue

            # Other non-200
            if resp.status_code != 200:
                request_logs.append(
                    {**log_base, "event": "http_error", "error_text": resp.text[:500]}
                )
                # Retry on 5xx
                if 500 <= resp.status_code < 600:
                    time.sleep(RETRY_SLEEP_SEC * attempt)
                    continue
                # For 4xx (except 429), do not keep retrying
                raise RuntimeError(f"HTTP {resp.status_code}: {resp.text[:500]}")

            data = resp.json()
            request_logs.append({**log_base, "event": "success", "totalResults": data.get("totalResults")})
            return data

        except (requests.Timeout, requests.ConnectionError) as e:
            elapsed = time.time() - t0
            last_err = e
            request_logs.append({
                "ts_utc": datetime.now(timezone.utc).isoformat(),
                "keyword_id": keyword_id,
                "canonical": canonical,
                "category": category,
                "attempt": attempt,
                "elapsed_sec": round(elapsed, 3),
                "status_code": None,
                "page": params.get("page"),
                "pageSize": params.get("pageSize"),
                "event": "network_error",
                "error_text": repr(e)[:500],
            })
            time.sleep(RETRY_SLEEP_SEC * attempt)
            continue

    raise RuntimeError(
        f"Failed after {MAX_RETRIES} attempts for keyword={canonical} "
        f"(last_err={repr(last_err)})"
    )


headers = {"X-Api-Key": NEWSAPI_KEY}

print(f"Starting fetch for {len(query_params_list)} keywords...")
print(f"Target week: {week_label} ({newsapi_from} -> {newsapi_to})")

for idx, item in enumerate(query_params_list, start=1):
    keyword_id = item["keyword_id"]
    canonical = item["canonical"]
    category = item["category"]

    base_params = dict(item["params"])  # copy
    base_params["page"] = 1

    print(f"\n[{idx}/{len(query_params_list)}] Fetching: {canonical} ({category})")

    # --- First page (to get totalResults) ---
    data = _request_with_retry(
        endpoint=NEWSAPI_ENDPOINT,
        params=base_params,
        headers=headers,
        keyword_id=keyword_id,
        canonical=canonical,
        category=category,
    )

    total_results = int(data.get("totalResults", 0) or 0)
    articles = data.get("articles", []) or []

    # Enrich and store
    for a in articles:
        raw_records.append({
            "keyword_id": keyword_id,
            "canonical": canonical,
            "category": category,
            "week_label": week_label,
            "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
            "query": base_params.get("q"),
            **a
        })

    # --- Pagination ---
    # NewsAPI returns up to pageSize results per page.
    # total_pages = ceil(total_results / pageSize)
    page_size = int(base_params["pageSize"])

    # NewsAPI Developer accounts are capped at 100 total results for /v2/everything.
    # So even if totalResults is larger, we cannot fetch beyond page=1.
    effective_total = total_results
    capped = False
    
    if DEVELOPER_MAX_RESULTS is not None and total_results > DEVELOPER_MAX_RESULTS:
        effective_total = DEVELOPER_MAX_RESULTS
        capped = True
    
    total_pages = (effective_total + page_size - 1) // page_size
    total_pages = max(total_pages, 1)
    
    if capped:
        print(f"  totalResults={total_results} (capped to {DEVELOPER_MAX_RESULTS} for Developer plan) -> pages={total_pages}")
    else:
        if total_pages > 1:
            print(f"  totalResults={total_results} -> pages={total_pages}")


print("\nFetch completed.")
print(f"Total raw records collected: {len(raw_records)}")

fetch_log_df = pd.DataFrame(request_logs)
print("\nRequest log (tail):")
display(fetch_log_df.tail(20))


Starting fetch for 15 keywords...
Target week: 2025-W52 (2025-12-22T00:00:00Z -> 2025-12-28T23:59:59Z)

[1/15] Fetching: Generative AI (AI)
  totalResults=387 (capped to 100 for Developer plan) -> pages=1

[2/15] Fetching: AI Agents (AI)
  totalResults=245 (capped to 100 for Developer plan) -> pages=1

[3/15] Fetching: Robotics (AI)
  totalResults=275 (capped to 100 for Developer plan) -> pages=1

[4/15] Fetching: Edge AI (AI)

[5/15] Fetching: Digital Health (Healthtech)
  totalResults=107 (capped to 100 for Developer plan) -> pages=1

[6/15] Fetching: AI in Healthcare (Healthtech)

[7/15] Fetching: Precision Medicine (Healthtech)

[8/15] Fetching: Clinical AI (Healthtech)
  totalResults=107 (capped to 100 for Developer plan) -> pages=1

[9/15] Fetching: Climate Tech (Climate)

[10/15] Fetching: Carbon Capture (Climate)

[11/15] Fetching: Energy Storage (Climate)
  totalResults=141 (capped to 100 for Developer plan) -> pages=1

[12/15] Fetching: Quantum Computing (Deeptech)
  totalRes

Unnamed: 0,ts_utc,keyword_id,canonical,category,attempt,elapsed_sec,status_code,page,pageSize,event,totalResults
0,2026-01-03T05:12:01.244751+00:00,generative_ai,Generative AI,AI,1,0.47,200,1,100,success,387
1,2026-01-03T05:12:01.649324+00:00,ai_agents,AI Agents,AI,1,0.401,200,1,100,success,245
2,2026-01-03T05:12:02.060351+00:00,robotics,Robotics,AI,1,0.408,200,1,100,success,275
3,2026-01-03T05:12:02.403733+00:00,edge_ai,Edge AI,AI,1,0.341,200,1,100,success,78
4,2026-01-03T05:12:03.983435+00:00,digital_health,Digital Health,Healthtech,1,1.579,200,1,100,success,107
5,2026-01-03T05:12:04.297113+00:00,ai_healthcare,AI in Healthcare,Healthtech,1,0.31,200,1,100,success,22
6,2026-01-03T05:12:04.599098+00:00,precision_medicine,Precision Medicine,Healthtech,1,0.3,200,1,100,success,34
7,2026-01-03T05:12:04.997018+00:00,clinical_ai,Clinical AI,Healthtech,1,0.396,200,1,100,success,107
8,2026-01-03T05:12:05.335922+00:00,climate_tech,Climate Tech,Climate,1,0.335,200,1,100,success,21
9,2026-01-03T05:12:05.647940+00:00,carbon_capture,Carbon Capture,Climate,1,0.311,200,1,100,success,44


In [9]:
# ============================================================
# Cell 6: Normalize Records
# ============================================================

# This cell converts the raw_records (list of dicts) into a normalized
# tabular format suitable for downstream deduplication and storage.
#
# Notes:
# - We keep the normalized schema intentionally "raw": minimal transformation,
#   maximal traceability to the original NewsAPI fields.
# - We also extract a few convenience fields (e.g., source_name) for analytics.
# - Timestamps are kept as strings (ISO 8601) to avoid timezone ambiguity;
#   downstream notebooks can parse as needed.

if not raw_records:
    print("No records to normalize (raw_records is empty).")
    normalized_df = pd.DataFrame()
else:
    df = pd.DataFrame(raw_records)

    # Ensure expected top-level columns exist (NewsAPI may omit some fields)
    expected_cols = [
        "keyword_id", "canonical", "category", "week_label", "fetched_at_utc", "query",
        "source", "author", "title", "description", "url", "urlToImage", "publishedAt", "content"
    ]
    for c in expected_cols:
        if c not in df.columns:
            df[c] = None

    # Flatten nested "source" dict -> (source_id, source_name)
    def _safe_source_id(x):
        if isinstance(x, dict):
            return x.get("id")
        return None

    def _safe_source_name(x):
        if isinstance(x, dict):
            return x.get("name")
        return None

    df["source_id"] = df["source"].apply(_safe_source_id)
    df["source_name"] = df["source"].apply(_safe_source_name)

    # Normalize text fields (strip whitespace)
    text_cols = ["author", "title", "description", "url", "urlToImage", "publishedAt", "content", "source_name"]
    for c in text_cols:
        df[c] = df[c].astype("string").str.strip()

    # Create a stable URL key for dedup (light normalization)
    # (Optional) remove common tracking params later if needed
    df["url_key"] = df["url"]

    # Minimal typed conversions
    # Keep publishedAt as string for now; parse downstream if needed.
    # df["published_at_ts"] = pd.to_datetime(df["publishedAt"], utc=True, errors="coerce")

    # Select and reorder output columns
    normalized_cols = [
        "week_label",
        "keyword_id", "canonical", "category",
        "fetched_at_utc",
        "publishedAt",
        "source_id", "source_name",
        "author",
        "title", "description",
        "url", "url_key", "urlToImage",
        "content",
        "query",
    ]

    normalized_df = df[normalized_cols].copy()

    print("Normalization completed.")
    print(f"Normalized rows: {len(normalized_df):,}")
    display(normalized_df.head(10))


Normalization completed.
Normalized rows: 1,080


Unnamed: 0,week_label,keyword_id,canonical,category,fetched_at_utc,publishedAt,source_id,source_name,author,title,description,url,url_key,urlToImage,content,query
0,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247325+00:00,2025-12-28T23:20:06Z,,pymnts.com,PYMNTS,Hotels Swap Online Travel Agents for AI Agents,Consumers are growing comfortable with letting...,https://www.pymnts.com/travel-payments/2025/ho...,https://www.pymnts.com/travel-payments/2025/ho...,https://www.pymnts.com/wp-content/uploads/2025...,Consumers are growing comfortable with letting...,"(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."
1,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247350+00:00,2025-12-28T23:01:00Z,,Diepresse.com,By Helmut Reisinger,Why Geopolitics and AI Are Now Core to Busines...,The digital landscape has become a central are...,https://www.diepresse.com/20407798/why-geopoli...,https://www.diepresse.com/20407798/why-geopoli...,https://img.diepresse.com/public/incoming/arhh...,The digital landscape has become a central are...,"(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."
2,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247357+00:00,2025-12-28T22:00:00Z,,OilPrice.com,Michael Kern,The AI Arms Race Is Cracking Open the Nuclear ...,We are seeing a violent collision between two ...,https://oilprice.com/Alternative-Energy/Nuclea...,https://oilprice.com/Alternative-Energy/Nuclea...,https://d32r1sh890xpii.cloudfront.net/article/...,Trump Media is entering the…\r\nThe UK's artif...,"(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."
3,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247363+00:00,2025-12-28T21:31:47Z,,How-To Geek,Sydney Butler,"If Blender is so good, why isn’t Hollywood usi...","If Blender is so good, why isn’t Hollywood usi...",https://www.howtogeek.com/if-blender-is-so-goo...,https://www.howtogeek.com/if-blender-is-so-goo...,https://static0.howtogeekimages.com/wordpress/...,"The movie Flow was made on a small budget, wit...","(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."
4,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247369+00:00,2025-12-28T21:00:00Z,,Slashdot.org,EditorDavid,Did Tim Cook Post AI Slop in His Christmas Mes...,Artist Keith Thomson is a modern (and whimsica...,https://apple.slashdot.org/story/25/12/28/2048...,https://apple.slashdot.org/story/25/12/28/2048...,https://a.fsdn.com/sd/topics/ai_64.png,Artist Keith Thomson is a modern (and whimsica...,"(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."
5,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247395+00:00,2025-12-28T19:28:39Z,,Search Engine Journal,Roger Montti,"Ahrefs Tested AI Misinformation, But Proved So...",Ahrefs research showed that biased inputs can ...,https://www.searchenginejournal.com/ahrefs-tes...,https://www.searchenginejournal.com/ahrefs-tes...,https://cdn.searchenginejournal.com/wp-content...,Ahrefs tested how AI systems behave when they’...,"(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."
6,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247402+00:00,2025-12-28T17:17:24Z,,Livemint,Kanishka Singharia,How ChatGPT became turning point for ex-Google...,Ex-Googlers closed a $2M a year startup to foc...,https://www.livemint.com/companies/people/how-...,https://www.livemint.com/companies/people/how-...,https://www.livemint.com/lm-img/img/2025/12/28...,"Former Google employees, both aged 33, took a ...","(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."
7,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247414+00:00,2025-12-28T17:15:54Z,,IPWatchdog.com,Eileen McDermott,"A Year of Change, Transition and ‘Recalibratio...",Each year IPWatchdog surveys the IP community ...,https://ipwatchdog.com/2025/12/28/year-change-...,https://ipwatchdog.com/2025/12/28/year-change-...,https://ipwatchdog.com/wp-content/uploads/2025...,"Taken together, 2025 was a year of recalibrati...","(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."
8,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247421+00:00,2025-12-28T17:05:39Z,,GamesRadar+,Kaan Serin,"Witchfire CEO says Divinity devs are ""definite...","Larian just got ""got a little bit unlucky""",https://www.gamesradar.com/games/rpg/witchfire...,https://www.gamesradar.com/games/rpg/witchfire...,https://cdn.mos.cms.futurecdn.net/FPheaHQoGz5G...,The Astronauts CEO Adrian Chmielarz - develope...,"(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."
9,2025-W52,generative_ai,Generative AI,AI,2026-01-03T05:12:01.247428+00:00,2025-12-28T15:35:32Z,,Tiktok.com,,Small Business Marketing: A Complete Beginner'...,Marketing a business in 2025 requires an onlin...,https://ads.tiktok.com/business/en-US/guides/s...,https://ads.tiktok.com/business/en-US/guides/s...,https://sf16-website-login.neutral.ttwstatic.c...,Marketing a business in 2025 requires an onlin...,"(""generative AI"" OR ""GenAI"") AND NOT (job OR j..."


In [10]:
# ============================================================
# Cell 7: Deduplicate
# ============================================================

# This cell removes duplicates to avoid inflating counts and storage.
#
# Dedup strategy (practical / robust):
# - Primary key: url_key (derived from url)
# - Keep the first occurrence by (publishedAt, source_name, title) ordering
#
# Notes:
# - The same article can match multiple keywords; we keep one "article row"
#   per unique URL to create a clean raw corpus for downstream processing.
# - We preserve a mapping table (keyword_hits_df) that records which keywords
#   matched each URL, so you can still compute per-keyword metrics later.

if normalized_df.empty:
    print("No rows to deduplicate (normalized_df is empty).")
    dedup_df = normalized_df.copy()
    keyword_hits_df = pd.DataFrame()
else:
    # --- 1) Build keyword-hit mapping (URL -> matched keywords) ---
    keyword_hits_df = (
        normalized_df[["week_label", "url_key", "keyword_id", "canonical", "category"]]
        .dropna(subset=["url_key"])
        .drop_duplicates()
        .groupby(["week_label", "url_key"], as_index=False)
        .agg(
            matched_keyword_ids=("keyword_id", lambda s: sorted(set(s))),
            matched_canonicals=("canonical", lambda s: sorted(set(s))),
            matched_categories=("category", lambda s: sorted(set(s))),
            matched_count=("keyword_id", lambda s: len(set(s))),
        )
    )

    # --- 2) Create a stable ordering so "keep first" is deterministic ---
    work_df = normalized_df.copy()

    # Ensure publishedAt exists for ordering; fall back to empty string
    work_df["publishedAt_sort"] = work_df["publishedAt"].fillna("").astype(str)

    # Deterministic sort: oldest->newest, then source/title
    work_df = work_df.sort_values(
        by=["publishedAt_sort", "source_name", "title", "url_key"],
        ascending=[True, True, True, True],
        kind="mergesort",  # stable sort
    )

    before = len(work_df)

    # --- 3) Deduplicate by URL key ---
    dedup_df = (
        work_df.dropna(subset=["url_key"])
              .drop_duplicates(subset=["week_label", "url_key"], keep="first")
              .drop(columns=["publishedAt_sort"])
              .reset_index(drop=True)
    )

    after = len(dedup_df)

    # --- 4) Join back the matched-keyword info (optional but useful) ---
    dedup_df = dedup_df.merge(
        keyword_hits_df,
        on=["week_label", "url_key"],
        how="left",
        validate="one_to_one",
    )

    print("Deduplication completed.")
    print(f"Rows before dedup: {before:,}")
    print(f"Rows after  dedup: {after:,}")
    print(f"Duplicates removed: {before - after:,}")

    display(dedup_df.head(10))
    print("\nKeyword-hit mapping (preview):")
    display(keyword_hits_df.head(10))


Deduplication completed.
Rows before dedup: 1,080
Rows after  dedup: 1,013
Duplicates removed: 67


Unnamed: 0,week_label,keyword_id,canonical,category,fetched_at_utc,publishedAt,source_id,source_name,author,title,description,url,url_key,urlToImage,content,query,matched_keyword_ids,matched_canonicals,matched_categories,matched_count
0,2025-W52,carbon_capture,Carbon Capture,Climate,2026-01-03T05:12:05.649879+00:00,2025-12-22T00:00:00Z,,Nature.com,"Hongli Wu, Jialun Xu, Junjie Tai, Jingkun Gao,...",Biomimetic Mn(III) porphyrin-catalyzed aromati...,Selective catalytic aromaticity-breaking epoxi...,https://www.nature.com/articles/s41467-025-673...,https://www.nature.com/articles/s41467-025-673...,,"<li>Zhang, L. &amp; Ritter, T. A perspective o...","(""carbon capture"" OR ""CCS"" OR ""carbon removal""...",[carbon_capture],[Carbon Capture],[Climate],1
1,2025-W52,advanced_materials,Advanced Materials,Deeptech,2026-01-03T05:12:06.878686+00:00,2025-12-22T00:00:00Z,,Royal Society of Chemistry,"Maria A Malandina, Sergei S. Leonchuk, Cheng Z...","""Flexible energy"": energy harvesting and stora...","J. Mater. Chem. A, 2026, Accepted ManuscriptDO...",https://pubs.rsc.org/en/content/articlelanding...,https://pubs.rsc.org/en/content/articlelanding...,,The rapid development of flexible electronics ...,"(""advanced materials"" OR ""material science"" OR...",[advanced_materials],[Advanced Materials],[Deeptech],1
2,2025-W52,advanced_materials,Advanced Materials,Deeptech,2026-01-03T05:12:06.878670+00:00,2025-12-22T00:00:00Z,,Royal Society of Chemistry,"Akash Balakrishnan, Milan Tom, Natarajan Rajam...",Engineered Cellulose-Supported Photocatalysts ...,"J. Mater. Chem. A, 2026, Accepted ManuscriptDO...",https://pubs.rsc.org/en/content/articlelanding...,https://pubs.rsc.org/en/content/articlelanding...,,The growing global demand for sustainable ener...,"(""advanced materials"" OR ""material science"" OR...",[advanced_materials],[Advanced Materials],[Deeptech],1
3,2025-W52,advanced_materials,Advanced Materials,Deeptech,2026-01-03T05:12:06.878675+00:00,2025-12-22T00:00:00Z,,Royal Society of Chemistry,"Mengran Liu, Kexin Liu, Zhuang Yan, Chenfei Ya...",Metal–(organic cocrystal) framework with a pho...,"J. Mater. Chem. A, 2026, Advance ArticleDOI: 1...",https://pubs.rsc.org/en/content/articlelanding...,https://pubs.rsc.org/en/content/articlelanding...,https://pubs.rsc.org/en/Content/Image/GA/D5TA0...,* \r\n Corresponding authors\r\na\r\n Key Labo...,"(""advanced materials"" OR ""material science"" OR...",[advanced_materials],[Advanced Materials],[Deeptech],1
4,2025-W52,advanced_materials,Advanced Materials,Deeptech,2026-01-03T05:12:06.878681+00:00,2025-12-22T00:00:00Z,,Royal Society of Chemistry,"Olusegun Oluwaseun Jimoh, Tolulope Ajuwon, Som...",PLGA nanoparticles in otoprotection and inner ...,"RSC Adv., 2026, 16,76-106DOI: 10.1039/D5RA0600...",https://pubs.rsc.org/en/content/articlelanding...,https://pubs.rsc.org/en/content/articlelanding...,https://pubs.rsc.org/en/Content/Image/GA/D5RA0...,* \r\n Corresponding authors\r\na\r\n Departme...,"(""advanced materials"" OR ""material science"" OR...",[advanced_materials],[Advanced Materials],[Deeptech],1
5,2025-W52,edge_ai,Edge AI,AI,2026-01-03T05:12:02.404709+00:00,2025-12-22T00:00:08Z,,CNX Software,Debashis Das,Forlinx FCU3011 – An NVIDIA Jetson Orin Nano f...,Forlinx Embedded has recently released the FCU...,https://www.cnx-software.com/2025/12/22/forlin...,https://www.cnx-software.com/2025/12/22/forlin...,https://www.cnx-software.com/wp-content/upload...,Forlinx Embedded has recently released the FCU...,"(""edge AI"" OR ""on-device AI"" OR ""on device AI""...",[edge_ai],[Edge AI],[AI],1
6,2025-W52,climate_tech,Climate Tech,Climate,2026-01-03T05:12:05.337034+00:00,2025-12-22T00:31:12Z,,Livemint,Sakshi Sadashiv,Aavishkaar Group looks to balance long-cycle c...,Aavishkaar Group is recalibrating its climate ...,https://www.livemint.com/companies/news/aavish...,https://www.livemint.com/companies/news/aavish...,https://www.livemint.com/lm-img/img/2025/12/21...,Impact investor Aavishkaar Capital is increasi...,"(""climate tech"" OR ""climate technology"" OR ""cl...","[carbon_capture, climate_tech]","[Carbon Capture, Climate Tech]",[Climate],2
7,2025-W52,advanced_materials,Advanced Materials,Deeptech,2026-01-03T05:12:06.878664+00:00,2025-12-22T00:45:20Z,the-times-of-india,The Times of India,Dipanjan Roy Chaudhury,Italian companies ready to seize opportunities...,Italy's Deputy Prime Minister Antonio Tajani h...,https://economictimes.indiatimes.com/news/econ...,https://economictimes.indiatimes.com/news/econ...,"https://img.etimg.com/thumb/msid-126112178,wid...","Italy, the second industrial powerhouse in Eur...","(""advanced materials"" OR ""material science"" OR...",[advanced_materials],[Advanced Materials],[Deeptech],1
8,2025-W52,climate_tech,Climate Tech,Climate,2026-01-03T05:12:05.337027+00:00,2025-12-22T00:55:00Z,,BusinessLine,K Vaitheeswaran,How a founder’s management style shapes the st...,Explore how different founder management style...,https://www.thehindubusinessline.com/specials/...,https://www.thehindubusinessline.com/specials/...,https://bl-i.thgim.com/public/incoming/yipxv7/...,Founders are wired differently from corporate ...,"(""climate tech"" OR ""climate technology"" OR ""cl...",[climate_tech],[Climate Tech],[Climate],1
9,2025-W52,ai_healthcare,AI in Healthcare,Healthtech,2026-01-03T05:12:04.298492+00:00,2025-12-22T01:51:58Z,,Histalk2.com,Jennifer,Morning Headlines 12/22/25,Holt exits New Mountain to create $30 billion ...,https://histalk2.com/2025/12/21/morning-headli...,https://histalk2.com/2025/12/21/morning-headli...,,Holt exits New Mountain to create $30 billion ...,"(""healthcare AI"" OR ""medical AI"" OR ""AI in hea...",[ai_healthcare],[AI in Healthcare],[Healthtech],1



Keyword-hit mapping (preview):


Unnamed: 0,week_label,url_key,matched_keyword_ids,matched_canonicals,matched_categories,matched_count
0,2025-W52,http://deadline.com/2025/12/harry-styles-surpr...,[spacetech],[SpaceTech],[Deeptech],1
1,2025-W52,http://digiday.com/media/here-are-the-biggest-...,"[ai_agents, generative_ai]","[AI Agents, Generative AI]",[AI],2
2,2025-W52,http://electrek.co/2025/12/23/tesla-signs-mass...,[energy_storage],[Energy Storage],[Climate],1
3,2025-W52,http://electrek.co/2025/12/26/elon-musk-drops-...,[energy_storage],[Energy Storage],[Climate],1
4,2025-W52,http://electrek.co/2025/12/26/honda-buys-out-e...,[energy_storage],[Energy Storage],[Climate],1
5,2025-W52,http://electrek.co/2025/12/28/opinion-its-time...,[energy_storage],[Energy Storage],[Climate],1
6,2025-W52,http://futurism.com/advanced-transport/tesla-r...,[robotics],[Robotics],[AI],1
7,2025-W52,http://futurism.com/future-society/trump-china...,[edge_ai],[Edge AI],[AI],1
8,2025-W52,http://futurism.com/science-energy/doomsday-gl...,[spacetech],[SpaceTech],[Deeptech],1
9,2025-W52,http://futurism.com/science-energy/google-co2-...,[energy_storage],[Energy Storage],[Climate],1


In [12]:
# ============================================================
# Cell 8: Save Raw Outputs (Week-partitioned)
# ============================================================

# This cell saves week-partitioned raw outputs to disk.
#
# Outputs:
# - articles.parquet : deduplicated article corpus (one row per URL per week)
# - keyword_hits.parquet : mapping table (URL -> matched keywords) for metrics
# - metadata.json : run metadata (week, time window, counts, plan caps)
#
# Directory structure:
# data/raw/newsapi/YYYY-WW/
#   - articles.parquet
#   - keyword_hits.parquet
#   - metadata.json

import json

# --- Resolve output directory for the target ISO week ---
week_dir = RAW_DATA_DIR / week_label
week_dir.mkdir(parents=True, exist_ok=True)

articles_path = week_dir / "articles.parquet"
hits_path = week_dir / "keyword_hits.parquet"
meta_path = week_dir / "metadata.json"

# --- Safety checks ---
if dedup_df.empty:
    print("⚠️ dedup_df is empty. Nothing to save for articles.")
else:
    dedup_df.to_parquet(articles_path, index=False)
    print(f"✅ Saved articles: {articles_path}  (rows={len(dedup_df):,})")

if "keyword_hits_df" in globals() and not keyword_hits_df.empty:
    keyword_hits_df.to_parquet(hits_path, index=False)
    print(f"✅ Saved keyword hits: {hits_path}  (rows={len(keyword_hits_df):,})")
else:
    print("⚠️ keyword_hits_df is empty or not defined. Nothing to save for keyword hits.")

# --- Save run metadata (for traceability / reproducibility) ---
run_metadata = {
    "week_label": week_label,
    "newsapi_from": newsapi_from,
    "newsapi_to": newsapi_to,
    "language": DEFAULT_LANGUAGE,
    "page_size": DEFAULT_PAGE_SIZE,
    "sort_by": DEFAULT_SORT_BY,
    "global_exclusions": GLOBAL_EXCLUSIONS,
    "developer_max_results": DEVELOPER_MAX_RESULTS,
    "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
    "counts": {
        "raw_records": int(len(raw_records)),
        "normalized_rows": int(len(normalized_df)) if "normalized_df" in globals() else None,
        "dedup_rows": int(len(dedup_df)) if "dedup_df" in globals() else None,
        "keyword_hits_rows": int(len(keyword_hits_df)) if "keyword_hits_df" in globals() else None,
    },
}

with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(run_metadata, f, ensure_ascii=False, indent=2)

print(f"✅ Saved metadata: {meta_path}")

# --- Preview saved paths ---
print("\nSaved files:")
for p in [articles_path, hits_path, meta_path]:
    if p.exists():
        print(f" - {p}")
    else:
        print(f" - (not created) {p}")


✅ Saved articles: /Users/yuetoya/Desktop/researchOS100-private/notebooks/data/raw/newsapi/2025-W52/articles.parquet  (rows=1,013)
✅ Saved keyword hits: /Users/yuetoya/Desktop/researchOS100-private/notebooks/data/raw/newsapi/2025-W52/keyword_hits.parquet  (rows=1,013)
✅ Saved metadata: /Users/yuetoya/Desktop/researchOS100-private/notebooks/data/raw/newsapi/2025-W52/metadata.json

Saved files:
 - /Users/yuetoya/Desktop/researchOS100-private/notebooks/data/raw/newsapi/2025-W52/articles.parquet
 - /Users/yuetoya/Desktop/researchOS100-private/notebooks/data/raw/newsapi/2025-W52/keyword_hits.parquet
 - /Users/yuetoya/Desktop/researchOS100-private/notebooks/data/raw/newsapi/2025-W52/metadata.json


In [13]:
# ============================================================
# Cell 9: Weekly Fetch Summary (Lightweight Report)
# ============================================================

# This cell produces a lightweight weekly report for quick inspection.
# It summarizes:
# - Request outcomes (success / rate-limited / errors)
# - Record counts by keyword/category
# - Cap indicators for the Developer plan (if applicable)
# - Top sources (by article count)
#
# This report is meant for human review after each weekly run.

# -------------------------------
# 1) Request-level summary
# -------------------------------
if "fetch_log_df" in globals() and not fetch_log_df.empty:
    print("=== Request Summary ===")
    req_summary = (
        fetch_log_df
        .groupby(["event"], as_index=False)
        .agg(
            requests=("event", "count"),
            avg_elapsed_sec=("elapsed_sec", "mean"),
            max_elapsed_sec=("elapsed_sec", "max"),
        )
        .sort_values("requests", ascending=False)
    )
    display(req_summary)

    # Errors by keyword (if any)
    error_events = {"http_error", "network_error", "rate_limited", "max_results_reached"}
    if fetch_log_df["event"].isin(error_events).any():
        print("\n=== Non-success Events by Keyword ===")
        non_success = (
            fetch_log_df[fetch_log_df["event"] != "success"]
            .groupby(["canonical", "event"], as_index=False)
            .agg(n=("event", "count"))
            .sort_values(["n", "canonical"], ascending=[False, True])
        )
        display(non_success)
else:
    print("No request log available (fetch_log_df is empty or not defined).")

# -------------------------------
# 2) Article counts (raw / normalized / dedup)
# -------------------------------
print("\n=== Record Counts ===")
counts_tbl = pd.DataFrame([{
    "week_label": week_label,
    "raw_records": len(raw_records) if "raw_records" in globals() else 0,
    "normalized_rows": len(normalized_df) if "normalized_df" in globals() else 0,
    "dedup_rows": len(dedup_df) if "dedup_df" in globals() else 0,
    "keyword_hits_rows": len(keyword_hits_df) if "keyword_hits_df" in globals() else 0,
}])
display(counts_tbl)

# -------------------------------
# 3) Per-keyword coverage (dedup corpus)
# -------------------------------
# Note: dedup_df contains one row per unique URL per week, but includes matched keyword arrays.
# We use keyword_hits_df for clean per-keyword counts.
if "keyword_hits_df" in globals() and not keyword_hits_df.empty:
    print("\n=== Keyword Coverage (unique URLs matched) ===")
    kw_cov = (
        keyword_hits_df
        .explode("matched_keyword_ids")
        .rename(columns={"matched_keyword_ids": "keyword_id"})
        .merge(
            keywords_df_final[["keyword_id", "canonical", "category"]],
            on="keyword_id",
            how="left",
            validate="many_to_one",
        )
        .groupby(["category", "canonical"], as_index=False)
        .agg(unique_urls=("url_key", "nunique"))
        .sort_values(["category", "unique_urls"], ascending=[True, False])
    )
    display(kw_cov)

    print("\n=== Category Coverage (unique URLs matched) ===")
    cat_cov = (
        kw_cov
        .groupby("category", as_index=False)
        .agg(unique_urls=("unique_urls", "sum"))
        .sort_values("unique_urls", ascending=False)
    )
    display(cat_cov)
else:
    print("\nNo keyword-hit mapping available (keyword_hits_df is empty or not defined).")

# -------------------------------
# 4) Cap indicators (Developer plan)
# -------------------------------
# If you applied the Developer cap logic in Cell 5, you can infer likely capping by:
# - keywords where totalResults reported > DEVELOPER_MAX_RESULTS
# This requires totalResults to be in the request log.
if (
    "fetch_log_df" in globals()
    and not fetch_log_df.empty
    and "totalResults" in fetch_log_df.columns
    and DEVELOPER_MAX_RESULTS is not None
):
    print("\n=== Potentially Capped Keywords (Developer Plan) ===")
    # totalResults is recorded per request; use the first-page success rows.
    first_page = fetch_log_df[(fetch_log_df["event"] == "success") & (fetch_log_df["page"] == 1)]
    capped = (
        first_page
        .groupby("canonical", as_index=False)
        .agg(totalResults=("totalResults", "max"))
        .sort_values("totalResults", ascending=False)
    )
    capped["likely_capped"] = capped["totalResults"] > DEVELOPER_MAX_RESULTS
    display(capped[capped["likely_capped"] == True].reset_index(drop=True))

# -------------------------------
# 5) Top sources (in dedup corpus)
# -------------------------------
if "dedup_df" in globals() and not dedup_df.empty:
    print("\n=== Top Sources (dedup corpus) ===")
    top_sources = (
        dedup_df
        .groupby("source_name", as_index=False)
        .agg(articles=("url_key", "nunique"))
        .sort_values("articles", ascending=False)
        .head(25)
    )
    display(top_sources)

print("\nWeekly fetch summary complete.")


=== Request Summary ===


Unnamed: 0,event,requests,avg_elapsed_sec,max_elapsed_sec
0,success,15,0.450067,1.579



=== Record Counts ===


Unnamed: 0,week_label,raw_records,normalized_rows,dedup_rows,keyword_hits_rows
0,2025-W52,1080,1080,1013,1013



=== Keyword Coverage (unique URLs matched) ===


Unnamed: 0,category,canonical,unique_urls
0,AI,AI Agents,100
2,AI,Generative AI,99
3,AI,Robotics,99
1,AI,Edge AI,76
6,Climate,Energy Storage,98
4,Climate,Carbon Capture,44
5,Climate,Climate Tech,21
9,Deeptech,Quantum Computing,100
10,Deeptech,SpaceTech,95
7,Deeptech,Advanced Materials,76



=== Category Coverage (unique URLs matched) ===


Unnamed: 0,category,unique_urls
0,AI,374
2,Deeptech,293
3,Healthtech,249
1,Climate,163



=== Potentially Capped Keywords (Developer Plan) ===


Unnamed: 0,canonical,totalResults,likely_capped
0,SpaceTech,585,True
1,Generative AI,387,True
2,Robotics,275,True
3,AI Agents,245,True
4,Energy Storage,141,True
5,Clinical AI,107,True
6,Digital Health,107,True
7,Quantum Computing,102,True



=== Top Sources (dedup corpus) ===


Unnamed: 0,source_name,articles
190,Pypi.org,125
93,GlobeNewswire,68
282,Yahoo Entertainment,62
255,The Times of India,58
155,Nature.com,31
130,Linkedin.com,24
204,Royal Society of Chemistry,20
75,Financial Post,19
129,Lifesciencesworld.com,18
24,Barchart.com,14



Weekly fetch summary complete.
