In [1]:
import os
import sys
sys.path.append(".")

from dotenv import load_dotenv
load_dotenv(".env")

import src.ai_process as aip

from src.ai_process import (
    get_supabase_client,
    get_openai_client,
    get_deepseek_client,
    fetch_categories,
    get_or_seed_prompt_template,
    TASK_SUMMARY,
    TASK_CATEGORIES,
    TASK_EVENT,
    TASK_TITLE,
    TASK_TRANSLATE,
    TASK_STORYLINE,
    SUMMARY_VERSION,
    CATS_VERSION,
    EVENT_VERSION,
    TITLE_VERSION,
    TRANSLATE_VERSION,
    STORYLINE_VERSION,
)

# Build the three service clients every later cell relies on:
#   sb -> Supabase, oa -> OpenAI, ds -> DeepSeek.
# The factories run left to right, i.e. in the same order as before.
sb, oa, ds = (
    get_supabase_client(),
    get_openai_client(),
    get_deepseek_client(),
)

print("Clients initialized.")



Clients initialized.


In [2]:
# --- Tunables for this preview run ---
TEST_LIMIT = 20
FILTER_KEYWORD = "ICE"   # set None to disable

# Candidate articles: rows whose ai_processed_at is NULL, newest first,
# capped at TEST_LIMIT. The query is built one step at a time.
q = sb.table("articles").select(
    "id,title,content_text,published_at,ai_processed_at,primary_event_id,primary_category_id,primary_storyline_id"
)
q = q.is_("ai_processed_at", "null")
q = q.order("published_at", desc=True)
q = q.limit(TEST_LIMIT)

# Optional title filter. `ilike` with %...% is a case-insensitive SUBSTRING
# match on `title` only, so "ICE" also matches words such as "prices" or
# "police". Filtering on content_text as well would need an `.or_(...)`.
if FILTER_KEYWORD:
    q = q.ilike("title", f"%{FILTER_KEYWORD}%")

res = q.execute()
articles = res.data if res.data else []

print(f"Fetched {len(articles)} article(s).")
# One line per article: list position, id, and the first 90 chars of the title.
for position, row in enumerate(articles):
    title_preview = (row.get('title') or '')[:90]
    print(f"[{position}] - {row['id']} | {title_preview}")



Fetched 20 article(s).
[0] - b96dda0d-c8bd-ab47-1cd8-fa8e53112afb | Gold and silver prices hit high after tariff threat
[1] - 01b59862-cdaf-aafe-be67-9ea924d041cc | Japan PM Takaichi calls snap election three months after taking office
[2] - 6ce0e82f-c356-a05d-aca4-cc4fd31b83a0 | 'The finest in the world': Why the US is buying icebreakers from Finland
[3] - aeb606db-37f9-6d6d-2aab-1fee03659b8c | The pass of the century then brutal reality: the football gods won’t let the Bears have ni
[4] - 711c20e0-74cc-91ab-db70-fa184ef0026d | What ICE is doing on US streets looks terrifying, but don’t forget: it could happen anywhe
[5] - 538daaf7-e773-7e52-3ebb-b0e358537470 | UK property market ‘on the up’ amid new year bounce in asking prices
[6] - 81f80801-7dd2-4ab8-ee53-f57fc5c3969d | Sydney Harbour shark attack: second incident in two days as police warn against swimming
[7] - 03fd6bc1-2d1f-82b9-e88a-492a2149baac | Sydney Harbour shark attack: police warn against swimming and say heavy rain crea

In [3]:
# Fetch (or seed) one prompt template per pipeline task. The (task, version)
# pairs are processed in order and unpacked into the names later cells use.
(
    pt_summary_en,
    pt_cats,
    pt_event,
    pt_title,
    pt_translate,
    pt_storyline,
) = (
    get_or_seed_prompt_template(sb, task_name, task_version)
    for task_name, task_version in (
        (TASK_SUMMARY, SUMMARY_VERSION),
        (TASK_CATEGORIES, CATS_VERSION),
        (TASK_EVENT, EVENT_VERSION),
        (TASK_TITLE, TITLE_VERSION),
        (TASK_TRANSLATE, TRANSLATE_VERSION),
        (TASK_STORYLINE, STORYLINE_VERSION),
    )
)

print("Loaded prompt templates:")
# Labels are padded so the task/version/model/id columns line up.
for row_label, template in (
    ("summary:   ", pt_summary_en),
    ("cats:      ", pt_cats),
    ("event:     ", pt_event),
    ("title:     ", pt_title),
    ("translate: ", pt_translate),
    ("storyline: ", pt_storyline),
):
    print(row_label, template.task, template.version, template.model_name, template.id)



Loaded prompt templates:
summary:    summary v1 gpt-4.1-mini b63e4bec-1af6-49f5-926a-2e29f2141c5c
cats:       category v1 gpt-4.1-mini 9931b63b-656b-4469-b81d-ada4699b240d
event:      event v1 gpt-4.1-mini 3b0ef6a0-8f65-4718-97fb-ae2ba1b357ef
title:      title v1 gpt-4.1-mini ed56bf9d-2404-403e-b99c-fbfec093eb92
translate:  translate v1 deepseek-chat 2ac99a94-0f91-4f5b-b861-a008f983f393
storyline:  storyline v1 gpt-4.1-mini d220952f-5a35-4709-a9c4-2654cae4e04d


In [4]:
# Load the category rows used for classification and for slug -> id mapping.
categories = fetch_categories(sb)
print(f"Loaded {len(categories)} categories.")

# Quick sanity check: show at most the first 20 slugs.
all_slugs = [entry["slug"] for entry in categories]
print(all_slugs[:20])

# Nothing downstream can classify without categories, so stop here if empty.
if not categories:
    raise RuntimeError("No categories found. Seed your categories table first.")


Loaded 14 categories.
['world', 'us', 'politics', 'business', 'tech', 'science', 'health', 'crime', 'immigration', 'climate', 'sports', 'culture', 'opinion', 'other']


In [5]:
IDX = 4  # pick from printed list

# Fail fast with an actionable message when the fetch cell returned fewer rows
# than expected (e.g. a narrow FILTER_KEYWORD). Negative indices stay valid,
# exactly as with plain list indexing.
if not -len(articles) <= IDX < len(articles):
    raise IndexError(
        f"IDX={IDX} is out of range: the fetch cell returned {len(articles)} article(s)."
    )

article_id = articles[IDX]["id"]
print("Selected:", article_id)

# Re-read the chosen row by id. Unlike the list query, this select also
# includes `link`. `.single()` asks for exactly one row.
res = (
    sb.table("articles")
    .select("id,title,link,content_text,published_at,primary_event_id,primary_category_id,primary_storyline_id,ai_processed_at")
    .eq("id", article_id)
    .single()
    .execute()
)
article = res.data
print("Loaded:", article["id"], "|", (article.get("title") or "")[:100])
print("content_text chars:", len(article.get("content_text") or ""))


Selected: 711c20e0-74cc-91ab-db70-fa184ef0026d
Loaded: 711c20e0-74cc-91ab-db70-fa184ef0026d | What ICE is doing on US streets looks terrifying, but don’t forget: it could happen anywhere | Nesri
content_text chars: 5343


In [6]:
import re
import json

def extract_json_from_text(txt: str) -> str:
    """
    Extract the first JSON object/array embedded in a text blob.

    Handles a Markdown code fence at the start/end of the text and
    leading/trailing commentary around the JSON.

    The text is scanned left to right. At every ``{`` or ``[`` a real JSON
    decode is attempted, and the first span that decodes is returned. This
    replaces a greedy "first brace to last brace" match, which produced
    invalid output for arrays of objects preceded by commentary and for text
    with braces in the trailing commentary.

    Args:
        txt: Raw text, typically a model response. May be empty or None.

    Returns:
        The JSON substring, or "" when `txt` is empty or contains no
        brace/bracket-delimited span. If no span decodes cleanly, the widest
        ``{...}`` (then ``[...]``) span is returned as a best-effort candidate
        so callers can show it in diagnostics; that fallback is NOT guaranteed
        to be valid JSON.
    """
    if not txt:
        return ""
    t = txt.strip()

    # Remove ```json fences if present (only at the very start / very end).
    t = re.sub(r"^```(?:json)?\s*", "", t, flags=re.IGNORECASE).strip()
    t = re.sub(r"\s*```$", "", t).strip()

    # Preferred path: the first position from which a JSON value really decodes.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[{\[]", t):
        tail = t[opener.start():]
        try:
            _, consumed = decoder.raw_decode(tail)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        return tail[:consumed]

    # Fallback (legacy heuristic): nothing decoded, so hand back the widest
    # bracketed span for the caller's error report.
    if (t.startswith("{") and t.endswith("}")) or (t.startswith("[") and t.endswith("]")):
        return t
    for pattern in (r"\{.*\}", r"\[.*\]"):
        widest = re.search(pattern, t, flags=re.DOTALL)
        if widest:
            return widest.group(0).strip()

    return ""

def run_openai_json_safe(oa, pt, variables: dict, *, debug_label: str = "") -> dict:
    """
    Run a prompt through `aip.run_openai_text` and parse the reply as JSON.

    On any failure the relevant raw text is printed first, so the problem can
    be diagnosed straight from the cell output.

    Args:
        oa: OpenAI client, passed through to `aip.run_openai_text`.
        pt: Prompt template; `task`, `version` and `model_name` are read for
            the diagnostic messages.
        variables: Prompt variables, passed through unchanged.
        debug_label: Optional label used in diagnostics instead of `pt.task`.

    Returns:
        Whatever `json.loads` produces for the extracted JSON text.

    Raises:
        ValueError: The model output is empty, or no JSON object/array could
            be located in it.
        json.JSONDecodeError: The located text is not valid JSON (re-raised
            after printing the raw and extracted text).
    """
    model_text = (aip.run_openai_text(oa, pt, variables) or "").strip()

    # Case 1: the model returned nothing at all.
    if not model_text:
        print(f"\n[JSON_SAFE] Empty output for {debug_label or pt.task}:{pt.version} model={pt.model_name}")
        print("Prompt variables keys:", sorted(variables))
        raise ValueError("Empty model output (expected JSON).")

    # Case 2: there is text, but nothing in it looks like JSON.
    json_blob = extract_json_from_text(model_text)
    if not json_blob:
        print(f"\n[JSON_SAFE] Could not locate JSON for {debug_label or pt.task}:{pt.version}")
        print("RAW OUTPUT:\n", model_text[:2000])
        raise ValueError("No JSON object/array found in output.")

    # Case 3: a candidate was found; it may still fail to parse.
    try:
        parsed = json.loads(json_blob)
    except json.JSONDecodeError as parse_err:
        print(f"\n[JSON_SAFE] JSON parse failed for {debug_label or pt.task}:{pt.version}: {parse_err}")
        print("RAW OUTPUT (first 2000 chars):\n", model_text[:2000])
        print("EXTRACTED JSON (first 2000 chars):\n", json_blob[:2000])
        raise
    return parsed

def storyline_too_specific(st: str) -> bool:
    """
    Crude check for whether a storyline title is too narrow to be reused.

    A title is flagged when, after stripping whitespace, it is empty, longer
    than 60 characters, contains one of a few hard-coded office/city words
    (case-sensitive), or contains any digit character.

    Args:
        st: Proposed storyline title; None is treated as empty.

    Returns:
        True when the title should be broadened, False otherwise.
    """
    label = (st or "").strip()
    flagged_words = ("Sheriff", "Governor", "Mayor", "Philadelphia", "Minneapolis", "Los Angeles")
    return (
        not label                                            # nothing usable
        or len(label) > 60                                   # too long
        or any(word in label for word in flagged_words)      # office / city names
        or any(map(str.isdigit, label))                      # dates / numbers
    )

def generalize_storyline_title(oa, pt_storyline, title: str, summary_en: str, proposed_title: str) -> str:
    """
    Ask the model to rewrite `proposed_title` as a broader, reusable title.

    A single standalone prompt is sent through `oa.responses.create`, using
    the storyline template's model. The earlier additional call through
    `aip.run_openai_text` was removed: its result was never read, so it only
    added cost and latency.

    Args:
        oa: OpenAI client exposing `responses.create`.
        pt_storyline: Storyline prompt template; only `model_name` is read.
        title: Article title. Not used by the prompt; kept so existing call
            sites keep working.
        summary_en: English summary. Not used by the prompt; kept so existing
            call sites keep working.
        proposed_title: The storyline title to broaden.

    Returns:
        The stripped rewritten title. Falls back to the stripped
        `proposed_title` when the reply is empty, is not a JSON object, or has
        no non-blank string under "storyline_title".
    """
    fallback = (proposed_title or "").strip()

    prompt = f"""
Rewrite the storyline title to be broader and reusable across many related articles.

Constraints:
- 2 to 6 words if possible
- No person names
- No city/state names
- No dates
- Keep the meaning umbrella-level

Original proposed title: {proposed_title}

Return STRICT JSON only:
{{"storyline_title":"..."}}
""".strip()
    resp = oa.responses.create(model=pt_storyline.model_name, input=prompt, temperature=0.2, max_output_tokens=120)
    txt = (resp.output_text or "").strip()

    # An empty or unparsable reply is not worth crashing over: keep the
    # proposal, just as a missing key already did.
    try:
        j = json.loads(extract_json_from_text(txt))
    except ValueError:  # includes json.JSONDecodeError
        return fallback
    if not isinstance(j, dict):
        return fallback

    rewritten = j.get("storyline_title")
    if isinstance(rewritten, str) and rewritten.strip():
        return rewritten.strip()
    return fallback



In [7]:
import json

WRITE_DB = False  # ✅ set True when you want to actually create storyline/event + update centroids

# Module-level placeholders for the storyline state. Note that
# preview_article_storyline_event assigns identically named variables inside
# its own body, so within that function those names are locals and these
# values are not what it reads.
sl_title = sl_desc = None
storyline_json = storyline_created = None
chosen_storyline_id = chosen_storyline_similarity = None
story_matches = []


def preview_article_storyline_event(article: dict):
    """
    Run the AI pipeline for one article and report every decision it made.

    Steps, in order: English summary -> Chinese summary (retried once when
    `aip.looks_truncated_zh` flags it) -> Chinese title -> category
    classification -> embedding -> storyline match or proposal -> event match
    or proposal inside the chosen storyline.

    Reads the notebook globals `sb`, `oa`, `ds`, `categories`, the `pt_*`
    prompt templates and `WRITE_DB`. Storyline/event creation, centroid
    updates and article-event links are only performed when `WRITE_DB` is
    truthy. With `WRITE_DB` falsy and no matching storyline, no storyline id
    exists, so the event stage is reported as "skipped".

    Args:
        article: Article row. `id`, `title` and `content_text` are read here;
            the whole dict is also passed to `aip.safe_iso_from_article`.

    Returns:
        `{"error": "No content_text"}` when the article body is blank.
        Otherwise a dict with the keys `article_id`, `title_en`, `title_zh`,
        `summary_en`, `summary_zh`, `category_json`, `embedding_dim`,
        `storyline`, `event` and `write_db`.
    """
    title = article.get("title") or ""
    content_text = article.get("content_text") or ""
    if not content_text.strip():
        return {"error": "No content_text"}

    # Storyline state is initialised here because every one of these names is
    # assigned somewhere in this function, which makes them locals. Without
    # defaults, a branch that skips one of them (e.g. creating a storyline
    # with WRITE_DB on never set the similarity) hits UnboundLocalError at the
    # return statement.
    sl_title = ""
    sl_desc = ""
    storyline_json = None
    storyline_created = None
    chosen_storyline_id = None
    chosen_storyline_similarity = None

    # ---------- Summaries ----------
    content_for_llm = aip.clip_text(content_text, aip.MAX_CHARS_FOR_LLM)

    summary_en = aip.run_openai_text(oa, pt_summary_en, {
        "title": title,
        "content": content_for_llm,
        "language_instruction": aip.language_instruction("en"),
    })
    summary_en = aip.normalize_bullets(summary_en)

    summary_zh = aip.run_deepseek_text(ds, pt_translate, {
        "title_en": title,
        "summary_en": summary_en,
        "extra_rules": "",
    })
    summary_zh = aip.normalize_bullets(summary_zh)

    # One retry with an extra instruction when the translation looks cut off.
    if aip.looks_truncated_zh(summary_zh):
        summary_zh = aip.run_deepseek_text(ds, pt_translate, {
            "title_en": title,
            "summary_en": summary_en,
            "extra_rules": "上次输出可能被截断。请更精炼，但不得漏信息；保持项目符号数量一致。",
        })
        summary_zh = aip.normalize_bullets(summary_zh)

    title_zh = aip.run_openai_text(oa, pt_title, {
        "title_en": title,
        "summary_en": summary_en,
    }).strip()

    # ---------- Categories ----------
    cat_lines = aip.categories_as_lines(categories)
    cls = run_openai_json_safe(oa, pt_cats, {
        "title": title,
        "content": aip.clip_text(content_text, 8000),
        "categories": cat_lines,
    }, debug_label="categories")

    cat_map = {c["slug"]: c["id"] for c in categories}
    primary_slug = cls.get("primary_slug")
    # Stays None when the model returned no slug or an unknown one.
    primary_cat_id = cat_map.get(primary_slug) if primary_slug else None

    # ---------- Embedding ----------
    emb_input = aip.article_embed_text(title, content_text)
    embedding = aip.embed_text(oa, emb_input)

    # ---------- STORYLINE match ----------
    # `or []` keeps the later `[:5]` slice safe if the RPC yields None.
    story_matches = aip.rpc_match_storylines(
        sb,
        embedding,
        threshold=aip.STORYLINE_SIM_THRESHOLD,
        match_count=aip.STORYLINE_MATCH_COUNT,
    ) or []

    if story_matches:
        # Take the first returned match as the storyline.
        best_storyline = story_matches[0]
        chosen_storyline_id = best_storyline.get("storyline_id") or best_storyline.get("id")
        chosen_storyline_similarity = float(best_storyline.get("similarity", 0.0))
        storyline_created = False

        # Best-effort lookup of the stored title/description for the report.
        # A failure here must not abort the preview, but it is reported.
        try:
            sl_row = (
                sb.table("storylines")
                .select("title,description")
                .eq("id", chosen_storyline_id)
                .single()
                .execute()
                .data
            )
            sl_title = (sl_row.get("title") or "").strip() if sl_row else ""
            sl_desc = (sl_row.get("description") or "").strip() if sl_row else ""
        except Exception as exc:
            print(f"[preview] Could not load storyline {chosen_storyline_id}: {exc}")
            sl_title, sl_desc = "", ""

        # Fold this article into the matched storyline exactly once. These
        # two writes used to be repeated after the if/else, so every matched
        # article was applied to the centroid twice.
        if WRITE_DB:
            aip.upsert_storyline_centroid(sb, chosen_storyline_id, embedding, model_name=aip.EMBED_MODEL)
            aip.update_storyline_last_updated(sb, chosen_storyline_id, aip.utc_now_iso())

    else:
        # No storyline matched: ask the model to propose a new one.
        storyline_created = True
        storyline_json = run_openai_json_safe(oa, pt_storyline, {
            "title": title,
            "summary_en": summary_en,
        }, debug_label="storyline")

        # IMPORTANT: map to your schema keys
        sl_title = (storyline_json.get("storyline_title") or "").strip()
        sl_desc = (storyline_json.get("storyline_description") or "").strip()

        if WRITE_DB:
            chosen_storyline_id = aip.create_storyline(
                sb,
                title=sl_title,
                description=sl_desc,
                main_category_id=primary_cat_id,
                started_at=aip.safe_iso_from_article(article),
                last_updated_at=aip.utc_now_iso(),
                model_name=pt_storyline.model_name,
                prompt_version=pt_storyline.version,
                prompt_template_id=pt_storyline.id,
            )
            aip.upsert_storyline_centroid(sb, chosen_storyline_id, embedding, model_name=aip.EMBED_MODEL)
        # Otherwise the id stays None (proposal only); similarity stays None
        # in both cases because nothing was matched.

    # ---------- EVENT match inside storyline ----------
    event_matches = []
    chosen_event_id = None
    chosen_event_similarity = None
    event_json = None
    event_status = None  # "matched" | "created" | "proposed_only" | "skipped"

    if chosen_storyline_id:
        event_matches = aip.rpc_match_events_in_storyline(
            sb,
            chosen_storyline_id,
            embedding,
            threshold=aip.EVENT_SIM_THRESHOLD,
            match_count=aip.EVENT_MATCH_COUNT,
        ) or []

        if event_matches:
            best_event = event_matches[0]
            chosen_event_id = best_event.get("event_id") or best_event.get("id")
            chosen_event_similarity = float(best_event.get("similarity", 0.0))
            event_status = "matched"
            if WRITE_DB and chosen_event_id:
                aip.link_article_event(sb, article_id=article["id"], event_id=chosen_event_id, similarity=chosen_event_similarity)
                aip.upsert_event_centroid(sb, chosen_event_id, embedding, model_name=aip.EMBED_MODEL)
        else:
            # propose an event (always)
            event_json = run_openai_json_safe(oa, pt_event, {
                "title": title,
                "summary_en": summary_en,
                "category_slug": primary_slug or "",
                "category_name": cls.get("primary_name", ""),
            }, debug_label="event")

            # Accept either "title"/"description" or the "event_"-prefixed
            # keys; fall back to the clipped article title / English summary.
            evt_title_raw = (event_json.get("title") or event_json.get("event_title") or "").strip() or aip.clip_text(title, 140)
            evt_desc = (event_json.get("description") or event_json.get("event_description") or "").strip() or aip.clip_text(summary_en, 900)

            evt_title = aip.normalize_event_title(evt_title_raw, max_words=5)

            if WRITE_DB:
                chosen_event_id = aip.create_event(
                    sb,
                    title=evt_title,
                    description=evt_desc,
                    main_category_id=primary_cat_id,
                    started_at=aip.safe_iso_from_article(article),
                    last_updated_at=aip.utc_now_iso(),
                    model_name=pt_event.model_name,
                    prompt_version=pt_event.version,
                    prompt_template_id=pt_event.id,
                )
                aip.update_event_storyline(sb, chosen_event_id, chosen_storyline_id)
                aip.upsert_event_centroid(sb, chosen_event_id, embedding, model_name=aip.EMBED_MODEL)
                aip.link_article_event(sb, article_id=article["id"], event_id=chosen_event_id, similarity=1.0)
                event_status = "created"
            else:
                event_status = "proposed_only"
    else:
        event_status = "skipped"

    return {
        "article_id": article["id"],
        "title_en": title,
        "title_zh": title_zh,
        "summary_en": summary_en,
        "summary_zh": summary_zh,
        "category_json": cls,

        "embedding_dim": len(embedding),

        "storyline": {
            "chosen_storyline_id": chosen_storyline_id,
            "chosen_similarity": chosen_storyline_similarity,
            "created": storyline_created,
            "proposal_json_if_created": storyline_json,
            "top_matches": story_matches[:5],
            "final_storyline_title": sl_title,
            "final_storyline_description": sl_desc,
        },

        "event": {
            "status": event_status,
            "chosen_event_id": chosen_event_id,
            "chosen_similarity": chosen_event_similarity,
            "proposal_json_if_created": event_json,
            "top_matches_in_storyline": event_matches[:5],
        },

        "write_db": WRITE_DB,
    }

# Run the preview pipeline on the selected article and keep the report in
# `result`. If the article body is blank, the report is just {"error": ...}.
result = preview_article_storyline_event(article)
# Bare last expression: the notebook displays the report's top-level keys.
result.keys()



dict_keys(['article_id', 'title_en', 'title_zh', 'summary_en', 'summary_zh', 'category_json', 'embedding_dim', 'storyline', 'event', 'write_db'])

In [8]:
# --- Titles ---
for heading, key in (
    ("=== EN TITLE ===", "title_en"),
    ("\n=== ZH TITLE ===", "title_zh"),
):
    print(heading)
    print(result[key])

# --- Category classification, pretty-printed ---
print("\n=== CATEGORY JSON ===")
print(json.dumps(result["category_json"], indent=2, ensure_ascii=False))

# --- Storyline decision ---
print("\n=== STORYLINE ===")
storyline_report = result["storyline"]
for field in ("chosen_storyline_id", "chosen_similarity", "created"):
    print(f"{field}:", storyline_report[field])
storyline_proposal = storyline_report["proposal_json_if_created"]
if storyline_proposal:
    # Present only when the pipeline proposed a new storyline.
    print("proposal:")
    print(json.dumps(storyline_proposal, indent=2, ensure_ascii=False))
print("final_title:", storyline_report.get("final_storyline_title"))

print("\nTop storyline matches (up to 5):")
for match in storyline_report["top_matches"]:
    print(match)

# --- Event decision inside the chosen storyline ---
print("\n=== EVENT (within storyline) ===")
event_report = result["event"]
for field in ("chosen_event_id", "chosen_similarity"):
    print(f"{field}:", event_report[field])
event_proposal = event_report["proposal_json_if_created"]
if event_proposal:
    print("proposal:")
    print(json.dumps(event_proposal, indent=2, ensure_ascii=False))
print("status:", event_report["status"])

print("\nTop event matches inside storyline (up to 5):")
for match in event_report["top_matches_in_storyline"]:
    print(match)

# --- Summaries ---
for heading, key in (
    ("\n=== EN SUMMARY ===", "summary_en"),
    ("\n=== ZH SUMMARY ===", "summary_zh"),
):
    print(heading)
    print(result[key])



=== EN TITLE ===
What ICE is doing on US streets looks terrifying, but don’t forget: it could happen anywhere | Nesrine Malik

=== ZH TITLE ===
美国ICE街头执法暴力引发关注专家警示全球警惕

=== CATEGORY JSON ===
{
  "primary_slug": "immigration",
  "secondary_slugs": [
    "politics",
    "us"
  ],
  "scores": {
    "immigration": 0.95,
    "politics": 0.85,
    "us": 0.75
  }
}

=== STORYLINE ===
chosen_storyline_id: None
chosen_similarity: None
created: True
proposal:
{
  "storyline_title": "US Immigration Enforcement Militarization",
  "storyline_description": "This storyline covers the increasing use of militarized tactics by US immigration enforcement agencies, the political and social factors driving these practices, and the broader implications for civil liberties and community relations across the United States."
}
final_title: US Immigration Enforcement Militarization

Top storyline matches (up to 5):

=== EVENT (within storyline) ===
chosen_event_id: None
chosen_similarity: None
status: skipped



In [9]:
# Diagnostic probe: embed the selected article again and query storyline
# matches with threshold=0.0 (instead of aip.STORYLINE_SIM_THRESHOLD) to see
# the nearest storylines and their similarity scores.
title, content_text = (
    article.get(field) or "" for field in ("title", "content_text")
)
emb_input = aip.article_embed_text(title, content_text)
embedding = aip.embed_text(oa, emb_input)
raw_matches = aip.rpc_match_storylines(
    sb,
    embedding,
    threshold=0.0,
    match_count=10,
)
# Bare last expression: display the three closest candidates.
raw_matches[:3]


[{'storyline_id': '4a4e969b-e670-4d6b-8992-fd707cfd1762',
  'similarity': 0.603510773851161,
  'title': 'ICE Enforcement and Local Backlash',
  'description': "This storyline covers the ongoing tensions between ICE immigration enforcement actions and local officials' and communities' responses, including criticism of ICE tactics, calls for accountability, and public protests.",
  'last_updated_at': '2026-01-17T22:32:13.402363+00:00'}]