In [1]:
import os
from dotenv import load_dotenv
# Load variables from the local ".env" file; later cells read their API keys and URLs via os.environ.
load_dotenv(".env")

True

In [2]:
import json
import re
from datetime import datetime, timezone, timedelta
from typing import Any, Dict, List, Tuple, Optional, Set

from supabase import create_client
from openai import OpenAI

from src.prompt_store import get_or_seed_prompt_template, PromptTemplateRow

def utc_now_iso() -> str:
    """Return the current time in UTC as a timezone-aware ISO 8601 string."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()

def clip_text(s: str, max_chars: int) -> str:
    """Strip surrounding whitespace from ``s`` and keep at most ``max_chars`` characters.

    A falsy ``s`` (``None`` or ``""``) yields an empty string.
    """
    if not s:
        return ""
    trimmed = s.strip()
    return trimmed[:max_chars]

def normalize_bullets(md: str) -> str:
    """Normalize list markers in a block of text to plain "- " bullets.

    Blank lines are dropped. Lines starting with "-" or "•" and numbered lines
    ("1. x", "12。x") are rewritten as "- x"; every other line is kept (stripped).

    Args:
        md: Text to normalize. A falsy value is returned unchanged.

    Returns:
        The normalized text, one entry per line.
    """
    if not md:
        return md

    out = []
    for raw in md.strip().splitlines():
        s = raw.strip()
        if not s:
            continue  # blank lines are dropped
        if s.startswith(("-", "•")):
            out.append("- " + s.lstrip("-•").strip())
            continue
        # Numbered item: one or more digits, then "." or "。", then the text.
        # The (?!\d) lookahead keeps decimals such as "3.5 million" intact, and
        # \d+ accepts multi-digit markers such as "10.".
        numbered = re.match(r"\d+[.。](?!\d)\s*(\S.*)", s)
        if numbered:
            out.append("- " + numbered.group(1).strip())
        else:
            out.append(s)
    return "\n".join(out).strip()

def extract_json(txt: str) -> str:
    """Pull the JSON object text out of a model response.

    Leading/trailing Markdown code fences are removed first. The first "{" is
    then decoded as a JSON object and exactly that object's text is returned,
    so trailing commentary (even commentary containing braces) is ignored.
    If that decode fails, the function falls back to the widest "{...}" span
    so the caller's ``json.loads`` can report the actual syntax error.

    Args:
        txt: Raw model output; ``None`` is treated as empty.

    Returns:
        The JSON object text, or "" when the text contains no braces at all.
    """
    t = (txt or "").strip()
    t = re.sub(r"^```(?:json)?\s*", "", t, flags=re.IGNORECASE).strip()
    t = re.sub(r"\s*```$", "", t).strip()

    start = t.find("{")
    if start == -1:
        return ""

    try:
        # raw_decode stops at the end of the first complete JSON value and
        # returns the absolute index where it ended.
        _, end = json.JSONDecoder().raw_decode(t, start)
    except ValueError:
        pass
    else:
        return t[start:end]

    # Malformed object: keep the previous greedy behaviour.
    m = re.search(r"\{.*\}", t, flags=re.DOTALL)
    return m.group(0).strip() if m else ""

def parse_json(txt: str) -> Dict[str, Any]:
    """Extract and decode the JSON object contained in ``txt``.

    Args:
        txt: Raw model output; ``None`` is treated as empty.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If no JSON object text is found (the message carries the
            first 400 characters of the input), or if decoding fails
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    j = extract_json(txt)
    if not j:
        # Guard the preview so a None input reports ValueError, not TypeError.
        preview = (txt or "")[:400]
        raise ValueError(f"Expected JSON; got:\n{preview}")
    return json.loads(j)

class SafeDict(dict):
    """dict subclass whose missing keys resolve to an empty string."""
    def __missing__(self, key):
        # Invoked by dict lookups for absent keys; used with str.format_map so
        # placeholders without a value render as "" instead of raising KeyError.
        return ""

def render_template(template: str, variables: Dict[str, Any]) -> str:
    """Fill the ``{placeholder}`` fields of ``template`` from ``variables``.

    Placeholders with no matching key render as an empty string (via SafeDict);
    a falsy template renders as "".
    """
    source = template if template else ""
    lookup = SafeDict(**variables)
    return source.format_map(lookup)

def build_single_prompt(pt: PromptTemplateRow, variables: Dict[str, Any]) -> str:
    """Render a prompt template row into one prompt string.

    When the row has a non-blank system prompt, its rendered text is placed
    before the rendered user prompt, separated by a blank line. The result is
    stripped of surrounding whitespace.
    """
    user_part = render_template(pt.user_prompt_template, variables)
    has_system = bool(pt.system_prompt and pt.system_prompt.strip())
    if not has_system:
        return user_part.strip()
    sys_part = render_template(pt.system_prompt, variables)
    return f"{sys_part}\n\n{user_part}".strip()


In [3]:
import src.prompt_versions as pv
import src.prompt_store as ps

# Sanity check: show which module files were imported and confirm that both
# modules expose the storyline digest prompt spec.
digest_spec_key = ("storyline_digest_en", "v1")

print("prompt_versions file:", pv.__file__)
print("prompt_store file:", ps.__file__)

print(digest_spec_key in pv.PROMPT_SPECS, "in prompt_versions")
print(digest_spec_key in ps.PROMPT_SPECS, "in prompt_store")

print("len(prompt_versions.PROMPT_SPECS) =", len(pv.PROMPT_SPECS))
print("len(prompt_store.PROMPT_SPECS)    =", len(ps.PROMPT_SPECS))


prompt_versions file: /Users/hensy/Documents/Repos/news-lens/src/prompt_versions.py
prompt_store file: /Users/hensy/Documents/Repos/news-lens/src/prompt_store.py
True in prompt_versions
True in prompt_store
len(prompt_versions.PROMPT_SPECS) = 9
len(prompt_store.PROMPT_SPECS)    = 9


In [4]:
# Clients: Supabase for the article tables, OpenAI, and DeepSeek through the
# OpenAI SDK pointed at a DeepSeek base URL.
sb = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
oa = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

deepseek_base_url = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
ds = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url=deepseek_base_url)

# Prompt templates (must be seeded in your prompt_store), all at version "v1".
pt_summary, pt_digest_en, pt_digest_zh, pt_polish = (
    get_or_seed_prompt_template(sb, prompt_name, "v1")
    for prompt_name in (
        "summary",
        "storyline_digest_en",
        "storyline_digest_translate_zh",
        "storyline_title_zh_polish",  # optional
    )
)

In [5]:
# Only articles published within this many hours before "now" are fetched.
LOOKBACK_HOURS = 72
# Number of top-ranked storylines the preview loop digests.
TOP_N = 6
# Representative articles picked per storyline (passed as k to pick_representative_articles).
K = 3
# A storyline is ranked only if it has at least this many articles...
MIN_ARTICLES = 2
# ...coming from at least this many distinct source outlets.
MIN_SOURCES = 2

# Article text is clipped to this many characters before it is summarized.
MAX_CHARS_FOR_LLM = 12000

def fetch_recent_clustered_articles():
    """Fetch recent articles that are AI-eligible, not archived, and clustered.

    Only rows with non-null ``clustered_at`` and ``primary_storyline_id`` that
    were published within the last ``LOOKBACK_HOURS`` hours are requested,
    newest first, with a request limit of 5000 rows.

    Returns:
        The list of row dicts, or an empty list when the response has no data.
    """
    window_start = datetime.now(timezone.utc) - timedelta(hours=LOOKBACK_HOURS)
    columns = "id,title,link,source_outlet,published_at,content_text,primary_storyline_id,clustered_at,ai_processed_at,ai_eligible,archived"

    query = (
        sb.table("articles")
        .select(columns)
        .eq("ai_eligible", True)
        .eq("archived", False)
    )
    # Keep only rows that have already been assigned to a storyline.
    query = query.not_.is_("clustered_at", "null").not_.is_("primary_storyline_id", "null")
    query = query.gte("published_at", window_start.isoformat())

    response = query.order("published_at", desc=True).limit(5000).execute()
    return response.data or []

def rank_storylines(
    articles: List[Dict[str, Any]],
    min_articles: Optional[int] = None,
    min_sources: Optional[int] = None,
) -> List[Tuple[str, Dict[str, Any]]]:
    """Aggregate articles per storyline and rank the storylines.

    Args:
        articles: Article rows; each is read for ``primary_storyline_id``,
            ``source_outlet`` and ``published_at``. Rows without a storyline
            id are ignored.
        min_articles: Minimum article count for a storyline to be ranked.
            Defaults to the module-level ``MIN_ARTICLES``.
        min_sources: Minimum number of distinct outlets for a storyline to be
            ranked. Defaults to the module-level ``MIN_SOURCES``.

    Returns:
        ``(storyline_id, stats)`` tuples sorted descending by article count,
        then distinct-source count, then latest ``published_at``. ``stats``
        has the keys ``count``, ``sources`` (a set), ``latest`` and
        ``source_count``.
    """
    # Resolve defaults at call time so later edits to the constants are honored.
    if min_articles is None:
        min_articles = MIN_ARTICLES
    if min_sources is None:
        min_sources = MIN_SOURCES

    agg: Dict[str, Dict[str, Any]] = {}
    for a in articles:
        sid = a.get("primary_storyline_id")
        if not sid:
            continue
        row = agg.setdefault(sid, {"count": 0, "sources": set(), "latest": None})
        row["count"] += 1
        src = a.get("source_outlet") or ""
        if src:
            row["sources"].add(src)
        pub = a.get("published_at")
        # Timestamps are compared as given (plain value comparison).
        if pub and (row["latest"] is None or pub > row["latest"]):
            row["latest"] = pub

    ranked = []
    for sid, st in agg.items():
        st["source_count"] = len(st["sources"])
        if st["count"] >= min_articles and st["source_count"] >= min_sources:
            ranked.append((sid, st))

    ranked.sort(key=lambda x: (x[1]["count"], x[1]["source_count"], x[1]["latest"] or ""), reverse=True)
    return ranked

def fetch_articles_for_storyline(storyline_id: str) -> List[Dict[str, Any]]:
    """Fetch the recent, AI-eligible, non-archived articles of one storyline.

    Rows are limited to those published within the last ``LOOKBACK_HOURS``
    hours, ordered newest first, with a request limit of 200 rows.

    Returns:
        The list of row dicts, or an empty list when the response has no data.
    """
    window_start = datetime.now(timezone.utc) - timedelta(hours=LOOKBACK_HOURS)
    columns = "id,title,link,source_outlet,published_at,content_text,ai_processed_at"

    query = (
        sb.table("articles")
        .select(columns)
        .eq("primary_storyline_id", storyline_id)
        .eq("ai_eligible", True)
        .eq("archived", False)
    )
    # Rows without a publish time are excluded explicitly (optional safety).
    query = query.not_.is_("published_at", "null").gte("published_at", window_start.isoformat())

    response = query.order("published_at", desc=True).limit(200).execute()
    return response.data or []


def pick_representative_articles(candidates: List[Dict[str, Any]], k: int) -> List[Dict[str, Any]]:
    """Pick up to ``k`` articles to represent a storyline.

    Candidates are considered newest first (by ``published_at``) in three
    passes, each one only filling the slots that are still free:

    1. unprocessed articles (``ai_processed_at`` is None), at most one per
       ``source_outlet`` (articles without an outlet are not limited);
    2. any remaining unprocessed articles, outlet repeats allowed;
    3. any remaining articles, including already-processed ones.

    Args:
        candidates: Article rows; passes 2 and 3 require an ``id`` key.
        k: Maximum number of articles to return.

    Returns:
        The chosen articles in selection order (fewer than ``k`` if there are
        not enough candidates).
    """
    ordered = sorted(candidates, key=lambda a: a.get("published_at") or "", reverse=True)

    chosen: List[Dict[str, Any]] = []
    # One running set of chosen ids, instead of rebuilding it per candidate.
    chosen_ids: Set[Any] = set()
    used_sources: Set[str] = set()

    def _take(article: Dict[str, Any]) -> None:
        chosen.append(article)
        chosen_ids.add(article.get("id"))

    # pass 1: unprocessed + distinct sources
    for a in ordered:
        if len(chosen) >= k:
            break
        if a.get("ai_processed_at") is not None:
            continue
        src = a.get("source_outlet") or ""
        if src and src in used_sources:
            continue
        _take(a)
        if src:
            used_sources.add(src)

    # pass 2: fill remaining with unprocessed (allow repeated sources)
    for a in ordered:
        if len(chosen) >= k:
            break
        if a.get("ai_processed_at") is not None or a["id"] in chosen_ids:
            continue
        _take(a)

    # pass 3: allow already-processed
    for a in ordered:
        if len(chosen) >= k:
            break
        if a["id"] in chosen_ids:
            continue
        _take(a)

    return chosen


In [6]:
def run_openai_text(pt: PromptTemplateRow, variables: Dict[str, Any]) -> str:
    """Render ``pt`` into a single prompt, send it via ``oa.responses.create``, and return the text.

    Model name, temperature (0.0 when unset) and the output-token cap are taken
    from the template row. The returned text is stripped; a missing
    ``output_text`` yields "".
    """
    request = {
        "model": pt.model_name,
        "input": build_single_prompt(pt, variables),
        "temperature": float(pt.temperature or 0.0),
        "max_output_tokens": pt.max_output_tokens,
    }
    resp = oa.responses.create(**request)
    text = resp.output_text or ""
    return text.strip()

def run_deepseek_text(pt: PromptTemplateRow, variables: Dict[str, Any]) -> str:
    """Run a prompt template through the DeepSeek client and return the stripped reply text.

    The system prompt (when non-blank) and the user prompt are rendered and
    sent as separate chat messages. The model comes from the DEEPSEEK_MODEL
    environment variable (default "deepseek-chat"), not from the template row;
    temperature defaults to 0.0 and the token cap to 800 when unset.
    """
    # DeepSeek uses chat.completions in your codebase
    has_system = bool(pt.system_prompt and pt.system_prompt.strip())
    messages = (
        [{"role": "system", "content": render_template(pt.system_prompt, variables)}]
        if has_system
        else []
    )
    messages += [{"role": "user", "content": render_template(pt.user_prompt_template, variables)}]

    model_name = os.getenv("DEEPSEEK_MODEL", "deepseek-chat")
    resp = ds.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=float(pt.temperature or 0.0),
        max_tokens=int(pt.max_output_tokens or 800),
    )
    reply = resp.choices[0].message.content or ""
    return reply.strip()

def summarize_article_en(title: str, content_text: str) -> str:
    """Summarize one article with the summary prompt and normalize its bullets.

    The article text is clipped to ``MAX_CHARS_FOR_LLM`` characters before it
    is placed into the prompt variables.
    """
    variables = {
        "title": title,
        "content": clip_text(content_text, MAX_CHARS_FOR_LLM),
    }
    raw_summary = run_openai_text(pt_summary, variables)
    return normalize_bullets(raw_summary)

def build_digest_input_en(reps: List[Dict[str, Any]], per_article_en: Dict[str, str]) -> str:
    """Join per-article summaries into the text block handed to the digest prompt.

    Each representative article with a non-blank summary contributes an
    "Outlet / Title / Summary" section; sections are separated by a "---"
    line. Articles without a summary are left out.
    """
    def _section(article: Dict[str, Any]) -> Optional[str]:
        summary = per_article_en.get(article["id"], "").strip()
        if not summary:
            return None
        outlet = article.get("source_outlet") or ""
        headline = article.get("title") or ""
        return f"Outlet: {outlet}\nTitle: {headline}\nSummary:\n{summary}".strip()

    sections = [sec for sec in map(_section, reps) if sec is not None]
    return "\n\n---\n\n".join(sections).strip()

def digest_storyline_en(digest_input: str, article_count: int, source_count: int) -> Dict[str, str]:
    """Produce the English storyline digest from the combined article summaries.

    Args:
        digest_input: Text block built from the per-article summaries.
        article_count: Number of articles in the storyline (prompt variable).
        source_count: Number of distinct outlets (prompt variable).

    Returns:
        ``{"title_en": ..., "summary_en": ...}`` with stripped string values;
        a key that is missing or null in the model's JSON becomes "".

    Raises:
        ValueError: If the model output contains no decodable JSON object.
    """
    out = run_openai_text(pt_digest_en, {
        "digest_input": digest_input,
        "article_count": article_count,
        "source_count": source_count,
    })
    obj = parse_json(out)
    # A default in .get() only covers a missing key; an explicit JSON null
    # decodes to None, so use `or ""`. str() keeps a non-string value from
    # crashing on .strip().
    return {
        "title_en": str(obj.get("title_en") or "").strip(),
        "summary_en": str(obj.get("summary_en") or "").strip(),
    }

def translate_digest_zh(title_en: str, summary_en: str) -> Dict[str, str]:
    """Translate the English digest into Chinese through the DeepSeek prompt.

    Args:
        title_en: English storyline title.
        summary_en: English storyline summary.

    Returns:
        ``{"title_zh": ..., "summary_zh": ...}`` with stripped string values;
        a key that is missing or null in the model's JSON becomes "".

    Raises:
        ValueError: If the model output contains no decodable JSON object.
    """
    out = run_deepseek_text(pt_digest_zh, {
        "title_en": title_en,
        "summary_en": summary_en,
    })
    obj = parse_json(out)
    # A default in .get() only covers a missing key; an explicit JSON null
    # decodes to None, so use `or ""`. str() keeps a non-string value from
    # crashing on .strip().
    return {
        "title_zh": str(obj.get("title_zh") or "").strip(),
        "summary_zh": str(obj.get("summary_zh") or "").strip(),
    }

def polish_title_zh(title_zh: str) -> str:
    """Run the Chinese title through the polish prompt.

    A blank title is returned untouched without calling the model; if the
    model reply is blank, the original title is returned.
    """
    if title_zh.strip() == "":
        return title_zh
    polished = run_openai_text(pt_polish, {"title_zh": title_zh}).strip()
    return polished if polished else title_zh


In [7]:
# Controls
# NOTE: neither flag is read by the code in this cell; the loop below previews
# the first TOP_N ranked storylines.
DRY_RUN_ONLY = True  # keep True for notebook testing
STORYLINE_INDEX = 0  # choose which top storyline to preview

recent = fetch_recent_clustered_articles()
ranked = rank_storylines(recent)

print(f"Found {len(ranked)} eligible storylines in last {LOOKBACK_HOURS}h.")
for i, (sid, st) in enumerate(ranked[:10]):
    print(i, sid, "count=", st["count"], "sources=", st["source_count"], "latest=", st["latest"])

for i, (storyline_id, stats) in enumerate(ranked[:TOP_N]):
    print(f"\n=== STORYLINE {i+1} ===")
    print("Articles:", stats["count"], "Sources:", stats["source_count"])

    candidates = fetch_articles_for_storyline(storyline_id)
    picked = pick_representative_articles(candidates, k=K)

    print("\nSelected representative articles:")
    for a in picked:
        print("-", a.get("source_outlet"), "|", (a.get("title") or "")[:90])

    # 1) summarize reps in EN (only for testing)
    per_article_en = {}
    for a in picked:
        title = a.get("title") or ""
        content = a.get("content_text") or ""
        if not content.strip():
            continue
        per_article_en[a["id"]] = summarize_article_en(title, content)

    digest_input = build_digest_input_en(picked, per_article_en)
    print("\nDigest input length:", len(digest_input))

    # Without any summarized article there is nothing to digest; skip instead
    # of sending an empty input to the digest prompt.
    if not digest_input:
        print("No summarizable article text for this storyline; skipping digest.")
        continue

    # 2) English storyline digest
    dig_en = digest_storyline_en(digest_input, article_count=stats["count"], source_count=stats["source_count"])

    # 3) translate digest -> zh
    dig_zh = translate_digest_zh(dig_en["title_en"], dig_en["summary_en"])

    # 4) optional polish
    dig_zh["title_zh"] = polish_title_zh(dig_zh["title_zh"])

    # `reps` is rebound only after the digest succeeded, so code that reads
    # `reps` together with `dig_zh` after the loop always sees a matching pair.
    reps = picked

    # ---- Publishable output ----
    print("\n====================")
    print("ENGLISH (publishable)")
    print("====================")
    print(dig_en["title_en"])
    print(dig_en["summary_en"])

    print("\n====================")
    print("中文（可发布）")
    print("====================")
    print(f"【{dig_zh['title_zh']}】")
    print(dig_zh["summary_zh"])
    print("\n\n")


Found 16 eligible storylines in last 72h.
0 55887cd9-e6e8-430b-8856-aae9c5d52960 count= 34 sources= 5 latest= 2026-01-19T18:12:19+00:00
1 2788d85a-2c50-4bbd-949b-8a3ae10161fb count= 23 sources= 6 latest= 2026-01-19T18:27:54+00:00
2 3497fc1d-2a4d-44a1-908c-3cc77d17c96f count= 15 sources= 4 latest= 2026-01-19T17:29:17+00:00
3 fd4ae98e-34a8-4a8a-b8bd-225017ab046b count= 6 sources= 2 latest= 2026-01-19T16:38:16+00:00
4 c4f91884-e2dc-4c7d-a7a3-bcb061bc5c74 count= 4 sources= 3 latest= 2026-01-19T12:12:38+00:00
5 c6d2f4fb-2a3d-4370-9e98-8e3a5aa0cfbc count= 4 sources= 2 latest= 2026-01-19T09:38:09+00:00
6 60b757fb-8921-45f1-b225-6a79a44343ad count= 3 sources= 2 latest= 2026-01-19T18:00:00+00:00
7 8910b744-8d3d-419a-a69f-6f377a121f6e count= 3 sources= 2 latest= 2026-01-19T17:32:30+00:00
8 8baccaaa-1f3b-475d-b70c-6fa0a158ae22 count= 3 sources= 2 latest= 2026-01-19T08:52:15+00:00
9 c7a210e5-c4ca-4073-bd7d-aa5bc592e0ee count= 3 sources= 2 latest= 2026-01-19T05:00:16+00:00

=== STORYLINE 1 ===
Arti

In [11]:
# Assemble the Chinese post from the most recent digest: headline, summary,
# a blank line, then one reference entry per representative article.
header = [
    f"【{dig_zh['title_zh']}】",
    dig_zh["summary_zh"],
    "",
    "—— 参考报道 ——",
]

references = []
for article in reps:
    outlet = article.get("source_outlet") or ""
    headline = article.get("title") or ""
    url = article.get("link") or ""
    references.append(f"- {outlet}｜{headline}\n  {url}")

lines = header + references
post_zh = "\n".join(lines).strip()
print(post_zh)


【特朗普因格陵兰争端威胁对欧加征关税 欧盟拟法律反制】
- 美国总统特朗普威胁自2月1日起，对包括丹麦、挪威、瑞典、法国、德国、英国、荷兰和芬兰在内的八个欧洲国家进口商品加征10%关税，此举与格陵兰争端有关。
- 特朗普在被问及是否可能动用武力获取格陵兰时，以“不予置评”回应，未排除使用武力的可能性。
- 关税威胁引发对跨大西洋贸易战的担忧，欧洲股市应声下跌，金银价格则攀升至多年高位。
- 欧盟正考虑一项被称为“大型火箭筒”的强力法律反制措施，可能对价值约930亿欧元的美国进口商品加征报复性关税。
- 此次争端凸显了美欧之间围绕格陵兰战略重要性及更广泛地缘政治问题的紧张关系。
- 市场波动还受到对降息的预期以及中国出口限制等其他因素的影响。
- 美国最高法院预计将很快对特朗普依据紧急权力实施的关税合法性作出裁决。

—— 参考报道 ——
- The Guardian US News｜What are Trump’s latest tariff threats and could the EU hit back with its ‘big bazooka’?
  https://www.theguardian.com/us-news/2026/jan/19/donald-trump-tariff-eu-aci-europe-greenland-trade-war
- The Guardian World News｜Trump says ‘no comment’ when asked if he would seize Greenland by force – Europe live
  https://www.theguardian.com/world/live/2026/jan/19/donald-trump-greenland-denmark-nobel-nato-europe-live-latest-news-updates
- BBC Main Headlines｜Gold and silver prices hit high after tariff threat
  https://www.bbc.com/news/articles/cx2yppj4lg4o?at_medium=RSS&at_campaign=rss
