# FOIA-Free Content Pipeline v2 — Colab Runner

Two-lane architecture:
- **Lane A** (Candidates): ingest → enrich → triage → corroborate → package → render
- **Lane B** (Leads/Hunt): discover → hunt → verify → package → render

Run the cells top to bottom. Set your API keys in step 3 before running any pipeline stage:
- `OPENROUTER_API_KEY` — required for LLM scoring
- `BRAVE_API_KEY` — required for artifact hunt + corroboration
- `YOUTUBE_API_KEY` — optional (RSS fallback if not set)

In [None]:
# 1. Install dependencies
# Both lines are IPython shell escapes ("!"), so they run in the notebook's
# shell rather than in Python.
# -q keeps pip output quiet.
!pip install -q pyyaml python-dotenv openai requests feedparser beautifulsoup4 yt-dlp cloudscraper
# System-level ffmpeg via apt (-qq quiet, -y non-interactive).
# NOTE(review): presumably needed by the render stage — confirm.
!apt-get -qq install -y ffmpeg

In [None]:
# 2. Clone repo and set up Python path
import os, sys, subprocess

REPO_URL = 'https://github.com/jj55222/NEWS--VIEWS.git'
BRANCH = 'claude/foia-free-content-pipeline-qlbzp'
REPO_DIR = '/content/NEWS--VIEWS'

# Always reset to a safe directory first (handles deleted-cwd edge case)
os.chdir('/content')

if os.path.isdir(REPO_DIR):
    # Repo exists — make sure we're on the right branch and pull latest
    os.chdir(REPO_DIR)
    subprocess.run(['git', 'fetch', 'origin', BRANCH], check=False)
    subprocess.run(['git', 'checkout', BRANCH], check=False)
    subprocess.run(['git', 'pull', 'origin', BRANCH], check=False)
else:
    !git clone -b {BRANCH} {REPO_URL} {REPO_DIR}
    os.chdir(REPO_DIR)

# Add to Python path so "from scripts..." imports work
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

print('Working directory:', os.getcwd())
print('scripts/ exists:', os.path.isdir('scripts'))
print('config/ exists:', os.path.isdir('config'))
if not os.path.isdir('scripts'):
    print('\n⚠ scripts/ missing — run this in a new cell, then re-run this cell:')
    print('  !rm -rf /content/NEWS--VIEWS')

In [None]:
# 3. Set API keys
#
# Paste each key between the quotes below. A key left empty does NOT overwrite
# a value that is already present in the environment, so re-running this cell
# with blank placeholders no longer wipes keys configured earlier.
import os

api_keys = {
    # REQUIRED
    'OPENROUTER_API_KEY': '',  # Your OpenRouter API key
    'BRAVE_API_KEY': '',       # Your Brave Search API key (https://brave.com/search/api/)
    # OPTIONAL (YouTube RSS fallback if not set)
    'YOUTUBE_API_KEY': '',     # Your YouTube Data API v3 key
}

for name, value in api_keys.items():
    if value:
        os.environ[name] = value
    else:
        # Keep any existing value; still guarantee the variable is defined
        # (as an empty string), which is what this cell always did before.
        os.environ.setdefault(name, '')

keys_set = bool(os.environ.get('OPENROUTER_API_KEY')) and bool(os.environ.get('BRAVE_API_KEY'))
print('API keys configured.' if keys_set else 'WARNING: Set OPENROUTER_API_KEY and BRAVE_API_KEY above!')
if not os.environ.get('YOUTUBE_API_KEY'):
    print('Note: YOUTUBE_API_KEY not set — YouTube ingest will use RSS fallback (no duration/view data).')

In [None]:
# 4. Initialize database
#
# Creates the working directories and the database, then loads the policy and
# source configuration and prints a short summary of what was loaded.
import sys, os

# Cell 2 may have been skipped or be stale, so repeat the path setup here:
# move into the checkout and expose it to the import system when it exists.
REPO_DIR = '/content/NEWS--VIEWS'
repo_present = os.path.isdir(REPO_DIR)
if repo_present:
    os.chdir(REPO_DIR)
if repo_present and REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

from scripts.db import init_db
from scripts.config_loader import ensure_dirs, load_policy, load_sources


def _with_class(entries, label):
    """Return the entries whose "source_class" value equals *label*."""
    return [entry for entry in entries if entry.get("source_class") == label]


ensure_dirs()
init_db()

policy = load_policy()
sources = load_sources()

# Only enabled sources are broken down by class.
enabled = [src for src in sources if src.get("enabled")]
primary = _with_class(enabled, "primary")
secondary = _with_class(enabled, "secondary")
discovery = _with_class(enabled, "discovery_only")

print(f'Policy: {len(policy)} sections')
print(f'Sources: {len(sources)} total ({len(enabled)} enabled)')
print(f'  primary: {len(primary)} | secondary: {len(secondary)} | discovery_only: {len(discovery)}')

In [None]:
# ═══ LANE A: Candidate Flow ═══════════════════════════════════════════════

# 5a. INGEST — Pull new candidates (YouTube + RSS + pages)
from scripts.run_pipeline import run_ingest

# NOTE(review): days=7 is presumably the look-back window and dry_run=False a
# real (persisting) run — confirm against scripts/run_pipeline.py.
ingest_results = run_ingest(days=7, dry_run=False)
# Bare expression as the last line so the notebook displays the result.
ingest_results

In [None]:
# 5b. ENRICH — Add transcripts and entities
from scripts.run_pipeline import run_enrich

# NOTE(review): limit=200 presumably caps how many candidates are processed in
# this call — confirm against run_enrich.
enrich_results = run_enrich(limit=200, dry_run=False)
# Last-line expression: shown as the cell output.
enrich_results

In [None]:
# 5c. TRIAGE — Score and classify candidates
from scripts.run_pipeline import run_triage

# NOTE(review): limit=200 presumably bounds the number of candidates triaged
# per call — confirm against run_triage.
triage_results = run_triage(limit=200, dry_run=False)
# Echo the returned value as the cell output.
triage_results

In [None]:
# 6. Review PASS candidates (Lane A)
#
# Fetches up to 50 candidates with status 'PASS' and prints the first 20 with
# score, source class, title, incident type, URL and triage rationale.
from scripts.db import get_connection, get_candidates
import json

conn = get_connection()
try:
    pass_candidates = get_candidates(conn, status='PASS', limit=50)
    print(f'PASS candidates: {len(pass_candidates)}')
    print()

    for i, c in enumerate(pass_candidates[:20], 1):
        sc = c.get('source_class', '?')
        # Guard against a missing title the same way the rationale is guarded,
        # so one incomplete row cannot abort the whole listing.
        title = (c["title"] or "")[:75]
        print(f'{i:2d}. [{c["triage_score"]:3d}] [{sc}] {title}')
        print(f'    Type: {c["incident_type"]} | URL: {c["url"]}')
        print(f'    Reason: {(c.get("triage_rationale") or "")[:100]}')
        print()
finally:
    # Release the connection even if a query or a format step raised.
    conn.close()

In [None]:
# 7. CORROBORATE — Gather supporting sources for PASS candidates
from scripts.run_pipeline import run_corroborate

# NOTE(review): limit=20 presumably caps the candidates corroborated in this
# call — confirm against run_corroborate.
corr_results = run_corroborate(limit=20, dry_run=False)
# Displayed as the cell output.
corr_results

In [None]:
# 8. PACKAGE — Build case bundles (timeline, narration, shorts plan)
from scripts.run_pipeline import run_package

# NOTE(review): limit=5 presumably bounds how many bundles are built per call —
# confirm against run_package.
package_results = run_package(limit=5, dry_run=False)
# Final expression so the notebook renders the result.
package_results

In [None]:
# 9. RENDER — Download, cut, caption, export
from scripts.run_pipeline import run_render

# NOTE(review): limit=3 presumably bounds how many items are rendered per
# call — confirm against run_render.
render_results = run_render(limit=3, dry_run=False)
# Shown as the cell output.
render_results

In [None]:
# ═══ LANE B: Lead/Artifact Flow (v2) ═════════════════════════════════════

# 10a. DISCOVER — RSS + pages → case_leads with hook scoring
from scripts.run_pipeline import run_discover

# NOTE(review): days=7 is presumably the look-back window for discovery —
# confirm against run_discover.
discover_results = run_discover(days=7, dry_run=False)
# Last-line expression: rendered as the cell output.
discover_results

In [None]:
# 10b. HUNT — Brave Search for primary artifacts (bodycam, dashcam, court, docs)
from scripts.run_pipeline import run_hunt

# NOTE(review): min_hook=70 is presumably the minimum hook score a lead needs
# to be hunted, and limit=50 the maximum number of leads — confirm against
# run_hunt.
hunt_results = run_hunt(min_hook=70, limit=50, dry_run=False)
# Displayed as the cell output.
hunt_results

In [None]:
# 12. Pipeline stats (both lanes)
#
# Prints the row count of each pipeline table, followed by per-status counts
# for candidates (Lane A), case leads (Lane B) and case bundles. Zero counts
# are omitted. A table that cannot be queried is reported and skipped, so one
# missing table does not hide the remaining stats.
from scripts.db import get_connection


def _print_status_counts(conn, heading, table, column, statuses):
    """Print *heading*, then one line per status that has a non-zero row count.

    *table* and *column* are interpolated into the SQL text and must therefore
    be trusted constants; the status value itself is bound as a parameter.
    """
    print()
    print(heading)
    for status in statuses:
        try:
            count = conn.execute(
                f'SELECT COUNT(*) FROM {table} WHERE {column} = ?', (status,)
            ).fetchone()[0]
        except Exception as exc:
            # The driver's exception type is not visible from this cell, so
            # catch broadly, but report instead of hiding the failure.
            print(f'  ({table} unavailable: {exc})')
            return
        if count > 0:
            print(f'  {status}: {count}')


conn = get_connection()
try:
    print('=== Pipeline Stats ===')
    for table in ['candidates', 'case_leads', 'artifacts', 'case_bundles', 'cases', 'corroboration_sources']:
        try:
            count = conn.execute(f'SELECT COUNT(*) FROM {table}').fetchone()[0]
        except Exception as exc:
            # Still best-effort (keep going), but say which table failed.
            print(f'  {table}: unavailable ({exc})')
            continue
        if count > 0:
            print(f'  {table}: {count} rows')

    _print_status_counts(
        conn, 'Lane A — Triage distribution:',
        'candidates', 'triage_status',
        ['NEW', 'PASS', 'MAYBE', 'KILL'],
    )
    _print_status_counts(
        conn, 'Lane B — Lead status distribution:',
        'case_leads', 'status',
        ['NEW', 'HUNTING', 'ARTIFACT_FOUND', 'NO_ARTIFACT', 'KILL'],
    )
    _print_status_counts(
        conn, 'Bundle status distribution:',
        'case_bundles', 'status',
        ['APPROVED', 'PACKAGED', 'RENDERED', 'READY_TO_PUBLISH'],
    )
finally:
    # Always release the connection, even if a query raised.
    conn.close()

In [None]:
# 10c. Review leads + artifacts
#
# For each lead status, prints up to 10 leads with their hook score and title.
# Leads in ARTIFACT_FOUND additionally show how many artifacts they have and
# how many of those are of source class 'primary'.
from scripts.db import get_connection, get_leads, get_artifacts

conn = get_connection()
try:
    # Leads summary
    for status in ['NEW', 'HUNTING', 'ARTIFACT_FOUND', 'NO_ARTIFACT']:
        leads = get_leads(conn, status=status)
        if not leads:
            continue
        print(f'=== {status} leads: {len(leads)} ===')
        for i, lead in enumerate(leads[:10], 1):
            print(f'  {i:2d}. [{lead["hook_score"]:3d}] {lead["title"][:70]}')
            if status == 'ARTIFACT_FOUND':
                arts = get_artifacts(conn, lead['lead_id'])
                # Named primary_arts so the notebook-level `primary` list of
                # sources built in cell 4 is not overwritten.
                primary_arts = [a for a in arts if a.get('source_class') == 'primary']
                print(f'      Artifacts: {len(arts)} total, {len(primary_arts)} primary')
        print()
finally:
    # Release the connection even if a lookup or a format step raised.
    conn.close()

In [None]:
# 10d. VERIFY + PACKAGE — Corroborate leads and build case bundles
from scripts.run_pipeline import run_verify, run_package_v2

# Verification runs before packaging; each result is printed with a label
# rather than relying on last-expression display.
# NOTE(review): the limit values presumably cap the items handled per call —
# confirm against scripts/run_pipeline.py.
verify_results = run_verify(limit=20, dry_run=False)
print('Verify:', verify_results)

package_v2_results = run_package_v2(limit=10, dry_run=False)
print('Package v2:', package_v2_results)

In [None]:
# 11. Missed Opportunity Report
from scripts.run_pipeline import run_report

# NOTE(review): top_n=30 presumably limits the report to its 30 highest-ranked
# entries — confirm against run_report.
report_summary = run_report(top_n=30)
# Displayed as the cell output.
report_summary

In [None]:
# FULL PIPELINE (single command)
# Uncomment ONE of the three blocks below (all three of its code lines) to run that lane:

# Lane A only (v1 candidate flow):
# from scripts.run_pipeline import run_pipeline
# results = run_pipeline(lane='a', days=7, dry_run=False)
# results

# Lane B only (v2 lead/artifact flow):
# from scripts.run_pipeline import run_pipeline
# results = run_pipeline(lane='b', days=7, dry_run=False)
# results

# Both lanes:
# from scripts.run_pipeline import run_pipeline
# results = run_pipeline(days=7, dry_run=False)
# results