<a href="https://colab.research.google.com/github/k-ferry/cs676-fall-2025/blob/main/project-1/deliverable3/deliverable3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from google.colab import userdata
import os
# Copy the Perplexity key from Colab's user secrets into this process's
# environment so code that reads the PERPLEXITY_API_KEY env var can find it.
os.environ["PERPLEXITY_API_KEY"] = userdata.get("PERPLEXITY_API_KEY")  # sets it for src/pplx_client.py


In [2]:
import os
# exist_ok=True makes this cell safe to re-run.
os.makedirs("src", exist_ok=True)  # create src/ if it doesn't exist


In [3]:
%%writefile src/scorer.py
# scorer.py
# Hybrid, interpretable credibility scorer specialized for soccer-card sources.
# This version:
#   - Removes irrelevant .gov/.edu priors
#   - Adds host-specific priors for eBay/COMC/PWCC/Goldin/etc. + manufacturers/graders
#   - Expands hobby terms (e.g., Sapphire, Logofractor, 1/1)
#   - Keeps weights modest so content/seller evidence still drives scores
#
# Public contract (stable):
#   score_url(url) -> {
#     "url": str,
#     "status": "ok" | "invalid_url" | "fetch_error" | ...,
#     "score": {"absolute": float, "percentile": float|None},
#     "signals": [{"name","value","weight","rationale"}, ...],
#     "errors": [str, ...],
#     "meta": {"host": str, "is_ebay": bool, "fetched_at": iso, "elapsed_ms": int, "fetch_ms": int|None, "version": str}
#   }

from __future__ import annotations
import dataclasses
import math
import re
import time
import typing as t
from dataclasses import dataclass
from datetime import datetime, timezone
from urllib.parse import urlparse

# Optional deps (graceful fallbacks)
try:
    from bs4 import BeautifulSoup  # type: ignore
except Exception:
    BeautifulSoup = None

try:
    import requests
except Exception:
    requests = None

try:
    import pandas as pd
except Exception:
    pd = None

# ----------------------------
# Network defaults (fetch path)
# ----------------------------
# Timeout in seconds, passed as `timeout=` to the HTTP GET issued by score_url.
DEFAULT_TIMEOUT_S = 6.0
# Request headers sent with every page fetch made by score_url.
DEFAULT_HEADERS = {
    "User-Agent": "CredScorer/0.2 (+https://example.edu/project)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# ---------------------------------
# Domain recognizers (compiled once)
# ---------------------------------
# Keep EbayLike for fast checks elsewhere.
# Matches a hostname that is, or is a subdomain of, an eBay site:
#   ebay.com, ebay.com.<cc> (e.g. ebay.com.au), ebay.co.<cc> (e.g. ebay.co.uk)
#   and ebay.<cc> (e.g. ebay.de). The trailing `$` rejects look-alike hosts
#   such as "ebay.com.evil.example".
EbayLike = re.compile(r"(^|\.)ebay\.(com(\.[a-z]{2})?|co\.[a-z]{2}|[a-z]{2})$", re.I)

# Every pattern is meant to be applied with .search() to a bare hostname.
# `(^|\.)` pins the left edge to a label boundary and `$` pins the right edge,
# so "notcomc.com" and "comc.com.evil.example" do not match "comc".
MarketplacePatterns = {
    # Marketplaces
    "ebay":        EbayLike,  # single source of truth for the eBay pattern
    "comc":        re.compile(r"(^|\.)comc\.com$", re.I),
    "pwcc":        re.compile(r"(^|\.)pwccmarketplace\.com$", re.I),
    "goldin":      re.compile(r"(^|\.)goldin\.co$", re.I),
    "myslabs":     re.compile(r"(^|\.)myslabs\.com$", re.I),
    "alt":         re.compile(r"(^|\.)alt\.xyz$", re.I),
    "stockx":      re.compile(r"(^|\.)stockx\.com$", re.I),
    "whatnot":     re.compile(r"(^|\.)whatnot\.com$", re.I),

    # Manufacturers
    "topps":       re.compile(r"(^|\.)topps\.com$", re.I),
    "panini":      re.compile(r"(^|\.)paniniamerica\.net$", re.I),
    "upperdeck":   re.compile(r"(^|\.)upperdeck\.com$", re.I),

    # Grading / population / cert lookup
    "psa":         re.compile(r"(^|\.)psacard\.com$", re.I),
    "beckett":     re.compile(r"(^|\.)beckett\.com$", re.I),
    # Both alternatives share one `$` anchor; previously only the second
    # alternative was anchored, so "gosgc.com.<anything>" matched.
    "sgc":         re.compile(r"(^|\.)(gosgc|sgccard)\.com$", re.I),

    # Hobby references (light positive)
    "tcdb":        re.compile(r"(^|\.)tradingcarddb\.com$", re.I),
    # NOTE(review): this pattern contains a "/" and therefore cannot match a
    # bare hostname; it only fires if the caller passes host+path.
    "beckett_pg":  re.compile(r"(^|\.)beckett\.com/(price-guide|news)", re.I),
    "130point":    re.compile(r"(^|\.)130point\.com$", re.I),
}

# Hosts that _host_category maps to the "low_signal" prior. Patterns are
# applied with .search() to the lower-cased hostname.
LowSignalPatterns = {
    # No `$` anchor: matches pinterest.<any TLD>.
    "pinterest":   re.compile(r"(^|\.)pinterest\.", re.I),
    "medium":      re.compile(r"(^|\.)medium\.com$", re.I),
    # No `$` anchor: matches blogspot.<any TLD>.
    "blogspot":    re.compile(r"(^|\.)blogspot\.", re.I),
    "tiktok":      re.compile(r"(^|\.)tiktok\.com$", re.I),
    # URL shorteners; each alternative carries its own `$` anchor.
    "shorteners":  re.compile(r"(^|\.)bit\.ly$|(^|\.)tinyurl\.com$|(^|\.)t\.co$", re.I),
}

# -----------------------
# Data classes & contract
# -----------------------
@dataclass
class Signal:
    """One named, explainable input to the credibility score.

    The amount a signal adds to the raw score is ``value * weight``.
    """
    name: str
    value: float     # normalized 0..1; higher means more credible
    weight: float    # 0..1; how strongly this signal influences the total
    rationale: str   # human-readable explanation of the value

    def contribution(self) -> float:
        """Return this signal's weighted share of the raw score."""
        return self.weight * self.value

@dataclass
class ScoreResult:
    """Internal result record; response_json() converts it to the public dict."""
    url: str                  # the URL exactly as passed to score_url
    status: str               # score_url sets "ok", "invalid_url" or "fetch_error"
    score_abs: float          # absolute score (0.0 for invalid URLs)
    score_pct: float | None   # percentile vs. a cohort; None when not computed
    signals: list[Signal]     # every signal that fed the score
    errors: list[str]         # non-fatal problems collected while scoring
    meta: dict[str, t.Any]    # host / timing / version details

def response_json(result: ScoreResult) -> dict:
    """Serialize a ScoreResult into the public, JSON-compatible payload.

    Signals are expanded to plain dicts; ``errors`` and ``meta`` are passed
    through by reference.
    """
    score_block = {"absolute": result.score_abs, "percentile": result.score_pct}
    signal_rows = list(map(dataclasses.asdict, result.signals))
    payload = {
        "url": result.url,
        "status": result.status,
        "score": score_block,
        "signals": signal_rows,
        "errors": result.errors,
        "meta": result.meta,
    }
    return payload

# -------------
# Small helpers
# -------------
def _cheap_text(html: str) -> str:
    """Reduce HTML to whitespace-normalized visible text.

    Uses BeautifulSoup when the module-level name is available (trying the
    "lxml" parser first, then "html.parser"); otherwise strips script/style
    blocks and tags with regular expressions.
    """
    soup_factory = globals().get("BeautifulSoup")
    if soup_factory is None:
        # Regex fallback: drop script/style bodies first, then any other tag.
        stripped = html
        for block_pattern in (r"<script[\s\S]*?</script>", r"<style[\s\S]*?</style>"):
            stripped = re.sub(block_pattern, " ", stripped, flags=re.I)
        stripped = re.sub(r"<[^>]+>", " ", stripped)
    else:
        try:
            tree = soup_factory(html, "lxml")
        except Exception:
            tree = soup_factory(html, "html.parser")
        for node in tree(["script","style","noscript"]):
            node.decompose()
        stripped = tree.get_text(" ", strip=True)
    return re.sub(r"\s+", " ", stripped).strip()

def _count_images(html: str) -> int:
    """Count ``<img>`` elements in *html*.

    Parses with BeautifulSoup when the module-level name is available
    ("lxml" first, then "html.parser"); otherwise counts case-insensitive
    ``<img`` tag openings with a regex.
    """
    soup_factory = globals().get("BeautifulSoup")
    if soup_factory is None:
        return sum(1 for _ in re.finditer(r"<img\b", html, re.I))
    try:
        tree = soup_factory(html, "lxml")
    except Exception:
        tree = soup_factory(html, "html.parser")
    return len(tree.find_all("img"))

def _squash_0_100(raw: float) -> float:
    """Map a raw contribution sum onto 0..100 with a logistic curve.

    The curve is centred at 0.8 (which maps to 50) with steepness 3.5; the
    result is rounded to two decimals to avoid false precision.
    """
    midpoint, steepness = 0.8, 3.5
    logistic = 1 / (1 + math.exp(-steepness * (raw - midpoint)))
    return round(100 * logistic, 2)

def _percentile(x: float, arr: list[float]) -> float:
    """Return the share of *arr* that is <= *x*, as a percentage (2 decimals).

    An empty *arr* yields NaN.
    """
    if not arr:
        return float("nan")
    at_or_below = len([item for item in arr if item <= x])
    return round(100 * at_or_below / len(arr), 2)

def _now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current_utc = datetime.now(tz=timezone.utc)
    return current_utc.isoformat()

def _elapsed_ms(t0: float) -> int:
    """Return whole milliseconds elapsed since the perf_counter reading *t0*."""
    seconds = time.perf_counter() - t0
    return int(seconds * 1000)

def _synthetic_page_for(host: str) -> str:
    """Return canned HTML used instead of a real fetch when dry_run=True.

    eBay-like hosts get a listing-style page; every other host gets a short
    article-style page. Output depends only on *host*, keeping tests stable.
    """
    if not EbayLike.search(host or ""):
        return (
            "<html><body>By John Doe. Published 2023. References: https://doi.org/10.x/y "
            "This is a sample article body with some length and structure.</body></html>"
        )
    listing_parts = (
        "<html><head><title>eBay Listing</title></head><body>",
        "Top Rated Seller (99.7% positive feedback) (12450) feedback. ",
        "2024 Topps Chrome UEFA Refractor PSA 10 Rookie /99 auto. ",
        "Ships from New York. 30 day returns. <img/><img/><img/><img/><img/><img/>",
        "</body></html>",
    )
    return "".join(listing_parts)

# -------------------------
# Host prior (domain score)
# -------------------------
def _host_category(host: str) -> tuple[str, float, float, str]:
    """
    Classify *host* and return (category, value, weight, rationale).

    Checks run in a fixed order: marketplaces, authorities (manufacturers and
    graders), hobby references, low-signal hosts, then a generic ".com"
    baseline and finally an "unknown" default. Weights are deliberately small.

    NOTE(review): the "beckett_pg" pattern contains a "/", so it can only
    match if *host* carries a path; with a bare hostname it never fires.
    """
    hostname = (host or "").lower()

    def first_hit(names):
        # First key (in the given order) whose pattern matches the hostname.
        return next((n for n in names if MarketplacePatterns[n].search(hostname)), None)

    # Marketplaces, in the same order they appear in MarketplacePatterns.
    market = first_hit(("ebay", "comc", "pwcc", "goldin", "myslabs", "alt", "stockx", "whatnot"))
    if market == "ebay":
        return ("marketplace", 0.78, 0.12, "Trusted marketplace (eBay)")
    if market in ("pwcc", "goldin"):
        name = market
        return ("marketplace", 0.76, 0.11, f"Reputable auction marketplace ({name})")
    if market in ("comc", "myslabs", "alt"):
        name = market
        return ("marketplace", 0.72, 0.10, f"Known card marketplace ({name})")
    if market in ("stockx", "whatnot"):
        name = market
        return ("marketplace", 0.66, 0.08, f"General marketplace ({name})")

    # Manufacturers and grading companies.
    name = first_hit(("topps", "panini", "upperdeck", "psa", "beckett", "sgc"))
    if name is not None:
        return ("authority", 0.80, 0.12, f"Official/authority ({name})")

    # Hobby reference sites (light positive).
    name = first_hit(("tcdb", "beckett_pg", "130point"))
    if name is not None:
        return ("reference", 0.68, 0.08, f"Hobby reference ({name})")

    # Low-signal hosts (light negative).
    name = next((key for key, pat in LowSignalPatterns.items() if pat.search(hostname)), None)
    if name is not None:
        return ("low_signal", 0.50, 0.05, f"Low-signal host ({name})")

    # Nothing recognised: fall back to a generic baseline.
    if not hostname.endswith(".com"):
        return ("unknown", 0.52, 0.05, "Unknown/low-signal domain")
    return ("generic", 0.60, 0.06, ".com baseline")

def _signal_domain_baseline(host: str) -> Signal:
    """Wrap the host prior from _host_category in a "domain_prior" Signal."""
    _category, prior, influence, reason = _host_category(host)
    return Signal(name="domain_prior", value=prior, weight=influence, rationale=reason)

def _signal_transport_security(scheme: str) -> Signal:
    """Return the "https" Signal: 1.0 for an https scheme, 0.4 otherwise."""
    if scheme == "https":
        transport_value = 1.0
    else:
        transport_value = 0.4
    return Signal("https", transport_value, 0.04, "HTTPS vs HTTP transport")

# ----------------------------
# Content heuristics (generic)
# ----------------------------
def _signals_content_quality(html: str) -> list[Signal]:
    """Derive three host-agnostic signals from the page's visible text.

    Returns, in order: "content_length" (word-count band), "citations_links"
    (URL/DOI mentions in the text, saturating at five) and
    "author_block_hint" (a "by <Capitalized>" byline).
    """
    body = _cheap_text(html)
    word_count = len(body.split())

    # Word-count bands: (inclusive upper bound, value, rationale). Anything
    # above the last bound is "very long" (diminishing returns).
    length_bands = (
        (30, 0.20, "Very short body"),
        (120, 0.55, "Short body"),
        (2500, 0.80, "Reasonable body length"),
    )
    length_value, length_why = 0.60, "Very long body"
    for upper_bound, band_value, band_why in length_bands:
        if word_count <= upper_bound:
            length_value, length_why = band_value, band_why
            break

    # Weak proxy for sourcing: URLs / DOI references that survive in the text.
    link_hits = re.findall(r"(doi\.org/|https?://)\S+", body)
    link_value = min(len(link_hits) / 5, 1.0)

    # Byline hint such as "by John".
    byline = re.search(r"\bby\s+[A-Z][a-z]+", body)
    author_value = 1.0 if byline else 0.5

    return [
        Signal("content_length", length_value, 0.07, length_why),
        Signal("citations_links", link_value, 0.04, "Outbound refs/links density"),
        Signal("author_block_hint", author_value, 0.03, "Author/date block hints"),
    ]

# ---------------------------------------
# eBay-aware signals + lightweight sentiment
# ---------------------------------------
# Hobby keyword -> additive weight for the "card_specificity_terms" signal.
# NOTE: _signals_ebay_listing tests each key with a plain substring check
# against the lower-cased page text, so short keys match inside longer words
# (e.g. "rc", "red", "auto") and "/" matches any slash on the page.
CARD_TERMS = {
    # Rookie / desirability
    "rookie": 0.12, "rc": 0.08, "rookie card": 0.10, "true rookie": 0.08,

    # Grading
    "psa 10": 0.16, "bgs 9.5": 0.10, "sgc 10": 0.08, "gem mint": 0.12,

    # Serial/auto
    "1/1": 0.14, "one of one": 0.14, "auto": 0.12, "autograph": 0.12, "/": 0.10,

    # Sets/variants (Topps Chrome universe and friends)
    "refractor": 0.08, "sapphire": 0.08, "logofractor": 0.10, "mojo": 0.06, "speckle": 0.06,
    "aqua": 0.05, "gold": 0.06, "orange": 0.06, "red": 0.06, "black": 0.06,
    "prizm": 0.08, "topps": 0.06, "merlin": 0.06, "select": 0.06, "optic": 0.06,
    "megacracks": 0.10, "megarcracks": 0.08,  # common misspelling safety net
}

# Sentiment lexicons used by _sentiment_features; membership is tested per
# lower-cased token, where a token is a run of letters and hyphens.
_POS = {"grail","pc","beautiful","clean","crisp","gem","iconic","undervalued","deal","bargain","goat","legend","heat"}
_NEG = {"creased","damage","ding","scratches","scratched","off-center","offcenter","trimmed","fake","reprint","altered","stain","worst","overpriced"}

def _sentiment_features(text: str) -> list[Signal]:
    """Score lexicon-based sentiment of *text* as a single "sentiment" Signal.

    Polarity is (positive - negative) / (positive + negative) over lexicon
    hits, in [-1, 1]; with no hits it is 0. The signal value rescales that
    to [0, 1].
    """
    words = re.findall(r"[a-zA-Z\-]+", text.lower())
    positives = len([w for w in words if w in _POS])
    negatives = len([w for w in words if w in _NEG])
    denominator = max(positives + negatives, 1)  # avoid division by zero
    polarity = (positives - negatives) / denominator
    rescaled = (polarity + 1) / 2
    return [Signal("sentiment", rescaled, 0.05, f"lexicon polarity {polarity:.2f}")]

def _signals_ebay_listing(html: str) -> list[Signal]:
    """Extract eBay-listing signals from a listing page's HTML.

    Signals are emitted in this order (some only when their cue is found):
    sentiment, seller_feedback_pct (always; lower weight when not found),
    seller_feedback_count, top_rated, returns_policy,
    card_specificity_terms (always), year_set_hint (always),
    image_count (always) and shipping_from.

    Args:
        html: Raw HTML of the listing page.

    Returns:
        List of Signal objects describing seller and listing quality.
    """
    s: list[Signal] = []
    text = _cheap_text(html)
    lower = text.lower()

    # Sentiment (small nudge)
    s.extend(_sentiment_features(text))

    # Seller feedback %. Accepts both "99.7%" and whole numbers such as
    # "100%" / "99%"; previously a decimal digit was mandatory, so whole
    # percentages were reported as "not found".
    m = re.search(r"(\d{1,3}(?:\.\d+)?)\s*%\s*positive feedback", text, re.I)
    if m:
        pct = min(float(m.group(1)), 100.0)      # guard against >100 captures
        v = 0.2 + 0.8 * (pct / 100.0)            # maps 0..100% → ~0.2..1.0
        s.append(Signal("seller_feedback_pct", min(v, 1.0), 0.12, f"Seller feedback {pct}%"))
    else:
        s.append(Signal("seller_feedback_pct", 0.55, 0.06, "Feedback % not found"))

    # Seller feedback count (log scale → diminishing returns)
    m2 = re.search(r"\((\d{2,6})\)\s*feedback", text, re.I)
    if m2:
        cnt = int(m2.group(1))
        v = min(math.log10(max(cnt, 1)) / 5.0 + 0.4, 1.0)
        s.append(Signal("seller_feedback_count", v, 0.08, f"Feedback count {cnt}"))

    # Top Rated badge
    if re.search(r"top rated seller", text, re.I):
        s.append(Signal("top_rated", 1.0, 0.06, "Top Rated Seller badge"))

    # Returns policy
    if re.search(r"\b(30|60)\s*day returns?\b", text, re.I):
        s.append(Signal("returns_policy", 0.92, 0.05, "30/60-day returns"))
    elif re.search(r"no returns", text, re.I):
        s.append(Signal("returns_policy", 0.50, 0.05, "No returns"))

    # Listing specificity: hobby keywords + jersey number hint.
    # Keywords are matched as plain substrings of the lower-cased text.
    term_score = sum(w for k, w in CARD_TERMS.items() if k in lower)
    if re.search(r"\b#?\d{1,2}\b", lower):
        term_score += 0.04
    term_score = min(term_score, 1.0)
    s.append(Signal("card_specificity_terms", term_score, 0.14, "Hobby keywords present"))

    # Year + Set present
    any_year = bool(re.search(r"\b(19|20)\d{2}\b", text))
    any_set  = bool(re.search(r"(prizm|topps|merlin|select|optic|megacracks|chrome|sapphire|logofractor)", lower))
    s.append(Signal("year_set_hint", 1.0 if (any_year and any_set) else 0.6, 0.06, "Year+Set mentioned"))

    # Images: more photos → higher value, in three bands.
    imgs = _count_images(html)
    if imgs >= 8:
        s.append(Signal("image_count", 0.95, 0.05, f"{imgs} images"))
    elif imgs >= 4:
        s.append(Signal("image_count", 0.75, 0.05, f"{imgs} images"))
    else:
        s.append(Signal("image_count", 0.55, 0.05, f"{imgs} images"))

    # Shipping traceability
    if re.search(r"ships from\s+[A-Za-z ]+", lower):
        s.append(Signal("shipping_from", 0.70, 0.03, "Ships-from present"))

    return s

# -------------------------
# Core scorer & batch rank
# -------------------------
def score_url(
    url: str,
    *,
    dry_run: bool=False,
    cohort_scores: t.Sequence[float] | None=None,
    session: t.Any | None=None,
) -> dict:
    """
    Evaluate one URL and return structured credibility JSON (public contract).

    Args:
        url: Absolute http(s) URL to score.
        dry_run: When True, no network request is made; deterministic
            synthetic HTML for the host is scored instead.
        cohort_scores: Optional absolute scores to rank this result against;
            when given and non-empty, ``score.percentile`` is filled in.
        session: Optional object with a requests-style ``get`` method. When
            omitted, a temporary ``requests.Session`` is created and closed
            before returning.

    Returns:
        The dict produced by ``response_json``. This function does not raise
        for bad URLs or failed fetches; those are reported through ``status``
        ("invalid_url" / "fetch_error") and ``errors``. ``meta`` always
        carries the same keys, including for invalid URLs.
    """
    t0 = time.perf_counter()
    version = "d3-0.2"
    errors: list[str] = []
    signals: list[Signal] = []

    # 1) URL parse/validate
    try:
        parsed = urlparse(url)
        if parsed.scheme not in {"http","https"} or not parsed.netloc:
            raise ValueError("URL must include http(s) scheme and host")
        host = parsed.hostname or ""
    except Exception as e:
        # Keep the full meta key set so consumers can index meta["host"] etc.
        # without special-casing invalid URLs.
        result = ScoreResult(
            url=url, status="invalid_url", score_abs=0.0, score_pct=None, signals=[],
            errors=[f"invalid_url: {e}"],
            meta={
                "host": "",
                "is_ebay": False,
                "fetched_at": _now_iso(),
                "elapsed_ms": _elapsed_ms(t0),
                "fetch_ms": None,
                "version": version,
            },
        )
        return response_json(result)

    is_ebay = bool(EbayLike.search(host or ""))

    # 2) Domain/transport priors
    signals.append(_signal_domain_baseline(host))
    signals.append(_signal_transport_security(parsed.scheme))

    # 3) Fetch or synthesize content
    html: str | None = None
    status: str = "ok"
    fetched_ms = None

    if dry_run:
        fetch_t0 = time.perf_counter()
        html = _synthetic_page_for(host)
        fetched_ms = _elapsed_ms(fetch_t0)
    elif requests is None:
        errors.append("requests_not_available"); status = "fetch_error"
    else:
        own_session = None  # set only when we create the session ourselves
        try:
            if session:
                sess = session
            else:
                sess = own_session = requests.Session()
            fetch_t0 = time.perf_counter()
            r = sess.get(url, headers=DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT_S)
            fetched_ms = _elapsed_ms(fetch_t0)  # time spent in the fetch only
            if r.status_code >= 400:
                raise RuntimeError(f"HTTP {r.status_code}")
            html = r.text
        except Exception as e:
            errors.append(f"fetch_error: {e}"); status = "fetch_error"
        finally:
            # Release pooled connections; a caller-supplied session is left open.
            if own_session is not None:
                own_session.close()

    # 4) Content/platform signals (each parser is isolated so one failure
    #    does not discard the other's signals)
    if html:
        try:
            signals.extend(_signals_content_quality(html))
        except Exception as e:
            errors.append(f"content_parse_error: {e}")
        try:
            if is_ebay:
                signals.extend(_signals_ebay_listing(html))
        except Exception as e:
            errors.append(f"ebay_parse_error: {e}")

    # 5) Aggregate score
    raw = sum(s.contribution() for s in signals)
    abs_score = _squash_0_100(raw)

    # 6) Optional percentile vs cohort
    pct = None
    if cohort_scores:
        try:
            pct = _percentile(abs_score, list(cohort_scores))
        except Exception as e:
            errors.append(f"percentile_error: {e}")

    # 7) Assemble payload
    result = ScoreResult(
        url=url,
        status=status,
        score_abs=abs_score,
        score_pct=pct,
        signals=signals,
        errors=errors,
        meta={
            "host": host,
            "is_ebay": is_ebay,
            "fetched_at": _now_iso(),
            "elapsed_ms": _elapsed_ms(t0),
            "fetch_ms": fetched_ms,
            "version": version,
        },
    )
    return response_json(result)

def rank_listings(urls: list[str], *, dry_run: bool=False) -> list[dict]:
    """
    Score a list of URLs, attach within-batch percentiles, return rows sorted by absolute score desc.

    Args:
        urls: URLs to score; each is passed to ``score_url``.
        dry_run: Forwarded to ``score_url``; when True no HTTP session is
            created.

    Returns:
        One ``score_url`` payload per input URL, with ``score.percentile``
        overwritten by the rank within this batch, ordered best-first.
    """
    # One shared session for the batch; closed below so pooled connections
    # are released.
    sess = None if dry_run else (requests.Session() if requests else None)
    try:
        rows = [score_url(u, dry_run=dry_run, session=sess) for u in urls]
    finally:
        if sess is not None:
            sess.close()

    abs_scores = [r["score"]["absolute"] for r in rows]
    for r in rows:
        r["score"]["percentile"] = _percentile(r["score"]["absolute"], abs_scores)

    rows.sort(key=lambda d: d["score"]["absolute"], reverse=True)
    return rows

def to_dataframe(rows: list[dict]):
    """
    Convenience: flatten results into a DataFrame; expands signal contributions as columns.

    Args:
        rows: Result dicts in the ``score_url`` payload shape.

    Returns:
        A DataFrame sorted by ``score_abs`` descending, with one
        ``sig_<name>`` column per signal holding ``value * weight``. An empty
        *rows* yields an empty frame that still has the base columns.

    Raises:
        RuntimeError: If pandas is not installed.
    """
    if pd is None:
        raise RuntimeError("pandas not installed")
    base_columns = ["url", "score_abs", "score_pct", "status", "host", "is_ebay"]
    flat = []
    for r in rows:
        meta = r.get("meta") or {}
        base = {
            "url": r["url"],
            "score_abs": r["score"]["absolute"],
            "score_pct": r["score"].get("percentile"),
            "status": r["status"],
            "host": meta.get("host"),
            "is_ebay": meta.get("is_ebay"),
        }
        for sig in r.get("signals") or []:
            base[f"sig_{sig['name']}"] = sig["value"] * sig["weight"]
        flat.append(base)
    if not flat:
        # sort_values("score_abs") would raise KeyError on a column-less frame.
        return pd.DataFrame(columns=base_columns)
    return pd.DataFrame(flat).sort_values("score_abs", ascending=False)


Writing src/scorer.py


In [4]:
!ls -la src | head -n 20


total 28
drwxr-xr-x 2 root root  4096 Oct  3 18:43 .
drwxr-xr-x 1 root root  4096 Oct  3 18:43 ..
-rw-r--r-- 1 root root 18380 Oct  3 18:43 scorer.py


In [5]:
from src.scorer import score_url, rank_listings
rows = rank_listings(["https://www.ebay.com/itm/123","https://www.comc.com/Cards/Soccer"], dry_run=True)
rows[0]["score"], rows[0]["meta"]["host"]


({'absolute': 41.93, 'percentile': 100.0}, 'www.ebay.com')

In [6]:
%%writefile src/pplx_client.py
# pplx_client.py
# Thin client for Perplexity Chat Completions API to discover URLs.

from __future__ import annotations
import os, re, json, time
import typing as t
import requests

def _extract_urls(text: str) -> list[str]:
    """Return the distinct http(s) URLs found in *text*, in first-seen order.

    Trailing sentence punctuation (``. , ) ; :``) is trimmed from each match
    before de-duplication.
    """
    ordered: dict[str, None] = {}
    for match in re.finditer(r'https?://[^\s)>\]"}]+', text, flags=re.I):
        cleaned = match.group(0).rstrip('.,);:')
        ordered.setdefault(cleaned, None)  # dict keeps insertion order
    return list(ordered)

def pplx_search_sources(player: str, *, max_urls: int = 12, api_key: str | None = None, model: str = "sonar-pro") -> dict:
    """
    Ask the Perplexity chat-completions endpoint for card-related URLs.

    Args:
        player: Player name inserted into the user prompt.
        max_urls: Maximum number of URLs returned under "urls".
        api_key: API key; falls back to the PERPLEXITY_API_KEY env var.
        model: Model name sent in the request payload.

    Returns: {"prompt": <str>, "answer": <str>, "citations": [url,...], "urls": [url,...]}
        "urls" is the de-duplicated union of string citations followed by
        URLs extracted from the answer text, truncated to *max_urls*.

    Raises:
        AssertionError: If no API key is available.
        requests.HTTPError: If the API responds with an error status.
    """
    key = api_key or os.getenv("PERPLEXITY_API_KEY")
    if not key:
        # Explicit raise instead of `assert` so the check is not stripped
        # under `python -O`; the exception type is kept for compatibility.
        raise AssertionError("Missing PERPLEXITY_API_KEY (env var). In Colab, set it from Secrets.")

    base = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {key}", "Content-Type": "application/json"}

    system = (
        "You are a research assistant. Return reputable URLs that directly reference "
        "specific soccer trading cards (set, year, variant, grade/serial where possible). "
        "Prefer official marketplaces (eBay item pages, PWCC, Goldin), manufacturer pages, "
        "and credible hobby references. Include recent/active listings where possible."
    )
    user = (
        f"Player: {player}\n"
        "Task: Find specific active or recent listings and authoritative references for this player's cards. "
        "Return direct item or reference URLs (not just homepages). Include a mix of marketplaces and credible sources."
    )
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "temperature": 0.2,
        "top_p": 0.9
    }
    r = requests.post(base, headers=headers, data=json.dumps(payload), timeout=30)
    r.raise_for_status()
    data = r.json()

    # Answer text; if the response has an unexpected shape (or the content is
    # not a string) fall back to a truncated JSON dump so URL extraction
    # still has something to scan.
    try:
        answer = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        answer = None
    if not isinstance(answer, str):
        answer = json.dumps(data)[:2000]

    # Citations may sit at the top level or inside the message.
    try:
        citations = data.get("citations") or data["choices"][0]["message"].get("citations") or []
    except (KeyError, IndexError, TypeError, AttributeError):
        citations = []

    # Only string entries can be merged as URLs; anything else would break
    # de-duplication (unhashable) or downstream URL parsing.
    cited_urls = [c for c in citations if isinstance(c, str)] if isinstance(citations, (list, tuple)) else []
    limit = max(max_urls, 0)
    urls = list(dict.fromkeys(cited_urls + _extract_urls(answer)))[:limit]
    return {"prompt": user, "answer": answer, "citations": citations, "urls": urls}


Writing src/pplx_client.py


In [8]:
%%writefile app.py
# app.py — Streamlit front-end for Deliverable 3
import os
import streamlit as st
import pandas as pd

from src.pplx_client import pplx_search_sources
from src.scorer import rank_listings

st.set_page_config(page_title="Soccer Card Source Credibility", layout="wide")
st.title("Soccer Card Source Credibility (RAG + Scorer)")
st.caption("Enter a player; we’ll fetch recent sources via Perplexity, then score each URL’s credibility.")

# In Colab, you can set this env var at runtime from Secrets (see notebook cell):
# os.environ['PERPLEXITY_API_KEY'] = '...'  # (handled in notebook)

# --- Inputs ---------------------------------------------------------------
player = st.text_input("Player name", value="Bukayo Saka")
col_a, col_b, col_c = st.columns([1,1,2])
with col_a:
    max_urls = st.slider("Max URLs", min_value=5, max_value=25, value=12, step=1)
with col_b:
    dry_run = st.checkbox("Dry run (synthetic pages for scoring)", value=False)
with col_c:
    st.info("Scores are 0–100 with percentiles within this cohort. Click a row for rationale in the URL.")

# --- Search + score ---------------------------------------------------------
if st.button("Search & Score") and player.strip():
    with st.spinner("Searching Perplexity and scoring sources..."):
        # UI boundary: a missing API key or an HTTP failure should surface as
        # a message in the page rather than a raw traceback.
        try:
            discovery = pplx_search_sources(player.strip(), max_urls=max_urls)
        except Exception as exc:
            discovery = None
            st.error(f"Perplexity search failed: {exc}")

        if discovery is not None:
            urls = discovery["urls"]
            if not urls:
                st.warning("No URLs found. Try a different player spelling or a well-known card.")
            else:
                rows = rank_listings(urls, dry_run=dry_run)

                # Build table with top signal rationales (compact)
                table = []
                for r in rows:
                    top3 = sorted(r["signals"], key=lambda s: s["value"]*s["weight"], reverse=True)[:3]
                    rationale = "; ".join([f"{s['name']}: {s['rationale']}" for s in top3])
                    table.append({
                        "Score": r["score"]["absolute"],
                        "Pct": r["score"]["percentile"],
                        # .get: rows for unparseable URLs may carry no host.
                        "Host": r["meta"].get("host", ""),
                        "Status": r["status"],
                        "URL": r["url"],
                        "Top rationales": rationale
                    })
                df = pd.DataFrame(table).sort_values("Score", ascending=False)
                st.dataframe(df, use_container_width=True)
                st.caption(f"Perplexity returned {len(urls)} URLs for {player.strip()}.")


Writing app.py


In [9]:
%%writefile requirements.txt
streamlit
requests
beautifulsoup4
lxml
scikit-learn
pandas
numpy


Writing requirements.txt
