<a href="https://colab.research.google.com/github/k-ferry/cs676-fall-2025/blob/main/deliverable1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!mkdir -p src examples


In [4]:
%%writefile src/credibility_scoring_core.py
"""
credibility_scoring_core.py
Offline, dependency-light scoring core for source credibility.

Design goals:
- Pure functions: input is a dict of features you've extracted elsewhere; output is a dict with subscores + overall.
- No I/O, no networking, no global state. Easy to unit-test and to call from RAG/chatbot later.
- Transparent formulas with detailed comments for Step 2 (code review & documentation).

Expected input keys (examples live in examples/sample_data.py):
    authority_level: str  -> 'peer_reviewed_journal' | 'medical_org_or_governing_body' | 'coach_site_with_bios' | 'general_blog' | 'forum_or_social'
    content_type: str     -> 'systematic_review_meta' | 'rct_or_cohort' | 'narrative_review' | 'expert_opinion' | 'anecdote' | 'coaching_practice_article'
    references_count: int
    citation_density: float   # heuristic: refs per ~250 words, usually 0..1
    word_count: int
    sentence_count: int
    has_author: bool
    has_date: bool
    has_update_stamp: bool
    has_author_bio: bool
    has_contact: bool
    has_schema_org: bool
    age_days: Optional[int]   # how many days old the article is (None if unknown)
    evergreen: bool           # True if topic should be treated as timeless physiology
    affiliate_links: int
    promo_score: float        # 0..1 salesy tone proxy (0 = not salesy, 1 = very salesy)
    sensationalism_score: float  # 0..1 title clickbait proxy
    reception: Optional[float]   # 0..1 placeholder for backlinks/expert votes (default 0.5)
"""

from __future__ import annotations
from typing import Dict, Any, Optional

# --------------------------------------------------------------
# Relative importance of each subscore in the overall score.
# The seven weights nominally add up to 1.0, so a weighted sum of
# [0,1] subscores stays in [0,1] before it is scaled to 0..100.
# These values are provisional: recalibrate against human labels.
# Key order matters for display and for the summation order.
# --------------------------------------------------------------
WEIGHTS: Dict[str, float] = {
    "authority": 0.22,      # who published the material
    "evidence": 0.26,       # genre prior plus reference heuristics
    "transparency": 0.12,   # author/date/contact disclosure
    "timeliness": 0.14,     # recency of the material
    "independence": 0.10,   # absence of affiliate/promotional pressure
    "reception": 0.10,      # external reception placeholder
    "presentation": 0.06,   # readability and clickbait proxy
}

# ---------- helpers ----------

def _normalize(x: float) -> float:
    """Clamp to [0,1] to keep subscores well-behaved."""
    return max(0.0, min(1.0, float(x)))

def _map_authority(level: str) -> float:
    """
    Translate a categorical authority label into a score in [0,1].

    Labels that are not in the table (including the empty string) receive a
    neutral 0.5. The table is deliberately simple so it can be extended or
    replaced by a learned model later.
    """
    scores_by_level = dict(
        peer_reviewed_journal=1.0,
        medical_org_or_governing_body=0.9,
        coach_site_with_bios=0.7,
        general_blog=0.4,
        forum_or_social=0.2,
    )
    try:
        raw_score = scores_by_level[level]
    except KeyError:
        # Unknown label: fall back to the neutral midpoint.
        raw_score = 0.5
    return _normalize(raw_score)

def _map_evidence(content_type: str, references_count: int, citation_density: float, word_count: int) -> float:
    """
    Combine content type (what kind of article) with simple reference heuristics.

    Components:
    - type_base: a prior for evidence strength by genre (0.5 for unknown genres)
    - base_floor: short pages (<250 words) start from a lower reference score
    - boost: more references and higher density suggest stronger sourcing;
      only the first 30 references count and the boost is capped at 0.5

    Args:
        content_type: Genre label; see the table below for recognised values.
        references_count: Number of references found. Negative values are treated as 0.
        citation_density: References per ~250 words. Negative or NaN values are treated as 0.
        word_count: Length of the page in words.

    Returns:
        Evidence subscore in [0,1]. With the current constants the maximum
        reachable value is 0.6 * 1.0 + 0.4 * (0.4 + 0.5) = 0.96.
    """
    type_base = {
        "systematic_review_meta": 1.0,
        "rct_or_cohort": 0.9,
        "narrative_review": 0.7,
        "expert_opinion": 0.5,
        "anecdote": 0.2,
        "coaching_practice_article": 0.6,
    }.get(content_type, 0.5)

    # Malformed negative inputs must not turn the "boost" into a penalty.
    # Clamp at zero, matching how _map_independence sanitises its inputs.
    # max(0.0, nan) also evaluates to 0.0, so a NaN density earns no boost.
    references_count = max(0, int(references_count))
    citation_density = max(0.0, float(citation_density))

    base_floor = 0.2 if word_count < 250 else 0.4
    boost = min(0.5, 0.3 * citation_density + 0.02 * min(references_count, 30))
    # Blend genre prior (60%) with reference heuristics (40%)
    score = 0.6 * type_base + 0.4 * (base_floor + boost)
    return _normalize(score)

def _map_transparency(has_author: bool, has_date: bool, has_update_stamp: bool,
                      has_author_bio: bool, has_contact: bool, has_schema_org: bool) -> float:
    """
    Score how openly a source identifies itself.

    Five core signals (author, date, update stamp, author bio, contact) each
    contribute one fifth. Structured schema.org metadata adds a flat 0.1 on
    top, and the total is clamped to [0,1].
    """
    core_signals = (has_author, has_date, has_update_stamp, has_author_bio, has_contact)
    present = sum(1 for signal in core_signals if signal)
    score = present / 5.0
    if has_schema_org:
        score = score + 0.1
    return _normalize(score)

def _map_timeliness(age_days: Optional[int], evergreen: bool=False) -> float:
    """
    Recency is important for applied training guidance.
    If evergreen=True (e.g., core physiology), use a softer default.
    """
    if evergreen:
        return 0.6
    if age_days is None:
        return 0.4
    if age_days <= 730:     # <= 2 years
        return 1.0
    if age_days <= 1825:    # 2–5 years
        return 0.8
    if age_days <= 3650:    # 5–10 years
        return 0.5
    return 0.3

def _map_independence(affiliate_links: int, promo_score: float) -> float:
    """
    Score editorial independence as 1 minus commercial-pressure penalties.

    Each affiliate link costs 0.15 and the promotional tone costs
    0.7 * promo_score; each penalty is capped at 0.6 on its own. Negative
    inputs are treated as zero and the result is clamped to [0,1].
    """
    link_count = max(0, int(affiliate_links))
    salesiness = max(0.0, float(promo_score))

    affiliate_penalty = min(0.6, 0.15 * link_count)
    promo_penalty = min(0.6, 0.7 * salesiness)
    total_penalty = affiliate_penalty + promo_penalty

    return _normalize(1.0 - total_penalty)

def _map_presentation(word_count: int, sentence_count: int, sensationalism_score: float) -> float:
    """
    Rough readability proxy with a deduction for clickbait.

    An average sentence length of 10..35 words counts as fully readable (1.0);
    anything outside that range scores 0.6. Up to 0.3 is then subtracted in
    proportion to the sensationalism score. Pages with no words get a flat 0.2.
    """
    words = int(word_count)
    sentences = max(1, int(sentence_count))   # guard against division by zero
    clickbait = max(0.0, float(sensationalism_score))

    if words <= 0:
        return 0.2

    words_per_sentence = words / sentences
    if words_per_sentence < 10 or words_per_sentence > 35:
        readability = 0.6
    else:
        readability = 1.0

    return _normalize(readability - 0.3 * clickbait)

# ---------- public API ----------

def _value_or_default(sample: Dict[str, Any], key: str, default: Any) -> Any:
    """Return ``sample[key]``, or ``default`` when the key is missing or its value is None."""
    value = sample.get(key)
    return default if value is None else value

def score_source(sample: Dict[str, Any]) -> Dict[str, Any]:
    """
    Core scorer: takes a dict of pre-extracted variables, returns subscores and an overall 0–100 score.
    No external effects; safe to call in real-time systems.

    Every feature is optional. A key that is missing OR explicitly set to None
    falls back to its default (e.g. ``reception`` -> 0.5, counts -> 0), so
    upstream extractors may emit None for "unknown" without crashing the scorer.
    ``age_days`` is passed through unchanged because None is meaningful there.

    Args:
        sample: Mapping of feature name to value; see the module docstring for
            the expected keys. Unrecognised keys (such as "id") are ignored.

    Returns:
        {
          "subscores": {...},     # each in [0,1], rounded to 3 decimals
          "overall_score": 87.6,  # weighted sum, 0..100
        }
        The overall score is computed from the rounded subscores, so it can be
        reproduced exactly from the returned values and WEIGHTS.

    Raises:
        TypeError / ValueError: if a present, non-None value cannot be
            converted to the numeric type the feature requires.
    """
    authority = _map_authority(_value_or_default(sample, "authority_level", ""))
    evidence = _map_evidence(
        content_type=_value_or_default(sample, "content_type", ""),
        references_count=int(_value_or_default(sample, "references_count", 0)),
        citation_density=float(_value_or_default(sample, "citation_density", 0.0)),
        word_count=int(_value_or_default(sample, "word_count", 0)),
    )
    # bool(None) is already False, so the flags need no None handling.
    transparency = _map_transparency(
        has_author=bool(sample.get("has_author", False)),
        has_date=bool(sample.get("has_date", False)),
        has_update_stamp=bool(sample.get("has_update_stamp", False)),
        has_author_bio=bool(sample.get("has_author_bio", False)),
        has_contact=bool(sample.get("has_contact", False)),
        has_schema_org=bool(sample.get("has_schema_org", False)),
    )
    timeliness = _map_timeliness(
        age_days=sample.get("age_days", None),
        evergreen=bool(sample.get("evergreen", False))
    )
    independence = _map_independence(
        affiliate_links=int(_value_or_default(sample, "affiliate_links", 0)),
        promo_score=float(_value_or_default(sample, "promo_score", 0.0))
    )
    reception = _normalize(float(_value_or_default(sample, "reception", 0.5)))
    presentation = _map_presentation(
        word_count=int(_value_or_default(sample, "word_count", 0)),
        sentence_count=int(_value_or_default(sample, "sentence_count", 1)),
        sensationalism_score=float(_value_or_default(sample, "sensationalism_score", 0.0))
    )

    subscores = {
        "authority": round(authority, 3),
        "evidence": round(evidence, 3),
        "transparency": round(transparency, 3),
        "timeliness": round(timeliness, 3),
        "independence": round(independence, 3),
        "reception": round(reception, 3),
        "presentation": round(presentation, 3),
    }

    overall = 100.0 * sum(subscores[k] * WEIGHTS[k] for k in WEIGHTS)
    return {
        "subscores": subscores,
        "overall_score": round(overall, 1),
    }


Writing src/credibility_scoring_core.py


In [5]:
%%writefile examples/sample_data.py
# Minimal sample dataset for demos and unit tests.
# Feel free to edit or expand these.

SAMPLES = [
    {
        "id": "S1_meta_review_recent",
        "authority_level": "peer_reviewed_journal",
        "content_type": "systematic_review_meta",
        "references_count": 45,
        "citation_density": 0.95,
        "word_count": 4200,
        "sentence_count": 210,
        "has_author": True,
        "has_date": True,
        "has_update_stamp": True,
        "has_author_bio": True,
        "has_contact": True,
        "has_schema_org": True,
        "age_days": 180,
        "evergreen": False,
        "affiliate_links": 0,
        "promo_score": 0.0,
        "sensationalism_score": 0.0,
        "reception": 0.8,
    },
    {
        "id": "S2_org_position_stand",
        "authority_level": "medical_org_or_governing_body",
        "content_type": "narrative_review",
        "references_count": 80,
        "citation_density": 0.85,
        "word_count": 6500,
        "sentence_count": 300,
        "has_author": True,
        "has_date": True,
        "has_update_stamp": True,
        "has_author_bio": False,
        "has_contact": True,
        "has_schema_org": True,
        "age_days": 3*365,
        "evergreen": False,
        "affiliate_links": 0,
        "promo_score": 0.0,
        "sensationalism_score": 0.0,
        "reception": 0.9,
    },
    {
        "id": "S3_coach_blog_cited",
        "authority_level": "coach_site_with_bios",
        "content_type": "coaching_practice_article",
        "references_count": 12,
        "citation_density": 0.55,
        "word_count": 1800,
        "sentence_count": 90,
        "has_author": True,
        "has_date": True,
        "has_update_stamp": False,
        "has_author_bio": True,
        "has_contact": True,
        "has_schema_org": False,
        "age_days": 30,
        "evergreen": False,
        "affiliate_links": 0,
        "promo_score": 0.05,
        "sensationalism_score": 0.0,
        "reception": 0.6,
    },
    {
        "id": "S4_marketing_affiliates",
        "authority_level": "general_blog",
        "content_type": "expert_opinion",
        "references_count": 2,
        "citation_density": 0.10,
        "word_count": 900,
        "sentence_count": 45,
        "has_author": False,
        "has_date": True,
        "has_update_stamp": False,
        "has_author_bio": False,
        "has_contact": False,
        "has_schema_org": False,
        "age_days": 120,
        "evergreen": False,
        "affiliate_links": 4,
        "promo_score": 0.5,
        "sensationalism_score": 0.2,
        "reception": 0.3,
    },
    {
        "id": "S5_forum_old_anecdote",
        "authority_level": "forum_or_social",
        "content_type": "anecdote",
        "references_count": 0,
        "citation_density": 0.0,
        "word_count": 300,
        "sentence_count": 20,
        "has_author": False,
        "has_date": False,
        "has_update_stamp": False,
        "has_author_bio": False,
        "has_contact": False,
        "has_schema_org": False,
        "age_days": 9*365,
        "evergreen": False,
        "affiliate_links": 0,
        "promo_score": 0.0,
        "sensationalism_score": 0.1,
        "reception": 0.2,
    },
    {
        "id": "S6_evergreen_physiology",
        "authority_level": "peer_reviewed_journal",
        "content_type": "narrative_review",
        "references_count": 30,
        "citation_density": 0.6,
        "word_count": 3200,
        "sentence_count": 150,
        "has_author": True,
        "has_date": True,
        "has_update_stamp": False,
        "has_author_bio": True,
        "has_contact": True,
        "has_schema_org": False,
        "age_days": 11*365,
        "evergreen": True,
        "affiliate_links": 0,
        "promo_score": 0.0,
        "sensationalism_score": 0.0,
        "reception": 0.75,
    },
]


Writing examples/sample_data.py


In [6]:
import json
import pandas as pd
from src.credibility_scoring_core import score_source, WEIGHTS
from examples.sample_data import SAMPLES

# Show the weighting scheme so the table below can be interpreted.
print("Weights used:", WEIGHTS)

# Score every sample, then flatten each result into one table row:
# the sample id, the overall score, and one "sub_<name>" column per subscore.
scored = [(sample["id"], score_source(sample)) for sample in SAMPLES]
rows = [
    {
        "id": sample_id,
        "overall_score": result["overall_score"],
        **{f"sub_{name}": value for name, value in result["subscores"].items()},
    }
    for sample_id, result in scored
]

# Highest-scoring sources first; the bare expression renders the table in the notebook.
df = pd.DataFrame(rows)
df = df.sort_values("overall_score", ascending=False)
df


Weights used: {'authority': 0.22, 'evidence': 0.26, 'transparency': 0.12, 'timeliness': 0.14, 'independence': 0.1, 'reception': 0.1, 'presentation': 0.06}


Unnamed: 0,id,overall_score,sub_authority,sub_evidence,sub_transparency,sub_timeliness,sub_independence,sub_reception,sub_presentation
0,S1_meta_review_recent,97.0,1.0,0.96,1.0,1.0,1.0,0.8,1.0
1,S2_org_position_stand,87.1,0.9,0.78,0.9,0.8,1.0,0.9,1.0
5,S6_evergreen_physiology,83.8,1.0,0.78,0.8,0.6,1.0,0.75,1.0
2,S3_coach_blog_cited,78.4,0.7,0.682,0.8,1.0,0.965,0.6,1.0
3,S4_marketing_affiliates,47.0,0.4,0.488,0.2,1.0,0.05,0.3,0.94
4,S5_forum_old_anecdote,36.5,0.2,0.28,0.0,0.5,1.0,0.2,0.97


In [7]:
# Range checks: the overall score must lie in 0..100 and every subscore in 0..1
# (bounds inclusive).
assert df["overall_score"].between(0.0, 100.0).all(), "overall_score out of range"
subcols = [name for name in df.columns if name.startswith("sub_")]
for c in subcols:
    assert df[c].between(0.0, 1.0).all(), f"{c} out of range [0,1]"

# Ranking check. The loose expectation is research > org > coach > marketing > forum,
# but only the two top-ranked ids are pinned here, and they are compared as a
# set, so their relative order is not asserted.
order_expect = ["S1_meta_review_recent", "S2_org_position_stand"]
top2 = set(df["id"].iloc[:2])
assert set(order_expect) == top2, f"Top-2 unexpected: got {top2}"

print("Sanity checks passed.")


Sanity checks passed.
