<a href="https://colab.research.google.com/github/janettgarciiaa/Project_1/blob/main/Updated_Deliverable_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ----------------------------------------------------------
# Project: Credibility Score for Articles / Sources / References
# Deliverable 1 – Janet Garcia (Original Version)
# ----------------------------------------------------------
# Hybrid model: Surface Reputation + Content Quality + Linguistic Tone
# ----------------------------------------------------------

import re
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
from urllib.parse import urlparse
import json

# ---------------- Layer 1: Surface Check ----------------
# Domain names (plus the bare TLD suffixes ".gov" and ".edu") that earn the
# positive adjustment in the surface check.
HIGH_TRUST_DOMAINS = [
    "nih.gov",
    "who.int",
    "reuters.com",
    "bbc.com",
    "nytimes.com",
    "theguardian.com",
    "nature.com",
    "sciencenews.org",
    "harvard.edu",
    ".gov",
    ".edu",
]

# Domain names that receive the negative adjustment in the surface check.
LOW_TRUST_DOMAINS = [
    "buzzfeed.com",
    "infowars.com",
    "thegatewaypundit.com",
    "naturalnews.com",
    "worldnewsdailyreport.com",
    "medium.com",
    "substack.com",
]

def _host_matches(host, patterns):
    """Return True when *host* equals, or is a subdomain of, an entry in *patterns*.

    Entries starting with "." (e.g. ".gov") are treated as host suffixes.
    Every other entry must equal the host or be a dot-separated parent of it,
    so "nih.gov" matches "www.nih.gov" but not "fakenih.gov.example.com".
    """
    for pattern in patterns:
        if pattern.startswith("."):
            if host.endswith(pattern):
                return True
        elif host == pattern or host.endswith("." + pattern):
            return True
    return False


def surface_layer(url: str):
    """Score a URL on domain reputation and reachability.

    The score starts at 50. A host matching HIGH_TRUST_DOMAINS adds 25, a host
    matching LOW_TRUST_DOMAINS subtracts 25, and any other host leaves it
    unchanged. An HTTP GET (5 second timeout) then adds 5 for a 200 response
    and subtracts 10 for any other status or a request failure.

    Args:
        url: Absolute URL of the source to assess.

    Returns:
        A ``(score, notes)`` tuple: ``score`` is an int clamped to 0..100 and
        ``notes`` is a list of strings describing each adjustment. A URL that
        cannot be parsed, or that has no host component, yields
        ``(0, ["Invalid URL structure."])``.
    """
    try:
        # Use .hostname rather than .netloc: it drops any port and userinfo,
        # which would otherwise let "http://nih.gov@evil.com" look trusted.
        # A trailing root dot ("nih.gov.") is stripped before matching.
        domain = (urlparse(url).hostname or "").rstrip(".")
    except (ValueError, TypeError, AttributeError):
        return 0, ["Invalid URL structure."]

    # urlparse accepts almost any string; a missing host (e.g. no scheme)
    # means there is nothing to rate or fetch.
    if not domain:
        return 0, ["Invalid URL structure."]

    score, notes = 50, []

    # Anchored suffix matching: plain substring tests would also accept hosts
    # such as "my.government.com" (contains ".gov").
    if _host_matches(domain, HIGH_TRUST_DOMAINS):
        score += 25
        notes.append(f"[+25] Trusted or verified domain ({domain}).")
    elif _host_matches(domain, LOW_TRUST_DOMAINS):
        score -= 25
        notes.append(f"[-25] Domain known for opinion or misinformation ({domain}).")
    else:
        notes.append(f"[±0] Unknown domain: {domain} not in database.")

    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            score += 5
            notes.append("[+5] Website accessible and responsive.")
        else:
            score -= 10
            notes.append("[-10] Source unreachable or restricted.")
    except requests.RequestException:
        score -= 10
        notes.append("[-10] Could not fetch content.")

    return max(0, min(100, score)), notes


# ---------------- Layer 2: Content Quality & Authorship ----------------
def content_layer(text: str):
    """Score article text on authorship, citations, and length.

    The score starts at 50 and is adjusted as follows:
      * +15 if a byline ("by"/"By" followed by a capitalised word) appears in
        the first 400 characters, otherwise -5.
      * +10 if the text contains the whole word "references", "sources",
        "citations" or "bibliography" (case-insensitive).
      * -10 if there are fewer than 3 non-empty lines, +5 if there are more
        than 10.

    Args:
        text: Article text with paragraphs separated by newlines.

    Returns:
        A ``(score, notes)`` tuple: ``score`` is an int clamped to 0..100 and
        ``notes`` is a list of strings describing each adjustment.
    """
    score, notes = 50, []

    # Bylines are usually capitalised ("By Jane Doe"), so accept either case
    # for the leading letter while still requiring a capitalised name.
    if re.search(r"\b[Bb]y\s+[A-Z][a-z]+", text[:400]):
        score += 15
        notes.append("[+15] Author line detected.")
    else:
        score -= 5
        notes.append("[-5] Missing author name at start of article.")

    # Word boundaries stop "resources" or "outsources" counting as "sources".
    if re.search(r"\b(references|sources|citations|bibliography)\b", text, re.IGNORECASE):
        score += 10
        notes.append("[+10] Mentions references or citations.")
    else:
        notes.append("[±0] No clear citation section found.")

    # Count non-empty lines rather than newline characters: N paragraphs are
    # joined by N-1 newlines, and blank lines should not count as content.
    paragraph_count = sum(1 for line in text.split("\n") if line.strip())
    if paragraph_count < 3:
        score -= 10
        notes.append("[-10] Very short content — may lack depth.")
    elif paragraph_count > 10:
        score += 5
        notes.append("[+5] Substantive content length detected.")

    return max(0, min(100, score)), notes


# ---------------- Layer 3: Linguistic Tone (Mini-ML) ----------------
def linguistic_layer(text: str):
    """Score writing tone from TextBlob's subjectivity and polarity values.

    Whitespace-only or empty text short-circuits to a neutral 50. Otherwise
    the score starts at 50 and receives one subjectivity adjustment
    (+20 below 0.3, -15 above 0.6, else 0) and one polarity adjustment
    (+10 when the absolute value is below 0.3, else -10).

    Returns:
        A ``(score, notes)`` tuple: ``score`` is clamped to 0..100 and
        ``notes`` is a list of strings describing each adjustment.
    """
    if text.strip() == "":
        return 50, ["Empty text content."]

    sentiment = TextBlob(text).sentiment
    subjectivity = sentiment.subjectivity
    polarity = sentiment.polarity

    # Subjectivity adjustment: lower values are rewarded.
    if subjectivity < 0.3:
        subj_delta = 20
        subj_note = f"[+20] Objective tone ({subjectivity:.2f})."
    elif subjectivity > 0.6:
        subj_delta = -15
        subj_note = f"[-15] Highly subjective writing ({subjectivity:.2f})."
    else:
        subj_delta = 0
        subj_note = f"[±0] Moderate subjectivity ({subjectivity:.2f})."

    # Polarity adjustment: values near zero are rewarded.
    if abs(polarity) < 0.3:
        pol_delta = 10
        pol_note = f"[+10] Balanced sentiment ({polarity:.2f})."
    else:
        pol_delta = -10
        pol_note = f"[-10] Strong emotional tone ({polarity:.2f})."

    total = 50 + subj_delta + pol_delta
    clamped = max(0, min(100, total))
    return clamped, [subj_note, pol_note]


# ---------------- Text Extractor ----------------
def extract_text(url: str):
    """Fetch *url* and return the text of its ``<p>`` elements.

    Paragraph texts are joined with newlines and the result is truncated to
    the first 4000 characters.

    Args:
        url: Absolute URL of the article to download (5 second timeout).

    Returns:
        The extracted text, or ``""`` when the request fails, the server
        answers with an HTTP error status, or the page cannot be parsed.
    """
    try:
        response = requests.get(url, timeout=5)
        # An error page (4xx/5xx) is not the article; without this check its
        # boilerplate paragraphs would be scored as if they were the content.
        response.raise_for_status()
    except (requests.RequestException, ValueError):
        # ValueError covers URL-encoding failures (e.g. UnicodeError) that can
        # surface before the request is sent.
        return ""

    try:
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = [p.get_text() for p in soup.find_all("p")]
    except Exception:
        # Deliberate best-effort: an unparseable page means "no text".
        return ""

    return "\n".join(paragraphs)[:4000]


# ---------------- Main Hybrid Function ----------------
def evaluate_source(url: str):
    """Combine the surface, content, and linguistic layers into one result.

    The article text is extracted first, then each layer is scored. The three
    scores are blended with weights 0.4/0.3/0.3, or 0.7/0.2/0.1 when no text
    could be extracted, so the domain check dominates in that case.

    Returns:
        A dict with keys ``"url"``, ``"score"`` (weighted score divided by 100
        and rounded to two decimals) and ``"explanation"`` (all layer notes
        joined with ``" | "``).
    """
    article_text = extract_text(url)

    surface_score, surface_notes = surface_layer(url)
    content_score, content_notes = content_layer(article_text)
    tone_score, tone_notes = linguistic_layer(article_text)

    # Without any article text, lean mostly on the domain-level signal.
    if article_text.strip():
        w_surface, w_content, w_tone = 0.4, 0.3, 0.3
    else:
        w_surface, w_content, w_tone = 0.7, 0.2, 0.1

    weighted_total = (
        surface_score * w_surface
        + content_score * w_content
        + tone_score * w_tone
    )
    all_notes = surface_notes + content_notes + tone_notes

    report = {"url": url}
    report["score"] = round(weighted_total / 100, 2)
    report["explanation"] = " | ".join(all_notes)
    return report


# ---------------- Example Run ----------------
if __name__ == "__main__":
    # Demo run: score three sample sources and print each result as
    # pretty-printed JSON.
    urls = [
        "https://www.nih.gov/news-events/news-releases",
        "https://www.theguardian.com/world",
        "https://www.naturalnews.com/supplements.html",
    ]

    print("\n---- Janet’s Credibility Engine ----\n")

    for link in urls:
        result = evaluate_source(link)
        rendered = json.dumps(result, indent=4)
        print(rendered)



---- Janet’s Credibility Engine ----

{
    "url": "https://www.nih.gov/news-events/news-releases",
    "score": 0.68,
    "explanation": "[+25] Trusted or verified domain (www.nih.gov). | [+5] Website accessible and responsive. | [-5] Missing author name at start of article. | [+10] Mentions references or citations. | [+5] Substantive content length detected. | [\u00b10] Moderate subjectivity (0.47). | [+10] Balanced sentiment (0.18)."
}
{
    "url": "https://www.theguardian.com/world",
    "score": 0.68,
    "explanation": "[+25] Trusted or verified domain (www.theguardian.com). | [+5] Website accessible and responsive. | [-5] Missing author name at start of article. | [\u00b10] No clear citation section found. | [-10] Very short content \u2014 may lack depth. | Empty text content."
}
{
    "url": "https://www.naturalnews.com/supplements.html",
    "score": 0.45,
    "explanation": "[-25] Domain known for opinion or misinformation (www.naturalnews.com). | [+5] Website accessible and