In [None]:
import functools
import json

import requests
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

@functools.lru_cache(maxsize=1)
def _load_relevance_model():
    """Load the sentence-embedding model once and reuse it on later calls."""
    return SentenceTransformer('sentence-transformers/all-mpnet-base-v2')


def rate_url_validity(user_query: str, url: str) -> dict:
    """
    Evaluate the validity of a URL as a source for a user query.

    The page is downloaded, the text of its ``<p>`` elements is joined and
    truncated to 5000 characters, and five sub-scores are combined into a
    weighted total:

    * domain trust (30%) - currently a fixed placeholder of 60,
    * content relevance (30%) - cosine similarity between the embeddings of
      ``user_query`` and the page text, scaled to 0-100,
    * fact-check score (20%) - from ``check_facts``,
    * bias score (10%) - from ``detect_bias``,
    * citation score (10%) - ``check_citations(url) * 10``, capped at 100.

    Args:
        user_query: The question the page is supposed to answer.
        url: Address of the page to evaluate.

    Returns:
        A dict with the keys "Domain Trust", "Content Relevance",
        "Fact-Check Score", "Bias Score", "Citation Score" and
        "Final Validity Score"; or a dict with the single key "error" when
        the page cannot be fetched or contains no paragraph text.
    """
    # Broad catch is deliberate: this is the boundary where any fetch/parse
    # failure is turned into an error result instead of an exception.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        page_text = " ".join(p.text for p in soup.find_all("p"))[:5000]  # Limit text size
    except Exception as e:
        return {"error": f"Failed to fetch content: {str(e)}"}

    # Without any paragraph text every content-based score below would be
    # computed on an empty string and the result would be meaningless.
    if not page_text.strip():
        return {"error": "Failed to fetch content: no paragraph text found on page"}

    # === Domain Trust (Placeholder for Moz API) ===
    domain_trust = 60  # Placeholder value

    # === Content Relevance (Semantic Similarity) ===
    model = _load_relevance_model()
    cosine = util.pytorch_cos_sim(
        model.encode(user_query), model.encode(page_text)
    ).item()
    # Cosine similarity lies in [-1, 1]; clamp so the score stays in 0-100
    # like the other components of the weighted sum.
    similarity_score = max(0.0, min(cosine, 1.0)) * 100

    # === Fact-Checking via Google Fact Check API ===
    fact_check_score = check_facts(page_text)

    # === Bias Detection (Stance Analysis) ===
    bias_score = detect_bias(page_text)

    # === Citation Check via CrossRef & Semantic Scholar ===
    citation_count = check_citations(url)
    citation_score = min(citation_count * 10, 100)  # Normalize

    # === Compute Final Validity Score ===
    final_score = (
        (0.3 * domain_trust) +
        (0.3 * similarity_score) +
        (0.2 * fact_check_score) +
        (0.1 * bias_score) +
        (0.1 * citation_score)
    )

    return {
        "Domain Trust": domain_trust,
        "Content Relevance": similarity_score,
        "Fact-Check Score": fact_check_score,
        "Bias Score": bias_score,
        "Citation Score": citation_score,
        "Final Validity Score": final_score
    }

# === Fact-Checking via Google Fact Check API ===
def check_facts(text: str) -> int:
    """
    Score a text by looking up its first 200 characters in a fact-check API.

    Args:
        text: Text whose leading 200 characters are sent as the search query.

    Returns:
        90 if any returned claim's first review is rated exactly "true",
        20 if none is "true" but one is rated exactly "false", and 50
        (uncertain) in every other case: empty text, request or JSON failure,
        no claims, or only other ratings.
    """
    default_score = 50  # Default uncertainty score
    query = text[:200]
    if not query.strip():
        return default_score

    # NOTE(review): endpoint kept as in the original code; confirm it is the
    # currently supported Fact Check Tools endpoint and whether it needs a key.
    api_url = "https://toolbox.google.com/factcheck/api/v1/claimsearch"
    try:
        # Passing the query through `params` lets requests URL-encode it;
        # raw page text contains spaces, '&', '#', etc. that would otherwise
        # corrupt the query string.
        response = requests.get(api_url, params={"query": query}, timeout=5)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        return default_score

    claims = data.get("claims") if isinstance(data, dict) else None
    if not isinstance(claims, list):
        return default_score

    ratings = set()
    for claim in claims:
        try:
            ratings.add(claim["claimReview"][0]["textualRating"].lower())
        except (KeyError, IndexError, TypeError, AttributeError):
            # Skip a malformed claim instead of discarding all the others.
            continue

    # "true" takes precedence over "false" when both are present.
    if "true" in ratings:
        return 90
    if "false" in ratings:
        return 20
    return default_score

# === Bias Detection via Stance Analysis ===
@functools.lru_cache(maxsize=1)
def _load_mnli_classifier():
    """Load the bart-large-mnli tokenizer and model once; reuse them afterwards."""
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
    model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
    return tokenizer, model


def detect_bias(text: str) -> int:
    """
    Return a 0-100 neutrality score for the beginning of a text.

    The first 512 characters of ``text`` are tokenized (with truncation) and
    passed through the ``facebook/bart-large-mnli`` classifier; the softmax
    probability at index 1 is scaled to a percentage and truncated to an int.

    Args:
        text: Text to analyse; only its first 512 characters are used.

    Returns:
        ``int(probability * 100)`` for class index 1.
    """
    tokenizer, model = _load_mnli_classifier()
    inputs = tokenizer(text[:512], return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = logits.softmax(dim=-1).tolist()[0]
    # Index 1 is treated as the neutral class, as in the original code.
    # NOTE(review): confirm against the model config's id2label mapping.
    neutral_score = probabilities[1] * 100  # Neutral stance
    return int(neutral_score)

# === Citation Check via CrossRef API ===
def check_citations(url: str) -> int:
    """
    Count the CrossRef works returned for a bibliographic search of ``url``.

    Args:
        url: The URL used as the ``query.bibliographic`` search term.

    Returns:
        The number of entries in ``message.items`` of the response, or 0 when
        the request fails, the body is not JSON, or the payload does not have
        the expected shape.
    """
    # NOTE(review): this is the number of search hits in the returned page,
    # presumably not the number of works that actually cite the URL - confirm
    # that this is the intended metric.
    crossref_api = "https://api.crossref.org/works"
    try:
        # `params` URL-encodes the value; a raw URL embedded in the query
        # string would have its own '?', '&' and '#' misinterpreted.
        response = requests.get(
            crossref_api, params={"query.bibliographic": url}, timeout=5
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        return 0  # Assume no citations found

    # Failure payloads may not carry a dict under "message"; check the shape
    # explicitly instead of relying on a catch-all exception handler.
    message = data.get("message") if isinstance(data, dict) else None
    if not isinstance(message, dict):
        return 0
    items = message.get("items", [])
    return len(items) if isinstance(items, list) else 0


In [None]:
# Example run: score a single article against a sample health question.
user_prompt = "how long after a negative covid test are you contagious"
url_to_check = "https://www.unmc.edu/healthsecurity/transmission/2024/01/23/you-can-still-be-contagious-with-covid-if-you-have-a-negative-test-heres-why/"

# Performs live HTTP requests and loads the Hugging Face models used by the
# scoring functions, so the first run downloads the model files.
result = rate_url_validity(user_prompt, url_to_check)
# Prints either the score breakdown dict or a dict with a single "error" key.
print(result)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

{'Domain Trust': 60, 'Content Relevance': 20.581138134002686, 'Fact-Check Score': 50, 'Bias Score': 93, 'Citation Score': 100, 'Final Validity Score': 53.4743414402008}
