In [8]:
import requests
import re
import urllib.parse
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from google.colab import userdata
import os
from google.oauth2 import service_account
from googleapiclient.discovery import build
from google.auth.transport.requests import Request
import json
from textblob import TextBlob

def extract_title(url: str) -> str:
    """Fetch and extract the title of a webpage."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)

        soup = BeautifulSoup(response.text, "html.parser")
        title = soup.title.string.strip() if soup.title else "No title found"

        return title
    except requests.RequestException as e:
        return f"Error fetching title: {e}"

class URLValidator:
    """
    A production-ready URL validation class that evaluates the credibility of a webpage
    using multiple factors: domain trust, content relevance, fact-checking, bias detection, and citations.
    """

    def __init__(self):
        # Set up authentication for Google API (Service Account)
        # Assuming the JSON file is uploaded to the Colab environment's root directory
        service_account_file = '/content/stalwart-realm-428100-n4-3a783fbfb092.json'
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_file

        # Authenticate and build the service
        self.credentials = service_account.Credentials.from_service_account_file(
            service_account_file,  # Use the variable here as well
            scopes=["https://www.googleapis.com/auth/cloud-platform"]
        )
        self.service = build('factchecktools', 'v1alpha1', credentials=self.credentials)

       # Load models once to avoid redundant API calls
        self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        self.sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

    def fetch_page_content(self, url: str) -> str:
        """ Fetches and extracts text content from the given URL. """
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return " ".join([p.text for p in soup.find_all("p")])  # Extract paragraph text
        except requests.RequestException:
            return ""  # Fail gracefully by returning an empty string

    def extract_doi_from_url(self, url: str) -> str:
        """Extract DOI from a URL if present."""
        match = re.search(r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+", url)
        return match.group(0) if match else None

    def get_domain_trust(self, url: str) -> int:
        """ Retrieves domain trust score from the Moz API using Google Colab Secrets. """
        try:
            MOZ_API_TOKEN = userdata.get("MOZ_SECRET_KEY")

            if not MOZ_API_TOKEN:
                raise ValueError("Moz API token is not set. Please add it in the Secrets tab.")

            api_url = "https://lsapi.seomoz.com/v2/url_metrics"

            headers = {
                "Authorization": f"Bearer {MOZ_API_TOKEN}",
                "Content-Type": "application/json"
            }
            payload = {"targets": [url]}

            response = requests.post(api_url, headers=headers, json=payload, timeout=5)

            response.raise_for_status()

            data = response.json()

            domain_authority = data.get("results", [{}])[0].get("domain_authority", 0)

            return int(domain_authority)

        except Exception as e:
            print(f"Error fetching Moz domain trust: {e}")
            return 50  # Default fallback

    def compute_similarity_score(self, user_query: str, content: str) -> int:
        """ Computes semantic similarity between user query and page content. """
        if not user_query or not content:
            return 0  # Return 0 similarity if no valid input

        # Limit content size to avoid model errors (first 500 words)
        truncated_content = " ".join(content.split()[:500])

        try:
            query_embedding = self.similarity_model.encode(user_query, convert_to_tensor=True)
            content_embedding = self.similarity_model.encode(truncated_content, convert_to_tensor=True)

            similarity = util.pytorch_cos_sim(query_embedding, content_embedding)

            return int(similarity.item() * 100)  # Convert similarity to percentage (0-100)

        except Exception as e:
            print(f"Error in compute_similarity_score: {e}")
            return 0  # Return 0 on failure

    def check_facts(self, content: str) -> int:
        """ Cross-checks extracted content with Google Fact Check API using the API key. """
        if not content:
            return 50

        # Make a request to the Fact Check API using the API key
        query = content[:500]  # Use the first 500 characters of content as the query
        api_key = userdata.get('GOOGLE_FACT_CHECK_API')  # Replace with your actual API key
        url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?query={urllib.parse.quote(query)}&key={api_key}"

        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an exception for HTTP errors
            data = response.json()

            claims = data.get('claims', [])

            # If there are fact-check results, return a higher score
            if claims:
                return 80  # If claims are found, return a score of 80
            else:
                return 40  # No claims found, return a lower score
        except requests.exceptions.RequestException as e:
            print(f"Error with Fact Check API: {e}")
            return 50  # Default uncertainty score in case of error

    def detect_bias(self, content: str) -> int:
        """ Detects potential bias in content, using sentiment analysis and subjectivity checks. """
        if not content:
            return 50  # Neutral score for empty content

        # Sentiment analysis using a stronger model
        sentiment_result = self.sentiment_analyzer(content[:512])[0]
        sentiment_score = sentiment_result["score"]
        sentiment_label = sentiment_result["label"]

        # Subjectivity check (optional)
        blob = TextBlob(content)
        subjectivity_score = blob.sentiment.subjectivity  # 0 (objective) to 1 (subjective)

        # Combine sentiment and subjectivity to score bias
        if sentiment_label == "POSITIVE":
            bias_score = 100 * (1 - subjectivity_score)  # Higher score for positive but objective content
        elif sentiment_label == "NEGATIVE":
            bias_score = 30 * (1 - subjectivity_score)  # Negative sentiment might indicate potential bias
        else:  # Neutral sentiment
            bias_score = 50 * (1 - subjectivity_score)  # Neutral sentiment is generally non-biased

        # Adjust based on subjectivity: more subjective content may imply bias
        if subjectivity_score > 0.5:  # Highly subjective, potentially biased
            bias_score *= 0.8  # Lower score for more subjective content

        # Ensure the score is within a reasonable range
        return max(30, min(100, int(bias_score)))

    def get_citeseerx_citations(self, title: str) -> int:
        """Search CiteSeerX for citation count based on the title."""
        search_url = f"https://citeseerx.ist.psu.edu/search?q={title.replace(' ', '+')}&submit=Search"
        page_content = self.fetch_page_content(search_url)

        if page_content:
            soup = BeautifulSoup(page_content, 'html.parser')

            # Locate citation count elements (this depends on CiteSeerX page structure)
            citation_texts = soup.find_all(string=re.compile(r"Cited by \d+"))

            if citation_texts:
                citation_counts = [int(re.search(r"\d+", text).group()) for text in citation_texts]
                return max(citation_counts, default=0)  # Return highest citation count found

        return 0

    def get_serpapi_citations(self, title: str) -> int:
        """Search Google Scholar using SerpAPI and extract the citation count."""
        SERPAPI_KEY = userdata.get("SERP_API_KEY")  # Ensure API key is available

        if not SERPAPI_KEY:
            print("SerpAPI key not found. Set it in Colab Secrets.")
            return 0

        params = {
            "engine": "google_scholar",
            "q": title,
            "api_key": SERPAPI_KEY
        }

        try:
            response = requests.get("https://serpapi.com/search", params=params, timeout=10)
            response.raise_for_status()  # Raise an error if request fails
            data = response.json()

            #print("SerpAPI Response:", json.dumps(data, indent=2))  # Debugging Step

            total_citations = 0  # Initialize citation count

            # Loop through all organic results
            for result in data.get("organic_results", []):
                cited_by = result.get("inline_links", {}).get("cited_by", {})  # Extract 'cited_by'
                if "total" in cited_by:
                    citations = cited_by["total"]
                    #print(f"Found {citations} citations in result: {result.get('title', 'Unknown')}")
                    total_citations += citations  # Add to total citations

            #print(f"Total Citations Found: {total_citations}")
            return total_citations  # Return the correct total citations

        except requests.exceptions.RequestException as e:
            print(f"Error fetching SerpAPI citations: {e}")

        return 0  # Default to 0 if no citations found

    def check_citations(self, url: str, title: str) -> int:
        """Check citations from multiple sources (CiteSeerX + Google Scholar via SerpAPI)."""
        citeseerx_citations = self.get_citeseerx_citations(title)
        serpapi_citations = self.get_serpapi_citations(title)

        total_citations = citeseerx_citations + serpapi_citations  # Combine results

        # Scale the citation count to a score between 0-100 (assuming 200 citations is a max cap)
        citation_score = min(100, (total_citations / 200) * 100)  # Adjust scale as needed

        #print(f"Total Citations: {total_citations}, Citation Score: {citation_score}")

        return int(citation_score)

    def get_star_rating(self, score: float) -> tuple:
        """ Converts a score (0-100) into a 1-5 star rating. """
        stars = max(1, min(5, round(score / 20)))  # Normalize 100-scale to 5-star scale
        return stars, "⭐" * stars

    def generate_explanation(self, domain_trust, similarity_score, fact_check_score, bias_score, citation_score, final_score) -> str:
        """ Generates a human-readable explanation for the score. """
        reasons = []
        if domain_trust < 50:
            reasons.append("The source has low domain authority.")
        if similarity_score < 50:
            reasons.append("The content is not highly relevant to your query.")
        if fact_check_score < 50:
            reasons.append("Limited fact-checking verification found.")
        if bias_score < 50:
            reasons.append("Potential bias detected in the content.")
        if citation_score < 30:
            reasons.append("Few citations found for this content.")

        return " ".join(reasons)
        self.service = build('factchecktools', 'v1alpha1', credentials=self.credentials)

        return " ".join(reasons) if reasons else "This source is highly credible and relevant."

    def rate_url_validity(self, user_query: str, url: str) -> dict:
        """Main function to evaluate the validity of a webpage."""
        content = self.fetch_page_content(url)

        domain_trust = self.get_domain_trust(url)
        similarity_score = self.compute_similarity_score(user_query, content)
        fact_check_score = self.check_facts(content)
        bias_score = self.detect_bias(content)

        article_title = extract_title(url)  # Get title from webpage
        citation_score = self.check_citations(url, article_title)  # Now uses SerpAPI

        final_score = (
            (0.25 * domain_trust) +
            (0.25 * similarity_score) +
            (0.2 * fact_check_score) +
            (0.1 * bias_score) +
            (0.2 * citation_score)  # Increased weight for citations
        )

        stars, icon = self.get_star_rating(final_score)
        explanation = self.generate_explanation(domain_trust, similarity_score, fact_check_score, bias_score, citation_score, final_score)

        return {
            "raw_score": {
                "Domain Trust": domain_trust,
                "Content Relevance": similarity_score,
                "Fact-Check Score": fact_check_score,
                "Bias Score": bias_score,
                "Citation Score": citation_score,
                "Final Validity Score": final_score
            },
            "stars": {
                "score": stars,
                "icon": icon
            },
            "explanation": explanation
        }

user_prompt = 'What are the symptoms of COVID-19?'
url_to_check = 'https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963'
article_title = extract_title(url_to_check)  # Extract title from the webpage
url_validator = URLValidator()
result = url_validator.rate_url_validity(user_prompt, url_to_check)  # Pass article_title
print(result)

user_prompt = 'What is artificial intelligence?'
url_to_check = 'https://www.ibm.com/think/topics/artificial-intelligence'
article_title = extract_title(url_to_check)  # Extract title from the webpage
url_validator = URLValidator()
result = url_validator.rate_url_validity(user_prompt, url_to_check)  # Pass article_title
print(result)

user_prompt = 'How does climate change affect wildlife?'
url_to_check = 'https://www.nps.gov/articles/000/wildlife-climateimpact.htm'
article_title = extract_title(url_to_check)  # Extract title from the webpage
url_validator = URLValidator()
result = url_validator.rate_url_validity(user_prompt, url_to_check)  # Pass article_title
print(result)

Device set to use cpu


{'raw_score': {'Domain Trust': 92, 'Content Relevance': 60, 'Fact-Check Score': 80, 'Bias Score': 30, 'Citation Score': 1, 'Final Validity Score': 57.2}, 'stars': {'score': 3, 'icon': '⭐⭐⭐'}, 'explanation': 'Potential bias detected in the content. Few citations found for this content.'}


Device set to use cpu


{'raw_score': {'Domain Trust': 93, 'Content Relevance': 56, 'Fact-Check Score': 40, 'Bias Score': 52, 'Citation Score': 1, 'Final Validity Score': 50.650000000000006}, 'stars': {'score': 3, 'icon': '⭐⭐⭐'}, 'explanation': 'Limited fact-checking verification found. Few citations found for this content.'}


Device set to use cpu


{'raw_score': {'Domain Trust': 92, 'Content Relevance': 66, 'Fact-Check Score': 40, 'Bias Score': 30, 'Citation Score': 100, 'Final Validity Score': 70.5}, 'stars': {'score': 4, 'icon': '⭐⭐⭐⭐'}, 'explanation': 'Limited fact-checking verification found. Potential bias detected in the content.'}


In [2]:
user_prompts = [
    "What are the symptoms of COVID-19?",
    "How effective is the flu vaccine this year?",
    "What are the causes of global warming?",
    "How does exercise affect mental health?",
    "What are the benefits of meditation?",
    "How do vaccines work?",
    "What is artificial intelligence?",
    "What are the side effects of the COVID-19 vaccine?",
    "How to reduce stress at work?",
    "What are the signs of depression?",
    "How to prevent heart disease?",
    "What is quantum computing?",
    "What is the best diet for weight loss?",
    "How does climate change affect wildlife?",
    "What are the symptoms of anxiety?",
    "How to improve sleep quality?",
    "What are the symptoms of the flu?",
    "What is the difference between Type 1 and Type 2 diabetes?",
    "How to manage time effectively?",
    "What is the impact of social media on teenagers?"
]

urls_to_check = [
    "https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963",
    "https://www.flu.com/Articles/2024/2024-2025-Flu-Vaccine-Effectiveness",
    "https://www.nrdc.org/stories/global-warming-101",
    "https://www.betterhealth.vic.gov.au/health/healthyliving/exercise-and-mental-health",
    "https://health.ucdavis.edu/blog/cultivating-health/10-health-benefits-of-meditation-and-how-to-focus-on-mindfulness-and-compassion/2022/12",
    "https://www.cdc.gov/vaccines/basics/explaining-how-vaccines-work.html#:~:text=Vaccines%20work%20by%20imitating%20an,vaccines%20at%20the%20recommended%20times.",
    "https://www.ibm.com/think/topics/artificial-intelligence",
    "https://www.cdc.gov/vaccine-safety/vaccines/covid-19.html",
    "https://www.mind.org.uk/information-support/tips-for-everyday-living/how-to-be-mentally-healthy-at-work/work-and-stress/",
    "https://www.samhsa.gov/",
    "https://www.mayoclinic.org/diseases-conditions/heart-disease/in-depth/heart-disease-prevention/art-20046502",
    "https://www.ibm.com/think/topics/quantum-computing",
    "https://www.health.harvard.edu/topics/diet-and-weight-loss",
    "https://www.nps.gov/articles/000/wildlife-climateimpact.htm",
    "https://www.mayoclinic.org/diseases-conditions/anxiety/symptoms-causes/syc-20350961",
    "https://www.mayoclinic.org/healthy-lifestyle/adult-health/in-depth/sleep/art-20048379",
    "https://www.cdc.gov/flu/signs-symptoms/index.html",
    "https://uvahealth.com/services/diabetes-care/types",
    "https://slack.com/blog/productivity/time-management-tips-at-work",
    "https://www.yalemedicine.org/news/social-media-teen-mental-health-a-parents-guide"
]

# Create an instance of the URLValidator
url_validator = URLValidator()

# Loop through each prompt and URL, getting the results for each combination
results = []

for user_prompt, url_to_check in zip(user_prompts, urls_to_check):
    result = url_validator.rate_url_validity(user_prompt, url_to_check)

    stars_score = result["stars"]["score"]
    stars_icon = "⭐" * stars_score  # Create the correct number of stars using the score

    results.append({
        "prompt": user_prompt,
        "url": url_to_check,
        "result": {
            **result,  # Include the original result data
            "stars": {  # Correctly format the star rating here
                "score": stars_score,
                "icon": stars_icon  # Correctly display the stars as characters
            }
        }
    })

# Print out the results in a readable JSON format
import json
print(json.dumps(results, indent=2))

Device set to use cpu


[
  {
    "prompt": "What are the symptoms of COVID-19?",
    "url": "https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963",
    "result": {
      "raw_score": {
        "Domain Trust": 92,
        "Content Relevance": 60,
        "Fact-Check Score": 80,
        "Bias Score": 30,
        "Citation Score": 1,
        "Final Validity Score": 57.2
      },
      "stars": {
        "score": 3,
        "icon": "\u2b50\u2b50\u2b50"
      },
      "explanation": "Potential bias detected in the content. Few citations found for this content."
    }
  },
  {
    "prompt": "How effective is the flu vaccine this year?",
    "url": "https://www.flu.com/Articles/2024/2024-2025-Flu-Vaccine-Effectiveness",
    "result": {
      "raw_score": {
        "Domain Trust": 24,
        "Content Relevance": 73,
        "Fact-Check Score": 40,
        "Bias Score": 30,
        "Citation Score": 1,
        "Final Validity Score": 35.45
      },
      "stars": {
        "score":

In [3]:
pip install gradio transformers sentence-transformers beautifulsoup4 textblob google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client requests



In [6]:
import gradio as gr
#from URLValidator import URLValidator, extract_title  # Ensure this file is in the same directory

# Initialize the URL validator
url_validator = URLValidator()

def validate_url(user_prompt, url_to_check):
    article_title = extract_title(url_to_check)
    result = url_validator.rate_url_validity(user_prompt, url_to_check)
    return result["raw_score"], result["stars"], result["explanation"]

# Define the Gradio interface
interface = gr.Interface(
    fn=validate_url,
    inputs=[gr.Textbox(label="User Prompt"), gr.Textbox(label="URL to Check")],
    outputs=[gr.JSON(label="Raw Scores"), gr.JSON(label="Stars"), gr.Textbox(label="Explanation")],
    title="URL Credibility Checker",
    description="Enter a topic or query and a URL to assess its credibility based on domain trust, content relevance, fact-checking, bias detection, and citations."
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()

Device set to use cpu


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://273cff7aa89e0eef20.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
