In [46]:
import requests
import urllib.parse
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from google.colab import userdata
import os
from google.oauth2 import service_account
from googleapiclient.discovery import build
from google.auth.transport.requests import Request

class URLValidator:
    """
    A production-ready URL validation class that evaluates the credibility of a webpage
    using multiple factors: domain trust, content relevance, fact-checking, bias detection, and citations.
    """

    def __init__(self):
        # Set up authentication for Google API (Service Account)
        # Assuming the JSON file is uploaded to the Colab environment's root directory
        service_account_file = 'GoogleFactCheckAPI_JSON_File'
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_file

        # Authenticate and build the service
        self.credentials = service_account.Credentials.from_service_account_file(
            service_account_file,  # Use the variable here as well
            scopes=["https://www.googleapis.com/auth/cloud-platform"]
        )
        self.service = build('factchecktools', 'v1alpha1', credentials=self.credentials)


       # Load models once to avoid redundant API calls
        self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")

    def fetch_page_content(self, url: str) -> str:
        """ Fetches and extracts text content from the given URL. """
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return " ".join([p.text for p in soup.find_all("p")])  # Extract paragraph text
        except requests.RequestException:
            return ""  # Fail gracefully by returning an empty string

    def get_domain_trust(self, url: str) -> int:
        """ Retrieves domain trust score from the Moz API using Google Colab Secrets. """
        try:
            # Retrieve API token from Colab Secrets
            MOZ_API_TOKEN = userdata.get("MOZ_API_TOKEN")  # Ensure the correct secret name

            if not MOZ_API_TOKEN:
                raise ValueError("Moz API token is not set. Please add it in the Secrets tab.")

            # Correct API URL
            api_url = "https://lsapi.seomoz.com/v2/url_metrics"

            headers = {
                "Authorization": f"Bearer {MOZ_API_TOKEN}",
                "Content-Type": "application/json"
            }
            payload = {"targets": [url]}

            # Send API request
            response = requests.post(api_url, headers=headers, json=payload, timeout=5)
            print("Moz API Response:", response.status_code, response.text)  # Debugging

            response.raise_for_status()
            data = response.json()

            # Extract domain authority score
            domain_authority = data.get("results", [{}])[0].get("domain_authority", 0)
            return int(domain_authority)

        except Exception as e:
          print(f"Error fetching Moz domain trust: {e}")
          return 50  # Default score if API fails

    def compute_similarity_score(self, user_query: str, content: str) -> int:
        """ Computes semantic similarity between user query and page content. """
        if not content:
            return 0
        return int(util.pytorch_cos_sim(self.similarity_model.encode(user_query), self.similarity_model.encode(content)).item() * 100)

    def check_facts(self, content: str) -> int:
        """ Cross-checks extracted content with Google Fact Check API. """
        if not content:
            return 50

        # Make a request to the Fact Check API using the service client
        query = content[:500]  # Use the first 500 characters of content as the query
        try:
            response = self.service.claims().search(query=query).execute()
            claims = response.get('claims', [])

            # If there are fact-check results, return a higher score
            if claims:
                return 80  # If claims are found, return a score of 80
            else:
                return 40  # No claims found, return a lower score
        except Exception as e:
            print(f"Error with Fact Check API: {e}")
            return 50  # Default uncertainty score in case of error

    def detect_bias(self, content: str) -> int:
        """ Uses NLP sentiment analysis to detect potential bias in content. """
        if not content:
            return 50
        sentiment_result = self.sentiment_analyzer(content[:512])[0]
        return 100 if sentiment_result["label"] == "POSITIVE" else 50 if sentiment_result["label"] == "NEUTRAL" else 30

    def check_citations(self, url: str) -> int:
        """ Check the citation count of the article using both CrossRef and Semantic Scholar. """
        # Look for a DOI in the URL (CrossRef requires DOI)
        doi = self.extract_doi_from_url(url)

        # CrossRef API: Look for citation count via DOI
        if doi:
            crossref_citations = self.get_crossref_citations(doi)
            if crossref_citations is not None:
                return crossref_citations

        # If no DOI or CrossRef citation data is found, use Semantic Scholar API
        semantic_scholar_citations = self.get_semantic_scholar_citations(url)
        return semantic_scholar_citations

    def extract_doi_from_url(self, url: str) -> str:
        """ Extract DOI from a URL (if present). """
        # Check if the URL contains 'doi.org', which is common in DOI links
        if "doi.org" in url:
            parts = url.split("doi.org/")
            if len(parts) > 1:
                return parts[1]
        return None

    def get_crossref_citations(self, doi: str) -> int:
        """ Fetch the citation count for a DOI using the CrossRef API. """
        api_url = f"https://api.crossref.org/works/{doi}/citation"
        try:
            response = requests.get(api_url)
            data = response.json()
            citation_count = data['message']['citation_count']
            return citation_count
        except requests.exceptions.RequestException:
            return None  # Return None if CrossRef request fails

    def get_semantic_scholar_citations(self, url: str) -> int:
        """ Fetch the citation count from the Semantic Scholar API. """
        # Use Semantic Scholar API to fetch citation count
        api_url = f"https://api.semanticscholar.org/graph/v1/paper/lookup?url={url}"
        try:
            response = requests.get(api_url)
            data = response.json()
            citation_count = data.get('citationCount', 0)
            return citation_count
        except requests.exceptions.RequestException:
            return 0  # Return 0 if Semantic Scholar request fails

    def get_star_rating(self, score: float) -> tuple:
        """ Converts a score (0-100) into a 1-5 star rating. """
        stars = max(1, min(5, round(score / 20)))  # Normalize 100-scale to 5-star scale
        return stars, "⭐" * stars

    def generate_explanation(self, domain_trust, similarity_score, fact_check_score, bias_score, citation_score, final_score) -> str:
        """ Generates a human-readable explanation for the score. """
        reasons = []
        if domain_trust < 50:
            reasons.append("The source has low domain authority.")
        if similarity_score < 50:
            reasons.append("The content is not highly relevant to your query.")
        if fact_check_score < 50:
            reasons.append("Limited fact-checking verification found.")
        if bias_score < 50:
            reasons.append("Potential bias detected in the content.")
        if citation_score < 30:
            reasons.append("Few citations found for this content.")

        return " ".join(reasons)
        self.service = build('factchecktools', 'v1alpha1', credentials=self.credentials)

       # Load models once to avoid redundant API calls
        self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")

    def fetch_page_content(self, url: str) -> str:
        """ Fetches and extracts text content from the given URL. """
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return " ".join([p.text for p in soup.find_all("p")])  # Extract paragraph text
        except requests.RequestException:
            return ""  # Fail gracefully by returning an empty string

    def get_domain_trust(self, url: str) -> int:
        """ Retrieves domain trust score from the Moz API using Google Colab Secrets. """
        try:
            # Retrieve API token from Colab Secrets
            MOZ_API_TOKEN = userdata.get("MOZ_SECRET_KEY")  # Ensure the correct secret name

            if not MOZ_API_TOKEN:
                raise ValueError("Moz API token is not set. Please add it in the Secrets tab.")

            # Correct API URL
            api_url = "https://lsapi.seomoz.com/v2/url_metrics"

            headers = {
                "Authorization": f"Bearer {MOZ_API_TOKEN}",
                "Content-Type": "application/json"
            }
            payload = {"targets": [url]}

            # Send API request
            response = requests.post(api_url, headers=headers, json=payload, timeout=5)

            response.raise_for_status()
            data = response.json()

            # Extract domain authority score
            domain_authority = data.get("results", [{}])[0].get("domain_authority", 0)
            return int(domain_authority)

        except Exception as e:
          return 50  # Default score if API fails

    def compute_similarity_score(self, user_query: str, content: str) -> int:
        """ Computes semantic similarity between user query and page content. """
        if not content:
            return 0
        return int(util.pytorch_cos_sim(self.similarity_model.encode(user_query), self.similarity_model.encode(content)).item() * 100)

    def check_facts(self, content: str) -> int:
        """ Cross-checks extracted content with Google Fact Check API. """
        if not content:
            return 50

        # Ensure we truncate the content to 500 characters and properly encode it
        query = content[:500]  # Limit to the first 500 characters to ensure it's not too long
        encoded_query = urllib.parse.quote(query)

        # Fact Check API URL
        api_url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?query={encoded_query}"

        try:

            # Use the credentials to get an access token for the request
            auth_request = Request()
            self.credentials.refresh(auth_request)
            headers = {'Authorization': f'Bearer {self.credentials.token}'}

            # Send GET request to the API
            response = requests.get(api_url, headers=headers)

            # Check if the response is successful
            if response.status_code == 200:
                data = response.json()
                return 80 if "claims" in data and data["claims"] else 40
            else:
                return 50  # Default uncertainty score

        except requests.exceptions.RequestException as e:
            return 50  # Default uncertainty score

    def detect_bias(self, content: str) -> int:
        """ Uses NLP sentiment analysis to detect potential bias in content. """
        if not content:
            return 50
        sentiment_result = self.sentiment_analyzer(content[:512])[0]
        return 100 if sentiment_result["label"] == "POSITIVE" else 50 if sentiment_result["label"] == "NEUTRAL" else 30

    def check_citations(self, url: str) -> int:
        """ Check the citation count of the article using both CrossRef and Semantic Scholar. """
        # Look for a DOI in the URL (CrossRef requires DOI)
        doi = self.extract_doi_from_url(url)

        # CrossRef API: Look for citation count via DOI
        if doi:
            crossref_citations = self.get_crossref_citations(doi)
            if crossref_citations is not None:
                return crossref_citations

        # If no DOI or CrossRef citation data is found, use Semantic Scholar API
        semantic_scholar_citations = self.get_semantic_scholar_citations(url)
        return semantic_scholar_citations

    def extract_doi_from_url(self, url: str) -> str:
        """ Extract DOI from a URL (if present). """
        # Check if the URL contains 'doi.org', which is common in DOI links
        if "doi.org" in url:
            parts = url.split("doi.org/")
            if len(parts) > 1:
                return parts[1]
        return None

    def get_crossref_citations(self, doi: str) -> int:
        """ Fetch the citation count for a DOI using the CrossRef API. """
        api_url = f"https://api.crossref.org/works/{doi}/citation"
        try:
            response = requests.get(api_url)
            data = response.json()
            citation_count = data['message']['citation_count']
            return citation_count
        except requests.exceptions.RequestException:
            return None  # Return None if CrossRef request fails

    def get_semantic_scholar_citations(self, url: str) -> int:
        """ Fetch the citation count from the Semantic Scholar API. """
        # Use Semantic Scholar API to fetch citation count
        api_url = f"https://api.semanticscholar.org/graph/v1/paper/lookup?url={url}"
        try:
            response = requests.get(api_url)
            data = response.json()
            citation_count = data.get('citationCount', 0)
            return citation_count
        except requests.exceptions.RequestException:
            return 0  # Return 0 if Semantic Scholar request fails

    def get_star_rating(self, score: float) -> tuple:
        """ Converts a score (0-100) into a 1-5 star rating. """
        stars = max(1, min(5, round(score / 20)))  # Normalize 100-scale to 5-star scale
        return stars, "⭐" * stars

    def generate_explanation(self, domain_trust, similarity_score, fact_check_score, bias_score, citation_score, final_score) -> str:
        """ Generates a human-readable explanation for the score. """
        reasons = []
        if domain_trust < 50:
            reasons.append("The source has low domain authority.")
        if similarity_score < 50:
            reasons.append("The content is not highly relevant to your query.")
        if fact_check_score < 50:
            reasons.append("Limited fact-checking verification found.")
        if bias_score < 50:
            reasons.append("Potential bias detected in the content.")
        if citation_score < 30:
            reasons.append("Few citations found for this content.")

        return " ".join(reasons) if reasons else "This source is highly credible and relevant."

    def rate_url_validity(self, user_query: str, url: str) -> dict:
        """ Main function to evaluate the validity of a webpage. """
        content = self.fetch_page_content(url)

        domain_trust = self.get_domain_trust(url)
        similarity_score = self.compute_similarity_score(user_query, content)
        fact_check_score = self.check_facts(content)
        bias_score = self.detect_bias(content)
        citation_score = self.check_citations(url)

        final_score = (
            (0.3 * domain_trust) +
            (0.3 * similarity_score) +
            (0.2 * fact_check_score) +
            (0.1 * bias_score) +
            (0.1 * citation_score)
        )

        stars, icon = self.get_star_rating(final_score)
        explanation = self.generate_explanation(domain_trust, similarity_score, fact_check_score, bias_score, citation_score, final_score)

        return {
            "raw_score": {
                "Domain Trust": domain_trust,
                "Content Relevance": similarity_score,
                "Fact-Check Score": fact_check_score,
                "Bias Score": bias_score,
                "Citation Score": citation_score,
                "Final Validity Score": final_score
            },
            "stars": {
                "score": stars,
                "icon": icon
            },
            "explanation": explanation
        }

Device set to use cpu


{'raw_score': {'Domain Trust': 95, 'Content Relevance': 28, 'Fact-Check Score': 50, 'Bias Score': 30, 'Citation Score': 0, 'Final Validity Score': 49.9}, 'stars': {'score': 2, 'icon': '⭐⭐'}, 'explanation': 'The content is not highly relevant to your query. Potential bias detected in the content. Few citations found for this content.'}


In [55]:
user_prompts = [
    "What are the symptoms of COVID-19?",
    "How effective is the flu vaccine this year?",
    "What are the causes of global warming?",
    "How does exercise affect mental health?",
    "What are the benefits of meditation?",
    "How do vaccines work?",
    "What is artificial intelligence?",
    "What are the side effects of the COVID-19 vaccine?",
    "How to reduce stress at work?",
    "What are the signs of depression?",
    "How to prevent heart disease?",
    "What is quantum computing?",
    "What is the best diet for weight loss?",
    "How does climate change affect wildlife?",
    "What are the symptoms of anxiety?",
    "How to improve sleep quality?",
    "What are the symptoms of the flu?",
    "What is the difference between Type 1 and Type 2 diabetes?",
    "How to manage time effectively?",
    "What is the impact of social media on teenagers?"
]

urls_to_check = [
    "https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963",
    "https://www.flu.com/Articles/2024/2024-2025-Flu-Vaccine-Effectiveness",
    "https://www.nrdc.org/stories/global-warming-101",
    "https://www.betterhealth.vic.gov.au/health/healthyliving/exercise-and-mental-health",
    "https://health.ucdavis.edu/blog/cultivating-health/10-health-benefits-of-meditation-and-how-to-focus-on-mindfulness-and-compassion/2022/12",
    "https://www.cdc.gov/vaccines/basics/explaining-how-vaccines-work.html#:~:text=Vaccines%20work%20by%20imitating%20an,vaccines%20at%20the%20recommended%20times.",
    "https://www.ibm.com/think/topics/artificial-intelligence",
    "https://www.cdc.gov/vaccine-safety/vaccines/covid-19.html",
    "https://www.mind.org.uk/information-support/tips-for-everyday-living/how-to-be-mentally-healthy-at-work/work-and-stress/",
    "https://www.samhsa.gov/",
    "https://www.mayoclinic.org/diseases-conditions/heart-disease/in-depth/heart-disease-prevention/art-20046502",
    "https://www.ibm.com/think/topics/quantum-computing",
    "https://www.health.harvard.edu/topics/diet-and-weight-loss",
    "https://www.nps.gov/articles/000/wildlife-climateimpact.htm",
    "https://www.mayoclinic.org/diseases-conditions/anxiety/symptoms-causes/syc-20350961",
    "https://www.mayoclinic.org/healthy-lifestyle/adult-health/in-depth/sleep/art-20048379",
    "https://www.cdc.gov/flu/signs-symptoms/index.html",
    "https://uvahealth.com/services/diabetes-care/types",
    "https://slack.com/blog/productivity/time-management-tips-at-work",
    "https://www.yalemedicine.org/news/social-media-teen-mental-health-a-parents-guide"
]

# Create an instance of the URLValidator
url_validator = URLValidator()

# Loop through each prompt and URL, getting the results for each combination
results = []

for user_prompt, url_to_check in zip(user_prompts, urls_to_check):
    result = url_validator.rate_url_validity(user_prompt, url_to_check)

    stars_score = result["stars"]["score"]
    stars_icon = "⭐" * stars_score  # Create the correct number of stars using the score

    results.append({
        "prompt": user_prompt,
        "url": url_to_check,
        "result": {
            **result,  # Include the original result data
            "stars": {  # Correctly format the star rating here
                "score": stars_score,
                "icon": stars_icon  # Correctly display the stars as characters
            }
        }
    })

# Print out the results in a readable JSON format
import json
print(json.dumps(results, indent=2))

Device set to use cpu


[
  {
    "prompt": "What are the symptoms of COVID-19?",
    "url": "https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963",
    "result": {
      "raw_score": {
        "Domain Trust": 50,
        "Content Relevance": 60,
        "Fact-Check Score": 50,
        "Bias Score": 30,
        "Citation Score": 0,
        "Final Validity Score": 46.0
      },
      "stars": {
        "score": 2,
        "icon": "\u2b50\u2b50"
      },
      "explanation": "Potential bias detected in the content. Few citations found for this content."
    }
  },
  {
    "prompt": "How effective is the flu vaccine this year?",
    "url": "https://www.flu.com/Articles/2024/2024-2025-Flu-Vaccine-Effectiveness",
    "result": {
      "raw_score": {
        "Domain Trust": 50,
        "Content Relevance": 73,
        "Fact-Check Score": 50,
        "Bias Score": 30,
        "Citation Score": 0,
        "Final Validity Score": 49.9
      },
      "stars": {
        "score": 2,
   