<a href="https://colab.research.google.com/github/hk-vk/CursorGroqProxy/blob/main/yeah_url.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install necessary modules**

In [21]:
!pip install pandas numpy requests beautifulsoup4 tldextract python-whois scikit-learn transformers torch tensorflow nanoid




In [22]:
import pandas as pd
import numpy as np
import requests
import tldextract
import whois
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
from transformers import pipeline
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import nanoid


In [23]:
def create_trusted_sources_db():
    """Build the lookup table of trusted Malayalam news sources.

    Returns:
        dict: Maps a site's domain name to a metadata dict. Every entry has
        ``name`` and ``credibility_score``; the major outlets additionally
        carry ``established`` and the regional outlets carry ``region``.
        A fresh dict is built on every call.
    """
    # Major News Sources
    major_news = {
        'manoramaonline.com': dict(name='Malayala Manorama', credibility_score=0.95, established=1888),
        'mathrubhumi.com': dict(name='Mathrubhumi', credibility_score=0.95, established=1923),
        'madhyamam.com': dict(name='Madhyamam', credibility_score=0.90, established=1987),
        'deshabhimani.com': dict(name='Deshabhimani', credibility_score=0.85, established=1942),
        'keralakaumudi.com': dict(name='Kerala Kaumudi', credibility_score=0.90, established=1911),
    }

    # Regional Sources
    regional_news = {
        'chandrikadaily.com': dict(name='Chandrika', credibility_score=0.80, region='Kozhikode'),
        'janmabhumi.in': dict(name='Janmabhumi', credibility_score=0.80, region='Thiruvananthapuram'),
        'sirajlive.com': dict(name='Siraj Daily', credibility_score=0.75, region='Kozhikode'),
        'metrovaartha.com': dict(name='Metro Vaartha', credibility_score=0.75, region='Kochi'),
    }

    # Online News Portals
    online_portals = {
        'southlive.in': dict(name='South Live', credibility_score=0.70),
        'thejasnews.com': dict(name='Thejas News', credibility_score=0.70),
        '24newslive.com': dict(name='24 News', credibility_score=0.75),
    }

    # Fact Checkers
    fact_checkers = {
        'malayalam.factcrescendo.com': dict(name='Fact Crescendo Malayalam', credibility_score=0.90),
        'malayalam.vishvasnews.com': dict(name='Vishvas News Malayalam', credibility_score=0.85),
    }

    # Merge the groups in their listed order so key order is stable.
    trusted_sources = {}
    for group in (major_news, regional_news, online_portals, fact_checkers):
        trusted_sources.update(group)
    return trusted_sources


In [24]:
class URLFeatureExtractor:
    """Turns a URL into a flat dict of numeric features.

    Attributes:
        trusted_sources: Domain -> metadata mapping obtained from
            ``create_trusted_sources_db()``.
        suspicious_tlds: Public suffixes that set ``has_suspicious_tld``.
    """

    def __init__(self):
        self.trusted_sources = create_trusted_sources_db()
        self.suspicious_tlds = ['xyz', 'top', 'buzz', 'guru', 'club', 'online']

    def extract_features(self, url):
        """Compute lexical, trust and domain-age features for ``url``.

        Args:
            url: The URL string to analyse.

        Returns:
            dict | None: Feature name -> integer value. ``domain_age`` is the
            age in days, or ``-1`` when it cannot be determined. ``None`` is
            returned (after printing the error) if extraction itself fails.
        """
        try:
            parsed_url = urlparse(url)
            # Split the host once and reuse the result for every host feature.
            extracted = tldextract.extract(url)
            domain = extracted.registered_domain

            features = {
                'url_length': len(url),
                'domain_length': len(domain),
                'path_length': len(parsed_url.path),
                'num_dots': url.count('.'),
                'num_hyphens': url.count('-'),
                'num_underscores': url.count('_'),
                'num_slashes': url.count('/'),
                'num_equals': url.count('='),
                'num_digits': sum(c.isdigit() for c in url),
                # URL schemes are case-insensitive, so compare in lower case.
                'has_https': int(url.lower().startswith('https://')),
                'is_trusted_domain': int(self._is_trusted(extracted)),
                'has_suspicious_tld': int(extracted.suffix.lower() in self.suspicious_tlds)
            }

            # Add domain age
            features['domain_age'] = self._domain_age_days(domain)

            return features

        except Exception as e:
            print(f"Error extracting features: {str(e)}")
            return None

    def _is_trusted(self, extracted):
        """Return True if the host or any of its parent domains is trusted.

        Some trusted entries are keyed by a subdomain (for example
        ``malayalam.factcrescendo.com``), so comparing only the registered
        domain would never match them.
        """
        registered = extracted.registered_domain.lower()
        if not registered:
            return False
        labels = extracted.subdomain.lower().split('.') if extracted.subdomain else []
        # Most specific first: a.b.example.com, b.example.com, example.com.
        return any(
            '.'.join(labels[i:] + [registered]) in self.trusted_sources
            for i in range(len(labels) + 1)
        )

    def _domain_age_days(self, domain):
        """Return the domain's age in days from WHOIS, or -1 if unavailable."""
        if not domain:
            return -1
        try:
            creation_date = whois.whois(domain).creation_date
        except Exception:
            # Best effort: any lookup failure means the age is unknown.
            return -1

        # The lookup may yield several dates; use the first real datetime.
        if isinstance(creation_date, (list, tuple)):
            creation_date = next(
                (d for d in creation_date if isinstance(d, datetime)), None
            )
        if not isinstance(creation_date, datetime):
            return -1

        # Match the awareness of the WHOIS value: subtracting a tz-aware
        # datetime from a naive one raises TypeError.
        now = datetime.now(creation_date.tzinfo)
        return (now - creation_date).days


In [25]:
class URLCredibilityAnalyzer:
    """Scores a URL's credibility from the trusted-source table and simple rules.

    Attributes:
        feature_extractor: ``URLFeatureExtractor`` supplying the features and
            the trusted-source table.
        model: A ``RandomForestClassifier`` instance. It is created here but
            never fitted or consulted by any method of this class.
    """

    def __init__(self):
        self.feature_extractor = URLFeatureExtractor()
        self.model = RandomForestClassifier()

    def check_ssl_certificate(self, url):
        """Return True if ``url`` is HTTPS and a verified request succeeds.

        A plain ``http://`` URL involves no certificate, so it is reported as
        False instead of passing vacuously. Any request failure returns False.
        """
        if not isinstance(url, str) or not url.lower().startswith('https://'):
            return False
        try:
            # stream=True: only the TLS handshake and headers matter, so the
            # body is not downloaded; the context manager closes the response.
            with requests.get(url, verify=True, timeout=10, stream=True):
                return True
        except Exception:
            return False

    def analyze_url(self, url):
        """Analyse ``url`` and return a credibility report.

        Args:
            url: The URL string to analyse.

        Returns:
            dict: Always has ``credibility_score`` and ``status``.
            ``status`` is ``'trusted'`` (adds ``source_name``, ``features``),
            ``'analyzed'`` (adds ``features``, ``warning_flags``) or
            ``'error'`` (adds ``message``).
        """
        try:
            features = self.feature_extractor.extract_features(url)

            if features is None:
                return {
                    'credibility_score': 0.0,
                    'status': 'error',
                    'message': 'Failed to extract features'
                }

            # Check trusted sources first
            trusted_key = self._find_trusted_source(url)
            if trusted_key is not None:
                source_info = self.feature_extractor.trusted_sources[trusted_key]
                return {
                    'credibility_score': source_info['credibility_score'],
                    'status': 'trusted',
                    'source_name': source_info['name'],
                    'features': features
                }

            # Calculate credibility score
            score = 0.5  # Base score

            if features['has_https']:
                score += 0.1
            if features['domain_age'] > 365:
                score += 0.1
            if features['has_suspicious_tld']:
                score -= 0.2

            return {
                # Clamp into [0.0, 1.0].
                'credibility_score': max(min(score, 1.0), 0.0),
                'status': 'analyzed',
                'features': features,
                'warning_flags': self._get_warning_flags(features)
            }

        except Exception as e:
            return {
                'credibility_score': 0.0,
                'status': 'error',
                'message': str(e)
            }

    def _find_trusted_source(self, url):
        """Return the trusted-source key matching the URL's host, or None.

        The full host and each parent domain down to the registered domain are
        tried, most specific first, so entries keyed by a subdomain (such as
        ``malayalam.factcrescendo.com``) match as well as bare domains.
        """
        extracted = tldextract.extract(url)
        registered = extracted.registered_domain.lower()
        if not registered:
            return None
        labels = extracted.subdomain.lower().split('.') if extracted.subdomain else []
        trusted_sources = self.feature_extractor.trusted_sources
        for i in range(len(labels) + 1):
            candidate = '.'.join(labels[i:] + [registered])
            if candidate in trusted_sources:
                return candidate
        return None

    def _get_warning_flags(self, features):
        """Return human-readable warnings derived from ``features``."""
        flags = []
        if not features['has_https']:
            flags.append('No HTTPS security')

        domain_age = features['domain_age']
        if domain_age < 0:
            # Negative age is the "unknown" sentinel; do not claim the domain
            # is new when the registration date simply could not be read.
            flags.append('Domain age could not be determined')
        elif domain_age < 180:
            flags.append('Recently registered domain')

        if features['has_suspicious_tld']:
            flags.append('Suspicious top-level domain')
        return flags


In [26]:
class URLExistenceChecker:
    """Checks whether a URL answers HTTP requests.

    Attributes:
        session: ``requests.Session`` used for all requests.
        timeout: Per-request timeout in seconds.
        headers: Headers sent with every request (a browser-like User-Agent).
    """

    def __init__(self):
        self.session = requests.Session()
        self.timeout = 10
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def check_url_exists(self, url):
        """Probe ``url`` and report whether a server responded.

        Args:
            url: URL string; ``https://`` is prepended when no http(s) scheme
                is present.

        Returns:
            dict: On any HTTP response: ``exists`` (True), ``status_code``,
            ``accessible`` (status is 200), ``final_url``, ``is_redirect`` and
            ``content_type``. On a request failure: ``exists`` (False),
            ``error`` and ``reason``.
        """
        try:
            url = url.strip()

            # Add scheme if not present (schemes are case-insensitive)
            if not url.lower().startswith(('http://', 'https://')):
                url = 'https://' + url

            # Try HEAD request first (faster)
            response = self.session.head(
                url,
                timeout=self.timeout,
                allow_redirects=True,
                headers=self.headers
            )

            # If HEAD does not return 200, retry with GET
            if response.status_code != 200:
                response.close()
                response = self.session.get(
                    url,
                    timeout=self.timeout,
                    allow_redirects=True,
                    headers=self.headers,
                    # Only status and headers are inspected, so do not
                    # download the response body.
                    stream=True
                )

            try:
                return {
                    'exists': True,
                    'status_code': response.status_code,
                    'accessible': response.status_code == 200,
                    'final_url': response.url,
                    'is_redirect': len(response.history) > 0,
                    'content_type': response.headers.get('content-type', '')
                }
            finally:
                # Release the underlying connection.
                response.close()

        # Timeout must be handled before ConnectionError: requests'
        # ConnectTimeout derives from both, and would otherwise be reported
        # as a generic connection failure.
        except requests.Timeout:
            return {
                'exists': False,
                'error': 'Timeout',
                'reason': 'Request timed out'
            }
        except requests.ConnectionError:
            return {
                'exists': False,
                'error': 'Connection failed',
                'reason': 'Unable to connect to server'
            }
        except requests.RequestException as e:
            return {
                'exists': False,
                'error': 'Request failed',
                'reason': str(e)
            }


In [33]:
def verify_url(url):
    """Check that ``url`` is reachable, score its credibility, and print a report.

    Args:
        url: URL string. When it has no http(s) scheme, ``https://`` is
            prepended so the reachability check and the credibility analysis
            both look at the same address.

    Returns:
        dict: Always contains ``existence_check``, ``credibility_check``,
        ``final_score``, ``is_accessible`` and ``is_trusted``. When the URL
        cannot be reached, ``credibility_check`` is None, ``final_score`` is
        0.0, and the keys of the existence result (``exists``, ``error``,
        ``reason``) are also present at the top level.
    """
    # Normalise the scheme once; otherwise a scheme-less URL would be fetched
    # over https but analysed as if it had no HTTPS.
    target_url = url if url.lower().startswith(('http://', 'https://')) else 'https://' + url

    # First check if URL exists and is accessible
    existence_checker = URLExistenceChecker()
    existence_result = existence_checker.check_url_exists(target_url)

    print(f"\nURL Verification Results for: {url}")
    print("-" * 50)

    # If URL doesn't exist or isn't accessible, return early
    if not existence_result['exists']:
        print("❌ URL Not Accessible")
        print(f"Error: {existence_result['error']}")
        print(f"Reason: {existence_result['reason']}")
        # Keep the existence keys at the top level for existing callers and
        # add the standard keys so every return value has the same shape.
        return {
            **existence_result,
            'existence_check': existence_result,
            'credibility_check': None,
            'final_score': 0.0,
            'is_accessible': False,
            'is_trusted': False
        }

    # If URL exists, proceed with credibility analysis
    analyzer = URLCredibilityAnalyzer()
    result = analyzer.analyze_url(target_url)

    # Print existence details
    print(f"URL Status: {'✅ Accessible' if existence_result['accessible'] else '❌ Not Accessible'}")
    print(f"Status Code: {existence_result['status_code']}")
    if existence_result['is_redirect']:
        print(f"Redirected to: {existence_result['final_url']}")

    print("\nCredibility Analysis:")
    if result['status'] == 'trusted':
        print(f"✅ Trusted Source: {result['source_name']}")
        print(f"Credibility Score: {result['credibility_score']:.2f}")

    elif result['status'] == 'analyzed':
        print(f"Credibility Score: {result['credibility_score']:.2f}")
        if result['warning_flags']:
            print("\nWarning Flags:")
            for flag in result['warning_flags']:
                print(f"⚠️ {flag}")

    else:
        print(f"❌ Error: {result['message']}")

    return {
        'existence_check': existence_result,
        'credibility_check': result,
        'final_score': result.get('credibility_score', 0.0),
        'is_accessible': existence_result['accessible'],
        'is_trusted': result['status'] == 'trusted'
    }
# Demo inputs: two sites from the trusted table, one host that does not
# resolve, and one host on a suspicious top-level domain.
test_urls = [
    "https://www.manoramaonline.com/news/latest-news.html",
    "https://www.sirajlive.com/plane-crash-in-south-korea-28-dead.html",
    "https://thisnotexit.daily/",
    "https://www.news.xyz",
]

# Run the full check on each sample; verify_url prints its own report and the
# extra print leaves blank lines between consecutive reports.
for url in test_urls:
    verify_url(url)
    print("\n")



URL Verification Results for: https://www.manoramaonline.com/news/latest-news.html
--------------------------------------------------
URL Status: ✅ Accessible
Status Code: 200

Credibility Analysis:
✅ Trusted Source: Malayala Manorama
Credibility Score: 0.95



URL Verification Results for: https://www.sirajlive.com/plane-crash-in-south-korea-28-dead.html
--------------------------------------------------
URL Status: ✅ Accessible
Status Code: 200

Credibility Analysis:
✅ Trusted Source: Siraj Daily
Credibility Score: 0.75



URL Verification Results for: https://thisnotexit.daily/
--------------------------------------------------
❌ URL Not Accessible
Error: Connection failed
Reason: Unable to connect to server



URL Verification Results for: https://www.news.xyz
--------------------------------------------------
❌ URL Not Accessible
Error: Connection failed
Reason: Unable to connect to server


