In [1]:
!pip install requests beautifulsoup4



In [2]:
!pip install requests beautifulsoup4 pandas



In [9]:
# Run this in Google Colab
!pip install requests beautifulsoup4 pandas whois python-whois -q

import requests
from bs4 import BeautifulSoup
import pandas as pd
import whois
import time
from datetime import datetime
import socket
import ssl

def get_comprehensive_stats(url):
    """
    Get comprehensive website statistics
    """
    # Ensure we have a clean domain
    if '://' in url:
        url = url.split('://')[1]
    domain = url.replace('www.', '').split('/')[0]

    all_stats = {}

    # 1. Basic HTTP Analysis
    print(f"Analyzing {domain}...")

    try:
        # Test connection
        start_time = time.time()
        response = requests.get(f"https://{domain}", timeout=15, allow_redirects=True)
        load_time = time.time() - start_time

        all_stats['Domain'] = domain
        all_stats['Final URL'] = response.url
        all_stats['HTTP Status'] = response.status_code
        all_stats['Load Time'] = f"{load_time:.2f}s"
        all_stats['Content Size'] = f"{len(response.content):,} bytes"
        all_stats['Encoding'] = response.encoding

        # Headers analysis
        headers = response.headers
        all_stats['Server'] = headers.get('Server', 'Unknown')
        all_stats['Powered By'] = headers.get('X-Powered-By', 'Unknown')

        # Check if using CDN
        cdn_indicators = ['cloudflare', 'akamai', 'fastly', 'cloudfront']
        server = headers.get('Server', '').lower()
        for cdn in cdn_indicators:
            if cdn in server:
                all_stats['CDN'] = cdn.capitalize()
                break

        # Check security headers
        security_headers = ['Strict-Transport-Security', 'Content-Security-Policy',
                           'X-Frame-Options', 'X-Content-Type-Options']
        for header in security_headers:
            if header in headers:
                all_stats[f"Security: {header}"] = "Present"

    except Exception as e:
        all_stats['Connection Error'] = str(e)

    # 2. WHOIS Information
    print("Getting domain registration info...")
    try:
        w = whois.whois(domain)

        if w.domain_name:
            all_stats['Registered'] = 'Yes'
            if w.creation_date:
                if isinstance(w.creation_date, list):
                    all_stats['Created'] = w.creation_date[0].strftime('%Y-%m-%d')
                else:
                    all_stats['Created'] = w.creation_date.strftime('%Y-%m-%d')

            if w.expiration_date:
                if isinstance(w.expiration_date, list):
                    all_stats['Expires'] = w.expiration_date[0].strftime('%Y-%m-%d')
                else:
                    all_stats['Expires'] = w.expiration_date.strftime('%Y-%m-%d')

            if w.registrar:
                all_stats['Registrar'] = w.registrar
    except:
        all_stats['WHOIS Info'] = 'Not available'

    # 3. SSL Certificate Check
    print("Checking SSL certificate...")
    try:
        context = ssl.create_default_context()
        with socket.create_connection((domain, 443), timeout=10) as sock:
            with context.wrap_socket(sock, server_hostname=domain) as ssock:
                cert = ssock.getpeercert()

                # Get dates
                not_before = datetime.strptime(cert['notBefore'], '%b %d %H:%M:%S %Y %Z')
                not_after = datetime.strptime(cert['notAfter'], '%b %d %H:%M:%S %Y %Z')

                all_stats['SSL Issuer'] = dict(x[0] for x in cert.get('issuer', []))
                all_stats['SSL Valid From'] = not_before.strftime('%Y-%m-%d')
                all_stats['SSL Valid Until'] = not_after.strftime('%Y-%m-%d')
                all_stats['SSL Days Left'] = (not_after - datetime.now()).days
    except Exception as e:
        all_stats['SSL Status'] = f'Error: {str(e)}'

    # 4. Content Analysis
    print("Analyzing page content...")
    try:
        response = requests.get(f"https://{domain}", timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Basic metrics
        all_stats['Title'] = soup.title.string if soup.title else 'No title'
        all_stats['Meta Description'] = 'Found' if soup.find('meta', attrs={'name': 'description'}) else 'Missing'
        all_stats['H1 Tags'] = len(soup.find_all('h1'))
        all_stats['Total Links'] = len(soup.find_all('a'))
        all_stats['Images'] = len(soup.find_all('img'))

        # Check for common frameworks
        html = str(soup).lower()
        if 'wp-content' in html:
            all_stats['CMS'] = 'WordPress'
        elif 'shopify' in html:
            all_stats['CMS'] = 'Shopify'
        elif 'joomla' in html:
            all_stats['CMS'] = 'Joomla'

    except:
        all_stats['Content Analysis'] = 'Failed'

    # 5. Social Media Presence (check for links)
    print("Checking social media links...")
    try:
        soup = BeautifulSoup(requests.get(f"https://{domain}", timeout=10).text, 'html.parser')
        social_platforms = {
            'facebook.com': 'Facebook',
            'twitter.com': 'Twitter',
            'linkedin.com': 'LinkedIn',
            'instagram.com': 'Instagram',
            'youtube.com': 'YouTube'
        }

        social_links = []
        for link in soup.find_all('a', href=True):
            href = link['href'].lower()
            for platform, name in social_platforms.items():
                if platform in href:
                    social_links.append(name)

        if social_links:
            all_stats['Social Media'] = ', '.join(set(social_links))
    except:
        pass

    return all_stats

# Main function for Google Colab
def analyze_website():
    print("WEBSITE STATISTICS ANALYZER")
    print("=" * 50)

    # Get user input
    website = input("Enter website URL (e.g., google.com): ").strip()

    if not website:
        website = "github.com"  # Default example

    # Get statistics
    stats = get_comprehensive_stats(website)

    # Display results
    print("\n" + "="*60)
    print("ANALYSIS RESULTS")
    print("="*60)

    for key, value in stats.items():
        print(f"{key:25} : {value}")

    # Create DataFrame for display
    df = pd.DataFrame(list(stats.items()), columns=['Metric', 'Value'])

    # Save to CSV
    df.to_csv(f'website_analysis_{website.replace("/", "_")}.csv', index=False)
    print(f"\n Results saved to CSV file")

    return df

# Run the analysis
df = analyze_website()

# Display as table
from IPython.display import display
display(df)

WEBSITE STATISTICS ANALYZER
Enter website URL (e.g., google.com): https://sds.ubd.edu.bn/staff/
Analyzing sds.ubd.edu.bn...
Getting domain registration info...
Checking SSL certificate...
Analyzing page content...
Checking social media links...

ANALYSIS RESULTS
Connection Error          : HTTPSConnectionPool(host='sds.ubd.edu.bn', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1010)')))
Registered                : Yes
Created                   : 1998-09-01
Expires                   : 2026-09-01
Registrar                 : IMAGINE SDN BHD
SSL Status                : Error: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1010)
Content Analysis          : Failed

 Results saved to CSV file


Unnamed: 0,Metric,Value
0,Connection Error,"HTTPSConnectionPool(host='sds.ubd.edu.bn', por..."
1,Registered,Yes
2,Created,1998-09-01
3,Expires,2026-09-01
4,Registrar,IMAGINE SDN BHD
5,SSL Status,Error: [SSL: CERTIFICATE_VERIFY_FAILED] certif...
6,Content Analysis,Failed
