# Monitor Prototype

## Analysis india.gov.in

### Fetch urls from the domain

In [1]:
from bs4 import BeautifulSoup
import requests
def fetch_urls_from_sitemap(domain):
    sitemap_url = f"https://{domain}/sitemap.xml"
    urls = set()

    resp = requests.get(sitemap_url, timeout=10)
    if resp.status_code != 200:
        return urls

    soup = BeautifulSoup(resp.text, "xml")
    for loc in soup.find_all("loc"):
        urls.add(loc.text.strip())

    return urls

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
all_urls = fetch_urls_from_sitemap('india.gov.in')

In [3]:
len(all_urls)

56

In [16]:
all_urls_txt = "\n".join(sorted(all_urls))
with open("all_urls.txt", "w") as f:
    f.write(all_urls_txt)


### Fetch monitoring metrics

In [17]:
with open("./data/txt/all_urls.txt", "r") as f:
    all_urls = set(line.strip() for line in f if line.strip())
print(len(all_urls), "URLs loaded from all_urls.txt")

56 URLs loaded from all_urls.txt


In [18]:

import requests
import ssl
import socket
import time
from urllib.parse import urlparse
import datetime
from tqdm import tqdm

def assess_availability(url):
    try:
        start_time = time.time()
        resp = requests.get(url, timeout=10)
        response_time = time.time() - start_time
        return {
            'status_code': resp.status_code,
            'response_time': response_time,
            'is_available': resp.status_code == 200
        }
    except requests.RequestException as e:
        return {
            'status_code': None,
            'response_time': None,
            'is_available': False,
            'error': str(e)
        }

def assess_security(url):
    parsed = urlparse(url)
    hostname = parsed.hostname
    port = parsed.port or (443 if parsed.scheme == 'https' else 80)
    
    # Check SSL/TLS
    ssl_info = {}
    if parsed.scheme == 'https':
        try:
            context = ssl.create_default_context()
            with socket.create_connection((hostname, port)) as sock:
                with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                    cert = ssock.getpeercert()
                    ssl_info = {
                        'has_ssl': True,
                        'cert_expires': cert['notAfter'],
                        'issuer': cert['issuer'][0][0][1] if cert['issuer'] else None,
                        'days_until_expiry': (datetime.datetime.strptime(cert['notAfter'], '%b %d %H:%M:%S %Y %Z') - datetime.datetime.now()).days,
                        'current_date': datetime.datetime.now().isoformat()
                    }
        except Exception as e:
            ssl_info = {'has_ssl': False, 'error': str(e)}
    else:
        ssl_info = {'has_ssl': False}
    
    try:
        resp = requests.get(url, timeout=10)
        headers = resp.headers
        security_headers = {
            'content_security_policy': 'Content-Security-Policy' in headers,
            'x_frame_options': 'X-Frame-Options' in headers,
            'strict_transport_security': 'Strict-Transport-Security' in headers
        }
    except:
        security_headers = {}
    
    return {
        'ssl_info': ssl_info,
        'security_headers': security_headers
    }

def assess_performance(url):
    try:
        start_time = time.time()
        resp = requests.get(url, timeout=10)
        total_time = time.time() - start_time
        page_size = len(resp.content)
        return {
            'total_load_time': total_time,
            'page_size_bytes': page_size,
            'status_code': resp.status_code
        }
    except requests.RequestException as e:
        return {'error': str(e)}

def assess_accessibility(url):
    try:
        resp = requests.get(url, timeout=10)
        soup = BeautifulSoup(resp.text, 'html.parser')
        has_alt = any(img.get('alt') for img in soup.find_all('img'))
        has_headings = bool(soup.find_all(['h1', 'h2', 'h3']))
        return {
            'has_alt_text': has_alt,
            'has_headings': has_headings,
        }
    except Exception as e:
        return {'error': str(e)}


def assess_urls(urls):
    results = {}
    for url in tqdm(urls):
        results[url] = {
            'availability': assess_availability(url),
            'security': assess_security(url),
            'performance': assess_performance(url),
            'accessibility': assess_accessibility(url)
        }
    return results


In [None]:
# Run assessments
results = assess_urls(all_urls)

  0%|          | 0/56 [00:00<?, ?it/s]

 34%|███▍      | 19/56 [12:41<24:42, 40.07s/it]

In [None]:

def get_tables(data):
    tables = {}
    for key in tqdm(data[next(iter(data))], desc=f"building table {key}"):  # Get the first subdictionary's keys
        # Create a DataFrame for each key
        tables[key] = pd.DataFrame.from_dict(
            {url: data[url][key] for url in tqdm(data)},
            orient='index'
        )
    return tables




Table for availability:
                                          status_code  response_time
http://india.gov.in/visitor-summary               200            0.5
http://india.gov.in/news/pib-photographs          200            0.4


Table for security:
                                          has_ssl issuer
http://india.gov.in/visitor-summary          True     US
http://india.gov.in/news/pib-photographs     True     US


Table for performance:
                                          total_load_time  page_size_bytes
http://india.gov.in/visitor-summary                   0.3           150000
http://india.gov.in/news/pib-photographs              0.2           120000


Table for accessibility:
                                          has_alt_text  has_headings
http://india.gov.in/visitor-summary               True          True
http://india.gov.in/news/pib-photographs         False          True




In [None]:
from utility import timeit

@timeit
def assess_accessibility(url):