# Search API Test Notebook

This self-contained notebook tests multiple web search providers and reports latency, status, and top results.

Providers included:
- Bing Web Search API (Azure Cognitive Services)
- SerpAPI (engine=bing)
- SearXNG (public or self-hosted)
- DuckDuckGo HTML (no-JS HTML endpoint)
- Brave HTML

Instructions:
- Set API keys/endpoints in the Config cell below (or via environment variables).
- Run the cells top-to-bottom. The notebook installs missing dependencies at runtime.
- Results include a summary table and sample results per provider.

Notes:
- Use responsibly and comply with each provider's Terms of Service.
- HTML providers may change markup; simple parsers here are best-effort.



In [15]:
# Runtime setup: imports and on-the-fly installs
import sys, subprocess, json, os, time, random
from typing import List, Dict, Any, Optional, Tuple

# Lazy installer

def ensure(package: str, import_name: Optional[str] = None):
    try:
        __import__(import_name or package)
    except Exception:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

# Ensure dependencies used by HTML parsers and table rendering
for pkg, imp in [
    ('requests', 'requests'),
    ('beautifulsoup4', 'bs4'),
    ('pandas', 'pandas'),
    ('tabulate', 'tabulate'),
]:
    ensure(pkg, imp)

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tabulate import tabulate

# Utility: timing
class Timer:
    def __enter__(self):
        self.start = time.time()
        return self
    def __exit__(self, exc_type, exc, tb):
        self.end = time.time()
        self.ms = int((self.end - self.start) * 1000)

# Normalize result schema
Result = Dict[str, str]


def truncate(text: str, n: int = 200) -> str:
    if not text:
        return ''
    return text if len(text) <= n else text[:n] + '...'



In [16]:
# Config: API keys and endpoints
CONFIG = {
    # Bing Web Search API (Azure)
    # Docs: https://learn.microsoft.com/azure/cognitive-services/bing-web-search/
    'bing_api_key': os.getenv('BING_API_KEY', ''),
    'bing_endpoint': os.getenv('BING_ENDPOINT', 'https://api.bing.microsoft.com/v7.0/search'),

    # SerpAPI (Bing engine)
    # Docs: https://serpapi.com/
    'serpapi_key': os.getenv('SERPAPI_KEY', ''),

    # SearXNG public/self-hosted endpoint
    # Example remote fallback: https://search.bus-hit.me
    'searxng_base': os.getenv('SEARXNG_BASE', 'http://192.168.219.113:8080'),
    # Optional internal-only base (WSL bridge)
    'searxng_internal_base': os.getenv('SEARXNG_INTERNAL_BASE', 'http://localhost:8080'),

    # HTML providers config
    'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'timeout': 15,
}

# Basic validation helper

def require(value: str, name: str) -> None:
    if not value:
        raise ValueError(f"Missing required config: {name}. Set env var or edit CONFIG.")

In [17]:
# Connectivity diagnostics (optional)
import socket

print('??Connectivity checks ??')
try:
    r = requests.get('https://example.com', timeout=10)
    print('example.com:', r.status_code)
except Exception as e:
    print('example.com failed:', type(e).__name__, str(e)[:200])

try:
    ip = requests.get('https://api.ipify.org?format=json', timeout=10)
    print('Public IP:', ip.json().get('ip'))
except Exception as e:
    print('Public IP check failed:', type(e).__name__, str(e)[:200])

print('HTTP_PROXY:', os.getenv('HTTP_PROXY') or os.getenv('http_proxy'))
print('HTTPS_PROXY:', os.getenv('HTTPS_PROXY') or os.getenv('https_proxy'))
try:
    print('Hostname:', socket.gethostname())
except Exception:
    pass


??Connectivity checks ??
example.com: 200
Public IP: 115.138.25.219
HTTP_PROXY: None
HTTPS_PROXY: None
Hostname: DESKTOP-TBVDNCR


In [18]:
# Provider implementations

headers_common = {
    'User-Agent': CONFIG['user_agent'],
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}


def search_bing_api(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    require(CONFIG['bing_api_key'], 'bing_api_key')
    url = CONFIG['bing_endpoint']
    params = {
        'q': query,
        'count': max_results,
        'mkt': 'en-US',
        'responseFilter': 'Webpages'
    }
    headers = {
        'Ocp-Apim-Subscription-Key': CONFIG['bing_api_key']
    }
    with Timer() as t:
        r = requests.get(url, params=params, headers=headers, timeout=CONFIG['timeout'])
        r.raise_for_status()
        data = r.json()
    items = []
    for w in (data.get('webPages', {}) or {}).get('value', []):
        items.append({
            'title': w.get('name', ''),
            'snippet': truncate(w.get('snippet', ''), 200),
            'url': w.get('url', ''),
            'source': 'Bing API'
        })
    return items[:max_results], {'latency_ms': t.ms, 'status': r.status_code}


def search_serpapi_bing(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    require(CONFIG['serpapi_key'], 'serpapi_key')
    url = 'https://serpapi.com/search.json'
    params = {
        'engine': 'bing',
        'q': query,
        'num': max_results,
        'api_key': CONFIG['serpapi_key']
    }
    with Timer() as t:
        r = requests.get(url, params=params, timeout=CONFIG['timeout'])
        r.raise_for_status()
        data = r.json()
    items = []
    for res in data.get('organic_results', [])[:max_results]:
        items.append({
            'title': res.get('title', ''),
            'snippet': truncate(res.get('snippet', ''), 200),
            'url': res.get('link', ''),
            'source': 'SerpAPI (Bing)'
        })
    return items, {'latency_ms': t.ms, 'status': r.status_code}


def search_searxng(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    base = CONFIG.get('searxng_base', 'http://172.20.221.97:8080').rstrip('/')
    url = f"{base}/search"
    params = {
        'q': query,
        'format': 'json',
        'engines': 'bing,duckduckgo',
        'language': 'en'
    }
    with Timer() as t:
        r = requests.get(url, params=params, headers=headers_common, timeout=CONFIG['timeout'])
        r.raise_for_status()
        data = r.json()
    items = []
    for res in data.get('results', [])[:max_results]:
        items.append({
            'title': res.get('title', ''),
            'snippet': truncate(res.get('content', ''), 200),
            'url': res.get('url', ''),
            'source': 'SearXNG'
        })
    return items, {'latency_ms': t.ms, 'status': r.status_code}


def search_duckduckgo_html(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    url = 'https://html.duckduckgo.com/html/'
    payload = {'q': query}
    session = requests.Session()
    session.trust_env = False
    session.headers.update(headers_common)
    with Timer() as t:
        r = session.post(url, data=payload, timeout=CONFIG['timeout'])
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
    items: List[Result] = []
    for element in soup.find_all('div', class_='result')[:max_results]:
        a = element.find('a', class_='result__a')
        if not a:
            continue
        title = a.get_text(strip=True) or ''
        href = a.get('href', '')
        snippet_elem = element.find('a', class_='result__snippet')
        snippet = snippet_elem.get_text(strip=True) if snippet_elem else ''
        if not title or not href:
            continue
        items.append({
            'title': title,
            'snippet': truncate(snippet, 200) if snippet else 'DuckDuckGo result',
            'url': href,
            'source': 'DuckDuckGo (HTML)'
        })
    return items, {'latency_ms': t.ms, 'status': r.status_code}


def search_brave_html(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    from urllib.parse import quote_plus
    url = f"https://search.brave.com/search?q={quote_plus(query)}"
    session.trust_env = False
    session = requests.Session()
    session.headers.update(headers_common)
    with Timer() as t:
        r = session.get(url, timeout=CONFIG['timeout'])
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
    items: List[Result] = []
    # Brave changes often; attempt multiple selectors
    # Primary: div.snippet with a.result-header
    for element in soup.find_all('div', class_='snippet'):
        a = element.find('a', class_='result-header')
        if not a:
            continue
        title = a.get_text(strip=True) or ''
        href = a.get('href', '')
        desc = ''
        p = element.find('p', class_='snippet-description')
        if p:
            desc = p.get_text(strip=True)
        if title and href and href.startswith('http'):
            items.append({
                'title': title,
                'snippet': truncate(desc, 200) if desc else 'Brave result',
                'url': href,
                'source': 'Brave (HTML)'
            })
        if len(items) >= max_results:
            break
    return items[:max_results], {'latency_ms': t.ms, 'status': r.status_code}


In [19]:
# Benchmark harness

PROVIDERS = [
    ('Bing API', search_bing_api),
    ('SerpAPI (Bing)', search_serpapi_bing),
    ('SearXNG', search_searxng),
    ('DuckDuckGo HTML', search_duckduckgo_html),
    ('Brave HTML', search_brave_html),
]

TEST_QUERIES = [
    'python web scraping tutorial',
    'latest LLM research 2025',
    'how to use pandas merge',
]

MAX_RESULTS = 5


def run_single(provider_name: str, fn, query: str, max_results: int = MAX_RESULTS) -> Dict[str, Any]:
    try:
        results, meta = fn(query, max_results=max_results)
        return {
            'provider': provider_name,
            'query': query,
            'ok': True,
            'count': len(results),
            'latency_ms': meta.get('latency_ms', None),
            'status': meta.get('status', None),
            'sample': results[:2] if results else []
        }
    except Exception as e:
        return {
            'provider': provider_name,
            'query': query,
            'ok': False,
            'error': str(e)
        }


def run_benchmarks(queries: List[str] = TEST_QUERIES) -> pd.DataFrame:
    rows = []
    for q in queries:
        for name, fn in PROVIDERS:
            rows.append(run_single(name, fn, q))
    df = pd.DataFrame(rows)
    return df



In [20]:
# SearXNG: SSL fallback, proxy support, and endpoint rotation
from urllib.parse import urlparse

SEARXNG_ENDPOINTS = [
    CONFIG.get('searxng_base', 'http://172.20.221.97:8080'),
    # Optional internal-only base (WSL bridge)
    CONFIG.get('searxng_internal_base', 'http://localhost:8080'),
    'https://searx.be',
    'https://search.bus-hit.me',
]

PROXY_ENV = {
    'http': os.getenv('HTTP_PROXY') or os.getenv('http_proxy') or None,
    'https': os.getenv('HTTPS_PROXY') or os.getenv('https_proxy') or None,
}

# Replace search_searxng with a hardened version

def search_searxng(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    headers = dict(headers_common)
    session = requests.Session()
    session.trust_env = False
    session.headers.update(headers)

    # Try multiple endpoints, handle SSL errors, and optionally disable verification as last resort
    last_exc = None
    for idx, base in enumerate([e for e in SEARXNG_ENDPOINTS if e]):
        base = base.rstrip('/')
        url = f"{base}/search"
        params = {
            'q': query,
            'format': 'json',
            'engines': 'bing,duckduckgo',
            'language': 'en'
        }

        # Attempt 1: normal GET with proxies if set
        try:
            with Timer() as t:
                r = session.get(url, params=params, timeout=CONFIG['timeout'], proxies=PROXY_ENV if any(PROXY_ENV.values()) else None)
                r.raise_for_status()
                data = r.json()
            items = []
            for res in data.get('results', [])[:max_results]:
                items.append({
                    'title': res.get('title', ''),
                    'snippet': truncate(res.get('content', ''), 200),
                    'url': res.get('url', ''),
                    'source': f"SearXNG ({urlparse(base).netloc})"
                })
            return items, {'latency_ms': t.ms, 'status': r.status_code}
        except requests.exceptions.SSLError as e:
            last_exc = e
            # Attempt 2: allow self-signed/invalid certs (verification off) for lab environments
            try:
                with Timer() as t:
                    r = session.get(url, params=params, timeout=CONFIG['timeout'], verify=False, proxies=PROXY_ENV if any(PROXY_ENV.values()) else None)
                    r.raise_for_status()
                    data = r.json()
                items = []
                for res in data.get('results', [])[:max_results]:
                    items.append({
                        'title': res.get('title', ''),
                        'snippet': truncate(res.get('content', ''), 200),
                        'url': res.get('url', ''),
                        'source': f"SearXNG (insecure) ({urlparse(base).netloc})"
                    })
                return items, {'latency_ms': t.ms, 'status': r.status_code}
            except Exception as e2:
                last_exc = e2
                continue
        except Exception as e:
            last_exc = e
            continue

    raise RuntimeError(f"SearXNG failed across endpoints: {type(last_exc).__name__}: {str(last_exc)[:200]}")


In [21]:
# HTML providers: SSL fallback and proxy support

PROXY_ENV = {
    'http': os.getenv('HTTP_PROXY') or os.getenv('http_proxy') or None,
    'https': os.getenv('HTTPS_PROXY') or os.getenv('https_proxy') or None,
}

ALLOW_INSECURE_SSL = bool(os.getenv('ALLOW_INSECURE_SSL', '0') in ['1', 'true', 'True'])

# Wrap the existing HTML functions with hardened versions

_original_ddg = search_duckduckgo_html
_original_brave = search_brave_html


def search_duckduckgo_html(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    url = 'https://html.duckduckgo.com/html/'
    payload = {'q': query}
    session = requests.Session()
    session.trust_env = False
    session.headers.update(headers_common)
    # Attempt 1: normal POST
    try:
        with Timer() as t:
            r = session.post(url, data=payload, timeout=CONFIG['timeout'], proxies=PROXY_ENV if any(PROXY_ENV.values()) else None)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
        items: List[Result] = []
        for element in soup.find_all('div', class_='result')[:max_results]:
            a = element.find('a', class_='result__a')
            if not a:
                continue
            title = a.get_text(strip=True) or ''
            href = a.get('href', '')
            snippet_elem = element.find('a', class_='result__snippet')
            snippet = snippet_elem.get_text(strip=True) if snippet_elem else ''
            if not title or not href:
                continue
            items.append({
                'title': title,
                'snippet': truncate(snippet, 200) if snippet else 'DuckDuckGo result',
                'url': href,
                'source': 'DuckDuckGo (HTML)'
            })
        return items, {'latency_ms': t.ms, 'status': r.status_code}
    except requests.exceptions.SSLError as e:
        if not ALLOW_INSECURE_SSL:
            raise
        # Attempt 2: insecure SSL for lab envs
        with Timer() as t:
            r = session.post(url, data=payload, timeout=CONFIG['timeout'], proxies=PROXY_ENV if any(PROXY_ENV.values()) else None, verify=False)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
        items: List[Result] = []
        for element in soup.find_all('div', class_='result')[:max_results]:
            a = element.find('a', class_='result__a')
            if not a:
                continue
            title = a.get_text(strip=True) or ''
            href = a.get('href', '')
            snippet_elem = element.find('a', class_='result__snippet')
            snippet = snippet_elem.get_text(strip=True) if snippet_elem else ''
            if not title or not href:
                continue
            items.append({
                'title': title,
                'snippet': truncate(snippet, 200) if snippet else 'DuckDuckGo result',
                'url': href,
                'source': 'DuckDuckGo (HTML, insecure)'
            })
        return items, {'latency_ms': t.ms, 'status': r.status_code}


def search_brave_html(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    from urllib.parse import quote_plus
    url = f"https://search.brave.com/search?q={quote_plus(query)}"
    session.trust_env = False
    session = requests.Session()
    session.headers.update(headers_common)
    # Attempt 1: normal GET
    try:
        with Timer() as t:
            r = session.get(url, timeout=CONFIG['timeout'], proxies=PROXY_ENV if any(PROXY_ENV.values()) else None)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
    except requests.exceptions.SSLError as e:
        if not ALLOW_INSECURE_SSL:
            raise
        with Timer() as t:
            r = session.get(url, timeout=CONFIG['timeout'], proxies=PROXY_ENV if any(PROXY_ENV.values()) else None, verify=False)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
    items: List[Result] = []
    for element in soup.find_all('div', class_='snippet'):
        a = element.find('a', class_='result-header')
        if not a:
            continue
        title = a.get_text(strip=True) or ''
        href = a.get('href', '')
        desc = ''
        p = element.find('p', class_='snippet-description')
        if p:
            desc = p.get_text(strip=True)
        if title and href and href.startswith('http'):
            items.append({
                'title': title,
                'snippet': truncate(desc, 200) if desc else 'Brave result',
                'url': href,
                'source': 'Brave (HTML, insecure)' if 'verify=False' in str(r.request.__dict__) else 'Brave (HTML)'
            })
        if len(items) >= max_results:
            break
    return items[:max_results], {'latency_ms': t.ms, 'status': r.status_code}


In [22]:
# Unified request helper: proxies, CA bundle, retries, insecure fallback
import urllib3
from requests.exceptions import SSLError, ProxyError, ConnectionError, ReadTimeout

# Optional config via env or direct edits here
PROXY_URL = os.getenv('PROXY_URL', '').strip()  # e.g., http://user:pass@host:port
PROXY_USERNAME = os.getenv('PROXY_USERNAME', '').strip()
PROXY_PASSWORD = os.getenv('PROXY_PASSWORD', '').strip()
CA_BUNDLE_PATH = os.getenv('CA_BUNDLE_PATH', '').strip()  # e.g., C:\certs\corp_ca.pem
RETRIES = int(os.getenv('REQUEST_RETRIES', '2'))
BACKOFF = float(os.getenv('REQUEST_BACKOFF', '0.7'))

if ALLOW_INSECURE_SSL:
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def build_proxies() -> Optional[Dict[str, str]]:
    # Prefer explicit PROXY_URL if provided; else environment proxies
    if PROXY_URL:
        return {'http': PROXY_URL, 'https': PROXY_URL}
    if any(PROXY_ENV.values()):
        return PROXY_ENV
    # Support basic auth if host provided separately (rare); construct URL
    if os.getenv('PROXY_HOST') and os.getenv('PROXY_PORT'):
        host = os.getenv('PROXY_HOST').strip()
        port = os.getenv('PROXY_PORT').strip()
        user = PROXY_USERNAME or os.getenv('PROXY_USERNAME', '').strip()
        pwd = PROXY_PASSWORD or os.getenv('PROXY_PASSWORD', '').strip()
        if user and pwd:
            return {
                'http': f'http://{user}:{pwd}@{host}:{port}',
                'https': f'http://{user}:{pwd}@{host}:{port}'
            }
        return {'http': f'http://{host}:{port}', 'https': f'http://{host}:{port}'}
    return {}  # disable system proxy usage when not configured


PROXIES = build_proxies()


def request_with_fallback(method: str, url: str, *, params=None, data=None, headers=None, timeout=None) -> requests.Response:
    last_exc = None
    attempts = []

    # Strategy order: explicit CA bundle -> default verify -> insecure if allowed
    strategies = []
    if CA_BUNDLE_PATH:
        strategies.append(('verify_path', CA_BUNDLE_PATH))
    strategies.append(('verify_true', True))
    if ALLOW_INSECURE_SSL:
        strategies.append(('verify_false', False))

    for attempt in range(RETRIES + 1):
        for label, verify in strategies:
            try:
                resp = requests.request(
                    method,
                    url,
                    params=params,
                    data=data,
                    headers=headers,
                    timeout=timeout or CONFIG['timeout'],
                    proxies=PROXIES,
                    verify=verify
                )
                resp.raise_for_status()
                return resp
            except (SSLError, ProxyError, ConnectionError, ReadTimeout) as e:
                last_exc = e
                attempts.append(f"{label}:{type(e).__name__}")
                time.sleep(BACKOFF * (attempt + 1))
            except Exception as e:
                last_exc = e
                attempts.append(f"{label}:{type(e).__name__}")
                time.sleep(BACKOFF * (attempt + 1))
    raise RuntimeError(f"Request failed after retries: {type(last_exc).__name__}: {str(last_exc)[:200]} | attempts={attempts}")


print('??Network wrapper ready ??')
print('PROXIES:', PROXIES)
print('CA_BUNDLE_PATH:', CA_BUNDLE_PATH or '(default trust store)')
print('ALLOW_INSECURE_SSL:', ALLOW_INSECURE_SSL)
print('RETRIES:', RETRIES, 'BACKOFF:', BACKOFF)


??Network wrapper ready ??
PROXIES: {}
CA_BUNDLE_PATH: (default trust store)
ALLOW_INSECURE_SSL: False
RETRIES: 2 BACKOFF: 0.7


In [23]:
# Rebind providers to use the unified request helper

def search_searxng(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    from urllib.parse import urlparse
    last_exc = None
    for base in [e for e in SEARXNG_ENDPOINTS if e]:
        base = base.rstrip('/')
        url = f"{base}/search"
        params = {
            'q': query,
            'format': 'json',
            'engines': 'bing,duckduckgo',
            'language': 'en'
        }
        try:
            with Timer() as t:
                r = request_with_fallback('GET', url, params=params, headers=headers_common, timeout=CONFIG['timeout'])
                data = r.json()
            items = []
            for res in data.get('results', [])[:max_results]:
                items.append({
                    'title': res.get('title', ''),
                    'snippet': truncate(res.get('content', ''), 200),
                    'url': res.get('url', ''),
                    'source': f"SearXNG ({urlparse(base).netloc})"
                })
            return items, {'latency_ms': t.ms, 'status': r.status_code}
        except Exception as e:
            last_exc = e
            continue
    raise RuntimeError(f"SearXNG failed: {type(last_exc).__name__}: {str(last_exc)[:200]}")


def search_duckduckgo_html(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    url = 'https://html.duckduckgo.com/html/'
    payload = {'q': query}
    with Timer() as t:
        r = request_with_fallback('POST', url, data=payload, headers=headers_common, timeout=CONFIG['timeout'])
        soup = BeautifulSoup(r.text, 'html.parser')
    items: List[Result] = []
    for element in soup.find_all('div', class_='result')[:max_results]:
        a = element.find('a', class_='result__a')
        if not a:
            continue
        title = a.get_text(strip=True) or ''
        href = a.get('href', '')
        snippet_elem = element.find('a', class_='result__snippet')
        snippet = snippet_elem.get_text(strip=True) if snippet_elem else ''
        if not title or not href:
            continue
        items.append({
            'title': title,
            'snippet': truncate(snippet, 200) if snippet else 'DuckDuckGo result',
            'url': href,
            'source': 'DuckDuckGo (HTML)'
        })
    return items, {'latency_ms': t.ms, 'status': r.status_code}


def search_brave_html(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    from urllib.parse import quote_plus
    url = f"https://search.brave.com/search?q={quote_plus(query)}"
    with Timer() as t:
        r = request_with_fallback('GET', url, headers=headers_common, timeout=CONFIG['timeout'])
        soup = BeautifulSoup(r.text, 'html.parser')
    items: List[Result] = []
    for element in soup.find_all('div', class_='snippet'):
        a = element.find('a', class_='result-header')
        if not a:
            continue
        title = a.get_text(strip=True) or ''
        href = a.get('href', '')
        desc = ''
        p = element.find('p', class_='snippet-description')
        if p:
            desc = p.get_text(strip=True)
        if title and href and href.startswith('http'):
            items.append({
                'title': title,
                'snippet': truncate(desc, 200) if desc else 'Brave result',
                'url': href,
                'source': 'Brave (HTML)'
            })
        if len(items) >= max_results:
            break
    return items[:max_results], {'latency_ms': t.ms, 'status': r.status_code}

# Update provider list to use the rebound functions
PROVIDERS = [
    ('Bing API', search_bing_api),
    ('SerpAPI (Bing)', search_serpapi_bing),
    ('SearXNG', search_searxng),
    ('DuckDuckGo HTML', search_duckduckgo_html),
    ('Brave HTML', search_brave_html),
]

print('Providers rebound to robust network wrapper')


Providers rebound to robust network wrapper


In [24]:
# Proxy/SSL diagnostics (company network)
import ssl
import certifi

print('🔍 Proxy/SSL Diagnostics 🔍')
print('HTTP_PROXY:', os.getenv('HTTP_PROXY') or os.getenv('http_proxy'))
print('HTTPS_PROXY:', os.getenv('HTTPS_PROXY') or os.getenv('https_proxy'))
print('PROXY_URL:', os.getenv('PROXY_URL'))
print('PROXY_HOST:', os.getenv('PROXY_HOST'))
print('PROXY_PORT:', os.getenv('PROXY_PORT'))
print('CA_BUNDLE_PATH:', os.getenv('CA_BUNDLE_PATH'))
print('ALLOW_INSECURE_SSL:', os.getenv('ALLOW_INSECURE_SSL'))

# Test DNS and TCP via proxies if configured
TEST_URLS = [
    'https://example.com',
    CONFIG.get('searxng_base', 'http://172.20.221.97:8080').rstrip('/') + '/search'
]

for u in TEST_URLS:
    try:
        with Timer() as t:
            r = request_with_fallback('GET', u, headers={'User-Agent': CONFIG['user_agent']}, timeout=10)
        print(f'OK {u} in {t.ms} ms - status {r.status_code}')
    except Exception as e:
        print(f'FAIL {u}:', type(e).__name__, str(e)[:300])

# Show default cert store in use
try:
    print('certifi.where():', certifi.where())
except Exception:
    pass

try:
    ctx = ssl.create_default_context(cafile=os.getenv('CA_BUNDLE_PATH') or None)
    print('SSL context created. Custom CA used:', bool(os.getenv('CA_BUNDLE_PATH')))
except Exception as e:
    print('SSL context error:', type(e).__name__, str(e)[:200])


🔍 Proxy/SSL Diagnostics 🔍
HTTP_PROXY: None
HTTPS_PROXY: None
PROXY_URL: None
PROXY_HOST: None
PROXY_PORT: None
CA_BUNDLE_PATH: None
ALLOW_INSECURE_SSL: None
OK https://example.com in 761 ms - status 200
FAIL http://192.168.219.113:8080/search: RuntimeError Request failed after retries: ReadTimeout: HTTPConnectionPool(host='192.168.219.113', port=8080): Read timed out. (read timeout=10) | attempts=['verify_true:ReadTimeout', 'verify_true:ReadTimeout', 'verify_true:ReadTimeout']
certifi.where(): C:\Users\Lee\AppData\Roaming\Python\Python313\site-packages\certifi\cacert.pem
SSL context created. Custom CA used: False


## Accessing SearXNG directly via URL

You can open a SearXNG instance in your browser and run a search by navigating to:

- Base URL (homepage):
  - `https://search.bus-hit.me/`
- JSON API endpoint (browser-friendly for quick checks):
  - `https://search.bus-hit.me/search?q=python&format=json`
- HTML results page:
  - `https://search.bus-hit.me/search?q=python`

Replace `search.bus-hit.me` with your preferred or company-hosted SearXNG instance.

Common public instances (availability varies):
- `https://searx.be`
- `https://searx.tiekoetter.com`
- `https://searxng.site`

Tip: If you are behind a corporate proxy with a custom CA, your browser may trust the proxy automatically. If you want Python to trust the same CA, export the CA bundle path and rerun the notebook:
- Windows PowerShell:
  - `$env:CA_BUNDLE_PATH = "C:\\path\\to\\corp_ca.pem"`
- CMD:
  - `set CA_BUNDLE_PATH=C:\path\to\corp_ca.pem`


In [25]:
# SearXNG endpoint health and rotation (handle 403 bans)
import time as _time
from urllib.parse import urlparse as _urlparse
from requests.exceptions import HTTPError as _HTTPError

SEARXNG_HEALTH = globals().get('SEARXNG_HEALTH', {})  # persist across runs in session
SEARXNG_BAN_SECONDS = int(os.getenv('SEARXNG_BAN_SECONDS', '3600'))  # 1 hour default
SEARXNG_FAVOR_HOSTS = {'172.20.221.97:8080': 3, 'localhost:8080': 2}  # weight boost for local instance


def _now() -> float:
    return _time.time()


def _host_of(base: str) -> str:
    try:
        return _urlparse(base).netloc
    except Exception:
        return base


def get_searxng_candidates() -> list:
    # Build weighted list with health and bans
    candidates = []
    now = _now()
    for base in [e for e in SEARXNG_ENDPOINTS if e]:
        host = _host_of(base)
        h = SEARXNG_HEALTH.get(host, {'ok': 0, 'fail': 0, 'banned_until': 0})
        banned = h.get('banned_until', 0) > now
        weight = h.get('ok', 0) - h.get('fail', 0)
        weight += SEARXNG_FAVOR_HOSTS.get(host, 0)
        candidates.append((banned, -weight, base))
    # Not banned first, higher weight first
    candidates.sort()
    # Return ordered bases
    ordered = [base for banned, _w, base in candidates if not banned]
    # Append banned ones at the end (last resort if bans expire inside same run)
    ordered += [base for banned, _w, base in candidates if banned]
    return ordered


def search_searxng(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    ordered = get_searxng_candidates()
    last_exc = None
    for base in ordered:
        base = base.rstrip('/')
        host = _host_of(base)
        url = f"{base}/search"
        params = {
            'q': query,
            'format': 'json',
            'engines': 'bing,duckduckgo',
            'language': 'en'
        }
        try:
            with Timer() as t:
                r = request_with_fallback('GET', url, params=params, headers=headers_common, timeout=CONFIG['timeout'])
                data = r.json()
            items = []
            for res in data.get('results', [])[:max_results]:
                items.append({
                    'title': res.get('title', ''),
                    'snippet': truncate(res.get('content', ''), 200),
                    'url': res.get('url', ''),
                    'source': f"SearXNG ({host})"
                })
            # mark success
            h = SEARXNG_HEALTH.get(host, {'ok': 0, 'fail': 0, 'banned_until': 0})
            h['ok'] = h.get('ok', 0) + 1
            h['last_ok_at'] = _now()
            h['banned_until'] = 0
            SEARXNG_HEALTH[host] = h
            return items, {'latency_ms': t.ms, 'status': r.status_code}
        except Exception as e:
            last_exc = e
            emsg = str(e)
            # Detect 403 and ban temporarily
            if '403' in emsg or 'Forbidden' in emsg:
                h = SEARXNG_HEALTH.get(host, {'ok': 0, 'fail': 0, 'banned_until': 0})
                h['fail'] = h.get('fail', 0) + 1
                h['banned_until'] = _now() + SEARXNG_BAN_SECONDS
                h['last_status'] = 403
                SEARXNG_HEALTH[host] = h
            else:
                h = SEARXNG_HEALTH.get(host, {'ok': 0, 'fail': 0, 'banned_until': 0})
                h['fail'] = h.get('fail', 0) + 1
                h['last_status'] = getattr(getattr(e, 'response', None), 'status_code', None)
                SEARXNG_HEALTH[host] = h
            continue
    raise RuntimeError(f"SearXNG all endpoints failed. Last error: {type(last_exc).__name__}: {str(last_exc)[:200]}. Health={SEARXNG_HEALTH}")

print('SearXNG rotation enabled. Ordered candidates:', get_searxng_candidates())


SearXNG rotation enabled. Ordered candidates: ['http://localhost:8080', 'http://192.168.219.113:8080', 'https://search.bus-hit.me', 'https://searx.be']


In [26]:
# Windows corporate CA support (certifi-win32) and optional SearXNG force
try:
    ensure('certifi-win32', 'certifi_win32')
    import certifi_win32  # noqa: F401
    print('certifi-win32 loaded: Windows certificate store integrated with certifi')
except Exception as e:
    print('certifi-win32 load failed:', type(e).__name__, str(e)[:200])

# Optionally force a specific SearXNG base via env
_force = os.getenv('SEARXNG_FORCE_BASE', '').strip()
if _force:
    SEARXNG_ENDPOINTS = [_force]
    print('SEARXNG_FORCE_BASE set -> using single endpoint:', _force)
else:
    print('SEARXNG_FORCE_BASE not set; using endpoints list as configured')


certifi-win32 load failed: CalledProcessError Command '['c:\\Python313\\python.exe', '-m', 'pip', 'install', 'certifi-win32']' returned non-zero exit status 1.
SEARXNG_FORCE_BASE not set; using endpoints list as configured


In [27]:
# Run and report

# Optional: quickly check which providers are enabled by config
print('Config summary:')
print('- bing_api_key set:', bool(CONFIG['bing_api_key']))
print('- serpapi_key set:', bool(CONFIG['serpapi_key']))
print('- searxng_base:', CONFIG.get('searxng_base', 'http://172.20.221.97:8080'))

# Execute benchmarks
results_df = run_benchmarks(TEST_QUERIES)

# Summary table
summary_cols = ['provider', 'ok', 'count', 'latency_ms', 'status', 'query', 'error']
summary = results_df.reindex(columns=[c for c in summary_cols if c in results_df.columns])
print('\nSummary:')
print(tabulate(summary.fillna(''), headers='keys', tablefmt='github'))

# Show sample results for successful runs
print('\nSamples:')
for _, row in results_df.iterrows():
    if row.get('ok') and row.get('sample'):
        print(f"\n== {row['provider']} | Query: {row['query']} | Count: {row['count']} | {row.get('latency_ms', '')} ms ==")
        for i, item in enumerate(row['sample'], 1):
            print(f"{i}. {item.get('title','')}")
            if item.get('snippet'):
                print(f"   {truncate(item['snippet'], 160)}")
            if item.get('url'):
                print(f"   URL: {item['url']}")


Config summary:
- bing_api_key set: False
- serpapi_key set: False
- searxng_base: http://192.168.219.113:8080

Summary:
|    | provider        | ok    | count   | latency_ms   | status   | query                        | error                                                                                                                                                                                                                                                                                                                            |
|----|-----------------|-------|---------|--------------|----------|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|  0 | Bing API        | False | 