# Search API Test Notebook

This self-contained notebook tests multiple web search providers and reports latency, status, and top results.

Providers included:
- Bing Web Search API (Azure Cognitive Services)
- SerpAPI (engine=bing)
- SearXNG (public or self-hosted)
- DuckDuckGo HTML (no-JS HTML endpoint)
- Brave HTML

Instructions:
- Set API keys/endpoints in the Config cell below (or via environment variables).
- Run the cells top-to-bottom. The notebook installs missing dependencies at runtime.
- Results include a summary table and sample results per provider.

Notes:
- Use responsibly and comply with each provider's Terms of Service.
- HTML providers may change their markup at any time; the simple parsers here are best-effort and may return no results when that happens.



In [None]:
# Runtime setup: imports and on-the-fly installs
import sys, subprocess, json, os, time, random
from typing import List, Dict, Any, Optional, Tuple

# Lazy installer

def ensure(package: str, import_name: Optional[str] = None):
    """Import a module, installing its distribution with pip if it is missing.

    Args:
        package: Name passed to ``pip install`` when the import fails.
        import_name: Module name to import; ``package`` is used when omitted.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    import importlib

    try:
        __import__(import_name or package)
    except ImportError:
        # Only a missing module should trigger an install; any other error
        # raised while importing is a real failure and is left to propagate.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        # Let the running interpreter notice the freshly installed files
        # before the caller's subsequent `import` statement.
        importlib.invalidate_caches()

# Third-party packages this notebook needs, keyed by pip distribution name
# with the importable module name as the value.
_RUNTIME_DEPS = {
    'requests': 'requests',
    'beautifulsoup4': 'bs4',
    'pandas': 'pandas',
    'tabulate': 'tabulate',
}
for pkg, imp in _RUNTIME_DEPS.items():
    ensure(pkg, imp)

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tabulate import tabulate

# Utility: timing
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Attributes (assigned as the block is entered and exited):
        start: Wall-clock timestamp (``time.time()``) taken on entry.
        end: Wall-clock timestamp taken on exit.
        ms: Elapsed time in whole milliseconds, available after exit.
    """

    def __enter__(self):
        self.start = time.time()
        # The duration comes from a monotonic clock so that system clock
        # adjustments cannot produce negative or inflated latencies.
        self._t0 = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc, tb):
        elapsed = time.perf_counter() - self._t0
        self.end = time.time()
        self.ms = int(elapsed * 1000)
        # Implicitly returns None, so an exception raised in the body propagates.

# Normalized result record: every provider function in this notebook emits
# dicts with the string keys 'title', 'snippet', 'url' and 'source'.
Result = Dict[str, str]


def truncate(text: str, n: int = 200) -> str:
    """Cut ``text`` to its first ``n`` characters, appending '...' when it was longer.

    Falsy input (``None`` or an empty string) yields an empty string; text of
    length ``n`` or less is returned unchanged.
    """
    if text and len(text) > n:
        return text[:n] + '...'
    return text or ''



In [None]:
# Config: API keys and endpoints
CONFIG = dict(
    # Bing Web Search API (Azure): subscription key and endpoint URL
    # Docs: https://learn.microsoft.com/azure/cognitive-services/bing-web-search/
    bing_api_key=os.getenv('BING_API_KEY', ''),
    bing_endpoint=os.getenv('BING_ENDPOINT', 'https://api.bing.microsoft.com/v7.0/search'),

    # SerpAPI key (used with the Bing engine)
    # Docs: https://serpapi.com/
    serpapi_key=os.getenv('SERPAPI_KEY', ''),

    # Base URL of a public or self-hosted SearXNG instance
    # Example public: https://search.bus-hit.me
    searxng_base=os.getenv('SEARXNG_BASE', 'https://search.bus-hit.me'),

    # Settings shared by the HTML providers: browser-like User-Agent and the
    # value passed as `timeout=` to every requests call
    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    timeout=15,
)

# Basic validation helper

def require(value: str, name: str) -> None:
    """Raise ``ValueError`` mentioning ``name`` when ``value`` is empty or otherwise falsy."""
    if value:
        return
    raise ValueError(f"Missing required config: {name}. Set env var or edit CONFIG.")



In [None]:
# Connectivity diagnostics (optional)
import socket

print('— Connectivity checks —')
# General outbound HTTPS reachability
try:
    r = requests.get('https://example.com', timeout=10)
    print('example.com:', r.status_code)
except Exception as e:
    print('example.com failed:', type(e).__name__, str(e)[:200])

# Public IP address as reported by api.ipify.org
try:
    ip = requests.get('https://api.ipify.org?format=json', timeout=10)
    print('Public IP:', ip.json().get('ip'))
except Exception as e:
    print('Public IP check failed:', type(e).__name__, str(e)[:200])

# NOTE(review): the proxy variables are printed verbatim; if a proxy URL embeds
# credentials (user:pass@host) they will appear in the notebook output.
print('HTTP_PROXY:', os.getenv('HTTP_PROXY') or os.getenv('http_proxy'))
print('HTTPS_PROXY:', os.getenv('HTTPS_PROXY') or os.getenv('https_proxy'))
try:
    print('Hostname:', socket.gethostname())
except Exception as e:
    # Still best-effort, but report the failure like the checks above
    # instead of hiding it.
    print('Hostname lookup failed:', type(e).__name__, str(e)[:200])


In [None]:
# Provider implementations

# Browser-like request headers reused by the providers that send them
headers_common = dict([
    ('User-Agent', CONFIG['user_agent']),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
])


def search_bing_api(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    """Query the Bing Web Search API and return normalized results.

    Args:
        query: Search terms.
        max_results: Requested result count and the cap on returned items.

    Returns:
        ``(items, meta)`` where ``items`` is a list of result dicts and
        ``meta`` carries ``latency_ms`` and the HTTP ``status``.

    Raises:
        ValueError: If ``CONFIG['bing_api_key']`` is empty.
        Exception: Whatever ``requests`` raises for transport failures, HTTP
            error statuses (``raise_for_status``) or an undecodable body.
    """
    require(CONFIG['bing_api_key'], 'bing_api_key')
    endpoint = CONFIG['bing_endpoint']
    query_params = dict(q=query, count=max_results, mkt='en-US', responseFilter='Webpages')
    auth_headers = {'Ocp-Apim-Subscription-Key': CONFIG['bing_api_key']}

    # The timed span covers the request, the status check and JSON decoding
    with Timer() as timer:
        response = requests.get(endpoint, params=query_params, headers=auth_headers, timeout=CONFIG['timeout'])
        response.raise_for_status()
        payload = response.json()

    # 'webPages' may be absent or null; both cases become an empty mapping
    web_pages = payload.get('webPages', {}) or {}
    items = [
        {
            'title': page.get('name', ''),
            'snippet': truncate(page.get('snippet', ''), 200),
            'url': page.get('url', ''),
            'source': 'Bing API',
        }
        for page in web_pages.get('value', [])
    ]
    meta = {'latency_ms': timer.ms, 'status': response.status_code}
    return items[:max_results], meta


def search_serpapi_bing(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    """Query SerpAPI's Bing engine and return normalized organic results.

    Args:
        query: Search terms.
        max_results: Requested page size and the cap on returned items.

    Returns:
        ``(items, meta)`` where ``items`` is a list of result dicts and
        ``meta`` carries ``latency_ms`` and the HTTP ``status``.

    Raises:
        ValueError: If ``CONFIG['serpapi_key']`` is empty.
        requests.RequestException: On transport failures, HTTP error statuses
            or an undecodable body; the API key is masked in the message.
    """
    api_key = CONFIG['serpapi_key']
    require(api_key, 'serpapi_key')
    url = 'https://serpapi.com/search.json'
    params = {
        'engine': 'bing',
        'q': query,
        # SerpAPI's Bing engine sizes the page with `count` (documented range
        # 1-50); `num` is a parameter of other engines.
        'count': max(1, min(max_results, 50)),
        'api_key': api_key,
    }
    try:
        with Timer() as t:
            r = requests.get(url, params=params, timeout=CONFIG['timeout'])
            r.raise_for_status()
            data = r.json()
    except requests.RequestException as exc:
        # The key travels in the query string and requests puts the full URL
        # into its error text, which the benchmark harness prints. Mask it.
        exc.args = (str(exc).replace(api_key, '***'),)
        raise
    items = [
        {
            'title': res.get('title', ''),
            'snippet': truncate(res.get('snippet', ''), 200),
            'url': res.get('link', ''),
            'source': 'SerpAPI (Bing)',
        }
        for res in data.get('organic_results', [])[:max_results]
    ]
    return items, {'latency_ms': t.ms, 'status': r.status_code}


def search_searxng(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    """Query the SearXNG instance at ``CONFIG['searxng_base']`` via its JSON API.

    Args:
        query: Search terms.
        max_results: Maximum number of items to return.

    Returns:
        ``(items, meta)`` where ``items`` is a list of result dicts and
        ``meta`` carries ``latency_ms`` and the HTTP ``status``.
    """
    endpoint = CONFIG['searxng_base'].rstrip('/') + '/search'
    query_params = dict(q=query, format='json', engines='bing,duckduckgo', language='en')

    # The timed span covers the request, the status check and JSON decoding
    with Timer() as timer:
        response = requests.get(endpoint, params=query_params, headers=headers_common, timeout=CONFIG['timeout'])
        response.raise_for_status()
        payload = response.json()

    items = [
        {
            'title': hit.get('title', ''),
            'snippet': truncate(hit.get('content', ''), 200),
            'url': hit.get('url', ''),
            'source': 'SearXNG',
        }
        for hit in payload.get('results', [])[:max_results]
    ]
    meta = {'latency_ms': timer.ms, 'status': response.status_code}
    return items, meta


def _ddg_target_url(href: str) -> str:
    """Return the destination behind a DuckDuckGo ``/l/?uddg=`` redirect link.

    Links that do not have that redirect shape are returned unchanged.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(href)
    host = parsed.netloc.lower()
    is_ddg_host = host == 'duckduckgo.com' or host.endswith('.duckduckgo.com')
    if is_ddg_host and parsed.path.rstrip('/') == '/l':
        targets = parse_qs(parsed.query).get('uddg')
        if targets:
            return targets[0]
    return href


def search_duckduckgo_html(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    """Scrape DuckDuckGo's no-JS HTML endpoint and return normalized results.

    Args:
        query: Search terms.
        max_results: Maximum number of items to return.

    Returns:
        ``(items, meta)`` where ``items`` is a list of result dicts and
        ``meta`` carries ``latency_ms`` and the HTTP ``status``.

    Raises:
        Exception: Whatever ``requests`` raises for transport failures or
            HTTP error statuses (``raise_for_status``).
    """
    url = 'https://html.duckduckgo.com/html/'
    payload = {'q': query}
    # The context manager closes the session's pooled connections on exit
    with requests.Session() as session:
        session.headers.update(headers_common)
        # The timed span covers the request, the status check and HTML parsing
        with Timer() as t:
            r = session.post(url, data=payload, timeout=CONFIG['timeout'])
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
    items: List[Result] = []
    for element in soup.find_all('div', class_='result'):
        # Count only usable entries toward the cap, so skipped blocks do not
        # shrink the returned list.
        if len(items) >= max_results:
            break
        a = element.find('a', class_='result__a')
        if not a:
            continue
        title = a.get_text(strip=True) or ''
        href = a.get('href', '')
        if not title or not href:
            continue
        snippet_elem = element.find('a', class_='result__snippet')
        snippet = snippet_elem.get_text(strip=True) if snippet_elem else ''
        items.append({
            'title': title,
            'snippet': truncate(snippet, 200) if snippet else 'DuckDuckGo result',
            # Result anchors may point at a DuckDuckGo redirect wrapper;
            # report the real destination when the link has that shape.
            'url': _ddg_target_url(href),
            'source': 'DuckDuckGo (HTML)'
        })
    return items, {'latency_ms': t.ms, 'status': r.status_code}


def search_brave_html(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    """Scrape Brave Search's HTML results page and return normalized results.

    Args:
        query: Search terms.
        max_results: Maximum number of items to return.

    Returns:
        ``(items, meta)`` where ``items`` is a list of result dicts and
        ``meta`` carries ``latency_ms`` and the HTTP ``status``.

    Raises:
        Exception: Whatever ``requests`` raises for transport failures or
            HTTP error statuses (``raise_for_status``).
    """
    url = 'https://search.brave.com/search'
    # The context manager closes the session's pooled connections on exit
    with requests.Session() as session:
        session.headers.update(headers_common)
        # The timed span covers the request, the status check and HTML parsing
        with Timer() as t:
            # requests URL-encodes the query string itself
            r = session.get(url, params={'q': query}, timeout=CONFIG['timeout'])
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
    items: List[Result] = []
    # Brave's markup changes often; only this one selector pair is tried:
    # div.snippet containing a.result-header (plus p.snippet-description).
    for element in soup.find_all('div', class_='snippet'):
        if len(items) >= max_results:
            break
        a = element.find('a', class_='result-header')
        if not a:
            continue
        title = a.get_text(strip=True) or ''
        href = a.get('href', '')
        p = element.find('p', class_='snippet-description')
        desc = p.get_text(strip=True) if p else ''
        # Keep only absolute http(s) links
        if title and href and href.startswith('http'):
            items.append({
                'title': title,
                'snippet': truncate(desc, 200) if desc else 'Brave result',
                'url': href,
                'source': 'Brave (HTML)'
            })
    return items, {'latency_ms': t.ms, 'status': r.status_code}



In [None]:
# Benchmark harness

def _late_bound(fn_name: str):
    """Return a wrapper that looks up ``fn_name`` in module globals on every call."""
    def _call(query: str, max_results: int = 5):
        return globals()[fn_name](query, max_results=max_results)
    _call.__name__ = fn_name
    return _call


# (label, callable) pairs in the order they are benchmarked. Providers are
# resolved by name at call time instead of being captured as function objects:
# a later cell redefines search_searxng, and a captured reference would keep
# benchmarking the earlier definition.
PROVIDERS = [
    ('Bing API', _late_bound('search_bing_api')),
    ('SerpAPI (Bing)', _late_bound('search_serpapi_bing')),
    ('SearXNG', _late_bound('search_searxng')),
    ('DuckDuckGo HTML', _late_bound('search_duckduckgo_html')),
    ('Brave HTML', _late_bound('search_brave_html')),
]

# Queries sent to every provider
TEST_QUERIES = [
    'python web scraping tutorial',
    'latest LLM research 2025',
    'how to use pandas merge',
]

# Result cap requested from each provider per query
MAX_RESULTS = 5


def run_single(provider_name: str, fn, query: str, max_results: int = MAX_RESULTS) -> Dict[str, Any]:
    """Run one provider for one query and describe the outcome as a flat dict.

    Args:
        provider_name: Label recorded in the ``provider`` field.
        fn: Callable invoked as ``fn(query, max_results=...)`` that returns
            ``(results, meta)``.
        query: Search terms passed to ``fn``.
        max_results: Result cap passed to ``fn``.

    Returns:
        On success: ``provider``, ``query``, ``ok=True``, ``count``,
        ``latency_ms``, ``status`` and ``sample`` (the first two results).
        On failure: ``provider``, ``query``, ``ok=False``, ``status`` (the
        HTTP status when the exception carries a response, else ``None``)
        and ``error``. Exceptions from ``fn`` are never propagated.
    """
    try:
        results, meta = fn(query, max_results=max_results)
        return {
            'provider': provider_name,
            'query': query,
            'ok': True,
            'count': len(results),
            'latency_ms': meta.get('latency_ms', None),
            'status': meta.get('status', None),
            'sample': results[:2] if results else []
        }
    except Exception as e:
        # Exceptions that carry an HTTP response expose it as `.response`;
        # keep its status code so the report shows e.g. 403 or 429.
        response = getattr(e, 'response', None)
        return {
            'provider': provider_name,
            'query': query,
            'ok': False,
            'status': getattr(response, 'status_code', None),
            # Include the exception type: some exceptions stringify to an
            # empty or ambiguous message (e.g. KeyError shows only the key).
            'error': f"{type(e).__name__}: {e}"
        }


def run_benchmarks(queries: List[str] = TEST_QUERIES) -> pd.DataFrame:
    """Run every entry of ``PROVIDERS`` against each query.

    Rows are ordered query by query, with providers in ``PROVIDERS`` order
    inside each query; each row is the dict produced by ``run_single``.
    """
    records = [
        run_single(provider_label, search_fn, text)
        for text in queries
        for provider_label, search_fn in PROVIDERS
    ]
    return pd.DataFrame(records)



In [None]:
# SearXNG: SSL fallback, proxy support, and endpoint rotation
from urllib.parse import urlparse

# Candidate SearXNG instances; the configured base comes first
SEARXNG_ENDPOINTS = [CONFIG['searxng_base']] + [
    'https://searx.be',
    'https://searx.tiekoetter.com',
    'https://searxng.site',
]

# Proxy URLs from the environment: the upper-case variable is preferred over
# the lower-case one, and an unset or empty value becomes None.
PROXY_ENV = {
    scheme: os.getenv(f'{scheme.upper()}_PROXY') or os.getenv(f'{scheme}_proxy') or None
    for scheme in ('http', 'https')
}

# Replace search_searxng with a hardened version

def search_searxng(query: str, max_results: int = 5) -> Tuple[List[Result], Dict[str, Any]]:
    """Query SearXNG, rotating through ``SEARXNG_ENDPOINTS`` until one answers.

    For each endpoint a normal request is attempted first. If that fails with
    an SSL error, the request is retried once with certificate verification
    disabled, and results from that retry are labelled "(insecure)".

    WARNING: the insecure retry accepts any certificate, so its results can be
    tampered with in transit; it is meant for lab environments only.

    Args:
        query: Search terms.
        max_results: Maximum number of items to return.

    Returns:
        ``(items, meta)`` from the first endpoint that succeeds; ``meta``
        carries ``latency_ms`` (of the successful attempt) and ``status``.

    Raises:
        RuntimeError: If no endpoint is configured or every endpoint fails;
            the last underlying exception is attached as the cause.
    """
    params = {
        'q': query,
        'format': 'json',
        'engines': 'bing,duckduckgo',
        'language': 'en'
    }
    # Pass proxies explicitly only when at least one is configured
    proxies = PROXY_ENV if any(PROXY_ENV.values()) else None

    def _fetch(session, base: str, verify: bool) -> Tuple[List[Result], Dict[str, Any]]:
        """Run one request against ``base`` and normalize its JSON results."""
        # Leave `verify` unset on the normal path so session defaults apply
        extra = {} if verify else {'verify': False}
        with Timer() as t:
            r = session.get(f"{base}/search", params=params, timeout=CONFIG['timeout'], proxies=proxies, **extra)
            r.raise_for_status()
            data = r.json()
        label = 'SearXNG' if verify else 'SearXNG (insecure)'
        host = urlparse(base).netloc
        items = [
            {
                'title': res.get('title', ''),
                'snippet': truncate(res.get('content', ''), 200),
                'url': res.get('url', ''),
                'source': f"{label} ({host})"
            }
            for res in data.get('results', [])[:max_results]
        ]
        return items, {'latency_ms': t.ms, 'status': r.status_code}

    last_exc = None
    # The context manager closes the session's pooled connections on exit
    with requests.Session() as session:
        session.headers.update(headers_common)
        for base in (e.rstrip('/') for e in SEARXNG_ENDPOINTS if e):
            try:
                return _fetch(session, base, verify=True)
            except requests.exceptions.SSLError as e:
                last_exc = e
                # Last resort for self-signed/invalid certificates
                try:
                    return _fetch(session, base, verify=False)
                except Exception as e2:
                    last_exc = e2
            except Exception as e:
                last_exc = e

    if last_exc is None:
        raise RuntimeError("SearXNG failed across endpoints: no endpoints configured")
    raise RuntimeError(
        f"SearXNG failed across endpoints: {type(last_exc).__name__}: {str(last_exc)[:200]}"
    ) from last_exc


In [None]:
# Run and report

# Optional: quickly check which providers are enabled by config
print('Config summary:')
print('- bing_api_key set:', bool(CONFIG['bing_api_key']))
print('- serpapi_key set:', bool(CONFIG['serpapi_key']))
print('- searxng_base:', CONFIG['searxng_base'])

# Execute benchmarks
results_df = run_benchmarks(TEST_QUERIES)

# Summary table: keep only the columns that exist in this run
summary_cols = ['provider', 'ok', 'count', 'latency_ms', 'status', 'query', 'error']
summary = results_df.reindex(columns=[c for c in summary_cols if c in results_df.columns])
print('\nSummary:')
print(tabulate(summary.fillna(''), headers='keys', tablefmt='github'))

# Show sample results for successful runs
print('\nSamples:')
for _, row in results_df.iterrows():
    if not (row.get('ok') and row.get('sample')):
        continue
    # Once any provider fails, pandas stores count/latency as floats (NaN marks
    # the gaps), so convert back to whole numbers to avoid printing "5.0".
    count = int(row['count'])
    latency = row.get('latency_ms')
    latency = '' if latency is None or pd.isna(latency) else int(latency)
    print(f"\n== {row['provider']} | Query: {row['query']} | Count: {count} | {latency} ms ==")
    for i, item in enumerate(row['sample'], 1):
        print(f"{i}. {item.get('title','')}")
        if item.get('snippet'):
            print(f"   {truncate(item['snippet'], 160)}")
        if item.get('url'):
            print(f"   URL: {item['url']}")

