**Run on Google Colab (Quickstart)**

```bash
! git clone --branch main --single-branch https://github.com/sbaaihamza/scrapping-lib.git
%cd scrapping-lib
! pip install -e ".[browser,dev]"
! playwright install
# Preferred (installs OS deps automatically on supported distros):
! playwright install --with-deps chromium
# If needed (manual deps fallback):
! apt-get update
! apt-get install -y libxcomposite1 libxcursor1 libgtk-3-0 libatk1.0-0 libcairo2 libgdk-pixbuf2.0-0
%cd /content/scrapping-lib/notebooks
```

*Note: Playwright has both sync and async APIs. These notebooks are designed to be async-safe for Jupyter/Colab. If you encounter OS dependency issues, use the `playwright install --with-deps chromium` command.*



# Engine In-Depth: HTTP Engine Cases

This notebook covers the configuration and operation of the `http` engine, ranging from basic page fetches to complex pagination and rate limiting.

### Output Directory
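Results are written to `results_notebook_http/` (relative to the repository root, which the setup cell switches into). The directory is only used by the final trial, which runs when `ONLINE=1`.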

## Setup

In [None]:
import json
import os
import sys
from pathlib import Path


def find_repo_root(start_path):
    """Locate the nearest directory at or above ``start_path`` holding ``pyproject.toml``.

    The start path is resolved first; it and each of its ancestors are then
    checked in order from nearest to farthest.

    Returns:
        The first directory that contains ``pyproject.toml``, or the resolved
        start path itself when none of the candidates does.
    """
    origin = Path(start_path).resolve()
    candidates = (origin, *origin.parents)
    return next(
        (folder for folder in candidates if (folder / 'pyproject.toml').exists()),
        origin,
    )

# Resolve the repository root starting from the notebook's working directory.
REPO_ROOT = find_repo_root(Path.cwd())
# find_repo_root falls back to the start directory when no pyproject.toml is
# found; say so here instead of failing later on relative fixture/config paths.
if not (REPO_ROOT / 'pyproject.toml').exists():
    print(f'Warning: no pyproject.toml found at or above {REPO_ROOT}; relative paths may not resolve.')
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))
# All relative paths used below (fixtures, configs, results) are resolved from here.
os.chdir(str(REPO_ROOT))

# Network-dependent demos run only when the ONLINE env var is exactly '1'.
ONLINE = os.getenv('ONLINE', '0') == '1'
RESULTS_DIR = Path('results_notebook_http')
print(f'Python version: {sys.version}')
print(f'Repo root: {REPO_ROOT}')
print(f'Online mode: {ONLINE}')

## Case 0: Minimal Public Page
The HTTP engine is best suited to static sites that don't require JavaScript.

In [None]:
from scrapping.engines.http import HttpEngine
from scrapping.extraction.link_extractors import LinkExtractRequest, extract_links

# Target page for the live fetch; only requested when ONLINE is set.
url = 'https://quotes.toscrape.com/'
if ONLINE:
    # Live mode: fetch the page with default engine options.
    engine = HttpEngine()
    res = engine.get(url)
    if res.ok:
        html = res.text
    else:
        # Report the failure and fall through to the "no HTML" branch below.
        print(f"❌ Fetch failed: {res.short_error()}")
        html = ""
else:
    print('ONLINE=0: using fixtures')
    fixture_path = 'tests/fixtures/html/listing_quotes.html'
    if Path(fixture_path).exists():
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding (assumes the fixture is stored as UTF-8).
        html = Path(fixture_path).read_text(encoding='utf-8')
    else:
        # A missing fixture is treated the same as a failed fetch.
        html = ""

if html:
    # Extract the links matched by the CSS selector from the page markup.
    req = LinkExtractRequest(html=html, method='css', selector='.quote span a')
    links = extract_links(req)
    print(f'Found {len(links)} links. Samples: {links[:3]}')
else:
    print("No HTML to parse.")

## Case 1: Advanced Link Extraction
Moving beyond basic CSS selectors to robust patterns.

In [None]:
import re

from scrapping.extraction.link_extractors import LinkExtractRequest, extract_links, normalize_url

# 1. Regex extraction (Absolute URLs)
html_abs = '<a href="https://example.com/p/1">P1</a> <a href="https://example.com/p/2">P2</a>'
req_abs = LinkExtractRequest(html=html_abs, method='regex', pattern=r'https://example\.com/p/\d+')
print(f"Regex (Absolute): {extract_links(req_abs)}")

# 2. Regex extraction (Relative URLs + Base URL join)
html_rel = '<a href="/p/1">P1</a> <a href="/p/2">P2</a>'
req_rel = LinkExtractRequest(html=html_rel, base_url="https://example.com", method='regex', pattern=r'/p/\d+')
print(f"Regex (Relative): {extract_links(req_rel)}")

# 3. Include/Exclude filters
all_links = ['/p/1', '/p/2', '/about', '/contact']
include = r'/p/\d+'

# Keep only the links that match the include pattern anywhere in the string.
filtered = [link for link in all_links if re.search(include, link)]
# Interpolate the pattern instead of repeating it in the f-string: a literal
# "\d" in a non-raw string is an invalid escape sequence and triggers a
# warning on recent Python versions. The printed text is unchanged.
print(f"Filtered (Include {include}): {filtered}")

# 4. Canonicalization
urls = [
    "https://example.com/p/1#fragment",
    "https://example.com/p/1/",
    "https://example.com/p/1?utm_source=fb"
]
# Normalize each variant with fragment and tracking-parameter removal enabled.
normalized = [normalize_url(u, drop_fragments=True, drop_tracking_params=True) for u in urls]
print(f"Normalized: {normalized}")

In [None]:
import re

# Sanity-check the two regex patterns used in the extraction examples above.
pattern_abs = r'https://example\.com/p/\d+'
test_abs = "https://example.com/p/123"
matches_abs = re.compile(pattern_abs).findall(test_abs)
print(f"Absolute regex check: {matches_abs}")
# The absolute sample must produce exactly one match.
assert len(matches_abs) == 1

pattern_rel = r'/p/\d+'
test_rel = "/p/456"
matches_rel = re.compile(pattern_rel).findall(test_rel)
print(f"Relative regex check: {matches_rel}")
# The relative sample must produce exactly one match.
assert len(matches_rel) == 1


## Case 2: Pagination Patterns
Learn how to handle template-based and discovery-based pagination.

In [None]:
from scrapping.pipeline.stages import discover_listing_urls

# 1. Template: Page path
# One entrypoint whose URL carries a {page} placeholder; paging asks for
# 3 pages in 'page' mode starting at 1.
cfg_page = {
    'entrypoints': [
        {
            'url': 'https://quotes.toscrape.com/page/{page}/',
            'paging': {'mode': 'page', 'start': 1, 'pages': 3},
        }
    ]
}
print(f"Page Template: {discover_listing_urls(cfg_page)}")

# 2. Template: Offset
# Same shape, but the URL carries an {offset} placeholder and paging uses
# 'offset' mode with start 0 and step 10 over 3 pages.
cfg_offset = {
    'entrypoints': [
        {
            'url': 'https://httpbin.org/get?start={offset}',
            'paging': {'mode': 'offset', 'start': 0, 'step': 10, 'pages': 3},
        }
    ]
}
print(f"Offset Template: {discover_listing_urls(cfg_offset)}")

# 3. Discovery: Next-link
def extract_next_link(html, base_url):
    """Return the first link matched by the CSS selector ``li.next a``.

    Args:
        html: Page markup to search.
        base_url: Passed through to the link-extraction request
            (presumably used to resolve relative hrefs; confirm in
            ``LinkExtractRequest``).

    Returns:
        The first extracted link, or ``None`` when nothing matched.
    """
    next_links = extract_links(
        LinkExtractRequest(html=html, base_url=base_url, method='css', selector='li.next a')
    )
    if not next_links:
        return None
    return next_links[0]

if ONLINE:
    res = HttpEngine().get('https://quotes.toscrape.com/')
    if res.ok:
        # Use the final URL as the base passed to the link extractor.
        next_page = extract_next_link(res.text, res.final_url)
        print(f"Discovered Next Page: {next_page}")
    else:
        # Do not parse the body of a failed fetch; report it the same way
        # the minimal-page example does and leave next_page unset.
        next_page = None
        print(f"❌ Fetch failed: {res.short_error()}")
else:
    print("Next Page extraction: (needs online or fixture with next link)")

## Case 3: Rate Limiting & Retries
Handling high-load targets safely and respectfully.

In [None]:
from scrapping.engines.http import HttpEngine, HttpEngineOptions

# 1. Retry behavior on status codes
if not ONLINE:
    print('Offline: Skipping 429 demo')
else:
    url_429 = 'https://httpbin.org/status/429'
    # Retry settings are spelled out explicitly: at most 2 retries, a 1.0 s
    # base delay, and the 'exp' backoff mode.
    opts = HttpEngineOptions(
        max_retries=2,
        base_delay_s=1.0,
        backoff_mode='exp',
    )
    engine = HttpEngine(options=opts)
    print('Fetching 429 endpoint (will retry with exp backoff)...')
    res = engine.get(url_429)
    # NOTE(review): the label says "retries" but the number printed is
    # len(res.engine_trace); confirm the trace holds one entry per retry
    # rather than one per attempt.
    print(f'Final status: {res.status_code} after {len(res.engine_trace)} retries')

# 2. Rate Limiting (RPS control)
# rps=0.5 corresponds to one request every two seconds. The engine is only
# constructed here to show the configuration; no request is issued with it.
opts_rl = HttpEngineOptions(rps=0.5)
engine_rl = HttpEngine(options=opts_rl)
print("Engine configured with 0.5 RPS (compliance-first)")

## Case 4: Block Detection
Identify when a site is blocking traffic.

In [None]:
from scrapping.runtime.blocks import classify_blocks

# Synthetic page that mimics a block response: an "Access Denied" title and a
# CAPTCHA prompt in the body.
html_blocked = (
    "<html><title>Access Denied</title>"
    "<body>Please solve CAPTCHA to continue.</body></html>"
)
# Classify the markup and show whichever block signals come back.
signals = classify_blocks(html_blocked)
print(f"Detection signals: {signals}")

## Final Trial: http_quotes.json

In [None]:
from scrapping.orchestrator import Orchestrator, OrchestratorOptions

# Load the example config. JSON is decoded explicitly as UTF-8 so the result
# does not depend on the platform's locale encoding.
with open('examples/configs/real/http_quotes.json', encoding='utf-8') as f:
    cfg = json.load(f)

if ONLINE:
    # Single-worker run writing into the notebook's results directory.
    orch = Orchestrator(options=OrchestratorOptions(results_dir=RESULTS_DIR, parallelism=1))
    # Limit for trial: only the first page of the first source's first entrypoint.
    cfg['sources'][0]['entrypoints'][0]['paging']['pages'] = 1
    out = orch.run(cfg)
    print(json.dumps(out, indent=2))
else:
    print('ONLINE=0: skipping online trial')