Task 06 Question 03


In [2]:
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Entry page of the job board; the crawl starts from this URL.
BASE_URL = "https://realpython.github.io/fake-jobs/"

# Browser-like User-Agent so that basic bot filters do not reject the requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
}

# Retry policy for transient failures: at most 3 retries with backoff, applied
# to GET requests only, and also triggered by the listed HTTP status codes.
retry = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
)

# One shared session carries both the headers and the retry policy.
# The retrying adapter is mounted for https:// URLs only.
session = requests.Session()
session.headers.update(headers)
session.mount("https://", HTTPAdapter(max_retries=retry))

# Crawl state.
# `jobs` collects one dict (title/company/location) per job card found.
# `max_pages` caps how many pages are fetched in a single run.
jobs = []
url = BASE_URL
pages_crawled = 0
max_pages = 3
visited = set()  # URLs already fetched; used to stop on circular pagination

while url and pages_crawled < max_pages:
    visited.add(url)
    try:
        resp = session.get(url, timeout=15)
        resp.raise_for_status()
    except requests.HTTPError as e:
        # e.response can be None; be defensive
        code = getattr(e.response, "status_code", "unknown")
        reason = getattr(e.response, "reason", "")
        print(f"HTTP error {code} {reason} at {url}")
        break
    except requests.RequestException as e:
        # Anything else raised by requests (connection errors, timeouts, ...)
        print(f"Request error at {url}: {e}")
        break

    # After a redirect the page that was actually served lives at resp.url,
    # which may differ from the URL that was requested.
    visited.add(resp.url)

    soup = BeautifulSoup(resp.text, "html.parser")

    # Parse each job card; cards missing any of the three fields are skipped
    for card in soup.select("div.card-content"):
        title = card.select_one("h2.title")
        company = card.select_one("h3.company")
        location = card.select_one("p.location")
        if title and company and location:
            jobs.append(
                {
                    "title": title.get_text(strip=True),
                    "company": company.get_text(strip=True),
                    "location": location.get_text(strip=True),
                }
            )

    # Follow pagination: li.next > a
    next_link = soup.select_one("li.next > a")
    next_href = next_link.get("href") if next_link else None
    # Resolve against the final response URL so relative links stay correct
    # even when the request was redirected.
    url = urljoin(resp.url, next_href) if next_href else None
    if url in visited:
        # A "next" link pointing at a page already scraped would only add
        # duplicate rows, so end the crawl here.
        print(f"Pagination loop detected at {url}; stopping")
        url = None
    pages_crawled += 1

# Write the collected jobs to a CSV file: a header row, then one row per job.
csv_path = "fake_jobs.csv"
fieldnames = ["title", "company", "location"]

# newline="" hands line-ending control to the csv module.
with open(csv_path, mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for job in jobs:
        writer.writerow(job)

print(f"Saved {len(jobs)} rows to {csv_path}")



Saved 100 rows to fake_jobs.csv
