In [1]:
import requests
from logger import get_logger
from bs4 import BeautifulSoup
import re

# Logger used by every scraping helper in this notebook for info/warning/error
# messages. `get_logger` comes from the project-local `logger` module.
register = get_logger()

In [2]:
def _request_page(base_url: str, concat_url=None):
    """Fetch one page with a browser-like User-Agent and return its raw body.

    Args:
        base_url: Address to request, or the prefix of it when ``concat_url``
            is given.
        concat_url: Optional suffix appended verbatim to ``base_url`` (plain
            string concatenation; no separator is inserted).

    Returns:
        A ``(content, session)`` tuple on every path, so callers can always
        unpack the result. ``content`` is the response body (``r.content``)
        when the server answers 200, and ``None`` for any other status code or
        when the request raises ``requests.RequestException``.

    Note:
        The session is created by a ``with`` block, so it has already been
        closed by the time the caller receives it.
    """
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
    }
    address = base_url + concat_url if concat_url else base_url

    with requests.Session() as session:
        try:
            r = session.get(url=address, timeout=5, headers=HEADERS)
        except requests.RequestException as e:
            register.error(f"Request failed: {e}, URL: {address}")
            # Same shape as the success path: callers unpack two values.
            return None, session

        if r.status_code == 200:
            register.info(
                f"Request OK, status code: {r.status_code}, URL: {address}"
            )
            return r.content, session

        # Non-200 answers used to fall off the end of the function and return a
        # bare None, which broke `content, session = _request_page(...)`.
        register.warning(f"Bad Request: {r.status_code}, URL: {address}")
        return None, session

In [3]:
def _collect_pages(content):
    """Pull the pagination links out of a page's HTML.

    Args:
        content: HTML markup handed to ``BeautifulSoup`` with the
            ``html.parser`` backend.

    Returns:
        A list, in document order, of the ``href`` values that match
        ``page-<digits>.html`` and do not contain ``"previous"``. Every
        ``"catalogue/"`` occurrence is removed from each value, and values that
        contain ``"page-1.html"`` after that removal are left out. Duplicates
        are not filtered.
    """
    page_link = re.compile(r"page-(\d+)\.html")
    anchors = BeautifulSoup(content, "html.parser").find_all("a", href=True)

    found = []
    for anchor in anchors:
        raw_href = anchor["href"]
        # Skip anything that is not a forward pagination link.
        if "previous" in raw_href or not page_link.search(raw_href):
            continue

        cleaned = raw_href.replace("catalogue/", "")
        if "page-1.html" not in cleaned:
            found.append(cleaned)

    return found

In [4]:
def scrape_all_pages(base_url, session=None, sub_path="/catalogue/"):
    """Crawl the pagination links reachable from ``base_url``.

    The crawl starts at ``base_url`` itself (tracked as the empty string ``""``)
    and then requests every page name returned by ``_collect_pages`` under
    ``base_url`` + ``sub_path``, in first-seen order.

    Args:
        base_url: Address of the first page to request.
        session: Not used to perform requests; it is replaced by the session
            returned from each request and is closed if the crawl aborts.
        sub_path: Path segment placed between ``base_url`` and each collected
            page name. Leading/trailing slashes are normalised.

    Returns:
        ``(visited_pages, session)`` where ``visited_pages`` is the set of page
        names whose request returned content. The same tuple (holding whatever
        was collected so far) is returned if an unexpected exception aborts the
        crawl, so callers that unpack the result keep working.
    """
    visited_pages = set()
    try:
        # Build the sub-path prefix once, avoiding the doubled slash that
        # "https://host/" + "/catalogue/" would otherwise produce.
        sub_segment = sub_path.strip("/")
        catalogue_url = base_url.rstrip("/") + "/"
        if sub_segment:
            catalogue_url += sub_segment + "/"

        to_visit = [""]
        while to_visit:
            current_page = to_visit.pop(0)
            if current_page in visited_pages:
                continue

            if current_page == "":
                url_base = base_url
            else:
                url_base = catalogue_url
                register.info(f"Current page is: {current_page}")

            result = _request_page(base_url=url_base, concat_url=current_page)
            if result is None:
                # The request helper may signal failure with a bare None.
                continue
            content, session = result
            if content is None:
                continue

            visited_pages.add(current_page)

            for page in _collect_pages(content=content):
                if page not in visited_pages and page not in to_visit:
                    to_visit.append(page)

        register.info(f"Scraping completed. Pages visited: {len(visited_pages)}")
        return visited_pages, session

    except Exception as e:
        register.error(f"Exception in fuction 'scrape_all_pages': {e}")
        if session:
            register.info("Session close and finished.")
            session.close()
        # Keep the documented return shape instead of an implicit None.
        return visited_pages, session

In [5]:
def scrape_all_address_books(base_url, visited_pages, session):
    """Collect the link of every book listed on the visited catalogue pages.

    Args:
        base_url: Site root. The empty page name ``""`` is requested at this
            address; every other page name is requested at
            ``base_url + "catalogue/" + name``.
        visited_pages: Iterable of page names, where ``""`` stands for the
            root page.
        session: Session carried over from the previous step; it is replaced
            by the session returned from each request and closed if the
            function aborts.

    Returns:
        ``(product_address, session)`` where ``product_address`` is the set of
        book hrefs gathered across *all* pages, with a leading ``"catalogue/"``
        removed. The same tuple (holding whatever was collected so far) is
        returned if an unexpected exception aborts the loop.
    """
    # Created once, outside the loop: resetting it per page kept only the
    # books of the last page and left it undefined for an empty input.
    product_address = set()
    try:
        for url in visited_pages:
            if url == "":
                result = _request_page(base_url=base_url)
            else:
                result = _request_page(
                    base_url=base_url, concat_url=("catalogue/" + url)
                )

            if result is None:
                # The request helper may signal failure with a bare None.
                continue
            content, session = result
            if content is None:
                register.warning(f"No content for page '{url}', skipping.")
                continue

            soup = BeautifulSoup(content, "html.parser")
            for product in soup.find_all(class_="product_pod"):
                link = product.find("a", href=True)
                if link is None:
                    continue
                href = link["href"]
                # NOTE(review): assumes links found on the root page carry the
                # same "catalogue/" prefix that _collect_pages strips from
                # pagination links. parser_books prepends "catalogue/" itself,
                # so the prefix is dropped here to avoid doubling it — confirm
                # against the site's markup.
                if href.startswith("catalogue/"):
                    href = href[len("catalogue/"):]
                product_address.add(href)

        # Report the number of books, not the number of pages.
        register.info(
            f"Scraping completed. Books address collected: {len(product_address)}"
        )
        return product_address, session

    except Exception as e:
        register.error(f"Exception in fuction 'scrape_all_address_books': {e}")
        if session:
            register.info("Session close and finished.")
            session.close()
        # Keep the documented return shape instead of an implicit None.
        return product_address, session

In [6]:
def parser_books(base_url, product_address, session):
    """Request every book page and extract its title, price and stock marker.

    Args:
        base_url: Site root; each book is requested at
            ``base_url + "catalogue/" + address``.
        product_address: Iterable of book hrefs relative to ``catalogue/``.
        session: Session carried over from the previous step; it is replaced
            by the session returned from each request and closed if the
            function aborts.

    Returns:
        A list with one dict per successfully parsed book, with the keys
        ``"url"``, ``"title"``, ``"price"`` and ``"qtd_stock"``. Books whose
        request fails or whose page lacks one of the expected elements are
        logged and skipped. If an unexpected exception aborts the loop, the
        records gathered so far are returned.
    """
    books = []
    try:
        register.info(f"Books to parse: {len(product_address)}")
        for url in product_address:
            result = _request_page(
                base_url=base_url, concat_url=("catalogue/" + url)
            )
            if result is None:
                # The request helper may signal failure with a bare None.
                continue
            content, session = result
            if content is None:
                register.warning(f"No content for book '{url}', skipping.")
                continue

            soup = BeautifulSoup(content, "html.parser")
            title_tag = soup.find("h1")
            price_tag = soup.find(class_="price_color")
            # NOTE(review): this reads the text of the <i class="icon-ok">
            # element itself; if the stock text lives in a surrounding element
            # this will be empty — confirm against the page markup.
            stock_tag = soup.find("i", class_="icon-ok")

            if title_tag is None or price_tag is None or stock_tag is None:
                register.warning(f"Missing fields for book '{url}', skipping.")
                continue

            # Keep the parsed values: they were previously computed and dropped.
            books.append(
                {
                    "url": url,
                    "title": title_tag.text,
                    "price": price_tag.text,
                    "qtd_stock": stock_tag.text,
                }
            )

        return books

    except Exception as e:
        register.error(f"Exception in fuction 'parser_books': {e}")
        if session:
            register.info("Session close and finished.")
            session.close()
        return books

In [None]:
# Root address of the site; later cells reuse `base_url` for every request.
base_url = "https://books.toscrape.com/"
# Step 1: follow the pagination links starting from the root page.
visited_pages, session = scrape_all_pages(base_url=base_url)

In [None]:
# Step 2: request each visited listing page and gather the links of its books.
# Depends on `base_url`, `visited_pages` and `session` from the previous cell.
product_address, session = scrape_all_address_books(
    base_url=base_url, visited_pages=visited_pages, session=session
)

In [None]:
# Step 3: request each collected book page and parse its title, price and
# stock marker. Depends on `product_address` and `session` from the previous cell.
parser_books(base_url=base_url, product_address=product_address, session=session)