In [1]:
!pip3 install --upgrade pip
!pip3 install requests beautifulsoup4



In [2]:
import requests
from logger import get_logger
from bs4 import BeautifulSoup
import re

register = get_logger()

In [3]:
def _request_page(base_url: str, concat_url=None):
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
    }
    with requests.Session() as session:
        address = base_url + concat_url if concat_url else base_url
        try:
            r = session.get(url=address, timeout=5, headers=HEADERS)

            if r.status_code == 200:
                register.info(
                    f"Request OK, status code: {r.status_code}, URL: {address}"
                )
                return r.content, session
            else:
                register.warning(f"Bad Request: {r.status_code}, URL: {address}")
        except requests.RequestException as e:
            register.error(f"Request failed: {e}, URL: {address}")
            return None

In [4]:
def _collect_pages(content):
    soup = BeautifulSoup(content, "html.parser")
    address_founded = []
    pattern = r"page-(\d+)\.html"

    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "previous" not in href and re.search(pattern, href):
            href_formated = href.replace("catalogue/", "")
            if "page-1.html" in href_formated:
                continue
            else:
                address_founded.append(href_formated)

    return address_founded

In [5]:
def scrape_all_pages(base_url, session=None, sub_path="/catalogue/"):
    try:
        visited_pages = set()
        to_visit = [""]
        while to_visit:
            current_page = to_visit.pop(0)
            if current_page in visited_pages:
                continue

            if current_page == "":
                url_base = base_url
            else:
                url_base = base_url + sub_path
                register.info(f"Current page is: {current_page}")

            content, session = _request_page(base_url=url_base, concat_url=current_page)
            if content is None:
                continue

            visited_pages.add(current_page)

            new_pages = _collect_pages(content=content)
            for page in new_pages:
                if page not in visited_pages and page not in to_visit:
                    to_visit.append(page)
        register.info(f"Scraping completed. Pages visited: {len(visited_pages)}")
        return visited_pages, session

    except Exception as e:
        register.error(f"Exception in fuction 'scrape_all_pages': {e}")
        if session:
            register.info("Session close and finished.")
            session.close()

In [6]:
def scrape_all_address_books(base_url, visited_pages, session):
    try:
        product_address = set()

        for url in visited_pages:
            if url == "":
                content, session = _request_page(base_url=base_url)

            elif url != "":
                content, session = _request_page(
                    base_url=base_url, concat_url=("catalogue/" + url)
                )

            soup = BeautifulSoup(content, "html.parser")
            products_books = soup.find_all(class_="image_container")

            for product in products_books:
                address = product.find("a", href=True)
                address_collected = address.get("href")
                if "catalogue/" in address_collected:
                    address_collected.replace("catalogue/", "")
                    product_address.add(address_collected)
                else:
                    product_address.add(address_collected)

        register.info(
            f"Scraping completed. Books address collected: {len(product_address)}"
        )
        return product_address, session

    except Exception as e:
        register.error(f"Exception in fuction 'scrape_all_address_books': {e}")
        if session:
            register.info("Session close and finished.")
            session.close()

In [7]:
def parser_books(base_url, product_address, session):
    try:
        for url in product_address:
            content, session = _request_page(
                base_url=base_url, concat_url=("catalogue/" + url)
            )
            soup = BeautifulSoup(content, "html.parser")
            book_title = soup.find("h1")
            price = soup.find(class_="price_color")
            p_tag = soup.find("p", class_="instock availability")
            qtd_stock = p_tag.get_text(strip=True)
            positions_p_tag = soup.find_all("p")

            for position in positions_p_tag:
                position[4]
            print(f"Book title is: {book_title.text}, Price: {price.text}, Stock: {qtd_stock} \n Description of book: {description.text}")

    except Exception as e:
        register.error(f"Exception in fuction 'parser_books': {e}")
        if session:
            register.info("Session close and finished.")
            session.close()

In [8]:
base_url = "https://books.toscrape.com/"
visited_pages, session = scrape_all_pages(base_url=base_url)

2025-01-28 18:00:51 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/
2025-01-28 18:00:51 - logger:INFO - Current page is: page-2.html
2025-01-28 18:00:52 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com//catalogue/page-2.html
2025-01-28 18:00:52 - logger:INFO - Current page is: page-3.html
2025-01-28 18:00:53 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com//catalogue/page-3.html
2025-01-28 18:00:53 - logger:INFO - Current page is: page-4.html
2025-01-28 18:00:54 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com//catalogue/page-4.html
2025-01-28 18:00:54 - logger:INFO - Current page is: page-5.html
2025-01-28 18:00:55 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com//catalogue/page-5.html
2025-01-28 18:00:55 - logger:INFO - Current page is: page-6.html
2025-01-28 18:00:56 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrap

In [9]:
product_address, session = scrape_all_address_books(
    base_url=base_url, visited_pages=visited_pages, session=session
)

2025-01-28 18:01:39 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/
2025-01-28 18:01:40 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/catalogue/page-35.html
2025-01-28 18:01:41 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/catalogue/page-42.html
2025-01-28 18:01:42 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/catalogue/page-44.html
2025-01-28 18:01:43 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/catalogue/page-33.html
2025-01-28 18:01:44 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/catalogue/page-49.html
2025-01-28 18:01:44 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/catalogue/page-4.html
2025-01-28 18:01:45 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/catalogue/page-50.html
2025-01-28 18:01:47 - logger:INFO - Request OK, status 

In [10]:
parser_books(base_url=base_url, product_address=product_address, session=session)

2025-01-28 18:02:26 - logger:INFO - Request OK, status code: 200, URL: https://books.toscrape.com/catalogue/the-white-cat-and-the-monk-a-retelling-of-the-poem-pangur-ban_865/index.html
2025-01-28 18:02:26 - logger:ERROR - Exception in fuction 'parser_books': 4
2025-01-28 18:02:26 - logger:INFO - Session close and finished.
