In [None]:
!pip3 install --upgrade pip
!pip3 install requests beautifulsoup4 lxml pandas

# Step One - Extract information
#### The functions below are needed to scrape information from Books to Scrape

In [2]:
import requests
from logger import get_logger
from bs4 import BeautifulSoup
from lxml import html
import re
import concurrent.futures
import sqlite3
import pandas as pd


# Logger shared by every function in this notebook; `get_logger` is imported
# from the `logger` module in the import cell above.
register = get_logger()

In [3]:
def _request_page(base_url: str, session, concat_url=None):
    """Fetch one page and hand back its raw body.

    Args:
        base_url: URL to request.
        session: Object whose ``get(url=..., timeout=..., headers=...)`` is
            used for the request (callers pass a ``requests.Session``).
        concat_url: Optional suffix appended verbatim to ``base_url``;
            ignored when falsy.

    Returns:
        ``response.content`` when the status code is 200, otherwise ``None``
        (also ``None`` when a ``requests.RequestException`` is raised).
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
    }

    target = base_url
    if concat_url:
        target = base_url + concat_url

    try:
        response = session.get(url=target, timeout=5, headers=browser_headers)
        if response.status_code != 200:
            register.warning(f"Bad Request: {response.status_code}, URL: {target}")
            return None

        register.info(
            f"Request OK, status code: {response.status_code}, URL: {target}"
        )
        return response.content
    except requests.RequestException as error:
        register.error(f"Request failed: {error}, URL: {target}")
        return None

In [4]:
def _collect_pages(content):
    """List the pagination links present in a page.

    Args:
        content: HTML markup to parse.

    Returns:
        A list, in document order and possibly with duplicates, of every
        ``href`` that matches ``page-<number>.html`` and does not contain
        ``"previous"``. Any ``"catalogue/"`` fragment is stripped from each
        link, and links containing ``page-1.html`` are left out.
    """
    page_link = re.compile(r"page-(\d+)\.html")
    parsed = BeautifulSoup(content, "html.parser")

    # Filter on the raw href first, then normalise it.
    candidates = (
        anchor["href"].replace("catalogue/", "")
        for anchor in parsed.find_all("a", href=True)
        if "previous" not in anchor["href"] and page_link.search(anchor["href"])
    )
    return [link for link in candidates if "page-1.html" not in link]

In [5]:
def scrape_all_pages(base_url, session, sub_path="/catalogue/"):
    """Walk the paginated catalogue and collect every listing page reached.

    Starts at ``base_url`` (recorded as the empty string) and follows the
    pagination links returned by ``_collect_pages`` until none are left.

    Args:
        base_url: Root URL of the site.
        session: HTTP session forwarded to ``_request_page``.
        sub_path: Path inserted between ``base_url`` and a page name for
            every page except the start page.

    Returns:
        A set of the page names fetched successfully (``""`` stands for the
        start page). If an unexpected exception interrupts the walk, the
        pages collected so far are returned instead of ``None`` so callers
        can still iterate over the result.
    """
    # Created outside the ``try`` so the error path can still return it.
    visited_pages = set()
    try:
        to_visit = [""]
        while to_visit:
            current_page = to_visit.pop(0)
            if current_page in visited_pages:
                continue

            # The start page lives at the root; every other page is under sub_path.
            url_base = base_url if current_page == "" else base_url + sub_path
            register.info(f"Current page is: {current_page}")

            content = _request_page(
                base_url=url_base, session=session, concat_url=current_page
            )
            if content is None:
                # Failed pages are not marked as visited.
                continue

            visited_pages.add(current_page)

            for page in _collect_pages(content=content):
                if page not in visited_pages and page not in to_visit:
                    to_visit.append(page)

        register.info(f"Scraping completed. Pages visited: {len(visited_pages)}")

    except Exception as e:
        register.error(f"Exception in fuction 'scrape_all_pages': {e}")
        if session:
            register.info("Session close and finished.")
            session.close()

    return visited_pages

In [6]:
def scrape_all_address_books(base_url, visited_pages, session):
    """Collect the link of every book shown on the given listing pages.

    Args:
        base_url: Root URL of the site.
        visited_pages: Iterable of listing page names; a falsy name means
            the start page at ``base_url``. ``None`` is treated as empty.
        session: HTTP session forwarded to ``_request_page``.

    Returns:
        A set with the ``href`` of the first link found inside each element
        of class ``image_container``. If an unexpected exception interrupts
        the loop, the links gathered so far are returned instead of
        ``None``.
    """
    # Created outside the ``try`` so the error path can still return it.
    product_address = set()
    try:
        for url in visited_pages or ():
            content = _request_page(
                base_url=base_url,
                session=session,
                concat_url=f"/catalogue/{url}" if url else None,
            )
            if content is None:
                continue

            soup = BeautifulSoup(content, "html.parser")
            for product in soup.find_all(class_="image_container"):
                address = product.find("a", href=True)
                if address is None:
                    # A container without a link must not abort the whole crawl.
                    continue
                product_address.add(address["href"])

        register.info(
            f"Scraping completed. Books address collected: {len(product_address)}"
        )

    except Exception as e:
        # Single handler: the original second ``except Exception`` clause
        # could never run because this one already matches everything.
        register.error(f"Exception in function 'scrape_all_address_books': {e}")

    return product_address

# Step Two - Parse and transform information

In [7]:
def _transform_informations(book_title, price_of_book, qtd_stock, description_of_book):
    if not book_title or not price_of_book or not qtd_stock or not description_of_book:
        return "N/A", 0.0, 0, "N/A"

    title = book_title.get_text(strip=True).capitalize() if book_title else "N/A"
    price = (
        float(price_of_book.get_text(strip=True).replace("£", ""))
        if price_of_book
        else 0.0
    )
    stock = re.findall(r"\d+", qtd_stock.get_text(strip=True)) if qtd_stock else []
    stock = int(stock[0]) if stock else 0
    description = description_of_book[0].strip() if description_of_book else "N/A"

    return title, price, stock, description

In [8]:
def parser_books(base_url, product_address, session):
    """Download every book page concurrently and gather its fields.

    Args:
        base_url: Root URL of the site.
        product_address: Iterable of book links, with or without a
            ``catalogue/`` prefix. ``None`` is treated as empty.
        session: HTTP session forwarded to ``_request_page``.

    Returns:
        Dict with the keys ``"title"``, ``"price"``, ``"stock"`` and
        ``"description"``, each mapping to a list. Index ``i`` of every list
        belongs to the same book. Pages that could not be fetched or parsed
        are left out.
    """

    def parse_url(url):
        """Fetch and parse one book page; return its record or ``None``."""
        try:
            url = url.replace("catalogue/", "") if "catalogue" in url else url
            content = _request_page(
                base_url=base_url, session=session, concat_url=f"/catalogue/{url}"
            )
            if content is None:
                return None

            soup = BeautifulSoup(content, "html.parser")
            tree = html.fromstring(content)

            book_title = soup.find("h1")
            price_of_book = soup.find(class_="price_color")
            qtd_stock = soup.find("p", class_="instock availability")
            description_of_book = tree.xpath(
                '//*[@id="content_inner"]/article/p/text()'
            )

            title, price, stock, description = _transform_informations(
                book_title=book_title,
                price_of_book=price_of_book,
                qtd_stock=qtd_stock,
                description_of_book=description_of_book,
            )

            if title == "N/A" or price == 0.0 or stock == 0 or description == "N/A":
                register.warning(f"Warning: Invalid data for URL {url}")

            return title, price, stock, description

        except Exception as e:
            register.error(f"Exception in function 'parse_url' for URL {url}: {e}")
            return None

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Workers only return their record; the lists are filled below by the
        # calling thread. Appending to four lists from several threads could
        # interleave and leave the columns misaligned between books.
        records = [
            record
            for record in executor.map(parse_url, product_address or ())
            if record is not None
        ]

    information = {"title": [], "price": [], "stock": [], "description": []}
    for title, price, stock, description in records:
        information["title"].append(title)
        information["price"].append(price)
        information["stock"].append(stock)
        information["description"].append(description)

    return information

# Step Three - Connect to the database and insert the scraped information

In [9]:
def connection_to_database(database):
    """Open a SQLite connection and create a cursor on it.

    Args:
        database: Value passed to ``sqlite3.connect`` as ``database``.

    Returns:
        Tuple ``(connection, cursor)`` on success. When any exception is
        raised it is logged and ``None`` is returned.
    """
    try:
        connection = sqlite3.connect(database=database)
        db_cursor = connection.cursor()
        register.info("Connection to database OK!")
        return connection, db_cursor
    except Exception as error:
        register.error(f"Exception in fuction 'connection_to_database': {error}")
        return None

In [10]:
def create_table_for_books(cursor, con):
    """Create the ``books`` table if it does not exist yet and commit.

    Args:
        cursor: Cursor used to execute the ``CREATE TABLE`` statement.
        con: Connection on which ``commit()`` is called afterwards.

    Returns:
        None. Any exception is logged rather than propagated.
    """
    try:
        cursor.execute(
            """CREATE TABLE IF NOT EXISTS books (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            description TEXT,
            price REAL NOT NULL,
            availability INTEGER NOT NULL
        );"""
        )
        con.commit()

    except Exception as e:
        # The log previously named 'insert_values', which sent readers to the
        # wrong function.
        register.error(f"Exception in fuction 'create_table_for_books': {e}")

In [11]:
def insert_values(cursor, con, title, price, stock, description):
    """Insert one book into the ``books`` table and commit.

    Args:
        cursor: Cursor used to execute the ``INSERT`` statement.
        con: Connection on which ``commit()`` is called afterwards.
        title: Value for the ``title`` column.
        price: Value for the ``price`` column.
        stock: Value for the ``availability`` column.
        description: Value for the ``description`` column.

    Returns:
        None. Any exception is logged rather than propagated.
    """
    insert_sql = """INSERT INTO books(title,description,price,availability) VALUES(?,?,?,?)"""
    # Order follows the column list of the statement, not the parameter order.
    row = (title, description, price, stock)

    try:
        cursor.execute(insert_sql, row)
        con.commit()
        register.info(f"Insert values from book: {title} OK!")
    except Exception as error:
        register.error(f"Exception in fuction 'insert_values': {error}")

In [12]:
def output_db_csv(database, output_path="registers.csv"):
    """Export every row of the ``books`` table to a CSV file.

    Args:
        database: Path of the SQLite database to read.
        output_path: Destination CSV file; defaults to ``"registers.csv"``.

    Returns:
        None. Any exception is logged rather than propagated.
    """
    # Bound before the ``try`` so ``finally`` is safe even when the
    # connection attempt itself raises.
    con = None
    try:
        con = sqlite3.connect(database)
        query = """SELECT * FROM books"""
        df = pd.read_sql(query, con)
        df.to_csv(output_path, index=False)
    except Exception as e:
        register.error(f"Exception in fuction 'output_db_csv': {e}")
    finally:
        if con is not None:
            con.close()

In [13]:
# Root URL of the site to scrape; passed as `base_url` to the scraping functions.
URL = "https://books.toscrape.com"
# SQLite database file the scraped books are written to and exported from.
DATABASE = "database.sqlite3"

In [None]:
# Full pipeline: crawl the listing pages, collect the book links, parse each
# book, then store the rows in SQLite.
con = None  # bound up front so the ``finally`` clause never sees an undefined name
try:
    with requests.Session() as session:
        # ``or set()`` keeps the pipeline going if a step returns None.
        visited_pages = scrape_all_pages(base_url=URL, session=session) or set()
        product_address = (
            scrape_all_address_books(
                base_url=URL, visited_pages=visited_pages, session=session
            )
            or set()
        )
        information = parser_books(
            base_url=URL, product_address=product_address, session=session
        )

    num_books = len(information["title"])

    db_handles = connection_to_database(database=DATABASE)
    if db_handles is None:
        # connection_to_database logs the cause and returns None on failure.
        raise RuntimeError(f"Could not open database: {DATABASE}")
    con, cursor = db_handles
    create_table_for_books(con=con, cursor=cursor)

    # The four lists are parallel: position i in each one describes the same book.
    for title, price, stock, description in zip(
        information["title"],
        information["price"],
        information["stock"],
        information["description"],
    ):
        insert_values(
            cursor=cursor,
            con=con,
            title=title,
            price=price,
            stock=stock,
            description=description,
        )
finally:
    if con is not None:
        con.close()

In [17]:
# Dump the contents of the `books` table to a CSV file.
output_db_csv(database=DATABASE)