# Books to Scrape → CSV (Colab)
 - Coleta todas as páginas/promoções
 - Extrai: título, preço, rating (1-5), disponibilidade, categoria, URL da imagem
 - Salva em dados-books.csv (no diretório de trabalho atual)
 - Limpeza/validação de campos

In [1]:
import os
import re
import time
import math
import requests
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup

In [2]:
# Entry point of the site; the crawl starts from this URL.
BASE_URL = "https://books.toscrape.com/"
# Request headers sent with every GET: a desktop-browser User-Agent string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/122.0.0.0 Safari/537.36"
}

In [3]:
# --- Funções utilitárias -----------------------------------------------------

def get_soup(url: str) -> BeautifulSoup:
    """Download *url* and return it parsed as a BeautifulSoup document.

    The request is attempted up to three times, pausing 1.5 s between
    attempts; the error raised by the last attempt is propagated.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The page parsed with the built-in ``html.parser``.

    Raises:
        requests.RequestException: If every attempt fails (connection
            problem, timeout, or an HTTP error status).
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=HEADERS, timeout=20)
            response.raise_for_status()
        except requests.RequestException:
            if attempt == max_attempts:
                raise
            time.sleep(1.5)  # short, polite pause before retrying
            continue

        # When the server declares no charset, requests falls back to
        # ISO-8859-1 for text responses, which garbles UTF-8 pages
        # (e.g. "£" turns into "Â£"). In that case pass the raw bytes so
        # BeautifulSoup can detect the encoding from the document itself.
        content_type = response.headers.get("Content-Type", "")
        has_charset = "charset" in content_type.lower()
        markup = response.text if has_charset else response.content
        return BeautifulSoup(markup, "html.parser")

In [4]:
def parse_rating_to_int(tag) -> int:
    """
    Translate a star-rating element's CSS class into an integer from 1 to 5.

    Example: <p class="star-rating Three"> gives 3.

    The class names are compared case-insensitively. Returns None when
    ``tag`` is falsy or none of its classes is a known rating word.
    """
    if not tag:
        return None

    # Checked in ascending order, so the lowest matching word wins.
    word_values = (("one", 1), ("two", 2), ("three", 3), ("four", 4), ("five", 5))
    present = {css_class.lower() for css_class in tag.get("class", [])}
    return next((value for word, value in word_values if word in present), None)

In [5]:
def clean_price_to_float(price_text: str) -> float:
    """
    Extract the first number found in a price string.

    Examples: '£51.77' -> 51.77, '51,77' -> 51.77, '£1,234.56' -> 1234.56.

    Separator handling:
      * both '.' and ',' present: the right-most one is the decimal mark and
        the other is a thousands separator;
      * the same separator repeated ('1,234,567'): thousands separators;
      * a single separator: decimal mark.

    Args:
        price_text: Raw price text, possibly with a currency symbol.

    Returns:
        The parsed price, or NaN when the text is empty or holds no number.
    """
    if not price_text:
        return math.nan

    # Grab the first run of digits and separators, ignoring currency symbols.
    found = re.search(r"\d[\d.,]*", price_text)
    if not found:
        return math.nan

    # Trailing punctuation ("10, ...") is not part of the number.
    number = found.group(0).rstrip(".,")
    last_dot = number.rfind(".")
    last_comma = number.rfind(",")

    if last_dot != -1 and last_comma != -1:
        if last_dot > last_comma:
            number = number.replace(",", "")
        else:
            number = number.replace(".", "").replace(",", ".")
    elif number.count(",") > 1 or number.count(".") > 1:
        number = number.replace(",", "").replace(".", "")
    else:
        number = number.replace(",", ".")

    try:
        return float(number)
    except ValueError:
        # Separators in an arrangement that is not a valid number.
        return math.nan

In [6]:
def extract_category_from_detail(detail_url: str) -> str:
    """
    Fetch a book's detail page and read its category from the breadcrumb.

    The breadcrumb is expected to look like
    [Home] > [Books] > [Category] > [Book Title], so the category is the
    third <li> under ``.breadcrumb``. Returns None when the breadcrumb has
    fewer than three items.
    """
    crumbs = get_soup(detail_url).select(".breadcrumb li")
    if len(crumbs) < 3:
        return None
    return crumbs[2].get_text(strip=True)

In [7]:
# --- Loop principal de scraping ---------------------------------------------

def scrape_books() -> pd.DataFrame:
    """
    Crawl every catalogue page and collect one record per book.

    Starts at ``BASE_URL`` and follows the ``li.next a`` pagination link until
    a page has none. Title, price, rating, availability and image URL are read
    from the listing entry; the category is fetched from the book's detail
    page, which costs one extra request per book.

    Returns:
        A DataFrame with one row per book, built from records with the keys
        ``title``, ``price``, ``rating``, ``availability``, ``category``,
        ``image`` and ``detail_url``.
    """
    rows = []
    page_url = BASE_URL
    page_number = 1

    while True:
        page = get_soup(page_url)

        # Every book on a listing page is an <article class="product_pod">.
        pods = page.select("article.product_pod")
        print(f"Página {page_number:02d}: {len(pods)} livros")

        for pod in pods:
            # The <a> inside the <h3> carries both the full title and the
            # link to the detail page.
            heading = pod.find("h3")
            link = heading.find("a") if heading else None
            if link:
                title = link.get("title", "").strip()
                href = link.get("href")
            else:
                title = href = None
            detail_url = urljoin(page_url, href) if href else None

            price_node = pod.select_one(".price_color")
            raw_price = price_node.get_text(strip=True) if price_node else None

            # The rating is encoded as a CSS class on <p class="star-rating ...">.
            rating = parse_rating_to_int(pod.select_one("p.star-rating"))

            stock_node = pod.select_one(".availability")
            availability = (
                stock_node.get_text(" ", strip=True) if stock_node else None
            )

            image_node = pod.find("img")
            image_src = image_node.get("src") if image_node else None
            image_url = urljoin(page_url, image_src) if image_src else None

            # The category is only available on the detail page.
            category = extract_category_from_detail(detail_url) if detail_url else None

            rows.append(
                {
                    "title": title,
                    "price": clean_price_to_float(raw_price),
                    "rating": None if rating is None else int(rating),
                    "availability": availability,
                    "category": category,
                    "image": image_url,
                    "detail_url": detail_url,
                }
            )

            # Small pause between books to be gentle with the server.
            time.sleep(0.15)

        # Move on to the next listing page, if there is one.
        next_anchor = page.select_one("li.next a")
        if not next_anchor:
            break
        page_url = urljoin(page_url, next_anchor.get("href"))
        page_number += 1
        time.sleep(0.5)

    return pd.DataFrame(rows)

In [8]:
# --- Execution: run the scraper (cleaning, validation and export happen in later cells) ---

df_raw = scrape_books()

Página 01: 20 livros
Página 02: 20 livros
Página 03: 20 livros
Página 04: 20 livros
Página 05: 20 livros
Página 06: 20 livros
Página 07: 20 livros
Página 08: 20 livros
Página 09: 20 livros
Página 10: 20 livros
Página 11: 20 livros
Página 12: 20 livros
Página 13: 20 livros
Página 14: 20 livros
Página 15: 20 livros
Página 16: 20 livros
Página 17: 20 livros
Página 18: 20 livros
Página 19: 20 livros
Página 20: 20 livros
Página 21: 20 livros
Página 22: 20 livros
Página 23: 20 livros
Página 24: 20 livros
Página 25: 20 livros
Página 26: 20 livros
Página 27: 20 livros
Página 28: 20 livros
Página 29: 20 livros
Página 30: 20 livros
Página 31: 20 livros
Página 32: 20 livros
Página 33: 20 livros
Página 34: 20 livros
Página 35: 20 livros
Página 36: 20 livros
Página 37: 20 livros
Página 38: 20 livros
Página 39: 20 livros
Página 40: 20 livros
Página 41: 20 livros
Página 42: 20 livros
Página 43: 20 livros
Página 44: 20 livros
Página 45: 20 livros
Página 46: 20 livros
Página 47: 20 livros
Página 48: 20

In [9]:
# Preview the first 10 scraped rows.
df_raw.head(10)

Unnamed: 0,title,price,rating,availability,category,image,detail_url
0,A Light in the Attic,51.77,3,In stock,Poetry,https://books.toscrape.com/media/cache/2c/da/2...,https://books.toscrape.com/catalogue/a-light-i...
1,Tipping the Velvet,53.74,1,In stock,Historical Fiction,https://books.toscrape.com/media/cache/26/0c/2...,https://books.toscrape.com/catalogue/tipping-t...
2,Soumission,50.1,1,In stock,Fiction,https://books.toscrape.com/media/cache/3e/ef/3...,https://books.toscrape.com/catalogue/soumissio...
3,Sharp Objects,47.82,4,In stock,Mystery,https://books.toscrape.com/media/cache/32/51/3...,https://books.toscrape.com/catalogue/sharp-obj...
4,Sapiens: A Brief History of Humankind,54.23,5,In stock,History,https://books.toscrape.com/media/cache/be/a5/b...,https://books.toscrape.com/catalogue/sapiens-a...
5,The Requiem Red,22.65,1,In stock,Young Adult,https://books.toscrape.com/media/cache/68/33/6...,https://books.toscrape.com/catalogue/the-requi...
6,The Dirty Little Secrets of Getting Your Dream...,33.34,4,In stock,Business,https://books.toscrape.com/media/cache/92/27/9...,https://books.toscrape.com/catalogue/the-dirty...
7,The Coming Woman: A Novel Based on the Life of...,17.93,3,In stock,Default,https://books.toscrape.com/media/cache/3d/54/3...,https://books.toscrape.com/catalogue/the-comin...
8,The Boys in the Boat: Nine Americans and Their...,22.6,4,In stock,Default,https://books.toscrape.com/media/cache/66/88/6...,https://books.toscrape.com/catalogue/the-boys-...
9,The Black Maria,52.15,1,In stock,Poetry,https://books.toscrape.com/media/cache/58/46/5...,https://books.toscrape.com/catalogue/the-black...


In [10]:
# Final cleanup and extra validation: work on a copy so df_raw is left untouched.
df = df_raw.copy()

In [11]:
# Normalize text columns: trim surrounding whitespace and keep missing values
# as <NA>. The nullable "string" dtype is used instead of astype(str), which
# would turn None/NaN into the literal strings "None"/"nan" and would make a
# genuine value "None" indistinguishable from a missing one.
for col in ["title", "availability", "category", "image", "detail_url"]:
    if col in df.columns:
        df[col] = df[col].astype("string").str.strip()

In [12]:
# Enforce final dtypes: values that cannot be parsed as numbers become missing.
df["price"] = pd.to_numeric(df["price"], errors="coerce")
# Nullable Int64 keeps ratings as integers while still allowing missing values.
df["rating"] = pd.to_numeric(df["rating"], errors="coerce").astype("Int64")

In [13]:
# Validate the expected range: any rating outside 1-5 is replaced with <NA>.
df.loc[~df["rating"].between(1, 5, inclusive="both"), "rating"] = pd.NA

In [14]:
# Drop repeated books, identified by detail_url, keeping the first occurrence.
# Rows whose detail_url is missing are always kept: pandas treats missing
# values as equal to each other, so drop_duplicates() would silently collapse
# every such row into a single one.
if "detail_url" in df.columns:
    df = df[df["detail_url"].isna() | ~df["detail_url"].duplicated(keep="first")]

In [15]:
# Put the columns in the requested order (raises KeyError if any is missing).
final_cols = ["title", "price", "rating", "availability", "category", "image", "detail_url"]
df = df[final_cols]

In [16]:
# Quick summary: number of books kept and the first three rows.
print("Total de livros coletados:", len(df))
print(df.head(3))

Total de livros coletados: 1000
                  title  price  rating availability            category  \
0  A Light in the Attic  51.77       3     In stock              Poetry   
1    Tipping the Velvet  53.74       1     In stock  Historical Fiction   
2            Soumission  50.10       1     In stock             Fiction   

                                               image  \
0  https://books.toscrape.com/media/cache/2c/da/2...   
1  https://books.toscrape.com/media/cache/26/0c/2...   
2  https://books.toscrape.com/media/cache/3e/ef/3...   

                                          detail_url  
0  https://books.toscrape.com/catalogue/a-light-i...  
1  https://books.toscrape.com/catalogue/tipping-t...  
2  https://books.toscrape.com/catalogue/soumissio...  


In [None]:
# Print column dtypes and non-null counts of the cleaned table.
df.info()

In [18]:
# (Optional) basic statistics for exploration in Colab
print("\nResumo de preços:")
print(df["price"].describe())


Resumo de preços:
count    1000.00000
mean       35.07035
std        14.44669
min        10.00000
25%        22.10750
50%        35.98000
75%        47.45750
max        59.99000
Name: price, dtype: float64

Distribuição de ratings:
rating
1    226
2    196
3    203
4    179
5    196
Name: count, dtype: Int64


In [None]:
# Number of books per rating (missing ratings included), ordered by rating value.
print("\nDistribuição de ratings:")
print(df["rating"].value_counts(dropna=False).sort_index())

In [19]:
# Preview the first 15 rows of the cleaned table.
df.head(15)

Unnamed: 0,title,price,rating,availability,category,image,detail_url
0,A Light in the Attic,51.77,3,In stock,Poetry,https://books.toscrape.com/media/cache/2c/da/2...,https://books.toscrape.com/catalogue/a-light-i...
1,Tipping the Velvet,53.74,1,In stock,Historical Fiction,https://books.toscrape.com/media/cache/26/0c/2...,https://books.toscrape.com/catalogue/tipping-t...
2,Soumission,50.1,1,In stock,Fiction,https://books.toscrape.com/media/cache/3e/ef/3...,https://books.toscrape.com/catalogue/soumissio...
3,Sharp Objects,47.82,4,In stock,Mystery,https://books.toscrape.com/media/cache/32/51/3...,https://books.toscrape.com/catalogue/sharp-obj...
4,Sapiens: A Brief History of Humankind,54.23,5,In stock,History,https://books.toscrape.com/media/cache/be/a5/b...,https://books.toscrape.com/catalogue/sapiens-a...
5,The Requiem Red,22.65,1,In stock,Young Adult,https://books.toscrape.com/media/cache/68/33/6...,https://books.toscrape.com/catalogue/the-requi...
6,The Dirty Little Secrets of Getting Your Dream...,33.34,4,In stock,Business,https://books.toscrape.com/media/cache/92/27/9...,https://books.toscrape.com/catalogue/the-dirty...
7,The Coming Woman: A Novel Based on the Life of...,17.93,3,In stock,Default,https://books.toscrape.com/media/cache/3d/54/3...,https://books.toscrape.com/catalogue/the-comin...
8,The Boys in the Boat: Nine Americans and Their...,22.6,4,In stock,Default,https://books.toscrape.com/media/cache/66/88/6...,https://books.toscrape.com/catalogue/the-boys-...
9,The Black Maria,52.15,1,In stock,Poetry,https://books.toscrape.com/media/cache/58/46/5...,https://books.toscrape.com/catalogue/the-black...


In [28]:
# Output file name; a relative path, so it is resolved against the current working directory.
FINAL_CSV = "dados-books.csv"

In [29]:
# Export the cleaned table; index=False leaves the DataFrame index out of the file.
df.to_csv(FINAL_CSV, index=False)