# Teste para obter base de dados com todos os livros do site
***Site:** [Books to scrape](https://books.toscrape.com/)*

## Imports

In [116]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
from IPython.display import Image, display
from tqdm import tqdm
from word2number import w2n

## Página inicial do site

In [2]:
url = "https://books.toscrape.com/"
response = requests.get(url)

if response.status_code == 200:
    html_content = response.text
    print('Página obtida com sucesso')
else:
    print(f"Erro ao acessar a página. Código de status: {response.status_code}")

Página obtida com sucesso


In [3]:
soup = BeautifulSoup(html_content, 'html.parser')

In [4]:
print(soup.title.string.strip())

All products | Books to Scrape - Sandbox


## Obter nomes e links das categorias de livros

In [90]:
categories_links = {}

categories_soup = soup.find_all("a")

for cat in categories_soup:
    attr = list(cat.attrs.values())[0]

    if attr.startswith("catalogue/category"):
        categories_links[cat.get_text(strip = True)] = attr.replace('index.html', '')

categories_links.pop('Books')
print(f"Quantidade de categorias = {len(categories_links)}")
categories_links

Quantidade de categorias = 50


{'Travel': 'catalogue/category/books/travel_2/',
 'Mystery': 'catalogue/category/books/mystery_3/',
 'Historical Fiction': 'catalogue/category/books/historical-fiction_4/',
 'Sequential Art': 'catalogue/category/books/sequential-art_5/',
 'Classics': 'catalogue/category/books/classics_6/',
 'Philosophy': 'catalogue/category/books/philosophy_7/',
 'Romance': 'catalogue/category/books/romance_8/',
 'Womens Fiction': 'catalogue/category/books/womens-fiction_9/',
 'Fiction': 'catalogue/category/books/fiction_10/',
 'Childrens': 'catalogue/category/books/childrens_11/',
 'Religion': 'catalogue/category/books/religion_12/',
 'Nonfiction': 'catalogue/category/books/nonfiction_13/',
 'Music': 'catalogue/category/books/music_14/',
 'Default': 'catalogue/category/books/default_15/',
 'Science Fiction': 'catalogue/category/books/science-fiction_16/',
 'Sports and Games': 'catalogue/category/books/sports-and-games_17/',
 'Add a comment': 'catalogue/category/books/add-a-comment_18/',
 'Fantasy': 'c

## Salvar as informações dos livros em um dataframe

In [118]:
base_livros = pd.DataFrame()

def get_books_attrs(url, cat):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    livros_soup = soup.find_all('article')
    dados = []

    for livro in livros_soup:
        title = livro.find('h3').find('a')['title']
        price = float(livro.find_all('p')[1].get_text().replace('Â£', ''))
        rating = int(w2n.word_to_num(livro.find('p')['class'][1].lower()))
        availability = livro.find_all('p')[2].get_text(strip=True)
        image = urljoin("https://books.toscrape.com/", livro.find('img')['src'])

        dados.append({
            'title': title,
            'price': price,
            'rating': rating,
            'availability': availability,
            'category': cat,
            'image': image
        })

    return pd.DataFrame(dados), soup

for cat, link in tqdm(categories_links.items()):
    # print(f'Categoria: {cat}')

    url_cat = url + link

    while True:
        tmp, soup_cat = get_books_attrs(url_cat, cat)
        base_livros = pd.concat([base_livros, tmp], ignore_index = True)

        next_button = soup_cat.find('li', class_ = 'next')

        if next_button:
            next_page = next_button.find('a')['href']
            url_cat = urljoin(url_cat, next_page)
        else:
            break

100%|██████████| 50/50 [01:04<00:00,  1.29s/it]


In [119]:
print(f"Quantidade de livros: {base_livros.shape[0]}")
print(base_livros.dtypes)
base_livros.head()

Quantidade de livros: 1000
title            object
price           float64
rating            int64
availability     object
category         object
image            object
dtype: object


Unnamed: 0,title,price,rating,availability,category,image
0,It's Only the Himalayas,45.17,2,In stock,Travel,https://books.toscrape.com/media/cache/27/a5/2...
1,Full Moon over Noahâs Ark: An Odyssey to Mou...,49.43,4,In stock,Travel,https://books.toscrape.com/media/cache/57/77/5...
2,See America: A Celebration of Our National Par...,48.87,3,In stock,Travel,https://books.toscrape.com/media/cache/9a/7e/9...
3,Vagabonding: An Uncommon Guide to the Art of L...,36.94,2,In stock,Travel,https://books.toscrape.com/media/cache/d5/bf/d...
4,Under the Tuscan Sun,37.33,3,In stock,Travel,https://books.toscrape.com/media/cache/98/c2/9...


In [123]:
base_livros['category'].apply(lambda x: len(x)).max()

np.int64(18)

In [120]:
base_livros.rating.value_counts()

rating
1    226
3    203
2    196
5    196
4    179
Name: count, dtype: int64

In [121]:
base_livros.category.value_counts()

category
Default               152
Nonfiction            110
Sequential Art         75
Add a comment          67
Fiction                65
Young Adult            54
Fantasy                48
Romance                35
Mystery                32
Food and Drink         30
Childrens              29
Historical Fiction     26
Classics               19
Poetry                 19
History                18
Womens Fiction         17
Horror                 17
Science Fiction        16
Science                14
Music                  13
Business               12
Travel                 11
Philosophy             11
Thriller               11
Humor                  10
Autobiography           9
Art                     8
Religion                7
Psychology              7
Christian Fiction       6
Spirituality            6
New Adult               6
Sports and Games        5
Biography               5
Self Help               5
Health                  4
Politics                3
Contemporary            3
Chr

## Exibindo as imagens

In [101]:
display(Image(url=base_livros.image[0]))