# Teste para obter base de dados com todos os livros do site
***Site:** [Books to scrape](https://books.toscrape.com/)*

## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
from IPython.display import Image, display
from tqdm import tqdm
from word2number import w2n

## Página inicial do site

In [2]:
url = "https://books.toscrape.com/"
response = requests.get(url)

if response.status_code == 200:
    html_content = response.text
    print('Página obtida com sucesso')
else:
    print(f"Erro ao acessar a página. Código de status: {response.status_code}")

Página obtida com sucesso


In [3]:
soup = BeautifulSoup(html_content, 'html.parser')

In [4]:
print(soup.title.string.strip())

All products | Books to Scrape - Sandbox


## Obter nomes e links das categorias de livros

In [None]:
categories_links = {}

categories_soup = soup.find_all("a")

for cat in categories_soup:
    attr = list(cat.attrs.values())[0]

    if attr.startswith("catalogue/category"):
        categories_links[cat.get_text(strip = True)] = attr.replace('index.html', '')

categories_links.pop('Books')
print(f"Quantidade de categorias = {len(categories_links)}")

Quantidade de categorias = 50


## Salvar as informações dos livros em um dataframe

In [None]:
base_livros = pd.DataFrame()

def get_books_attrs(url, cat):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    livros_soup = soup.find_all('article')
    dados = []

    for livro in livros_soup:
        title = livro.find('h3').find('a')['title']
        price = float(livro.find_all('p')[1].get_text().replace('Â£', ''))
        rating = int(w2n.word_to_num(livro.find('p')['class'][1].lower()))
        availability = livro.find_all('p')[2].get_text(strip=True)
        image = urljoin("https://books.toscrape.com/", livro.find('img')['src'])

        dados.append({
            'title': title,
            'price': price,
            'rating': rating,
            'availability': availability,
            'category': cat,
            'image': image
        })

    return pd.DataFrame(dados), soup

for cat, link in tqdm(categories_links.items()):

    url_cat = url + link

    while True:
        tmp, soup_cat = get_books_attrs(url_cat, cat)
        base_livros = pd.concat([base_livros, tmp], ignore_index = True)

        next_button = soup_cat.find('li', class_ = 'next')

        if next_button:
            next_page = next_button.find('a')['href']
            url_cat = urljoin(url_cat, next_page)
        else:
            break

100%|██████████| 50/50 [01:22<00:00,  1.65s/it]


In [7]:
print(f"Quantidade de livros: {base_livros.shape[0]}")
print(base_livros.dtypes)
base_livros.head()

Quantidade de livros: 1000
title            object
price           float64
rating            int64
availability     object
category         object
image            object
dtype: object


Unnamed: 0,title,price,rating,availability,category,image
0,It's Only the Himalayas,45.17,2,In stock,Travel,https://books.toscrape.com/media/cache/27/a5/2...
1,Full Moon over Noahâs Ark: An Odyssey to Mou...,49.43,4,In stock,Travel,https://books.toscrape.com/media/cache/57/77/5...
2,See America: A Celebration of Our National Par...,48.87,3,In stock,Travel,https://books.toscrape.com/media/cache/9a/7e/9...
3,Vagabonding: An Uncommon Guide to the Art of L...,36.94,2,In stock,Travel,https://books.toscrape.com/media/cache/d5/bf/d...
4,Under the Tuscan Sun,37.33,3,In stock,Travel,https://books.toscrape.com/media/cache/98/c2/9...


## Exibindo as imagens

In [101]:
display(Image(url=base_livros.image[0]))

## Salvando a base de dados em arquivo .csv

In [10]:
base_livros.to_csv('../../data/base_livros.csv', index = False)

In [11]:
pd.read_csv('../../data/base_livros.csv')

Unnamed: 0,title,price,rating,availability,category,image
0,It's Only the Himalayas,45.17,2,In stock,Travel,https://books.toscrape.com/media/cache/27/a5/2...
1,Full Moon over Noahâs Ark: An Odyssey to Mou...,49.43,4,In stock,Travel,https://books.toscrape.com/media/cache/57/77/5...
2,See America: A Celebration of Our National Par...,48.87,3,In stock,Travel,https://books.toscrape.com/media/cache/9a/7e/9...
3,Vagabonding: An Uncommon Guide to the Art of L...,36.94,2,In stock,Travel,https://books.toscrape.com/media/cache/d5/bf/d...
4,Under the Tuscan Sun,37.33,3,In stock,Travel,https://books.toscrape.com/media/cache/98/c2/9...
...,...,...,...,...,...,...
995,Why the Right Went Wrong: Conservatism--From G...,52.65,4,In stock,Politics,https://books.toscrape.com/media/cache/db/1b/d...
996,Equal Is Unfair: America's Misguided Fight Aga...,56.86,1,In stock,Politics,https://books.toscrape.com/media/cache/00/11/0...
997,Amid the Chaos,36.58,1,In stock,Cultural,https://books.toscrape.com/media/cache/52/46/5...
998,Dark Notes,19.19,5,In stock,Erotica,https://books.toscrape.com/media/cache/6e/4e/6...
