In [None]:
!pip install BeautifulSoup4
!pip install sqlalchemy psycopg2-binary pandas
!pip install requests


In [None]:
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
from bs4 import BeautifulSoup as soup
import json
import time
import requests
from bs4 import BeautifulSoup
import requests
import json
import time

In [None]:
def scrape_books(base_url="http://books.toscrape.com/", timeout=10, wait=1, save_json=True):
    """Collect the title and detail-page URL of every book in the catalogue.

    Starts at ``catalogue/page-1.html`` under ``base_url`` and follows the
    "next" pagination link until a page has none. Scraping stops at the first
    error, which is printed rather than raised; whatever was collected up to
    that point is still saved and returned.

    Args:
        base_url: Root URL of the site. A missing trailing slash is added.
        timeout: Timeout in seconds passed to ``urlopen`` for each page.
        wait: Seconds to sleep between two consecutive page requests.
        save_json: When true, write the result to ``libros_url.json`` in the
            current working directory.

    Returns:
        A list of ``{"titulo": ..., "url": ...}`` dicts in page order.
    """
    # Every URL below is built by concatenation, so the root must end in "/".
    if not base_url.endswith("/"):
        base_url += "/"
    catalogue_url = base_url + "catalogue/"

    url = catalogue_url + "page-1.html"
    libros = []

    while url:
        try:
            print(f"\nAccediendo a: {url}")

            # The context manager closes the connection even if read() fails.
            with urlopen(url, timeout=timeout) as uClient:
                page_html = uClient.read()

            page_soup = soup(page_html, "html.parser")

            bookshelf = page_soup.find_all("li", {"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

            for book in bookshelf:
                link = book.h3.a
                book_title = link["title"]
                # Any "../../../" prefix in the href is dropped so the link
                # can be appended directly to the catalogue root.
                book_url = catalogue_url + link["href"].replace("../../../", "")

                libros.append({
                    "titulo": book_title,
                    "url": book_url
                })

                # len(libros) is the running book number across all pages.
                print(f"{len(libros)}. {book_title}")

            next_button = page_soup.find("li", {"class": "next"})
            url = catalogue_url + next_button.a["href"] if next_button else None

            # No point in waiting once the last page has been processed.
            if url:
                time.sleep(wait)

        except HTTPError as e:
            print(f"Error HTTP {e.code} al acceder a {url}")
            break
        except URLError as e:
            print(f"Error de conexión: {e.reason}")
            break
        except Exception as e:
            print(f"Error inesperado: {e}")
            break

    if save_json:
        with open("libros_url.json", "w", encoding="utf-8") as f:
            json.dump(libros, f, ensure_ascii=False, indent=4)

    print(f"\nTotal de libros scrapeados: {len(libros)}")
    return libros


In [None]:
# Run the catalogue scraper; `libros` receives the returned list of
# {"titulo", "url"} dicts, one per book.
libros = scrape_books()

In [36]:
# Gather the detail-page URL of every scraped book and report how many there are.
url = [libro["url"] for libro in libros]
print(len(url))

1000


In [None]:

def obtener_datos_libro(url_libro, timeout=10):
    """Download one book detail page and extract its fields.

    Args:
        url_libro: URL of the book's detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys "Titulo", "Precio", "rating", "descripcion",
        "Genero", "Upc", "tipo_producto", "precio_excl_tax",
        "precio_incl_tax", "tax", "numero_stock" and "numero_reviews".
        "rating" is an int (0 when it cannot be determined); every other
        value is a string, empty when the element is missing from the page.

    Raises:
        requests.RequestException: On connection failure, timeout or an HTTP
            error status.
        AttributeError: If the page has no <h1> title element.
    """
    # Without a timeout a stalled server would block the whole run.
    response = requests.get(url_libro, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    # Local name chosen so it does not shadow the module-level `soup` alias.
    pagina = BeautifulSoup(response.content, 'html.parser')

    # Title
    titulo = pagina.h1.text.strip()

    # Price
    precio_tag = pagina.find('p', class_='price_color')
    precio = precio_tag.text.strip() if precio_tag is not None else ""

    # Rating: the word is read from the second CSS class of the tag.
    rating_map = {
        "One": 1,
        "Two": 2,
        "Three": 3,
        "Four": 4,
        "Five": 5
    }
    rating_tag = pagina.find('p', class_='star-rating')
    clases = rating_tag.get('class', []) if rating_tag is not None else []
    rating = rating_map.get(clases[1], 0) if len(clases) > 1 else 0

    # Description: the first <p> sibling after the #product_description element.
    descripcion_tag = pagina.select_one('#product_description ~ p')
    descripcion = descripcion_tag.text.strip() if descripcion_tag is not None else ""

    # Genre (third link in the breadcrumb)
    breadcrumb = pagina.select("ul.breadcrumb li a")
    genero = breadcrumb[2].text.strip() if len(breadcrumb) > 2 else ""

    # Additional product information table: header cell -> value cell
    tabla = pagina.find('table', class_='table table-striped')
    filas = tabla.find_all('tr') if tabla is not None else []
    datos = {
        fila.th.text.strip(): fila.td.text.strip()
        for fila in filas
        if fila.th is not None and fila.td is not None
    }

    # Dictionary with the extracted data
    libro_data = {
        "Titulo": titulo,
        "Precio": precio,
        "rating": rating,
        "descripcion": descripcion,
        "Genero": genero,
        "Upc": datos.get('UPC', ''),
        "tipo_producto": datos.get('Product Type', ''),
        "precio_excl_tax": datos.get('Price (excl. tax)', ''),
        "precio_incl_tax": datos.get('Price (incl. tax)', ''),
        "tax": datos.get('Tax', ''),
        # Keep only the digits of the availability text.
        "numero_stock": ''.join(filter(str.isdigit, datos.get('Availability', ''))),
        "numero_reviews": datos.get('Number of reviews', ''),
    }

    return libro_data

# Scrape every book; the results are written to disk once, at the end.
todos_los_libros = []
libros_fallidos = []
for i, libro in enumerate(libros, start=1):
    print(f"Libro scrapeado {i}")
    try:
        datos = obtener_datos_libro(libro["url"])
    except requests.RequestException as e:
        # One unreachable page must not throw away every book scraped so far.
        print(f"Error al scrapear {libro['url']}: {e}")
        libros_fallidos.append(libro["url"])
        continue
    todos_los_libros.append(datos)

# Save everything in a single JSON file
with open("libros.json", "w", encoding="utf-8") as f:
    json.dump(todos_los_libros, f, ensure_ascii=False, indent=4)

print(f"\n✅ Total de libros guardados: {len(todos_los_libros)}")
if libros_fallidos:
    print(f"Libros no scrapeados: {len(libros_fallidos)}")

In [None]:
def obtener_autores_google_books(archivo_in="libros.json", archivo_out="autores_libros.json"):
    """Look up the authors of each book through the Google Books volumes API.

    For every entry of the input JSON the API is queried with
    ``intitle:<Titulo>`` and the authors of the first returned volume are
    kept. A failed lookup is printed and recorded with an empty author list,
    so the output always has one entry per input book.

    Args:
        archivo_in: Path of a JSON file holding a list of dicts that each
            have a "Titulo" key.
        archivo_out: Path of the JSON file the results are written to.

    Returns:
        A list of ``{"titulo": ..., "autores": [...]}`` dicts, in input order.
    """
    api_url = "https://www.googleapis.com/books/v1/volumes"

    with open(archivo_in, "r", encoding="utf-8") as f:
        libros = json.load(f)

    autores_data = []

    for libro in libros:
        titulo = libro["Titulo"]
        autores = []

        try:
            # Passing the query through `params` lets requests percent-encode
            # it; a raw "#", "&" or "?" in a title would otherwise truncate or
            # corrupt the URL.
            resp = requests.get(api_url, params={"q": f"intitle:{titulo}"}, timeout=10)
            # Surface rate-limit and server errors instead of silently
            # recording the book as having no author.
            resp.raise_for_status()

            items = resp.json().get("items") or []
            if items:
                autores = items[0]["volumeInfo"].get("authors", [])

            print(f"{titulo} → {autores if autores else 'No encontrado'}")

        except Exception as e:
            print(f"Error con '{titulo}': {e}")
            autores = []
        finally:
            # Throttle after every request, including failed ones.
            time.sleep(0.5)  # anti-rate-limit

        autores_data.append({
            "titulo": titulo,
            "autores": autores
        })

    with open(archivo_out, "w", encoding="utf-8") as f:
        json.dump(autores_data, f, ensure_ascii=False, indent=4)

    return autores_data


# Usage: with the default arguments this reads libros.json and writes
# autores_libros.json.
obtener_autores_google_books()
