In [1]:
import requests
from bs4 import BeautifulSoup

import csv
import json

import time
import random

In [2]:
base_url = "http://books.toscrape.com/catalogue/category/books_1/page-{}.html"

In [3]:
product_list = []

for page in range(1, 4):
    url = base_url.format(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    products = soup.select('article.product_pod')

    for product in products:
        title = product.h3.a['title']
        price = product.select_one('p.price_color').text
        availability = product.select_one('p.availability').text.strip()
        rating = product.p['class'][1]
        image_rel = product.find("div", class_="image_container").find("img")["src"]
        image_url = "http://books.toscrape.com/" + image_rel.replace("../", "")

        product_list.append({
            "title": title,
            "price": price,
            "availability": availability,
            "rating": rating,
            "image_url": image_url
        })

    time.sleep(1)
    print(f"Processed page {page}")

Processed page 1
Processed page 2
Processed page 3


In [4]:
product_list[0]

{'title': 'A Light in the Attic',
 'price': 'Â£51.77',
 'availability': 'In stock',
 'rating': 'Three',
 'image_url': 'http://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg'}

In [5]:
from utils.paths import DATA_RAW_DIR
path_csv = str(DATA_RAW_DIR / "products_multipage.csv")
print(type(path_csv))

<class 'str'>


In [7]:
with open(path_csv, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['title', 'price', 'availability', 'rating', 'image_url']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()
    for product in product_list:
        writer.writerow(product)
        
print(f"{len(product_list)} products saved to {path_csv}")

60 products saved to /Users/jasonssdev/Dev/Learning/Platzi/platzi-web-scraping-python/data/raw/products_multipage.csv


## 2) Manejando errores y excepciones comunes

In [8]:
product_list = []

for page in range(47, 53):  
    url = base_url.format(page)
    try:
        response = requests.get(url)
        response.raise_for_status()  
        soup = BeautifulSoup(response.text, "html.parser")
        products = soup.select("article.product_pod")
    except requests.RequestException as e:
        print(f"Wrong in page {page}: {e}")
        continue 

    for product in products:
        try:
            title = product.h3.a['title']
            price = product.select_one('p.price_color').text
            availability = product.select_one('p.availability').text.strip()
            rating = product.p['class'][1]
            image_rel = product.find("div", class_="image_container").find("img")["src"]
            image_url = "http://books.toscrape.com/" + image_rel.replace("../", "")

            product_list.append({
                "title": title,
                "price": price,
                "availability": availability,
                "rating": rating,
                "image_url": image_url
                })
        except Exception as ex:
            print("Data extraction error:", ex)
    time.sleep(1)
    print(f"Page {page} processed.")

Page 47 processed.
Page 48 processed.
Page 49 processed.
Page 50 processed.
Wrong in page 51: 404 Client Error: Not Found for url: http://books.toscrape.com/catalogue/category/books_1/page-51.html
Wrong in page 52: 404 Client Error: Not Found for url: http://books.toscrape.com/catalogue/category/books_1/page-52.html


In [9]:
product_errors = str(DATA_RAW_DIR / "products_with_errors.csv")
print(type(product_errors))

<class 'str'>


In [10]:
with open(product_errors, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ['title', 'price', 'availability', 'rating', 'image_url']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()
    writer.writerows(product_list)

print(f"{len(product_list)} products with errors saved to {product_errors}")

80 products with errors saved to /Users/jasonssdev/Dev/Learning/Platzi/platzi-web-scraping-python/data/raw/products_with_errors.csv


## 3) Buenas prácticas: headers, tiempos y ética del scraping

In [11]:
header = {
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"
}

In [12]:
base_url = "http://books.toscrape.com/catalogue/category/books_1/page-{}.html"

In [13]:
product_list = []

for page in range(1, 4):  
    url = base_url.format(page)
    try:
        response = requests.get(url)
        response.raise_for_status()  
        soup = BeautifulSoup(response.text, "html.parser")
        products = soup.select("article.product_pod")
    except requests.RequestException as e:
        print(f"Wrong in page {page}: {e}")
        continue 

    for product in products:
        try:
            title = product.h3.a['title']
            price = product.select_one('p.price_color').text
            availability = product.select_one('p.availability').text.strip()
            rating = product.p['class'][1]
            image_rel = product.find("div", class_="image_container").find("img")["src"]
            image_url = "http://books.toscrape.com/" + image_rel.replace("../", "")

            product_list.append({
                "title": title,
                "price": price,
                "availability": availability,
                "rating": rating,
                "image_url": image_url
                })
        except Exception as ex:
            print("Data extraction error:", ex)

    sleep_time = random.uniform(1, 3)
    time.sleep(sleep_time)
    print(f"Page {page} processed with a sleep time of {sleep_time:.2f} seconds.")

Page 1 processed with a sleep time of 1.93 seconds.
Page 2 processed with a sleep time of 2.14 seconds.
Page 3 processed with a sleep time of 1.17 seconds.


## 4) Guardar

In [14]:
csv_file = str(DATA_RAW_DIR / "products_csv_file.csv")

with open(csv_file, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ['title', 'price', 'availability', 'rating', 'image_url']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()
    writer.writerows(product_list)

print(f"{len(product_list)} products saved to {csv_file}")


60 products saved to /Users/jasonssdev/Dev/Learning/Platzi/platzi-web-scraping-python/data/raw/products_csv_file.csv


In [17]:
json_file = str(DATA_RAW_DIR / "products_json_file.json")

with open(json_file, "w", newline="", encoding="utf-8") as jsonfile:
    json.dump(product_list, jsonfile, indent=4, ensure_ascii=False)

print(f"{len(product_list)} products saved to {json_file}")


60 products saved to /Users/jasonssdev/Dev/Learning/Platzi/platzi-web-scraping-python/data/raw/products_json_file.json


In [18]:
import pandas as pd

df = pd.DataFrame(product_list)

df.to_excel(str(DATA_RAW_DIR / "products_excel_file.xlsx"), index=False)

print(f"{len(product_list)} products saved to Excel file.")

60 products saved to Excel file.
