## Dependencies

In [18]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
   ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/250.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/250.9 kB ? eta -:--:--
   ------ -------------------------------- 41.0/250.9 kB 245.8 kB/s eta 0:00:01
   -------------- ------------------------ 92.2/250.9 kB 476.3 kB/s eta 0:00:01
   ---------------------------------------  245.8/250.9 kB 1.1 MB/s eta 0:00:01
   ---------------------------------------  245.8/250.9 kB 1.1 MB/s eta 0:00:01
   -------------------------------------- 250.9/250.9 kB 768.9 kB/s eta 0:00:00
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing 


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\irmin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
import csv
import json
import os
import random
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

## 1) Pagination and Scraping Multiple Pages

In [3]:
# URL template for the paginated catalogue; the `{}` placeholder is filled with
# the page number via `base_url.format(page)`.
base_url = "http://books.toscrape.com/catalogue/category/books_1/page-{}.html"

In [8]:
# URL template for the paginated catalogue; `{}` is replaced by the page number.
base_url = "http://books.toscrape.com/catalogue/category/books_1/page-{}.html"
product_list = []

# Loop through the first 3 pages
for page in range(1, 4):
    url = base_url.format(page)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page as "no products".
    response.raise_for_status()
    # Parse the raw bytes so BeautifulSoup detects the document's charset itself.
    # When the server sends no charset, `response.text` falls back to ISO-8859-1
    # and non-ASCII characters (e.g. the currency sign in prices) can be garbled.
    soup = BeautifulSoup(response.content, "html.parser")
    products = soup.select("article.product_pod")

    for product in products:
        title = product.find("h3").find("a")["title"]
        price = product.find("p", class_="price_color").get_text()
        image_rel = product.find("div", class_="image_container").find("img")["src"]
        # Resolve the relative image path against the page URL, as a browser would.
        image_url = urljoin(url, image_rel)
        product_list.append({
            "title": title,
            "price": price,
            "image_url": image_url
        })

    # Brief pause between pages to simulate real browsing
    time.sleep(1)
    print(f"Page {page} processed.")

Page 1 processed.
Page 2 processed.
Page 3 processed.


In [9]:
# open() does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
os.makedirs("resultados", exist_ok=True)

# newline="" lets the csv module control line endings (avoids blank rows on Windows).
with open("resultados/productos_multi.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "price", "image_url"])
    writer.writeheader()
    writer.writerows(product_list)

print(f"Multi-page scraping completed: {len(product_list)} products saved to productos_multi.csv")

Multi-page scraping completed: 60 products saved to productos_multi.csv


## 2) Handling Errors and Common Exceptions

In [10]:
product_list = []

# Pages 47-52: the output below shows 51 and 52 returning 404, which exercises
# the error-handling path.
for page in range(47, 53):  # Test with 6 pages
    url = base_url.format(page)
    try:
        # A timeout keeps a stalled connection from hanging the notebook; a
        # timeout error is a RequestException and is handled below.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raises HTTPError for 4xx or 5xx status codes
        # Parse the raw bytes so BeautifulSoup detects the document's charset itself.
        # When the server sends no charset, `response.text` falls back to ISO-8859-1
        # and non-ASCII characters (e.g. the currency sign in prices) can be garbled.
        soup = BeautifulSoup(response.content, "html.parser")
        products = soup.select("article.product_pod")
    except requests.RequestException as e:
        print(f"Error on page {page}: {e}")
        continue  # Continue with next iteration

    for product in products:
        # Best-effort extraction: one malformed product must not abort the page.
        try:
            title = product.find("h3").find("a")["title"]
            price = product.find("p", class_="price_color").get_text()
            image_rel = product.find("div", class_="image_container").find("img")["src"]
            # Resolve the relative image path against the page URL, as a browser would.
            image_url = urljoin(url, image_rel)
            product_list.append({
                "title": title,
                "price": price,
                "image_url": image_url
            })
        except Exception as ex:
            print("Error extracting data from a product:", ex)
    time.sleep(1)
    print(f"Page {page} processed.")

Page 47 processed.
Page 48 processed.
Page 49 processed.
Page 50 processed.
Error on page 51: 404 Client Error: Not Found for url: http://books.toscrape.com/catalogue/category/books_1/page-51.html
Error on page 52: 404 Client Error: Not Found for url: http://books.toscrape.com/catalogue/category/books_1/page-52.html


In [11]:
# open() does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
os.makedirs("resultados", exist_ok=True)

# newline="" lets the csv module control line endings (avoids blank rows on Windows).
with open("resultados/productos_con_errores.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "price", "image_url"])
    writer.writeheader()
    writer.writerows(product_list)

print(f"Scraping completed with error handling: {len(product_list)} products saved to productos_con_errores.csv")

Scraping completed with error handling: 80 products saved to productos_con_errores.csv


## 3) Best Practices: Headers, Timing, and Scraping Ethics


### 📜 What is robots.txt?

`robots.txt` is a file that websites place at their root (e.g. https://site.com/robots.txt) to indicate which parts of the site bots may or may not crawl. It is not a "law": nothing technically stops a bot from ignoring it, but respecting it is an accepted ethical norm.

In [12]:
# Define a header
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Mobile Safari/537.36"
}

In [13]:
# URL template for the paginated catalogue; the `{}` placeholder is filled with
# the page number via `base_url.format(page)`.
base_url = "http://books.toscrape.com/catalogue/category/books_1/page-{}.html"

In [14]:
product_list = []

for page in range(1, 4):
    url = base_url.format(page)
    try:
        # A timeout keeps a stalled connection from hanging the notebook; a
        # timeout error is a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Parse the raw bytes so BeautifulSoup detects the document's charset itself.
        # When the server sends no charset, `response.text` falls back to ISO-8859-1
        # and non-ASCII characters (e.g. the currency sign in prices) can be garbled.
        soup = BeautifulSoup(response.content, "html.parser")
        products = soup.select("article.product_pod")
    except requests.RequestException as e:
        print(f"Error on page {page}: {e}")
        continue

    for product in products:
        # Best-effort extraction: one malformed product must not abort the page.
        try:
            title = product.find("h3").find("a")["title"]
            price = product.find("p", class_="price_color").get_text()
            image_rel = product.find("div", class_="image_container").find("img")["src"]
            # Resolve the relative image path against the page URL, as a browser would.
            image_url = urljoin(url, image_rel)
            product_list.append({
                "title": title,
                "price": price,
                "image_url": image_url
            })
        except Exception as ex:
            print("Error extracting data from a product:", ex)

    # Random pause to mimic human behavior
    sleep_time = random.uniform(1, 3)
    time.sleep(sleep_time)
    print(f"Page {page} processed with a pause of {sleep_time:.2f} seconds.")

Page 1 processed with a pause of 2.76 seconds.
Page 2 processed with a pause of 1.44 seconds.
Page 3 processed with a pause of 2.98 seconds.


### Save as CSV


In [15]:
# open() does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
os.makedirs("resultados", exist_ok=True)

# newline="" lets the csv module control line endings (avoids blank rows on Windows).
with open("resultados/productos_eticos.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "price", "image_url"])
    writer.writeheader()
    writer.writerows(product_list)

print(f"Ethical scraping completed: {len(product_list)} products saved to productos_eticos.csv")

Ethical scraping completed: 60 products saved to productos_eticos.csv


### Save as JSON


In [16]:
# open() does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
os.makedirs("resultados", exist_ok=True)

with open("resultados/productos_final.json", "w", encoding="utf-8") as jsonfile:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
    json.dump(product_list, jsonfile, indent=4, ensure_ascii=False)

print(f"Data exported: {len(product_list)} products in productos_final.json")

Data exported: 60 products in productos_final.json


### Save as Excel


In [19]:
# Requires pandas plus openpyxl (installed in the Dependencies section); pandas
# uses openpyxl as its writer for .xlsx files.
import pandas as pd

# to_excel() does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs("resultados", exist_ok=True)

# Convert the list of product dicts to a table: one row per product
df = pd.DataFrame(product_list)

# Save as Excel file (index=False omits the row-number column)
df.to_excel("resultados/productos_eticos.xlsx", index=False)

print(f"Ethical scraping completed: {len(product_list)} products saved to productos_eticos.xlsx")

Ethical scraping completed: 60 products saved to productos_eticos.xlsx


### Save to Google Form


In [20]:
import requests
import time

### Example of form URL structure:
https://docs.google.com/forms/d/e/1FAIpQLScFALHeZ6y-CJI_vy3f_78MFyNNGm4jz9ZZoLykEPbCpVdOrQ/viewform?usp=pp_url&entry.1204702772=pinocho&entry.464991896=1500&entry.406922421=www.pinocho.com

In [22]:
# Form URL
url = "https://docs.google.com/forms/d/e/1FAIpQLScFALHeZ6y-CJI_vy3f_78MFyNNGm4jz9ZZoLykEPbCpVdOrQ/formResponse"

# Headers to avoid 401 error
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Referer": "https://docs.google.com/forms/d/e/1FAIpQLScFALHeZ6y-CJI_vy3f_78MFyNNGm4jz9ZZoLykEPbCpVdOrQ/viewform"
}

# Loop through and submit each product
for i, product in enumerate(product_list[0:5], start=1):
    payload = {
        "entry.1204702772": product["title"],        # field 1: title
        "entry.464991896": product["price"],        # field 2: price
        "entry.406922421": product["image_url"]     # field 3: image
    }

    response = requests.post(url, data=payload, headers=headers)

    if response.status_code == 200:
        print(f"‚úÖ Product {i} sent: {product['title']}")
    else:
        print(f"‚ùå Error sending product {i} - Status code: {response.status_code}")
    
    time.sleep(1)

‚úÖ Product 1 sent: A Light in the Attic
‚úÖ Product 2 sent: Tipping the Velvet
‚úÖ Product 3 sent: Soumission
‚úÖ Product 4 sent: Sharp Objects
‚úÖ Product 5 sent: Sapiens: A Brief History of Humankind
