# Web Scraping lab

In this lab you will scrape this [website](https://books.toscrape.com/) of books.

You have to create a Pandas DataFrame with all the books listed on the site. Each row of the DataFrame should contain the information for one book. In particular, the DataFrame must contain:

* category
* title
* price
* stock availability
* star rating (number of stars)
* description
* UPC

Happy scraping!



# Server verification

Load the needed libraries, and make sure that you can obtain the correct status code.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Crawl the paginated catalogue, visit each book's detail page, and collect
# one record per book into a Pandas DataFrame.

# Step 1: Set up the URL for scraping
base_url = 'http://books.toscrape.com/catalogue/page-{}.html'  # Pagination URL
books_data = []

# The star rating is read from the second CSS class word of the rating
# element; this table converts that word to a number. It never changes, so it
# is built once instead of once per book.
star_mapping = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5
}

# Step 2: Loop through multiple pages
for page in range(1, 3):  # Change range for more pages
    url = base_url.format(page)
    response = requests.get(url)

    # Check the status code
    if response.status_code == 200:
        print(f"Successfully accessed page {page}.")
    else:
        print(f"Failed to retrieve the page {page}. Status code: {response.status_code}")
        continue  # Skip to the next iteration

    # Step 3: Parse the HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Step 4: Extract the required information
    for book in soup.select('.product_pod'):
        title = book.h3.a['title']  # Title of the book
        price = book.select_one('.price_color').get_text(strip=True)  # Price
        stock_availability = book.select_one('.instock.availability').get_text(strip=True)  # Stock
        star_rating = book.select_one('.star-rating')['class'][1]  # Extract star rating class

        # Extract the link to the book details page to get UPC and description
        book_link = book.h3.a['href']
        book_detail_url = f"http://books.toscrape.com/catalogue/{book_link}"
        detail_response = requests.get(book_detail_url)

        # Default to None so a failed request or a missing element still
        # yields a complete row.
        description = None
        UPC = None

        if detail_response.status_code == 200:
            detail_soup = BeautifulSoup(detail_response.content, 'html.parser')

            # select_one() returns None when nothing matches; check before
            # calling get_text() so one incomplete page cannot abort the crawl.
            description_tag = detail_soup.select_one('#product_description + p')
            if description_tag is not None:
                description = description_tag.get_text(strip=True)

            upc_tag = detail_soup.select_one('table tr:nth-of-type(1) td')
            if upc_tag is not None:
                UPC = upc_tag.get_text(strip=True)
        else:
            print(f"Failed to retrieve details for {title}. Status code: {detail_response.status_code}")

        # Convert star rating class to numeric rating (0 if unrecognised)
        star_rating_numeric = star_mapping.get(star_rating, 0)

        # Append book data to the list
        books_data.append({
            'title': title,
            'price': price,
            'stock availability': stock_availability,
            'star rating': star_rating_numeric,
            'description': description,
            'UPC': UPC
        })

# Step 5: Create a Pandas DataFrame
books_df = pd.DataFrame(books_data)

# Display the DataFrame
display(books_df)

# Book categories

Create the code to collect the **relative urls** from the left panel to obtain a list with all the book categories.

In [None]:
# Fetch the landing page and list every link found in the left-hand
# category panel, keeping the href exactly as written (relative URL).
url = 'http://books.toscrape.com/'
response = requests.get(url)

# Report whether the request succeeded
if response.status_code != 200:
    print(f"Failed to retrieve the main page. Status code: {response.status_code}")
else:
    print("Successfully accessed the main page.")

# Parse the page and pick the anchors inside the side panel
soup = BeautifulSoup(response.content, 'html.parser')
categories = soup.select('.side_categories ul li a')

# One {'name', 'url'} record per anchor
category_list = [
    {'name': anchor.get_text(strip=True), 'url': anchor['href']}
    for anchor in categories
]

# Show what was collected
for cat in category_list:
    print(f"Category: {cat['name']}, URL: {cat['url']}")

# Books in a given category

Use web scraping and a list comprehension to obtain the **absolute** URL of each book to be scraped.

In [None]:
# Build the absolute URL of every book shown on the landing page.
base_url = 'http://books.toscrape.com/'
url = base_url  # The landing page is the site root

response = requests.get(url)

# Report whether the request succeeded
if response.status_code != 200:
    print(f"Failed to retrieve the main page. Status code: {response.status_code}")
else:
    print("Successfully accessed the main page.")

soup = BeautifulSoup(response.content, 'html.parser')

# Each product card holds its link in <h3><a href=...>; prefix it with the
# site root to make it absolute.
book_links = [
    base_url + pod.h3.a['href']
    for pod in soup.select('.product_pod')
]

# Print one URL per line
for link in book_links:
    print(link)

# Book details

Create a Python function that given a book_url as an input returns a dictionary with the following structure:

```Python
{"Title": title, "Price": price, "Availability": availability, "Rating": rating, "Description": description, "UPC": upc}
```

where `description` should contain the book's summary given in the Product description, and the values are the book's associated information.

In [None]:
def get_book_info(book_url):
    """Scrape one book detail page and return its fields as a dictionary.

    Args:
        book_url: URL of the book detail page to download.

    Returns:
        A dict with the keys "Title", "Price", "Availability", "Rating",
        "Description" and "UPC", or None when the page does not answer with
        HTTP 200. "Rating" is the number of stars as an int, or 0 when the
        rating element is missing or its class word is not recognised. Any
        other field whose element is absent from the page is None.
    """
    # Step 1: Make a request to the book's URL
    response = requests.get(book_url)

    # Check the status code
    if response.status_code != 200:
        print(f"Failed to retrieve the book page. Status code: {response.status_code}")
        return None

    # Step 2: Parse the HTML content of the book page
    soup = BeautifulSoup(response.content, 'html.parser')

    def text_of(selector):
        # select_one() returns None when nothing matches; return None for the
        # field instead of raising AttributeError on a page that lacks it.
        node = soup.select_one(selector)
        return node.get_text(strip=True) if node is not None else None

    # Step 3: Extract the required information
    title = text_of('h1')  # Title
    price = text_of('.price_color')  # Price
    availability = text_of('.instock.availability')  # Availability

    # The rating word is the second CSS class of the rating element
    star_tag = soup.select_one('.star-rating')
    star_classes = star_tag.get('class', []) if star_tag is not None else []
    rating = star_classes[1] if len(star_classes) > 1 else None

    # Convert star rating class to numeric rating
    star_mapping = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5
    }
    rating_numeric = star_mapping.get(rating, 0)

    # Description (the paragraph right after #product_description) and UPC
    # (first row of the product table)
    description = text_of('#product_description + p')  # Description
    upc = text_of('table tr:nth-of-type(1) td')  # UPC

    # Step 4: Return the extracted information in a dictionary
    return {
        "Title": title,
        "Price": price,
        "Availability": availability,
        "Rating": rating_numeric,
        "Description": description,
        "UPC": upc
    }

# Demo: scrape a single book page and print the resulting dictionary.
# Replace the URL with any other valid book URL.
book_url = (
    'http://books.toscrape.com/catalogue/'
    'a-light-in-the-attic_1000/index.html'
)
book_info = get_book_info(book_url)
print(book_info)

# Collect and store all the information from the books in a Pandas DataFrame

Start with the following dictionary:

```python
books_dict = {"Title": [], "Price": [], "Availability": [], "Rating": [], "Description": [], "UPC": [], "Category": [] }
```

Then, iterate over all the categories and all the books in a given category to collect any book information using the previous function. Fill the previous dictionary with the information about each book.

Show the first five rows of the previous final Pandas DataFrame.

Tip: You can use the function `tqdm` from the library `tqdm` to show a progress bar over the iterable of a for loop, as shown below :wink: :

```python
from tqdm import tqdm

for elem in tqdm(iterable):
    # some code
```





In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

def get_book_info(book_url):
    """Fetch book details from the book page.

    Args:
        book_url: URL of the book detail page to download.

    Returns:
        A dict with the keys "Title", "Price", "Availability", "Rating",
        "Description" and "UPC", or None when the page does not answer with
        HTTP 200. "Rating" is the number of stars as an int, or 0 when the
        rating element is missing or its class word is not recognised. Any
        other field whose element is absent from the page is None.
    """
    response = requests.get(book_url)
    if response.status_code != 200:
        print(f"Failed to retrieve the book page. Status code: {response.status_code} for URL: {book_url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    def text_of(selector):
        # select_one() returns None when nothing matches; return None for the
        # field instead of raising AttributeError on a page that lacks it.
        node = soup.select_one(selector)
        return node.get_text(strip=True) if node is not None else None

    title = text_of('h1')  # Title
    price = text_of('.price_color')  # Price
    availability = text_of('.instock.availability')  # Availability

    # The rating word is the second CSS class of the rating element
    star_tag = soup.select_one('.star-rating')
    star_classes = star_tag.get('class', []) if star_tag is not None else []
    rating = star_classes[1] if len(star_classes) > 1 else None
    star_mapping = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    rating_numeric = star_mapping.get(rating, 0)

    description = text_of('#product_description + p')  # Description
    upc = text_of('table tr:nth-of-type(1) td')  # UPC

    return {
        "Title": title,
        "Price": price,
        "Availability": availability,
        "Rating": rating_numeric,
        "Description": description,
        "UPC": upc
    }

# Crawl every category (following pagination), scrape each book with
# get_book_info, and assemble the results into a Pandas DataFrame.
from urllib.parse import urljoin  # resolves relative hrefs against a page URL

# Step 1: Collect all category URLs
base_url = 'http://books.toscrape.com/'
response = requests.get(base_url)

if response.status_code != 200:
    print(f"Failed to retrieve the main page. Status code: {response.status_code}")
else:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract category links from the left panel. Only the links of the
    # nested list are selected so that the parent "Books" entry, which lists
    # every book on the site, is not crawled as if it were a category (that
    # would collect each book twice).
    # NOTE(review): relies on the side panel being a <ul> nested inside the
    # parent entry's <li> - confirm against the live markup.
    categories = soup.select('.side_categories ul li ul li a')
    category_links = {
        cat.get_text(strip=True): urljoin(base_url, cat['href'])
        for cat in categories
        if cat['href'] != '#'
    }

    # Step 2: Initialize books_dict
    books_dict = {
        "Title": [],
        "Price": [],
        "Availability": [],
        "Rating": [],
        "Description": [],
        "UPC": [],
        "Category": []
    }
    # Fields copied verbatim from the dictionary returned by get_book_info
    book_fields = ("Title", "Price", "Availability", "Rating", "Description", "UPC")

    # Step 3: Iterate over each category and collect book information
    for category_name, category_url in tqdm(category_links.items(), desc="Processing categories"):
        page_url = category_url
        while page_url:  # Loop to handle pagination
            category_response = requests.get(page_url)
            if category_response.status_code != 200:
                print(f"Failed to retrieve the category page. Status code: {category_response.status_code} for URL: {page_url}")
                break

            category_soup = BeautifulSoup(category_response.content, 'html.parser')

            # Links found on a category page are relative to that page, so
            # they are resolved against the page's own URL rather than
            # appended to the site root.
            book_links = [
                urljoin(page_url, book.h3.a['href'])
                for book in category_soup.select('.product_pod')
            ]

            if not book_links:
                print(f"No books found in category: {category_name}")
                break

            # Use tqdm to show a progress bar for book processing
            for book_link in tqdm(book_links, desc=f"Processing books in {category_name}", leave=False):
                try:
                    book_info = get_book_info(book_link)
                except (AttributeError, TypeError) as error:
                    # Raised when a page lacks an element the parser expects;
                    # skip that book instead of aborting the whole crawl.
                    print(f"Could not parse the book page {book_link}: {error}")
                    continue

                if book_info:
                    # Fill the dictionary with book information
                    for field in book_fields:
                        books_dict[field].append(book_info[field])
                    books_dict["Category"].append(category_name)

            # Check for the next page; its href is relative to the current page
            next_page = category_soup.select_one('.next a')
            page_url = urljoin(page_url, next_page['href']) if next_page else None

    # Step 4: Create a Pandas DataFrame from the dictionary
    books_df = pd.DataFrame(books_dict)

    # Step 5: Display the first five rows of the DataFrame
    print(books_df.head())