<a href="https://colab.research.google.com/github/idowujames/web-scrapping-bookstoscrape/blob/main/bookstostcrape_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [8]:
def construct_page_url(base_url, relative_url):
    """
    Constructs the full URL for a page inside the site's ``catalogue`` section.

    Slashes at the joint are normalised, so a trailing slash on ``base_url``
    or a leading slash on ``relative_url`` does not produce a doubled ``//``
    in the result.

    Args:
        base_url (str): The base URL of the website, with or without a trailing slash.
        relative_url (str): The path of the page relative to ``<base_url>/catalogue/``.

    Returns:
        str: The full URL of the page.
    """
    site_root = base_url.rstrip('/')
    page_path = relative_url.lstrip('/')
    return f"{site_root}/catalogue/{page_path}"



def parse_url(url, timeout=10):
    """
    Sends a GET request to the given URL and returns the parsed HTML content as a BeautifulSoup object.

    Args:
        url (str): The URL to send the GET request to.
        timeout (float or tuple, optional): Seconds to wait for the server before
            giving up; forwarded to ``requests.get``. Defaults to 10.

    Returns:
        BeautifulSoup or None: The parsed HTML content, or ``None`` when the
        request failed (connection error, timeout, or an HTTP error status).
        The error is printed rather than raised.
    """
    try:
        # requests applies no timeout by default, so without one a single
        # unresponsive server would stall the whole scrape indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while parsing URL: {e}")
        return None

    return BeautifulSoup(response.content, 'html.parser')




def scrape_book_info_page(soup, site_url=None):
    """
    Extracts book information from a single book page.

    Args:
        soup (BeautifulSoup): The parsed HTML content of the book page.
        site_url (str, optional): Root URL of the website, used to build the
            absolute image link. When omitted, the notebook-level ``base_url``
            variable is used, which is what earlier versions of this function
            always did.

    Returns:
        Dict[str, str]: A dictionary with the book's 'Name', one entry per row
        of the product information table (keyed by the row's header text), and
        'Image' (the absolute URL of the cover image).

    Raises:
        AttributeError: If an expected element (title, product table, or
            gallery image) is missing from the page.
        NameError: If ``site_url`` is omitted and no ``base_url`` variable exists.
    """
    if site_url is None:
        # Legacy fallback for callers that still rely on the global variable.
        site_url = base_url

    product_info = {}
    product_info['Name'] = soup.find('div', {'class': 'col-sm-6 product_main'}).find('h1').text

    # Extract book details: each table row is a (header, value) pair
    for row in soup.find('table', {'class': 'table table-striped'}).find_all('tr'):
        product_info[row.find('th').text] = row.find('td').text

    # Extract image link
    image_path = soup.find('div', {'id': 'product_gallery'}).find('img').get('src')
    # str.strip('../') treats its argument as a set of characters and trims
    # them from BOTH ends, so it could also eat leading/trailing dots of the
    # path itself. Remove only the leading "../" segments instead.
    while image_path.startswith('../'):
        image_path = image_path[3:]
    image_path = image_path.lstrip('/')
    product_info['Image'] = f"{site_url.rstrip('/')}/{image_path}"

    return product_info



def scrape_books(base_url, start_page = 1):
    """
    Scrapes book information from the website, starting from the specified page number.

    Pages are visited in order until a page cannot be fetched, a page has no
    book listing, or a page has no "next" link in its pager.

    Args:
        base_url (str): The base URL of the website.
        start_page (int, optional): The starting page number for scraping. Defaults to 1.

    Returns:
        list: A list of dictionaries, where each dictionary contains the information of a book.
    """
    book_list = []
    page_num = start_page

    while True:
        next_page_url = f"page-{page_num}.html"
        page_url = construct_page_url(base_url, next_page_url)
        print(f"Scraping page: {next_page_url}")

        soup = parse_url(page_url)
        if not soup:
            break

        listing = soup.find('ol', {'class': 'row'})
        if listing is None:
            # A page without the book list cannot be scraped; stop here
            # instead of failing with an AttributeError on None.
            print(f"No book listing found on {next_page_url}; stopping.")
            break

        # Iterate through each book
        for book in listing.find_all('li'):
            # Get the URL of the book
            catalogue_url = book.find('a').get('href')
            book_url = construct_page_url(base_url, catalogue_url)

            # Send a request to get the book page content
            book_soup = parse_url(book_url)
            if not book_soup:
                continue

            # Pass base_url explicitly so image links are built from the same
            # site that is being scraped, not from a notebook-level global.
            book_list.append(scrape_book_info_page(book_soup, base_url))

        # Stop when there is no pager at all (single-page listing) or the
        # pager has no "next" link (last page).
        pager = soup.find('ul', class_='pager')
        if pager is None or pager.find('li', class_='next') is None:
            break

        page_num += 1

    return book_list



In [9]:
# Root of the site to scrape; the scraping helpers build every page URL from it.
base_url = 'https://books.toscrape.com'

# Begin at catalogue page 49 instead of the default first page.
books = scrape_books(base_url, start_page=49)

Scraping page: page-49.html
Scraping page: page-50.html


In [10]:
# Quick sanity check of the scrape: how many records came back, and what the
# most recently scraped record looks like.
book_count = len(books)
print("Number of books scraped:", book_count)

last_book = books[-1]
print(last_book)

Number of books scraped: 40
{'Name': '1,000 Places to See Before You Die', 'UPC': '228ba5e7577e1d49', 'Product Type': 'Books', 'Price (excl. tax)': '£26.08', 'Price (incl. tax)': '£26.08', 'Tax': '£0.00', 'Availability': 'In stock (1 available)', 'Number of reviews': '0', 'Image': 'https://books.toscrape.com/media/cache/9e/10/9e106f81f65b293e488718a4f54a6a3f.jpg'}


In [12]:
# One row per scraped book; columns come from the keys of each book dictionary.
df = pd.DataFrame(books)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               40 non-null     object
 1   UPC                40 non-null     object
 2   Product Type       40 non-null     object
 3   Price (excl. tax)  40 non-null     object
 4   Price (incl. tax)  40 non-null     object
 5   Tax                40 non-null     object
 6   Availability       40 non-null     object
 7   Number of reviews  40 non-null     object
 8   Image              40 non-null     object
dtypes: object(9)
memory usage: 2.9+ KB
None


Unnamed: 0,Name,UPC,Product Type,Price (excl. tax),Price (incl. tax),Tax,Availability,Number of reviews,Image
0,On the Road (Duluoz Legend),9ff997ab15713e53,Books,£32.36,£32.36,£0.00,In stock (1 available),0,https://books.toscrape.com/media/cache/7f/9b/7...
1,Old Records Never Die: One Man's Quest for His...,a1985630339f4b3d,Books,£55.66,£55.66,£0.00,In stock (1 available),0,https://books.toscrape.com/media/cache/aa/55/a...
2,Off Sides (Off #1),b6f1cfadc4dd79d4,Books,£39.45,£39.45,£0.00,In stock (1 available),0,https://books.toscrape.com/media/cache/4a/58/4...
3,Of Mice and Men,39592d9d72e717c4,Books,£47.11,£47.11,£0.00,In stock (1 available),0,https://books.toscrape.com/media/cache/a0/bb/a...
4,Myriad (Prentor #1),63e20a0f98218a87,Books,£58.75,£58.75,£0.00,In stock (1 available),0,https://books.toscrape.com/media/cache/48/7d/4...
