In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## **Scraping the Website**

In [9]:
def construct_page_url(base_url, relative_url):
    """
    Builds the absolute URL of a page that lives under the site's ``/catalogue/`` path.

    A trailing slash on ``base_url`` and a leading slash on ``relative_url`` are
    tolerated, so the result never contains a doubled ``//`` path separator.

    Args:
        base_url (str): The base URL of the website, with or without a trailing slash.
        relative_url (str): The URL of the page relative to ``/catalogue/``.

    Returns:
        str: The full URL, in the form ``<base_url>/catalogue/<relative_url>``.
    """
    return f"{base_url.rstrip('/')}/catalogue/{relative_url.lstrip('/')}"



def parse_url(url, timeout=10):
    """
    Sends a GET request to the given URL and returns the parsed HTML content as a BeautifulSoup object.

    Args:
        url (str): The URL to send the GET request to.
        timeout (float, optional): Seconds to wait for the server before giving up.
            Defaults to 10. Without a timeout a single stalled connection would
            block the whole crawl indefinitely.

    Returns:
        BeautifulSoup | None: The parsed HTML content, or None when the request
        fails (connection error, timeout, or a non-2xx status code).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the same error path.
        response.raise_for_status()
        # Pass raw bytes so BeautifulSoup works out the document encoding itself.
        return BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while parsing URL: {e}")
        return None




def scrape_book_info_page(soup, site_url=None):
    """
    Extracts book information from a single book page.

    Args:
        soup (BeautifulSoup): The parsed HTML content of the book page.
        site_url (str, optional): Root URL of the website, used to build the
            absolute image link. When omitted, the notebook-level ``base_url``
            variable is used, which keeps the old one-argument call working.

    Returns:
        Dict[str, str]: A dictionary containing the book information: 'Name',
        'Rating', one entry per row of the product table (keyed by the row
        header text), and 'Image'.
    """
    if site_url is None:
        # Legacy call style: fall back to the global defined in the notebook.
        site_url = base_url

    product_info = {}

    product_info['Name'] = soup.find('div', {'class': 'col-sm-6 product_main'}).find('h1').text
    # The second CSS class of the star-rating element is the rating word.
    product_info['Rating'] = soup.find('p', {'class': 'star-rating'}).get('class')[1]

    # Extract book table details (header cell -> value cell)
    for row in soup.find('table', {'class': 'table table-striped'}).find_all('tr'):
        product_info[row.find('th').text] = row.find('td').text

    # Extract image link. Only leading "../" segments are removed; str.strip('../')
    # would strip any run of '.' and '/' characters from BOTH ends of the path.
    image_src = soup.find('div', {'id': 'product_gallery'}).find('img').get('src')
    while image_src.startswith('../'):
        image_src = image_src[len('../'):]
    product_info['Image'] = f"{site_url.rstrip('/')}/{image_src.lstrip('/')}"

    return product_info



def scrape_books(base_url, start_page = 1):
    """
    Scrapes book information from the website, starting from the specified page number.

    Listing pages are requested one after another until a request fails, a page
    has no book listing, or a page has no "next" link. A book page that cannot
    be fetched or that has an unexpected structure is reported and skipped, so
    one bad page does not discard the books already collected.

    Args:
        base_url (str): The base URL of the website.
        start_page (int, optional): The starting page number for scraping. Defaults to 1.

    Returns:
        list: A list of dictionaries, where each dictionary contains the information of a book.
    """
    book_list = []
    page_num = start_page

    while True:
        next_page_url = f"page-{page_num}.html"
        page_url = construct_page_url(base_url, next_page_url)
        print(f"Scraping page: {next_page_url}")

        soup = parse_url(page_url)
        if soup is None:
            break

        listing = soup.find('ol', {'class': 'row'})
        if listing is None:
            print(f"No book listing found on {next_page_url}; stopping.")
            break

        # Iterate through each book
        for book in listing.find_all('li'):
            # Get the URL of the book
            link = book.find('a')
            if link is None or not link.get('href'):
                continue
            book_url = construct_page_url(base_url, link.get('href'))

            # Send a request to get the book page content
            book_soup = parse_url(book_url)
            if book_soup is None:
                continue

            try:
                book_list.append(scrape_book_info_page(book_soup, base_url))
            except (AttributeError, IndexError, TypeError) as e:
                # An expected element is missing from this page.
                print(f"Skipping {book_url}: unexpected page structure ({e})")

        # The last listing page has no "next" entry in its pager (or no pager at all).
        pager = soup.find('ul', class_='pager')
        if pager is None or pager.find('li', class_='next') is None:
            break

        page_num += 1

    return book_list



In [None]:
# Root of the site to crawl; scraping begins at the first catalogue page.
base_url = "https://books.toscrape.com"
books = scrape_books(base_url, start_page=1)

In [22]:
# Report how many books were collected and preview the second record.
print(f"Number of books scraped: {len(books)}")
print(books[1:2])

Number of books scraped: 1000
[{'Name': 'Tipping the Velvet', 'Rating': 'One', 'UPC': '90fa61229261140a', 'Product Type': 'Books', 'Price (excl. tax)': '£53.74', 'Price (incl. tax)': '£53.74', 'Tax': '£0.00', 'Availability': 'In stock (20 available)', 'Number of reviews': '0', 'Image': 'https://books.toscrape.com/media/cache/08/e9/08e94f3731d7d6b760dfbfbc02ca5c62.jpg'}]


## **Putting The Scraped Data Into A Pandas DataFrame**

In [None]:
# Load the scraped records (one dict per book) into a DataFrame.
books_data = pd.DataFrame(books)
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
books_data.info()
books_data.head()

## **Cleaning The Data**

In [127]:
def clean_data(df):
    """
    Cleans the pandas DataFrame containing product data scraped from books.toscrape.com.

    The input frame is left untouched; a new frame is returned.

    Args:
        df (pandas.DataFrame): The raw scraped DataFrame, with the original
            column names (e.g. 'Price (excl. tax)', 'Availability').

    Returns:
        pandas.DataFrame: The cleaned DataFrame with the columns
        'name', 'rating', 'price', 'image', 'stock_status', 'stock_count'
        (plus any extra input columns that are not dropped below).

    This function performs the following operations:
    1. Renames columns to lowercase and snake_case.
    2. Removes 'price_(incl._tax)' and 'tax' columns.
    3. Renames 'price_(excl._tax)' column to 'price'.
    4. Removes '£' symbol from 'price' column and converts the column into a float.
    5. Splits 'availability' into 'stock_status' (text before the first '(',
       whitespace-trimmed) and 'stock_count' (the digits right after the '(').
    6. Converts 'stock_count' into an int; rows without a parenthesised count get 0.
    7. Maps 'rating' column to integer values based on a predefined dictionary.
    8. Drops 'upc', 'product_type', 'availability', and 'number_of_reviews' columns.
    """
    rating_mapping = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

    # rename() returns a new frame, so the caller's column labels are not modified.
    tidy = (
        df.rename(columns=lambda col: col.lower().replace(' ', '_'))
        # The price columns carry duplicate data; keep only one of them.
        .drop(columns=['price_(incl._tax)', 'tax'])
        .rename(columns={'price_(excl._tax)': 'price'})
    )

    availability = tidy['availability']
    tidy = tidy.assign(
        price=tidy['price'].str.replace('£', '', regex=False).astype(float),
        # n=1 keeps everything before the first '(' even if more parentheses follow;
        # strip() removes the space that sits between the status and the '('.
        stock_status=availability.str.split('(', n=1).str[0].str.strip(),
        # Pull the digits directly; a missing count becomes 0 instead of
        # breaking the int conversion.
        stock_count=availability.str.extract(r'\((\d+)', expand=False).fillna('0').astype(int),
        rating=tidy['rating'].map(rating_mapping),
    )

    # Dropping unnecessary columns
    return tidy.drop(columns=['upc', 'product_type', 'availability', 'number_of_reviews'])

## **Saving The Cleaned Data Into a Dataframe**

In [None]:
# Run the cleaning pipeline on the raw scraped frame.
df = clean_data(books_data)

# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
df.info()
df.head()

## **Exporting The Cleaned Data Into A CSV File**

In [130]:
# Write the cleaned frame to disk without the positional index column.
df.to_csv(path_or_buf='books_to_scrape_data.csv', index=False)

## **Steps Taken To Get The Cleaning Function**

#### Make all the column names lowercase and snake case

In [None]:
# Start from a fresh copy of the raw scraped frame: at this point `df` is bound to
# the already-cleaned data, which no longer has the columns the steps below inspect.
df = books_data.copy()
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns

#### Checking to see if Price (incl. tax) & Price (excl. tax) are duplicates

In [117]:
# Count the rows where the two price columns disagree.
df.query('`price_(excl._tax)` != `price_(incl._tax)`').shape[0]

0

#### Price (incl. tax) and Price (excl. tax) are identical in every row, so one of them can be dropped
- also drop the tax column, since no tax was applied to any book
- rename the remaining price column to just `price`
- remove the pound symbol (£)
- and convert the price column to a float

In [118]:
# Keep a single price column, strip the currency symbol and make it numeric.
df = (
    df.drop(columns=['price_(incl._tax)', 'tax'])
    .rename(columns={'price_(excl._tax)': 'price'})
    # The lambda parameter is named `frame` so it does not shadow the outer `df`.
    .assign(price=lambda frame: frame['price'].str.replace('£', '', regex=False))
    .astype({'price': 'float'})
)

# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
df.info()
df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               1000 non-null   object 
 1   rating             1000 non-null   object 
 2   upc                1000 non-null   object 
 3   product_type       1000 non-null   object 
 4   price              1000 non-null   float64
 5   availability       1000 non-null   object 
 6   number_of_reviews  1000 non-null   object 
 7   image              1000 non-null   object 
dtypes: float64(1), object(7)
memory usage: 62.6+ KB
None


Unnamed: 0,name,rating,upc,product_type,price,availability,number_of_reviews,image
2,Soumission,One,6957f44c3847a760,Books,50.1,In stock (20 available),0,https://books.toscrape.com/media/cache/ee/cf/e...
691,'Salem's Lot,Four,6e9403f00acd6752,Books,49.56,In stock (4 available),0,https://books.toscrape.com/media/cache/d3/55/d...
854,"David and Goliath: Underdogs, Misfits, and the...",One,d234ddecef49ef4e,Books,17.81,In stock (3 available),0,https://books.toscrape.com/media/cache/e5/2a/e...
333,Barefoot Contessa Back to Basics,One,999912e63a3262ae,Books,28.01,In stock (14 available),0,https://books.toscrape.com/media/cache/29/b1/2...
94,"Lumberjanes, Vol. 1: Beware the Kitten Holy (L...",Three,b6ee99bcf06fc91f,Books,45.61,In stock (16 available),0,https://books.toscrape.com/media/cache/1c/78/1...


#### Split the availability column into stock status and stock count

In [119]:
# Split only on the first "(" so the result always has exactly two parts, and trim
# the space that would otherwise be left at the end of the status text.
availability_parts = df['availability'].str.split("(", n=1, expand=True)
df['stock_status'] = availability_parts[0].str.strip()
df['stock_count'] = availability_parts[1]


#### Extract only the numbers from the stock count

In [None]:
# Raw string: "\d" in a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
df['stock_count'] = df['stock_count'].str.replace(r'[^\d\.]', '', regex=True).astype(int)

In [120]:
# A cell only renders its last expression, so both checks are passed to display()
# to make each result visible.

# Check if number of reviews contain any values other than 0
# Check if all the product type are books
display(
    df['number_of_reviews'].value_counts(),
    df['product_type'].value_counts(),
)

Books    1000
Name: product_type, dtype: int64

#### Map the Rating column to int numbers

In [121]:
# Rating words in ascending order; each word maps to its 1-based position.
rating_dict = {
    word: score
    for score, word in enumerate(['One', 'Two', 'Three', 'Four', 'Five'], start=1)
}

df['rating'] = df['rating'].map(rating_dict)

#### Dropping all the unnecessary columns

In [122]:
# Remove the columns that carry no useful information for the analysis.

df = df.drop(
    labels=['upc', 'product_type', 'availability', 'number_of_reviews'],
    axis='columns',
)

Unnamed: 0,name,rating,price,image,stock_status,stock_count
0,A Light in the Attic,3,51.77,https://books.toscrape.com/media/cache/fe/72/f...,In stock,22
1,Tipping the Velvet,1,53.74,https://books.toscrape.com/media/cache/08/e9/0...,In stock,20
2,Soumission,1,50.10,https://books.toscrape.com/media/cache/ee/cf/e...,In stock,20
3,Sharp Objects,4,47.82,https://books.toscrape.com/media/cache/c0/59/c...,In stock,20
4,Sapiens: A Brief History of Humankind,5,54.23,https://books.toscrape.com/media/cache/ce/5f/c...,In stock,20
...,...,...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,1,55.53,https://books.toscrape.com/media/cache/99/df/9...,In stock,1
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",4,57.06,https://books.toscrape.com/media/cache/30/98/3...,In stock,1
997,A Spy's Devotion (The Regency Spies of London #1),5,16.97,https://books.toscrape.com/media/cache/f9/6b/f...,In stock,1
998,1st to Die (Women's Murder Club #1),1,53.98,https://books.toscrape.com/media/cache/f6/8e/f...,In stock,1
