In [1]:
# Import necessary libraries
import os

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Configure Chrome options
chrome_options = Options()
# ChromeDriver's "detach" option: leave the browser running if the driver
# process goes away without the session being quit explicitly.
chrome_options.add_experimental_option("detach", True)
# Location of the Chrome executable. The CHROME_BINARY environment variable
# takes precedence so the notebook can run on other machines; the original
# per-user path is kept as the fallback.
chrome_options.binary_location = os.environ.get(
    'CHROME_BINARY',
    'C:/Users/ssd/Desktop/data/ETL/chrome/chrome.exe',
)

# Initialize the Chrome webdriver (webdriver_manager resolves the chromedriver path)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)


# Specify the URL to scrape
url = 'https://books.toscrape.com'

# Open the specified URL in the Chrome browser and maximize the window
driver.get(url)
driver.maximize_window()

In [2]:
# Get total number of pages
# The pager label's last whitespace-separated token is parsed as the total
# (presumably the label reads like "Page 1 of 50" -- confirm against the site).
pages = driver.find_element(By.XPATH, '//li[@class="current"]').text
pages_count = int(pages.rsplit(maxsplit=1)[-1])

# Report the parsed total
print(f"Pages Count: {pages_count}")

Pages Count: 50


In [3]:
# Function to navigate to a specific page
def navigate_to_page(page_number, timeout=4):
    """Click the pagination link that leads to listing page ``page_number``.

    Nothing happens for ``page_number <= 1``: the first page is taken to be
    the one the browser is already showing. For any other page, a link to it
    must be present on the page currently loaded in the global ``driver``.

    Args:
        page_number: 1-based number of the listing page to open.
        timeout: Maximum number of seconds to wait for the browser URL to
            reflect the requested page after the click.

    Raises:
        NoSuchElementException: The current page has no link to that page.
        TimeoutException: The URL did not change to the requested page
            within ``timeout`` seconds.
    """
    if page_number <= 1:
        return

    # Match the full file name so that e.g. page 2 cannot match "page-20".
    # NOTE(review): assumes pagination links end in "page-<n>.html" -- confirm
    # if this is pointed at a site other than the one configured in `url`.
    page_file = f"page-{page_number}.html"
    driver.find_element(By.XPATH, f"//*[contains(@href, '{page_file}')]").click()

    # implicitly_wait() only sets the session-wide element-lookup timeout; it
    # does not pause. Wait explicitly for the navigation to show in the URL.
    WebDriverWait(driver, timeout).until(EC.url_contains(page_file))
        
def _text_or_na(by, selector):
    """Return the stripped text of the first element matching ``selector`` on
    the current page, or 'N/A' when no such element exists."""
    try:
        return driver.find_element(by, selector).text.strip()
    except NoSuchElementException:
        return 'N/A'


def _table_value(label):
    """Return the text of the table cell that follows the header cell whose
    text is exactly ``label``, or 'N/A' when the header is absent."""
    return _text_or_na(By.XPATH, f'//th[text()="{label}"]/following-sibling::td')


def _rating_or_na():
    """Return the second token of the ``p.star-rating`` element's class
    attribute, or 'N/A' when the element or the token is missing."""
    try:
        class_tokens = driver.find_element(By.CSS_SELECTOR, 'p.star-rating').get_attribute('class').split()
    except NoSuchElementException:
        return 'N/A'
    # Guard the index: a class attribute with a single token has no rating word.
    return class_tokens[1] if len(class_tokens) > 1 else 'N/A'


def _scrape_book():
    """Build one record from the product page the browser is currently on."""
    return {
        'Title': _text_or_na(By.XPATH, '//h1'),
        'Price': _text_or_na(By.XPATH, '//p[@class="price_color"]'),
        'Availability': _text_or_na(By.XPATH, '//p[@class="instock availability"]'),
        'Rating': _rating_or_na(),
        'Description': _text_or_na(By.XPATH, '//div[@id="product_description"]/following-sibling::p'),
        'UPC': _table_value('UPC'),
        'Product Type': _table_value('Product Type'),
        'Price (excl. tax)': _table_value('Price (excl. tax)'),
        'Price (incl. tax)': _table_value('Price (incl. tax)'),
        'Tax': _table_value('Tax'),
        'Availability Info': _table_value('Availability'),
        'Number of Reviews': _table_value('Number of reviews'),
    }


# Create an empty list to store data (one dict per book)
data = []

try:
    # Loop through pages
    for page_number in range(1, pages_count + 1):
        navigate_to_page(page_number)
        listing_url = driver.current_url

        # Read every book URL up front: element handles located on the
        # listing cannot be relied on once the browser has navigated away
        # and come back, whereas plain URL strings stay valid.
        book_urls = [
            link.get_attribute('href')
            for link in driver.find_elements(By.XPATH, '//a[@title]')
        ]

        # Loop through books on the page
        for book_url in book_urls:
            driver.get(book_url)
            data.append(_scrape_book())

        # Go back to the listing so navigate_to_page() can find the link
        # to the next page.
        driver.get(listing_url)
finally:
    # Close the WebDriver even when scraping fails part-way; the records
    # gathered so far remain available in `data`.
    driver.quit()

# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(data)

In [4]:
# Preview the first five scraped records
df.head(5)

Unnamed: 0,Title,Price,Availability,Rating,Description,UPC,Product Type,Price (excl. tax),Price (incl. tax),Tax,Availability Info,Number of Reviews
0,A Light in the Attic,£51.77,In stock (22 available),Three,It's hard to imagine a world without A Light i...,a897fe39b1053632,Books,£51.77,£51.77,£0.00,In stock (22 available),0
1,Tipping the Velvet,£53.74,In stock (20 available),One,"""Erotic and absorbing...Written with starling ...",90fa61229261140a,Books,£53.74,£53.74,£0.00,In stock (20 available),0
2,Soumission,£50.10,In stock (20 available),One,"Dans une France assez proche de la nôtre, un h...",6957f44c3847a760,Books,£50.10,£50.10,£0.00,In stock (20 available),0
3,Sharp Objects,£47.82,In stock (20 available),Four,"WICKED above her hipbone, GIRL across her hear...",e00eb4fd7b871a48,Books,£47.82,£47.82,£0.00,In stock (20 available),0
4,Sapiens: A Brief History of Humankind,£54.23,In stock (20 available),Five,From a renowned historian comes a groundbreaki...,4165285e1663650f,Books,£54.23,£54.23,£0.00,In stock (20 available),0


In [5]:
# Save the DataFrame to a CSV file (the row index is not written)
output_path = 'data/book_data.csv'
# to_csv() does not create missing folders, so make sure the target one exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)