# Web Scraping - Selenium + BeautifulSoup

---
* Author:  [Yuttapong Mahasittiwat](mailto:khala1391@gmail.com)
* Technologist | Data Modeler | Data Analyst
* [YouTube](https://www.youtube.com/khala1391)
* [LinkedIn](https://www.linkedin.com/in/yuttapong-m/)
* [Tableau](https://public.tableau.com/app/profile/yuttapong.m/vizzes)
---

Reference: [WS CubeTech YouTube channel](https://www.youtube.com/watch?v=UabBGhnVqSo&list=PLc20sA5NNOvrsn3a78ewy2VTCXVV47NB4&index=1&t=0s)

In [1]:
# Print the current local date and time (naive, no timezone) so the cell
# output records when this cell was executed.
import datetime
print(datetime.datetime.now())

2024-10-21 22:49:38.655263


In [None]:
## --------------------------------------------------------------------------
## import library

from urllib.request import urlopen  # option#1
import requests  # option#2
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time



# Standard-library helpers needed only by the scraping script below.
from pathlib import Path
from urllib.parse import urldefrag, urljoin

## --------------------------------------------------------------------------
## configuration

BASE_URL = "https://www.naiin.com/"
MAIN_MENU_XPATH = '//*[@id="mainmenu1"]/li[2]/a'
HOVER_PAUSE = 3       # seconds to wait after each hover/click in the browser
PAGE_PAUSE = 2        # seconds to wait between paginated HTTP requests
REQUEST_TIMEOUT = 30  # seconds; requests waits forever unless a timeout is given
OUTPUT_DIR = Path("data")

# Define the dynamic XPath for the sub-menu items
xpaths = [f'//*[@id="mainmenu1"]/li[2]/ul/li[2]/ul/li[{i}]/a' for i in range(4, 15)]


def _open_category_url(submenu_xpath):
    """Return the URL reached by clicking one sub-menu entry of the site menu.

    A fresh Chrome session opens ``BASE_URL``, hovers the main menu item,
    hovers and clicks the element located by ``submenu_xpath`` and reads
    ``driver.current_url``. The browser is always closed before returning,
    including when a lookup or action raises, so sessions do not pile up
    across categories.
    """
    driver = webdriver.Chrome()
    try:
        # Navigate to the website
        driver.get(BASE_URL)
        driver.maximize_window()

        # One ActionChains instance is reused for all actions, as before.
        actions = ActionChains(driver)

        # Hover the top-level menu item so that the sub-menu is displayed
        element_to_hover = driver.find_element(By.XPATH, MAIN_MENU_XPATH)
        actions.move_to_element(element_to_hover).perform()
        time.sleep(HOVER_PAUSE)

        # Hover the sub-category entry, then click it
        element_to_hover2 = driver.find_element(By.XPATH, submenu_xpath)
        actions.move_to_element(element_to_hover2).perform()
        time.sleep(HOVER_PAUSE)
        actions.click(element_to_hover2).perform()
        time.sleep(HOVER_PAUSE)

        return driver.current_url
    finally:
        driver.quit()


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with the lxml parser.

    Raises ``requests.RequestException`` on connection problems, timeouts
    and HTTP error statuses.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")


def _extract_items(page_soup):
    """Return ``(books, authors, prices)`` as lists of stripped text for one page."""

    def texts(tag, css_class):
        return [node.text.strip() for node in page_soup.find_all(tag, class_=css_class)]

    return (
        texts("a", "itemname"),
        texts("a", "inline-block tw-whitespace-normal tw-block"),
        texts("p", "txt-price"),
    )


def _next_page_url(page_soup, current_url):
    """Return the absolute URL of the "next page" link, or ``None`` if absent.

    The href is resolved against ``current_url`` so that relative links work.
    """
    link = page_soup.find("a", class_="nav-pag pag-next")
    href = link.get("href") if link is not None else None
    if not href:
        return None
    return urljoin(current_url, href)


# Loop through each XPath item; files are numbered from 1 in menu order
for category_no, path2 in enumerate(xpaths, start=1):
    try:
        ## --------------------------------------------------------------------------
        ## use selenium to find the category page

        url = _open_category_url(path2)

        ## --------------------------------------------------------------------------
        ## setup BeautifulSoup to scrape data

        soup = _fetch_soup(url)

        # Scrape books, authors, and prices
        book_list, author_list, price_list = _extract_items(soup)
        print(f"Books found: {len(book_list)}")
        print(f"Authors found: {len(author_list)}")
        print(f"Prices found: {len(price_list)}")

        ## --------------------------------------------------------------------------
        ## while loop to handle pagination

        # Pages already fetched (fragment stripped), to stop if the "next"
        # link ever points back to a page that was already scraped.
        visited = {urldefrag(url).url}

        while True:
            next_url = _next_page_url(soup, url)
            if next_url is None:
                break  # last page: no "next" link

            next_url = urldefrag(next_url).url
            if next_url in visited:
                break
            visited.add(next_url)
            print(f"Next page URL: {next_url}")

            try:
                soup = _fetch_soup(next_url)
            except requests.RequestException as e:
                # Best effort: keep what was collected so far and save it.
                print(f"Error: {e}")
                break
            url = next_url
            time.sleep(PAGE_PAUSE)

            books, authors, prices = _extract_items(soup)

            book_list.extend(books)
            print(f"Books total: {len(book_list)}")

            author_list.extend(authors)
            print(f"Authors total: {len(author_list)}")

            price_list.extend(prices)
            print(f"Prices total: {len(price_list)}")

        print(f"Total books: {len(book_list)}")
        print(f"Total authors: {len(author_list)}")
        print(f"Total prices: {len(price_list)}")

        ## --------------------------------------------------------------------------
        ## save each category's data as CSV

        # pandas raises ValueError if the three lists differ in length; the
        # handler below reports it and the loop moves on to the next category.
        df = pd.DataFrame({"book": book_list,
                           "author": author_list,
                           "price": price_list})

        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        df.to_csv(OUTPUT_DIR / f"book_details_category_{category_no}.csv", encoding='utf-8-sig')

    except Exception as e:
        # Top-level boundary: report the failure and continue with the next category.
        print(f"Error processing path {path2}: {e}")
