In [2]:
import os
import time
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse, parse_qs, unquote

def clean_amazon_url(url):
    """Reduce an Amazon product link to ``https://www.amazon.com/dp/<ASIN>``.

    Accepted inputs:
      * relative hrefs (with or without a leading slash) and absolute URLs;
      * sponsored/redirect links whose real target is carried in the ``url``
        query parameter (followed recursively);
      * direct product paths of the form ``.../dp/<ASIN>/...`` or
        ``.../gp/product/<ASIN>/...``.

    Args:
        url: The href or URL string to normalise.

    Returns:
        The canonical product URL, or ``None`` when no ASIN can be extracted.
        In the failure case the reason is printed as ``Error: ...``.
    """
    amazon_base_url = "https://www.amazon.com"
    try:
        if not url:
            raise ValueError("Empty URL.")

        # Resolve against the site root instead of concatenating strings:
        # absolute URLs pass through untouched, and relative ones are joined
        # correctly whether or not they start with "/".
        url = urllib.parse.urljoin(amazon_base_url + "/", url)

        parsed_url = urlparse(url)
        query_params = parse_qs(parsed_url.query)

        # Case 1: Sponsored Ad Link (Redirect) - the real product URL is in "url".
        # The recursive call resolves relative targets itself, so the target is
        # passed through as-is rather than being prefixed with the host again.
        if 'url' in query_params:
            return clean_amazon_url(unquote(query_params['url'][0]))

        # Case 2: Direct Product URL - the ASIN is the segment right after
        # "dp" or after "gp/product".
        path_parts = parsed_url.path.split('/')
        asin_index = None
        for i, part in enumerate(path_parts):
            follows_gp = i > 0 and path_parts[i - 1] == 'gp'
            if part == 'dp' or (part == 'product' and follows_gp):
                asin_index = i + 1
                break

        if asin_index is None:
            raise ValueError("Invalid Amazon product URL format.")

        asin = path_parts[asin_index] if asin_index < len(path_parts) else None
        if not asin:
            raise ValueError("ASIN not found in URL.")

        # Construct the clean product link
        return f"{amazon_base_url}/dp/{asin}"

    except Exception as e:
        # Best-effort helper: callers get None for anything that cannot be cleaned.
        print(f"Error: {e}")
        return None

In [9]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run this notebook on another machine; the literal below is only
# the fallback used when the variable is not set.
driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:/Users/jackc/chromedriver-win64/chromedriver.exe",
)
service = Service(driver_path)

# Initialize the WebDriver
browser = webdriver.Chrome(service=service)

In [10]:
# Open the Amazon home page in the controlled browser window
amazon_home_url = "https://www.amazon.com"
browser.get(amazon_home_url)

In [13]:
# Output folder for the per-skill CSV files
data_folder = "textbook_data"
os.makedirs(data_folder, exist_ok=True)  # Ensure folder exists

# ---------------------------------------------------------------------------------------------
# Search skills: one skill per line; each one is searched on Amazon as "<skill> Book"
# ---------------------------------------------------------------------------------------------
# Specify the path to the file containing the extracted skills
file_path = './all_skills.txt'

# Open the file and read its contents
with open(file_path, 'r', encoding='utf-8') as file:
    skills = file.readlines()  # Read each line as an individual skill
tech_skills = [skill.strip() for skill in skills]

# Index of the first skill to process; raise it to resume an interrupted run.
latest_skill_index = 0

# ------------------------------------------------------------------------------------------------
# Scraper configuration
# ------------------------------------------------------------------------------------------------
AMAZON_BASE_URL = "https://www.amazon.com"
MAX_BOOKS_PER_SKILL = 1  # how many search results to open for each skill
RESULT_LINK_SELECTOR = 'div[data-cy="title-recipe"] a.a-link-normal.s-link-style.a-text-normal'
# Column order of every record / CSV row.
RECORD_FIELDS = (
    'skill', 'link', 'image_link', 'author', 'title', 'price', 'publisher',
    'language', 'num_page', 'isbn', 'rankings', 'rating', 'num_rating',
)
# Detail-bullet labels whose value is the page count (label depends on the edition).
PAGE_COUNT_LABELS = ("Paperback", "Hardcover", "Print length")
# Characters that are not allowed in Windows file names.
INVALID_FILENAME_CHARS = r'[\\/:*?"<>|]'


def _text_of(driver, element):
    """Return the raw ``textContent`` of ``element``, read through JavaScript."""
    return driver.execute_script("return arguments[0].textContent;", element)


def _first_text(driver, elements):
    """Return the ``textContent`` of the first element, or ``None`` for an empty list."""
    return _text_of(driver, elements[0]) if elements else None


def _parse_rankings(driver, rank_elements):
    """Turn best-seller list items such as ``#1,234 in Books`` into ``(rank, category)`` tuples.

    Items without " in " or with a non-numeric rank are skipped, so one
    unexpected entry cannot abort the whole scraping run.
    """
    rankings = []
    for rank in rank_elements:
        full_text = _text_of(driver, rank).strip()
        if " in " not in full_text:
            continue
        number, category = full_text.split(" in ", 1)
        try:
            number = int(number.replace("#", "").replace(",", "").strip())
        except ValueError:
            continue
        rankings.append((number, category.strip()))
    return rankings


def _scrape_product_page(driver, skill, link):
    """Collect one record (see ``RECORD_FIELDS``) from the product page currently open.

    Every field whose element is missing from the page is left as ``None``
    (``rankings`` is an empty list in that case).
    """
    title = _first_text(driver, driver.find_elements(
        By.CSS_SELECTOR, 'div.a-section.a-spacing-none h1#title span#productTitle'))

    price = _first_text(driver, driver.find_elements(
        By.CSS_SELECTOR,
        'div#buybox div.a-section.a-spacing-none.aok-align-center.aok-relative span.aok-offscreen'))

    img_elements = driver.find_elements(
        By.CSS_SELECTOR, 'span.a-declarative div#imgTagWrapperId.imgTagWrapper img#landingImage')
    img_link = driver.execute_script("return arguments[0].getAttribute('src');", img_elements[0])\
        if img_elements else None

    author = _first_text(driver, driver.find_elements(
        By.CSS_SELECTOR, 'div#bylineInfo span.author.notFaded a.a-link-normal'))

    # Product details: each bullet holds a label span followed by a value span.
    publisher, language, num_page, isbn = (None, None, None, None)
    product_details = driver.find_elements(By.CSS_SELECTOR, "div#detailBulletsWrapper_feature_div")
    if product_details:
        bullets = product_details[0].find_elements(
            By.CSS_SELECTOR, "div#detailBullets_feature_div li span.a-list-item")
        for bullet in bullets:
            parts = bullet.find_elements(By.CSS_SELECTOR, "span")
            if not parts:
                continue
            category = _text_of(driver, parts[0])
            info = _text_of(driver, parts[-1])
            if not category:
                continue
            if "Publisher" in category:
                publisher = info
            elif "Language" in category:
                language = info
            elif any(label in category for label in PAGE_COUNT_LABELS):
                num_page = info
            elif "ISBN" in category:
                isbn = info

    # Best-seller rankings
    rankings = _parse_rankings(driver, driver.find_elements(
        By.CSS_SELECTOR, "ul.a-unordered-list.a-nostyle.a-vertical.zg_hrsr li"))

    # Average rating and number of ratings
    rating, num_rating = (None, None)
    rating_elements = driver.find_elements(
        By.CSS_SELECTOR, "div#averageCustomerReviews_feature_div.celwidget")
    if rating_elements:
        rating = _first_text(driver, rating_elements[0].find_elements(
            By.CSS_SELECTOR, "span#acrPopover span.a-size-base.a-color-base"))
        num_rating = _first_text(driver, rating_elements[0].find_elements(
            By.CSS_SELECTOR, "span#acrCustomerReviewText.a-size-base"))

    return {
        'skill': skill,
        'link': link,
        'image_link': img_link,
        'author': author,
        'title': title,
        'price': price,
        'publisher': publisher,
        'language': language,
        'num_page': num_page,
        'isbn': isbn,
        'rankings': rankings,
        'rating': rating,
        'num_rating': num_rating
    }


# ------------------------------------------------------------------------------------------------
# Scrape: search each skill, open the top result(s), save one CSV per skill
# ------------------------------------------------------------------------------------------------

dataset = []  # one list of records per processed skill, in processing order
for idx, skill in enumerate(tech_skills[latest_skill_index:]):
    data = []

    # Find search bar and input and click
    input_searchs = browser.find_elements(By.ID, "twotabsearchtextbox")
    search_buttons = browser.find_elements(By.ID, "nav-search-submit-button")
    if not (input_searchs and search_buttons):
        # No search box on the current page: record an empty result and move on.
        dataset.append(data)
        continue
    input_searchs[0].clear()
    input_searchs[0].send_keys(skill+" Book")
    search_buttons[0].click()
    time.sleep(random.randint(2,3))

    # Find all book elements in this page
    product_buttons = browser.find_elements(By.CSS_SELECTOR, RESULT_LINK_SELECTOR)
    max_count = min(MAX_BOOKS_PER_SKILL, len(product_buttons))

    print(f"max {max_count} ", end="")
    for count in range(max_count):
        print(f"{count+1} ", end="")
        # Look the links up again on every pass: elements found before
        # navigating to a product page and back are no longer usable.
        product_buttons = browser.find_elements(By.CSS_SELECTOR, RESULT_LINK_SELECTOR)
        if len(product_buttons) <= count:
            # The expected result is gone; keep a placeholder row of Nones.
            data.append(dict.fromkeys(RECORD_FIELDS))
            continue

        href = browser.execute_script("return arguments[0].getAttribute('href');", product_buttons[count])
        # urljoin leaves absolute hrefs alone and prefixes the host to relative ones.
        link = urllib.parse.urljoin(AMAZON_BASE_URL + "/", href) if href else None
        browser.execute_script("document.activeElement.blur();")
        time.sleep(1)
        product_buttons[count].click()
        time.sleep(1)

        new = _scrape_product_page(browser, skill, link)
        if new not in data:
            data.append(new)
        browser.back()

    # Keep the in-memory collection in step with what is written to disk.
    dataset.append(data)

    df = pd.DataFrame(data)
    # Replace every character Windows rejects in file names, not only "/" and "*".
    skillname = re.sub(INVALID_FILENAME_CHARS, "_", skill)
    csv_path = os.path.join(data_folder, f"{skillname}.csv")
    df.to_csv(csv_path, index=False)
    print(f"{skillname}.csv saved at: {csv_path}")

    print("")
    print(f"-----------------------Progress({latest_skill_index+idx+1}/{len(tech_skills)})--------------------")
browser.quit()

max 1 1 Python.csv saved at: textbook_data\Python.csv

-----------------------Progress(1/9481)--------------------
max 1 1 Java.csv saved at: textbook_data\Java.csv

-----------------------Progress(2/9481)--------------------
max 1 1 Sql.csv saved at: textbook_data\Sql.csv

-----------------------Progress(3/9481)--------------------
max 1 1 Aws.csv saved at: textbook_data\Aws.csv

-----------------------Progress(4/9481)--------------------
max 1 1 Machine Learning.csv saved at: textbook_data\Machine Learning.csv

-----------------------Progress(5/9481)--------------------
max 1 1 JS.csv saved at: textbook_data\JS.csv

-----------------------Progress(6/9481)--------------------
max 1 1 C++.csv saved at: textbook_data\C++.csv

-----------------------Progress(7/9481)--------------------


KeyboardInterrupt: 