In [1]:
import csv                                      #滾動畫面版本
import logging 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

# Configure the root logger: INFO level, each record formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the CSV file and return a list of (website, traffic) tuples.
def read_csv(file_path):
    """Load website names and their traffic figures from a CSV file.

    The file is expected to have a header row containing the columns
    "電商網站" (website) and "上個月訪問量(M)" (last month's visits).
    Rows with an empty website, the placeholder '------------------>',
    or a traffic value that cannot be parsed as a float are skipped.

    Args:
        file_path: Path to the CSV file.

    Returns:
        A list of ``(website, traffic)`` tuples in file order, with
        ``traffic`` converted to ``float``. If the file cannot be opened
        or read, the error is logged and whatever was collected so far
        (possibly an empty list) is returned.
    """
    websites = []  # accumulated (website, traffic) tuples
    try:
        # "utf-8-sig" transparently strips a leading BOM (common in CSVs
        # exported from spreadsheets) so the first header name matches;
        # newline='' is what the csv module expects for correct parsing
        # of quoted fields containing line breaks.
        with open(file_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
            reader = csv.DictReader(csvfile)  # each row becomes a dict keyed by header
            for row in reader:
                # Both key columns must be present in the header.
                if '電商網站' not in row or '上個月訪問量(M)' not in row:
                    logging.warning(f"Row missing required columns: {row}")
                    continue

                website = row['電商網站']
                traffic_str = row['上個月訪問量(M)']

                # Ignore blank names and the separator/placeholder row.
                if not website or website == '------------------>':
                    continue

                try:
                    traffic = float(traffic_str)
                except (ValueError, TypeError):
                    # TypeError covers short rows, where DictReader fills
                    # the missing field with None; without it a single
                    # short row would abort reading the rest of the file.
                    logging.warning(f"Skipping invalid traffic value for {website}: {traffic_str}")
                    continue

                websites.append((website, traffic))
                logging.info(f"Added website: {website} with traffic: {traffic}")
    except Exception as e:
        # Best-effort: report the problem and return what was read so far.
        logging.error(f"Error reading CSV file: {e}")

    logging.info(f"Total websites read: {len(websites)}")
    return websites

# Check whether a website shows signs of an anti-scraping mechanism.
def check_anti_scraping(driver, url, max_scrolls=20):
    """Load ``url`` in the browser and look for anti-scraping indicators.

    The page is opened, scrolled to the bottom repeatedly so lazily loaded
    content is rendered, and the resulting page source is searched
    (case-insensitively) for a fixed list of keywords such as "captcha"
    or "access denied".

    Args:
        driver: A Selenium WebDriver instance used to load the page.
        url: Site address; "https://" is prepended when it has no
            http:// or https:// scheme.
        max_scrolls: Upper bound on scroll-to-bottom attempts. Prevents
            an endless loop on infinite-scroll pages whose height keeps
            growing. Defaults to 20.

    Returns:
        True if an indicator keyword is found, if the page times out, or
        if any other error occurs while checking; False otherwise.
    """
    try:
        # Only treat the value as a full URL when it carries an explicit
        # scheme; a bare domain that merely starts with "http"
        # (e.g. "httpbin.org") still needs "https://" prepended.
        has_scheme = url.lower().startswith(('http://', 'https://'))
        full_url = url if has_scheme else f"https://{url}"
        logging.info(f"Checking URL: {full_url}")
        driver.get(full_url)
        # Wait until the <body> element exists before inspecting the page.
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(2)  # give JS / possible redirects a moment to finish

        # Scroll to the bottom until the page height stops changing,
        # but never more than max_scrolls times.
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # wait for newly triggered content to load

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:  # height stable: nothing more to load
                break
            last_height = new_height
        else:
            logging.warning(f"Stopped scrolling {url} after {max_scrolls} attempts; page height kept changing")

        # Keywords that commonly appear on block / challenge pages.
        indicators = ["captcha", "security check", "verify you're not a robot", "access denied", "blocked", "403 forbidden"]

        page_source = driver.page_source.lower()  # lowercase for case-insensitive matching
        for indicator in indicators:
            if indicator in page_source:
                logging.info(f"Anti-scraping detected for {url}: {indicator}")
                return True

        logging.info(f"No anti-scraping detected for {url}")
        return False
    except TimeoutException:
        logging.warning(f"Timeout occurred for {url}")
        return True  # a page that never loads is assumed to be protected
    except Exception as e:
        logging.error(f"Error checking {url}: {e}")
        return True  # any other failure is also assumed to be protection

# Sort websites into four quadrants by traffic level and anti-scraping status.
def categorize_websites(websites, median_traffic):
    """Classify each website by traffic level and anti-scraping detection.

    A single Chrome WebDriver session is started, every site is checked
    with ``check_anti_scraping``, and the browser is closed afterwards
    even if an error interrupts the loop.

    Args:
        websites: Iterable of ``(website, traffic)`` pairs.
        median_traffic: Threshold; traffic greater than or equal to this
            value counts as "High", anything below as "Low".

    Returns:
        A dict mapping the four quadrant names to lists of website names.
    """
    quadrants = {
        "High Traffic, No Anti-Scraping": [],
        "High Traffic, Anti-Scraping": [],
        "Low Traffic, No Anti-Scraping": [],
        "Low Traffic, Anti-Scraping": []
    }

    # (is_high_traffic, has_anti_scraping) -> quadrant name
    quadrant_for = {
        (True, False): "High Traffic, No Anti-Scraping",
        (True, True): "High Traffic, Anti-Scraping",
        (False, False): "Low Traffic, No Anti-Scraping",
        (False, True): "Low Traffic, Anti-Scraping",
    }

    # Launch Chrome through Selenium (chromedriver must be installed and on PATH).
    driver = webdriver.Chrome()

    try:
        for site, visits in websites:
            logging.info(f"Processing website: {site} with traffic: {visits}")
            blocked = bool(check_anti_scraping(driver, site))
            is_high = bool(visits >= median_traffic)

            bucket = quadrant_for[(is_high, blocked)]
            quadrants[bucket].append(site)

            logging.info(f"Categorized {site} as: {bucket}")
    finally:
        driver.quit()  # always close the browser once classification ends

    return quadrants

def main():
    """Read the site list, classify every site, and print the quadrants.

    Exits early with an error log entry when the CSV yields no websites.
    """
    csv_path = "電商網站流量平均修改.csv"  # path of the input CSV file
    site_list = read_csv(csv_path)
    if not site_list:
        # Nothing to classify: report and stop.
        logging.error("No websites were read from the CSV file. Exiting.")
        return

    median_traffic = 13.73  # fixed median traffic threshold
    logging.info(f"Using median traffic: {median_traffic}")

    results = categorize_websites(site_list, median_traffic)

    # Print each quadrant heading followed by its sites, then log the count.
    for label in results:
        members = results[label]
        lines = [f"\n{label}:"]
        lines.extend(f"- {name}" for name in members)
        print("\n".join(lines))
        logging.info(f"Websites in {label}: {len(members)}")

# Run the classification only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()   

2024-10-20 03:39:52,840 - INFO - Added website: amazon.com with traffic: 2553.0
2024-10-20 03:39:52,842 - INFO - Added website: temu.com with traffic: 635.1
2024-10-20 03:39:52,844 - INFO - Added website: ebay.com with traffic: 594.7
2024-10-20 03:39:52,845 - INFO - Added website: amazon.co.jp with traffic: 532.0
2024-10-20 03:39:52,847 - INFO - Added website: aliexpress.com with traffic: 517.9
2024-10-20 03:39:52,849 - INFO - Added website: amazon.in with traffic: 459.0
2024-10-20 03:39:52,850 - INFO - Added website: walmart.com with traffic: 416.6
2024-10-20 03:39:52,851 - INFO - Added website: amazon.de with traffic: 407.7
2024-10-20 03:39:52,854 - INFO - Added website: etsy.com with traffic: 404.4
2024-10-20 03:39:52,856 - INFO - Added website: amazon.co.uk with traffic: 381.9
2024-10-20 03:39:52,858 - INFO - Added website: ebay.co.uk with traffic: 207.3
2024-10-20 03:39:52,860 - INFO - Added website: amazon.ca with traffic: 191.0
2024-10-20 03:39:52,861 - INFO - Added website: tao


High Traffic, No Anti-Scraping:
- temu.com
- amazon.co.jp
- aliexpress.com
- amazon.in
- amazon.de
- amazon.co.uk
- ebay.co.uk
- taobao.com
- craigslist.org
- costco.com
- alibaba.com
- kohls.com
- poshmark.com
- samsclub.com
- capitaloneshopping.com
- provenpixel.com
- swagbucks.com
- livenation.com
- linksprf.com
- retailmenot.com
- dhgate.com
- 914trk.com
- offerup.com
- vividseats.com
- tjx.com

High Traffic, Anti-Scraping:
- amazon.com
- ebay.com
- walmart.com
- etsy.com
- amazon.ca
- target.com
- mercari.com
- rakuten.com
- ticketmaster.com
- wayfair.com
- slickdeals.net
- eventbrite.com
- stubhub.com
- michaels.com
- seatgeek.com
- qvc.com
- tractorsupply.com
- bedbathandbeyond.com
- groupon.com
- kelkoogroup.net
- axs.com
- hibid.com
- media-amazon.com
- wal-mart.com
- barnesandnoble.com

Low Traffic, No Anti-Scraping:
- therealreal.com
- woot.com
- couponfollow.com
- westelm.com
- hsn.com
- coupert.com
- abebooks.com
- belk.com
- dollargeneral.com
- 905trk.com
- worldmarket.c