In [None]:
# 圖像辨識API
import google.generativeai as genai

from PIL import Image

from dotenv import load_dotenv
import os


def ocr_api():
    """Recognise the digits shown in ``captcha.jpg`` using the Gemini API.

    The API key is read from the ``GEMINI_API_KEY`` environment variable
    (``.env`` is loaded first). The image ``captcha.jpg`` in the current
    working directory is sent together with a prompt asking for digits only.

    Returns:
        str: the model's reply with surrounding whitespace removed and the
        characters ``>`` and ``x`` replaced by ``7`` and ``4``.

    Raises:
        Whatever ``Image.open`` or the Gemini client raise (missing file,
        request failure, blocked response, ...); nothing is caught here.
    """
    # Load the variables defined in .env into the environment.
    load_dotenv()
    api_key = os.getenv("GEMINI_API_KEY")

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash-8b")

    # Open the captcha inside a context manager so the file handle is
    # released as soon as the request has been sent.
    with Image.open("captcha.jpg") as image:
        response = model.generate_content(
            ["這張圖片的數字是什麼？只給我純數字，沒有任何其他符號", image]
        )

    text = response.text

    # Presumably ">" and "x" are recurring misreads of 7 and 4 — the
    # replacements must be chained so that both are applied (previously the
    # second call restarted from the raw text and dropped the first one).
    # strip() removes the newline the reply may end with, which would
    # otherwise be passed through to the caller.
    return text.replace(">", "7").replace("x", "4").strip()

In [None]:
# 爬蟲
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
import time

from dotenv import load_dotenv
import requests
import os


# Values of the advanced-search field dropdowns, keyed by the field label.
_THESIS_FIELD_CODES = {
    "論文名稱": "ti",
    "研究生": "au",
    "校院名稱": "asc",
    "系所": "sdp",
    "論文出版年": "pyr",
    "指導教授": "ad",
    "口試委員": "say",
    "關鍵詞": "kw",
    "論文目次": "tc",
    "摘要": "ab",
    "論文參考文獻": "rf",
    "畢業學年度": "yr",
    "學門": "sglv1",
    "學類": "sglv2",
}

_SITE_URL = "https://ndltd.ncl.edu.tw"
_CAPTCHA_XPATH = "//img[@alt='驗證碼']"
_DOWNLOAD_XPATH = "//a[contains(text(), '下載')]"
_CAPTCHA_FILE = "captcha.jpg"  # ocr_api() reads the captcha from this path


def _accept_alert_if_present(chrome, timeout: float = 3.0, poll: float = 0.2) -> bool:
    """Accept a JavaScript alert if one appears within ``timeout`` seconds.

    Returns True when an alert was accepted, False when none showed up.
    """
    probe = EC.alert_is_present()  # yields the alert, or False when absent
    deadline = time.monotonic() + timeout
    while True:
        alert = probe(chrome)
        if alert:
            alert.accept()
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll)


def _read_captcha(chrome, image):
    """Save the captcha behind ``image`` to disk and return the OCR result.

    The picture is requested with the browser's cookies so that the request
    belongs to the same session as the page. Returns None when the download
    or the recognition fails, or when the recognised text is empty.
    """
    img_src = image.get_attribute("src")
    cookies = {cookie["name"]: cookie["value"] for cookie in chrome.get_cookies()}
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Referer": "https://ndltd.ncl.edu.tw/",
    }

    try:
        with requests.Session() as session:
            session.cookies.update(cookies)
            response = session.get(img_src, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print("下載失敗:", exc)
        return None

    if response.status_code != 200:
        # Do not run OCR on a stale captcha.jpg left over from an earlier try.
        print("下載失敗，狀態碼:", response.status_code)
        return None

    with open(_CAPTCHA_FILE, "wb") as file:
        file.write(response.content)
    print("驗證碼圖片下載成功！")

    try:
        # strip(): a trailing newline would be typed as Enter by send_keys.
        result = ocr_api().strip()
    except Exception as exc:
        print("辨識驗證碼失敗", exc)
        return None

    print("驗證碼為: " + result)
    return result or None


def _login(chrome, wait, max_attempts: int) -> bool:
    """Fill in the login form, retrying the captcha up to ``max_attempts`` times.

    Returns True once the form was submitted without an error alert.
    """
    load_dotenv()
    # NOTE(review): load_dotenv() does not override variables that already
    # exist in the environment, and Windows predefines USERNAME, so the value
    # from .env may be ignored there — confirm on the target machine.
    userid = os.getenv("USERNAME")
    passwd = os.getenv("PASSWORD")
    if not userid or not passwd:
        print("缺少 USERNAME 或 PASSWORD 環境變數")
        return False

    for _ in range(max_attempts):
        image = wait.until(
            EC.visibility_of_element_located((By.XPATH, _CAPTCHA_XPATH))
        )
        code = _read_captcha(chrome, image)
        if code is None:
            continue

        username = wait.until(EC.visibility_of_element_located((By.NAME, "userid")))
        password = wait.until(EC.visibility_of_element_located((By.NAME, "passwd")))
        validinput = wait.until(
            EC.visibility_of_element_located((By.NAME, "validinput"))
        )

        username.clear()
        password.clear()
        validinput.clear()
        username.send_keys(userid)
        password.send_keys(passwd)
        validinput.send_keys(code)

        login_button = wait.until(
            EC.visibility_of_element_located(
                (By.XPATH, "//input[contains(@value, '登入')]")
            )
        )
        login_button.click()

        # The site reports a wrong captcha through an alert; no alert means
        # the form was accepted.
        if not _accept_alert_if_present(chrome):
            print("Login Success")
            return True
        print("驗證碼錯誤，重新嘗試")

    print("驗證碼重試次數已達上限")
    return False


def _unlock_fulltext(chrome, wait, max_attempts: int) -> bool:
    """Pass the captcha of a full-text window, if the window shows one.

    Returns True when no captcha is shown or when it was solved.
    """
    # Wait until the page shows either the captcha or the download link, so
    # a window that is still loading is not mistaken for "no captcha".
    wait.until(
        lambda d: d.find_elements(By.XPATH, _CAPTCHA_XPATH)
        or d.find_elements(By.XPATH, _DOWNLOAD_XPATH)
    )

    for _ in range(max_attempts):
        images = chrome.find_elements(By.XPATH, _CAPTCHA_XPATH)
        if not images:
            return True

        code = _read_captcha(chrome, images[0])
        if code is None:
            continue

        validinput = chrome.find_element(By.NAME, "validinput")
        validinput.clear()
        validinput.send_keys(code)
        chrome.find_element(By.NAME, "ok").click()

        if not _accept_alert_if_present(chrome):
            print("Login Success")
            return True
        print("驗證碼錯誤，重新嘗試")

    print("驗證碼重試次數已達上限")
    return False


def _download_fulltext(chrome, wait, link, main_window, max_attempts: int) -> bool:
    """Open one full-text link in its new window, download it, and return.

    The new window is always closed and focus is always handed back to
    ``main_window``, even when the download fails.
    """
    handles_before = set(chrome.window_handles)
    link.click()

    # Wait for the new window instead of assuming it is already there.
    wait.until(lambda d: len(d.window_handles) > len(handles_before))
    new_window = [h for h in chrome.window_handles if h not in handles_before][-1]
    chrome.switch_to.window(new_window)

    try:
        if not _unlock_fulltext(chrome, wait, max_attempts):
            return False
        download_link = wait.until(
            EC.visibility_of_element_located((By.XPATH, _DOWNLOAD_XPATH))
        )
        download_link.click()
        return True
    finally:
        try:
            chrome.close()
        finally:
            chrome.switch_to.window(main_window)


def _run_search(wait, mode: str, rows) -> None:
    """Fill in and submit the basic or the advanced search form.

    ``rows`` is a list of ``(term, field_label)`` pairs; the basic form only
    uses the term of the first pair.
    """
    if mode == "basic":
        searchbar = wait.until(EC.visibility_of_element_located((By.NAME, "qs0")))
        searchbar.clear()
        searchbar.send_keys(rows[0][0])
    else:
        advance_search_button = wait.until(
            EC.visibility_of_element_located((By.XPATH, '//a[@title="進階查詢"]'))
        )
        advance_search_button.click()

        for index, (term, field) in enumerate(rows):
            if not term:
                continue  # unused row: leave it exactly as the page renders it
            searchbar = wait.until(
                EC.visibility_of_element_located((By.NAME, f"qs{index}"))
            )
            searchbar.clear()
            searchbar.send_keys(term)
            if field:
                select_element = wait.until(
                    EC.visibility_of_element_located((By.NAME, f"qf{index}"))
                )
                Select(select_element).select_by_value(_THESIS_FIELD_CODES[field])

    # NOTE(review): "gs32search" is the submit button of the basic form; it is
    # assumed that the advanced form uses the same name — confirm on the site.
    search_button = wait.until(
        EC.visibility_of_element_located((By.NAME, "gs32search"))
    )
    search_button.click()


def _download_all_results(chrome, wait, max_attempts: int) -> None:
    """Download the full text of every result, following the pagination."""
    main_window = chrome.current_window_handle

    while True:
        links = wait.until(
            EC.presence_of_all_elements_located((By.XPATH, '//a[@title="電子全文"]'))
        )

        for link in links:
            try:
                _download_fulltext(chrome, wait, link, main_window, max_attempts)
            except Exception as e:
                # One failing thesis must not abort the remaining downloads.
                print(f"執行過程中發生錯誤: {e}")

        next_buttons = [
            button
            for button in chrome.find_elements(By.XPATH, "//input[@alt='下一頁']")
            if button.is_displayed()
        ]
        if not next_buttons:
            print("這是最後一頁")
            break

        print("前往下一頁的結果")
        next_buttons[0].click()
        # Wait for the old page to go away so the next iteration does not
        # collect links that are about to become stale.
        wait.until(EC.staleness_of(next_buttons[0]))


def _wait_for_downloads(directory: str, timeout: float = 60.0, poll: float = 0.5) -> bool:
    """Wait until Chrome has no unfinished ``.crdownload`` file in ``directory``.

    Returns True when nothing is pending, False when ``timeout`` elapsed.
    """
    time.sleep(1)  # give the last click a moment to create its temp file
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if not any(name.endswith(".crdownload") for name in os.listdir(directory)):
            return True
        time.sleep(poll)
    return False


def web_crawl(
    mode: str,
    keyword1: str,
    field1: str = "",
    keyword2: str = "",
    field2: str = "",
    keyword3: str = "",
    field3: str = "",
    max_captcha_attempts: int = 10,
) -> bool:
    """Log in to NDLTD, run a search and download every available full text.

    Side effects: starts a Chrome window, writes ``captcha.jpg`` to the
    current directory and saves downloads into ``paper/`` (created if needed).

    Args:
        mode: ``"basic"`` for the simple search box, ``"advance"`` for the
            advanced search form.
        keyword1: search term (first row of the advanced form).
        field1: optional field label for ``keyword1`` in advanced mode, one of
            the keys of ``_THESIS_FIELD_CODES`` (e.g. ``"論文名稱"``). Empty
            leaves the dropdown at the page's own selection.
        keyword2, field2, keyword3, field3: optional second and third rows of
            the advanced form; rows with an empty keyword are left untouched.
        max_captcha_attempts: upper bound on captcha retries per form.

    Returns:
        bool: True when the crawl ran to the last result page, False when it
        was aborted by an error (the error is printed, not raised).

    Raises:
        ValueError: for an unknown ``mode`` or field label; raised before the
            browser is started.
    """
    if mode not in ("basic", "advance"):
        raise ValueError(f"unknown mode: {mode!r} (expected 'basic' or 'advance')")
    rows = [(keyword1, field1), (keyword2, field2), (keyword3, field3)]
    for _, field in rows:
        if field and field not in _THESIS_FIELD_CODES:
            raise ValueError(f"unknown search field: {field!r}")

    # Chrome only honours an absolute download directory.
    download_directory = os.path.abspath("paper")
    os.makedirs(download_directory, exist_ok=True)

    # The browser runs with a visible window; add
    # options.add_argument("--headless") here to hide it.
    options = Options()
    options.add_experimental_option(
        "prefs",
        {
            "download.default_directory": download_directory,
            "download.prompt_for_download": False,  # no "save as" dialog
            "download.directory_upgrade": True,
        },
    )

    chrome = webdriver.Chrome(options=options)
    wait = WebDriverWait(chrome, 10)  # explicit waits time out after 10 s
    succeeded = False
    try:
        chrome.get(_SITE_URL)

        login_link = wait.until(
            EC.visibility_of_element_located((By.XPATH, "//a[@title='登入']"))
        )
        login_link.click()

        if not _login(chrome, wait, max_captcha_attempts):
            # Best effort: the search is still attempted without a login.
            print("登入失敗，仍嘗試繼續搜尋")

        _run_search(wait, mode, rows)
        _download_all_results(chrome, wait, max_captcha_attempts)
        _wait_for_downloads(download_directory)
        succeeded = True

    except Exception as e:
        print(f"執行過程中發生錯誤: {e}")

    finally:
        # quit() ends the whole session (all windows and the driver process);
        # close() would only close the current window.
        chrome.quit()

    return succeeded

In [None]:
# Run an advanced search; the search term parameter is named `keyword1`.
web_crawl(mode="advance", keyword1="囤房稅")

In [None]:
# web_crawl(mode="basic", keyword1="囤房稅")