In [None]:
# 圖像辨識API
import google.generativeai as genai

from PIL import Image

from dotenv import load_dotenv
import os


def ocr_api():
    """Read the digits shown in ``captcha.jpg`` with a Gemini vision model.

    The API key is taken from the ``GEMINI_API_KEY`` environment variable
    (after loading ``.env``). The image ``captcha.jpg`` is read from the
    current working directory.

    Returns:
        str: the model's reply with surrounding whitespace removed and two
        character substitutions applied (">" becomes "7", "x" becomes "4").
    """
    # Load the variables defined in the .env file into the environment.
    load_dotenv()

    # Read the API key from the environment and hand it to the client library.
    api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=api_key)

    # Model used for the recognition request.
    model = genai.GenerativeModel("gemini-1.5-flash-8b")

    # The prompt asks for the digits only, with no other symbols.
    prompt = "這張圖片的數字是什麼？只給我純數字，沒有任何其他符號"

    # Open the image inside a context manager so the file handle is released
    # as soon as the request has been sent.
    with Image.open("captcha.jpg") as image:
        response = model.generate_content([prompt, image])

    # Strip surrounding whitespace: a trailing newline would otherwise be
    # typed into the captcha field together with the digits.
    text = response.text.strip()

    # Apply BOTH corrections. The replacements are chained so the second one
    # does not discard the result of the first.
    modified_text = text.replace(">", "7").replace("x", "4")

    return modified_text

In [None]:
# 爬蟲
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoAlertPresentException,
)
from dotenv import load_dotenv
import time
import requests
import os


def check_keyword(keywords: list):
    """Validate the six parameters of an advanced search.

    Expected layout of ``keywords``::

        [keyword 1, field 1, logic, keyword 2, field 2, field of study]

    e.g. ``["中央大學", "校院名稱", "and", "2021", "論文出版年", "教育學門"]``.
    The field of study may be the empty string.

    Args:
        keywords: list of search parameters in the order shown above.

    Returns:
        bool: ``True`` when every checked entry is an allowed option,
        ``False`` (after printing the reason) otherwise, including when
        fewer than six entries are supplied.
    """
    # Allowed values for the two "search field" selectors.
    thesis_info = (
        "論文名稱",
        "研究生",
        "校院名稱",
        "系所",
        "論文出版年",
        "指導教授",
        "口試委員",
        "關鍵詞",
        "論文目次",
        "摘要",
        "論文參考文獻",
        "畢業學年度",
        "學門",
        "學類",
    )
    # Allowed fields of study; the empty string means "no restriction".
    fields_of_study = (
        "教育學門",
        "藝術學門",
        "人文學門",
        "設計學門",
        "社會及行為科學學門",
        "傳播學門",
        "商業及管理學門",
        "法律學門",
        "生命科學學門",
        "自然科學學門",
        "數學及統計學門",
        "電算機學門",
        "工程學門",
        "建築及都市規劃學門",
        "農業科學學門",
        "獸醫學門",
        "醫藥衛生學門",
        "社會服務學門",
        "民生學門",
        "運輸服務學門",
        "環境保護學門",
        "",
    )
    # Allowed boolean operators joining the two search terms.
    logic = ("and", "or", "not")

    # Guard against short input so the index checks below cannot raise.
    if keywords is None or len(keywords) < 6:
        print("參數數量錯誤!!!")
        return False

    if keywords[1] not in thesis_info:
        print("欄位選項輸入錯誤!!!")
        return False
    if keywords[2] not in logic:
        print("邏輯選項輸入錯誤!!!")
        return False
    if keywords[4] not in thesis_info:
        print("欄位選項輸入錯誤!!!")
        return False
    if keywords[5] not in fields_of_study:
        print("學門選項輸入錯誤!!!")
        return False

    # Report success explicitly instead of falling through with None.
    return True


def _accept_alert_if_present(chrome):
    """Accept a pending JavaScript alert; return True if there was one."""
    try:
        alert = chrome.switch_to.alert
    except NoAlertPresentException:
        return False
    alert.accept()
    return True


def _fill_captcha(chrome, wait):
    """Download the captcha on the current page, OCR it and type the answer.

    The image is fetched with ``requests`` using the browser's cookies, saved
    as ``captcha.jpg`` and passed to ``ocr_api()``. Every failure is caught
    and printed, so the caller can simply submit the form and retry when the
    site rejects the answer.

    Args:
        chrome: the Selenium driver whose cookies are reused for the download.
        wait: the ``WebDriverWait`` used to locate page elements.
    """
    try:
        # Locate the captcha image through its alt attribute.
        captcha_img = wait.until(
            EC.visibility_of_element_located((By.XPATH, "//img[@alt='驗證碼']"))
        )
        img_src = captcha_img.get_attribute("src")

        # Copy the browser cookies into the download request.
        cookies = {cookie["name"]: cookie["value"] for cookie in chrome.get_cookies()}
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": "https://ndltd.ncl.edu.tw/",
        }

        # The session is closed after the download; the timeout keeps a
        # stalled request from hanging the crawl.
        with requests.Session() as session:
            session.cookies.update(cookies)
            response = session.get(img_src, headers=headers, timeout=30)

        if response.status_code != 200:
            print("下載失敗，狀態碼:", response.status_code)
            # Do not OCR a stale captcha.jpg left over from an earlier attempt.
            return

        with open("captcha.jpg", "wb") as file:
            file.write(response.content)
        print("驗證碼圖片下載成功")

        try:
            result = ocr_api()
            print("驗證碼為: " + result)
        except Exception as ocr_error:
            raise Exception("辨識驗證碼失敗") from ocr_error

        validinput = wait.until(
            EC.visibility_of_element_located((By.NAME, "validinput"))
        )
        # Clear any previous answer so a retry does not append to it.
        validinput.clear()
        validinput.send_keys(result)

    except Exception as e:
        print(f"辨識驗證碼發生錯誤: {e}")


def _open_login_page(chrome, wait):
    """Open the site's home page and click the login link."""
    chrome.get("https://ndltd.ncl.edu.tw")

    # The login link is the <a> element whose title is "登入".
    login_link = wait.until(
        EC.visibility_of_element_located((By.XPATH, "//a[@title='登入']"))
    )
    login_link.click()


def _login(chrome, wait):
    """Fill in the login form, retrying until no alert follows the submit.

    Credentials are read from the ``USERNAME`` and ``PASSWORD`` environment
    variables after loading ``.env``.

    Raises:
        RuntimeError: if either credential is missing.
    """
    # override=True lets the .env values win over variables that already exist
    # in the environment (Windows always defines USERNAME).
    load_dotenv(override=True)
    account = os.getenv("USERNAME")
    secret = os.getenv("PASSWORD")
    if not account or not secret:
        raise RuntimeError("找不到 USERNAME 或 PASSWORD 環境變數")

    while True:
        username = wait.until(EC.visibility_of_element_located((By.NAME, "userid")))
        password = wait.until(EC.visibility_of_element_located((By.NAME, "passwd")))

        # Empty both fields before typing so retries do not append.
        username.clear()
        password.clear()

        username.send_keys(account)
        password.send_keys(secret)

        _fill_captcha(chrome, wait)

        login_button = wait.until(
            EC.visibility_of_element_located(
                (By.XPATH, "//input[contains(@value, '登入')]")
            )
        )
        login_button.click()

        # Give the page a moment to raise its alert, if any.
        time.sleep(1)
        if _accept_alert_if_present(chrome):
            # An alert after submitting is treated as a rejected captcha.
            print("登入驗證碼，重新嘗試")
        else:
            print("登入驗證碼正確")
            break


def _submit_search(wait, mode, keywords):
    """Fill in and submit the basic or the advanced search form."""

    def visible(locator):
        return wait.until(EC.visibility_of_element_located(locator))

    def clickable(locator):
        return wait.until(EC.element_to_be_clickable(locator))

    if mode == "basic":
        searchbar = visible((By.NAME, "qs0"))
        searchbar.clear()
        searchbar.send_keys(keywords[0])

        visible((By.NAME, "gs32search")).click()
        return

    if mode != "advance":
        return

    visible((By.XPATH, '//a[@title="進階查詢"]')).click()

    # Maps the field label given by the caller to the option value that is
    # selected in the form's field dropdowns.
    field_codes = {
        "論文名稱": "ti",
        "研究生": "au",
        "校院名稱": "asc",
        "系所": "sdp",
        "論文出版年": "pyr",
        "指導教授": "ad",
        "口試委員": "say",
        "關鍵詞": "kw",
        "論文目次": "tc",
        "摘要": "ab",
        "論文參考文獻": "rf",
        "畢業學年度": "yr",
        "學門": "sglv1",
        "學類": "sglv2",
    }
    # keywords layout, e.g.:
    # "中央大學" "校院名稱" "and" "2021" "論文出版年" "教育學門"

    # First search term and the field it applies to.
    searchbar1 = visible((By.NAME, "qs0"))
    searchbar1.clear()
    searchbar1.send_keys(keywords[0])
    Select(visible((By.NAME, "qf0"))).select_by_value(field_codes[keywords[1]])

    # Boolean operator joining the two terms.
    Select(visible((By.NAME, "qo1"))).select_by_value(keywords[2])

    # Second search term and the field it applies to.
    searchbar2 = visible((By.NAME, "qs1"))
    searchbar2.clear()
    searchbar2.send_keys(keywords[3])
    Select(visible((By.NAME, "qf1"))).select_by_value(field_codes[keywords[4]])

    # Optional field-of-study filter; skipped when left empty.
    if keywords[5] != "":
        Select(visible((By.NAME, "ltsglv1"))).select_by_value(keywords[5])

    # The remaining filters are fixed.

    # Language: tick the checkbox whose value is "中文".
    clickable((By.XPATH, "//input[@type='checkbox' and @value='中文']")).click()

    # Full-text type: tick the "cacheinternet" option.
    clickable((By.NAME, "cacheinternet")).click()

    # Thesis type: option value "1".
    # NOTE(review): the meaning of "1" is defined by the site - confirm.
    Select(visible((By.ID, "ltthesistype"))).select_by_value("1")

    visible((By.NAME, "gs32search")).click()


def _download_all_results(chrome, wait):
    """Click every "電子全文" link on each result page and download it."""
    # Remember the results window so we always return to it, whatever order
    # the driver reports its window handles in.
    results_window = chrome.current_window_handle

    while True:
        # Collect every <a> element with title="電子全文" on this page.
        links = wait.until(
            EC.presence_of_all_elements_located((By.XPATH, '//a[@title="電子全文"]'))
        )

        for link in links:
            link.click()

            # Switch to the most recently opened window.
            chrome.switch_to.window(chrome.window_handles[-1])

            while True:
                _fill_captcha(chrome, wait)

                ok_button = wait.until(
                    EC.visibility_of_element_located((By.NAME, "ok"))
                )
                ok_button.click()

                time.sleep(1)
                if _accept_alert_if_present(chrome):
                    print("下載驗證碼錯誤，重新嘗試")
                else:
                    print("下載驗證碼正確")
                    break

            # The download link is the <a> element whose text contains "下載".
            download_link = wait.until(
                EC.visibility_of_element_located(
                    (By.XPATH, "//a[contains(text(), '下載')]")
                )
            )
            download_link.click()

            # Close the popup and go back to the results window.
            # NOTE(review): the window is closed right after the click -
            # confirm the download is not interrupted by this.
            chrome.close()
            chrome.switch_to.window(results_window)

        try:
            # The "next page" control is an <input> with alt="下一頁".
            next_button = wait.until(
                EC.visibility_of_element_located((By.XPATH, "//input[@alt='下一頁']"))
            )
            print("前往下一頁")
            next_button.click()
        except Exception:
            # No next-page button could be located/clicked: treat as last page.
            print("這是最後一頁")
            break


def web_crawl(
    mode: str,
    keywords: list,
    download_directory: str = "D:\\PlagiarismDetector\\paper_webcrawling\\paper",
):
    """Log in to ndltd.ncl.edu.tw, run a search and download the full texts.

    Args:
        mode: ``"basic"`` (single keyword) or ``"advance"`` (six parameters,
            validated with ``check_keyword``).
        keywords: ``[keyword]`` for basic mode, or
            ``[keyword 1, field 1, logic, keyword 2, field 2, field of study]``
            for advanced mode.
        download_directory: directory Chrome saves downloads into; created if
            missing. An absolute path is recommended.

    Returns:
        bool: ``False`` when the mode or the keywords are rejected before the
        browser is started; ``True`` once the crawl has run. Errors inside a
        stage are printed and the later stages are still attempted, so
        ``True`` does not guarantee that anything was downloaded.
    """
    # Validate the input before launching a browser.
    if mode not in ("basic", "advance"):
        print("查詢模式輸入錯誤!!!")
        return False
    if mode == "advance":
        if check_keyword(keywords) is False:
            return False
    elif not keywords:
        print("關鍵字不可為空!!!")
        return False

    # Make sure the download directory exists.
    os.makedirs(download_directory, exist_ok=True)

    # Chrome options. Headless mode is not enabled, so a browser window opens.
    options = Options()
    options.add_experimental_option(
        "prefs",
        {
            "download.default_directory": download_directory,  # default download directory
            "download.prompt_for_download": False,  # do not show the download prompt
            "download.directory_upgrade": True,  # preference kept from the original setup
        },
    )

    chrome = webdriver.Chrome(options=options)

    # Element lookups wait up to 10 seconds.
    wait = WebDriverWait(chrome, 10)

    try:
        # Each stage reports its own error and the next stage is still tried.
        try:
            _open_login_page(chrome, wait)
        except Exception as e:
            print(f"打開網站發生錯誤: {e}")

        try:
            _login(chrome, wait)
        except Exception as e:
            print(f"登入發生錯誤: {e}")

        try:
            _submit_search(wait, mode, keywords)
        except Exception as e:
            print(f"查詢過程中發生錯誤: {e}")

        try:
            _download_all_results(chrome, wait)
        except Exception as e:
            print(f"抓取資料發生錯誤: {e}")

    finally:
        # quit() ends the whole driver session; close() would only close the
        # current window.
        chrome.quit()
        print("程式正常結束")

    return True

# 欄位選項
- 論文名稱
- 研究生
- 校院名稱
- 系所
- 論文出版年
- 指導教授
- 口試委員
- 關鍵詞
- 論文目次
- 摘要
- 論文參考文獻
- 畢業學年度
- 學門
- 學類
# 學門選項
- 教育學門
- 藝術學門
- 人文學門
- 設計學門
- 社會及行為科學學門
- 傳播學門
- 商業及管理學門
- 法律學門
- 生命科學學門
- 自然科學學門
- 數學及統計學門
- 電算機學門
- 工程學門
- 建築及都市規劃學門
- 農業科學學門
- 獸醫學門
- 醫藥衛生學門
- 社會服務學門
- 民生學門
- 運輸服務學門
- 環境保護學門

In [None]:
# Advanced search
# Parameters follow this order:
# first keyword, first field name, logic operator, second keyword, second field name, field of study
# (only the field of study may be left empty; every other entry is required)
keywords = ["中央大學", "校院名稱", "and", "2021", "論文出版年", ""]
web_crawl(mode="advance", keywords=keywords)

In [None]:
# Basic search
# Takes a single keyword
keywords = ["囤房稅"]
web_crawl(mode="basic", keywords=keywords)