In [2]:
!pip install requests beautifulsoup4 selenium webdriver-manager

import requests
from bs4 import BeautifulSoup
import sqlite3
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

# --- Configuration ------------------------------------------------------------
# Faculty listing pages to scrape; each one is fetched and parsed in turn.
urls = [
    "https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_faculty",
    "https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_professor",
    "https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_Associate_Professor",
    "https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_Assistant_Professor",
    "https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_Lecturer",
    "https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Part_time_faculty"
]
db_filename = "teacher_research_areas_combined.db"
txt_filename = "all_teacher_research_areas_combined.txt"
# Prefer Selenium; flipped to False below if the WebDriver cannot be started.
use_selenium = True
# Upper bound in seconds for each plain-HTTP request, so an unresponsive
# server cannot block the cell indefinitely.
request_timeout = 30


def _create_driver():
    """Start a headless Chrome WebDriver.

    Returns:
        The WebDriver instance, or None when it could not be started (the
        failure reason is printed).
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    try:
        return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
    except Exception as e:
        print(f"Selenium WebDriver 初始化失敗：{e}")
        return None


def _fetch_soup(url, driver):
    """Fetch ``url`` and return it parsed as a BeautifulSoup document.

    Args:
        url: Page address to download.
        driver: A Selenium WebDriver to load the page with, or None to use
            ``requests`` instead.

    Raises:
        Whatever Selenium or requests raise on failure, including
        ``requests.HTTPError`` for non-2xx responses and a timeout error when
        ``request_timeout`` is exceeded.
    """
    if driver is not None:
        driver.get(url)
        # Fixed pause after navigation; presumably lets the page finish
        # rendering before the source is read.
        time.sleep(3)
        return BeautifulSoup(driver.page_source, 'html.parser')

    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def _extract_teachers(soup):
    """Return ``(name, research_area)`` pairs found in a parsed listing page.

    Only ``div.i-member-item`` entries that contain both a
    ``span.member-data-value-name`` and a ``span.member-data-value-7`` are
    returned; entries missing either span are skipped.
    """
    teachers = []
    for item in soup.find_all('div', class_='i-member-item'):
        name_span = item.find('span', class_='member-data-value-name')
        research_area_span = item.find('span', class_='member-data-value-7')
        if name_span is None or research_area_span is None:
            continue
        # Prefer the link text inside the name span when a link is present.
        name_link = name_span.find('a')
        name_source = name_link if name_link is not None else name_span
        teachers.append((name_source.text.strip(), research_area_span.text.strip()))
    return teachers


# --- Scrape -------------------------------------------------------------------
conn = sqlite3.connect(db_filename)
driver = None  # always bound, so the cleanup below never hits a NameError
report_parts = []  # pieces of the text report, joined once at the end
try:
    cursor = conn.cursor()
    cursor.execute("""
CREATE TABLE IF NOT EXISTS teachers (
    url TEXT,
    name TEXT,
    research_area TEXT,
    method TEXT  -- 記錄爬取方法 (requests/selenium)
)
""")
    conn.commit()

    if use_selenium:
        driver = _create_driver()
        if driver is None:
            # Fall back to plain HTTP for the rest of the run.
            use_selenium = False
            print("將嘗試使用 requests 和 BeautifulSoup 進行爬蟲。")

    # NOTE(review): rows are appended on every run, so re-running this cell
    # against an existing database file duplicates the earlier rows.
    for url in urls:
        method_label = 'Selenium' if use_selenium else 'requests'
        method_value = 'selenium' if use_selenium else 'requests'
        print(f"正在爬取：{url} (使用 {method_label})")
        report_parts.append(f"--- {url} (使用 {method_label}) ---\n")
        try:
            soup = _fetch_soup(url, driver)
            # The page is parsed in full before any row is written, so a
            # parsing failure does not leave a partial set of rows for it.
            for name, research_area in _extract_teachers(soup):
                cursor.execute("INSERT INTO teachers (url, name, research_area, method) VALUES (?, ?, ?, ?)",
                               (url, name, research_area, method_value))
                report_parts.append(f"{name}：{research_area}\n")
        except Exception as e:
            # Record the failure and continue with the remaining pages.
            error_message = f"爬取 {url} 時發生錯誤 (使用 {method_label})：{e}\n"
            print(error_message)
            report_parts.append(error_message + "\n")

        report_parts.append("\n")
        conn.commit()
        time.sleep(1)  # pause between pages
finally:
    # Release the browser and the database handle even when the run is
    # interrupted or fails outside the per-URL error handling.
    try:
        if driver is not None:
            driver.quit()
    finally:
        conn.close()

# --- Write the text report ----------------------------------------------------
all_teacher_research_areas_txt = "".join(report_parts)
with open(txt_filename, "w", encoding="utf-8") as file:
    file.write(all_teacher_research_areas_txt)

print(f"所有網頁的老師專長已儲存到 {txt_filename} 檔案中。")
print(f"所有網頁的老師專長也已儲存到 SQLite 資料庫 {db_filename} 中。")

Selenium WebDriver 初始化失敗：Message: unknown error: cannot find Chrome binary
Stacktrace:
#0 0x5a02bda444e3 <unknown>
#1 0x5a02bd773c76 <unknown>
#2 0x5a02bd79a757 <unknown>
#3 0x5a02bd799029 <unknown>
#4 0x5a02bd7d7ccc <unknown>
#5 0x5a02bd7d747f <unknown>
#6 0x5a02bd7cede3 <unknown>
#7 0x5a02bd7a42dd <unknown>
#8 0x5a02bd7a534e <unknown>
#9 0x5a02bda043e4 <unknown>
#10 0x5a02bda083d7 <unknown>
#11 0x5a02bda12b20 <unknown>
#12 0x5a02bda09023 <unknown>
#13 0x5a02bd9d71aa <unknown>
#14 0x5a02bda2d6b8 <unknown>
#15 0x5a02bda2d847 <unknown>
#16 0x5a02bda3d243 <unknown>
#17 0x7bd503c2cac3 <unknown>

將嘗試使用 requests 和 BeautifulSoup 進行爬蟲。
正在爬取：https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_faculty (使用 requests)
正在爬取：https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_professor (使用 requests)
正在爬取：https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_Associate_Professor (使用 requests)
正在爬取：https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_Assistant_Professor

In [6]:
import sqlite3
import pandas as pd # 為了更方便地顯示查詢結果

# Database file to inspect; must match the one written by the scraping cell.
db_filename = "teacher_research_areas_combined.db"
conn = sqlite3.connect(db_filename)
try:
    cursor = conn.cursor()

    # Fetch every stored row as plain tuples.
    cursor.execute("SELECT * FROM teachers")
    results = cursor.fetchall()
    print("所有資料：", results)

    # Load the same table into a pandas DataFrame for a tabular view.
    df = pd.read_sql_query("SELECT * FROM teachers", conn)
    print("\n使用 Pandas DataFrame 顯示資料：")
    print(df)
finally:
    # Close the connection even if a query fails (e.g. the table is missing).
    conn.close()

所有資料： [('https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_faculty', '張文鐘 (Wen-Thong Chang)', '物連網系統、視訊串流、通訊系統、影像處理', 'requests'), ('https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_faculty', '蔡進發 （Jeffrey J.P. Tsai）', '人工智能、生物資料、軟體工程、分散式即時系統', 'requests'), ('https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_faculty', '許聞廉(Wen-Lian Hsu)', '演算法分析、生物資訊、計算語言、自然語言理解、智慧型對話系統', 'requests'), ('https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_faculty', '陳良弼(Arbee L.P. Chen)', '巨量資料分析、資料庫偏愛查詢、行動資訊系統', 'requests'), ('https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_faculty', '黃明祥(Min-Shiang Hwang)', '資訊與網路安全、密碼與藏密學、區塊鏈技術、智慧運算與行動通訊、管理資訊系統', 'requests'), ('https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_faculty', '許慶賢(Ching-Hsien Hsu)', '雲端計算、人工智慧與應用、高性能計算、大數據與串流運算、平行與分散式系統、智慧醫療', 'requests'), ('https://csie.asia.edu.tw/zh_tw/TeacherIntroduction/Full_time_faculty', '李文熙(Wen-Hsi Lee)', '有機薄膜電晶體、半導體薄膜製程、薄膜太陽能電池、電子陶瓷、射頻陶瓷元件、系統構裝',