In [None]:
# -*- coding: utf-8 -*-
import csv, os, time, random, sys, re
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# URL that main() loads first; the crawl starts from the links on this page.
START_URL = "https://sugang.hongik.ac.kr/cn50000.jsp"

# ---------------- Regex patterns ----------------
# A single Korean weekday character (Mon..Sun).
# NOTE(review): not referenced anywhere in the visible part of this file.
DAY_PATTERN = re.compile(r"[월화수목금토일]")
# Text made up of digits only; used to recognise numeric cells.
INT_PATTERN = re.compile(r"^\d+$")
# Cell texts treated as placeholders ("all", "none", "undecided", "-", empty)
# rather than as a real professor name.
INVALID_PROF = {"전체", "없음", "미정", "-", ""}

# Only weekday/time text containing a run of at least two digits is accepted.
def is_time_text(txt: str) -> bool:
    """Tell whether *txt* contains a Korean weekday character followed by 2+ digits.

    Optional whitespace is allowed between the weekday character and the
    digits, and the match may occur anywhere inside *txt*.
    """
    found = re.search(r"[월화수목금토일]\s*\d{2,}", txt)
    return found is not None

# Stricter professor-name pattern. Two alternatives:
#   1. a run of 2-12 Hangul syllables, optionally followed by one digit
#      (e.g. "박기영1"), optionally repeated with "," or "/" separators;
#   2. a Latin letter followed by one or more letters, dots or whitespace.
# re.UNICODE is already the default for str patterns in Python 3.
PROF_NAME_PATTERN = re.compile(
    r"([가-힣]{2,12}\d?(?:\s*[,/]\s*[가-힣]{2,12}\d?)*)|([A-Za-z][A-Za-z\.\s]+)",
    re.UNICODE
)

# ---------------- DB connection ----------------
def db_connect():
    """Open a PyMySQL connection for storing crawled course rows.

    Each connection setting can be overridden through an environment
    variable so credentials do not have to live in the source file:

    * ``COURSE_DB_HOST``     (fallback ``"localhost"``)
    * ``COURSE_DB_USER``     (fallback ``"root"``)
    * ``COURSE_DB_PASSWORD`` (fallback: the previously hard-coded value)
    * ``COURSE_DB_NAME``     (fallback ``"qnet_crawling2"``)

    When a variable is unset, the former hard-coded value is used, so
    existing setups keep working unchanged.

    Returns:
        A connection using the ``utf8mb4`` charset with autocommit enabled.
    """
    env = os.environ
    return pymysql.connect(
        host=env.get("COURSE_DB_HOST", "localhost"),
        user=env.get("COURSE_DB_USER", "root"),
        # NOTE(review): the fallback keeps a real password in source control;
        # drop it once COURSE_DB_PASSWORD is set everywhere this runs.
        password=env.get("COURSE_DB_PASSWORD", "4205"),
        database=env.get("COURSE_DB_NAME", "qnet_crawling2"),
        charset="utf8mb4",
        autocommit=True
    )

def save_course(row, conn):
    """Insert one parsed course row into the ``course`` table, or update it.

    Args:
        row: mapping with the keys "개설학년", "주관학과", "과목명",
            "수강인원", "교수명" and "요일및시간"; missing keys are sent
            as ``None`` (the enrolment count falls back to 0).
        conn: connection object whose ``cursor()`` is a context manager
            yielding something with ``execute(sql, params)``.

    On a duplicate key the statement refreshes ``enrolled``, ``professor``
    and ``schedule``. No explicit commit is issued here.
    """
    sql = """
        INSERT INTO course (opened_grade, department, course_name, enrolled, professor, schedule)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            enrolled = VALUES(enrolled),
            professor = VALUES(professor),
            schedule = VALUES(schedule);
    """
    column_keys = ("개설학년", "주관학과", "과목명", "수강인원", "교수명", "요일및시간")
    with conn.cursor() as cursor:
        values = [row.get(key) for key in column_keys]
        # The enrolment count is stored as an integer; empty/missing means 0.
        values[3] = int(values[3] or 0)
        cursor.execute(sql, tuple(values))

# ---------------- Utilities ----------------
def build_driver():
    """Create a Chrome WebDriver started maximized with the GPU disabled."""
    chrome_options = webdriver.ChromeOptions()
    for flag in ("--start-maximized", "--disable-gpu"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def norm(s: str) -> str:
    """Collapse all whitespace runs in *s* to single spaces and trim the ends.

    Non-breaking spaces are treated as ordinary spaces. A falsy input
    (``None`` or ``""``) yields the empty string.
    """
    if not s:
        return ""
    tokens = s.replace("\xa0", " ").split()
    return " ".join(tokens)

def wait_css(driver, css, timeout=15):
    """Block until an element matching *css* is present in the DOM and return it.

    Args:
        driver: the WebDriver to poll.
        css: CSS selector of the element to wait for.
        timeout: maximum number of seconds to wait.

    The wait is delegated to ``WebDriverWait.until``; main() handles the
    ``TimeoutException`` that an expired wait produces.
    """
    locator = (By.CSS_SELECTOR, css)
    waiter = WebDriverWait(driver, timeout)
    condition = EC.presence_of_element_located(locator)
    return waiter.until(condition)

# ---------------- Table parsing ----------------
def _subject_index(tds):
    """Return the index of the first cell whose class contains "list_subject", or -1."""
    for i, td in enumerate(tds):
        if "list_subject" in (td.get_attribute("class") or ""):
            return i
    return -1


def _parse_course_row(tds, subj_idx):
    """Build the course dict for one table row from its cells and subject index."""
    cache = {}

    def text_at(j):
        # Cells to the right of the subject are inspected by two separate
        # scans below; memoise so each cell is read from the driver once.
        if j not in cache:
            cache[j] = norm(tds[j].get_attribute("innerText"))
        return cache[j]

    course_name = text_at(subj_idx)
    opened_grade = text_at(0)

    # The department cell sits four columns left of the subject cell. A
    # negative index would NOT raise; it would silently wrap around to the
    # end of the row, so bounds are checked explicitly.
    dept_text = text_at(subj_idx - 4) if subj_idx >= 4 else ""

    # ---- classify the cells to the right of the subject ----
    time_idx, time_text = None, ""
    prof_candidates = []
    saw_all = False

    for j in range(subj_idx + 1, len(tds)):
        txt = text_at(j)
        if not txt:
            continue

        # Weekday/time text: only the first such cell is kept.
        if is_time_text(txt):
            if time_idx is None:
                time_idx, time_text = j, txt
            continue

        # Placeholder value instead of a name.
        if txt in INVALID_PROF:
            saw_all = True
            continue

        # Purely numeric cells are never names.
        if INT_PATTERN.match(txt):
            continue

        m = PROF_NAME_PATTERN.search(txt)
        if m:
            prof_candidates.append((j, m.group(0)))

    # ---- settle the professor name ----
    prof_text = ""
    if prof_candidates:
        # Only candidates left of the time column count; the right-most wins.
        if time_idx is not None:
            prof_candidates = [(i, v) for i, v in prof_candidates if i < time_idx]
        if prof_candidates:
            prof_text = prof_candidates[-1][1]
        # NOTE(review): if the time filter removed every candidate, the name
        # stays empty even when a placeholder was seen -- confirm whether the
        # "전체" fallback below should apply in that case as well.
    elif saw_all:
        prof_text = "전체"

    # ---- enrolment count ----
    # Scan right-to-left, starting just before the time column when there is
    # one, and take the first purely numeric cell after the subject.
    enrolled_text = ""
    scan_end = (time_idx - 1) if time_idx is not None else len(tds) - 1
    for j in range(scan_end, subj_idx, -1):
        txt = text_at(j)
        if INT_PATTERN.match(txt):
            enrolled_text = txt
            break

    return {
        "개설학년": opened_grade,
        "주관학과": dept_text,
        "과목명": course_name,
        "수강인원": enrolled_text,
        "교수명": prof_text,
        "요일및시간": time_text,
    }


def parse_table_rows(table):
    """Parse the course rows of a result table into dictionaries.

    The first ``tr`` is skipped (presumably the header row). Rows without
    ``td`` cells, or without a cell whose class contains ``list_subject``,
    are ignored.

    Args:
        table: element exposing ``find_elements`` (a Selenium WebElement).

    Returns:
        A list of dicts with the keys "개설학년", "주관학과", "과목명",
        "수강인원", "교수명" and "요일및시간". All values are strings; a
        field that could not be determined is the empty string. Rows whose
        fields are all empty are dropped.
    """
    out = []
    for r in table.find_elements(By.CSS_SELECTOR, "tr")[1:]:
        tds = r.find_elements(By.CSS_SELECTOR, "td")
        if not tds:
            continue

        subj_idx = _subject_index(tds)
        if subj_idx < 0:
            continue

        item = _parse_course_row(tds, subj_idx)
        if any(item.values()):
            out.append(item)

    return out

# ---------------- Popup parsing ----------------
def parse_second_page(driver, timeout=15):
    """Poll the current window for the ``#select_list`` table and parse it.

    The top-level document is checked first, then every ``iframe`` in turn.
    The lookup is retried every 0.4 s until *timeout* seconds have elapsed.

    Args:
        driver: WebDriver already switched to the window to inspect.
        timeout: maximum number of seconds to keep polling.

    Returns:
        The list produced by parse_table_rows() for the first matching
        table, or ``[]`` when no table appeared before the deadline.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            driver.switch_to.default_content()
            tables = driver.find_elements(By.CSS_SELECTOR, "#select_list")
            if tables:
                return parse_table_rows(tables[0])

            for f in driver.find_elements(By.TAG_NAME, "iframe"):
                try:
                    driver.switch_to.frame(f)
                    tables = driver.find_elements(By.CSS_SELECTOR, "#select_list")
                    if tables:
                        return parse_table_rows(tables[0])
                finally:
                    # Always leave the frame, including on the early return.
                    driver.switch_to.default_content()
        except Exception:
            # Best effort: the page may still be loading or re-rendering, so
            # ordinary errors are retried. Unlike a bare ``except``, this lets
            # KeyboardInterrupt / SystemExit stop the crawl.
            pass

        time.sleep(0.4)
    return []

# ---------------- main ----------------
def _store_rows(rows, conn):
    """Print every parsed row and persist it with save_course()."""
    for row in rows:
        print("   ", row)
        save_course(row, conn)


def _crawl_links(driver, conn):
    """Open each gocn4001 link under #table_seoul and store the rows it yields."""
    anchor_css = '#table_seoul a[href^="javascript:gocn4001"]'

    driver.get(START_URL)
    wait_css(driver, "#table_seoul")
    anchors = driver.find_elements(By.CSS_SELECTOR, anchor_css)
    total = len(anchors)

    original = driver.current_window_handle

    # Iterate by position rather than over the list object: after a
    # same-window navigation the list is re-queried, and a plain ``for``
    # would keep handing out stale elements from the old list.
    for pos in range(total):
        if pos >= len(anchors):
            print("[WARN] 링크 목록이 줄어들어 남은 링크는 건너뜀")
            break
        a = anchors[pos]
        print(f"[INFO] ({pos + 1}/{total}) 링크 접속중…")

        before = set(driver.window_handles)
        driver.execute_script("arguments[0].scrollIntoView({block:'center'})", a)
        time.sleep(0.25)
        a.click()
        time.sleep(0.75)

        opened = set(driver.window_handles) - before

        try:
            if opened:
                # Popup: the link opened a new window.
                driver.switch_to.window(opened.pop())
                try:
                    _store_rows(parse_second_page(driver), conn)
                finally:
                    # Close the popup and return even if parsing/saving failed.
                    driver.close()
                    driver.switch_to.window(original)
            else:
                # iframe/SPA: the result was rendered in the same window.
                _store_rows(parse_second_page(driver), conn)

                driver.back()
                wait_css(driver, "#table_seoul")
                anchors = driver.find_elements(By.CSS_SELECTOR, anchor_css)

        except TimeoutException:
            print("[TIMEOUT] 넘어감")
            # The page state is unknown after a timeout; re-query so the next
            # iteration does not click a stale element.
            anchors = driver.find_elements(By.CSS_SELECTOR, anchor_css)

        time.sleep(random.uniform(0.6, 1.2))


def main():
    """Crawl every course link on the start page and store the rows in the DB.

    The database connection and the browser are released even when the crawl
    aborts with an exception; the completion message is printed only after a
    crawl that finished normally.
    """
    conn = db_connect()
    try:
        driver = build_driver()
        try:
            _crawl_links(driver, conn)
        finally:
            driver.quit()
    finally:
        conn.close()
    # NOTE(review): the trailing "]" looks like a typo; it is left untouched
    # because it is runtime output.
    print("[DONE] DB 저장 완료!]")

if __name__ == "__main__":
    main()


[INFO] (1/62) 링크 클릭…
    {'개설학년': '0', '주관학과': '예술학과', '과목명': '미술의이해 (COMPREHENSION OF ART)', '수강인원': '20', '교수명': '전영백', '요일및시간': '화234'}
    {'개설학년': '0', '주관학과': '예술학과', '과목명': '미술의이해 (COMPREHENSION OF ART)', '수강인원': '20', '교수명': '전영백', '요일및시간': '수234'}
    {'개설학년': '0', '주관학과': '예술학과', '과목명': '미술의이해 (COMPREHENSION OF ART)', '수강인원': '20', '교수명': '손수연', '요일및시간': '월234'}
    {'개설학년': '0', '주관학과': '예술학과', '과목명': '미술의이해 (COMPREHENSION OF ART)', '수강인원': '20', '교수명': '이지연8', '요일및시간': '화789'}
    {'개설학년': '0', '주관학과': '교양과(서울)', '과목명': '현대생활과디자인 (MODERN LIFE & DESIGN)', '수강인원': '20', '교수명': '김명규', '요일및시간': '목678'}
    {'개설학년': '0', '주관학과': '교양과(서울)', '과목명': '현대생활과디자인 (MODERN LIFE & DESIGN)', '수강인원': '20', '교수명': '김명규', '요일및시간': '금678'}
    {'개설학년': '0', '주관학과': '교양과(서울)', '과목명': '대중예술의이해 (UNDERSTANDING POPULAR ART)', '수강인원': '20', '교수명': '박기영1', '요일및시간': '수234'}
    {'개설학년': '0', '주관학과': '교양과(서울)', '과목명': '대중예술의이해 (UNDERSTANDING POPULAR ART)', '수강인원': '20', '교수명': '박기영1', '요일및시간': '목234'}
