In [None]:
pip install requests beautifulsoup4 pdfplumber python-dateutil


Defaulting to user installation because normal site-packages is not writeable
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting Pillow>=9.1 (from pdfplumber)
  Downloading pillow-12.0.0-cp313-cp313-win_amd64.whl.metadata (9.0 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-win_amd64.whl.metadata (67 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20250506->pdfplumber)
  Downloading cryptography-46.0.3-cp311-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting cffi>=2.0.0 (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber)
  Downloading cffi-2.0.0-cp313-cp313-win_amd64.whl.metadata (2.6 kB)
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? et


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\송지훈\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
pip install python-dotenv


Defaulting to user installation because normal site-packages is not writeable
Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Downloading python_dotenv-1.2.1-py3-none-any.whl (21 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.2.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\송지훈\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
# -*- coding: utf-8 -*-
"""
Hongik Univ. 학사일정 크롤러 → MySQL 직저장 (CSV 없음)
- 목록 페이지 → '다운로드' PDF 링크 → PDF 텍스트 파싱 → MySQL INSERT ... ON DUPLICATE KEY UPDATE
- 저장 필드: context_year, raw_period, type, title
"""

import io
import os
import re
from calendar import monthrange
from datetime import date
from urllib.parse import urljoin

import pdfplumber
import pymysql
import requests
from bs4 import BeautifulSoup

# ====== Target page & User-Agent ======
BASE_URL = "https://www.hongik.ac.kr"
LIST_URL = "https://www.hongik.ac.kr/kr/education/academic-schedule001.do"
UA = "HongBridgeBot/1.0 (contact: your_email@example.com)"

# ====== MySQL connection settings ======
# Connection details are read from the environment so that no credential is
# stored in the notebook. Set MYSQL_PASSWORD (and optionally MYSQL_HOST,
# MYSQL_USER, MYSQL_DATABASE) before running; the non-secret values fall back
# to the previous local-development defaults.
MYSQL_CFG = dict(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", ""),
    database=os.environ.get("MYSQL_DATABASE", "qnet_crawling2"),
    charset="utf8mb4",
    autocommit=True,
)

TABLE_NAME = "academic_calendar"   # destination table; rename freely


# ---------------- 공통 유틸 ----------------
def last_day(y, m):
    """Return the number of days in month ``m`` of year ``y``."""
    _first_weekday, days_in_month = monthrange(y, m)
    return days_in_month

def to_iso_safe(y, m, d):
    """Format (y, m, d) as 'YYYY-MM-DD'; return None if it is not a real calendar date."""
    # Cheap range guards first; the month is checked before the day.
    if not (1 <= m <= 12):
        return None
    if not (1 <= d <= 31):
        return None
    try:
        iso_text = date(y, m, d).isoformat()
    except ValueError:
        # e.g. Feb 30, or a year outside what datetime.date supports
        iso_text = None
    return iso_text

def normalize_dot_date(tok, default_year):
    """
    Turn a dotted date token into a (year, month, day) tuple.

    Accepts forms such as '3.4(화)', '4. 1' and '2026. 1. 5.'. A parenthesised
    weekday is discarded. ``default_year`` is used when the token has no
    four-digit year. Returns None when the token does not match or when the
    month/day fall outside 1-12 / 1-31.
    """
    cleaned = re.sub(r"\([^)]*\)", "", tok.strip())  # drop the weekday marker
    found = re.match(r"(?:(?P<y>\d{4})\.)?\s*(?P<m>\d{1,2})\.\s*(?P<d>\d{1,2})\.?$", cleaned)
    if found is None:
        return None

    year_text = found.group("y")
    year = int(year_text) if year_text else default_year
    month = int(found.group("m"))
    day = int(found.group("d"))

    if month < 1 or month > 12 or day < 1 or day > 31:
        return None
    return year, month, day

def parse_period(raw, ctx_year, ctx_month):
    """
    Parse a raw period string from the academic calendar into ISO dates.

    Args:
        raw: period text, e.g. '3.4(화) ~ 3.10(월)', '9월', '2024. 12월 ~ 2025. 2월'.
        ctx_year: year applied when the text itself carries none. A falsy value
            (None/0) falls back to the current calendar year in every branch.
        ctx_month: month applied to day-only forms such as '5일'; may be None.

    Returns:
        ((start_iso|None, end_iso|None), kind, inferred_year|None, inferred_month|None)
        kind ∈ {month-header, month-range, ym-month-range, date-range, single-date, month, unknown}

    An ISO component is None when the corresponding date is not a valid
    calendar date (for example a month outside 1-12); such input never raises.
    """
    if not raw:
        return (None, None), "unknown", None, None

    # One consistent fallback year for every branch that needs a year.
    year = ctx_year or date.today().year

    def month_end_iso(y, mth):
        """ISO date of the last day of (y, mth); None when mth is not a real month."""
        # last_day() raises for months outside 1-12, so validate before calling it.
        if not 1 <= mth <= 12:
            return None
        return to_iso_safe(y, mth, last_day(y, mth))

    # Normalise: drop whitespace and weekday markers, unify every dash/tilde to '~'.
    s = re.sub(r"\s+", "", raw)
    s = re.sub(r"\([^)]*\)", "", s)
    s = s.translate(str.maketrans({"–": "~", "—": "~", "∼": "~", "-": "~"}))

    # Section month header: '9월'
    m = re.fullmatch(r"(?P<m>\d{1,2})월", s)
    if m:
        return (None, None), "month-header", None, int(m.group("m"))

    # 'M월~M월' — an end month smaller than the start month rolls into the next year.
    m = re.match(r"(?P<m1>\d{1,2})월~(?P<m2>\d{1,2})월$", s)
    if m:
        m1, m2 = int(m.group("m1")), int(m.group("m2"))
        y1 = year
        y2 = y1 + (1 if m2 < m1 else 0)
        return (to_iso_safe(y1, m1, 1), month_end_iso(y2, m2)), "month-range", y1, m1

    # 'YYYY.M월~YYYY.M월'
    m = re.match(r"(?P<y1>\d{4})\.(?P<m1>\d{1,2})월~(?P<y2>\d{4})\.(?P<m2>\d{1,2})월$", s)
    if m:
        y1, m1, y2, m2 = map(int, (m.group("y1"), m.group("m1"), m.group("y2"), m.group("m2")))
        return (to_iso_safe(y1, m1, 1), month_end_iso(y2, m2)), "ym-month-range", y1, m1

    # 'M.D ~ (YYYY.)M.D'
    if "~" in s and ("월" not in s and "일" not in s):
        a, b = s.split("~", 1)
        a_dt = normalize_dot_date(a, year)
        m_b = re.match(r"(?:(?P<y>\d{4})\.)?\s*(?P<m>\d{1,2})\.\s*(?P<d>\d{1,2})\.?$", b)
        if a_dt and m_b:
            y1, m1, d1 = a_dt
            y2 = int(m_b.group("y")) if m_b.group("y") else y1
            m2, d2 = int(m_b.group("m")), int(m_b.group("d"))
            # No explicit end year and the end month precedes the start month:
            # the range crosses New Year.
            if not m_b.group("y") and m2 < m1:
                y2 = y1 + 1
            return (to_iso_safe(y1, m1, d1), to_iso_safe(y2, m2, d2)), "date-range", y1, m1
        # Otherwise fall through to the remaining patterns.

    # '(M월)D일 ~ (M월)D일'
    if "일" in s and "~" in s:
        m = re.match(r"(?:(?P<m1>\d{1,2})월)?(?P<d1>\d{1,2})일~(?:(?P<m2>\d{1,2})월)?(?P<d2>\d{1,2})일$", s)
        if m:
            m1 = int(m.group("m1")) if m.group("m1") else (ctx_month or 1)
            d1 = int(m.group("d1"))
            m2 = int(m.group("m2")) if m.group("m2") else m1
            d2 = int(m.group("d2"))
            y1 = year
            y2 = y1 + (1 if m2 < m1 else 0)
            return (to_iso_safe(y1, m1, d1), to_iso_safe(y2, m2, d2)), "date-range", y1, m1

    # Single 'M월D일'
    m = re.fullmatch(r"(?P<m>\d{1,2})월(?P<d>\d{1,2})일", s)
    if m:
        mm, dd = int(m.group("m")), int(m.group("d"))
        iso = to_iso_safe(year, mm, dd)
        # The inferred year reported here is the caller's ctx_year, unchanged.
        return (iso, iso), "single-date", ctx_year, mm

    # Single 'M.D' / 'YYYY.M.D'
    a_dt = normalize_dot_date(s, year)
    if a_dt:
        y, mm, dd = a_dt
        iso = to_iso_safe(y, mm, dd)
        return (iso, iso), "single-date", y, mm

    # Single 'D일' — needs the section month supplied by the caller.
    if re.fullmatch(r"\d{1,2}일", s):
        d = int(re.findall(r"\d{1,2}", s)[0])
        mm = ctx_month
        if mm:
            iso = to_iso_safe(year, mm, d)
            return (iso, iso), "single-date", ctx_year, mm

    # 'YYYY.M월' (a bare 'M월' was already returned as a month header above)
    m = re.match(r"(?:(?P<y>\d{4})\.)?(?P<m>\d{1,2})월$", s)
    if m:
        y = int(m.group("y")) if m.group("y") else year
        mm = int(m.group("m"))
        return (to_iso_safe(y, mm, 1), month_end_iso(y, mm)), "month", y, mm

    return (None, None), "unknown", None, None


# ---------------- 페이지 & PDF ----------------
def find_pdf_url():
    """
    Fetch the academic-schedule list page and return the absolute URL of its PDF.

    The first <a> element whose visible text is exactly '다운로드' and which has
    a non-empty href is used. The href is resolved against LIST_URL, so
    root-relative ('/cms/...'), page-relative ('down.do?...'), protocol-relative
    ('//host/...') and absolute links all yield a usable URL.

    Raises:
        requests.HTTPError: the list page returned an error status.
        RuntimeError: no download link was found on the page.
    """
    resp = requests.get(LIST_URL, headers={"User-Agent": UA}, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    for tag in soup.find_all("a"):
        href = tag.get("href")
        if href and tag.get_text(strip=True) == "다운로드":
            return urljoin(LIST_URL, href)
    raise RuntimeError("다운로드 링크를 찾지 못했습니다.")

def read_pdf_lines(pdf_url):
    """
    Download the PDF at ``pdf_url`` and return its text as a flat list of lines.

    Every page is extracted with pdfplumber; each line is stripped and blank
    lines are discarded. Raises requests.HTTPError on a failed download.
    """
    response = requests.get(pdf_url, headers={"User-Agent": UA}, timeout=40)
    response.raise_for_status()

    collected = []
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        for page in pdf.pages:
            # extract_text() may return None for a page without text
            page_text = page.extract_text(x_tolerance=1.5, y_tolerance=3.0) or ""
            stripped = (raw_line.strip() for raw_line in page_text.splitlines())
            collected.extend(text for text in stripped if text)
    return collected


# ---------------- 파서 (연/월 컨텍스트 자동 보정 + 학년도 우선) ----------------
def parse_pdf_to_rows(lines):
    """
    Turn the text lines of the academic-calendar PDF into de-duplicated rows.

    Args:
        lines: text lines as produced by read_pdf_lines().

    Returns:
        A list of dicts with keys 'context_year', 'raw_period', 'type', 'title'.
        Rows with an empty title are dropped, as are repeats of the same
        (context_year, raw_period, title).

    Processing steps:
        1. A line consisting only of 'YYYY.' is merged with the line after it.
        2. A bare 'YYYY' line, or a 'YYYY ' prefix, sets the section year and is
           removed from the text.
        3. Each remaining line is scanned for a period expression; the title is
           the text after it, or is taken from a neighbouring line.
    """
    rows = []
    ctx_year_section = None   # section year taken from a bare 'YYYY' line or a 'YYYY ' prefix
    ctx_month = None          # month set by the most recent month-header line

    # Step 1: merge a standalone 'YYYY.' line with the following line.
    merged = []
    i = 0
    while i < len(lines):
        s = lines[i].strip()
        if re.fullmatch(r"\d{4}\.", s) and i + 1 < len(lines):
            merged.append((s + " " + lines[i + 1]).strip())
            i += 2
        else:
            merged.append(s)
            i += 1

    # Step 2: strip leading section years and remember them.
    # NOTE(review): this loop finishes before any row is built, so every row
    # below sees the LAST section year found in the document, not the year of
    # the section it sits in. Confirm whether per-section tracking is wanted.
    pre = []
    for s in merged:
        s = s.strip()

        # A line that is only a year, e.g. '2024'
        if re.fullmatch(r"\d{4}", s):
            ctx_year_section = int(s)
            continue

        # A line prefixed by a year, e.g. '2025  3.4(화) ~ ...'
        m = re.match(r"^(?P<y>\d{4})\s+(?P<rest>.+)$", s)
        if m:
            ctx_year_section = int(m.group("y"))
            pre.append(m.group("rest").strip())
        else:
            pre.append(s)

    # Period recogniser. Alternatives are tried in order at each position, so
    # every longer form must come before the shorter form that is its prefix
    # ('M월 D일' before 'M월', ranges before single dates).
    weekday = r"(?:\([^)]*\))?"                                  # optional '(화)' style marker
    sep = r"\s*[-~–—∼]\s*"                                       # range separator
    dot_date = r"(?:\d{4}\.\s*)?\d{1,2}\.\s*\d{1,2}\.?" + weekday  # '(YYYY.) M. D(.)'
    day_kr = r"\d{1,2}\s*일" + weekday                            # 'D일'
    month_kr = r"\d{1,2}\s*월"                                    # 'M월'
    ym_kr = r"\d{4}\.\s*\d{1,2}\s*월"                             # 'YYYY. M월'
    alternatives = [
        dot_date + sep + dot_date,
        r"(?:" + month_kr + r")?\s*" + day_kr + sep + r"(?:" + month_kr + r")?\s*" + day_kr,
        month_kr + r"\s*" + day_kr,
        month_kr + sep + month_kr,
        ym_kr + sep + ym_kr,
        dot_date,
        ym_kr,
        month_kr,
        day_kr,
    ]
    period_re = re.compile(r"(?P<period>(" + "|".join(alternatives) + r"))")

    def pick_context_year(title_text, start_iso, end_iso):
        """
        Choose the context year of a row, in priority order:
        1) a four-digit year directly followed by '학년도' in the title
        2) the section year (ctx_year_section)
        3) the year of the first available ISO date (start, then end)
        """
        m = re.search(r"(?P<yy>\d{4})\s*학년도", title_text)
        if m:
            return int(m.group("yy"))
        if ctx_year_section:
            return ctx_year_section
        for iso in (start_iso, end_iso):
            if iso:
                return int(iso[:4])
        return None

    # Step 3: scan with a look-ahead window of up to three lines.
    i = 0
    while i < len(pre):
        s = pre[i]
        # Skip the document heading and the table column headers.
        if "학사일정" in s or s.startswith("일자") or s.startswith("학사내용") or "일 자" in s:
            i += 1
            continue

        m = period_re.search(s)
        if not m:
            # No period on this line: treat it as a title whose period is on
            # one of the next three lines.
            consumed = False
            for look in (1, 2, 3):
                if i + look >= len(pre):
                    break
                nxt = pre[i + look].strip()
                m2 = period_re.search(nxt)
                if not m2:
                    continue

                raw_period = m2.group("period").strip()
                title = s.strip()
                (start_iso, end_iso), kind, _, inf_month = parse_period(
                    raw_period, ctx_year_section or date.today().year, ctx_month
                )

                if kind == "month-header":
                    # A month header only updates the month context; the
                    # section year is left unchanged.
                    if inf_month:
                        ctx_month = inf_month
                elif title:
                    rows.append({
                        "context_year": pick_context_year(title, start_iso, end_iso),
                        "raw_period": raw_period,
                        "type": kind,
                        "title": title
                    })
                i += look + 1
                consumed = True
                break
            if consumed:
                continue

            # Neither a period nor a title for an upcoming period: append the
            # text to the previous row's title.
            if rows:
                rows[-1]["title"] = (rows[-1]["title"] + " " + s).strip()
            i += 1
            continue

        # Period found on this line; the title is whatever follows it.
        raw_period = m.group("period").strip()
        title = s[m.end():].strip()

        (start_iso, end_iso), kind, _, inf_month = parse_period(
            raw_period, ctx_year_section or date.today().year, ctx_month
        )

        # Handle month headers before searching for a title, so that a header
        # line never swallows (or jumps over) the lines that follow it.
        if kind == "month-header":
            if inf_month:
                ctx_month = inf_month
            i += 1
            continue

        if not title:
            # Title is on a following line: take the first of the next three
            # lines that is neither a period nor a bare year.
            for look in (1, 2, 3):
                if i + look < len(pre):
                    nxt = pre[i + look].strip()
                    if not period_re.search(nxt) and not re.fullmatch(r"\d{4}(\.)?$", nxt):
                        title = nxt
                        i += look
                        break

        if title:
            rows.append({
                # priority: '학년도' year in the title > section year > date year
                "context_year": pick_context_year(title, start_iso, end_iso),
                "raw_period": raw_period,
                "type": kind,
                "title": title
            })

        i += 1

    # Drop rows without a title and exact duplicates, keeping first occurrences.
    clean, seen = [], set()
    for r in rows:
        if not r["title"]:
            continue
        key = (r["context_year"], r["raw_period"], r["title"])
        if key in seen:
            continue
        seen.add(key)
        clean.append(r)
    return clean


# ---------------- DB 저장 ----------------
def ensure_table(conn, table=TABLE_NAME):
    """
    Create the calendar table if it does not exist yet.

    Args:
        conn: an open DB connection whose cursor() works as a context manager.
        table: table name; interpolated into the DDL inside backticks.
    """
    # NOTE(review): context_year is nullable and part of the unique key; check
    # how the target MySQL version treats NULLs in a UNIQUE index if rows
    # without a year must also be de-duplicated.
    ddl = f"""
            CREATE TABLE IF NOT EXISTS `{table}` (
                id INT AUTO_INCREMENT PRIMARY KEY,
                context_year INT NULL,
                raw_period VARCHAR(100) NOT NULL,
                type VARCHAR(32) NOT NULL,
                title VARCHAR(500) NOT NULL,
                UNIQUE KEY uq_cal (context_year, raw_period, title)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """
    with conn.cursor() as db_cursor:
        db_cursor.execute(ddl)
    print(f"[INFO] 테이블 준비 완료: {table}")

def save_rows_to_mysql(rows, cfg=MYSQL_CFG, table=TABLE_NAME):
    """
    Upsert parsed calendar rows into MySQL.

    Args:
        rows: iterable of dicts with keys context_year, raw_period, type, title.
        cfg: keyword arguments passed to pymysql.connect().
        table: destination table; created first if missing.

    The connection is always closed, even when an insert fails.
    """
    columns = ("context_year", "raw_period", "type", "title")
    connection = pymysql.connect(**cfg)
    try:
        ensure_table(connection, table=table)
        upsert_sql = f"""
                INSERT INTO `{table}` (context_year, raw_period, type, title)
                VALUES (%s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE
                    type=VALUES(type),
                    title=VALUES(title);
            """
        with connection.cursor() as db_cursor:
            for record in rows:
                # Missing keys become NULL parameters via dict.get()
                params = tuple(record.get(col) for col in columns)
                db_cursor.execute(upsert_sql, params)
        connection.commit()
        print(f"[INFO] MySQL 적재 완료: {len(rows)}행 → {table}")
    finally:
        connection.close()


# ---------------- 메인 ----------------
def main():
    """Run the pipeline: locate the PDF, parse it, preview the rows, store them in MySQL."""
    pdf_url = find_pdf_url()
    print(f"[INFO] PDF URL: {pdf_url}")

    rows = parse_pdf_to_rows(read_pdf_lines(pdf_url))
    print(f"[INFO] 파싱 건수: {len(rows)}")

    # Preview: at most the first 20 parsed rows
    for preview_row in rows[:20]:
        print(preview_row)

    save_rows_to_mysql(rows)


if __name__ == "__main__":
    main()


  1) 제목에 '(\d{4})학년도'가 있으면 그 연도


[INFO] PDF URL: https://www.hongik.ac.kr/cms/etcResourceDown.do?site=$cms$NYJyA&key=$cms$EwBmFZE1VwRRsEeaAJKBqBwPuOFQJgFIW1rAc3QSgDoAHAEwDMg
[INFO] 파싱 건수: 38
{'context_year': 2025, 'raw_period': '2024. 12월 ~ 2025. 2월', 'type': 'ym-month-range', 'title': '2025학년도 신입생 정시모집 입학전형'}
{'context_year': 2025, 'raw_period': '2024. 12월 ~ 2025. 2월', 'type': 'ym-month-range', 'title': '2025학년도 편입생 입학전형'}
{'context_year': 2025, 'raw_period': '1.22(수) ~ 2.2(일)', 'type': 'date-range', 'title': '2025학년도 1학기 복학신청'}
{'context_year': 2025, 'raw_period': '2.3(월) ~ 2.4(화)', 'type': 'date-range', 'title': '2025학년도 1학기 수강과목 사전선택'}
{'context_year': 2024, 'raw_period': '2.21(금)', 'type': 'single-date', 'title': '2024학년도 전기 학위수여식(제75회)'}
{'context_year': 2025, 'raw_period': '2.21(금) ~ 2.27(목)', 'type': 'date-range', 'title': '2025학년도 1학기 등록 및 휴학신청'}
{'context_year': 2025, 'raw_period': '2.24(월)', 'type': 'single-date', 'title': '2025학년도 신입생 입학식'}
{'context_year': 2025, 'raw_period': '2.24(월) ~ 2.27(목)', 'type': 