In [45]:
import requests
from bs4 import BeautifulSoup
import yaml

# Load configuration from YAML
def load_config(path='config/config.yaml'):
    """
    Load crawler settings from a YAML configuration file.

    Parameters
    ----------
    path : str
        Location of the YAML file. Defaults to 'config/config.yaml',
        resolved relative to the current working directory.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the document.

    Raises
    ------
    OSError
        If the file cannot be opened.
    yaml.YAMLError
        If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 rather than the platform locale encoding,
    # so the same file loads identically on every operating system.
    with open(path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

# Load the YAML file from its default path and keep only the 'crawler' section.
config = load_config()
cfg = config['crawler']

# Bare expression so the notebook displays the loaded crawler settings.
cfg


{'target_url': 'https://www.inha.ac.kr/kr/1080/subview.do?&enc=Zm5jdDF8QEB8JTJGZmFjaWxpdHklMkZrciUyRmxpc3QuZG8lM0ZzaXRlSWQlM0RrciUyNmNvZGVFJTNERTElMjY=',
 'table_class': 'None',
 'timeout': 5}

In [46]:
def fetch_page(url, timeout):
    """
    Download a page and hand back its HTML text.

    Sends a GET request with a browser-like User-Agent. Returns the response
    body on HTTP 200; for any other status it prints the status code and
    returns None.
    """
    resp = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        timeout=timeout,
    )

    # Guard clause: report and bail out on anything other than 200.
    if resp.status_code != 200:
        print(f"Failed to fetch page. Status code: {resp.status_code}")
        return None

    return resp.text

# Fetch the listing page using the URL and timeout from the loaded config.
# html_content is None when fetch_page received a non-200 response.
html_content = fetch_page(cfg['target_url'], cfg['timeout'])

# Display part of HTML to verify
# print(html_content[:1000])  # Print first 1000 characters


In [47]:
def parse_table(html_content, table_class):
    """
    Parse the HTML to extract all rows from the target table.

    Parameters
    ----------
    html_content : str or None
        Page markup. None or an empty string yields an empty result.
    table_class : str or None
        CSS class of the table to select. None, an empty string, or the
        literal string 'None' (the value in the current config) means
        "take the first table on the page".

    Returns
    -------
    list[list[str]]
        One list per <tr>, holding the stripped text of each <td> in order.
        Empty when there is no content or no matching table.
    """
    if not html_content:
        # Nothing to parse (e.g. the fetch step returned None).
        print("Target table not found.")
        return []

    soup = BeautifulSoup(html_content, 'html.parser')

    # Honour table_class when it names a real class; the string 'None' is
    # treated as "unset" so the existing configuration keeps working.
    wanted_class = '' if table_class is None else str(table_class).strip()
    if wanted_class and wanted_class.lower() != 'none':
        table = soup.find('table', class_=wanted_class)
    else:
        table = soup.find('table')

    if table is None:
        print("Target table not found.")
        return []

    # html.parser does not insert a missing <tbody>, so fall back to the
    # table itself instead of failing on None.
    body = table.find('tbody')
    if body is None:
        body = table

    return [
        [cell.get_text(strip=True) for cell in row.find_all('td')]
        for row in body.find_all('tr')
    ]

# Extract every table row as a list of cell strings.
table_data = parse_table(html_content, cfg['table_class'])

# # Preview extracted data
# for row in table_data[:5]:   # Show first 5 rows
#     print(row)


In [48]:
import pandas as pd

# Column labels for the six cells of each row, in page order:
# date/time, place, organization name, event name, remarks, print.
columns = ['일시', '장소', '단체명', '행사명', '비고', '인쇄']
df = pd.DataFrame(table_data, columns=columns)

# Bare expression so the notebook renders the first five rows.
df.head(5)

Unnamed: 0,일시,장소,단체명,행사명,비고,인쇄
0,20250425 ~ 20250425,본관 대강당,사범대학,2025 중등수업혁신지원단 발대식 세팅,승인,인쇄
1,20250426 ~ 20250426,본관 대강당,사범대학,2025 중등수업혁신지원단 발대식 및 역량강화 연수,승인,인쇄
2,20250428 ~ 20250428,본관 대강당,간호학과,기초자연과학 강의,승인,인쇄
3,20250428 ~ 20250428,본관 대강당,건축학과,건축학과 인하렉처,승인,인쇄
4,20250429 ~ 20250429,본관 대강당,창업지원단,스타트업 이슈 리서치,승인,인쇄


### Test

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

# Site root used to build absolute URLs.
BASE_URL = "https://www.inha.ac.kr"
# Print-popup URL; {seq} and {req} are filled from the jf_facilityPrint(...) arguments.
PRINT_URL_TEMPLATE = BASE_URL + "/facility/kr/facilityPrint.do?seq={seq}&req={req}"
# Listing page whose table is parsed below.
TARGET_URL = "https://www.inha.ac.kr/kr/1080/subview.do?&enc=Zm5jdDF8QEB8JTJGZmFjaWxpdHklMkZrciUyRmxpc3QuZG8lM0ZzaXRlSWQlM0RrciUyNmNvZGVFJTNERTElMjY="

def fetch_page(url, timeout=5):
    """Return the body text of `url` on HTTP 200, otherwise None."""
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    if resp.status_code != 200:
        return None
    return resp.text

def generate_print_link(a_tag):
    """
    Extract seq and req from the javascript href and generate the real URL.

    Parameters
    ----------
    a_tag : object with a ``.get`` method, or None
        The anchor found in the last cell of a table row.

    Returns
    -------
    str or None
        PRINT_URL_TEMPLATE filled with the two numeric arguments of
        ``jf_facilityPrint('<seq>','<req>')``, or None when the anchor is
        missing, has no href, or the href does not match that call.
    """
    if not a_tag:
        return None

    # An anchor without an href yields None from .get(); passing that to
    # re.search would raise TypeError, so treat it as "no link".
    href = a_tag.get('href')
    if not href:
        return None

    # \s* tolerates optional whitespace after the comma between arguments.
    match = re.search(r"jf_facilityPrint\('(\d+)',\s*'(\d+)'\)", href)
    if match is None:
        return None

    seq, req = match.groups()
    return PRINT_URL_TEMPLATE.format(seq=seq, req=req)

def parse_table_and_get_print_links(html_content):
    """
    Parse the first table on the page into rows ending with a print URL.

    Parameters
    ----------
    html_content : str or None
        Page markup. None or an empty string yields an empty result.

    Returns
    -------
    list[list]
        For each row that has <td> cells: the stripped text of every cell
        except the last, followed by the print URL built from the anchor in
        the last cell (None when no URL could be built).
    """
    if not html_content:
        # The fetch step returns None on failure; avoid parsing None.
        print("❌ Target table not found.")
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    if table is None:
        print("❌ Target table not found.")
        return []

    # html.parser does not insert a missing <tbody>; fall back to the table.
    body = table.find('tbody')
    if body is None:
        body = table

    data_rows = []
    for row in body.find_all('tr'):
        cols = row.find_all('td')
        if not cols:
            # Rows without <td> cells have no last cell to read the link from.
            continue
        cols_text = [col.get_text(strip=True) for col in cols[:-1]]
        print_link = generate_print_link(cols[-1].find('a'))
        data_rows.append(cols_text + [print_link])
    return data_rows

def fetch_print_page_content(print_url, timeout=5):
    """
    Fetch a print popup page and return the text of its first table.

    Parameters
    ----------
    print_url : str or None
        URL of the popup; a falsy value short-circuits to "".
    timeout : int or float
        Seconds to wait for the server before giving up. Defaults to 5.

    Returns
    -------
    str
        Stripped text of the first <table>, or "" when there is no URL, the
        request fails, the status is not 200, or the page has no table.
    """
    if not print_url:
        return ""

    try:
        # Without a timeout requests can block indefinitely on a stalled server.
        response = requests.get(
            print_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"⚠️ Failed to fetch {print_url}: {exc}")
        return ""

    if response.status_code != 200:
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    content_table = soup.find('table')
    return content_table.get_text(strip=True) if content_table else ""


# Download the listing page and parse its rows (last element of each row is the print URL).
html_content = fetch_page(TARGET_URL)
table_data = parse_table_and_get_print_links(html_content)

# Append the popup text to each row; one-second pause between requests.
enhanced_data = []
for idx, row in enumerate(table_data[:3]):   # first 3 rows only, for testing
    print_link = row[-1]
    print(f"[{idx+1}] Fetching content from: {print_link}")
    content = fetch_print_page_content(print_link)
    enhanced_data.append(row + [content])
    time.sleep(1)

columns = ['Date', 'Place', 'Department', 'Event', 'Approval', 'Print_Link', 'Print_Content']
df = pd.DataFrame(enhanced_data, columns=columns)
df.head(5)
# df.to_csv('inha_facility_print_data.csv', index=False, encoding='utf-8-sig')
# print("✅ Data saved to 'inha_facility_print_data.csv'")

[1] Fetching content from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=28&req=20250313
[2] Fetching content from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=86&req=20250304
[3] Fetching content from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=32&req=20250312


### Apply RDB

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

# Site root used to build absolute URLs.
BASE_URL = "https://www.inha.ac.kr"
# Listing page that main() downloads and parses.
TARGET_URL = "https://www.inha.ac.kr/kr/1080/subview.do?&enc=Zm5jdDF8QEB8JTJGZmFjaWxpdHklMkZrciUyRmxpc3QuZG8lM0ZzaXRlSWQlM0RrciUyNmNvZGVFJTNERTElMjY="
# Print-popup URL; {seq} and {req} are filled from the jf_facilityPrint(...) arguments.
PRINT_URL_TEMPLATE = BASE_URL + "/facility/kr/facilityPrint.do?seq={seq}&req={req}"

def fetch_page(url, timeout=5):
    """Fetch `url` and return its text; any status other than 200 gives None."""
    reply = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )
    if reply.status_code == 200:
        return reply.text
    return None

def generate_print_link(a_tag):
    """
    Extract seq and req from javascript href and generate real print URL.

    Parameters
    ----------
    a_tag : object with a ``.get`` method, or None
        The anchor taken from the last cell of a reservation row.

    Returns
    -------
    str or None
        PRINT_URL_TEMPLATE formatted with the captured seq and req, or None
        when the anchor is missing, carries no href, or the href is not a
        ``jf_facilityPrint('<digits>','<digits>')`` call.
    """
    if not a_tag:
        return None

    href = a_tag.get('href')
    if not href:
        # .get() returns None for an anchor without href; re.search(None)
        # would raise TypeError.
        return None

    # Optional whitespace after the comma is accepted.
    pattern = r"jf_facilityPrint\('(\d+)',\s*'(\d+)'\)"
    match = re.search(pattern, href)
    if match:
        seq, req = match.groups()
        return PRINT_URL_TEMPLATE.format(seq=seq, req=req)
    return None

def parse_reservation_table(html_content):
    """
    Parse the main reservation table and generate print links.

    Parameters
    ----------
    html_content : str or None
        Listing page markup. None or an empty string yields an empty result.

    Returns
    -------
    list[list]
        For each row that has <td> cells: the stripped text of all cells
        but the last, followed by the print URL derived from the anchor in
        the last cell (None when no URL could be built).
    """
    if not html_content:
        # fetch_page returns None on a non-200 response.
        print("❌ Table not found.")
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    if table is None:
        print("❌ Table not found.")
        return []

    # html.parser does not add a missing <tbody>; fall back to the table.
    body = table.find('tbody')
    if body is None:
        body = table

    data_rows = []
    for row in body.find_all('tr'):
        cols = row.find_all('td')
        if not cols:
            # No <td> cells means there is no last cell to index.
            continue
        cols_text = [col.get_text(strip=True) for col in cols[:-1]]
        print_link = generate_print_link(cols[-1].find('a'))
        data_rows.append(cols_text + [print_link])
    return data_rows

def fetch_print_table_as_dict(print_url, timeout=5):
    """
    Fetch the print popup page and return the central table as a dictionary.

    Parameters
    ----------
    print_url : str or None
        URL of the popup; a falsy value short-circuits to {}.
    timeout : int or float
        Seconds to wait for the server before giving up. Defaults to 5.

    Returns
    -------
    dict
        Maps the text of each row's first <th> to the text of its first
        <td>, for rows of the table whose width attribute is '600px'.
        Empty when there is no URL, the request fails, the status is not
        200, or that table is absent.
    """
    if not print_url:
        return {}

    try:
        # Without a timeout requests can block indefinitely on a stalled server.
        response = requests.get(
            print_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"⚠️ Failed to fetch {print_url}: {exc}")
        return {}

    if response.status_code != 200:
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', attrs={'width': '600px'})
    if not table:
        return {}

    data = {}
    for row in table.find_all('tr'):
        th = row.find('th')
        td = row.find('td')
        if th and td:
            key = th.get_text(strip=True)
            # separator=' ' keeps text split by <br> tags from running together.
            value = td.get_text(separator=' ', strip=True)
            data[key] = value

    return data

def main():
    """
    Crawl the reservation listing and its print popups, then save both as CSV.

    Writes 'facility_reservations.csv' (one row per listing entry) and
    'facility_popup_tables.csv' (popup fields plus a 1-based Reservation_ID
    that follows the row order of the first file). Returns None; stops early
    with a message when the listing page cannot be fetched.
    """
    html_content = fetch_page(TARGET_URL)
    if not html_content:
        # fetch_page returns None on a non-200 response; do not parse None.
        print("❌ Failed to fetch main page.")
        return

    reservation_data = parse_reservation_table(html_content)

    # Build the main reservation table.
    df_reservations = pd.DataFrame(
        reservation_data,
        columns=['Date', 'Place', 'Department', 'Event', 'Approval', 'Print_Link']
    )

    # Collect the popup table for every reservation.
    popup_tables = []
    # Number rows explicitly from 1 instead of relying on the frame's index.
    for reservation_id, print_link in enumerate(df_reservations['Print_Link'], start=1):
        print(f"[{reservation_id}] Fetching popup table from: {print_link}")
        popup_data = fetch_print_table_as_dict(print_link)
        popup_data['Reservation_ID'] = reservation_id  # key linking the two tables
        popup_tables.append(popup_data)
        time.sleep(1)  # pause between requests

    df_popup = pd.DataFrame(popup_tables)

    # Save both tables as CSV.
    df_reservations.to_csv('facility_reservations.csv', index=False, encoding='utf-8-sig')
    df_popup.to_csv('facility_popup_tables.csv', index=False, encoding='utf-8-sig')

    print("✅ Data saved to 'facility_reservations.csv' and 'facility_popup_tables.csv'")

# Run the crawl only when this code executes as the top-level module.
if __name__ == "__main__":
    main()


[1] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=28&req=20250313
[2] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=86&req=20250304
[3] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=32&req=20250312
[4] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=112&req=20250408
[5] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=85&req=20241213
[6] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=45&req=20241203
[7] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=74&req=20250331
[8] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=41&req=20250312
[9] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facilityPrint.do?seq=40&req=20241204
[10] Fetching popup table from: https://www.inha.ac.kr/facility/kr/facil

### Apply retry (sequential fetch; threading not used)

In [11]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import time
import csv

# Site root used to build absolute URLs.
BASE_URL = "https://www.inha.ac.kr"
# Listing page that main() downloads and parses.
TARGET_URL = "https://www.inha.ac.kr/kr/1080/subview.do?&enc=Zm5jdDF8QEB8JTJGZmFjaWxpdHklMkZrciUyRmxpc3QuZG8lM0ZzaXRlSWQlM0RrciUyNmNvZGVFJTNERTElMjY="
# Print-popup URL; {seq} and {req} are filled from the jf_facilityPrint(...) arguments.
PRINT_URL_TEMPLATE = BASE_URL + "/facility/kr/facilityPrint.do?seq={seq}&req={req}"

def fetch_with_retry(url, max_retries=3, delay=2):
    """
    GET a URL, retrying on request errors and non-200 responses.

    Parameters
    ----------
    url : str
        Address to download.
    max_retries : int
        Maximum number of attempts. Defaults to 3.
    delay : int or float
        Seconds to wait between attempts. Defaults to 2.

    Returns
    -------
    str or None
        The response text of the first attempt that returns HTTP 200, or
        None once every attempt has failed (a message is printed).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=5)
        except requests.RequestException as e:
            # Only request/network failures are retried; anything else is a
            # programming error and should surface.
            print(f"⚠️ Attempt {attempt} failed: {e}")
        else:
            if response.status_code == 200:
                return response.text
            print(f"⚠️ Attempt {attempt}: Status {response.status_code}")

        # Wait only when another attempt follows; sleeping after the final
        # failure just delays the caller.
        if attempt < max_retries:
            time.sleep(delay)

    print(f"❌ Failed to fetch {url} after {max_retries} attempts.")
    return None

def generate_print_link(a_tag):
    """
    Build the print-popup URL from an anchor whose href calls jf_facilityPrint.

    Returns PRINT_URL_TEMPLATE formatted with the two numeric arguments of
    the call, or None when the anchor is missing, has no href, or the href
    does not contain such a call.
    """
    if not a_tag:
        return None

    href = a_tag.get('href')
    if not href:
        # .get() gives None for an anchor without href; re.search would
        # raise TypeError on it.
        return None

    match = re.search(r"jf_facilityPrint\('(\d+)',\s*'(\d+)'\)", href)
    if match:
        seq, req = match.groups()
        return PRINT_URL_TEMPLATE.format(seq=seq, req=req)
    return None

def parse_reservation_table(html_content):
    """
    Parse the first table of the listing page into rows ending with a print URL.

    Parameters
    ----------
    html_content : str or None
        Listing page markup. None or an empty string yields an empty result.

    Returns
    -------
    list[list]
        For each row that has <td> cells: the stripped text of all cells
        but the last, followed by the print URL derived from the anchor in
        the last cell (None when no URL could be built).
    """
    if not html_content:
        print("❌ Table not found.")
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    if table is None:
        print("❌ Table not found.")
        return []

    # html.parser does not add a missing <tbody>; fall back to the table.
    body = table.find('tbody')
    if body is None:
        body = table

    rows = []
    for row in body.find_all('tr'):
        cols = row.find_all('td')
        if not cols:
            # No <td> cells means there is no last cell to read the link from.
            continue
        values = [col.get_text(strip=True) for col in cols[:-1]]
        print_link = generate_print_link(cols[-1].find('a'))
        rows.append(values + [print_link])
    return rows

def fetch_print_table_as_dict(reservation_id, print_url):
    """
    Collect the header/value pairs of a print popup into one record.

    The record always starts with 'Reservation_ID'. When a URL is given, the
    page is downloaded via fetch_with_retry and each row of the table whose
    width attribute is '600px' contributes the text of its first <th> as key
    and the text of its first <td> as value. If the URL is falsy, the
    download fails, or the table is absent, only the ID is returned.
    """
    record = {'Reservation_ID': reservation_id}

    # No URL or a failed download: nothing to add beyond the ID.
    page = fetch_with_retry(print_url) if print_url else None
    if not page:
        return record

    detail_table = BeautifulSoup(page, 'html.parser').find(
        'table', attrs={'width': '600px'}
    )
    if not detail_table:
        return record

    for tr in detail_table.find_all('tr'):
        header_cell, value_cell = tr.find('th'), tr.find('td')
        if not (header_cell and value_cell):
            continue
        label = header_cell.get_text(strip=True)
        record[label] = value_cell.get_text(separator=' ', strip=True)

    return record

def main():
    """
    Download the listing, gather every popup sequentially, and write two CSVs.

    Stops with a message if the listing page cannot be fetched. Otherwise
    writes 'facility_reservations.csv' and 'facility_popup_tables.csv', the
    latter with every field quoted.
    """
    listing_html = fetch_with_retry(TARGET_URL)
    if not listing_html:
        print("❌ Failed to fetch main page.")
        return

    reservation_columns = ['Date', 'Place', 'Department', 'Event', 'Approval', 'Print_Link']
    df_reservations = pd.DataFrame(
        parse_reservation_table(listing_html),
        columns=reservation_columns,
    )
    # 1-based ID as the first column; the popup records carry the same value.
    df_reservations.insert(0, 'Reservation_ID', range(1, len(df_reservations) + 1))

    # Gather popup data one reservation at a time.
    popup_results = []
    for _, record in df_reservations.iterrows():
        reservation_id = record['Reservation_ID']
        print(f"Fetching popup for Reservation_ID: {reservation_id}")
        popup_results.append(
            fetch_print_table_as_dict(reservation_id, record['Print_Link'])
        )

    df_popup = pd.DataFrame(popup_results)

    # Save both tables as CSV.
    shared_csv_options = {'index': False, 'encoding': 'utf-8-sig'}
    df_reservations.to_csv('facility_reservations.csv', **shared_csv_options)
    df_popup.to_csv('facility_popup_tables.csv', quoting=csv.QUOTE_ALL, **shared_csv_options)

    print("✅ All data saved successfully without threading!")

# Run the crawl only when this code executes as the top-level module.
if __name__ == "__main__":
    main()


Fetching popup for Reservation_ID: 1
Fetching popup for Reservation_ID: 2
Fetching popup for Reservation_ID: 3
Fetching popup for Reservation_ID: 4
Fetching popup for Reservation_ID: 5
Fetching popup for Reservation_ID: 6
Fetching popup for Reservation_ID: 7
Fetching popup for Reservation_ID: 8
Fetching popup for Reservation_ID: 9
Fetching popup for Reservation_ID: 10
Fetching popup for Reservation_ID: 11
Fetching popup for Reservation_ID: 12
Fetching popup for Reservation_ID: 13
Fetching popup for Reservation_ID: 14
Fetching popup for Reservation_ID: 15
Fetching popup for Reservation_ID: 16
Fetching popup for Reservation_ID: 17
Fetching popup for Reservation_ID: 18
Fetching popup for Reservation_ID: 19
Fetching popup for Reservation_ID: 20
Fetching popup for Reservation_ID: 21
Fetching popup for Reservation_ID: 22
Fetching popup for Reservation_ID: 23
Fetching popup for Reservation_ID: 24
Fetching popup for Reservation_ID: 25
Fetching popup for Reservation_ID: 26
Fetching popup for Re