In [None]:
!pip install selenium webdriver-manager requests

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
import os
import re
import glob

# 1. Download folder
# Relative folder (resolved against the current working directory) that
# receives every downloaded file.
DOWNLOAD_FOLDER = "BOK_Final_Fixed"
# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

# Absolute form of the folder; this is the path handed to Chrome's download
# preferences and polled for new files.
download_path = os.path.abspath(DOWNLOAD_FOLDER)


# 2. Browser
# Chrome profile preferences: save into download_path without prompting and
# hand PDFs to the downloader instead of opening them in the built-in viewer.
prefs = {
    "download.default_directory": download_path,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,
}

options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", prefs)
# "detach" is set so the browser window is not tied to the driver process.
options.add_experimental_option("detach", True)

# webdriver-manager resolves (and installs if needed) a chromedriver binary.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)


# 3. List-page address
# The single "{}" placeholder is filled with the page index via str.format().
URL_TEMPLATE = (
    "https://www.bok.or.kr/portal/singl/newsData/list.do"
    "?pageIndex={}&targetDepth=4&menuNo=200789"
    "&depth2=200038&depth3=201154&depth4=200789"
    "&sort=1&pageUnit=10"
)


# 4. Wait for a download to finish, then rename the downloaded file
def wait_and_rename(previous_files, target_name, fixed_ext, download_dir=None, max_wait=180):
    """Wait for a newly downloaded file to complete and rename it.

    The download folder is polled roughly once per second. Any file that was
    not in ``previous_files`` counts as new. While a new file still carries a
    partial-download suffix (``.crdownload`` / ``.tmp``) the function keeps
    waiting; once a finished, non-hidden new file is present it is renamed to
    the sanitized ``target_name`` plus ``fixed_ext``.

    Args:
        previous_files: Names present in the folder before the download was
            triggered (any iterable of str).
        target_name: Desired base name. Only alphanumeric characters, spaces
            and the characters ``._()-`` are kept.
        fixed_ext: Extension, including the leading dot, appended to the
            sanitized name regardless of the downloaded file's own extension.
        download_dir: Folder to watch. Defaults to the module-level
            ``download_path``.
        max_wait: Maximum number of seconds to wait (default 180).

    Returns:
        True when a file was renamed, False when ``max_wait`` elapsed first.
    """
    folder = download_path if download_dir is None else download_dir
    partial_suffixes = ('.crdownload', '.tmp')
    known = set(previous_files)
    # monotonic() is immune to wall-clock adjustments during the wait.
    started = time.monotonic()

    print("다운로드 중", end="", flush=True)

    while time.monotonic() - started < max_wait:
        new_files = set(os.listdir(folder)) - known

        # 1. Something is still being written: keep waiting.
        if any(name.endswith(partial_suffixes) for name in new_files):
            time.sleep(1)
            print(".", end="", flush=True)
            continue

        # 2. Completed downloads; hidden (dot-prefixed) files are ignored.
        #    Sorted so the choice is deterministic if several files appeared.
        finished = sorted(name for name in new_files if not name.startswith('.'))

        if finished:
            print("완료")
            old_path = os.path.join(folder, finished[0])

            # Build the target file name from the permitted characters only.
            safe_name = "".join(c for c in target_name if c.isalnum() or c in ' ._()-')
            new_name = f"{safe_name}{fixed_ext}"
            new_path = os.path.join(folder, new_name)

            # Avoid overwriting an existing file: append _1, _2, ...
            uniq = 1
            while os.path.exists(new_path):
                new_name = f"{safe_name}_{uniq}{fixed_ext}"
                new_path = os.path.join(folder, new_name)
                uniq += 1

            # The rename is retried a few times because it can fail
            # transiently; only OSError is retried so that KeyboardInterrupt
            # and programming errors are not swallowed.
            for _ in range(5):
                try:
                    os.rename(old_path, new_path)
                except OSError:
                    time.sleep(0.5)
                else:
                    print(f"이름 변경 성공: {new_name}")
                    return True

        time.sleep(1)

    print("\n시간 초과: 오류")
    return False


# 5. Start collecting
START_PAGE = 1
END_PAGE = 55
BASE_URL = "https://www.bok.or.kr"

# Quoted numeric arguments, e.g. the '123' in fnView('123', '456').
_QUOTED_NUMBER_RE = re.compile(r"['\"](\d+)['\"]")
# Anchors whose href looks like a file download.
_FILE_LINK_XPATH = "//a[contains(@href, 'fileDown') or contains(@href, '.pdf') or contains(@href, '.hwp')]"


def _resolve_view_url(onclick, href):
    """Return the detail-page URL for a list link, or "" if none can be derived."""
    if "fnView" in onclick or "fnView" in href:
        ids = _QUOTED_NUMBER_RE.findall(onclick + href)
        if len(ids) >= 2:
            # NOTE(review): presumably fnView(menuNo, nttId) -- the first quoted
            # number is used as menuNo and the second as nttId; verify on the site.
            return f"{BASE_URL}/portal/singl/newsData/view.do?nttId={ids[1]}&menuNo={ids[0]}"
        return ""
    if "view.do" in href:
        return BASE_URL + href if href.startswith('/') else href
    return ""


def _pick_attachment(file_links):
    """Return (element, extension) for the first PDF link, else the first HWP link.

    Returns (None, "") when no usable link is found. Links whose href contains
    "viewer" or "preview" are skipped.
    """
    first_hwp = None
    for btn in file_links:
        try:
            label = btn.text.lower()
            # get_attribute may return None; treat that as an empty href
            # instead of relying on a TypeError being swallowed.
            href = btn.get_attribute("href") or ""
        except Exception:
            # Best effort: an element that cannot be read is skipped.
            continue
        if "viewer" in href or "preview" in href:
            continue
        if ".pdf" in label or ".pdf" in href:
            return btn, ".pdf"
        if first_hwp is None and (".hwp" in label or ".hwp" in href):
            first_hwp = btn
    if first_hwp is not None:
        return first_hwp, ".hwp"
    return None, ""


for current_page in range(START_PAGE, END_PAGE + 1):
    print(f">>>{current_page}페이지 작업 시작...")

    driver.get(URL_TEMPLATE.format(current_page))
    time.sleep(2)  # fixed pause to let the list page render

    # 1. Collect (title, url) pairs before navigating away from the list page.
    target_list = []
    for link in driver.find_elements(By.PARTIAL_LINK_TEXT, "의사록"):
        title = link.text.strip()
        # Keep only titles containing "제" or "20".
        if "제" not in title and "20" not in title:
            continue

        onclick = link.get_attribute("onclick") or ""
        href = link.get_attribute("href") or ""
        full_url = _resolve_view_url(onclick, href)
        if full_url:
            target_list.append((title, full_url))

    # De-duplicate while keeping page order (a plain set would scramble it).
    target_list = list(dict.fromkeys(target_list))

    # 2. Download
    for idx, (title, url) in enumerate(target_list, start=1):
        print(f"   [{idx}/{len(target_list)}] {title}")

        try:
            driver.get(url)
            time.sleep(1.5)

            # Snapshot taken before clicking so the new file can be identified.
            before_files = set(os.listdir(download_path))

            all_file_links = driver.find_elements(By.XPATH, _FILE_LINK_XPATH)
            button, fixed_ext = _pick_attachment(all_file_links)

            if button is None:
                print("파일 없음 (Pass)")
                continue

            print("[PDF] 다운로드" if fixed_ext == ".pdf" else "[HWP] 다운로드")
            # The click is issued through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", button)
            wait_and_rename(before_files, title, fixed_ext)

        except Exception as e:
            # One failing document must not abort the whole crawl.
            print(f"에러: {e}")
            continue

print("\n>>> 100% 완료")