In [None]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime

# ✅ Timestamp taken once at start-up; it is used as the prefix of every
# output CSV filename. This is a plain format string: nothing is
# interpolated, so no f-prefix is needed.
now = datetime.now().strftime('%Y년%m월%d일_%H시%M분')

# ✅ Candidate names searched for in article titles and comments.
# Order matters: labelling picks the first name in this list that matches.
candidates = ['이재명', '김문수', '이준석', '김재연', '권영국', '한덕수']

# ✅ Collected rows grouped by label: {label: [row, row, ...]}
csv_data_by_candidate = {}

# ✅ Selenium Chrome options
options = Options()
for chrome_flag in (
    '--headless',  # no visible browser window; remove this entry to watch the crawl
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--window-size=1920,1080',
):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)
# Shared explicit wait: every wait.until(...) call gives up after 10 seconds.
wait = WebDriverWait(driver, 10)

# ✅ Press landing page to crawl.
# NOTE(review): the earlier comment here named Kookmin Ilbo, but the output
# filenames are tagged '중앙' (JoongAng) — confirm that press id 025 is the
# intended outlet.
start_url = 'https://media.naver.com/press/025?sid=154'

# Upper bound on the number of comments kept per article.
MAX_COMMENTS_PER_ARTICLE = 60


def _text_or_default(waiter, css_selector, default):
    """Return the stripped text of the first element matching a CSS selector.

    Args:
        waiter: WebDriverWait used to wait for the element to be present.
        css_selector: CSS selector identifying the element.
        default: Value returned when the wait or the text lookup fails.

    Returns:
        The element's stripped text, or ``default`` on any ordinary failure.
        KeyboardInterrupt / SystemExit are deliberately not swallowed so the
        crawl can still be stopped by the user.
    """
    try:
        element = waiter.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))
        return element.text.strip()
    except Exception:
        return default


# Everything that touches the browser runs inside try/finally so that the
# Chrome process is always shut down, even when the crawl fails part-way.
try:
    driver.get(start_url)

    # Keep scrolling to the bottom until the page height stops growing,
    # i.e. no further items are appended to the list.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # give newly appended items time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # ✅ Collect article links
    articles = wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'li.press_edit_news_item a.press_edit_news_link')))
    # Drop anchors without an href (driver.get(None) would abort the run) and
    # de-duplicate while preserving page order so no article is crawled twice.
    hrefs = (anchor.get_attribute('href') for anchor in articles)
    article_links = list(dict.fromkeys(href for href in hrefs if href))
    print(f'총 기사 개수: {len(article_links)}')

    # ✅ Crawl each article
    for article_link in article_links:
        driver.get(article_link)
        time.sleep(2)

        # ✅ Article title
        title = _text_or_default(wait, 'h2#title_area', "제목 없음")

        # Skip articles whose title mentions none of the candidates.
        if not any(name in title for name in candidates):
            continue

        # ✅ Article publication time
        article_date = _text_or_default(
            wait,
            'span.media_end_head_info_datestamp_time._ARTICLE_DATE_TIME',
            "작성일자 없음")

        # ✅ Article body
        article_body = _text_or_default(
            wait, 'div.newsct_article._article_body', "본문 없음")

        # ✅ Open the comment page and collect comments.
        # The lists are created before the try so that comments gathered
        # before a mid-pagination failure are kept rather than thrown away.
        total_comments = []
        comment_times = []
        seen_comments = set()
        try:
            comment_btn = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'a._COMMENT_COUNT_VIEW')))
            comment_href = comment_btn.get_attribute('href')
            # A site-relative href is turned into an absolute URL.
            comment_url = ('https://n.news.naver.com' + comment_href
                           if comment_href.startswith('/') else comment_href)

            driver.get(comment_url)
            time.sleep(2)

            while True:
                comment_elements = driver.find_elements(
                    By.CSS_SELECTOR, 'span.u_cbox_contents')
                time_elements = driver.find_elements(
                    By.CSS_SELECTOR, 'span.u_cbox_date')

                # NOTE(review): texts and timestamps are paired purely by
                # position; if the page ever renders a comment without one of
                # the two spans the pairing would drift — confirm against the
                # live DOM.
                for elem, time_elem in zip(comment_elements, time_elements):
                    comment_text = elem.text.strip()
                    comment_time = time_elem.text.strip()

                    # Each pass re-reads the whole list, so already-seen
                    # texts are skipped.
                    if comment_text and comment_text not in seen_comments:
                        seen_comments.add(comment_text)
                        total_comments.append(comment_text)
                        comment_times.append(comment_time)

                    if len(total_comments) >= MAX_COMMENTS_PER_ARTICLE:
                        break

                # Enough comments: no need for another "more" click and sleep.
                if len(total_comments) >= MAX_COMMENTS_PER_ARTICLE:
                    break

                try:
                    driver.find_element(
                        By.CSS_SELECTOR, 'a.u_cbox_btn_more').click()
                    time.sleep(2)
                except Exception:
                    # No usable "more" button: all comments are loaded.
                    break

        except Exception:
            # Fall back to the placeholder row only when nothing was
            # collected; otherwise keep the partial result and its URL.
            if not total_comments:
                total_comments = ['댓글 없음']
                comment_times = ['댓글 시간 없음']
                comment_url = '댓글 페이지 없음'

        # ✅ Label each comment and store the row
        for comment, comment_time in zip(total_comments, comment_times):
            text_combined = title + " " + comment
            # First candidate (in list order) found in title + comment wins.
            label = next(
                (name for name in candidates if name in text_combined), '기타')

            row = [title, article_date, article_body, comment,
                   comment_time, comment_url, label]
            csv_data_by_candidate.setdefault(label, []).append(row)

        time.sleep(2)
finally:
    # ✅ Shut down Chrome
    driver.quit()

# ✅ Write one CSV file per label.
# The header row is followed by every row collected for that label.
# utf-8-sig writes a BOM at the start of each file.
header_row = ['기사 제목', '기사 작성일', '기사 본문', '댓글', '댓글 작성일', '댓글 링크', '라벨']
for label in csv_data_by_candidate:
    filename = f'{now}_중앙_{label}_News_Labeled.csv'
    with open(filename, mode='w', newline='', encoding='utf-8-sig') as out_file:
        csv.writer(out_file).writerows(
            [header_row, *csv_data_by_candidate[label]])
    print(f'{label} 후보 CSV 저장 완료: {filename}')


총 기사 개수: 299
