In [16]:
!pip install selenium webdriver_manager BeautifulSoup4
!pip install chromedriver_autoinstaller requests mysql-connector-python



In [17]:
import re
import os
import time
import logging
import subprocess
import mysql.connector
from selenium import webdriver
import chromedriver_autoinstaller
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from datetime import datetime, timedelta

In [18]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chromedriver_autoinstaller.install()

driver = webdriver.Chrome(options=chrome_options)

In [19]:
db_config = {
	'host': 'localhost',
	'user': 'root',
	'password': '1234',
	'database': 'news_sentiment_db'
}

def connect_db():
		return mysql.connector.connect(**db_config)

connect_db()

<mysql.connector.connection_cext.CMySQLConnection at 0x10bbf1a90>

In [20]:
KEYWORDS = ["SKT", "해킹", "유심"]
PUBLISHER = ["연합뉴스", "뉴시스", "서울경제", "데일리안", "뉴스1", "디지털데일리", "한국경제", "파이낸셜뉴스"]

In [21]:
# 로그 디렉토리 생성
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)

# Logger 설정
logger = logging.getLogger("comment_logger")
logger.setLevel(logging.INFO)

# 중복 핸들러 방지
if not logger.handlers:
    file_handler = logging.FileHandler(
        os.path.join(log_dir, "comments.log"), encoding="utf-8"
    )
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(formatter)

    logger.addHandler(file_handler)

In [22]:
def save_articles_to_db(articles):
    conn = connect_db()
    cursor = conn.cursor()
    sql = """
        INSERT INTO news_articles (article_id, title, summary, publisher, publish_date, url, comment_count, collected_at)
        VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())
    """
    cursor.executemany(
        sql,
        [
            (
                article["article_id"],
                article["title"],
                article["summary"],
                article["publisher"],
                article["publish_date"],
                article["url"],
                article["comment_count"],
            )
            for article in articles
        ],
    )
    conn.commit()
    conn.close()

def save_comments_to_db(article_id, comments):
    conn = connect_db()
    cursor = conn.cursor()
    sql = """
        INSERT INTO news_comments (comment_id, article_id, nickname, content, like_count, dislike_count, comment_date, collected_at)
        VALUES (%s, %s, %s, %s, %s, %s, %s, NOW())
    """
    cursor.executemany(
        sql,
        [
            (
                c["comment_id"],  # 댓글 ID
                article_id,  # 기사 ID
                c["nickname"],
                c["content"],
                c["like_count"],
                c["dislike_count"],
                c["comment_date"],
            )
            for c in comments if c
        ],
    )
    conn.commit()
    conn.close()

In [23]:
# 기사 목록 더보기 클릭
def click_more_article():
    while True:
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
            more_btn = driver.find_element(
                By.CSS_SELECTOR,
                "#newsct > div.section_latest > div > div.section_more > a",
            )
            more_btn.click()
            logger.info("기사 더보기 클릭 성공")
            time.sleep(3)
        except Exception:
            logger.info("더 이상 기사 더보기 버튼이 없습니다.")
            break


# 기사 내 댓글 버튼 클릭
def click_to_comment():
    try:
        button = driver.find_element(By.CSS_SELECTOR, "a#comment_count")
        button.click()
        logger.info("댓글 버튼 클릭 성공!")
        time.sleep(10)
    except Exception:
        logger.warning("댓글 버튼을 찾을 수 없습니다.")


# 댓글 모음 내 더보기 클릭
def click_more_comment():
    while True:
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
            more_btn = driver.find_element(By.CSS_SELECTOR, "a.u_cbox_btn_more")
            more_btn.click()
            logger.info("댓글 더보기 클릭 성공")
            time.sleep(3)
        except Exception:
            logger.info("더 이상 댓글 더보기 버튼이 없습니다.")
            break


# 대댓글 보기 버튼
def click_child_comment_button():
    reply_count_tag_list = driver.find_elements(By.CSS_SELECTOR, "a.u_cbox_btn_reply")
    for reply_count_tag in reply_count_tag_list:
        try:
            reply_cnt_span = reply_count_tag.find_element(
                By.CSS_SELECTOR, "span.u_cbox_reply_cnt"
            )
            reply_count = int(reply_cnt_span.text.strip())

            if reply_count > 0:
                driver.execute_script("arguments[0].click();", reply_count_tag)
                logger.info(f"대댓글 {reply_count}개 버튼 클릭 성공")
                time.sleep(3)
        except Exception as e:
            logger.warning(f"대댓글 클릭 중 에러: {e}")
            continue

In [24]:
# def parse_publish_date(raw_date_str):
#     try:
#         match = re.match(r'(\d+)일전', raw_date_str)
#         if match:
#             days_ago = int(match.group(1))
#             return (datetime.now() - timedelta(days=days_ago)).date()

#         match = re.match(r'(\d{4})\.(\d{2})\.(\d{2})\.', raw_date_str)
#         if match:
#             year, month, day = map(int, match.groups())
#             return datetime(year, month, day).date()

#         print("날짜 형식 미매칭:", raw_date_str)
#         return None

#     except Exception as e:
#         print("날짜 파싱 실패:", raw_date_str, e)
#         return None


def convert_comment_date(raw_date: str) -> str:
    # '2025-05-08T22:27:22+0900' ➝ '2025-05-08 22:27:22'
    dt = datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%S%z")
    return dt.strftime("%Y-%m-%d %H:%M:%S")

In [25]:
# 기사 목록 크롤링
def get_article_data(article, publish_date):
    article_data = {}

    try:
        article_data["title"] = article.find_element(
            By.CSS_SELECTOR, "a.sa_text_title > strong.sa_text_strong"
        ).text.strip()
        article_data["summary"] = article.find_element(
            By.CSS_SELECTOR, "div.sa_text_lede"
        ).text.strip()
        article_data["url"] = article.find_element(
            By.CSS_SELECTOR, "a.sa_text_title"
        ).get_attribute("href")
        article_data["publisher"] = article.find_element(
            By.CSS_SELECTOR, "div.sa_text_press"
        ).text.strip()
        article_data["publish_date"] = publish_date

        # 기사 날짜 저장 시 사용하려고 했지만, ~일 전으로 뜨는 경우 값이 굳이 계산이 필요 없을 것 같아서 파라미터로 넘기는 날짜를 사용
        # raw_date = article.find_element(By.CSS_SELECTOR, 'div.sa_text_datetime > b').text.strip()
        # article_data['publish_date'] = parse_publish_date(raw_date)

        # https://n.news.naver.com/mnews/article/003/0013227763 ➔ 0030013227763를 Primary Key로 사용
        try:
            parts = article_data["url"].split("/")
            press_id = parts[-2]  # '003'
            article_seq = parts[-1]  # '0013227763'
            article_id = press_id + article_seq  # '0030013227763'
            article_data["article_id"] = article_id
            logger.info(f"기사 ID 추출 성공: {article_id}")
        except Exception as e:
            logger.warning(f"article_id 추출 실패: {e}")
            article_data["article_id"] = None

        try:
            comment_element = article.find_element(
                By.CSS_SELECTOR, "div.sa_text_info_right > a.sa_text_cmt"
            )
            if comment_element.is_displayed():
                comment_text = comment_element.text.strip()
                comment_number = int("".join(filter(str.isdigit, comment_text)))
                article_data["comment_count"] = comment_number
                logger.info(f"댓글 수 추출 성공: {comment_number}")
            else:
                article_data["comment_count"] = 0
                logger.info("댓글 수 없음")
        except Exception as e:
            article_data["comment_count"] = 0
            logger.warning(f"댓글 수 추출 실패: {e}")

        return article_data

    except Exception as e:
        logger.error(f"기사 데이터 추출 중 오류 발생: {e}")
        return None

In [26]:
# 댓글 데이터 크롤링
def get_comment_data(comment_element, article_id):
    comment_data = {}

    try:
        data_info = comment_element.get_attribute("data-info")
        
        comment_no = data_info.split("commentNo:'")[1].split("'")[0]
        nickname = comment_element.find_element(
            By.CSS_SELECTOR, "span.u_cbox_nick"
        ).text.strip()
        content_tag = comment_element.find_element(
            By.CSS_SELECTOR, "span.u_cbox_contents"
        )
        content = content_tag.text.strip()
        comment_time = comment_element.find_element(
            By.CSS_SELECTOR, "span.u_cbox_date"
        ).get_attribute("data-value")
        comment_time = convert_comment_date(comment_time)
        like_count = int(
            comment_element.find_element(By.CSS_SELECTOR, "em.u_cbox_cnt_recomm").text
        )
        dislike_count = int(
            comment_element.find_element(By.CSS_SELECTOR, "em.u_cbox_cnt_unrecomm").text
        )

        comment_data = {
            "comment_id": comment_no,
            "article_id": article_id,
            "nickname": nickname,
            "content": content,
            "like_count": like_count,
            "dislike_count": dislike_count,
            "comment_date": comment_time,
        }

        return comment_data

    except Exception as e:
        logger.error(f"에러 발생: {e}")
        return None, None

In [27]:
def open_comment_page(url_input, article_id):
    comment_to_save = []

    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    driver.get(url_input)
    time.sleep(3)

    click_to_comment()
    click_more_comment()
    click_child_comment_button()

    comment_list = driver.find_elements(By.CSS_SELECTOR, "li.u_cbox_comment")
    for comment_element in comment_list:
        # 댓글 내용이 있는지 확인
        delete_elements = comment_element.find_elements(By.CSS_SELECTOR, 'span.u_cbox_delete_contents')
        if delete_elements and (
            "작성자에 의해 삭제된 댓글입니다." in delete_elements[0].text
            or "작성자에 의해 삭제된 답글입니다." in delete_elements[0].text
            or "정보통신망법에 따른 권리침해 요청이 있어, 게시중단 되었습니다." in delete_elements[0].text
        ):
            logger.info(f"[{article_id}] 댓글/답글 본문 없음, 건너뜀")
            continue
        cleanbot_elements = comment_element.find_elements(By.CSS_SELECTOR, 'span.u_cbox_cleanbot_contents')
        if cleanbot_elements:
            logger.info(f"[{article_id}] 클린봇 필터된 댓글 감지, 건너뜀")
            continue

        comment_data = get_comment_data(comment_element, article_id)
        if comment_data:
            comment_to_save.append(comment_data)
            
    save_comments_to_db(article_id, comment_to_save)

    driver.close()
    driver.switch_to.window(driver.window_handles[0])

In [None]:
def crawl_by_date(date_str):
    search_url = f"https://news.naver.com/breakingnews/section/105/732?date={date_str}"
    driver.get(search_url)
    logger.info(f"크롤링 시작: {search_url}")
    time.sleep(2)

    click_more_article()

    data_to_save_article = []
    articles = driver.find_elements(By.CSS_SELECTOR, "li.sa_item")
    logger.info(f"총 {len(articles)}개의 기사 로드 완료")

    for art in articles:
        article_data = get_article_data(art, date_str)

        if not article_data:
            logger.warning(f"기사 데이터 추출 실패: {art}")
            continue

        if not any(k in (article_data['title'] + article_data['summary']) for k in KEYWORDS):
            logger.info(f"키워드 필터링 제외: {article_data['title']}")
            continue

        if article_data['publisher'] not in PUBLISHER:
            logger.info(f"출판사 필터링 제외: {article_data['publisher']}")
            continue

        data_to_save_article.append(article_data)

    if data_to_save_article:
        logger.info(f"{len(data_to_save_article)}개의 기사가 필터링되어 저장됩니다.")
        save_articles_to_db(data_to_save_article)

    for article in data_to_save_article:
        if (article['comment_count'] > 0):
            open_comment_page(article['url'], article['article_id'])

def run_crawler_daily():
    today = datetime.today()
    yesterday = today - timedelta(days=1)
    date_str = yesterday.strftime('%Y%m%d')
    logger.info(f"어제 날짜({date_str})로 크롤링 시작")
    crawl_by_date(date_str)

# 처음 크롤링 시 코드 실행 날짜의 하루 전까지 크롤링
def run_crawler_from_april_22():
    start_date = datetime.strptime("2025-05-14", "%Y-%m-%d")
    end_date = datetime.today() - timedelta(days=1)  # 어제까지

    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime("%Y%m%d")
        try:
            logger.info(f"{date_str} 날짜로 크롤링 시작")
            crawl_by_date(date_str)
        except Exception as e:
            logger.error(f"{date_str} 날짜 크롤링 실패: {e}")
        current_date += timedelta(days=1)

In [29]:
def dump_mysql_database():
    now = datetime.now().strftime("%Y%m%d")
    backup_file = f"backups/backup_{now}.sql"

    os.makedirs("backups", exist_ok=True)

    try:
        result = subprocess.run(
            [
                "docker",
                "exec",
                "data_science_introduction_mysql",
                "mysqldump",
                "-u",
                "root",
                "-p1234",
                "news_sentiment_db",
            ],
            capture_output=True,
            text=True,
        )

        if result.returncode == 0:
            with open(backup_file, "w") as f:
                f.write(result.stdout)
            print(f"MySQL 백업 성공: {backup_file}")
        else:
            print(f"백업 실패: {result.stderr}")
    except Exception as e:
        print(f"백업 중 예외 발생: {e}")

In [None]:
def main():
	# run_crawler_daily()
	run_crawler_from_april_22()

if __name__ == "__main__":
	try:
		main()
	finally:
		dump_mysql_database()
		logging.shutdown()

INFO:comment_logger:20250422 날짜로 크롤링 시작
INFO:comment_logger:크롤링 시작: https://news.naver.com/breakingnews/section/105/732?date=20250422
INFO:comment_logger:기사 더보기 클릭 성공
INFO:comment_logger:더 이상 기사 더보기 버튼이 없습니다.
INFO:comment_logger:총 55개의 기사 로드 완료
INFO:comment_logger:기사 ID 추출 성공: 1380002195110
INFO:comment_logger:댓글 수 없음
INFO:comment_logger:키워드 필터링 제외: 법무법인 민후, 개인정보 분야 전문가 박영수 변호사 영입
INFO:comment_logger:기사 ID 추출 성공: 0140005339690
INFO:comment_logger:댓글 수 없음
INFO:comment_logger:키워드 필터링 제외: 라온시큐어, 2025 정보통신 유공 ‘대통령 표창’ 수상
INFO:comment_logger:기사 ID 추출 성공: 0300003305904
INFO:comment_logger:댓글 수 없음
INFO:comment_logger:출판사 필터링 제외: 전자신문
INFO:comment_logger:기사 ID 추출 성공: 0010015345379
INFO:comment_logger:댓글 수 추출 성공: 30
INFO:comment_logger:기사 ID 추출 성공: 0030013199462
INFO:comment_logger:댓글 수 없음
INFO:comment_logger:키워드 필터링 제외: "가상자산 사기피해 돈 찾아드려요"…KISA 사칭 피싱 주의보
INFO:comment_logger:기사 ID 추출 성공: 0030013199434
INFO:comment_logger:댓글 수 없음
INFO:comment_logger:키워드 필터링 제외: 조규곤 파수 대표 "AI는 자신감 넘치는 인턴…제약과 과제 명

MySQL 백업 성공: backups/backup_20250515.sql
