In [6]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Selenium setup: headless Chrome presenting a desktop user agent.
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Google News search URL (Czech interface language and region).
search_url = "https://www.google.com/search?q=Jaderná+elektrárna+Dukovany&hl=cs&gl=cz&tbm=nws"

try:
    driver.get(search_url)
    time.sleep(5)  # fixed wait for the page to finish loading

    # Parsing copies everything we need out of the browser, so the driver
    # can be released right after this line.
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always shut the browser down, even when loading the page fails;
    # otherwise the headless Chrome process is left running.
    driver.quit()

# Inspect the HTML structure: dump tag attributes to discover usable selectors.
print("=== 기사 관련 div 태그 찾기 ===")
for div in soup.find_all("div"):
    # Only divs whose text contains the literal word "article".
    if "article" in div.get_text():
        print(div.attrs)

print("=== a 태그 찾기 (링크) ===")
for a in soup.find_all("a"):
    if "href" in a.attrs:
        print(a.attrs)

print("=== 제목 태그 찾기 ===")
for title in soup.find_all(["h3", "div"]):
    print(title.attrs)


=== 기사 관련 div 태그 찾기 ===
=== a 태그 찾기 (링크) ===
{'href': '/search?q=Jadern%C3%A1+elektr%C3%A1rna+Dukovany&sca_esv=b597786ab811f977&hl=cs&gl=cz&tbm=nws&gbv=1&sei=FjbJZ8nfLMHj2roP14nUiA4'}
{'jsname': 'KI37ad', 'class': ['gyPpGe'], 'href': 'https://support.google.com/websearch/answer/181196?hl=cs', 'ping': '/url?sa=t&source=web&rct=j&url=https://support.google.com/websearch/answer/181196%3Fhl%3Dcs&ved=0ahUKEwjJwtHs3_SLAxXBsVYBHdcEFeEQwcMDCAY&opi=89978449'}
{'href': 'https://www.google.com/webhp?hl=cs&sa=X&ved=0ahUKEwjJwtHs3_SLAxXBsVYBHdcEFeEQPAgI', 'title': 'Jít na hlavní stránku Google', 'id': 'logo', 'data-hveid': '8'}
{'class': ['FgNLaf'], 'href': 'https://www.google.com/webhp?hl=cs&ictx=0&sa=X&ved=0ahUKEwjJwtHs3_SLAxXBsVYBHdcEFeEQpYkNCAo', 'id': '_FjbJZ8nfLMHj2roP14nUiA4_1', 'data-hveid': '10'}
{'class': ['gb_B'], 'aria-label': 'Aplikace Google', 'href': 'https://www.google.co.kr/intl/cs/about/products', 'aria-expanded': 'false', 'role': 'button', 'tabindex': '0'}
{'class': ['gb_A'], 'ar

In [9]:
import time
import json
import random
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm

# Search term and country restriction.
search_query = "Jaderná elektrárna Dukovany"
country_code = "countryCZ"

# Date ranges to crawl, as (start, end) pairs in three-month steps.
date_ranges = [
    ("2022-01-01", "2022-03-31")  # only one range configured, for testing
]

# Google News search URL template.
# The `cr` value is taken from `country_code` so that changing the setting
# above actually changes the request (it was previously hard-coded here).
# Remaining placeholders: {query}, {start_date}, {end_date}, {start}.
GOOGLE_NEWS_URL_TEMPLATE = (
    "https://www.google.com/search?q={query}&cr=" + country_code
    + "&hl=cs&gl=cz&as_qdr=all"
    "&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}&tbm=nws&start={start}"
)

# Selenium setup: apply the Chrome command-line flags in order, then start
# a driver whose binary is resolved by webdriver_manager.
chrome_options = Options()
for _chrome_flag in (
    "--headless=new",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--log-level=3",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
):
    chrome_options.add_argument(_chrome_flag)
del _chrome_flag  # keep the module namespace identical to before

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

def fetch_google_news(start_date, end_date):
    """Crawl Google News search results for one date range using Selenium.

    Pages through the results ten at a time (at most 10 pages) with the
    module-level ``driver``, ``search_query`` and
    ``GOOGLE_NEWS_URL_TEMPLATE``, and stops at the first page that yields
    no result cards.

    Args:
        start_date: Lower bound, substituted verbatim into ``cd_min``.
        end_date: Upper bound, substituted verbatim into ``cd_max``.

    Returns:
        A list of dicts with the keys ``"title"``, ``"link"`` and
        ``"date"``. A field whose element is not found is ``"N/A"``.
    """
    from urllib.parse import quote_plus

    # A space inside a CSS selector is a descendant combinator, so pasting a
    # raw ``class`` attribute ("a b c") after "div." never matches. Elements
    # carrying several classes need them chained with dots.
    # NOTE(review): class names are copied as-is from the original selectors;
    # verify them against the live Google markup.
    title_selector = "div.n0jPhd.ynAwRc.Mbeu0.nDgy9d"
    date_selector = "div.OSrXXb.rbYSKb.LfVVr"

    news_list = []

    # Percent-encode the query so non-ASCII letters and reserved characters
    # (&, #, ...) cannot corrupt the URL; spaces still become "+".
    encoded_query = quote_plus(search_query)

    # NOTE(review): the dates are inserted exactly as given (ISO strings in
    # this file); confirm this is a format Google's `cdr` filter accepts.
    for page in range(0, 100, 10):  # 10 results per page, up to 100 results
        search_url = GOOGLE_NEWS_URL_TEMPLATE.format(
            query=encoded_query,
            start_date=start_date, end_date=end_date, start=page
        )

        driver.get(search_url)
        time.sleep(random.uniform(3, 6))  # random delay between page loads

        # Read the page once so the debug dump and the parsed tree agree.
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        # Debugging aid: show the beginning of the fetched page.
        print(page_source[:1000])

        articles = soup.select("div.SoaBEF")  # one element per result card
        print(f"🔍 검색 결과 수: {len(articles)}개 (페이지 {page//10+1})")

        if not articles:
            break  # no more results, stop paging

        for article in articles:
            try:
                # Article title
                title_element = article.select_one(title_selector)
                title = title_element.text.strip() if title_element else "N/A"

                # Article link
                link_element = article.select_one("a.VKoAF")
                link = link_element["href"] if link_element else "N/A"
                if link.startswith("/url?"):
                    # Turn Google's relative redirect into an absolute URL.
                    link = "https://www.google.com" + link

                # Article date
                date_element = article.select_one(date_selector)
                date = date_element.text.strip() if date_element else "N/A"

                news_list.append({
                    "title": title,
                    "link": link,
                    "date": date
                })

            except Exception as e:
                # Best effort: a malformed card is reported and skipped.
                print(f"⚠ 오류 발생 (기사 크롤링 실패): {e}")

    return news_list

def fetch_news_content(news_list, timeout=15):
    """Download each article page and attach its body text.

    Each dict in ``news_list`` is updated in place with a ``"content"`` key
    and appended to the result. Articles whose download or parsing raises
    are reported and left out of the result.

    Args:
        news_list: Iterable of dicts that each have ``"link"`` and
            ``"title"`` keys.
        timeout: Seconds to wait for each HTTP request before giving up, so
            that one unresponsive server cannot stall the whole crawl.

    Returns:
        A list of the successfully processed dicts. ``"content"`` is the
        text of the first matching body selector, or ``"N/A"`` when none
        matches or the text is empty.
    """
    news_data = []

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}

    # Candidate containers for the article body, tried in order.
    content_selectors = ("article", "div#newsct_article", "div#articleBodyContents", "div.article_view")

    for news in tqdm(news_list, desc="📰 뉴스 본문 크롤링 진행"):
        try:
            response = requests.get(news["link"], headers=headers, timeout=timeout)

            # Hand BeautifulSoup the raw bytes so it can detect the encoding
            # from the document itself; `response.text` relies on the HTTP
            # header and can garble accented text when no charset is sent.
            soup = BeautifulSoup(response.content, "html.parser")

            content = ""
            for selector in content_selectors:
                matches = soup.select(selector)
                if matches:
                    content = " ".join(node.text.strip() for node in matches)
                    break

            # Store the result on the incoming dict (mutates the caller's item).
            news["content"] = content.strip() if content else "N/A"
            news_data.append(news)

            print(f"✅ 크롤링 완료: {news['title'][:50]}...")

            time.sleep(random.uniform(2, 5))  # random gap between requests

        except Exception as e:
            # Best effort: report the failure and continue with the next one.
            print(f"⚠ 오류 발생 (뉴스 크롤링 실패): {news['link']}, 오류: {e}")

    return news_data

# Run the Google News crawl once per configured date range, then save.
all_news = []
try:
    for start_date, end_date in date_ranges:
        print(f"📅 {start_date} ~ {end_date} 기간 크롤링 시작")
        news_list = fetch_google_news(start_date, end_date)
        news_data = fetch_news_content(news_list)
        all_news.extend(news_data)

    # Save everything as UTF-8 JSON, keeping non-ASCII characters readable.
    output_file = "google_news_czech.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_news, f, ensure_ascii=False, indent=4)

    print(f"✅ 총 {len(all_news)}개의 뉴스 기사를 저장했습니다. (파일명: {output_file})")
finally:
    # Shut Selenium down even when crawling or saving raises, so the
    # headless Chrome process is not left running.
    driver.quit()


📅 2022-01-01 ~ 2022-03-31 기간 크롤링 시작
<html itemscope="" itemtype="http://schema.org/SearchResultsPage" lang="cs"><head><meta charset="UTF-8"><meta content="origin" name="referrer"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Jaderná elektrárna Dukovany - Hledat Googlem</title><script nonce="">window._hst=Date.now();</script><script nonce="">(function(){var _g={kEI:'_zfJZ4vNLtvp1e8Ps9-E6Q4',kEXPI:'0,3322466,377830,1088,448529,104243,64702,599360,5281865,13,11328,8835104,3,1,3,5,3,10,53415763,8044,97010,24745,93627,14941,36104,4956,9658,1814,2816,1980,24660,6512,1464,1723,13899,7966,6899,3938,1822,265,69,2199,10300,419,28,2877,3261,5980,19,439,214,209,1021,3090,6162,1278,2098,1225,987,984,2083,672,3542,499,897,1558,1063,7,580,363,411,1311,499,422,605,26,467,1010,2942,196,907,577,485,96,540,450,819,45,66,269,703,407,60,613,105,75,140,869,1043,793,1011,340,899,823,834,511,750,1485,215,256,1073,1196,1308,21341070,5032,1992,6191,459,8537

📰 뉴스 본문 크롤링 진행: 0it [00:00, ?it/s]

✅ 총 0개의 뉴스 기사를 저장했습니다. (파일명: google_news_czech.json)



