In [3]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import random


def get_search_interval():
    current_time = datetime.now()
    one_day_ago = current_time - timedelta(days=1)
    current_time_str = current_time.strftime("%Y.%m.%d.%H.%M")
    one_day_ago_str = one_day_ago.strftime("%Y.%m.%d.%H.%M")
    return current_time_str, one_day_ago_str


def make_target_url(search_keyword):
    date_end, date_start = get_search_interval()
    target_url = (
        f"https://search.naver.com/search.naver?where=news&query=%22{search_keyword}%22"
        f"&sm=tab_opt&sort=0&photo=0&field=0&pd=4&ds={date_start}&de={date_end}"
        f"&docid=&related=0&mynews=0&office_type=0&office_section_code=0"
        f"&news_office_checked=&nso=so%3Ar%2Cp%3A1d&is_sug_officeid=0&office_category=0"
        f"&service_area=0"
    )
    return target_url


def fetch_news(target_url):
    # User-Agent 목록
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)  AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)  AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X)  AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    ]

    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "ko-KR,ko;q=0.8,en-US;q=0.5,en;q=0.3",
    }

    try:
        response = requests.get(target_url, headers=headers)
        print(response)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching news: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = []

    for item in soup.find_all("div", class_="news_area"):
        print("=" * 100)
        print(item)
        title_tag = item.find("a", class_="news_tit")
        desc_tag = item.find("div", class_="news_dsc")

        if title_tag and desc_tag:
            title = title_tag.get("title", "").strip()
            description = desc_tag.get_text(strip=True)
            url = title_tag.get("href", "").strip()
            articles.append((title, description, url))

    return articles


In [4]:
# 더 안정적인 예시
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)  AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)  AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
]
headers = {
    "User-Agent": random.choice(user_agents),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",  # 추가
    "Connection": "keep-alive",              # 추가
    "Referer": "https://www.google.com/",    # 추가 (일반적인 외부 레퍼러)
}

In [5]:
keyword = "한국장학재단"
target_url = make_target_url(keyword)
print(target_url)
articles = fetch_news(target_url)
print(articles)

https://search.naver.com/search.naver?where=news&query=%22한국장학재단%22&sm=tab_opt&sort=0&photo=0&field=0&pd=4&ds=2025.05.08.14.25&de=2025.05.09.14.25&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so%3Ar%2Cp%3A1d&is_sug_officeid=0&office_category=0&service_area=0
<Response [200]>
[]


In [6]:
fetch_news("https://naver.com")

<Response [200]>


[]

In [7]:
url = "https://www.naver.com/"
response = requests.get(url)

In [8]:
response

<Response [200]>

In [9]:
import asyncio
import json
from playwright.async_api import async_playwright

async def fetch_html(url: str) -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, wait_until="networkidle")  # JS 렌더링 끝까지 대기
        html = await page.content()
        await browser.close()
        return html


url = target_url
html = await fetch_html(url)
#articles = fetch_news(target_url)

In [26]:
from bs4 import BeautifulSoup, NavigableString
import re

# -------- 핵심 파서 -------- #
def parse_naver_news_list(html: str):
    """
    네이버 통합검색(뉴스) html에서
    기사 제목과 내용 스니펫을 추출한다.

    Returns
    -------
    list[dict]  # [{'title': ..., 'content': ...}, ...]
    """
    soup = BeautifulSoup(html, "html.parser")
    
    # 1) 각 기사 블록(press/item) 단위로 분리
    #    → class 에 'fender-news-item-list-tab' 가 달려 있음
    itemss = soup.select(".fender-news-item-list-tab")
    
    result = []
    for items in itemss:
        for item in items:
            # 2) 제목
            title_tag = item.select_one(
                '.sds-comps-text-type-headline1'
            )
            if not title_tag:        # 방어 코드
                continue
            title = _clean_text(title_tag.get_text(" ", strip=True))
    
            # 3) 스니펫(본문)
            content_tag = item.select_one(
                '.sds-comps-text-type-body1'
            )
            content = _clean_text(
                content_tag.get_text(" ", strip=True) if content_tag else ""
            )
            
            
            link_tag = item.select_one('a.n6AJosQA40hUOAe_Vplg.ZtHl2s0jtiC0IevYMD5G')
            link = _clean_text(
                link_tag.get_text(" ", strip=True) if link_tag else ""
            )
            print(link_tag)
    
            
            # 4) 결과 누적
            result.append({"title": title, "content": content})

    return result


# -------- 유틸: <mark>제거 등 ---------------- #
def _clean_text(text: str) -> str:
    """
    - 줄바꿈·여백 정리
    - 연속 공백 1칸으로
    - <mark> 등 검색 하이라이트 제거
    """
    # &nbsp; → 공백
    text = text.replace("\xa0", " ")

    # 줄바꿈, 탭 → 공백
    text = re.sub(r"[\r\n\t]+", " ", text)

    # 여러 칸 공백 → 1칸
    text = re.sub(r"\s{2,}", " ", text)

    return text.strip()


# -------- 사용 예시 (Playwright) -------------- #

async with async_playwright() as p:
    browser = await p.chromium.launch(headless=True)
    page = await browser.new_page()
    await page.goto(target_url)
    html = await page.content()
    news_list = parse_naver_news_list(html)
    for n in news_list:
        print(n)
        break
#        print(n["title"])
#        print(n["href"])
#        print("   ↳", n["content"][:80], "…")


<a class="n6AJosQA40hUOAe_Vplg ZtHl2s0jtiC0IevYMD5G" href="https://www.imaeil.com/page/view/2025050910510679114" nocr="1" target="_blank"><span class="sds-comps-text sds-comps-text-ellipsis-3 sds-comps-text-type-body1"><mark>한국장학재단</mark>이 감사원 자체감사활동 심사에서 3년 연속 최고등급을 받았다. 디지털 감사기법과 내부통제 고도화 등 선제적 노력으로 모든 심사항목에서 우수한 평가를 끌어냈다. <mark>한국장학재단</mark>(이사장 배병일)은 감사원이 주관한 2025년 자체감사활동 심사에서 최고등급인 A등급을 받아 3년 연속 최상위...</span></a>
<a class="n6AJosQA40hUOAe_Vplg ZtHl2s0jtiC0IevYMD5G" href="https://view.asiae.co.kr/article/2025050816153562973" nocr="1" target="_blank"><span class="sds-comps-text sds-comps-text-ellipsis-3 sds-comps-text-type-body1"><mark>한국장학재단</mark>(이사장 배병일)은 2025년 푸른등대 삼성기부장학생 1200명을 선발해 생활비 장학금 36억원을 지원할 예정임을 8일 밝혔다. '푸른등대 삼성기부장학금'은 교육기회의 양극화 해소를 위해 교육부가 기부받은 삼성기부금을 토대로 교육 소외계층 대학생에게 생활비를 지원하는 장학금이다....</span></a>
<a class="n6AJosQA40hUOAe_Vplg ZtHl2s0jtiC0IevYMD5G" href="https://dhnews.co.kr/news/view/1065592856211555" nocr="1" target="_blank"><span class="sds-comps-text sds-comp

In [34]:
keyword = "한국장학재단"
target_url = make_target_url(keyword)

print(target_url)

async with async_playwright() as p:
    browser = await p.chromium.launch(headless=True)
    page = await browser.new_page()
    await page.goto(target_url)
    html = await page.content()

    soup = BeautifulSoup(html, "html.parser")
    
    # 1) 각 기사 블록(press/item) 단위로 분리
    #    → class 에 'fender-news-item-list-tab' 가 달려 있음
    itemss = soup.select(".fender-news-item-list-tab")
    print(itemss)

https://search.naver.com/search.naver?where=news&query=%22한국장학재단%22&sm=tab_opt&sort=0&photo=0&field=0&pd=4&ds=2025.05.08.16.03&de=2025.05.09.16.03&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so%3Ar%2Cp%3A1d&is_sug_officeid=0&office_category=0&service_area=0
[<div class="sds-comps-vertical-layout sds-comps-full-layout fender-news-item-list-tab" data-template-id="layout" data-template-type="vertical"><div class="sds-comps-vertical-layout sds-comps-full-layout dZQQMujvOqnxG1bUQsg6"><div class="sds-comps-horizontal-layout sds-comps-full-layout sds-comps-profile type-basic size-lg title-color-g10 AX25eoZU6YHDoeuv19QL" data-sds-comp="Profile"><div class="sds-comps-horizontal-layout sds-comps-inline-layout sds-comps-profile-source"><div class="sds-comps-horizontal-layout sds-comps-inline-layout sds-comps-profile-source-thumb"><a class="n6AJosQA40hUOAe_Vplg" href="https://media.naver.com/press/088" nocr="1" target="_blank"><div class="sds-comps-horizo

In [51]:
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=True)
    page = await browser.new_page()
    await page.goto(target_url)
    html = await page.content()

    soup = BeautifulSoup(html, "html.parser")
    
    news_blocks = soup.find_all('div', class_='sds-comps-vertical-layout sds-comps-full-layout AgmDmsKtWKXsclHTvTwY')
    
    print(news_blocks)
    
    for block in news_blocks[:
        # 제목
        title_tag = block.find('span', class_='sds-comps-text-type-body2')
        title = title_tag.get_text(strip=True) if title_tag else None
        print(title)
        # 링크
        link_tag = block.find('a', href=True)
        href = link_tag['href'] if link_tag else None
    
        # 내용: 종종 제목 외에 요약 내용이 따로 있을 수 있음
        content_tag = block.find_next('span', class_='sds-comps-text-type-body1')
        content = content_tag.get_text(strip=True) if content_tag else None
    
        if title and href:
            news_items.append({
                'title': title,
                'content': content,
                'href': href
            })
    
    # 결과 출력
    for item in news_items:
        print(item)

[]


In [46]:
news_items

[]

In [43]:
itemss

[<div class="sds-comps-vertical-layout sds-comps-full-layout fender-news-item-list-tab" data-template-id="layout" data-template-type="vertical"><div class="sds-comps-vertical-layout sds-comps-full-layout dZQQMujvOqnxG1bUQsg6"><div class="sds-comps-horizontal-layout sds-comps-full-layout sds-comps-profile type-basic size-lg title-color-g10 AX25eoZU6YHDoeuv19QL" data-sds-comp="Profile"><div class="sds-comps-horizontal-layout sds-comps-inline-layout sds-comps-profile-source"><div class="sds-comps-horizontal-layout sds-comps-inline-layout sds-comps-profile-source-thumb"><a class="n6AJosQA40hUOAe_Vplg" href="https://media.naver.com/press/088" nocr="1" target="_blank"><div class="sds-comps-horizontal-layout sds-comps-inline-layout sds-comps-profile-thumbnail type-basic size-lg" data-sds-comp="ProfileThumbnail" height="24px" width="24px"><div class="sds-comps-base-layout sds-comps-inline-layout sds-comps-image sds-comps-image-circle fit-cover" data-dimmed="3%" data-sds-comp="CircleImage" styl