In [137]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
    WebDriverException,
    ElementClickInterceptedException,
)
from datetime import datetime, timedelta 
import time, re
import pandas as pd

In [138]:

# Number of listing pages to crawl; the commented-out line below is the
# externally configured alternative to the hard-coded value.
#MAX_PAGE = int(Variable.get("MAX_PAGE"))
MAX_PAGE = 2
# Legacy spelling of the same setting, kept for code that still refers to it.
# Aliased (instead of assigned separately) so the two names can never diverge.
MAXPAGE = MAX_PAGE
DRIVER_PATH = "/Users/kim-youngho/git/GeoNewsApt/notebook/chromedriver-mac-arm64/chromedriver"
# Today's date as a "YYYY.MM.DD" string (dot-separated, e.g. "2025.08.28"),
# the same form as the date text scraped from the listing page.
today_str = datetime.today().strftime("%Y.%m.%d")
hrefs = []
news_data = []
next_page_url = 'https://www.sedaily.com/NewsMain/GB/'  # listing base URL; append a page number to reach that page

In [139]:
options = Options()
# Run without a GUI (server/backend automation)
options.add_argument("--headless")
# Turn off GPU acceleration for stability
options.add_argument("--disable-gpu")
# Run Chrome without the sandbox to avoid permission errors (use with caution)
options.add_argument("--no-sandbox")

# Start the browser. A failure here is fatal: report it and re-raise, because
# without `driver` every following statement could only fail with a NameError.
try:
    service = Service(executable_path=DRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)
    wait = WebDriverWait(driver, 5)
except WebDriverException as e:
    print(f"[FATAL] Failed to initialize WebDriver: {e}")
    raise

# Load the landing page. On failure the browser is shut down and the error is
# re-raised so later code does not keep using a driver that has been quit.
try:
    driver.get("https://www.sedaily.com/v/NewsMain/GB")
except Exception as e:
    print(f"[FATAL] Failed to load initial page: {e}")
    driver.quit()
    raise

In [140]:
# Collect {'href', 'date'} records from each listing page.
# The list is rebuilt from scratch so re-running this cell does not append
# duplicate records to the results of a previous run.
news_data = []

for i in range(1, MAX_PAGE + 1):
    driver.get(f'https://www.sedaily.com/NewsMain/GB/{i}')
    # Fetch the <li> entries while the page is still loaded in the driver
    li_elements = driver.find_elements(By.CSS_SELECTOR, '#container > div > div.sub_left > div:nth-child(1) > ul > li')

    for li in li_elements:
        # Article link
        try:
            a_tag = li.find_element(By.CSS_SELECTOR, 'div.text_area > div.article_tit > a')
            href = a_tag.get_attribute('href')
        except NoSuchElementException:
            href = None

        # An entry without a link cannot be opened later (driver.get(None)
        # would fail), so it is skipped instead of being stored.
        if not href:
            continue

        # Date text; entries without a date element get the placeholder '방금전'
        try:
            date_span = li.find_element(By.CSS_SELECTOR, 'div.text_area > div.text_info > span.date')
            date = date_span.text
        except NoSuchElementException:
            date = '방금전'

        news_data.append({'href': href, 'date': date})

news_data

[{'href': 'https://www.sedaily.com/NewsView/2GWSW7X770/GB03',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSVV2CVH/GB01',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSV205QH/GB03',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSV9737X/GB03',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSV1LUQM/GB01',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSUIT69C/GB05',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSUOV3F2/GB05',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSUIFX2M/GB02',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSUB30RD/GB01',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSV24OC7/GB01',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSFO3DLH/GB07',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView

In [141]:
# Keep only the records whose listing date equals today's date string.
news_data = list(filter(lambda record: record['date'] == today_str, news_data))
news_data

[{'href': 'https://www.sedaily.com/NewsView/2GWSW7X770/GB03',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSVV2CVH/GB01',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSV205QH/GB03',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSV9737X/GB03',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSV1LUQM/GB01',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSUIT69C/GB05',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSUOV3F2/GB05',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSUIFX2M/GB02',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSUB30RD/GB01',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSV24OC7/GB01',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView/2GWSFO3DLH/GB07',
  'date': '2025.08.28'},
 {'href': 'https://www.sedaily.com/NewsView

In [142]:
# Open every collected article and attach its body text and publisher.
for news in news_data:
    news['publisher'] = '서울경제'
    try:
        driver.get(news['href'])
        time.sleep(3)
        article_body = driver.find_element(By.CSS_SELECTOR, 'div.article_view[itemprop="articleBody"]')
        # Add the article body to the record
        news['content'] = article_body.text
    except WebDriverException as e:
        # Covers a missing body element, timeouts and other driver errors
        # (these Selenium exceptions derive from WebDriverException).
        # One bad article must not abort the remaining ones, so record an
        # empty body and continue.
        print(f"[WARN] Failed to collect article body {news['href']}: {e}")
        news['content'] = ''


KeyboardInterrupt



In [None]:
# Display the collected article records for inspection.
news_data

In [61]:
# Shut down the browser session that was started earlier in the notebook.
driver.quit()

In [143]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
    StaleElementReferenceException,
)
from datetime import datetime, timedelta
import time
import pandas as pd

# Crawl settings: number of listing pages, chromedriver location, and today's
# date in dot-separated "YYYY.MM.DD" form.
MAX_PAGE = 2
DRIVER_PATH = "/Users/kim-youngho/git/GeoNewsApt/notebook/chromedriver-mac-arm64/chromedriver"
today_str = datetime.today().strftime("%Y.%m.%d")

# Chrome options: headless, GPU acceleration off, sandbox off.
options = Options()
for chrome_flag in ("--headless", "--disable-gpu", "--no-sandbox"):
    options.add_argument(chrome_flag)

# Start the browser; on failure report it and let the exception propagate.
try:
    service = Service(executable_path=DRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)
except WebDriverException as e:
    print(f"[FATAL] WebDriver 초기화 실패: {e}")
    raise
else:
    print("[INFO] WebDriver 초기화 완료")

news_data = []

# Everything that uses the browser runs inside try/finally so the Chrome
# process is released even when the crawl is interrupted (e.g. by a
# KeyboardInterrupt) or fails part-way through.
try:
    # Walk the listing pages and keep today's articles.
    for i in range(1, MAX_PAGE + 1):
        page_url = f'https://www.sedaily.com/NewsMain/GB/{i}'
        try:
            driver.get(page_url)
            print(f"[INFO] 페이지 로드: {page_url}")
            time.sleep(1)
        except Exception as e:
            print(f"[ERROR] 페이지 로드 실패 {page_url}: {e}")
            continue

        # Collect the article entries of this listing page
        li_elements = driver.find_elements(
            By.CSS_SELECTOR,
            '#container > div > div.sub_left > div:nth-child(1) > ul > li'
        )

        for li in li_elements:
            # A stale entry is handled like a missing one, so a single bad
            # entry is skipped instead of aborting the crawl.
            try:
                a_tag = li.find_element(By.CSS_SELECTOR, 'div.text_area > div.article_tit > a')
                href = a_tag.get_attribute('href')
            except (NoSuchElementException, StaleElementReferenceException):
                href = None

            try:
                date_span = li.find_element(By.CSS_SELECTOR, 'div.text_area > div.text_info > span.date')
                date = date_span.text
            except (NoSuchElementException, StaleElementReferenceException):
                date = '방금전'

            # Keep only entries dated today that actually have a link;
            # the stored date uses '-' instead of '.' as separator.
            if date == today_str and href:
                date = date.replace('.', '-')
                news_data.append({'url': href, 'date': date})
                print(f"[+] 수집됨: {href} ({date})")

    # Fetch the body text of every collected article.
    for idx, news in enumerate(news_data):
        # Set up front so every record has the same keys, including the
        # failure paths below (content stays '' when the body is not found).
        news['content'] = ''
        news['publisher'] = '서울경제'
        try:
            driver.get(news['url'])
            time.sleep(1)
            article_body = driver.find_element(By.CSS_SELECTOR, 'div.article_view[itemprop="articleBody"]')
            news['content'] = article_body.text.strip()
            print(f"[{idx + 1}/{len(news_data)}] 본문 수집 완료: {news['url']}")
        except NoSuchElementException:
            print(f"[WARN] 본문 요소 없음: {news['url']}")
        except TimeoutException:
            print(f"[WARN] 타임아웃: {news['url']}")
        except Exception as e:
            print(f"[ERROR] 본문 수집 실패 {news['url']}: {e}")
finally:
    driver.quit()

# Explicit columns keep the frame's schema stable even when nothing was collected.
df = pd.DataFrame(news_data, columns=['url', 'date', 'content', 'publisher'])
df

[INFO] WebDriver 초기화 완료
[INFO] 페이지 로드: https://www.sedaily.com/NewsMain/GB/1
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSW7X770/GB03 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSVV2CVH/GB01 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSV205QH/GB03 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSV9737X/GB03 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSV1LUQM/GB01 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSUIT69C/GB05 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSUOV3F2/GB05 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSUIFX2M/GB02 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSUB30RD/GB01 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSV24OC7/GB01 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSFO3DLH/GB07 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSUC30ZL/GB03 (2025.08.28)
[+] 수집됨: https://www.sedaily.com/NewsView/2GWSUM8C69/GB03 (2025.08.28)


Unnamed: 0,href,date,content,publisher
0,https://www.sedaily.com/NewsView/2GWSW7X770/GB03,2025.08.28,viewer\n26일 서울 남산에서 바라본 서울 시내 아파트 등의 모습. 연합뉴스\...,서울경제
1,https://www.sedaily.com/NewsView/2GWSVV2CVH/GB01,2025.08.28,viewer\n오세훈 서울시장이 27일 서울 중구 서울시의회에서 열린 제322회 시...,서울경제
2,https://www.sedaily.com/NewsView/2GWSV205QH/GB03,2025.08.28,viewer\n26일 서울 남산에서 바라본 서울 시내 아파트 등의 모습. 연합뉴스\...,서울경제
3,https://www.sedaily.com/NewsView/2GWSV9737X/GB03,2025.08.28,viewer\n\n\n\n\n서울 지하철2호선 이대입구역 근처의 마포구 아현동 33...,서울경제
4,https://www.sedaily.com/NewsView/2GWSV1LUQM/GB01,2025.08.28,viewer\n서울 강남구 한국토지주택공사(LH) 서울지역본부 모습. 뉴스1\n\n...,서울경제
5,https://www.sedaily.com/NewsView/2GWSUIT69C/GB05,2025.08.28,viewer\n경기도 남양주 왕숙 A-3 블록 조감도. 사진 제공=한국토지주택공사(...,서울경제
6,https://www.sedaily.com/NewsView/2GWSUOV3F2/GB05,2025.08.28,viewer\n주우정(왼쪽 세번째) 현대엔지니어링 대표가 현장 안전점검을 하고 있다...,서울경제
7,https://www.sedaily.com/NewsView/2GWSUIFX2M/GB02,2025.08.28,viewer\n경기도 시흥 ‘힐스테이트 더웨이브시티’ 조감도. 사진 제공=현대건설\...,서울경제
8,https://www.sedaily.com/NewsView/2GWSUB30RD/GB01,2025.08.28,viewer\n롯데건설 건강 어플 사용 이미지. 사진 제공=롯데건설\n\n\n\n\...,서울경제
9,https://www.sedaily.com/NewsView/2GWSV24OC7/GB01,2025.08.28,viewer\n봉천지역중심 지구단위계획구역 위치도. 사진 제공=관악구\n\n\n\n...,서울경제
