In [None]:
import csv
import logging
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup

# Request headers shared by the listing and article fetches below.
headers = {'User-Agent': 'Mozilla/5.0'}

# Donga Ilbo article crawler
def crawl_news_donga(url):
    """Fetch one Donga Ilbo article page and extract its date and text.

    Args:
        url: URL of the article page to download.

    Returns:
        A two-item list ``[date, full_text]``:

        * ``date`` is the first ``YYYY-MM-DD`` style date found in one of the
          known date elements (dots normalised to dashes), or the placeholder
          ``"날짜없음"`` when none of them contains a date.
        * ``full_text`` is the ``<h1>`` title (or ``"제목없음"``), a blank
          line, and the article body text (empty when no known body
          container is present).

        ``None`` is returned when the page cannot be downloaded (network
        error or HTTP error status) or parsing fails, so callers can simply
        skip the URL. The function never raises for ordinary errors.
    """
    log = logging.getLogger(__name__)

    try:
        res = requests.get(url, headers=headers, timeout=10)
        # Reject 4xx/5xx responses; otherwise an error page would be parsed
        # into a row made only of placeholders.
        res.raise_for_status()
    except requests.RequestException as exc:
        log.warning("요청 실패 (%s): %s", url, exc)
        return None

    try:
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')

        # Try each known date container in priority order and keep the first
        # one that actually contains a date. Stopping at the first element
        # that merely exists would hide dates held by the fallback selectors.
        date = "날짜없음"
        date_selectors = (
            ('span', {'aria-hidden': 'true'}),
            ('span', {'class': 'date01'}),
            ('div', {'class': 'date'}),
        )
        for tag_name, attrs in date_selectors:
            date_element = soup.find(tag_name, attrs)
            if date_element is None:
                continue
            date_match = re.search(r'\d{4}[.\-]\d{2}[.\-]\d{2}', date_element.text)
            if date_match:
                date = date_match.group().replace(".", "-")
                break

        heading = soup.find('h1')
        title = heading.get_text(strip=True) if heading else "제목없음"

        # Known article body containers, in priority order.
        article_section = soup.find('section', {'class': 'news_view'}) or \
                          soup.find('div', {'class': 'article_view'}) or \
                          soup.find('div', {'class': 'news_text'}) or \
                          soup.find('div', {'id': 'article_txt'})

        if article_section:
            # Drop nested non-text blocks before extracting the body text.
            for tag in article_section.find_all(['figure', 'div', 'script', 'style', 'iframe']):
                tag.decompose()
            content = article_section.get_text(strip=True)
        else:
            content = ""

        # Title + body
        full_text = f"{title}\n\n{content}"

        return [date, full_text]
    except Exception as exc:
        # Best-effort contract: one malformed page must not abort the whole
        # crawl, but the failure is reported instead of vanishing silently.
        log.warning("본문 파싱 실패 (%s): %s", url, exc)
        return None

# Script entry point: collect article URLs from the Donga Ilbo search pages,
# crawl every article with a thread pool, and save the results as a CSV.
# The statements stay at module level (no main() wrapper) because a later
# notebook cell reads the global `output_csv`.
if __name__ == "__main__":
    all_urls = []
    pages = 759

    # Stage 1: collect the article URL list
    print(">>> 1단계: 동아일보 URL 목록 수집 시작")
    for page in range(1, pages + 1):
        # Donga Ilbo pagination is offset based: p=1, 11, 21...
        start_num = (page - 1) * 10 + 1
        base_url = f'https://www.donga.com/news/search?p={start_num}&query=%ED%95%9C%EA%B5%AD%EC%9D%80%ED%96%89+%EA%B8%88%EB%A6%AC&check_news=91&sorting=2&search_date=5&v1=20150101&v2=20241231&more=1'

        try:
            res = requests.get(base_url, headers=headers, timeout=10)
            # Without this check an HTTP error page (which has no <h4>) would
            # be mistaken for "no more results" and end collection early.
            res.raise_for_status()
        except requests.RequestException as exc:
            # Skip only this page. Catching just request errors (not a bare
            # except) keeps Ctrl-C / KeyboardInterrupt able to stop the loop.
            print(f"[{page}/{pages}] 페이지 요청 실패, 건너뜀: {exc}")
            continue

        soup = BeautifulSoup(res.text, 'html.parser')
        datas = soup.find_all('h4')
        if not datas:
            # No headings at all: treated as the end of the result pages.
            break

        for data in datas:
            link_tag = data.find('a')
            if link_tag and link_tag.get('href'):
                all_urls.append(link_tag['href'])

        if page % 100 == 0:
            print(f"[{page}/{pages}] 페이지 URL 수집 완료 (누적: {len(all_urls)}개)")

    # Remove duplicate links while keeping first-seen order, so no article is
    # downloaded or written to the CSV more than once.
    all_urls = list(dict.fromkeys(all_urls))

    print(f"\n>>> 총 {len(all_urls)}개의 URL 확보. 2단계 본문 수집 시작 (Thread: 10)")

    # Stage 2: crawl the articles with a thread pool
    final_docs = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(crawl_news_donga, url) for url in all_urls]

        # as_completed is used only to report progress as work finishes.
        for finished, _ in enumerate(as_completed(futures), start=1):
            if finished % 100 == 0:
                print(f"[{finished}/{len(all_urls)}] 데이터 수집 완료...")

    # Gather results in submission order so the CSV row order is the same on
    # every run instead of depending on thread timing. Failed pages (None)
    # are skipped.
    for future in futures:
        result = future.result()
        if result:
            final_docs.append(result)

    # Stage 3: save as CSV (utf-8-sig adds a BOM to the file)
    output_csv = 'news_donga_enhanced.csv'
    with open(output_csv, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['date', 'full_text'])
        writer.writerows(final_docs)

In [None]:
# Colab-only cell: pass the CSV path to google.colab's files.download.
from google.colab import files
# NOTE: relies on the global `output_csv` assigned by the crawl cell above,
# so that cell must have been run first in the same session.
files.download(output_csv)