In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Request headers shared by every HTTP call in this notebook; only a
# browser-like User-Agent is sent.
headers = {'User-Agent': 'Mozilla/5.0'}

def crawl_news_khan(url):
    """Download one article page and return its date and cleaned text.

    The page is fetched with the module-level ``headers``, decoded as UTF-8
    and parsed with BeautifulSoup. The body is taken from
    ``<p class="content_text">`` paragraphs when present, otherwise from the
    first matching article container. Everything from the first boilerplate
    marker (byline, copyright notice, ...) onwards is dropped, and e-mail
    addresses and URLs are removed.

    Args:
        url: Address of the article page to fetch.

    Returns:
        ``[date, text]`` where ``date`` is a ``YYYY-MM-DD`` string (or
        ``"날짜없음"`` when no date is found) and ``text`` is the title, a
        blank line, and the cleaned body. ``None`` when the request fails,
        the server answers with an error status, parsing raises, or no body
        text is left after cleaning.
    """
    try:
        res = requests.get(url, headers=headers, timeout=10)
        # Error pages (404, 429, 5xx, ...) must not be parsed as articles.
        res.raise_for_status()
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')

        # Publication date: first YYYY.MM.DD / YYYY-MM-DD pattern in the
        # date element, normalised to dashes.
        clean_date = "날짜없음"
        date_element = soup.find('div', {'class': 'date'}) or soup.find('span', {'class': 'date'})
        if date_element:
            date_match = re.search(r'\d{4}[.\-]\d{2}[.\-]\d{2}', date_element.get_text(strip=True))
            if date_match:
                clean_date = date_match.group().replace(".", "-")

        # Headline: when several <h1> tags exist the second one is used
        # (presumably the first is a site-level heading -- verify against
        # the page markup).
        title_tags = soup.find_all('h1')
        if len(title_tags) > 1:
            title = title_tags[1].get_text(strip=True)
        elif title_tags:
            title = title_tags[0].get_text(strip=True)
        else:
            title = "제목없음"

        # Body text: prefer the per-paragraph markup, fall back to the
        # whole article container.
        content_elements = soup.find_all('p', {'class': 'content_text'})
        if content_elements:
            full_content = '\n'.join(el.get_text(strip=True) for el in content_elements)
        else:
            article_body = (
                soup.find('div', {'class': 'article_txt'})
                or soup.find('article', {'id': 'articleBody'})
                or soup.find('div', {'class': 'art_body'})
            )
            full_content = ""
            if article_body:
                # Strip non-text and embedded blocks before extracting text.
                for tag in article_body.find_all(['script', 'style', 'figure', 'div', 'iframe']):
                    tag.decompose()
                full_content = article_body.get_text(strip=True)

        # Cut the text at the earliest boilerplate marker. split() returns
        # the whole string when the marker is absent, so no membership test
        # is needed.
        stops = ("뱅크-아이", "무단전재", "기자 =", "기자=", "@khan.co.kr", "ⓒ", "제공=", "출처=")
        for stop in stops:
            full_content = full_content.split(stop)[0]

        # Remove remaining e-mail addresses and URLs.
        full_content = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', full_content)
        full_content = re.sub(r'http[s]?://\S+', '', full_content)
        full_content = full_content.strip()

        if not full_content:
            return None
        # [date, title + body]
        return [clean_date, f"{title}\n\n{full_content}"]

    except Exception:
        # Deliberate best-effort: a single bad page must not abort the whole
        # crawl, so callers only ever see a result or None.
        return None

if __name__ == "__main__":
    # Crawl pipeline: (1) collect article URLs from the search result pages,
    # (2) fetch and clean every article in a thread pool, (3) write the
    # [date, full_text] rows to a CSV file.
    all_urls = []
    seen_urls = set()  # Keeps articles listed on several pages from being crawled twice.
    pages = 393

    # 1. Collect the URL list.
    print(">>> 1단계: 경향신문 URL 목록 수집 시작")
    for page in range(1, pages + 1):
        base_url = f'https://search.khan.co.kr/?q=%ED%95%9C%EA%B5%AD%EC%9D%80%ED%96%89+%EA%B8%88%EB%A6%AC&media=khan&page={page}&section=0&term=5&startDate=2015-01-01&endDate=2024-12-31&sort=2'
        try:
            res = requests.get(base_url, headers=headers, timeout=10)
            # Treat HTTP error statuses as a failed page instead of parsing them.
            res.raise_for_status()
            soup = BeautifulSoup(res.text, 'html.parser')
            datas = soup.find_all('a', {'ep_event_area': '검색결과_기사목록'})
        except Exception as exc:
            # Best-effort: skip the failed page but report it. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit stop
            # the run.
            print(f"[{page}/{pages}] 페이지 수집 실패, 건너뜀: {exc}")
            continue

        for data in datas:
            url = data.get('href')
            if url and url not in seen_urls:
                seen_urls.add(url)
                all_urls.append(url)

        if page % 50 == 0:
            print(f"[{page}/{pages}] 페이지 URL 수집 완료 (누적: {len(all_urls)}개)")

    print(f"\n>>> 총 {len(all_urls)}개의 URL 확보. 2단계 본문 수집 시작 (Thread: 10)")

    # 2. Fetch the articles concurrently; rows are appended in completion
    # order, not in URL order.
    final_docs = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(crawl_news_khan, url): url for url in all_urls}

        for i, future in enumerate(as_completed(future_to_url)):
            result = future.result()
            if result:
                final_docs.append(result)

            if (i + 1) % 100 == 0:
                print(f"[{i + 1}/{len(all_urls)}] 데이터 추출 완료...")

    # 3. Save to CSV. utf-8-sig writes a BOM at the start of the file, and
    # newline='' leaves line-ending handling to the csv module.
    csv_file_path = 'news_khan_enhanced.csv'
    with open(csv_file_path, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['date', 'full_text'])
        writer.writerows(final_docs)

In [None]:
# Colab-only cell: pass the CSV path to google.colab's files.download.
# `csv_file_path` is assigned by the crawl cell, so that cell must have been
# run first.
from google.colab import files
files.download(csv_file_path)