In [None]:
import requests
import json
import re
import csv
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup

# Minimal browser-like User-Agent header passed to the HTTP requests below.
headers = {'User-Agent': 'Mozilla/5.0'}

# The Chosun Ilbo search is limited to 4,000 results per query, so the overall
# period is split into consecutive (start, end) windows in YYYYMMDD format.
date_ranges = [
    ('20050501', '20120430'),
    ('20120501', '20190430'),
    ('20190501', '20241231'),
]

# Chosun Ilbo article crawler
def _extract_fusion_article(html):
    """Extract an article from the ``Fusion.globalContent = {...}`` JSON blob.

    Returns ``[date, full_text]`` (full_text is title, blank line, body), or
    ``None`` when the blob is missing, unparsable, or has no text content.
    """
    marker = re.search(r'Fusion\.globalContent\s*=\s*(?=\{)', html)
    if marker is None:
        return None

    # raw_decode parses exactly one JSON value starting at the brace and
    # ignores whatever follows. A non-greedy regex capture would stop at the
    # first "};" even when that sequence sits inside an article string.
    try:
        data, _ = json.JSONDecoder().raw_decode(html, marker.end())
    except ValueError:
        return None

    headlines = data.get('headlines')
    if not isinstance(headlines, dict):
        headlines = {}
    raw_title = headlines.get('basic', '제목 없음')
    title = raw_title.strip() if isinstance(raw_title, str) else '제목 없음'

    raw_date = data.get('display_date', '날짜 정보 없음')
    date = raw_date[:10] if isinstance(raw_date, str) else '날짜 정보 없음'

    paragraphs = []
    for el in data.get('content_elements') or []:
        if not isinstance(el, dict) or el.get('type') not in ('text', 'raw_html'):
            continue
        raw_text = el.get('content', '')
        if not isinstance(raw_text, str):
            raw_text = ''
        # Strip inline HTML tags from the paragraph.
        paragraphs.append(re.sub(r'<[^>]*>', '', raw_text))

    content = "\n".join(paragraphs).strip()
    if not content:
        return None
    return [date, f"{title}\n\n{content}"]


def _extract_html_article(html):
    """Extract an article by scraping the rendered HTML.

    Returns ``[date, full_text]``, or ``None`` when no known article-body
    container is present in the page.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Title
    heading = soup.find('h1')
    title = heading.get_text(strip=True) if heading else "제목없음"

    # Date: first YYYY.MM.DD occurrence anywhere in the page source.
    date_match = re.search(r'\d{4}\.\d{2}\.\d{2}', html)
    date = date_match.group().replace(".", "-") if date_match else "날짜없음"

    # Body: try the known container variants in order.
    article_body = (
        soup.find('div', {'id': 'article_body'})
        or soup.find('div', {'class': 'article_body'})
        or soup.find('section', {'id': 'articleAll'})
    )
    if not article_body:
        return None

    # Remove nested script/style/div/iframe elements before reading the text.
    for tag in article_body.find_all(['script', 'style', 'div', 'iframe']):
        tag.decompose()
    content = article_body.get_text(strip=True)
    return [date, f"{title}\n\n{content}"]


def crawl_news_chosun(url):
    """Download one Chosun Ilbo article and return ``[date, full_text]``.

    The embedded ``Fusion.globalContent`` JSON is tried first; if it is
    absent, malformed, or empty, the page HTML is scraped instead.

    Args:
        url: Article URL to fetch.

    Returns:
        ``[date, "title\\n\\ncontent"]`` on success, otherwise ``None``.
        Never raises: any network or parsing error yields ``None``.
    """
    try:
        res = requests.get(url, headers=headers, timeout=10)
        res.encoding = 'utf-8'
        html = res.text
        return _extract_fusion_article(html) or _extract_html_article(html)
    except Exception:
        # Deliberate best-effort: one bad article must not abort the crawl
        # when this runs inside a worker thread.
        return None

# Script entry point: collect article URLs from the search API, crawl the
# articles with a thread pool, and write the results to a CSV file.
# NOTE: this is intentionally left as flat module-level code rather than a
# main() function, because `output_csv` is read by a later notebook cell.
if __name__ == "__main__":
    all_urls = []

    # Step 1: collect article URLs, one date window at a time.
    print(">>> 1단계: 조선일보 URL 목록 수집 시작")
    for s_date, e_date in date_ranges:
        print(f"기간 {s_date} ~ {e_date} 처리 중...")
        # 400 pages x 10 results per page = 4,000 results per window.
        for page in range(400):
            search_url = f'https://search-gateway.chosun.com/nsearch?query=%ED%95%9C%EA%B5%AD%EC%9D%80%ED%96%89%20%EA%B8%88%EB%A6%AC&page={page}&size=10&sort=2&r=direct&s={s_date}&e={e_date}'
            try:
                res = requests.get(search_url, headers=headers, timeout=10)
                search_data = json.loads(res.text)
            except (requests.RequestException, ValueError) as exc:
                # Network failure or non-JSON response: skip this page, but
                # say so. KeyboardInterrupt is no longer swallowed here.
                print(f"  [경고] 검색 페이지 {page} 수집 실패: {exc}")
                continue

            if not isinstance(search_data, dict):
                continue

            items = search_data.get('content_elements', [])
            if not items:
                # An empty page means this window has no more results.
                break

            for item in items:
                if isinstance(item, dict) and item.get('site_url'):
                    all_urls.append(item['site_url'])

    print(f"\n>>> 총 {len(all_urls)}개의 URL 확보. 2단계 본문 수집 시작 (Thread: 10)")

    # Step 2: fetch article bodies concurrently.
    final_docs = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(crawl_news_chosun, url): url for url in all_urls}

        for done, future in enumerate(as_completed(future_to_url), start=1):
            result = future.result()
            if result:
                final_docs.append(result)  # [date, full_text]

            if done % 100 == 0:
                print(f"[{done}/{len(all_urls)}] 추출 진행 중...")

    # Step 3: save to CSV. utf-8-sig writes a BOM at the start of the file.
    output_csv = 'news_chosun.csv'
    with open(output_csv, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['date', 'full_text'])
        writer.writerows(final_docs)

In [None]:
# Colab-only cell: hand the generated CSV to google.colab's download helper.
# `output_csv` is the file name set by the crawl cell, which must run first.
from google.colab import files
files.download(output_csv)