In [None]:
import requests
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Request headers passed to every requests.get call in this notebook.
headers = {'User-Agent': 'Mozilla/5.0'}

# The Chosun Ilbo search is limited to 4,000 results per query, so the overall
# period is split into consecutive (start, end) date windows.
date_ranges = [
    ('20050501', '20120430'),
    ('20120501', '20190430'),
    ('20190501', '20241231'),
]

# Locates the assignment of the article payload inside the page's inline
# script; the look-ahead leaves the match ending right before the opening "{".
_GLOBAL_CONTENT_RE = re.compile(r'Fusion\.globalContent\s*=\s*(?=\{)')
# Matches any HTML tag so it can be stripped from paragraph text.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def _extract_global_content(html):
    """Return the JSON value assigned to ``Fusion.globalContent`` in *html*, or None if absent."""
    match = _GLOBAL_CONTENT_RE.search(html)
    if match is None:
        return None
    # raw_decode consumes exactly one complete JSON value and ignores whatever
    # follows it. A regex capture ending at the first "};" would instead cut
    # the payload short whenever that sequence occurs inside the JSON itself
    # (for example inside the article text).
    payload, _ = json.JSONDecoder().raw_decode(html[match.end():])
    return payload


def _article_from_payload(data):
    """Build the ``{'date', 'title', 'content'}`` record from a decoded payload dict."""
    headlines = data.get('headlines')
    basic = headlines.get('basic', '제목 없음') if isinstance(headlines, dict) else '제목 없음'
    # A null headline falls back to the placeholder instead of raising.
    title = basic.strip() if isinstance(basic, str) else '제목 없음'

    date = data.get('display_date') or '날짜 정보 없음'

    elements = data.get('content_elements') or []
    # Keep only text elements; drop HTML tags and treat a null body as empty.
    paragraphs = [
        _HTML_TAG_RE.sub('', el.get('content') or '')
        for el in elements
        if isinstance(el, dict) and el.get('type') == 'text'
    ]

    return {
        'date': date,
        'title': title,
        'content': "\n".join(paragraphs)
    }


def crawl_news_chosun(url):
    """Download one article page and extract its date, title and body text.

    The page is expected to embed the article as a JSON object assigned to
    ``Fusion.globalContent``. The text elements of that object are joined with
    newlines after stripping HTML tags.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with the keys ``'date'``, ``'title'`` and ``'content'``, or
        ``None`` when the request fails, the server answers with an error
        status, or no usable payload is found. Failures never raise, so the
        function is safe to run from a thread pool.
    """
    try:
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        res.encoding = 'utf-8'

        data = _extract_global_content(res.text)
        if not isinstance(data, dict):
            return None
        return _article_from_payload(data)
    except (requests.RequestException, ValueError, TypeError, AttributeError):
        # Best effort: network errors, malformed JSON (JSONDecodeError is a
        # ValueError) and unexpected payload shapes all yield "no article".
        return None

if __name__ == "__main__":
    all_urls = []

    # Step 1: collect the article URLs for every date window via the search API.
    print(">>> 1단계: URL 목록 수집 시작")
    for s_date, e_date in date_ranges:
        print(f"기간 {s_date} ~ {e_date} 처리 중...")
        # 400 pages x 10 results per page = at most 4,000 results per window.
        for page in range(400):
            search_url = f'https://search-gateway.chosun.com/nsearch?query=%ED%95%9C%EA%B5%AD%EC%9D%80%ED%96%89%20%EA%B8%88%EB%A6%AC&page={page}&size=10&sort=2&r=direct&s={s_date}&e={e_date}'
            try:
                res = requests.get(search_url, headers=headers, timeout=10)
                search_data = json.loads(res.text)
            except (requests.RequestException, ValueError) as exc:
                # Only network and JSON-decoding problems are skipped, so that
                # Ctrl-C (KeyboardInterrupt) still stops the crawl.
                print(f"  [warn] {s_date}~{e_date} page {page} skipped: {exc}")
                continue

            if not isinstance(search_data, dict):
                print(f"  [warn] {s_date}~{e_date} page {page} skipped: unexpected response shape")
                continue

            items = search_data.get('content_elements', [])
            if not items:
                # An empty page means this window has no further results.
                break

            for item in items:
                if isinstance(item, dict) and item.get('site_url'):
                    all_urls.append(item['site_url'])

    # Drop repeated URLs (first occurrence wins, order kept) so that no article
    # is downloaded and stored more than once.
    all_urls = list(dict.fromkeys(all_urls))

    print(f"\n>>> 총 {len(all_urls)}개의 URL 확보. 2단계 본문 수집 시작 (Thread: 10)")

    # Step 2: download the article bodies concurrently with a thread pool.
    final_docs = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(crawl_news_chosun, url): url for url in all_urls}

        for i, future in enumerate(as_completed(future_to_url)):
            result = future.result()
            # crawl_news_chosun returns None for pages it could not parse.
            if result:
                final_docs.append(result)

            # Progress report every 100 completed downloads.
            if (i + 1) % 100 == 0:
                print(f"[{i + 1}/{len(all_urls)}] 데이터 수집 완료...")

    # Step 3: save the collected articles as one JSON file.
    print(f"\n>>> 최종 수집된 기사 수: {len(final_docs)}개")

    with open('news_corpus_for_thesis.json', 'w', encoding='utf-8') as f:
        json.dump(final_docs, f, ensure_ascii=False, indent=4)

In [None]:
import json
import pandas as pd

# Read back the corpus stored in the JSON file.
file_path = 'news_corpus_for_thesis.json'
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Each row has the form [date, title + body].
article_list = []

for item in data:
    # Missing fields default to an empty string.
    date, title, content = (item.get(key, '') for key in ('date', 'title', 'content'))
    full_text = f"{title}\n\n{content}"
    article_list.append([date, full_text])

In [None]:
import csv

csv_file_path = 'news_chosun.csv'
header_row = ['date', 'full_text']

# The file is written as UTF-8 with a byte-order mark ('utf-8-sig');
# newline='' leaves line-ending handling to the csv module.
with open(csv_file_path, 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    # Header first, then one row per article.
    writer.writerows([header_row, *article_list])