In [None]:
import requests
import json
import re
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup

# --- Configuration and module-level globals ---
# HTTP headers passed to every requests.get() call in this script.
headers = {"User-Agent": "Mozilla/5.0"}

# (start, end) date pairs in YYYYMMDD form; each pair is substituted into the
# search URL's `s` and `e` query parameters when collecting article URLs.
DATE_RANGES = [
    ("20150101", "20181231"),
    ("20190101", "20221231"),
    ("20230101", "20251230"),
]

def _parse_fusion_article(html):
    """Build ``[date, text]`` from the page's ``Fusion.globalContent`` JSON.

    Args:
        html: Full page source as a string.

    Returns:
        ``[date, "title\\n\\ncontent"]`` when the embedded JSON is present,
        decodable and yields non-empty body text; otherwise ``None`` so the
        caller can try another extraction strategy.
    """
    marker = re.search(r'Fusion\.globalContent\s*=\s*(?=\{)', html)
    if marker is None:
        return None

    try:
        # raw_decode parses exactly one JSON value starting at the brace, so
        # a "};" inside an article string or a multi-line object cannot
        # truncate the payload the way a non-greedy regex capture can.
        data, _ = json.JSONDecoder().raw_decode(html, marker.end())
    except ValueError:
        return None

    # Each field may be missing or null in the JSON; fall back to the same
    # placeholders used for absent keys instead of raising.
    headlines = data.get('headlines')
    title = headlines.get('basic') if isinstance(headlines, dict) else None
    title = title.strip() if isinstance(title, str) else '제목없음'

    date = data.get('display_date')
    date = date[:10] if isinstance(date, str) else '날짜없음'

    elements = data.get('content_elements') or []
    # Keep only textual elements and strip any inline HTML tags from them.
    paragraphs = [
        re.sub(r'<[^>]*>', '', el.get('content') or '')
        for el in elements
        if isinstance(el, dict) and el.get('type') in ('text', 'raw_html')
    ]
    content = "\n".join(paragraphs).strip()

    if not content:
        return None
    return [date, f"{title}\n\n{content}"]


def _parse_html_article(html):
    """Build ``[date, text]`` by scraping the rendered HTML with BeautifulSoup.

    Args:
        html: Full page source as a string.

    Returns:
        ``[date, "title\\n\\ncontent"]`` when one of the known article
        containers is found, otherwise ``None``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    heading = soup.find('h1')
    title = heading.get_text(strip=True) if heading is not None else "제목없음"

    # Heuristic: the first YYYY.MM.DD / YYYY-MM-DD pattern anywhere in the
    # page source is taken as the article date.
    date_match = re.search(r'\d{4}[.\-]\d{2}[.\-]\d{2}', html)
    date = date_match.group().replace(".", "-") if date_match else "날짜없음"

    # Try the candidate article containers in order; the first hit wins.
    article_body = (
        soup.find('article')
        or soup.find('div', id='news_body_id')
        or soup.find('div', class_='article_body')
        or soup.find('div', id='article_body')
        or soup.find('section', id='articleAll')
        or soup.find('div', class_='par')
    )
    if article_body is None:
        return None

    # Remove non-content tags.
    target_tags = ['script', 'style', 'iframe', 'textarea', 'header', 'footer', 'button', 'figure']
    for tag in article_body.find_all(target_tags):
        tag.decompose()

    # Remove ad / copyright / related-article boxes.
    for div in article_body.find_all('div', class_=['art_ad', 'art_copyright', 're_box', 'art_ad_wrap']):
        div.decompose()

    content = article_body.get_text(separator="\n", strip=True)
    return [date, f"{title}\n\n{content}"]


def crawl_news_chosun(url):
    """Download one article page and extract its date and text.

    Two strategies are tried in order: the embedded ``Fusion.globalContent``
    JSON (method 1) and, if that yields nothing, scraping the HTML body
    (method 2).

    Args:
        url: Article URL to fetch.

    Returns:
        ``[date, "title\\n\\ncontent"]`` on success, or ``None`` when the
        request fails, the server answers with an HTTP error status, or no
        article body can be located. Never raises ``Exception`` subclasses.
    """
    try:
        res = requests.get(url, headers=headers, timeout=10)
        # Do not parse 4xx/5xx error pages as if they were articles.
        res.raise_for_status()
        res.encoding = 'utf-8'
        html = res.text

        return _parse_fusion_article(html) or _parse_html_article(html)
    except Exception:
        # Deliberate best-effort: this runs as a thread-pool worker and a
        # single bad page must not abort the whole crawl.
        return None

# Script entry point: collect search-result URLs, crawl every article in
# parallel, write the results to CSV and (in Colab) trigger a download.
if __name__ == "__main__":
    all_urls = []

    # 1. Collect the article URL list.
    print('조선일보 URL 목록 수집 시작')
    for s_date, e_date in DATE_RANGES:
        print(f"기간 {s_date} ~ {e_date} 처리 중...")
        # At most 400 result pages are requested per date range.
        for page in range(400):
            # `query` is the percent-encoded Korean phrase for
            # "Bank of Korea interest rate".
            search_url = f'https://search-gateway.chosun.com/nsearch?query=%ED%95%9C%EA%B5%AD%EC%9D%80%ED%96%89%20%EA%B8%88%EB%A6%AC&page={page}&size=10&sort=2&r=direct&s={s_date}&e={e_date}'
            try:
                res = requests.get(search_url, headers=headers, timeout=10)
                items = res.json().get('content_elements', [])
                # An empty page means the results for this range are exhausted.
                if not items:
                    break

                for item in items:
                    if item.get('site_url'):
                        all_urls.append(item['site_url'])
            except Exception as exc:
                # Best-effort: report the failed page and move on to the next
                # one. Unlike a bare `except:`, this lets KeyboardInterrupt and
                # SystemExit stop the script.
                print(f'검색 요청 실패 ({s_date}~{e_date}, page={page}): {exc}')
                continue

    # Drop duplicate URLs while keeping first-seen order.
    all_urls = list(dict.fromkeys(all_urls))

    print(f'\n총 {len(all_urls)}개의 고유 URL 확보, 본문 수집 시작')

    # 2. Crawl article bodies concurrently with a thread pool.
    docs = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(crawl_news_chosun, url): url for url in all_urls}

        for i, future in enumerate(as_completed(future_to_url)):
            result = future.result()
            # crawl_news_chosun returns None for pages it could not extract.
            if result:
                docs.append(result)

            # Progress report every 100 completed futures.
            if (i + 1) % 100 == 0:
                print(f'[{i + 1}/{len(all_urls)}] 추출 완료 (현재: {len(docs)})')

    # 3. Save to CSV (utf-8-sig writes a BOM at the start of the file).
    output_csv = 'chosun_news.csv'
    with open(output_csv, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['date', 'full_text'])
        writer.writerows(docs)

    # 4. Hand the file to the browser when running inside Google Colab.
    # Kept under the main guard because `output_csv` only exists here, and
    # guarded so the script still finishes cleanly outside Colab.
    try:
        from google.colab import files
    except ImportError:
        print(f'Colab 환경이 아니므로 다운로드를 건너뜁니다: {output_csv}')
    else:
        files.download(output_csv)