In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
import time
import random

In [3]:
offices = {"매일경제": "1009", "한국경제": "1015", "머니투데이": "1008"}
keyword = "금리"
s_date = datetime(2005, 5, 1) 
e_date = datetime(2025, 12, 30)
max_workers = 10

In [4]:
def generate_date_list(start, end):
    date_list = []
    curr = start
    while curr <= end:
        date_list.append(curr.strftime("%Y.%m.%d"))
        curr += timedelta(days=1)
    return date_list
generate_date_list(s_date, e_date)

['2005.05.01',
 '2005.05.02',
 '2005.05.03',
 '2005.05.04',
 '2005.05.05',
 '2005.05.06',
 '2005.05.07',
 '2005.05.08',
 '2005.05.09',
 '2005.05.10',
 '2005.05.11',
 '2005.05.12',
 '2005.05.13',
 '2005.05.14',
 '2005.05.15',
 '2005.05.16',
 '2005.05.17',
 '2005.05.18',
 '2005.05.19',
 '2005.05.20',
 '2005.05.21',
 '2005.05.22',
 '2005.05.23',
 '2005.05.24',
 '2005.05.25',
 '2005.05.26',
 '2005.05.27',
 '2005.05.28',
 '2005.05.29',
 '2005.05.30',
 '2005.05.31',
 '2005.06.01',
 '2005.06.02',
 '2005.06.03',
 '2005.06.04',
 '2005.06.05',
 '2005.06.06',
 '2005.06.07',
 '2005.06.08',
 '2005.06.09',
 '2005.06.10',
 '2005.06.11',
 '2005.06.12',
 '2005.06.13',
 '2005.06.14',
 '2005.06.15',
 '2005.06.16',
 '2005.06.17',
 '2005.06.18',
 '2005.06.19',
 '2005.06.20',
 '2005.06.21',
 '2005.06.22',
 '2005.06.23',
 '2005.06.24',
 '2005.06.25',
 '2005.06.26',
 '2005.06.27',
 '2005.06.28',
 '2005.06.29',
 '2005.06.30',
 '2005.07.01',
 '2005.07.02',
 '2005.07.03',
 '2005.07.04',
 '2005.07.05',
 '2005.07.

In [5]:
def extract_urls(json_data):
    urls = []
    for item in json_data.get('collection', []):
        html_str = item.get('html', '')
        soup = BeautifulSoup(html_str, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            if 'n.news.naver.com' in link['href']:
                urls.append(link['href'])
    unique_urls = list(set(urls))
    return unique_urls

In [6]:
def get_urls_bydate(date, office_id):
    collected_urls = []
    base_url = "https://s.search.naver.com/p/newssearch/3/api/tab/more"
    headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://search.naver.com/'}
    for start in range(1, 2000, 10):
        params = {
        'abt': 'null',
        'de': date,
        'ds': date,
        'field': '0',
        'is_dts': '0',
        'is_sug_officeid': '0',
        'mynews': '1',
        'news_office_checked': office_id,
        'nqx_theme': '{"theme":{"sub":[{"name":"finance"}]}}',
        'nso': f'so:r,p:from{date.replace(".","")}to{date.replace(".","")},a:all',
        'office_category': '0',
        'office_section_code': '3',
        'office_type': '1',
        'pd': '3',
        'photo': '0',
        'query': '금리',
        'rev': '0',
        'service_area': '0',
        'sm': 'tab_smr',
        'sort': '2',
        'spq': '0',
        'ssc': 'tab.news.all',
        'start': start
        }
        try:
            res = requests.get(base_url, headers=headers, params=params, timeout=10)
            if res.status_code != 200 or not res.text.strip():
                print(f"[{date}] 수집 종료")
                break
            data = res.json()
            page_urls = extract_urls(data)
            
            if not page_urls:
                break
            
            collected_urls.extend(page_urls)
            print(f"[{date}] {start}번 진행 중... 현재까지 {len(collected_urls)}개 찾음")
            
            time.sleep(random.uniform(0.3, 0.6))
            
        except Exception as e:
            print(f"[{date}] 에러 발생: {e}")
            break
            
    # 중복 제거 후 최종 리스트 반환
    return list(set(collected_urls))
    

In [7]:
get_urls_bydate('2005.05.07', 1008)

[2005.05.07] 1번 진행 중... 현재까지 10개 찾음


['https://n.news.naver.com/mnews/article/008/0000536077?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000536074?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000536069?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000536068?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000536048?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000536067?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000536064?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000536092?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000536049?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000536070?sid=101']

In [8]:
date = '2008.5.22.'
office_id = 1008
start = 11
headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://search.naver.com/'}
params = {
    'abt': 'null',
    'de': date,
    'ds': date,
    'field': '0',
    'is_dts': '0',
    'is_sug_officeid': '0',
    'mynews': '1',
    'news_office_checked': office_id,
    'nqx_theme': '{"theme":{"sub":[{"name":"finance"}]}}',
    'nso': f'so:r,p:from{date.replace(".","")}to{date.replace(".","")},a:all',
    'office_category': '0',
    'office_section_code': '3',
    'office_type': '1',
    'pd': '3',
    'photo': '0',
    'query': '금리',
    'rev': '0',
    'service_area': '0',
    'sm': 'tab_smr',
    'sort': '2',
    'spq': '0',
    'ssc': 'tab.news.all',
    'start': start
}
base_url = "https://s.search.naver.com/p/newssearch/3/api/tab/more"
res = requests.get(base_url, headers=headers, params=params)

print(f"상태 코드: {res.status_code}")
print(f"받은 데이터 길이: {len(res.text)}")
json_data = res.json()
extract_urls(json_data)

상태 코드: 200
받은 데이터 길이: 113490


['https://n.news.naver.com/mnews/article/008/0000000249?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000285?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000269?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000271?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000210?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000279?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000211?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000242?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000250?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000278?sid=101']

In [9]:
urls = []
for item in json_data.get('collection', []):
    html_str = item.get('html', '')
    soup = BeautifulSoup(html_str, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'n.news.naver.com' in link['href']:
            urls.append(link['href'])
unique_urls = list(set(urls))
unique_urls

['https://n.news.naver.com/mnews/article/008/0000000249?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000285?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000269?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000271?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000210?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000279?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000211?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000242?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000250?sid=101',
 'https://n.news.naver.com/mnews/article/008/0000000278?sid=101']