In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

def get_news_articles(base_url, year, max_articles=1000, timeout=10, delay=3):
    """Collect Naver news search results for one calendar year.

    Result pages are requested ten offsets apart until ``max_articles`` items
    have been gathered, a request fails, or a page contains no ``.news_area``
    elements.

    Args:
        base_url: Search URL that already carries a query string; further
            parameters are appended to it with ``&``.
        year: Year used to build the ``nso`` range ``{year}0101``-``{year}1231``.
        max_articles: Maximum number of articles to return.
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to sleep between consecutive page requests.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``press`` and
        ``description``. Missing elements fall back to ``"No Title"``,
        ``None``, ``"No Press"`` and ``"No Description"`` respectively.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.177 Safari/537.36',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Referer': 'https://www.naver.com/'
    }
    articles = []
    page = 1
    # The context manager closes the progress bar even if parsing raises.
    with tqdm(total=max_articles, desc=f"Processing Articles for {year}") as pbar:
        while len(articles) < max_articles:
            # NOTE(review): Naver's own paging links use a 1-based offset
            # (start=1, 11, 21, ...); the previous 0-based offset appeared to
            # re-fetch the 10th result on page two - confirm against live pages.
            start = (page - 1) * 10 + 1
            url = f"{base_url}&nso=so:dd,p:from{year}0101to{year}1231&start={start}"
            try:
                # Without a timeout a stalled connection blocks the crawl forever.
                response = requests.get(url, headers=headers, timeout=timeout)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error occurred: {e}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            news_items = soup.select(".news_area")  # one element per article
            if not news_items:  # nothing left, or the page markup changed
                print("No more articles found or invalid selector.")
                break

            for item in news_items:
                title_elem = item.select_one("a.news_tit")    # headline anchor
                press_elem = item.select_one("a.info.press")  # publisher
                desc_elem = item.select_one(".dsc_txt_wrap")  # summary text

                articles.append({
                    "title": title_elem.text.strip() if title_elem else "No Title",
                    # .get() avoids a KeyError on an anchor without an href.
                    "link": title_elem.get('href') if title_elem else None,
                    "press": press_elem.text.strip() if press_elem else "No Press",
                    "description": desc_elem.text.strip() if desc_elem else "No Description",
                })
                pbar.update(1)
                if len(articles) >= max_articles:
                    break

            if len(articles) >= max_articles:
                break  # finished: skip the delay that only precedes another request
            page += 1
            time.sleep(delay)  # throttle between page requests
    return articles

def save_articles_to_file(articles, year):
    """Write the collected articles to ``articles_{year}.csv`` in the current directory.

    Every data field is quoted, and the ``csv`` module escapes embedded double
    quotes, so titles or descriptions containing quotes, commas or line breaks
    no longer corrupt the row structure. A missing link (``None``) is written
    as an empty field instead of the literal text ``None``.

    Args:
        articles: Iterable of dicts with the keys ``title``, ``link``,
            ``press`` and ``description``.
        year: Year used to build the output filename.
    """
    import csv  # local import keeps this block self-contained

    filename = f"articles_{year}.csv"
    # newline="" lets the csv writer control line endings, so line breaks
    # inside quoted fields are written unchanged.
    with open(filename, "w", encoding="utf-8", newline="") as file:
        file.write("Title,Link,Press,Description\n")
        writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerows(
            (article["title"], article["link"], article["press"], article["description"])
            for article in articles
        )
    print(f"Saved {len(articles)} articles to {filename}")

# Base Naver news search URL; the search query is already percent-encoded in it.
base_url = "https://search.naver.com/search.naver?where=news&query=%EC%84%B1%EC%88%98+%ED%8C%9D%EC%97%85%EC%8A%A4%ED%86%A0%EC%96%B4"

# Crawl settings.
# NOTE(review): range() excludes its end value, so this covers 2020 through
# 2024. The earlier comment said "through 2025"; use range(2020, 2026) if
# 2025 is meant to be included.
years = range(2020, 2025)
max_articles = 1000  # upper bound on articles collected per year

# Crawl one year at a time and write each year's results to its own CSV file.
for year in years:
    articles = get_news_articles(base_url, year, max_articles=max_articles)
    if not articles:
        print(f"No articles were collected for {year}.")
        continue
    save_articles_to_file(articles, year)

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

def get_news_articles(base_url, keywords, year, max_articles=1000, timeout=10, delay=3):
    """Collect Naver news search results for the given keywords and calendar year.

    Result pages are requested ten offsets apart until ``max_articles`` items
    have been gathered, a request fails, or a page contains no ``.news_area``
    elements.

    Args:
        base_url: Search URL that already carries a query string; further
            parameters are appended to it with ``&``.
        keywords: Sequence of raw (not yet URL-encoded) search terms. Each term
            is percent-encoded and the terms are joined with ``+``.
        year: Year used to build the ``nso`` range ``{year}0101``-``{year}1231``.
        max_articles: Maximum number of articles to return.
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to sleep between consecutive page requests.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``press`` and
        ``description``. Missing elements fall back to ``"No Title"``,
        ``None``, ``"No Press"`` and ``"No Description"`` respectively.
    """
    from urllib.parse import quote_plus  # local import keeps this block self-contained

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.177 Safari/537.36',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Referer': 'https://www.naver.com/'
    }
    articles = []
    page = 1
    # Encode every term so characters such as '&', '#', '=' or '+' inside a
    # keyword cannot break or truncate the query string.
    keyword_query = '+'.join(quote_plus(keyword) for keyword in keywords)
    description = f"Processing Articles for {year} - {' '.join(keywords)}"
    # The context manager closes the progress bar even if parsing raises.
    with tqdm(total=max_articles, desc=description) as pbar:
        while len(articles) < max_articles:
            # NOTE(review): Naver's own paging links use a 1-based offset
            # (start=1, 11, 21, ...); the previous 0-based offset appeared to
            # re-fetch the 10th result on page two - confirm against live pages.
            start = (page - 1) * 10 + 1
            url = f"{base_url}&query={keyword_query}&nso=so:dd,p:from{year}0101to{year}1231&start={start}"
            try:
                # Without a timeout a stalled connection blocks the crawl forever.
                response = requests.get(url, headers=headers, timeout=timeout)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error occurred: {e}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            news_items = soup.select(".news_area")  # one element per article
            if not news_items:  # nothing left, or the page markup changed
                print("No more articles found or invalid selector.")
                break

            for item in news_items:
                title_elem = item.select_one("a.news_tit")    # headline anchor
                press_elem = item.select_one("a.info.press")  # publisher
                desc_elem = item.select_one(".dsc_txt_wrap")  # summary text

                articles.append({
                    "title": title_elem.text.strip() if title_elem else "No Title",
                    # .get() avoids a KeyError on an anchor without an href.
                    "link": title_elem.get('href') if title_elem else None,
                    "press": press_elem.text.strip() if press_elem else "No Press",
                    "description": desc_elem.text.strip() if desc_elem else "No Description",
                })
                pbar.update(1)
                if len(articles) >= max_articles:
                    break

            if len(articles) >= max_articles:
                break  # finished: skip the delay that only precedes another request
            page += 1
            time.sleep(delay)  # throttle between page requests
    return articles

def save_articles_to_file(articles, year, keywords):
    """Write the collected articles to ``articles_{keywords}_{year}.csv``.

    The keywords are joined with ``_`` to form the filename; characters that
    are invalid in filenames on common platforms (``\\ / : * ? " < > |``) are
    replaced with ``_`` so that a keyword such as ``"a/b"`` cannot make the
    save fail after the crawl has already finished.

    Every data field is quoted, and the ``csv`` module escapes embedded double
    quotes, so titles or descriptions containing quotes, commas or line breaks
    no longer corrupt the row structure. A missing link (``None``) is written
    as an empty field instead of the literal text ``None``.

    Args:
        articles: Iterable of dicts with the keys ``title``, ``link``,
            ``press`` and ``description``.
        year: Year used to build the output filename.
        keywords: Sequence of search terms used to build the output filename.
    """
    import csv  # local imports keep this block self-contained
    import re

    keyword_str = re.sub(r'[\\/:*?"<>|]', '_', '_'.join(keywords))
    filename = f"articles_{keyword_str}_{year}.csv"
    # newline="" lets the csv writer control line endings, so line breaks
    # inside quoted fields are written unchanged.
    with open(filename, "w", encoding="utf-8", newline="") as file:
        file.write("Title,Link,Press,Description\n")
        writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerows(
            (article["title"], article["link"], article["press"], article["description"])
            for article in articles
        )
    print(f"Saved {len(articles)} articles to {filename}")

# Ask the user for comma-separated search keywords and trim the whitespace
# around each one.
keywords = input("검색할 키워드를 입력해주세요 (쉼표로 구분): ")
keyword_list = list(map(str.strip, keywords.split(',')))

# Base Naver news search URL; the query itself is appended per request.
base_url = "https://search.naver.com/search.naver?where=news"

# Crawl settings.
# NOTE(review): range() excludes its end value, so this covers 2020 through
# 2024. The earlier comment said "through 2025"; use range(2020, 2026) if
# 2025 is meant to be included.
years = range(2020, 2025)
max_articles = 1000  # upper bound on articles collected per year

# Crawl one year at a time and write each year's results to its own CSV file.
for year in years:
    articles = get_news_articles(base_url, keyword_list, year, max_articles=max_articles)
    if not articles:
        print(f"No articles were collected for {year} with keywords '{', '.join(keyword_list)}'.")
        continue
    save_articles_to_file(articles, year, keyword_list)