In [None]:
import os
import re
import sys
import time
import datetime
import time
import json
import html
import torch
import torch.nn as nn
import pandas as pd
import urllib.request
from kiwipiepy import Kiwi
from bs4 import BeautifulSoup
from selenium import webdriver
import torch.nn.functional as F
from kiwipiepy.utils import Stopwords
from selenium.webdriver.common.by import By
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Naver Open API credentials. They are read from the environment so that real
# keys never have to be typed into (or committed with) the notebook; both fall
# back to an empty string when the variable is not set.
client_id = os.environ.get('NAVER_CLIENT_ID', '')
client_secret = os.environ.get('NAVER_CLIENT_SECRET', '')

In [None]:
# Endpoint queried for news search results.
base_url = 'https://openapi.naver.com/v1/search/news.json'

# Press outlets; each name is combined with `query_base` to form one query.
sources = [
    '오마이뉴스',
    '한겨레',
    '경향신문',
    '조선일보',
    '동아일보',
    '중앙일보',
    '연합뉴스',
    '매일경제',
    '머니투데이',
]

# Base keyword shared by every query.
query_base = '해병대'

# Value sent as the `display` parameter (items requested per call).
n_display = 100

# Value sent as the `sort` parameter ('sim' = by relevance, per the original note).
sort = 'sim'

# Highest `start` position that will be requested for a single query.
max_results = 1000

def fetch_news(source):
    """Collect news search results for one press outlet.

    The query string is ``"<source> <query_base>"``. Pages of ``n_display``
    items are requested starting at position 1, and paging stops as soon as
    one of the following happens:

    * the next start position would exceed ``max_results``,
    * a page comes back with fewer than ``n_display`` items (nothing more to
      fetch, so further requests would only waste API calls), or
    * a request fails.

    Args:
        source: Press outlet name placed in front of the module-level
            ``query_base`` keyword.

    Returns:
        list: The ``items`` entries of every successfully fetched page, in
        request order. If a request fails, the items gathered so far are
        returned instead of raising.
    """
    # Imported explicitly so the function does not depend on
    # ``import urllib.request`` loading these submodules as a side effect.
    from urllib.error import HTTPError, URLError
    from urllib.parse import quote

    total_results = []
    current_start = 1
    encQuery = quote(f"{source} {query_base}")

    while current_start <= max_results:
        url = f'{base_url}?query={encQuery}&display={n_display}&start={current_start}&sort={sort}'

        my_request = urllib.request.Request(url)
        my_request.add_header("X-Naver-Client-Id", client_id)
        my_request.add_header("X-Naver-Client-Secret", client_secret)

        try:
            with urllib.request.urlopen(my_request) as response:
                data = json.loads(response.read().decode('utf-8'))
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it must be handled first.
            print(f"HTTP Error: {e.code}")
            break
        except URLError as e:
            # Network-level failure (DNS, refused connection, ...): keep what
            # has been collected so far rather than losing it to a traceback.
            print(f"URL Error: {e.reason}")
            break

        # A payload without 'items' is treated as an empty page.
        items = data.get('items', [])
        total_results.extend(items)

        # A short (or empty) page means there is nothing left to request.
        if len(items) < n_display:
            break

        current_start += n_display
        time.sleep(1)  # pause between requests to limit load on the server

    return total_results

# Run the search for every configured source and report how many items came back.
source_results = {}
for source in sources:
    source_results[source] = fetch_news(source)
    print(f"Total articles retrieved for {source}: {len(source_results[source])}")

In [None]:
# Keep only the results whose link points at Naver News (n.news.naver.com),
# recording the cleaned link together with its publication date.

naver_news = {}

for source, articles in source_results.items():
    kept = []
    for item in articles:
        # Decode HTML entities and drop any backslashes from the link.
        cleaned_link = html.unescape(item['link']).replace('\\', '')
        published = item['pubDate']
        if 'n.news.naver.com' not in cleaned_link:
            continue
        kept.append(dict(link=cleaned_link, pubDate=published))
    naver_news[source] = kept

naver_news

In [None]:
# Three-digit code that ends each source's article-link prefix
# (presumably Naver's press/office id — confirm against Naver News URLs).
article_prefix = 'https://n.news.naver.com/mnews/article/'
press_codes = {
    '오마이뉴스': '047',
    '한겨레': '028',
    '경향신문': '032',
    '조선일보': '023',
    '동아일보': '020',
    '중앙일보': '025',
    '연합뉴스': '001',
    '매일경제': '009',
    '머니투데이': '008',
}

# Link prefix expected for each source.
source_substrings = {name: article_prefix + code for name, code in press_codes.items()}

# For every source, keep only the links that contain that source's own prefix.
filtered_links = {
    source: [
        entry['link']
        for entry in naver_news.get(source, [])
        if prefix in entry['link']
    ]
    for source, prefix in source_substrings.items()
}

for source, links in filtered_links.items():
    print(f"Filtered links for {source}:")
    if links:
        print('\n'.join(links))

In [None]:
# Number of filtered links kept for each source.
counts = {source_name: len(kept_links) for source_name, kept_links in filtered_links.items()}

print(counts)

In [None]:
# Open every filtered article link in a Selenium-driven Chrome session and
# collect the headline shown on the page.
driver = webdriver.Chrome()

# Implicit wait (seconds): how long element lookups keep retrying before
# giving up. It is not a page-load timeout.
driver.implicitly_wait(1)

# Collected headlines: {source: [{'title': ..., 'link': ...}, ...]}
articles_titles = {}

try:
    for source, links in filtered_links.items():
        articles_titles[source] = []
        for link in links:
            try:
                # Navigation sits inside the try so that one unreachable page
                # is reported and skipped instead of aborting the whole crawl.
                driver.get(link)
                time.sleep(1)
                title_element = driver.find_elements(By.CLASS_NAME, 'media_end_head_title')
                if title_element:
                    title = title_element[0].text
                    articles_titles[source].append({'title': title, 'link': link})
                else:
                    print(f"No title found for link: {link}")
            except Exception as e:
                print(f"Failed to extract title from {link}: {e}")
finally:
    # Always shut the browser down, even if the crawl is interrupted or fails.
    driver.quit()

In [None]:
# Flatten the per-source headline lists into one row per article and save as CSV.
data = [
    {'Source': source_name, 'Link': entry['link'], 'Title': entry['title']}
    for source_name, entries in articles_titles.items()
    for entry in entries
]

df = pd.DataFrame(data)
df.to_csv('news_articles_해병대.csv', index=False)