In [3]:
import json, logging, os, re, requests, sys, time
import pandas as pd

from bs4 import BeautifulSoup as bs
from urllib.request import urlopen, Request
from datetime import date, datetime
from pykrx import stock
from webob.compat import urlparse
import lxml
URL_NAVER_FINANCE_NEWS_QUERY = "https://finance.naver.com/news/news_search.nhn?rcdate=&q=%s&x=0&y=0&sm=all.basic&pd=4&stDateStart=%s&stDateEnd=%s&page=%s"
URL_NAVER_FINANCE = "http://finance.naver.com"

In [4]:
# pip install webob
# pip install pykrx
# pip install lxml

In [5]:
class NaverFinanceNewsCrawler:
    def __init__(self):
        self.ticker = None
        self.result = []
        pass

    def _crawl_by_query(self, ticker, dt):
        """
        Crawl Naver Finance News
        :param ticker: string; search keywords
        :return: generator; [{title, summary, url, articleId, content, codes}, ...]
        """
        comp_name = stock.get_market_ticker_name(ticker)
        print("comp name = ", comp_name)
        # Convert the query to euc-kr string
        q = ""
        for c in comp_name.encode("euc-kr"):
            q += "%%%s" % format(c, "x").capitalize()

        # 여러 페이지를 동시에 돌려주므로, 기사가 존재할 때까지 loop 돌면서 뉴스 가져오기
        page = 1
        n_news = 0

        while True:
            r_url = URL_NAVER_FINANCE_NEWS_QUERY % (q, dt, dt, page,)
            print('url = ', r_url)
            r = requests.get(r_url)
            soup = bs(r.text, "lxml")
            news = soup.find("div", class_="newsSchResult").find(
                "dl", class_="newsList"
            )
            news_title = news.find_all("dd", class_="articleSubject")
            news_summary = news.find_all("dd", class_="articleSummary")
            wdate = news.find_all("span", class_="wdate")
            n_news += len(news_title)

            if len(news_title) > 0:
                for title, summary, _date in zip(news_title, news_summary, wdate):
                    date = _date.find(text=True).split("\n")[2].strip()

                    if date == dt:
                        url = URL_NAVER_FINANCE + title.a.get("href")
                        res = {
                            "title": title.a.text,
                            "summary": summary.find(text=True).strip(" \t\n\r"),
                            "url": url,
                            "ticker": ticker,
                            "articleId": urlparse.parse_qs(
                                urlparse.urlparse(url).query
                            )["article_id"][0],
                            "date": date,
                        }
                        res.update(self._crawl_content(url))
                        #             if self.query in res['title']:
                        self.result.append(res)
                        time.sleep(1)
                page += 1

            else:
                break
        print("number of news for comp {} = {}".format(comp_name, n_news))

    def _crawl_content(self, url):
        r = requests.get(url)
        soup = bs(r.text, "lxml")
        content = soup.find("div", id="content", class_="articleCont")
        codes = re.findall(r"\d{6}", content.text)
        cntnt = content.text.strip(" \t\n\r").split("@")[0]
        # 마지막 마침표 이후 제거
        effix = cntnt.split('.')[-1]
        cntnt = cntnt.replace(effix, '')
        return {"content": cntnt}

    def get_valid_ticker_name(self, ticker):
        try:
            return stock.get_market_ticker_name(ticker)
        except:
            pass

    def get_news(self, ticker, dt):
        # logger instance 생성
        logger = logging.getLogger(__name__)
        # Format 설정
        formatter = logging.Formatter(
            "%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] - %(message)s"
        )
        # handler 생성
        streamHandler = logging.StreamHandler()
        streamHandler.setFormatter(formatter)
        logger.addHandler(streamHandler)
        logger.setLevel(level=logging.INFO)

        today = date.today().strftime("%Y-%m-%d")
        print(f'today= {today}')
        if dt is None:
            dt = today
        else:
            dt = dt

        # 회사 정보 만들기.
        _comp = {}
        _comp["ticker"] = ticker
        _comp["name"] = [self.get_valid_ticker_name(x) for x in ticker]
    
        print('_comp = ', _comp)
        # 이미 있는 날짜와 회사 정보 가져오기

        for i, t in enumerate(_comp["ticker"]):
            logger.info("i = {}, comp = {}".format(i, _comp["ticker"][i]))
            try:
                msg = "company = {}".format(_comp["name"][i])
                logger.info(msg)
                time.sleep(1)
                self._crawl_by_query(ticker=t, dt=dt)
            except Exception as e:
                logger.error(e)
                pass

In [6]:
NNC = NaverFinanceNewsCrawler()
res = NNC.get_news(['005930', '005930'], '2022-03-22')

today= 2022-03-29


2022-03-29 02:41:02,911 INFO [__main__] [3597079524.py:109] - i = 0, comp = 005930
2022-03-29 02:41:02,912 INFO [__main__] [3597079524.py:112] - company = 삼성전자


_comp =  {'ticker': ['005930', '005930'], 'name': ['삼성전자', '삼성전자']}
comp name =  삼성전자
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=1
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=2
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=3
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=4
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=5
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C

2022-03-29 02:45:35,636 INFO [__main__] [3597079524.py:109] - i = 1, comp = 005930


url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=13
number of news for comp 삼성전자 = 204


2022-03-29 02:45:35,637 INFO [__main__] [3597079524.py:112] - company = 삼성전자


comp name =  삼성전자
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=1
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=2
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=3
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=4
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022-03-22&page=5
url =  https://finance.naver.com/news/news_search.nhn?rcdate=&q=%Bb%Ef%Bc%Ba%C0%Fc%C0%Da&x=0&y=0&sm=all.basic&pd=4&stDateStart=2022-03-22&stDateEnd=2022

KeyboardInterrupt: 

In [None]:
NNC.result