In [2]:
import datetime
import logging
import os
import re
import time
from multiprocessing import Pool
from urllib.request import urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
#from tqdm import tqdm

In [17]:
def naver_news_crawling(company_code):
    """Crawl all Naver Finance news-list pages for one stock and save them as CSV.

    Walks pages 1..last_page(company_code) of the stock's news list, collects
    the title, link, provider, date and time of every listed article, removes
    duplicated (title, date) pairs, writes the result through save_csv() and
    returns it.

    Args:
        company_code: Stock code substituted into the ``code`` query parameter.

    Returns:
        pandas.DataFrame with the columns '제목' (title), '링크' (link),
        '기관' (provider), '날짜' (date) and '시간' (time).

    Raises:
        Whatever urlopen() raises on network/HTTP failure, and ValueError from
        pandas if the scraped columns end up with different lengths.
    """
    start = time.time()

    # basicConfig() does nothing once the root logger has a handler, so a
    # single call at the top is enough for every message below.
    logging.basicConfig(level=logging.INFO)
    logging.info(' %s start', company_code)

    news_title = []
    news_link = []
    news_info = []
    news_date_ymd = []
    news_date_hms = []

    # Number of the last page of the news list
    end = int(last_page(company_code))

    for n in range(1, end + 1):
        url = "https://finance.naver.com/item/news_news.nhn?code={code}&page={page}".format(code=company_code, page=n)

        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(url) as page:
            naver_news_soup = BeautifulSoup(page, 'lxml')

        # Titles and links come from the same anchors, so query them once.
        naver_news_anchor = naver_news_soup.find_all('a', class_='tit')
        news_title.extend([anchor.get_text() for anchor in naver_news_anchor])
        # Drop the trailing '&page...' part so the link does not depend on the list page.
        news_link.extend(['https://finance.naver.com' + anchor['href'].split('&page')[0] for anchor in naver_news_anchor])

        # Provider (news organisation)
        naver_news_info = naver_news_soup.find_all('td', class_='info')
        news_info.extend([info.string for info in naver_news_info])

        # Date cell text is split on single spaces; fields 1 and 2 are used as
        # date and time (field 0 is presumably an empty leading token — verify
        # against the page markup).
        naver_news_date = naver_news_soup.find_all('td', class_='date')
        date_fields = [date.string.split(' ') for date in naver_news_date]
        news_date_ymd.extend([fields[1] for fields in date_fields])
        news_date_hms.extend([fields[2] for fields in date_fields])

        logging.info('- %s/%s page', n, end)

    # Assemble the DataFrame
    data = {
          '제목': news_title
        , '링크' : news_link
        , '기관' : news_info
        , '날짜' : news_date_ymd
        , '시간' : news_date_hms
    }
    df = pd.DataFrame(data)

    # Remove duplicated news, keeping the first occurrence. The title and date
    # must be compared as a pair: flagging each column independently would
    # also drop unique articles whose title and date merely appeared earlier
    # in different rows.
    df = df.drop_duplicates(subset=['제목', '날짜'])

    # Save as CSV
    save_csv(df, company_code)

    # Elapsed time
    sec = time.time() - start
    logging.info(' %s finish (%s sec)', company_code, round(sec, 2))

    return df

In [18]:
# Look up the last page number of a stock's news list
def last_page(company_code):
    """Return the number of the last news-list page for ``company_code``.

    Starting at page 1, follows the "last page" link (the anchor inside
    ``td.pgRR``) until the fetched page no longer offers one, and returns the
    page number reached.

    Args:
        company_code: Stock code substituted into the ``code`` query parameter.

    Returns:
        int: the last page number; 1 when the first page has no usable
        "last page" link.

    Raises:
        Whatever urlopen() raises on network/HTTP failure.
    """
    page = 1

    while True:
        tmp_url = "https://finance.naver.com/item/news_news.nhn?code={code}&page={page}".format(code=company_code, page=page)
        with urlopen(tmp_url) as tmp_page:
            tmp_soup = BeautifulSoup(tmp_page, 'lxml')

        last_cell = tmp_soup.find('td', class_='pgRR')
        last_link = last_cell.a if last_cell is not None else None
        if last_link is None:
            # No "last page" link: the current page is the last one.
            return page

        # Read the page number by parameter name instead of by position.
        match = re.search(r'[?&]page=(\d+)', last_link.get('href', ''))
        if match is None:
            return page

        target = int(match.group(1))
        if target <= page:
            # The link does not move forward; stop rather than loop forever.
            return page
        page = target

In [19]:
# Save the crawled news as a CSV file
def save_csv(df,company_code):
    """Write ``df`` to ./pjt/news/news_<company_code>.csv (UTF-8).

    The target directory is created first when it does not exist, so a fresh
    checkout does not fail on the first save.

    Args:
        df: pandas.DataFrame to write (its index is written as well).
        company_code: Value substituted into the file name.
    """
    path = './pjt/news/news_{name}.csv'.format(name=company_code)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(
          path
        , encoding='utf-8'
    )

In [20]:
# Fetch the most-searched stocks on Naver Finance (top 30 per the original note)
def stock_rank():
    """Return the stock codes listed on Naver Finance's "last search" ranking page.

    Returns:
        list[str]: one code per matched anchor, in page order, taken from the
        text after the first '=' of each anchor's href.

    Raises:
        Whatever urlopen() raises on network/HTTP failure.
    """
    url = "https://finance.naver.com/sise/lastsearch2.nhn"

    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as page:
        company_code_soup = BeautifulSoup(page, 'lxml')

    # NOTE(review): 'tltle' is kept exactly as written; it presumably matches
    # the class name used by the page markup — confirm before "fixing" it.
    rank_company_code = company_code_soup.find_all('a', class_='tltle')

    return [code['href'].split('=')[1] for code in rank_company_code]

In [21]:
# Test helper: feed every ranked stock code to the crawler, one after another
def crawling_ranking_stock():
    """Run naver_news_crawling() once for each code returned by stock_rank()."""
    ranked_codes = stock_rank()
    for ranked_code in ranked_codes:
        naver_news_crawling(ranked_code)

In [22]:
def crawling_news(codes=None):
    """Crawl news for many stocks, logging and skipping the ones that fail.

    Args:
        codes: Iterable of stock codes to crawl. When omitted, the '종목코드'
            column of the notebook-global ``df`` is used, which must therefore
            hold the stock code list at call time.

    Returns:
        None. Each stock's result is saved by naver_news_crawling().
    """
    if codes is None:
        # Backward-compatible default: read the codes from the global frame.
        codes = df['종목코드']

    for code in codes:
        try:
            naver_news_crawling(code)
        except Exception as ex:
            # Best effort: report which stock failed and move on to the next.
            logging.error('에러 발생 (%s): %s', code, ex)

In [None]:
# NOTE(review): abandoned experiments, kept commented out for reference only.
# The Pool variant would not work as written: Pool.map expects
# (function, iterable), but this calls crawling_ranking_stock() eagerly and
# passes crawling_ranking1_stock(), which is not defined in the visible notebook.
#pool = Pool(processes=2) # use 2 worker processes
#pool.map(crawling_ranking_stock(),crawling_ranking1_stock())
# The try/except variant is also invalid: `continue` is not inside a loop.
# try:
#     crawling_ranking_stock()
# except Exception as ex:
#     print('에러 발생', ex)
#     continue
    
# Crawl the news of every stock; run the cell that loads the code list first.
crawling_news()


INFO:root: 000020 start
INFO:root:- 1/13 page
INFO:root:- 2/13 page
INFO:root:- 3/13 page
INFO:root:- 4/13 page
INFO:root:- 5/13 page
INFO:root:- 6/13 page
INFO:root:- 7/13 page
INFO:root:- 8/13 page
INFO:root:- 9/13 page
INFO:root:- 10/13 page
INFO:root:- 11/13 page
INFO:root:- 12/13 page
INFO:root:- 13/13 page
INFO:root: 000020 finish (2.75 sec)
INFO:root: 000040 start
INFO:root:- 1/5 page
INFO:root:- 2/5 page
INFO:root:- 3/5 page
INFO:root:- 4/5 page
INFO:root:- 5/5 page
INFO:root: 000040 finish (0.84 sec)
INFO:root: 000050 start
INFO:root:- 1/5 page
INFO:root:- 2/5 page
INFO:root:- 3/5 page
INFO:root:- 4/5 page
INFO:root:- 5/5 page
INFO:root: 000050 finish (0.83 sec)
INFO:root: 000060 start
INFO:root:- 1/21 page
INFO:root:- 2/21 page
INFO:root:- 3/21 page
INFO:root:- 4/21 page
INFO:root:- 5/21 page
INFO:root:- 6/21 page
INFO:root:- 7/21 page
INFO:root:- 8/21 page
INFO:root:- 9/21 page
INFO:root:- 10/21 page
INFO:root:- 11/21 page
INFO:root:- 12/21 page
INFO:root:- 13/21 page
INFO:r

INFO:root:- 1/2 page
INFO:root:- 2/2 page
INFO:root: 000155 finish (0.37 sec)
INFO:root: 000157 start
INFO:root:- 1/3 page
INFO:root:- 2/3 page
INFO:root:- 3/3 page
INFO:root: 000157 finish (0.88 sec)
INFO:root: 000180 start
INFO:root:- 1/1 page
INFO:root: 000180 finish (0.23 sec)
INFO:root: 000210 start
INFO:root:- 1/92 page
INFO:root:- 2/92 page
INFO:root:- 3/92 page
INFO:root:- 4/92 page
INFO:root:- 5/92 page
INFO:root:- 6/92 page
INFO:root:- 7/92 page
INFO:root:- 8/92 page
INFO:root:- 9/92 page
INFO:root:- 10/92 page
INFO:root:- 11/92 page
INFO:root:- 12/92 page
INFO:root:- 13/92 page
INFO:root:- 14/92 page
INFO:root:- 15/92 page
INFO:root:- 16/92 page
INFO:root:- 17/92 page
INFO:root:- 18/92 page
INFO:root:- 19/92 page
INFO:root:- 20/92 page
INFO:root:- 21/92 page
INFO:root:- 22/92 page
INFO:root:- 23/92 page
INFO:root:- 24/92 page
INFO:root:- 25/92 page
INFO:root:- 26/92 page
INFO:root:- 27/92 page
INFO:root:- 28/92 page
INFO:root:- 29/92 page
INFO:root:- 30/92 page
INFO:root:- 3

INFO:root:- 207/235 page
INFO:root:- 208/235 page
INFO:root:- 209/235 page
INFO:root:- 210/235 page
INFO:root:- 211/235 page
INFO:root:- 212/235 page
INFO:root:- 213/235 page
INFO:root:- 214/235 page
INFO:root:- 215/235 page
INFO:root:- 216/235 page
INFO:root:- 217/235 page
INFO:root:- 218/235 page
INFO:root:- 219/235 page
INFO:root:- 220/235 page
INFO:root:- 221/235 page
INFO:root:- 222/235 page
INFO:root:- 223/235 page
INFO:root:- 224/235 page
INFO:root:- 225/235 page
INFO:root:- 226/235 page
INFO:root:- 227/235 page
INFO:root:- 228/235 page
INFO:root:- 229/235 page
INFO:root:- 230/235 page
INFO:root:- 231/235 page
INFO:root:- 232/235 page
INFO:root:- 233/235 page
INFO:root:- 234/235 page
INFO:root:- 235/235 page
INFO:root: 000270 finish (43.88 sec)
INFO:root: 000300 start
INFO:root:- 1/3 page
INFO:root:- 2/3 page
INFO:root:- 3/3 page
INFO:root: 000300 finish (0.64 sec)
INFO:root: 000320 start
INFO:root:- 1/1 page
INFO:root: 000320 finish (0.23 sec)
INFO:root: 000325 start
INFO:root:

INFO:root:- 41/128 page
INFO:root:- 42/128 page
INFO:root:- 43/128 page
INFO:root:- 44/128 page
INFO:root:- 45/128 page
INFO:root:- 46/128 page
INFO:root:- 47/128 page
INFO:root:- 48/128 page
INFO:root:- 49/128 page
INFO:root:- 50/128 page
INFO:root:- 51/128 page
INFO:root:- 52/128 page
INFO:root:- 53/128 page
INFO:root:- 54/128 page
INFO:root:- 55/128 page
INFO:root:- 56/128 page
INFO:root:- 57/128 page
INFO:root:- 58/128 page
INFO:root:- 59/128 page
INFO:root:- 60/128 page
INFO:root:- 61/128 page
INFO:root:- 62/128 page
INFO:root:- 63/128 page
INFO:root:- 64/128 page
INFO:root:- 65/128 page
INFO:root:- 66/128 page
INFO:root:- 67/128 page
INFO:root:- 68/128 page
INFO:root:- 69/128 page
INFO:root:- 70/128 page
INFO:root:- 71/128 page
INFO:root:- 72/128 page
INFO:root:- 73/128 page
INFO:root:- 74/128 page
INFO:root:- 75/128 page
INFO:root:- 76/128 page
INFO:root:- 77/128 page
INFO:root:- 78/128 page
INFO:root:- 79/128 page
INFO:root:- 80/128 page
INFO:root:- 81/128 page
INFO:root:- 82/1

INFO:root:- 37/47 page
INFO:root:- 38/47 page
INFO:root:- 39/47 page
INFO:root:- 40/47 page
INFO:root:- 41/47 page
INFO:root:- 42/47 page
INFO:root:- 43/47 page
INFO:root:- 44/47 page
INFO:root:- 45/47 page
INFO:root:- 46/47 page
INFO:root:- 47/47 page
INFO:root: 001040 finish (7.41 sec)
INFO:root: 001045 start
INFO:root:- 1/2 page
INFO:root:- 2/2 page
INFO:root: 001045 finish (0.47 sec)
INFO:root: 00104K start
INFO:root:- 1/2 page
INFO:root:- 2/2 page
INFO:root: 00104K finish (0.5 sec)
INFO:root: 001060 start
INFO:root:- 1/22 page
INFO:root:- 2/22 page
INFO:root:- 3/22 page
INFO:root:- 4/22 page
INFO:root:- 5/22 page
INFO:root:- 6/22 page
INFO:root:- 7/22 page
INFO:root:- 8/22 page
INFO:root:- 9/22 page
INFO:root:- 10/22 page
INFO:root:- 11/22 page
INFO:root:- 12/22 page
INFO:root:- 13/22 page
INFO:root:- 14/22 page
INFO:root:- 15/22 page
INFO:root:- 16/22 page
INFO:root:- 17/22 page
INFO:root:- 18/22 page
INFO:root:- 19/22 page
INFO:root:- 20/22 page
INFO:root:- 21/22 page
INFO:root:

INFO:root:- 18/26 page
INFO:root:- 19/26 page
INFO:root:- 20/26 page
INFO:root:- 21/26 page
INFO:root:- 22/26 page
INFO:root:- 23/26 page
INFO:root:- 24/26 page
INFO:root:- 25/26 page
INFO:root:- 26/26 page
INFO:root: 001510 finish (3.99 sec)
INFO:root: 001515 start
INFO:root:- 1/1 page
INFO:root: 001515 finish (0.19 sec)
INFO:root: 001520 start
INFO:root:- 1/4 page
INFO:root:- 2/4 page
INFO:root:- 3/4 page
INFO:root:- 4/4 page
INFO:root: 001520 finish (0.91 sec)
INFO:root: 001525 start
INFO:root:- 1/1 page
INFO:root: 001525 finish (0.21 sec)
INFO:root: 001527 start
INFO:root:- 1/1 page
INFO:root: 001527 finish (0.2 sec)
INFO:root: 001529 start
INFO:root:- 1/1 page
INFO:root: 001529 finish (0.15 sec)
INFO:root: 001530 start
INFO:root:- 1/2 page
INFO:root:- 2/2 page
INFO:root: 001530 finish (0.45 sec)
INFO:root: 001550 start
INFO:root:- 1/5 page
INFO:root:- 2/5 page
INFO:root:- 3/5 page
INFO:root:- 4/5 page
INFO:root:- 5/5 page
INFO:root: 001550 finish (0.83 sec)
INFO:root: 001560 start

INFO:root:- 7/50 page
INFO:root:- 8/50 page
INFO:root:- 9/50 page
INFO:root:- 10/50 page
INFO:root:- 11/50 page
INFO:root:- 12/50 page
INFO:root:- 13/50 page
INFO:root:- 14/50 page
INFO:root:- 15/50 page
INFO:root:- 16/50 page
INFO:root:- 17/50 page
INFO:root:- 18/50 page
INFO:root:- 19/50 page
INFO:root:- 20/50 page
INFO:root:- 21/50 page
INFO:root:- 22/50 page
INFO:root:- 23/50 page
INFO:root:- 24/50 page
INFO:root:- 25/50 page
INFO:root:- 26/50 page
INFO:root:- 27/50 page
INFO:root:- 28/50 page
INFO:root:- 29/50 page
INFO:root:- 30/50 page
INFO:root:- 31/50 page
INFO:root:- 32/50 page
INFO:root:- 33/50 page
INFO:root:- 34/50 page
INFO:root:- 35/50 page
INFO:root:- 36/50 page
INFO:root:- 37/50 page
INFO:root:- 38/50 page
INFO:root:- 39/50 page
INFO:root:- 40/50 page
INFO:root:- 41/50 page
INFO:root:- 42/50 page
INFO:root:- 43/50 page
INFO:root:- 44/50 page
INFO:root:- 45/50 page
INFO:root:- 46/50 page
INFO:root:- 47/50 page
INFO:root:- 48/50 page
INFO:root:- 49/50 page
INFO:root:- 50

INFO:root:- 9/12 page
INFO:root:- 10/12 page
INFO:root:- 11/12 page
INFO:root:- 12/12 page
INFO:root: 003060 finish (1.67 sec)
INFO:root: 003070 start
INFO:root:- 1/18 page
INFO:root:- 2/18 page
INFO:root:- 3/18 page
INFO:root:- 4/18 page
INFO:root:- 5/18 page
INFO:root:- 6/18 page
INFO:root:- 7/18 page
INFO:root:- 8/18 page
INFO:root:- 9/18 page
INFO:root:- 10/18 page
INFO:root:- 11/18 page
INFO:root:- 12/18 page
INFO:root:- 13/18 page
INFO:root:- 14/18 page
INFO:root:- 15/18 page
INFO:root:- 16/18 page
INFO:root:- 17/18 page
INFO:root:- 18/18 page
INFO:root: 003070 finish (3.05 sec)
INFO:root: 003075 start
INFO:root:- 1/1 page
INFO:root: 003075 finish (0.16 sec)
INFO:root: 003080 start
INFO:root:- 1/1 page
INFO:root: 003080 finish (0.16 sec)
INFO:root: 003090 start
INFO:root:- 1/5 page
INFO:root:- 2/5 page
INFO:root:- 3/5 page
INFO:root:- 4/5 page
INFO:root:- 5/5 page
INFO:root: 003090 finish (0.82 sec)
INFO:root: 003120 start
INFO:root:- 1/3 page
INFO:root:- 2/3 page
INFO:root:- 3/3

INFO:root:- 148/189 page
INFO:root:- 149/189 page
INFO:root:- 150/189 page
INFO:root:- 151/189 page
INFO:root:- 152/189 page
INFO:root:- 153/189 page
INFO:root:- 154/189 page
INFO:root:- 155/189 page
INFO:root:- 156/189 page
INFO:root:- 157/189 page
INFO:root:- 158/189 page
INFO:root:- 159/189 page
INFO:root:- 160/189 page
INFO:root:- 161/189 page
INFO:root:- 162/189 page
INFO:root:- 163/189 page
INFO:root:- 164/189 page
INFO:root:- 165/189 page
INFO:root:- 166/189 page
INFO:root:- 167/189 page
INFO:root:- 168/189 page
INFO:root:- 169/189 page
INFO:root:- 170/189 page
INFO:root:- 171/189 page
INFO:root:- 172/189 page
INFO:root:- 173/189 page
INFO:root:- 174/189 page
INFO:root:- 175/189 page
INFO:root:- 176/189 page
INFO:root:- 177/189 page
INFO:root:- 178/189 page
INFO:root:- 179/189 page
INFO:root:- 180/189 page
INFO:root:- 181/189 page
INFO:root:- 182/189 page
INFO:root:- 183/189 page
INFO:root:- 184/189 page
INFO:root:- 185/189 page
INFO:root:- 186/189 page
INFO:root:- 187/189 page


INFO:root:- 3/4 page
INFO:root:- 4/4 page
INFO:root: 003560 finish (1.12 sec)
INFO:root: 003570 start
INFO:root:- 1/3 page
INFO:root:- 2/3 page
INFO:root:- 3/3 page
INFO:root: 003570 finish (0.66 sec)
INFO:root: 003580 start
INFO:root:- 1/9 page
INFO:root:- 2/9 page
INFO:root:- 3/9 page
INFO:root:- 4/9 page
INFO:root:- 5/9 page
INFO:root:- 6/9 page
INFO:root:- 7/9 page
INFO:root:- 8/9 page
INFO:root:- 9/9 page
INFO:root: 003580 finish (1.34 sec)
INFO:root: 003610 start
INFO:root:- 1/2 page
INFO:root:- 2/2 page
INFO:root: 003610 finish (0.56 sec)
INFO:root: 003620 start
INFO:root:- 1/112 page
INFO:root:- 2/112 page
INFO:root:- 3/112 page
INFO:root:- 4/112 page
INFO:root:- 5/112 page
INFO:root:- 6/112 page
INFO:root:- 7/112 page
INFO:root:- 8/112 page
INFO:root:- 9/112 page
INFO:root:- 10/112 page
INFO:root:- 11/112 page
INFO:root:- 12/112 page
INFO:root:- 13/112 page
INFO:root:- 14/112 page
INFO:root:- 15/112 page
INFO:root:- 16/112 page
INFO:root:- 17/112 page
INFO:root:- 18/112 page
I

INFO:root:- 1/3 page
INFO:root:- 2/3 page
INFO:root:- 3/3 page
INFO:root: 004150 finish (2.01 sec)
INFO:root: 004170 start
INFO:root:- 1/160 page
INFO:root:- 2/160 page
INFO:root:- 3/160 page
INFO:root:- 4/160 page
INFO:root:- 5/160 page
INFO:root:- 6/160 page
INFO:root:- 7/160 page
INFO:root:- 8/160 page
INFO:root:- 9/160 page
INFO:root:- 10/160 page
INFO:root:- 11/160 page
INFO:root:- 12/160 page
INFO:root:- 13/160 page
INFO:root:- 14/160 page
INFO:root:- 15/160 page
INFO:root:- 16/160 page
INFO:root:- 17/160 page
INFO:root:- 18/160 page
INFO:root:- 19/160 page
INFO:root:- 20/160 page
INFO:root:- 21/160 page
INFO:root:- 22/160 page
INFO:root:- 23/160 page
INFO:root:- 24/160 page
INFO:root:- 25/160 page
INFO:root:- 26/160 page
INFO:root:- 27/160 page
INFO:root:- 28/160 page
INFO:root:- 29/160 page
INFO:root:- 30/160 page
INFO:root:- 31/160 page
INFO:root:- 32/160 page
INFO:root:- 33/160 page
INFO:root:- 34/160 page
INFO:root:- 35/160 page
INFO:root:- 36/160 page
INFO:root:- 37/160 pag

In [20]:
# Inspect a previously saved news CSV (first column is used as the index).
# NOTE(review): this path ('./pjt/news_...') differs from the directory that
# save_csv() writes to ('./pjt/news/news_...') — confirm which one is current.
# NOTE(review): this rebinds the global `df`, which crawling_news() reads and
# expects to contain a '종목코드' column — reload the code list before crawling.
df = pd.read_csv(
      './pjt/news_019170.csv'
    , index_col = 0 
)

df

Unnamed: 0,제목,링크,기관,날짜,시간
0,"신풍제약, 올 27배↑ 시총 10조인데...분석보고서는 ‘0’",https://finance.naver.com/item/news_read.nhn?a...,서울경제,2020.09.18,17:46
1,신풍제약 시총 10조 돌파 올해 26배 급증...하나금융·삼성화재도 ...,https://finance.naver.com/item/news_read.nhn?a...,서울경제,2020.09.18,14:17
2,"신풍제약·일양약품, 코로나 치료제 기대감에 강세",https://finance.naver.com/item/news_read.nhn?a...,조선비즈,2020.09.18,14:00
3,[주식 초고수는 지금]신풍제약 11거래일 연속 매수 1위...신성델타테...,https://finance.naver.com/item/news_read.nhn?a...,서울경제,2020.09.18,11:19
4,"신풍제약 18% 급등…코로나 치료제 기업들 동반강세, 왜?[특징주]",https://finance.naver.com/item/news_read.nhn?a...,머니투데이,2020.09.18,10:10
5,연저점 6개월 만에 ‘2229%’…신풍제약 ‘수익률 킹’,https://finance.naver.com/item/news_read.nhn?a...,헤럴드경제,2020.09.17,11:52
6,"[주식 초고수는 지금]신풍제약·우리들휴브레인 사고, 진원생명과학·텔콘R...",https://finance.naver.com/item/news_read.nhn?a...,서울경제,2020.09.17,11:18
7,[주식 초고수는 지금]신풍제약 연일 관심... ‘씨젠’ 손바뀜 치열,https://finance.naver.com/item/news_read.nhn?a...,서울경제,2020.09.16,11:08
8,[주식 초고수는 지금] 신풍제약 사고 카카오게임즈 팔았다,https://finance.naver.com/item/news_read.nhn?a...,서울경제,2020.09.15,11:21
9,[주식 초고수는 지금] 신풍제약 이어 카카오게임즈 가장 많이 매수·매도,https://finance.naver.com/item/news_read.nhn?a...,서울경제,2020.09.14,11:16


In [9]:
# Load the stock code list (the CSV is ms949-encoded).
# The code column is pinned to str so codes keep their leading zeros
# (e.g. '000020') instead of depending on pandas' dtype inference.
df = pd.read_csv(
      './pjt/code.csv'
    , encoding='ms949'
    , dtype={'종목코드': str}
)

df

Unnamed: 0,종목코드,종목명
0,000020,동화약품
1,000040,KR모터스
2,000050,경방
3,000060,메리츠화재
4,000070,삼양홀딩스
...,...,...
1556,590018,미래에셋 중국 심천 100 ETN
1557,590012,미래에셋 글로벌 리츠 ETN(H)
1558,700001,하나 코스피 변동성추세 추종 양매도 ETN
1559,900140,엘브이엠씨홀딩스


In [10]:
# Print every stock code in the list, one per line
for stock_code in df.loc[:, '종목코드']:
    print(stock_code)

000020
000040
000050
000060
000070
000075
000080
000087
000100
000105
000120
000140
000145
000150
000155
000157
000180
000210
000215
000220
000225
000227
000230
000240
000270
000300
000320
000325
000370
000390
000400
000430
000480
000490
000500
000520
000540
000545
000547
000590
000640
000650
000660
000670
000680
000700
000720
000725
000760
000810
000815
000850
000860
000880
000885
00088K
000890
000910
000950
000970
000990
000995
001020
001040
001045
00104K
001060
001065
001067
001070
001080
001120
001130
001140
001200
001210
001230
001250
001260
001270
001275
001290
001340
001360
001380
001390
001420
001430
001440
001450
001460
001465
001470
001500
001510
001515
001520
001525
001527
001529
001530
001550
001560
001570
001620
001630
001680
001685
001720
001725
001740
001745
001750
001755
001770
001780
001790
001795
001800
001820
001880
001940
002020
002025
002030
002070
002100
002140
002150
002170
002200
002210
002220
002240
002270
002310
002320
002350
002355
002360
002380
002390
002410