# 네이버 뉴스 크롤러

In [15]:
import requests
from bs4 import BeautifulSoup as bs
import threading

import pandas as pd

In [147]:
class NaverNewsCrawler(threading.Thread) :
    """Worker thread that scrapes Naver mobile news search results.

    Usage: create the object, call ``setParams`` and then ``start()``.
    ``run`` first collects article metadata (title, link, date, publisher)
    from the search result pages, then downloads the body of every
    collected article. Results are read back with ``getData``.
    """

    def __init__(self):
        super().__init__()

    def setParams(self, keyword, start, end, sort=1, include=None, not_include=None) :
        """Configure the crawl and reset the result buffers.

        Args:
            keyword: search keyword.
            start: number of the first search result to crawl (inclusive).
            end: number of the last search result to crawl (exclusive).
            sort: ordering; 0 = relevance, 1 = newest first, 2 = oldest first.
            include: words that must appear (stored only; ``filtering`` is
                currently a no-op).
            not_include: words that must not appear (stored only).
        """
        self.keyword = keyword
        self.start_num = start
        self.end_num = end
        self.sort = sort
        # Fresh lists per call: a mutable default would be shared by every
        # crawler instance that relies on the default.
        self.include = [] if include is None else include
        self.not_include = [] if not_include is None else not_include

        # Requests without these headers get blocked.
        self.headers =  {"Referer": "https://m.search.naver.com/search.naver?where=m_news".encode('utf-8'),
                         "User-Agent": "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"}

        self.title = []
        self.link = []
        self.date = []
        self.publisher = []
        self.contents = []

    def run(self):
        """Thread entry point: crawl metadata, then article bodies, then filter."""
        self.start_metadata_crawling()
        self.start_contents_crawling()
        self.filtering()

    def _search_url(self, idx) :
        """Return the mobile search URL for the result page starting at ``idx``."""
        return "https://m.search.naver.com/search.naver?where=m_news&query={}&start={}&sort={}".format(self.keyword, idx, self.sort)

    def start_metadata_crawling(self) :
        """Collect title, link, date and publisher from the search result pages."""

        # The result offset advances by 15 per request (one page per step).
        for idx in range(self.start_num, self.end_num, 15) :
            # Use the loop offset, not self.start_num; otherwise every
            # iteration would download the same first page again.
            self.news = self._search_url(idx)

            req = requests.get(self.news, headers=self.headers)
            soup = bs(req.text, 'html.parser')

            title, link, date, publisher = self.parsing_metadata(soup)

            self.title += title
            self.link += link
            self.date += date
            self.publisher += publisher

    def start_contents_crawling(self) :
        """Download the body text of every article in ``self.link``.

        Appends one entry to ``self.contents`` per link. When the body cannot
        be extracted, the entry is the final article URL (or an empty string
        for entertainment articles).
        """

        for link in self.link :
            # requests follows redirects, so this response already carries
            # both the final URL and its HTML; a second GET is unnecessary.
            req = requests.get(link, headers=self.headers)
            new_link = req.url
            soup = bs(req.text, 'html.parser')

            # Pick the parser from the final host, plus the value to store
            # when that parser cannot find the article container.
            if "m.entertain.naver" in new_link :
                # Entertainment article
                parser, fallback = self.parsing_entertain, ""
            elif "m.sports.naver" in new_link :
                # Sports article
                parser, fallback = self.parsing_sports, new_link
            elif "m.news.naver" in new_link :
                # General news article
                parser, fallback = self.parsing_news, new_link
            else :
                parser, fallback = None, new_link

            if parser is None :
                contents = fallback
            else :
                try :
                    contents = parser(soup)
                except AttributeError :
                    # soup.find() returned None: expected container missing.
                    contents = fallback

            self.contents.append(contents)

    def filtering(self) :
        """Placeholder for include / not_include filtering (not implemented)."""
        pass

    def parsing_entertain(self, soup) :
        """Return the text of an entertainment article body."""
        contents = soup.find('div', {'class': 'newsct_article go_trans'}).text
        return contents

    def parsing_sports(self, soup) :
        """Return the text of a sports article body."""
        contents = soup.find('article', {'class': 'main_article'}).text
        return contents

    def parsing_news(self, soup) :
        """Return the text of a general news article body."""
        contents = soup.find('div', {'id': 'dic_area'}).text
        return contents

    def parsing_metadata(self, soup) :
        """Extract metadata of Naver-hosted articles from one result page.

        Returns:
            Four parallel lists: titles, links, dates, publishers.
        """
        title = []
        link = []
        date = []
        publisher = []

        wraps = soup.find_all('div', {'class': "news_wrap"})

        for wrap in wraps :

            # An entry with exactly two "sub_txt" cite tags is treated as a
            # Naver-hosted article; all other entries are skipped.
            p = wrap.find_all("cite", {"class" : "sub_txt"})
            if len(p) == 2 :
                title.append(wrap.find("div", {"class" : "api_txt_lines tit"}).text)
                link.append(wrap.find("a", {"class" : "news_tit"}).get("href"))
                date.append(wrap.find("span", {"class" : "sub_txt sub_time"}).text)
                publisher.append(p[0].text)

        return title, link, date, publisher

    def getData(self) :
        """Return (titles, links, dates, publishers, contents) collected so far."""
        return self.title ,self.link, self.date, self.publisher, self.contents

In [148]:
def get_news_num(keyword) :
    """Return the number of news results Naver reports for ``keyword``.

    The count is read from the "title_desc all_my" element of the desktop
    search page: the text after the "/" is taken, and "," and "건" are
    removed before conversion to int (presumably the element looks like
    "1-10 / 1,234건" -- confirm against the live page).

    Returns:
        The result count, or 0 when the request fails or the count cannot
        be located or parsed.
    """
    news = "https://search.naver.com/search.naver?where=news&query={}".format(keyword)

    try :
        req = requests.get(news)
        soup = bs(req.text, 'html.parser')

        text = soup.find("div", {"class" : "title_desc all_my"}).text
        num = int(text.split("/")[1].replace("," , "").replace("건", "").strip())

    # Only the expected failures are treated as "no results":
    #   RequestException - network / HTTP layer problem
    #   AttributeError   - element not found (find() returned None)
    #   IndexError       - no "/" separator in the text
    #   ValueError       - remaining text is not an integer
    except (requests.RequestException, AttributeError, IndexError, ValueError) :
        num = 0

    return num

In [149]:
def crawling_Naver(keyword, num_thread, num=None, sort=1, include=None, not_include=None) :
    """Crawl Naver news for ``keyword`` using several crawler threads.

    Args:
        keyword: search keyword.
        num_thread: requested number of worker threads (must be >= 1).
        num: number of results to crawl; when falsy, the total is looked up
            with ``get_news_num``.
        sort: ordering passed to each crawler (0 = relevance, 1 = newest
            first, 2 = oldest first).
        include: words that must appear, passed to each crawler.
        not_include: words that must not appear, passed to each crawler.

    Returns:
        Tuple of five lists (titles, links, dates, publishers, contents),
        concatenated in thread start order.

    Raises:
        ValueError: if ``num_thread`` is smaller than 1.
    """
    if num_thread < 1 :
        raise ValueError("num_thread must be at least 1")

    # Avoid mutable default arguments.
    include = [] if include is None else include
    not_include = [] if not_include is None else not_include

    if not num :
        num = get_news_num(keyword)

    # NOTE(review): kept from the original; presumably compensates for the
    # extra range produced when num is not a multiple of interval -- confirm.
    if num_thread > 1 :
        num_thread -= 1

    # A step of 0 would make range() raise, so keep the interval at 1 or more
    # when there are fewer results than threads (or no results at all).
    interval = max(1, num // num_thread)
    thread_lst = []
    num_lst = []

    for start in range(1, num, interval) :
        naver = NaverNewsCrawler()
        # Forward sort / include / not_include so the caller's choices take
        # effect; each thread gets its own copy of the word lists.
        naver.setParams(keyword, start, start + interval, sort,
                        list(include), list(not_include))
        naver.daemon = True
        naver.start()

        thread_lst.append(naver)
        num_lst.append((start, start + interval))

    print("Num :", num)
    print("Num_Thread :", num_thread)
    print("Num_lst :", num_lst)
    print("")

    # Wait for every worker to finish.
    for idx, thread in enumerate(thread_lst, 1) :
        thread.join()
        print(idx, "Thread finished")

    # Concatenate the per-thread results in thread start order.
    tt, ll, dd, pp, cc = [], [], [], [], []
    for thread in thread_lst :
        titles, links, dates, publishers, contents = thread.getData()
        tt += titles
        ll += links
        dd += dates
        pp += publishers
        cc += contents

    return tt, ll, dd, pp, cc

In [150]:
def save_result(result, path="./", filename="crawling.csv") :
    """Write crawl results to a CSV file and return them as a DataFrame.

    Args:
        result: sequence whose first five items are the parallel lists of
            titles, links, dates, publishers and contents.
        path: prefix prepended verbatim to ``filename``.
        filename: name of the CSV file.

    Returns:
        DataFrame with columns title, link, date, publisher, contents.
    """
    column_names = ["title", "link", "date", "publisher", "contents"]

    # One row per article; zip stops at the shortest input list.
    rows = [
        list(record)
        for record in zip(result[0], result[1], result[2], result[3], result[4])
    ]

    frame = pd.DataFrame(rows, columns=column_names)
    target = path + filename
    frame.to_csv(target)

    return frame

<br><br><br>

In [151]:
# Demo run: crawl 15 results for the keyword with one worker thread, then
# save them using save_result's default path and filename.
result = crawling_Naver("크롤링", num_thread=1, num=15)
df = save_result(result)

Num : 15
Num_Thread : 1
Num_lst : [(1, 16)]

1 Thread finished
