In [1]:
import json
import multiprocessing as mp
import pickle
import threading as th

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [2]:
# Section ids (substituted into the `sid1` query parameter by make_url)
# mapped to the section names that to_dataframe writes into the "section" column.
sids = {
    100: "정치",
    101: "경제",
    102: "사회",
    103: "생활/문화",
    104: "세계",
    105: "IT/과학",
}

# Module-level page bounds. The crawl functions take same-named parameters,
# and the driver cell passes its own values instead of these.
start_pg = 1
end_pg = 12

In [3]:
def save_pkl(path, data):
    """Pickle ``data`` into the file at ``path``.

    The file is opened in binary write mode, so an existing file at ``path``
    is overwritten.

    Args:
        path: Destination file path.
        data: Any picklable object.
    """
    with open(path, mode="wb") as out_file:
        pickle.dump(data, out_file)

In [4]:
def load_pkl(path):
    """Read the file at ``path`` and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on files from a
    trusted source (e.g. ones written by ``save_pkl`` in this notebook).

    Args:
        path: Path of a pickle file.

    Returns:
        The object stored in the file.
    """
    with open(path, mode="rb") as in_file:
        return pickle.load(in_file)

In [5]:
def to_dataframe(data):
    """Build a DataFrame from article records and label each section.

    The numeric ids in the "section" column are replaced by the matching
    names from the module-level ``sids`` mapping.

    Args:
        data: Records accepted by ``pd.DataFrame`` (the crawler produces a
            list of dicts with a "section" key holding a section id).

    Returns:
        The resulting ``pd.DataFrame``. When ``data`` yields an empty frame
        (nothing was crawled) it is returned as-is, since there is no
        "section" column to translate.

    Raises:
        KeyError: If a section id is not a key of ``sids``.
    """
    df = pd.DataFrame(data)

    # An empty crawl result has no columns at all; touching "section" would fail.
    if df.empty:
        return df

    # Use item assignment: attribute-style assignment only updates a column
    # when it already exists and is easy to confuse with setting an attribute.
    df["section"] = df["section"].apply(lambda sid: sids[sid])

    return df

In [6]:
def make_url(sid, page):
    """Return the section listing URL for section ``sid`` at page ``page``.

    Args:
        sid: Section id, substituted into the ``sid1`` query parameter.
        page: Page number, substituted into the ``page`` query parameter.

    Returns:
        The URL as a string; the ``date`` parameter is a fixed literal.
    """
    query = "sid1={}&date=%2000:00:00&page={}".format(sid, page)
    return "https://news.naver.com/main/mainNews.nhn?" + query

In [7]:
def get_article_links(url, sid):
    """Fetch a section listing page and return the article URLs it lists.

    Args:
        url: Listing URL (as built by ``make_url``) whose response is JSON.
        sid: Section id; used both to pick the entries for this section out
            of the response and to fill the ``sid1`` parameter of each link.

    Returns:
        A list of article URLs, one per entry listed for ``sid``.

    Raises:
        KeyError: If the response lacks "airsResult", "result" or the
            ``str(sid)`` key.
        ValueError: If the response body or the "airsResult" string is not
            valid JSON.
    """
    url_article = "https://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1={}&oid={}&aid={}"
    response = requests.get(url)

    # "airsResult" is a JSON document embedded as a string. Parse it with
    # json.loads instead of eval(): eval on data fetched from the network can
    # execute arbitrary code, and textual true/null replacement both corrupts
    # string values containing those words and misses "false".
    airs_result = json.loads(response.json()["airsResult"])
    article_data = airs_result["result"][str(sid)]

    links = [url_article.format(sid, data["officeId"], data["articleId"]) for data in article_data]

    return links

In [8]:
def get_article(url):
    """Download one article page and extract its timestamp, title and body.

    Args:
        url: Article URL (as produced by ``get_article_links``).

    Returns:
        A ``(datetime, title, text)`` tuple of strings taken from the
        ``.sponsor > .t11``, ``#articleTitle`` and ``#articleBodyContents``
        elements respectively.

    Raises:
        AttributeError: If one of the selectors matches nothing.
        IndexError: If the body text contains no "{}" marker.
    """
    page = requests.get(url)
    soup = bs(page.content, "html.parser")

    published = soup.select_one(".sponsor > .t11").text
    headline = soup.select_one("#articleTitle").text

    # Keep only the segment that follows the first "{}" in the body text
    # (presumably skipping an inline script stub — verify against a live page).
    raw_body = soup.select_one("#articleBodyContents").text
    body = raw_body.split("{}")[1].strip()

    return published, headline, body

In [9]:
def crawl_section_page(url, sid, section_articles, th_lock):
    """Crawl every article on one listing page and add them to a shared list.

    Used as a thread target by ``crawl_section``.

    Args:
        url: Listing page URL.
        sid: Section id stored under the "section" key of every record.
        section_articles: Shared list that receives this page's records.
        th_lock: Lock held while ``section_articles`` is being extended.
    """
    links = get_article_links(url, sid)

    page_articles = []
    for link in links:
        datetime, title, text = get_article(link)
        page_articles.append(
            {
                "section": sid,
                "datetime": datetime,
                "title": title,
                "text": text,
            }
        )

    # All network work is done; hold the lock only for the in-place append.
    with th_lock:
        section_articles += page_articles

In [10]:
def crawl_section(start_pg, end_pg, sid, all_articles, mp_lock):
    """Crawl pages ``start_pg``..``end_pg`` (inclusive) of one section.

    One thread per page runs ``crawl_section_page``; once all threads have
    finished, the collected records are added to ``all_articles``. Used as a
    process target by ``crawl_naver_news``.

    Args:
        start_pg: First page number to crawl.
        end_pg: Last page number to crawl (inclusive).
        sid: Section id.
        all_articles: Shared list extended in place with this section's records.
        mp_lock: Lock held while ``all_articles`` is being extended.
    """
    print("Process {} STARTED".format(sid % 100))

    section_articles = []
    th_lock = th.Lock()

    # Build one worker per page first, then start them all, then wait for all.
    threads = []
    for page in range(start_pg, end_pg + 1):
        worker = th.Thread(
            target=crawl_section_page,
            args=(make_url(sid, page), sid, section_articles, th_lock),
        )
        threads.append(worker)

    for worker in threads:
        worker.start()

    for worker in threads:
        worker.join()

    mp_lock.acquire()

    try:
        all_articles += section_articles
    finally:
        mp_lock.release()
        print("Process {} FINISHED".format(sid % 100))

In [11]:
def crawl_naver_news(sids, start_pg, end_pg):
    """Crawl several sections in parallel, one process per section id.

    Each process runs ``crawl_section`` and adds its records to a list
    shared through a ``multiprocessing`` manager.

    Args:
        sids: Iterable of section ids (iterating a dict yields its keys).
        start_pg: First page number to crawl in every section.
        end_pg: Last page number to crawl in every section (inclusive).

    Returns:
        A plain ``list`` with the records from all sections. Sections are
        added as their processes finish, so their relative order is not
        fixed by this function.
    """
    mp_lock = mp.Lock()

    # The manager runs a server process; the context manager shuts it down
    # deterministically, including when starting or joining a worker fails.
    with mp.Manager() as manager:
        all_articles = manager.list()

        processes = [
            mp.Process(target=crawl_section, args=(start_pg, end_pg, sid, all_articles, mp_lock))
            for sid in sids
        ]

        for process in processes:
            process.start()

        for process in processes:
            process.join()

        # Copy out of the proxy while the manager is still alive.
        return list(all_articles)

In [14]:
%%time

# Each entry is the last page to crawl per section; pages 1..end_page are fetched.
end_pages = [10]

for end_page in end_pages:
    # NOTE(review): the 120 multiplier is presumably 6 sections x 20 articles
    # per page (the recorded run stored 1200 rows for 10 pages) — confirm.
    # It is not derived from len(sids), so the file name can drift from the
    # real row count if `sids` changes.
    path = "naver_{}.pkl".format(end_page * 120)
    # Crawl every section in `sids`, convert the records to a DataFrame,
    # report the row count and pickle the frame to `path`.
    df = to_dataframe(crawl_naver_news(sids, 1, end_page))
    print(len(df))
    save_pkl(path, df)

Process 0 STARTED
Process 1 STARTED
Process 2 STARTED
Process 3 STARTED
Process 4 STARTED
Process 5 STARTED
Process 0 FINISHED
Process 4 FINISHED
Process 2 FINISHED
Process 5 FINISHED
Process 3 FINISHED
Process 1 FINISHED
1200
CPU times: user 68.7 ms, sys: 48.5 ms, total: 117 ms
Wall time: 18.6 s


In [15]:
# Reload the pickled DataFrame; "naver_1200.pkl" is the name the crawl cell
# produces for end_page = 10 (10 * 120).
df = load_pkl("naver_1200.pkl")

In [16]:
# Preview the first rows of the reloaded DataFrame.
df.head()

Unnamed: 0,datetime,section,text,title
0,2019.07.02. 오후 5:32,정치,3일 후보자 등록 받은 후 5일 의총 열어 선출김재원 의원 경선 요구해 투표 불가피...,한국당 '집안싸움' 예결위원장…5일 의총서 경선 선출 유력
1,2019.07.02. 오후 5:31,정치,"경제보복 발등의 불 떨어졌는데…문 대통령 주재 閣議 열렸지만 대응책 논의 안해靑 ""...","日 도발에 무기력한 정부…문 대통령 침묵, 康외교는 ""상황 보며 연구"""
2,2019.07.02. 오후 5:30,정치,적자 기관 마저 무리한 추진발전 정비용역기업 소송 채비기존 공무원과 갈등도 심각학교...,공공부문 정규직화...서둘다 禍 키웠다
3,2019.07.02. 오후 5:29,정치,국회예산처 '한국형 실업 부조'재정 소요 분석내년부터 저소득층 실직자에6개월간 매달...,[단독] 게임기 사는데 줄줄 샌 '청년수당'…세금 6.5兆 쏟아붓겠다는 정부
4,2019.07.02. 오후 5:29,정치,"文대통령, 일산병원 의사·환자들 직접 만나文 ""꼭 필요한 치료, 급여항목 지정안된 ...","'문재인케어' 현장서 살핀 文대통령…""MRI 가격 묻던 환자들 자연스레 찍게돼"""
