In [4]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
from dateutil.parser import parse 

### Novel Class
* 101 로맨스
* 102 SF & 판타지
* 103 무협
* 104 미스터리
* 105 역사&전쟁 (베스트&챌린지)
* 106 라이트노벨
* 107 팬픽 (챌린지)
* 108 퓨전


* webnovel 오늘의 웹소설
* best 베스트리그
* challenge 챌린지리그


### novel ID crawling

In [10]:
def get_genre_url(genre, level):
    """Collect the novel IDs listed under one genre of one league.

    Parameters
    ----------
    genre : int
        Genre code placed in the ``genre=`` query parameter.
    level : str
        League name used as the first URL path segment
        (``"webnovel"``, ``"best"`` or ``"challenge"``).

    Returns
    -------
    pandas.DataFrame
        One row per listed novel with columns ``ID``, ``level`` and ``genre``.
        The frame is empty (but keeps its columns) when nothing is listed.
    """
    # Only the "webnovel" level additionally walks the finished-works listing.
    listing_suffixes = ["", "&order=Read&finish=true"]
    if level != "webnovel":
        listing_suffixes = listing_suffixes[:1]

    # Rows are gathered in a plain list and turned into a DataFrame once,
    # instead of enlarging a DataFrame row by row.
    records = []

    for fin in listing_suffixes:
        # Ask for page 1000 and read the number shown in the pager's <strong>
        # element as the last page.
        # NOTE(review): presumably the site clamps an out-of-range page to the
        # final one -- confirm.
        url = "http://novel.naver.com/{0}/genre.nhn?genre={1}{2}&page=1000".format(level, genre, fin)
        response = requests.get(url)
        dom = BeautifulSoup(response.content, "lxml")
        paging = dom.select_one("div.paging")
        if paging:
            end_page = int(paging.select_one("strong").contents[0])
        else:
            # No pager element on the page: treat the listing as a single page.
            end_page = 1

        for page in range(1, end_page + 1):
            url = "http://novel.naver.com/{0}/genre.nhn?genre={1}{2}&page={3}".format(level, genre, fin, page)
            response = requests.get(url)
            dom = BeautifulSoup(response.content, "lxml")

            for item in dom.select("a.list_item"):
                # Keeps the text between the first and second "=" of the href.
                # NOTE(review): assumes the novel id is the first query
                # parameter of the link -- confirm.
                novel_ID = item["href"].split("=")[1]
                records.append((novel_ID, level, genre))

    return pd.DataFrame(records, columns=["ID", "level", "genre"])

def get_url(level="webnovel"):
    """Crawl the novel IDs of every genre for one league and pickle them.

    Parameters
    ----------
    level : str, default "webnovel"
        League to crawl. ``"best"`` additionally includes genre 105 and
        ``"challenge"`` additionally includes genres 105 and 107.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with columns ``ID``, ``level`` and ``genre`` and a
        contiguous 0..n-1 index. The same frame is written to
        ``data/genres_df.pickle`` (the file name does not depend on ``level``,
        so each call overwrites the previous result).
    """
    # Create the output directory if needed; no error when it already exists.
    os.makedirs("data", exist_ok=True)

    genres = [101, 102, 103, 104, 106, 108]
    if level == "best":
        genres += [105]
    if level == "challenge":
        genres += [105, 107]

    genres_data = [get_genre_url(genre, level) for genre in genres]

    # De-duplicate first and reset the index afterwards, so that the dropped
    # rows do not leave gaps in the index.
    genres_df = pd.concat(genres_data).drop_duplicates().reset_index(drop=True)
    genres_df.to_pickle("data/genres_df.pickle")

    return genres_df

In [11]:
# Crawl the novel IDs for the default "webnovel" level (this performs the
# network requests and writes data/genres_df.pickle), then preview the result.
genres_df = get_url()
genres_df.head()

Unnamed: 0,ID,level,genre
0,466391,webnovel,101.0
1,398090,webnovel,101.0
2,514809,webnovel,101.0
3,505096,webnovel,101.0
4,523286,webnovel,101.0


In [12]:
# Number of de-duplicated (ID, level, genre) rows that were collected.
len(genres_df)

241

## novel main data crawling

In [6]:
def get_comment_count(ID, level, get_main=True, number=0):
    """Return the ``total_count`` reported by the comment-list endpoint.

    Parameters
    ----------
    ID : int or str
        Novel identifier.
    level : str
        League name; ``"webnovel"`` is sent as ticket ``"novel01"``, anything
        else as ``"novel02"``.
    get_main : bool, default True
        When true the object queried is ``novel-<ID>``; otherwise it is
        ``<ID>-<number>``.
    number : int, default 0
        Second part of the object id, only used when ``get_main`` is false.

    Returns
    -------
    The value stored under ``'total_count'`` in the decoded JSON response.
    """
    ticket = "novel01" if level == "webnovel" else "novel02"

    if get_main:
        target = "novel-{ID}".format(ID=ID)
    else:
        target = "{ID}-{number}".format(ID=ID, number=number)

    # The Referer is built from the remapped ticket value, not the league name.
    referer = {"Referer": "http://novel.naver.com/{level}/list.nhn?".format(level=ticket)}
    form = {
        "ticket": ticket,
        "object_id": target,
        "_ts": "1469461095606",
        "page_size": "10",
        "page_no": "1",
        "sort": "newest",
    }

    reply = requests.post(
        "http://novel.naver.com/comments/list_comment.nhn",
        headers=referer,
        data=form,
    )

    # The body may contain \' sequences, which are not valid JSON escapes;
    # turn them into plain apostrophes before decoding.
    payload = json.loads(reply.text.replace("\\'", "\'"))
    return payload['total_count']

In [7]:
def get_novel_data(ID, level):
    """Scrape the headline statistics of one novel from its episode-list page.

    Parameters
    ----------
    ID : int or str
        Novel identifier, sent as ``novelId``.
    level : str
        League name used as the first URL path segment.

    Returns
    -------
    tuple
        ``(ID, level, main_score, concern_count, episodes_count,
        comments_count)``.
    """
    page = requests.get(
        "http://novel.naver.com/{level}/list.nhn?novelId={ID}".format(level=level, ID=ID)
    )
    soup = BeautifulSoup(page.content, "lxml")

    grade_box = soup.select_one("p.grade_area")
    rating = float(grade_box.select_one("em").text)

    # Thousands separators are removed before the integer conversion.
    followers = int(soup.select_one("span#concernCount").text.replace(",", ""))

    # The first and last characters of the text are dropped before converting.
    total_text = soup.select_one("span.total").text
    n_episodes = int(total_text[1:-1])

    n_comments = get_comment_count(ID, level)

    return ID, level, rating, followers, n_episodes, n_comments

In [8]:
def make_novel_df(df):
    """Crawl the statistics of every novel in ``df`` and pickle the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``ID`` and ``level``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct result of ``get_novel_data``; also written to
        ``data/novel_df.pickle``. Rows removed as duplicates leave gaps in the
        index.
    """
    fields = ["ID", "level", "main_score", "concern_count", "episodes_count","comments_count"]

    rows = [
        get_novel_data(novel_id, league)
        for novel_id, league in zip(df["ID"], df["level"])
    ]

    novel_df = pd.DataFrame(rows, columns=fields).drop_duplicates()
    novel_df.to_pickle("data/novel_df.pickle")

    return novel_df
        

In [9]:
# Reload the ID table pickled by get_url and crawl the per-novel statistics.
# The returned frame is only displayed here, not bound to a name; it is also
# written to data/novel_df.pickle by make_novel_df.
make_novel_df(pd.read_pickle("data/genres_df.pickle"))

Unnamed: 0,ID,level,main_score,concern_count,episodes_count,comments_count
0,466391,webnovel,9.98,86081,95,179
1,398090,webnovel,9.98,71697,138,188
2,514809,webnovel,9.96,43100,51,125
3,505096,webnovel,9.95,48309,59,76
4,523286,webnovel,9.97,27361,43,29
5,552533,webnovel,9.92,15943,7,27
6,514807,webnovel,9.96,36135,51,56
7,466374,webnovel,9.86,34690,95,92
8,483047,webnovel,9.98,33896,77,29
9,514808,webnovel,9.92,31621,51,33


In [None]:
# Browser-like request headers.
# NOTE(review): none of the visible cells passes this dict to `requests`; the
# crawler functions build their own local `headers` -- confirm whether this
# cell is still needed.
headers = {
    "Host": "novel.naver.com",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Referer": "http://novel.naver.com/webnovel/list.nhn?",
    # NOTE(review): hardcoded browser cookie string committed with the
    # notebook -- consider loading it from the environment or removing it
    # before sharing.
    "Cookie": "npic=vsOmTHjBylVsPFYdS/B3FQYwhSmmJPp9HdB2FyBtDQRragD0YuRt6IWISVtFnvkECA==; NNB=IKLUINWDZ2IVO; nx_ssl=2; novel_eo=N; novel_rp=466391_94_0; nid_iplevel=1; page_uid=SHhPAwpBhGGssQx9tsosssssss0-343165",
    "Connection": "keep-alive"
}

## novel episode crawling

In [None]:
def _fetch_episode_dom(novel, ID, volume_no):
    """Download one episode page and return ``(dom, score)``.

    Raises whatever the request, the parser or the score extraction raise
    (for example ``AttributeError`` when the score element is missing).
    """
    episode_url = "http://novel.naver.com/{novel}/detail.nhn?novelId={ID}&volumeNo={episode}".format(
        novel=novel, ID=ID, episode=volume_no
    )
    episode_response = requests.get(episode_url)
    dom = BeautifulSoup(episode_response.content, "lxml")
    score = float(dom.select_one("em#currentStarScore").text)
    return dom, score


def get_novel_episode(ID, episodes_count, novel="webnovel"):
    """Crawl score, score count, comment count and text of a novel's episodes.

    Parameters
    ----------
    ID : int or str
        Novel identifier, sent as ``novelId``.
    episodes_count : int
        Number of episodes to collect.
    novel : str, default "webnovel"
        League name used as the first URL path segment and forwarded to
        ``get_comment_count`` as its ``level``.

    Returns
    -------
    pandas.DataFrame
        One row per episode with columns ``ID``, ``episode``, ``is_first``,
        ``score``, ``score_count``, ``episode_comments_count`` and ``text``.
        Empty (with the same columns) when ``episodes_count`` is 0.

    Raises
    ------
    Exception
        If an episode page cannot be fetched or parsed and the single retry
        with the next volume number fails as well.
    """
    columns = ["ID", "episode", "is_first", "score", "score_count", "episode_comments_count", "text"]
    # Rows are collected in a list and converted once at the end instead of
    # enlarging a DataFrame that holds the full episode texts row by row.
    records = []
    # Running offset added to the requested volumeNo; it grows by one each
    # time a page could not be read.
    err_number = 0

    for episode in range(1, episodes_count + 1):
        is_first = 1 if episode == 1 else 0

        try:
            dom, score = _fetch_episode_dom(novel, ID, episode + err_number)
        except Exception:
            # `Exception` rather than a bare `except`, so KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            print(ID, episode, "error")
            err_number += 1
            # Single retry with the next volume number; a second failure
            # propagates to the caller.
            dom, score = _fetch_episode_dom(novel, ID, episode + err_number)
        else:
            # Progress output every 30 episodes and on the last one.
            if episode % 30 == 0 or episode == episodes_count:
                print(ID, episode)

        # Thousands separators are removed and the trailing character is
        # dropped before the integer conversion.
        score_count = int(dom.select_one("span#currentStarScoreCount").text.replace(",", "")[:-1])
        # NOTE(review): the comment lookup uses `episode`, not the volume
        # number actually fetched (`episode + err_number`); confirm which of
        # the two the comment endpoint expects.
        episode_comments_count = get_comment_count(ID, novel, False, episode)
        text = dom.select_one("div.detail_view_content").text.replace("\r\n", "")

        records.append((ID, episode, is_first, score, score_count, episode_comments_count, text))

    return pd.DataFrame(records, columns=columns)

In [None]:
def make_episode_df(df):
    """Crawl every episode of every novel in ``df`` and pickle the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``ID`` and ``episodes_count``. When a ``level`` column is
        present it is forwarded to ``get_novel_episode`` so each novel is
        fetched from its own league; without it every novel is treated as
        ``"webnovel"``, as before.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-novel episode frames with a fresh 0..n-1
        index; also written to ``data/episode_df.pickle``.

    Raises
    ------
    ValueError
        From ``pandas.concat`` when ``df`` has no rows.
    """
    if "level" in df:
        levels = df["level"]
    else:
        levels = ["webnovel"] * len(df)

    data = [
        get_novel_episode(ID, episodes_count, level)
        for ID, episodes_count, level in zip(df["ID"], df["episodes_count"], levels)
    ]
    episode_df = pd.concat(data).reset_index(drop=True)
    episode_df.to_pickle("data/episode_df.pickle")

    return episode_df

In [None]:
# Load the per-novel statistics that make_novel_df wrote to disk.
novel_df = pd.read_pickle("data/novel_df.pickle")

In [None]:
# Preview the first rows of the per-novel statistics.
novel_df.head()

## novel_time_crawling

In [None]:
def get_episode_time(ID, level, episodes_count):
    """Collect the dates shown on a novel's episode-list pages.

    Parameters
    ----------
    ID : int or str
        Novel identifier, sent as ``novelId``.
    level : str
        League name used as the first URL path segment.
    episodes_count : int
        Number of episodes of the novel; determines how many list pages are
        requested.

    Returns
    -------
    list
        The values returned by ``parse`` for every ``span.date`` element, in
        page order. Empty when ``episodes_count`` is 0.
    """
    # The original page count assumed 10 episodes per list page.
    # NOTE(review): confirm the page size against the site.
    episodes_per_page = 10
    # Round up so that a final, partially filled page is visited as well
    # (7 episodes -> 1 page, 95 episodes -> 10 pages).
    pages = (episodes_count + episodes_per_page - 1) // episodes_per_page
    time_list = []

    for page in range(1, pages + 1):
        url = "http://novel.naver.com/{level}/list.nhn?novelId={ID}&page={page}".format(level=level, ID=ID, page=page)
        response = requests.get(url)
        dom = BeautifulSoup(response.content, "lxml")
        # NOTE(review): "NE=a:lst" contains "=" and ":" unescaped; verify that
        # the installed BeautifulSoup accepts this selector.
        times = dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.date")
        time_list.extend(parse(x.text) for x in times)

    return time_list

In [None]:
def make_time_df(df):
    """Gather the episode dates of every novel in ``df`` into one frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``ID``, ``level`` and ``episodes_count``.

    Returns
    -------
    pandas.DataFrame
        A single ``time`` column holding all per-novel date lists joined end
        to end; also written to ``data/time_df.pickle``. The frame does not
        record which novel a date belongs to.
    """
    per_novel_times = [
        get_episode_time(novel_id, league, count)
        for novel_id, league, count in zip(df["ID"], df["level"], df["episodes_count"])
    ]

    # Flatten the list of lists into one 1-D array.
    flat_times = np.hstack(per_novel_times)

    time_df = pd.DataFrame(flat_times, columns=["time"])
    time_df.to_pickle("data/time_df.pickle")

    return time_df

In [None]:
# Crawl the episode dates for every novel in novel_df (network-bound); the
# result is also written to data/time_df.pickle by make_time_df.
time_df = make_time_df(novel_df)

## novel comment crawling

In [None]:
# novel document review crawling
def get_novel_main_comments(ID, level, episodes_count, get_main=True, number=0):
    """Fetch the comment-list pages of a novel (or of one of its episodes).

    Parameters
    ----------
    ID : int or str
        Novel identifier.
    level : str
        League name; ``"webnovel"`` is sent as ticket ``"novel01"``, anything
        else as ``"novel02"``.
    episodes_count : int
        Used to derive the number of pages requested:
        ``episodes_count // 100 + 1``.
        NOTE(review): the page count is derived from the episode count, not
        from the number of comments -- confirm this is intended.
    get_main : bool, default True
        When true the object queried is ``novel-<ID>``; otherwise it is
        ``<ID>-<number>``.
    number : int, default 0
        Second part of the object id, only used when ``get_main`` is false.

    Returns
    -------
    list
        The decoded JSON payload of every requested page, in page order.
    """
    if level == "webnovel":
        level = "novel01"
    else:
        level = "novel02"

    if get_main:
        object_id = "novel-{ID}".format(ID=ID)
    else:
        object_id = "{ID}-{number}".format(ID=ID, number=number)

    headers = {"Referer": "http://novel.naver.com/{level}/list.nhn?".format(level=level)}
    pages = (episodes_count // 100) + 1
    payloads = []

    # Pages are requested starting at 1, matching the "page_no": "1" that
    # get_comment_count sends for the first page.
    for page in range(1, pages + 1):
        data = {
            "ticket": level,
            "object_id": object_id,
            "_ts": "1469461095606",
            "page_size": "100",
            "page_no": page,
            "sort": "newest"
        }

        comment_response = requests.post(
            "http://novel.naver.com/comments/list_comment.nhn",
            headers=headers,
            data=data
        )
        # The body may contain \' sequences, which are not valid JSON escapes;
        # turn them into plain apostrophes before decoding.
        payloads.append(json.loads(comment_response.text.replace("\\'", "\'")))

    return payloads

In [None]:
# Decode a comment response after turning \' sequences into plain apostrophes.
# NOTE(review): in the visible cells `comment_response` is only ever assigned
# inside functions, so this cell raises NameError on a fresh kernel unless the
# name is defined elsewhere.
json.loads(comment_response.text.replace("\\'", "\'"))

In [None]:
# Scratch cell: fetch and parse one episode detail page (novel 466391,
# volume 3) so its markup can be inspected through `dom`.
episode_url = "http://novel.naver.com/webnovel/detail.nhn?novelId=466391&volumeNo=3"
episode_response = requests.get(episode_url)
dom = BeautifulSoup(episode_response.content, "lxml")

In [None]:
# Scratch cell: fetch and parse the first episode-list page of novel 466391.
# This rebinds `episode_url`, `episode_response` and `dom` from the previous
# cell, although the URL here is a list page rather than an episode page.
episode_url = "http://novel.naver.com/webnovel/list.nhn?novelId=466391&page=1"
episode_response = requests.get(episode_url)
dom = BeautifulSoup(episode_response.content, "lxml")

In [None]:
# Scratch cell: try the selector that get_episode_time uses on the list page
# parsed in the previous cell.
# NOTE(review): "NE=a:lst" contains "=" and ":" unescaped; verify that the
# installed BeautifulSoup accepts this selector.
dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.date")