In [33]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
from dateutil.parser import parse 

### Novel Class
* 101 로맨스
* 102 SF & 판타지
* 103 무협
* 104 미스터리
* 105 역사&전쟁 (베스트&챌린지)
* 106 라이트노벨
* 107 팬픽 (챌린지)
* 108 퓨전


* webnovel 오늘의 웹소설
* best 베스트리그
* challenge 챌린지리그


### novel ID crawling

In [15]:
def get_novel_ID(genre, level):
    """Collect the ID of every novel listed under one genre of one league.

    For each listing variant the genre index is first requested with an
    out-of-range page number (1000); the number inside the ``<strong>``
    element of ``div.paging`` in that response is used as the last page
    (1 when there is no paging element).  Pages 1..last are then fetched
    and the novel ID is cut out of the ``href`` of every ``a.list_item``.

    Args:
        genre: Genre code placed in the ``genre`` query parameter
            (e.g. 101).
        level: League segment of the URL: "webnovel", "best" or
            "challenge".  For "webnovel" the listing with
            ``&order=Read&finish=true`` is crawled as well.

    Returns:
        pandas.DataFrame with columns ``ID`` (str), ``level`` and ``genre``,
        one row per listed link.  Duplicates are not removed here.
    """
    url_template = "http://novel.naver.com/{0}/genre.nhn?genre={1}{2}&page={3}"

    listing_suffixes = [""]
    if level == "webnovel":
        listing_suffixes.append("&order=Read&finish=true")

    # Rows are gathered in a plain list and turned into a DataFrame once;
    # growing a DataFrame row by row copies the data on every insert.
    rows = []
    for fin in listing_suffixes:
        response = requests.get(url_template.format(level, genre, fin, 1000))
        dom = BeautifulSoup(response.content, "lxml")
        paging = dom.select_one("div.paging")
        end_page = int(paging.select_one("strong").contents[0]) if paging else 1

        for page in range(1, end_page + 1):
            response = requests.get(url_template.format(level, genre, fin, page))
            dom = BeautifulSoup(response.content, "lxml")
            for item in dom.select("a.list_item"):
                # The ID is whatever follows the first "=" in the link target.
                rows.append((item["href"].split("=")[1], level, genre))

    return pd.DataFrame(rows, columns=["ID", "level", "genre"])

def make_genre_df(level):
    """Crawl every genre of one league and save the combined ID table.

    Genres 101, 102, 103, 104, 106 and 108 are always crawled; "best" adds
    105 and "challenge" adds 105 and 107.  The per-genre tables returned by
    ``get_novel_ID`` are concatenated, exact duplicate rows are dropped and
    the result is written to ``data/genres_df.csv``.

    Args:
        level: League name passed through to ``get_novel_ID``.

    Returns:
        pandas.DataFrame with columns ``ID``, ``level`` and ``genre``.
    """
    # exist_ok avoids the separate listdir check and tolerates reruns.
    os.makedirs("data", exist_ok=True)

    genres = [101, 102, 103, 104, 106, 108]
    if level == "best":
        genres.append(105)
    elif level == "challenge":
        genres.extend([105, 107])

    genres_data = [get_novel_ID(genre, level) for genre in genres]
    genres_df = pd.concat(genres_data).reset_index(drop=True)
    genres_df.drop_duplicates(inplace=True)
    genres_df.to_csv("data/genres_df.csv")

    return genres_df

## novel main data crawling

In [16]:
def get_comment_count(ID, level, get_main=True, number=0):
    """Return the ``total_count`` reported by the comment-list endpoint.

    Args:
        ID: Novel ID used to build the comment ``object_id``.
        level: League name; "webnovel" selects ticket "novel01", anything
            else selects "novel02".
        get_main: When true the ``object_id`` is ``novel-{ID}``; otherwise
            it is ``{ID}-{number}``.
        number: Number appended to the ``object_id`` when ``get_main`` is
            false.

    Returns:
        The ``total_count`` value of the JSON reply.
    """
    ticket = "novel01" if level == "webnovel" else "novel02"

    if get_main:
        object_id = "novel-{ID}".format(ID=ID)
    else:
        object_id = "{ID}-{number}".format(ID=ID, number=number)

    payload = {
        "ticket": ticket,
        "object_id": object_id,
        "_ts": "1469461095606",
        "page_size": "10",
        "page_no": "1",
        "sort": "newest"
    }
    # The Referer is built from the ticket value, not the league name.
    referer = "http://novel.naver.com/{level}/list.nhn?".format(level=ticket)

    reply = requests.post(
        "http://novel.naver.com/comments/list_comment.nhn",
        headers={"Referer": referer},
        data=payload
    )
    # Backslash-escaped single quotes are unescaped before JSON decoding.
    body = reply.text.replace("\\'", "\'")

    return json.loads(body)['total_count']

def get_novel_data(ID, level):
    """Scrape the headline figures from a novel's list page.

    Args:
        ID: Novel ID placed in the ``novelId`` query parameter.
        level: League segment of the URL; also forwarded to
            ``get_comment_count``.

    Returns:
        Tuple ``(ID, main_score, concern_count, episodes_count,
        comments_count)``.
    """
    page = requests.get(
        "http://novel.naver.com/{level}/list.nhn?novelId={ID}".format(level=level, ID=ID)
    )
    soup = BeautifulSoup(page.content, "lxml")

    # Score: text of the <em> inside p.grade_area.
    grade_em = soup.select_one("p.grade_area").select_one("em")
    main_score = float(grade_em.text)

    # Concern count: thousands separators removed before conversion.
    concern_text = soup.select_one("span#concernCount").text
    concern_count = int(concern_text.replace(",", ""))

    # Episode total: first and last characters of the text are dropped.
    total_text = soup.select_one("span.total").text
    episodes_count = int(total_text[1:-1])

    comments_count = get_comment_count(ID, level)

    return ID, main_score, concern_count, episodes_count, comments_count

def make_novel_df(df):
    """Fetch per-novel figures for every row of ``df`` and save them.

    Args:
        df: Frame with ``ID`` and ``level`` columns.

    Returns:
        pandas.DataFrame with columns ``ID``, ``main_score``,
        ``concern_count``, ``episodes_count`` and ``comments_count``, with
        duplicate rows removed.  Also written to ``data/novel_df.csv``.
    """
    records = [
        get_novel_data(novel_id, novel_level)
        for novel_id, novel_level in zip(df["ID"], df["level"])
    ]
    novel_df = pd.DataFrame(
        records,
        columns=["ID", "main_score", "concern_count", "episodes_count", "comments_count"],
    ).drop_duplicates()
    novel_df.to_csv("data/novel_df.csv")

    return novel_df

def make_main_df(level):
    """Build, save and return the merged genre/novel table for one league.

    The genre table from ``make_genre_df`` is joined on ``ID`` with the
    figures from ``make_novel_df`` and written to
    ``data/main_{level}_df.csv``.

    Args:
        level: League name, also used in the output file name.

    Returns:
        The merged pandas.DataFrame.
    """
    genre_table = make_genre_df(level)
    novel_table = make_novel_df(genre_table)

    merged = genre_table.merge(novel_table, on="ID")
    out_path = "data/main_{level}_df.csv".format(level=level)
    merged.to_csv(out_path)

    return merged

In [19]:
# Build and save the merged genre/novel table for the "best" league.
main_df = make_main_df("best")

# novel episode crawling

In [25]:
def _fetch_episode_dom(ID, level, volume_no):
    """Download one episode page and return it parsed with BeautifulSoup."""
    episode_url = "http://novel.naver.com/{level}/detail.nhn?novelId={ID}&volumeNo={episode}".format(
        level=level,
        ID=ID,
        episode=volume_no
    )
    episode_response = requests.get(episode_url)
    return BeautifulSoup(episode_response.content, "lxml")


def get_novel_episode(ID, episodes_count, level="webnovel"):
    """Crawl every episode of one novel, caching the result as a pickle.

    If ``data/episodes/{ID}-{level}.pickle`` already exists it is loaded
    and returned without any network access.  Otherwise episodes
    1..``episodes_count`` are fetched one by one.  When an episode page
    cannot be fetched or its score cannot be parsed, the volume number is
    shifted by one (for this and all later episodes) and the fetch is
    retried once; a second failure propagates.

    Args:
        ID: Novel ID.
        episodes_count: Number of episodes to collect.
        level: League segment of the URL; also part of the cache file name.

    Returns:
        pandas.DataFrame with columns ``ID``, ``episode``, ``is_first``,
        ``score``, ``score_count``, ``episode_comments_count`` and ``text``.
    """
    columns = ["ID", "episode", "is_first", "score", "score_count", "episode_comments_count", "text"]

    file_name = "{ID}-{level}.pickle".format(ID=ID, level=level)
    cache_path = "data/episodes/" + file_name
    if os.path.isfile(cache_path):
        # The cache is a pickle written by this function; only load files
        # from a trusted data directory.
        return pd.read_pickle(cache_path)

    rows = []
    err_number = 0  # cumulative shift applied to the requested volumeNo

    for episode in range(1, episodes_count + 1):

        is_first = 1 if episode == 1 else 0

        try:
            dom = _fetch_episode_dom(ID, level, episode + err_number)
            score = float(dom.select_one("em#currentStarScore").text)
        except Exception:
            # Best-effort recovery as before, but without swallowing
            # KeyboardInterrupt/SystemExit the way a bare except does.
            print(ID, episode, "error")
            err_number += 1
            dom = _fetch_episode_dom(ID, level, episode + err_number)
            score = float(dom.select_one("em#currentStarScore").text)
        else:
            if episode == episodes_count:
                print(ID, episode)

        # The trailing character of the count text is dropped before parsing.
        score_count = int(dom.select_one("span#currentStarScoreCount").text.replace(",", "")[:-1])
        # NOTE(review): the comment lookup uses the running episode index,
        # not the shifted volumeNo that was actually fetched - confirm
        # which one the comment object_id expects.
        episode_comments_count = get_comment_count(ID, level, False, episode)
        text = dom.select_one("div.detail_view_content").text.replace("\r\n", "")

        rows.append((ID, episode, is_first, score, score_count, episode_comments_count, text))

    df = pd.DataFrame(rows, columns=columns)
    os.makedirs("data/episodes", exist_ok=True)
    df.to_pickle(cache_path)
    return df

In [21]:
def make_episode_df(df):
    """Crawl the episodes of every novel in ``df`` and save one table.

    Args:
        df: Frame with ``ID`` and ``episodes_count`` columns.  When a
            ``level`` column is present, each row's league is forwarded to
            ``get_novel_episode``; otherwise that function's default league
            is used.

    Returns:
        pandas.DataFrame of all episodes with a fresh 0..n-1 index, also
        written to ``data/episode_df.csv``.  An empty ``df`` yields an
        empty frame with the episode columns.
    """
    os.makedirs("data/episodes", exist_ok=True)

    if "level" in df.columns:
        # Forward each novel's own league instead of silently crawling
        # everything as "webnovel".
        data = [
            get_novel_episode(novel_id, count, level)
            for novel_id, count, level in zip(df["ID"], df["episodes_count"], df["level"])
        ]
    else:
        data = [
            get_novel_episode(novel_id, count)
            for novel_id, count in zip(df["ID"], df["episodes_count"])
        ]

    if data:
        episode_df = pd.concat(data)
    else:
        # pd.concat raises on an empty list; return an empty table instead.
        episode_df = pd.DataFrame(
            columns=["ID", "episode", "is_first", "score", "score_count", "episode_comments_count", "text"]
        )
    episode_df.reset_index(drop=True, inplace=True)
    episode_df.to_csv("data/episode_df.csv")

    return episode_df

In [28]:
# Reload the episode table from the CSV path make_episode_df writes to;
# index_col=0 uses the first CSV column as the index.
episode_df = pd.read_csv("data/episode_df.csv", index_col=0)

## novel time crawling

In [None]:
def get_episode_time(ID, level, episodes_count):
    """Collect the dates shown on a novel's episode-list pages.

    Args:
        ID: Novel ID placed in the ``novelId`` query parameter.
        level: League segment of the URL.
        episodes_count: Total number of episodes; determines how many list
            pages are requested.

    Returns:
        List of ``datetime`` objects parsed from every ``span.date`` found,
        in page order.  Empty when ``episodes_count`` is 0.
    """
    # Ceiling division: a final, partially filled page must be visited too
    # (plain ``// 10`` skipped it, and skipped everything for < 10 episodes).
    # Assumes 10 episodes per list page, as the original divisor did.
    pages = (episodes_count + 9) // 10
    time_list = []

    for page in range(1, pages + 1):
        url = "http://novel.naver.com/{level}/list.nhn?novelId={ID}&page={page}".format(level=level, ID=ID, page=page)
        response = requests.get(url)
        dom = BeautifulSoup(response.content, "lxml")
        # NOTE(review): the class token "NE=a:lst" contains "=" and ":";
        # a strict CSS selector parser may reject it unless escaped - verify
        # against the installed BeautifulSoup version.
        times = dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.date")
        time_list.extend(parse(stamp.text) for stamp in times)

    return time_list

In [None]:
def make_time_df(df):
    """Gather the episode dates of every novel in ``df`` into one column.

    Args:
        df: Frame with ``ID``, ``level`` and ``episodes_count`` columns.

    Returns:
        pandas.DataFrame with a single ``time`` column holding the
        per-novel date lists stacked end to end.  Also pickled to
        ``data/time_df.pickle``.
    """
    per_novel_times = list(
        map(get_episode_time, df["ID"], df["level"], df["episodes_count"])
    )
    # Flatten the list of per-novel lists into one 1-D array.
    stacked = np.hstack(per_novel_times)

    time_df = pd.DataFrame(stacked, columns=["time"])
    time_df.to_pickle("data/time_df.pickle")

    return time_df

## novel comment crawling

In [63]:
def get_novel_comments(ID, comments_count, get_main, number=0, level="webnovel"):
    """Download all comments of a novel or of one episode, with caching.

    The result is cached in ``data/comment/{ID}-{number}.pickle``; when
    that file exists it is loaded and returned without network access.
    The cache key holds only ``ID`` and ``number`` (not ``level``).

    Args:
        ID: Novel ID used to build the comment ``object_id``.
        comments_count: Expected number of comments; ``comments_count //
            100 + 1`` pages of 100 are requested.
        get_main: When true the ``object_id`` is ``novel-{ID}``; otherwise
            it is ``{ID}-{number}``.
        number: Number appended to the ``object_id`` when ``get_main`` is
            false; always part of the cache file name.
        level: League name; "webnovel" selects ticket "novel01", anything
            else "novel02".  Also used in the Referer header.

    Returns:
        pandas.DataFrame built from the concatenated ``comment_list``
        entries of every page.
    """
    os.makedirs("data/comment", exist_ok=True)

    # The file name must actually be formatted: comparing the raw
    # "{ID}-{number}.pickle" template never matched, so the cache was
    # written on every call but never read back.
    cache_path = "data/comment/{ID}-{number}.pickle".format(ID=ID, number=number)
    if os.path.isfile(cache_path):
        # Pickles are only safe to load from a trusted data directory.
        return pd.read_pickle(cache_path)

    novel = "novel01" if level == "webnovel" else "novel02"

    headers = {"Referer": "http://novel.naver.com/{level}/list.nhn?".format(level=level)}

    if get_main:
        object_id = "novel-{ID}".format(ID=ID)
    else:
        object_id = "{ID}-{number}".format(ID=ID, number=number)

    comment_list = []

    pages = (comments_count // 100) + 1
    for page in range(1, pages + 1):
        data = {
            "ticket": novel,
            "object_id": object_id,
            "_ts": "1469461095606",
            "page_size": "100",
            "page_no": page,
            "sort": "newest"
        }

        comment_response = requests.post(
            "http://novel.naver.com/comments/list_comment.nhn",
            headers=headers,
            data=data
        )

        # Backslash-escaped single quotes are unescaped before JSON decoding.
        comment_list += json.loads(comment_response.text.replace("\\'", "\'"))["comment_list"]

    df = pd.DataFrame(comment_list)
    df.to_pickle(cache_path)

    return df

In [64]:
def make_main_novel_comment(df, level):
    """Download the novel-level comments for every row of ``df``.

    Args:
        df: Frame with ``ID`` and ``comments_count`` columns.
        level: League name forwarded to ``get_novel_comments`` and used in
            the output file name.

    Returns:
        pandas.DataFrame of all comments with a fresh 0..n-1 index, also
        written to ``data/{level}_main_comment.csv``.  An empty ``df``
        yields an empty frame.
    """
    # Iterate over the column values themselves: df["ID"][i] is a label
    # lookup and fails whenever the index is not exactly 0..n-1.
    data_list = [
        get_novel_comments(
            int(novel_id),
            int(comments_count),
            True,
            level=level
        )
        for novel_id, comments_count in zip(df["ID"], df["comments_count"])
    ]

    if data_list:
        comment_df = pd.concat(data_list).reset_index(drop=True)
    else:
        # pd.concat raises on an empty list; return an empty table instead.
        comment_df = pd.DataFrame()
    comment_df.to_csv("data/{level}_main_comment.csv".format(level=level))

    return comment_df

In [65]:
def make_episode_novel_comment(df, level):
    """Download the episode-level comments for every row of ``df``.

    Args:
        df: Frame with ``ID``, ``episode_comments_count`` and ``episode``
            columns, one row per episode.
        level: League name forwarded to ``get_novel_comments`` and used in
            the output file name.

    Returns:
        pandas.DataFrame of all comments with a fresh 0..n-1 index, also
        written to ``data/{level}_episode_comment.csv``.  An empty ``df``
        yields an empty frame.
    """
    # Iterate over the column values themselves: df["ID"][i] is a label
    # lookup and fails whenever the index is not exactly 0..n-1.
    data_list = [
        get_novel_comments(
            int(novel_id),
            int(comments_count),
            False,
            int(episode),
            level=level
        )
        for novel_id, comments_count, episode in zip(
            df["ID"], df["episode_comments_count"], df["episode"]
        )
    ]

    if data_list:
        comment_df = pd.concat(data_list).reset_index(drop=True)
    else:
        # pd.concat raises on an empty list; return an empty table instead.
        comment_df = pd.DataFrame()
    comment_df.to_csv("data/{level}_episode_comment.csv".format(level=level))

    return comment_df

In [69]:
# Download the novel-level comments for every row of main_df ("best" league).
make_main_novel_comment(main_df, "best")

Unnamed: 0,comment_no,contents,deleted_yn,down_count,enc_writer_id,enc_writer_profile_user_id,group_no,is_facebook,is_me2day,is_mine,...,reply_level,status,ticket,up_count,visible_yn,writer_id,writer_ip,writer_nickname,writer_profile_type,writer_profile_user_id
0,7147238,대박\n잼나여~,N,0,FwMQcXwi21oGBVg3bl%2FRyQ%3D%3D,Be9UU86D7RhoZEJ73jzZ%2BA%3D%3D,-7147238,false,false,N,...,1,0,novel02,1,Y,kyjb****,112.154.xxx.85,꿀물,naver,kyjb****
1,7176588,꿀잼ㅋㅋㅋㅋㅋㅋ,N,0,7AC2orEwkkiL72EVQhjTng%3D%3D,rExCQOrvl7wfelvHn8VFQg%3D%3D,-7176588,false,false,N,...,1,0,novel02,3,Y,jhc0****,211.244.xxx.30,Mesut_ozil,naver,jhc0****
2,7172859,재밌어요!!!,N,0,hvvMB1%2BcxQG3TQcNnH45Tw%3D%3D,RjaYfZzLiMDsMdfFDEHOyw%3D%3D,-7172859,false,false,N,...,1,0,novel02,3,Y,ymn2****,221.153.xxx.199,인니피트1025,naver,ymn2****
3,7144208,전개가 신선하고 등장인물들간에 케미가 돋보이네요ㅋㅋ꾸르잼,N,1,QYDX2iiqhR2z7JU8NlK9cA%3D%3D,7eMnbF6%2FZ4MsMZBuBvoQXw%3D%3D,-7144208,false,false,N,...,1,0,novel02,4,Y,levi****,119.214.xxx.36,,naver,levi****
4,7138845,초능력을 이용한 러브라인.. 얘기가 어떻게 전개 될 지 기대되요!!ㅋㅋㅋ,N,0,YwEpmD%2BkMXfziGLL4G3HeA%3D%3D,JkD1eZI2KA5QCJ41jU%2FbUA%3D%3D,-7138845,false,false,N,...,1,0,novel02,2,Y,xhtl****,223.62.xxx.246,토실토실,naver,xhtl****
5,7121989,챌린지 리그 때부터 재밌게 보고 있어요!,N,0,kvJHFLeO3kqRUd3Ep6BLHQ%3D%3D,46O1l4l5pbQuRmvYigwJuw%3D%3D,-7121989,false,false,N,...,1,0,novel02,3,Y,psal****,218.152.xxx.188,박하영,naver,psal****
6,7121941,초능력이라 ㅋㅋㅋㅋㅋㅋㅋ 기존에 나와있는것과는 다른세계관일것인가의 기대감이 있네요 ㄹㄹ,N,1,%2Fm30qrK47HwyKh80nAe5eA%3D%3D,rOCfZmaPyTV%2FJvg9Map5GQ%3D%3D,-7121941,false,false,N,...,1,0,novel02,4,Y,vdcf****,223.62.xxx.23,강신찬,naver,vdcf****
7,7167421,베스트리그로 승격으 되셨네요. 먼저 축하드립니다. 항강 재미있게 보고 있었는데~나이...,N,0,%2FUBOoC0s6H6BSG6jLnxxUQ%3D%3D,rV5fTkIx%2BBbQORd0YWmvqg%3D%3D,-7167421,false,false,N,...,1,0,novel02,1,Y,asia****,115.137.xxx.200,하리마오,naver,asia****
8,7166312,작가님 항상 감사합니다.,N,0,9NjFpE6HfFphPh2DqvQzuw%3D%3D,mkLwzCwazSaClnX6qdVnGg%3D%3D,-7166312,false,false,N,...,1,0,novel02,2,Y,ping****,58.232.xxx.193,,naver,ping****
9,7167708,여긴 아무도 없네여??,N,0,qwUz%2F5gzXqci30P4ddc40g%3D%3D,PQ1sunBAeMEWJqFqPl6QQg%3D%3D,-7167708,false,false,N,...,1,0,novel02,2,Y,joga****,124.60.xxx.78,샷치,naver,joga****
