In [5]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
from dateutil.parser import parse 

### Novel Class
* 101 로맨스
* 102 SF & 판타지
* 103 무협
* 104 미스터리
* 105 역사&전쟁 (베스트&첼린지)
* 106 라이트노벨
* 107 팬픽 (첼린지)
* 108 퓨전


* webnovel 오늘의 웹소설
* best 베스트리그
* challenge 첼린지리그


### 전체 소설 ID 크롤링 (종류, 장르, ID, 작가이름, 총 화수)

In [109]:
def get_novel_ID(genre, level):
    
    df = pd.DataFrame(columns=["level", "genre", "ID", "name", "episode_total"])

    number = 1
    if level == "webnovel":
        number = 2
        
    for fin in ["", "&order=Read&finish=true"][0:number]:
        url = "http://novel.naver.com/{0}/genre.nhn?genre={1}{2}&page=1000".format(level, genre, fin)
        response = requests.get(url)
        dom = BeautifulSoup(response.content, "lxml")
        end_page = dom.select_one("div.paging")
        if end_page:
            end_page = int(end_page.select_one("strong").contents[0])
        else:
            end_page = 1

        for page in range(1, end_page+1):
            url = "http://novel.naver.com/{0}/genre.nhn?genre={1}{2}&page={3}".format(level, genre, fin, page)
            response = requests.get(url)
            dom = BeautifulSoup(response.content, "lxml")
            list_item = dom.select("a.list_item")

            for item in list_item:
                novel_ID = item["href"].split("=")[1]
                novel_name = item.select_one("span.ellipsis").text
                novel_episode_total = item.select_one("span.num_total").text.split(" ")[1][:-1]
                
                df.loc[len(df)] =  level, genre, novel_ID, novel_name, novel_episode_total

    return df

def make_genre_df(level):
    
    if "data" not in os.listdir("."):
        os.mkdir("data")
    if level not in os.listdir("data"):
        os.mkdir(os.path.join("data", level))
    
    genres = [101, 102, 103, 104, 106, 108]
    if level == "best":
        genres += [105]
    if level == "challenge":
        genres += [105, 107]
    
    genre_data = map(get_novel_ID, genres, [level] * len(genres))
    genre_df = pd.concat(genre_data).reset_index(drop=True)
    genre_df.drop_duplicates(inplace=True)
    
    like_url = "http://novel.naver.com/likeCountJson.nhn?contentsIds=" + (",").join(genre_df["ID"])
    genre_df["likeit"] = [
        i["likeItCount"]
        for i in requests.get(like_url).json()["result"]["contents"]
    ]
    
    genre_df["episode_total"] = genre_df["episode_total"].astype("int")
    genre_df["genre"] = genre_df["genre"].astype("int")
    genre_df.to_csv(os.path.join("data", level, "genre_df.csv"))
    
    return genre_df

In [110]:
make_genre_df("challenge")

ValueError: invalid literal for int() with base 10: '1,000'

## 소설 score 크롤링 (별점, 관심수, 댓글수)

In [None]:
def get_comment_count(ID, level, get_main=True, number=0):
    
    if level == "webnovel":
        level = "novel01"
    else:
        level = "novel02"
    
    headers = {"Referer": "http://novel.naver.com/{level}/list.nhn?".format(level=level)}
    if get_main:
        data = {
            "ticket": level,
            "object_id": "novel-{ID}".format(ID=ID),
            "_ts": "1469461095606",
            "page_size": "10",
            "page_no": "1",
            "sort": "newest"
        }
    else:
        data = {
            "ticket": level,
            "object_id": "{ID}-{number}".format(ID=ID, number=number),
            "_ts": "1469461095606",
            "page_size": "10",
            "page_no": "1",
            "sort": "newest"
        }
        
    comment_response = requests.post(
        "http://novel.naver.com/comments/list_comment.nhn", 
        headers=headers, 
        data=data
    )
    total_count = json.loads(comment_response.text.replace("\\'", "\'"))['total_count']
    
    return total_count

In [52]:
def get_novel_data(ID, level):  
    url = "http://novel.naver.com/{level}/list.nhn?novelId={ID}".format(level=level, ID=ID)
    response = requests.get(url)
    dom = BeautifulSoup(response.content, "lxml")
    
    main_score = float(dom.select_one("p.grade_area").select_one("em").text)
    concern_count = int(dom.select_one("span#concernCount").text.replace(",", ""))
    comments_count = get_comment_count(ID, level)
    
    return ID, main_score, concern_count, comments_count

def make_novel_df(df):
    
    data = list(map(get_novel_data, df["ID"], df["level"]))
    novel_df = pd.DataFrame(data, columns=["ID", "main_score", "concern_count", "comments_count"])
    novel_df.drop_duplicates(inplace=True)
    novel_df.reset_index(drop=True, inplace=True)
    novel_df.to_csv("data/{level}/novel_df.csv".format(level=df["level"][0]))
    
    return novel_df

### 소설 크롤링 merge data 생성

In [78]:
def make_main_df(level):

    if "genre_df.csv" in os.listdir(os.path.join("data", level)):
        try:
            genre_df = pd.read_csv(
                os.path.join("data", level, "genre_df.csv"), 
                index_col=0
            )
        except:
            genre_df = pd.read_csv(
                os.path.join("data", level, "genre_df.csv"), 
                index_col=0, 
                encoding="cp949"
            )
    else:
        genre_df = make_genre_df(level)

    if "novel_df.csv" in os.listdir(os.path.join("data", level)):
        try:
            novel_df = pd.read_csv(
                os.path.join("data", level, "novel_df.csv"), 
                index_col=0
            )
        except:
            novel_df = pd.read_csv(
                os.path.join("data", level, "novel_df.csv"), 
                index_col=0,
                encoding="cp949"
            )
    else:
        novel_df = make_novel_df(genre_df)
    
    main_df = genre_df.merge(novel_df, on="ID")
    main_df.to_csv(os.path.join("data", level, "main_df.csv"))
    
    return main_df

In [62]:
len(best_df)

1085

In [60]:
best_df = make_main_df("best")

Unnamed: 0,level,genre,ID,name,episode_total,main_score,concern_count,comments_count
0,best,101.0,547121,RIYA,129.0,9.97,8415,16
1,best,101.0,521183,느린꽃,73.0,10.0,7803,3
2,best,101.0,396141,이달아,71.0,9.97,4871,0
3,best,101.0,489762,이브나-,83.0,9.99,7372,7
4,best,101.0,377644,샨탈,1.0,9.97,2719,0


In [61]:
webnovel_df = make_main_df("webnovel")

In [None]:
len(webnovel_df)

### 소설 episode 크롤링 (제목, url_volume, 몇 화, 시간, 하트수, (조회수), 댓글 수, 별점, 별점수, 글)

In [112]:
def get_text(episode_url):
    episode_response = requests.get(episode_url)
    dom = BeautifulSoup(episode_response.content, "lxml")
    score_count = int(dom.select_one("span#currentStarScoreCount").text.replace(",","")[:-1])
    text = dom.select_one("div.detail_view_content").text.replace("\r\n", "")
    
    return [score_count, text]

In [125]:
130 // 10 

13

In [183]:
def get_novel_episode(ID, level, episode_total, reset=False):

    file_name = "{ID}-{level}.pickle".format(ID=ID, level=level)
    if (file_name in os.listdir("data/episode")) & (reset == False):
        return pd.read_pickle(os.path.join("data", "episode", file_name))
    
    pages = (episode_total // 10) + 2 if episode_total % 10 > 0 else (episode_total // 10) + 1

    titles = []
    volumes = []
    times = []
    scores = []
    hits = []
    
    print(ID)
    for page in range(1, pages):
        
        episode_url = "http://novel.naver.com/{level}/list.nhn?novelId={ID}&page={page}".format(
            level = level,
            ID = ID,
            page = page
        )
        
        episode_response = requests.get(episode_url)
        dom = BeautifulSoup(episode_response.content, "lxml")
        titles += [
            i.text
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("p.subj.v3")
            if i.text != "게시 보류중"
        ]
        volumes += [
            i["href"].split("=")[-1]
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("a.list_item.NPI=a:list")
            if i
        ]
        times += [
            i.text[:-1] if len(i.text) > 8 else "2016.07.30"
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.date")
        ]
        scores += [
            float(i.text)
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.score_area em")
        ]
        if level != "webnovel":
            hits += [
                int(i.text.replace(",", ""))
                for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.favorite em")
            ]

    like_url = "http://novel.naver.com/likeCountJson.nhn?contentsIds=" + str(ID) + "_" + ("," + str(ID) + "_").join(volumes)
    likeits = [
        i["likeItCount"]
        for i in requests.get(like_url).json()["result"]["contents"]
    ]

    comments_count = [
        get_comment_count(ID, level, False, i)
        for i in volumes
    ]

    episode_urls = [
        "http://novel.naver.com/{level}/detail.nhn?novelId={ID}&volumeNo={episode}".format(
            level = level, 
            ID = ID, 
            episode = i
        )
        for i in volumes
    ]

    score_count, text = zip(
        *[
            get_text(url)
            for url in episode_urls
        ]
    )
    
    episodes = [
        i
        for i in range(1, len(volumes)+1)
    ][::-1]

    if level != "webnovel":
        episode_df = pd.DataFrame(
            list(zip(titles, volumes, episodes, times, likeits, hits, comments_count, scores, score_count, text)), 
            columns=["title", "volume", "episode", "time", "likeit", "hit", "comments_count", "score", "score_count", "text"]
        )
    else:
        episode_df = pd.DataFrame(
            list(zip(titles, volumes, episodes, times, likeits, comments_count, scores, score_count, text)), 
            columns=["title", "volume", "episode", "time", "likeit", "comments_count", "score", "score_count", "text"]
        ) 
        
    episode_df["level"] = level
    episode_df["ID"] = ID
    
    
    episode_df.to_pickle(os.path.join("data", "episode", file_name))
    
    return episode_df

In [184]:
def make_episode_df(level, reset=False):
    
    if "main_df.csv" in os.listdir(os.path.join("data", level)):
        try:
            main_df = pd.read_csv(os.path.join('data', level, "main_df.csv"), index_col=0)
        except:
            main_df = pd.read_csv(os.path.join('data', level, "main_df.csv"), index_col=0, encoding="cp949") 
    else:
        main_df = make_main_df(level)
    main_df = main_df[main_df["episode_total"] != 0]
    
    data = list(map(get_novel_episode, main_df["ID"], main_df["level"], main_df["episode_total"], [reset]*len(main_df)))
    episode_df = pd.concat(data)
    episode_df.reset_index(drop=True, inplace=True)
    episode_df.to_csv(os.path.join("data", main_df["level"][0], "episode_df.csv"))
    
    return episode_df

In [145]:
episode_df = make_episode_df("webnovel")

## novel comment crawling

In [279]:
def get_novel_comments(ID, comments_count, get_main, level, reset, number=0):
    
    file_name = "{ID}-{number}.pickle".format(ID=ID, number=number)
    
    if "comment" not in os.listdir("data"):
        os.mkdir(os.path.join("data", "comment"))
    
    if (file_name in os.listdir(os.path.join("data", "comment"))) & (reset == False):
        return pd.read_pickle(os.path.join("data", "comment", file_name))
    
    if level == "webnovel":
        novel = "novel01"
    else:
        novel = "novel02"
    
    headers = {"Referer": "http://novel.naver.com/{level}/list.nhn?".format(level=level)}
    
    comment_list = []
    pages = (comments_count // 100) + 1
    for page in range(1, pages+1):
        
        if get_main:
            data = {
                "ticket": novel,
                "object_id": "novel-{ID}".format(ID=ID),
                "_ts": "1469461095606",
                "page_size": "100",
                "page_no": page,
                "sort": "newest"
            }
        else:
            data = {
                "ticket": novel,
                "object_id": "{ID}-{number}".format(ID=ID, number=number),
                "_ts": "1469461095606",
                "page_size": "100",
                "page_no": page,
                "sort": "newest"
            }

        comment_response = requests.post(
            "http://novel.naver.com/comments/list_comment.nhn", 
            headers=headers, 
            data=data
        )
        
        comment_list += json.loads(comment_response.text.replace("\\'", "\'"))["comment_list"]
    
    df = pd.DataFrame(comment_list)
    df.drop(
        [
            "comment_no", 
            "enc_writer_id", 
            "enc_writer_profile_user_id",
            "group_no",
            "reply_level",
            "status",
            "is_mine",
            "is_reply",
            "parent_comment_no",
            "deleted_yn",
            "is_yozm",
            "is_me2day",
            "visible_yn",
            "object_url",
            "writer_profile_user_id"
        ], 
        axis=1, 
        inplace=True
    )
    df.to_pickle(os.path.join("data", "comment", file_name))
    
    return df

In [280]:
def make_episode_novel_comment(episode_df, reset=False):
    
    episode_df = episode_df[episode_df["comments_count"] != 0]
    
    df_list = list(map(
            get_novel_comments,
            episode_df["ID"],
            episode_df["comments_count"],
            [False] * len(episode_df), 
            episode_df["level"],
            [reset] * len(episode_df),
            episode_df["volume"],
        ))
        
    episode_comments_df = pd.concat(df_list).reset_index(drop=True)
    episode_comments_df.to_csv(os.path.join("data", episode_df["level"][0], "episode_comments.csv"))
    
    return episode_comments_df

In [289]:
def make_main_novel_comment(main_df, reset=False):
    
    main_df = main_df[main_df["comments_count"] != 0]
    
    df_list = list(map(
            get_novel_comments,
            main_df["ID"],
            main_df["comments_count"],
            [True] * len(episode_df),
            main_df["level"],
            [reset] * len(episode_df)
        ))
        
    main_comment_df = pd.concat(df_list).reset_index(drop=True)
    main_comment_df.to_csv(os.path.join("data", episode_df["level"][0], "main_comments.csv"))
    
    return main_comment_df

In [290]:
main_comments_df = make_main_novel_comment(pd.read_csv("data/best/main_df.csv", encoding="cp949"), True)

547121
521183
489762
554438
528108
491876
407247
513949
533745
539286
351993
540184
187701
296333
89749
536509
479423
536598
541213
458846
529338
535202
533887
545751
531028
496902
539502
527274
97071
508392
498329
281502
409283
538109
547385
495848
346483
233214
530958
527501
492973
539450
263457
364337
512041
333567
532969
293152
481331
466329
525752
427497
388934
518861
531464
465054
374685
434212
509076
451833
271443
485027
406260
437101
428793
221468
459255
396706
201507
327076
350945
495256
510864
492007
338598
540513
468689
209544
431921
492297
488146
3158
505865
483114
388687
389896
509211
543794
553103
312588
502417
304434
474879
495121
535830
88465
80880
166510
5297
204370
232802
409402
26978
78390
556934
554134
537796
167300
428678
527279
429732
276429
100133
424301
438693
498955
475307
27287
308068
556339
551913
496503
553795
539172
545264
414512
5533
218931
13668
422031
549819
529288
379037
554716
521793
377527
532843
176274
272689
485257
537157
471473
401844
379168
511689

In [237]:
sum(episode_comments_df["writer_profile_user_id"] == episode_comments_df["writer_id"])

814971

In [227]:
len(episode_comments_df["writer_nickname"].unique())

91199

In [229]:
len(episode_comments_df["writer_id"].unique())

56138

In [230]:
len(episode_comments_df["writer_ip"].unique())

124383

In [233]:
len((episode_comments_df["writer_nickname"]+episode_comments_df["writer_id"]).unique())

145864

In [291]:
len((episode_comments_df["writer_nickname"]+episode_comments_df["writer_id"]+episode_comments_df["writer_ip"]).unique())

243478

In [258]:
pd.read_csv("data/webnovel/episode_df.csv", encoding="cp949")

Unnamed: 0.1,Unnamed: 0,title,volume,episode,time,likeit,comments_count,score,score_count,text,level,ID
0,0,에필로그- 허니허니 베이비 (1),96,96,2016.07.29,1832,164,9.99,2983,행복한 결혼생활에서 중요한 것은서로 얼마나 잘 맞는가보다 다른 점을 어떻게 극복해나...,webnovel,466391
1,1,"95화. 사랑하기에, 꿀 같은 결혼 (完)",95,95,2016.07.26,2456,384,9.98,3713,사랑하는 것이 인생이다.사람과 사람 사이의 결합이 있는 곳에 또한 기쁨이 있다.-괴...,webnovel,466391
2,2,"94화. 잘했으니까, 상 줘야겠네.",94,94,2016.07.22,2269,198,9.98,3521,행복한 결혼이란 용서하는 두 삶의 합일체다.-톨스토이\r\n“어제 그거 봤어? ‘이...,webnovel,466391
3,3,93화. 제보 대환영!,93,93,2016.07.19,2033,172,9.98,3273,결혼은 한 자루의 가위와 비슷해서 분리될 수 없도록 결합되어 있으며때로 반대 방향으...,webnovel,466391
4,4,92화. 그가 사랑하는 사람은 내가 아니라……,92,92,2016.07.15,2209,155,9.98,3522,이 세상에서 가장 아름다운 것들은 보이거나 만져질 수 없다.단지 가슴으로만 느낄 수...,webnovel,466391
5,5,"91화. 완벽한 당분, 강력한 증거",91,91,2016.07.12,2121,185,9.98,3467,"세상에게 당신은 한 사람에 불과할지 모르지만,누군가에게 당신은 세상일지도 모른다.-...",webnovel,466391
6,6,"90화. 너라는 사랑, 너라는 기적 덕분에.",90,90,2016.07.08,2110,163,9.98,3277,"행복한 결혼은 약혼 때부터 죽을 때까지,결코 지루하지 않은 긴 대화와 같은 것이다....",webnovel,466391
7,7,89화. 하시던 것 계속 하세요.,89,89,2016.07.05,2180,130,9.98,3526,"<사랑하고, 사랑하고 또 사랑하자.그런 다음 조금 더 사랑하자.-지니 허친슨>\r\...",webnovel,466391
8,8,"88화. 눈 감고 대체 무슨 생각을 한 거야, 넌?",88,88,2016.07.01,2064,106,9.98,3375,<단지 누구를 사랑한다고 해서 무조건 감싸야 한다는 뜻은 아니다.사랑은 상처를 덮는...,webnovel,466391
9,9,87화. 너 죽으면 나도 죽어.,87,87,2016.06.28,2313,180,9.99,3689,"<사랑이란, 불완전한 상대에 대한 무조건적인 헌신이다.-미상>혁준이 해수의 손을 잡...",webnovel,466391


In [292]:
episode_comments_df.head()

Unnamed: 0,contents,down_count,is_facebook,is_twitter,mobile_yn,modified_ymdt,object_id,registered_ymdt,ticket,up_count,writer_id,writer_ip,writer_nickname,writer_profile_type
0,이런 생활 갖고싶은 데 불가능이죠 현실적이지 않겠죠..ㅜㅜ 그래서 포기했네요,0,False,False,Y,2016-07-31T02:05:23.0+0900,466391-96,2016-07-31T02:05:23.0+0900,novel01,0,lion****,39.7.xxx.164,클클클,naver
1,음 그럼 아파서 울어도 안봐준다는말이 그런거였군 아 그렇군,0,False,False,N,2016-07-31T01:31:30.0+0900,466391-96,2016-07-31T01:31:30.0+0900,novel01,0,0310****,175.142.xxx.79,책벌레,naver
2,에필로그도 재미 있다 게속보코싶다 ㅠㅠ,0,False,False,Y,2016-07-31T00:33:15.0+0900,466391-96,2016-07-31T00:30:14.0+0900,novel01,1,0913****,223.33.xxx.122,홍당무,naver
3,살면서 해수는 여러번 반하겠죠? 저도 혁준같은 사람있으면 빨리 결혼하겠습니당,0,False,False,N,2016-07-30T23:24:24.0+0900,466391-96,2016-07-30T23:24:24.0+0900,novel01,0,wlgu****,58.237.xxx.181,꽁냥잉,naver
4,분명 나 닮은 딸 낳을거 걱정해서 보류한다하셨고 아들 싫다 했는데 그리고 셋만 낳...,0,False,False,Y,2016-07-31T02:44:10.0+0900,466391-96,2016-07-30T22:35:22.0+0900,novel01,1,choi****,124.199.xxx.112,망고봉봉,naver
