In [309]:
import re
import requests
import urlparse
import datetime
import json
from time import sleep
import random
from bs4 import BeautifulSoup

In [310]:
def find_between( s, first, last ):
    """Return the text between the first `first` and the next `last` after it.

    The search for `last` begins right after the end of `first`.
    Returns "" when either delimiter cannot be found.
    """
    opener_at = s.find( first )
    if opener_at == -1:
        return ""

    begin = opener_at + len( first )
    finish = s.find( last, begin )
    if finish == -1:
        return ""

    return s[ begin : finish ]
    
def find_between_r( s, first, last ):
    """Return the text between the last `first` and the last `last` after it.

    Both delimiters are searched from the right; `last` must occur at or after
    the end of the right-most `first`. Returns "" when either is missing.
    """
    opener_at = s.rfind( first )
    if opener_at == -1:
        return ""

    begin = opener_at + len( first )
    finish = s.rfind( last, begin )
    if finish == -1:
        return ""

    return s[ begin : finish ]
    
def find_between_h( s, first, last ):
    """Return the widest span: from the first `first` to the last `last`.

    `first` is searched from the left, `last` from the right (at or after the
    end of `first`). Returns "" when either delimiter cannot be found.
    """
    opener_at = s.find( first )
    if opener_at == -1:
        return ""

    begin = opener_at + len( first )
    finish = s.rfind( last, begin )
    if finish == -1:
        return ""

    return s[ begin : finish ]

def find_item( s, name, first, last, find_func = find_between ):
    """Extract a delimited value that follows `name` inside `s`.

    Everything up to and including the first occurrence of `name` is skipped,
    then `find_func` is applied to the remainder with the `first` / `last`
    delimiters.

    s         -- text to search
    name      -- marker that must precede the value (e.g. 'href=')
    first     -- opening delimiter passed to `find_func`
    last      -- closing delimiter passed to `find_func`
    find_func -- extraction helper called as find_func( text, first, last )

    Returns whatever `find_func` returns, or "" when `name` does not occur in
    `s`. The "" result matches the not-found convention of the find_between*
    helpers; previously a missing `name` escaped as a ValueError.
    """
    name_at = s.find( name )
    if name_at == -1:
        return ""

    remainder = s[ name_at + len( name ) : ]
    return find_func( remainder, first, last )

In [456]:
class NaverStockCrawler( object ):
    """Crawler for the Naver Finance discussion board of one stock code.

    It downloads a board listing page, extracts one record per listed post,
    and for every post can fetch the article text and its comments.
    """

    # Record keys, in the order they are matched against the <td> cells of a board row.
    BOARD_COLUMNS = ( "written_date", "title", "inv_op", "id", "hits", "sympathy", "antipathy" )

    # Fields copied from each entry of the comment JSON; every other field is dropped.
    COMMENT_FIELDS = ( "registered_ymdt", "contents", "writer_id", "object_id", "comment_no" )

    # Endpoint that receives the comment-list POST.
    CMT_LIST_URL = "http://finance.naver.com/comments/list_comment.nhn"

    # Seconds before an HTTP request is abandoned, so a stalled server cannot hang the crawl.
    REQUEST_TIMEOUT = 10

    def __init__( self, code ):
        """Remember the stock `code` and point the crawler at board page 1."""
        self.nvr_fnc_url = "http://finance.naver.com"
        self.code = code
        self.cur_page = 1
        self.stock_bbs_url = "http://finance.naver.com/item/board.nhn?code={}&page={}"
        self.cur_board_url = self.stock_bbs_url.format( self.code, self.cur_page )

    def set_date( self, begin, end ):
        """Placeholder: accepts a date range but does nothing with it yet."""
        pass

    def set_page( self, page ):
        """Select board page `page` (must be > 0) and return its URL."""
        assert page > 0
        self.cur_page = page
        self.cur_board_url = self.stock_bbs_url.format( self.code, self.cur_page )
        return self.cur_board_url

    def crawl_web( self, url = '' ):
        """Download `url` and return it parsed as a BeautifulSoup document.

        An empty `url` means "the currently selected board page".
        Raises whatever `requests.get` raises, including a timeout error after
        REQUEST_TIMEOUT seconds.
        """
        if not url:
            url = self.cur_board_url

        res = requests.get( url, timeout = self.REQUEST_TIMEOUT )
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
        # installed; pin one explicitly once the intended parser is confirmed.
        return BeautifulSoup( res.content )

    def extract_top_bbs( self, soup ):
        """Turn a parsed board listing page into a list of post records.

        Each record is a dict keyed by BOARD_COLUMNS, plus:
          article_sub_url -- site-relative link to the post ("" if the row has no link)
          is_sub          -- True when the title cell carries the reply icon

        Cells beyond the known columns are ignored and rows without any <td>
        are skipped, so an unexpected row layout cannot raise.
        """
        table = []
        rows = soup.find_all( 'tr', attrs = { "onmouseover" : "mouseOver(this)",
                                              "onmouseout" : "mouseOut(this)" } )

        for row in rows:
            cells = row.find_all( 'td' )
            if not cells:
                continue

            record = {}
            # zip() stops at the shorter sequence, so extra cells never index past the column list.
            for col_name, cell in zip( self.BOARD_COLUMNS, cells ):
                if col_name == "title":
                    # Standard CSS quoting: the quotes wrap the attribute value only.
                    link = cell.select_one( 'a[href^="/item/board_read.nhn"]' )
                    if link is None:
                        article_url = ""
                        title = cell.text
                    else:
                        # Attribute values are read from the tag itself: they are already
                        # entity-decoded, so no "amp;" clean-up is needed.
                        article_url = link.get( "href", "" )
                        title = link.get( "title", "" )

                    record[ "article_sub_url" ] = article_url
                    record[ col_name ] = " ".join( title.split() )
                    record[ "is_sub" ] = cell.find( 'img', attrs = { 'alt':u'답글'} ) is not None
                else:
                    record[ col_name ] = cell.text

            record[ "id" ] = record.get( "id", "" ).strip()
            table.append( record )

        return table

    def extract_article( self, soup ):
        """Return the article text of a parsed post page, whitespace-normalised.

        Every line has its runs of whitespace collapsed to single spaces.
        Returns "" when the page has no article body element.
        """
        body = soup.find( "div", class_ = "view_se", id = "body" )
        if body is None:
            return ""
        return "\n".join( " ".join( line.split() ) for line in body.text.splitlines() )

    def extract_cmt_url( self, soup ):
        """Return the absolute URL of the comment iframe of a parsed post page.

        Returns "" when the page has no comment iframe or the iframe has no src.
        """
        frame = soup.find( "iframe", id = 'ncomment' )
        src = frame.get( "src", "" ) if frame is not None else ""
        if not src:
            return ""
        return self.nvr_fnc_url + src

    def crawl_cmt( self, cmt_iframe_url, page_no = 1, page_size = 50 ):
        """Fetch one page of comments and return the decoded JSON response.

        cmt_iframe_url -- comment iframe URL; its ticketId, objectId and
                          secureKey query parameters are forwarded in the POST
        page_no        -- 1-based page number (must be > 0)
        page_size      -- comments per page (must be > 0)

        Raises KeyError when the URL lacks one of the three query parameters.
        """
        assert page_no > 0 and page_size > 0
        query = urlparse.parse_qs( urlparse.urlsplit( cmt_iframe_url ).query )

        # These are POST form fields (sent via `data=`), not HTTP headers.
        form_data = {
            'ticket' : query[ "ticketId" ][0],
            'object_id' : query[ "objectId" ][0],
            'lkey' : query[ "secureKey" ][0],
            'page_size' : str( page_size ),
            'page_no' : str( page_no ),
        }

        res_cmt = requests.post( self.CMT_LIST_URL, data = form_data,
                                 timeout = self.REQUEST_TIMEOUT )
        return json.loads( res_cmt.content )

    def crawl_cmt_loop( self, cmt_iframe_url, page_size = 50 ):
        """Fetch all comments of a post, page by page.

        Returns a list of dicts restricted to COMMENT_FIELDS. Paging stops when
        the reported total has been covered, when a page comes back empty, or
        when the response reports an error (comments gathered so far are
        returned in that case).
        """
        assert page_size > 0
        cmt_table = []
        covered = 0
        total_cmt_cnt = page_size   # placeholder so the first request is always made
        page_no = 1

        while covered < total_cmt_cnt:
            cmt_json = self.crawl_cmt( cmt_iframe_url, page_no = page_no, page_size = page_size )

            if cmt_json[ "error" ][ "message" ] != "No Error":
                return cmt_table

            total_cmt_cnt = cmt_json[ "total_count" ]
            comments = cmt_json[ "comment_list" ]
            if not comments:
                # Nothing more to read even if the reported total says otherwise.
                break

            for comment in comments:
                cmt_table.append( { key : value for key, value in comment.items()
                                    if key in self.COMMENT_FIELDS } )

            page_no += 1
            covered += page_size

        return cmt_table

    def _print_article( self, article_url ):
        """Fetch one post and print its URL, text, comment URL and comments."""
        print( "article_sub_url :  %s" % ( article_url, ) )

        # Random pause of 0 to 1 second between post requests.
        sleep( random.randint( 0, 100 ) * 0.01 )

        sub_soup = self.crawl_web( article_url )
        article = self.extract_article( sub_soup )
        cmt_url = self.extract_cmt_url( sub_soup )

        print( "article:  %s" % ( article, ) )
        print( "cmt_url:  %s" % ( cmt_url, ) )

        # Page size 2 is kept from the original run; presumably a debugging value.
        cmts = self.crawl_cmt_loop( cmt_url, 2 ) if cmt_url else []
        print( "" )
        for cmt in cmts:
            for ck, ci in cmt.items():
                print( "%s :  %s" % ( ck, ci ) )
            print( "" )

    def start_crawler( self ):
        """Crawl the current board page and print every post with its comments.

        Output uses single-argument print( ... ) calls, which produce the same
        text under Python 2 and Python 3.
        """
        soup = self.crawl_web()
        for post in self.extract_top_bbs( soup ):
            for key, value in post.items():
                if key == "article_sub_url" and value:
                    self._print_article( self.nvr_fnc_url + value )
                else:
                    print( "%s :  %s" % ( key, value ) )
            print( "=" * 90 )

In [457]:
# Demo run: crawl board page 1 for stock code 037230 and print each listed
# post together with its article text and comments (performs live HTTP requests).
stock_crawler = NaverStockCrawler( "037230" ) # Hankuk Package
stock_crawler.set_page( 1 )
stock_crawler.start_crawler()
# Sample comment-iframe URL for trying crawl_cmt / crawl_cmt_loop by hand;
# only the disabled calls below use it.
test_cmt_url = "http://finance.naver.com/ncomment/list.nhn?ticketId=finance2&objectId=56425873&secureKey=j-2rP7uK5gJKNggDWeTmm5s8zoq5b4vHtF-2LA6d7h6HgMDPp4Wzsw&parentUrl=http%3A%2F%2Ffinance.naver.com%2Fitem%2Fboard_read.nhn%3Fcode%3D037230%26nid%3D56425873%26st%3D%26sw%3D%26page%3D3"
# Disabled experiments. Note that page_size = 0 fails crawl_cmt's
# `page_size > 0` assertion.
#stock_crawler.crawl_cmt( test_cmt_url, 1, 0 )
#stock_crawler.crawl_cmt_loop( test_cmt_url, 1 )


article_sub_url :  http://finance.naver.com/item/board_read.nhn?code=037230&nid=56502652&st=&sw=&page=1
article:  지지률상승
이재명 화이팅
cmt_url:  http://finance.naver.com/ncomment/list.nhn?ticketId=finance2&objectId=56502652&secureKey=j-2rP7uK5gKjvSiNtnYfWDWT_CBzRfR-geGWe0tiLFVNKC4TeZgvFQ&parentUrl=http%3A%2F%2Ffinance.naver.com%2Fitem%2Fboard_read.nhn%3Fcode%3D037230%26nid%3D56502652%26st%3D%26sw%3D%26page%3D1

hits :  114
title :  주말 촛불집회
written_date :  2016.12.16 18:03
is_sub :  False
antipathy :  2
sympathy :  2
id :  pjh5****
inv_op :  의견없음
article_sub_url :  http://finance.naver.com/item/board_read.nhn?code=037230&nid=56500458&st=&sw=&page=1
article:  조례지정!
경기도지사 멋지네용.
벌써 움직이네요.
액션이 빠르네용.
cmt_url:  http://finance.naver.com/ncomment/list.nhn?ticketId=finance2&objectId=56500458&secureKey=j-2rP7uK5gKxWt5yY9NIE5MbF_wd6lZSgeGWe0tiLFWk9cOo678kzg&parentUrl=http%3A%2F%2Ffinance.naver.com%2Fitem%2Fboard_read.nhn%3Fcode%3D037230%26nid%3D56500458%26st%3D%26sw%3D%26page%3D1

hits :  100
title :  경기