In [126]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
import time

# Notebook display configuration: show up to 1000 rows/columns and print
# floats with two decimal places.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.set_option("display.float_format", '{:.2f}'.format)

In [132]:
def get_num_episodes(chromedriver_path, platform = 'naver', lang = 'kr', titleId = 747269):
    """
    Return the episode count of a webtoon, read from its episode-list page.

    A Chrome browser is started for the lookup and always shut down again,
    even when navigation or an element lookup fails.

    Parameters
    ----------
    chromedriver_path : str
        Path of the chromedriver executable used to start Chrome.
    platform : str
        Only 'naver' is supported.
    lang : str
        'kr' reads the list page on comic.naver.com, 'en' reads the list page
        on www.webtoons.com.
    titleId : int or str
        Title identifier inserted into the list-page URL.

    Returns
    -------
    int
        'kr': the number in front of the first '.' of the selected episode
        link text, plus one. 'en': the ``data-episode-no`` attribute of the
        first ``li._episodeItem`` element.

    Raises
    ------
    ValueError
        If the platform/lang combination is not supported.
    """
    # Reject unsupported combinations before a browser is started; previously
    # this surfaced later as an UnboundLocalError with Chrome left running.
    if platform != 'naver' or lang not in ('kr', 'en'):
        raise ValueError(
            "Unsupported platform/lang combination: {!r}/{!r}".format(platform, lang)
        )

    if lang == 'kr':
        print('titleId가 ', titleId, "인 웹툰의 episode 개수를 구합니다.")
        base_url = 'https://comic.naver.com/webtoon/list?' + 'titleId=' + str(titleId)
    else:
        print("Get the number of episodes of titleId, ", titleId)
        # NOTE(review): the series slug is hard-coded to 'omniscient-reader';
        # confirm the site resolves other titles from title_no alone.
        base_url = 'https://www.webtoons.com/en/action/omniscient-reader/list?' + 'title_no=' + str(titleId)

    # chrome driver setting
    service = Service(executable_path=chromedriver_path)
    TIMEOUT = 1
    driver = webdriver.Chrome(service=service)

    try:
        # The implicit wait applies to the element lookups below.
        driver.implicitly_wait(TIMEOUT)

        print(base_url)
        driver.get(base_url)

        if lang == 'kr':
            # Title link in the second row of the episode table; its text is
            # expected to start with "<episode number>." (presumably the
            # newest episode - confirm against the live page layout).
            pages = driver.find_element(by=By.CSS_SELECTOR, value="#content > table > tbody > tr:nth-child(2) > td.title > a")
            num_episodes = int(pages.text.split('.')[0]) + 1
        else:
            # First episode item of the list; its data attribute carries the
            # episode number.
            element = driver.find_element(by=By.CSS_SELECTOR, value="li._episodeItem")
            num_episodes = int(element.get_attribute("data-episode-no"))
        print("num_episodes:", num_episodes)
    finally:
        # Always release the browser and the chromedriver process.
        driver.quit()

    return num_episodes

In [149]:
def get_comments(driver, df, lang = 'kr', titleId = 747269, episode_no = 1):
    """
    Collect the comments of one episode, page by page.

    Parameters
    ----------
    driver : selenium WebDriver
        An already started browser; it is navigated to the episode page and
        left open for the caller to shut down.
    df : pandas.DataFrame
        Frame with the columns 'Episode', 'Date', 'Review'; the collected
        rows are appended after its existing rows.
    lang : str
        'kr' for comic.naver.com, 'en' for www.webtoons.com.
    titleId : int or str
        Title identifier inserted into the episode URL.
    episode_no : int
        Episode number inserted into the episode URL and stored in the
        'Episode' column.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per collected comment.

    Raises
    ------
    ValueError
        If ``lang`` is neither 'kr' nor 'en'.
    """
    if lang == 'kr':
        base_url = 'https://comic.naver.com/webtoon/detail?'
        episode_url = base_url + 'titleId=' + str(titleId) + '&no='+str(episode_no)
    elif lang == 'en':
        # https://www.webtoons.com/en/action/omniscient-reader/episode-118/viewer?title_no=2154&episode_no=119
        # NOTE(review): the series slug is hard-coded and the slug number is
        # assumed to be episode_no - 1; confirm for other titles.
        base_url = 'https://www.webtoons.com/en/action/omniscient-reader/episode-'
        episode_url = base_url + str(episode_no -1) + '/viewer?title_no=' + str(titleId) + '&episode_no='+str(episode_no)
    else:
        # Previously this fell through to an UnboundLocalError on episode_url.
        raise ValueError("Unsupported lang: {!r}".format(lang))

    print(episode_url)
    # The implicit wait applies to the element lookups below.
    driver.implicitly_wait(1)
    driver.get(episode_url)

    if lang == 'kr':
        # Select the frame that holds the comment section.
        driver.switch_to.frame('commentIframe')

        # Turn off the "cleanbot" comment filter so that filtered comments
        # are collected as text instead of a placeholder message.
        driver.find_element(By.CLASS_NAME,'u_cbox_cleanbot_setbutton').click()
        time.sleep(0.2)
        driver.find_element(By.CLASS_NAME,'u_cbox_layer_cleanbot2_checkbox').click()
        time.sleep(0.2)
        driver.find_element(By.CLASS_NAME,'u_cbox_layer_cleanbot2_extrabtn').click()
        time.sleep(0.2)

        # Click "all comments".
        driver.find_element(By.CSS_SELECTOR,'#cbox_module > div > div.u_cbox_view_comment > a').click()
        time.sleep(0.5)
    # 'en': no iframe, no cleanbot setting and no "all comments" link to click.

    # Move to the last page and read the total number of pages.
    driver.find_element(By.CSS_SELECTOR,'#cbox_module > div > div.u_cbox_paginate > div > a.u_cbox_next.u_cbox_next_end').click()
    time.sleep(1)

    u_cbox_num_page = driver.find_element(By.CSS_SELECTOR,'#cbox_module > div > div.u_cbox_paginate > div > strong > span.u_cbox_num_page')
    time.sleep(1)
    num_pages = u_cbox_num_page.text
    print("num_pages:",num_pages)
    total_pages = int(num_pages)

    # Move back to the first page.
    driver.find_element(By.CSS_SELECTOR,'#cbox_module > div > div.u_cbox_paginate > div > a.u_cbox_pre.u_cbox_pre_end').click()
    time.sleep(0.5)

    # Per-page frames are gathered here and concatenated once at the end, so
    # the accumulated rows are not copied again for every page.
    frames = [df]

    for page in range(total_pages):
        if lang == 'kr':
            # Switch to the most recent tab and re-enter the comment frame.
            driver.switch_to.window(driver.window_handles[-1])
            driver.switch_to.frame("commentIframe")

        soup = bs(driver.page_source,"html.parser")

        # Extract dates
        dates = [tag.text for tag in soup.find_all("span", {"class":["u_cbox_date"]})]

        # Extract reviews
        reviews = [tag.get_text() for tag in soup.find_all("span", {"class":["u_cbox_contents"]})]

        # The scalar episode number is broadcast to every row of the page.
        frames.append(pd.DataFrame({'Episode': int(episode_no), 'Date': dates, 'Review': reviews}))
        print('.', end='')

        # Move to the next page, unless this was the last one.
        if page + 1 < total_pages:
            driver.find_element(By.CSS_SELECTOR, "#cbox_module > div > div.u_cbox_paginate > div > strong + a").click()
            time.sleep(0.2)

    return pd.concat(frames)

In [147]:
def get_webtoon_titleId(chromedriver_path, platform, lang, titleName):
    """
    Look up the title id of a webtoon by searching for its name.

    A Chrome browser is started for the search and always shut down again,
    even when navigation or an element lookup fails.

    Parameters
    ----------
    chromedriver_path : str
        Path of the chromedriver executable used to start Chrome.
    platform : str
        Only 'naver' is supported.
    lang : str
        'kr' searches comic.naver.com, 'en' searches www.webtoons.com.
    titleName : str
        Title to search for; a result must match it exactly.

    Returns
    -------
    str
        The part of the matching result link after '?titleId=' ('kr') or
        '?titleNo=' ('en'). If several results match, the last one is used.

    Raises
    ------
    ValueError
        If the platform/lang combination is not supported, or if no search
        result matches ``titleName``.
    """
    # Reject unsupported combinations before a browser is started; previously
    # this surfaced later as an UnboundLocalError with Chrome left running.
    if platform != 'naver' or lang not in ('kr', 'en'):
        raise ValueError(
            "Unsupported platform/lang combination: {!r}/{!r}".format(platform, lang)
        )

    if lang == 'kr':
        print(titleName,"의 titleId를 구합니다.")
        base_url = 'https://comic.naver.com/index'
    else:
        print("Get the title number of the ", titleName)
        base_url = 'https://www.webtoons.com/en/'

    # chrome driver setting
    service = Service(executable_path=chromedriver_path)
    TIMEOUT = 1
    driver = webdriver.Chrome(service=service)

    titleId = None
    try:
        # The implicit wait applies to the element lookups below.
        driver.implicitly_wait(TIMEOUT)

        print(base_url)
        driver.get(base_url)

        search_query = titleName

        if lang == 'kr':
            # https://comic.naver.com/search?keyword=전지적+독자+시점
            # Raw string: the backslash escapes the '.' inside the CSS id.
            search_box_css = r'input#gnb\.keyword'
            driver.find_element(by=By.CSS_SELECTOR, value=search_box_css).clear()  # reset the search box
            driver.find_element(by=By.CSS_SELECTOR, value=search_box_css).send_keys(search_query)  # type the query
            driver.find_element(by=By.CSS_SELECTOR, value= '#search_bar_button > span').click()  # click the search button
            time.sleep(1)

            # Select "webtoon" (second entry of the list above the results).
            driver.find_element(by=By.CSS_SELECTOR, value='#content > div.searchTxt > ul > li:nth-child(2) > a').click()
            time.sleep(1)

            names = driver.find_elements(by=By.CSS_SELECTOR, value='#content ul h5 > a')
            for name in names:
                if name.text == titleName:
                    href = name.get_attribute('href')
                    titleId = href.split('?titleId=')[1]
                    print(titleName, 'titleId ', titleId)
        else:
            # https://www.webtoons.com/en/search?keyword=Omniscient%20Reader
            search_box_css = 'input.input_search'
            driver.find_element(by=By.CSS_SELECTOR, value= 'a.btn_search').click()  # open the search box
            driver.find_element(by=By.CSS_SELECTOR, value=search_box_css).clear()  # reset the search box
            driver.find_element(by=By.CSS_SELECTOR, value=search_box_css).send_keys(search_query)  # type the query
            driver.find_element(by=By.CSS_SELECTOR, value=search_box_css).send_keys(Keys.RETURN)  # press Enter

            # Give the result page time to load before collecting the links
            # (the pause used to come after the lookup, where it had no effect).
            time.sleep(1)

            # ORIGINALS results
            names = driver.find_elements(by=By.CSS_SELECTOR, value='#content > div.card_wrap.search._searchResult > ul > li > a')
            # CANVAS results would be:
            # '#content > div.card_wrap.search._searchResult > div.challenge_lst.search > ul  a > p.subj'
            for name in names:
                subj = name.find_element(by=By.CSS_SELECTOR, value='div > p.subj')
                if subj.text == titleName:
                    href = name.get_attribute('href')
                    titleId = href.split('?titleNo=')[1]
                    print(titleName, 'titleId ', titleId)
    finally:
        # Always release the browser and the chromedriver process.
        driver.quit()

    if titleId is None:
        # Previously this surfaced as an UnboundLocalError at the return.
        raise ValueError("No search result matches the title {!r}".format(titleName))
    return titleId

In [143]:
# Argument setting
# Raw string so the backslashes of the Windows path are never read as escape
# sequences.
chromedriver_path = r'C:\Temp\chromedriver.exe'
platform = 'naver'
lang ='en'
if lang == 'kr':
    webtoon = '전지적 독자 시점'
elif lang == 'en':
    webtoon = 'Omniscient Reader'
else:
    # Fail here instead of with a NameError on `webtoon` further down.
    raise ValueError("Unsupported lang: {!r}".format(lang))

# Look up the title id from the webtoon name.
title_id = get_webtoon_titleId(chromedriver_path, platform, lang, webtoon)

# Look up the total number of episodes of the webtoon.
num_episodes = get_num_episodes(chromedriver_path, platform, lang, title_id)

# Create the folder the results are saved to (no error if it already exists).
DATA_PATH = './data/'+lang+'/'
os.makedirs(DATA_PATH, exist_ok=True)

Get the title number of the  Omniscient Reader
https://www.webtoons.com/en/
Omniscient Reader titleId  2154
Get the number of episodes of titleId,  2154
https://www.webtoons.com/en/action/omniscient-reader/list?title_no=2154
num_episodes: 121


In [150]:
# Crawl every episode and save its comments to one CSV file per episode.
# NOTE(review): the range starts at episode 2 and stops before num_episodes;
# confirm that skipping episode 1 and the value num_episodes itself is intended.
for episode_no in range(2, num_episodes):

    # Empty frame the comments of this episode are appended to.
    df = pd.DataFrame(data=[], columns=['Episode','Date','Review'])

    # Start a fresh browser per episode. The Service is built here from
    # chromedriver_path and the id comes from title_id, so the cell no longer
    # depends on `service` / `titleId`, which no cell defines.
    driver = webdriver.Chrome(service=Service(executable_path=chromedriver_path))
    try:
        df = get_comments(driver, df, lang, title_id, episode_no)
    finally:
        # quit() ends the whole WebDriver session; close() only closes the
        # current window.
        driver.quit()

    # Save as a CSV file.
    file_path = DATA_PATH + '{lang}_{webtoon}_episode_{episode_no}.csv'.format(lang=lang, webtoon=webtoon, episode_no=episode_no)
    df.to_csv(file_path,index = False)
    print("Episode", episode_no, " saved.")

https://www.webtoons.com/en/action/omniscient-reader/episode-1/viewer?title_no=2154&episode_no=2
num_pages: 280
........................................................................................................................................................................................................................................................................................Episode 2  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-2/viewer?title_no=2154&episode_no=3
num_pages: 173
.............................................................................................................................................................................Episode 3  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-3/viewer?title_no=2154&episode_no=4
num_pages: 198
...............................................................................................................................................................................

..........................................................................................................................Episode 32  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-32/viewer?title_no=2154&episode_no=33
num_pages: 80
................................................................................Episode 33  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-33/viewer?title_no=2154&episode_no=34
num_pages: 55
.......................................................Episode 34  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-34/viewer?title_no=2154&episode_no=35
num_pages: 92
............................................................................................Episode 35  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-35/viewer?title_no=2154&episode_no=36
num_pages: 72
........................................................................Episode 36  saved.
https://www.webtoons.com/en/acti

........................................................................................Episode 69  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-69/viewer?title_no=2154&episode_no=70
num_pages: 74
..........................................................................Episode 70  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-70/viewer?title_no=2154&episode_no=71
num_pages: 56
........................................................Episode 71  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-71/viewer?title_no=2154&episode_no=72
num_pages: 61
.............................................................Episode 72  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-72/viewer?title_no=2154&episode_no=73
num_pages: 79
...............................................................................Episode 73  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-73/viewer?title_no=2154&episode_no

num_pages: 42
..........................................Episode 111  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-111/viewer?title_no=2154&episode_no=112
num_pages: 51
...................................................Episode 112  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-112/viewer?title_no=2154&episode_no=113
num_pages: 38
......................................Episode 113  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-113/viewer?title_no=2154&episode_no=114
num_pages: 23
.......................Episode 114  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-114/viewer?title_no=2154&episode_no=115
num_pages: 52
....................................................Episode 115  saved.
https://www.webtoons.com/en/action/omniscient-reader/episode-115/viewer?title_no=2154&episode_no=116
num_pages: 34
..................................Episode 116  saved.
https://www.webtoons.com/en/action/omniscient-reade

In [151]:
# Marker cell: signals that the crawl loop above has finished.
print("Done.")

Done.


# Merge CSV files

In [153]:
from glob import glob

# Merge the per-episode CSV files in DATA_PATH into one "total" CSV file.
total_path = DATA_PATH + '{lang}_{webtoon}_episode_total.csv'.format(lang=lang, webtoon=webtoon)

# Skip the merged file itself: it matches '*.csv' too, so re-running this cell
# would otherwise append the previous total and duplicate every row.
# sorted() makes the file order independent of the file system.
file_names = [
    file_name
    for file_name in sorted(glob(DATA_PATH + '*.csv'))
    if os.path.basename(file_name) != os.path.basename(total_path)
]
print(file_names)

# Read every file first and concatenate once instead of growing the frame
# inside the loop.
frames = [pd.read_csv(file_name, encoding='utf-8', low_memory=False) for file_name in file_names]
total = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

total.to_csv(total_path)
print('Done.')

['./data/en\\en_Omniscient Reader_episode_1.csv', './data/en\\en_Omniscient Reader_episode_10.csv', './data/en\\en_Omniscient Reader_episode_100.csv', './data/en\\en_Omniscient Reader_episode_101.csv', './data/en\\en_Omniscient Reader_episode_102.csv', './data/en\\en_Omniscient Reader_episode_103.csv', './data/en\\en_Omniscient Reader_episode_104.csv', './data/en\\en_Omniscient Reader_episode_105.csv', './data/en\\en_Omniscient Reader_episode_106.csv', './data/en\\en_Omniscient Reader_episode_107.csv', './data/en\\en_Omniscient Reader_episode_108.csv', './data/en\\en_Omniscient Reader_episode_109.csv', './data/en\\en_Omniscient Reader_episode_11.csv', './data/en\\en_Omniscient Reader_episode_110.csv', './data/en\\en_Omniscient Reader_episode_111.csv', './data/en\\en_Omniscient Reader_episode_112.csv', './data/en\\en_Omniscient Reader_episode_113.csv', './data/en\\en_Omniscient Reader_episode_114.csv', './data/en\\en_Omniscient Reader_episode_115.csv', './data/en\\en_Omniscient Reader_e