In [None]:
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Set up the WebDriver: ChromeDriverManager().install() provides the value
# handed to the Chrome Service.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service)

# Open the page used for logging in (login is needed for adult verification).
URL_login = "https://webtoon.kakao.com/more"
driver.get(URL_login)

In [None]:
import json
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re

# # 웹드라이버 설정 (Chrome 기준)
# options = webdriver.ChromeOptions()
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-dev-shm-usage')
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open the Kakao Webtoon ranking page.
url = "https://webtoon.kakao.com/ranking"
driver.get(url)
time.sleep(2)  # fixed wait for the page to load

# CSS selector template for a category tab; "{id}" is filled in with the tab's
# nth-child position via str.format (the real-time ranking tab is excluded by
# the caller's range).
# Raw string: the backslashes are CSS escapes for "[" / "]" in the class name
# and must reach the browser verbatim; in a normal string literal "\[" is an
# invalid escape sequence that Python warns about.
cate_sel = r"#root > main > div > div.translate-y-0.transition-transform.duration-\[250ms\].z-navigationBar.left-0.fixed.top-0.w-full > div > div:nth-child(4) > div.w-full.h-full.relative > ul > li:nth-child({id})"

# Accumulates one dict per scraped webtoon.
webtoon_info_list = []

def scroll_to_bottom(driver, *, pause=2, max_scrolls=None):
    """Scroll the page down repeatedly until its height stops growing.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium WebDriver).
        pause: Seconds to wait after each scroll so new content can load.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps scrolling until the height is unchanged, which
            never terminates on a page that grows forever.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        time.sleep(pause)  # wait for newly loaded content
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # No growth after the last scroll: the bottom has been reached.
            break
        last_height = new_height

def scroll_to_element(driver, element):
    """Wait for *element* to be visible, then scroll it into view.

    Waits up to 10 seconds on the visibility condition, scrolls with
    ``scrollIntoView(true)``, and finally sleeps 3 seconds so the page can
    settle after the scroll.
    """
    visibility_wait = WebDriverWait(driver, 10)
    visibility_wait.until(EC.visibility_of(element))

    scroll_script = "arguments[0].scrollIntoView(true);"
    driver.execute_script(scroll_script, element)

    # Fixed pause after scrolling to let the page finish loading.
    time.sleep(3)

def _convert_views(view_text):
    """Convert a count such as "1,234", "12.3만" or "1.2억" to an int.

    Returns "-" when the text cannot be interpreted as a number.
    """
    cleaned = view_text.replace(",", "").strip()
    for unit, factor in (("억", 100000000), ("만", 10000)):
        if unit in cleaned:
            try:
                # round() instead of int(): a float product can land just
                # below the intended integer and int() would truncate it.
                return round(float(cleaned.replace(unit, "")) * factor)
            except ValueError:
                return "-"
    return int(cleaned) if cleaned.isdigit() else "-"


def get_webtoon_details(url, thumbnail=""):
    """Scrape one webtoon's profile, ticket and episode tabs into a dict.

    Relies on the module-level ``driver`` (it navigates whichever window is
    currently focused) and on ``webtoon_info_list`` (only to derive the next
    sequential ``id``).

    Args:
        url: Base URL of the webtoon page; "?tab=profile", "?tab=ticket" and
            "?tab=episode" are appended to it.
        thumbnail: Thumbnail URL stored in the result. Defaults to "" so the
            function no longer depends on a global named ``thumbnail``.

    Returns:
        dict with the keys id, type, platform, title, status, thumbnail,
        genre, views, rating, like, description, keywords, author,
        illustrator, original, age_rating, price, url and episode.
        ``views``/``like`` are ints or "-" when a value was found;
        ``episode`` is an int count or "정보 없음".
    """
    # Imported locally: the except clause further down needs this name and
    # the file's top-level imports do not provide it.
    from selenium.common.exceptions import TimeoutException

    profile_url = url + "?tab=profile"
    episode_url = url + "?tab=episode"
    purchase_url = url + "?tab=ticket"

    driver.get(profile_url)
    time.sleep(3)  # fixed wait for the profile tab to render

    soup = bs(driver.page_source, "html.parser")

    webtoon_info = {
        "id": len(webtoon_info_list) + 1,
        "type": "웹툰",
        "platform": "카카오웹툰",
        "title": "",
        "status": "",
        "thumbnail": thumbnail,
        "genre": "",
        "views": "",
        "rating": "-",
        "like": "",
        "description": "",
        "keywords": [],
        "author": "",
        "illustrator": "",
        "original": "",
        "age_rating": "전체이용가",  # default; overridden by the 15 / adult checks below
        "price": "",
        "url": url,
        "episode": ""
    }

    # Title
    title = soup.find("p", class_="whitespace-pre-wrap break-all break-words support-break-word overflow-hidden text-ellipsis !whitespace-nowrap s22-semibold-white text-center leading-26")
    if title:
        webtoon_info["title"] = title.text.strip()

    # Synopsis
    description = soup.find("p", class_="whitespace-pre-wrap break-all break-words support-break-word s13-regular-white leading-20 overflow-hidden")
    if description:
        webtoon_info["description"] = description.text.strip()

    # Keywords (leading '#' removed)
    keywords = soup.find_all("p", class_="whitespace-pre-wrap break-all break-words support-break-word overflow-hidden text-ellipsis !whitespace-nowrap s14-medium-white")
    webtoon_info["keywords"] = [keyword.text.strip().lstrip('#') for keyword in keywords]

    # Genre / views / likes: the first three entries of each stats row, in that order.
    stats = soup.find_all("div", class_="flex justify-center items-start h-14 mt-8 leading-14")
    for stat in stats:
        infoes = stat.find_all("p", class_="whitespace-pre-wrap break-all break-words support-break-word s12-regular-white ml-2 opacity-75")
        for key, info in zip(("genre", "views", "like"), infoes):
            stat_text = info.get_text(strip=True)
            webtoon_info[key] = stat_text if key == "genre" else _convert_views(stat_text)

    # Status / price / age rating, derived from the badge labels.
    badges = soup.find_all("div", class_="flex flex-wrap gap-4 mb-12")
    days = []

    for badge in badges:
        badge_tags = badge.find_all("p", class_=lambda x: x and "whitespace-pre-wrap" in x and "font-badge" in x)
        for badge_tag in badge_tags:
            label = badge_tag.text.strip()

            # Status: "완결"/"휴재" are kept as-is, "연재"/"시즌완결" both map
            # to "연재", and weekday labels are collected separately.
            if label in ("완결", "휴재"):
                webtoon_info["status"] = label
            elif label in ("연재", "시즌완결"):
                webtoon_info["status"] = "연재"
            elif label in ("월", "화", "수", "목", "금", "토", "일"):
                days.append(label)

            # Price: a "... 마다 무료" badge is stored verbatim; a badge
            # containing "연재" marks the title as "무료".
            if "마다 무료" in label:
                webtoon_info["price"] = label
            elif "연재" in label:
                webtoon_info["price"] = "무료"

            # Age rating from a badge mentioning 15 (the adult check below
            # takes precedence because it runs afterwards).
            if "15" in label:
                webtoon_info["age_rating"] = "15세이용가"

    # Prefix the weekdays when the title is ongoing and days were found.
    if webtoon_info["status"] == "연재" and days:
        webtoon_info["status"] = f"{', '.join(days)} {webtoon_info['status']}"

    # No status badge at all: treat as ongoing.
    if not webtoon_info["status"]:
        webtoon_info["status"] = "연재"

    # Adult rating: detected through the alt text of the badge image.
    adult_img = soup.select_one("#root > main > div > div > div.inset-x-auto.relative > div.relative.-mb-11.z-2 > div.h-full.mx-11.pb-100 > div > div:nth-child(1) > div > img")
    if adult_img and adult_img.get('alt') == '성인':
        webtoon_info["age_rating"] = "19세이용가"

    # Price still unknown: look for a purchase button on the ticket tab.
    if not webtoon_info["price"]:
        driver.get(purchase_url)
        time.sleep(2)

        try:
            # Wait (up to 10 seconds) for the purchase button to become clickable.
            purchase_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[.//span[contains(text(), '구매')]]"))
            )

            purchase_button.click()
            time.sleep(2)

            # Read the price text shown after clicking.
            price_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "button.relative.px-10.py-0.w-\\[130px\\].px-19.h-40.btn-blue p.whitespace-pre-wrap.break-all.break-words.support-break-word.font-number.s16-medium-white"))
            )
            webtoon_info["price"] = price_element.text
        except TimeoutException:
            # Either wait timed out (e.g. no purchase button): record as free.
            webtoon_info["price"] = "무료"

    # Writer / illustrator / original work, read positionally from the <dd> values.
    com_author = soup.find_all("dd", class_="whitespace-pre-wrap break-all break-words support-break-word s13-regular-white leading-16 flex-1")
    labels = soup.find_all("dt", class_="whitespace-pre-wrap break-all break-words support-break-word s13-regular-white opacity-50 leading-16 mr-8 flex-none w-54")
    if len(com_author) > 0:
        webtoon_info["author"] = com_author[0].text.strip()
    if len(com_author) > 1:
        webtoon_info["illustrator"] = com_author[1].text.strip()

    # The third value is the original work only when its label says so; the
    # length check on com_author avoids an IndexError if the lists differ.
    if len(labels) > 2 and len(com_author) > 2 and labels[2].text.strip() == "원작":
        original_text = com_author[2].text.strip()
        webtoon_info["original"] = original_text.split(',')[0].strip()

    # Episode count: load the episode tab fully, then count the list items.
    driver.get(episode_url)
    time.sleep(2)

    scroll_to_bottom(driver)

    episode_soup = bs(driver.page_source, "html.parser")

    episode_list = episode_soup.find('ul', class_='flex flex-wrap')
    if episode_list:
        webtoon_info["episode"] = len(episode_list.find_all('li'))

        # A "다시보기 안내" notice turns "연재" into "재연재" in the status.
        rewatch_notice = episode_soup.find('p', class_='whitespace-pre-wrap break-all break-words support-break-word overflow-hidden text-ellipsis !whitespace-nowrap leading-14 s12-regular-white', string='다시보기 안내')
        if rewatch_notice and '연재' in webtoon_info["status"]:
            webtoon_info["status"] = webtoon_info["status"].replace('연재', '재연재')
    else:
        webtoon_info["episode"] = "정보 없음"

    return webtoon_info


# CSS selectors for a ranking card's thumbnail <img>, tried in order.
# The first is a raw string: its backslashes are CSS escapes that must be
# passed through verbatim ("\:" and "\[" are invalid escapes in a normal
# Python string literal).
thumbnail_selectors = (
    r"#root > main > div > div.px-11.mx-auto.my-0.w-full.lg\:w-default-max-width.md\:w-\[490px\] > div.flex.flex-wrap.gap-4.content-start.relative.w-full.mt-4 > div:nth-child(1) > div > div > a > picture:nth-child(2) > img",
    "img.w-full.h-full.object-cover.object-top",
)
output_path = "webtoon_data1.json"

# Category tabs at nth-child positions 2..9 (named category_index rather than
# `id` to avoid shadowing the builtin).
for category_index in range(2, 10):
    category = driver.find_element(By.CSS_SELECTOR, cate_sel.format(id=category_index))
    category.click()
    time.sleep(2)

    # Collect the webtoon links on the page; the slice skips the first link
    # and keeps at most 10 per category, so no extra counter is needed.
    webtoons = driver.find_elements(By.CSS_SELECTOR, "a[href*='/content/']")
    for webtoon in webtoons[1:11]:
        # Bring the card into view before reading from it.
        scroll_to_element(driver, webtoon)
        time.sleep(2)
        webtoon_url = webtoon.get_attribute("href")

        # Thumbnail lookup is best-effort: fall through the selectors and keep
        # "" if none matches. `except Exception` (not a bare except) so that
        # Ctrl-C still interrupts the crawl.
        thumbnail = ""
        for selector in thumbnail_selectors:
            try:
                thumbnail = webtoon.find_element(By.CSS_SELECTOR, selector).get_attribute("src")
            except Exception:
                continue
            break

        # Scrape the detail pages in a new tab so the ranking page stays loaded.
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[-1])
        try:
            webtoon_info = get_webtoon_details(webtoon_url)
            webtoon_info["thumbnail"] = thumbnail
            webtoon_info_list.append(webtoon_info)
        finally:
            # Always close the detail tab and return to the ranking tab, even
            # when scraping raised, so no stray tab is left focused.
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

# Shut down the WebDriver.
driver.quit()

# Save the collected data as JSON.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(webtoon_info_list, f, ensure_ascii=False, indent=4)

print("총 수집된 웹툰 개수:", len(webtoon_info_list))
# Report the file that was actually written (the old message named a different file).
print(f"데이터가 {output_path} 파일로 저장되었습니다.")

