In [124]:
import sqlite3 
import pandas as pd
import numpy as np
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
import time
import requests
import re
from tqdm import tqdm


In [125]:
# Weekday codes in calendar order; each one is substituted into the
# `week=` query parameter of the weekday title-list request.
day = "mon tue wed thu fri sat sun".split()

# Headers sent with every HTTP request: a browser-like User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

In [126]:
import requests
from tqdm import tqdm

# Accumulators filled by the weekday title-list scrape below.
webtoon_list = []                # one record per title per weekday listing
writers_list = []                # titleId -> writer link rows
painters_list = []               # titleId -> painter link rows
novel_origin_authors_list = []   # titleId -> original-novel author link rows


def _credit_rows(title_id, people, id_key, role):
    """Build link rows that tie each credited person to a title.

    Args:
        title_id: Integer title id stored under "titleId" in every row.
        people: Iterable of dicts carrying "id" and "name"; None or an empty
            list yields no rows.
        id_key: Column name used for the person's id (e.g. "writerId").
        role: Label stored under "type" (e.g. "Writer").

    Returns:
        A list of dicts with keys "titleId", ``id_key``, "name" and "type",
        in that order.
    """
    return [
        {
            "titleId": title_id,
            id_key: int(person["id"]),
            "name": person["name"],
            "type": role,
        }
        for person in people or []
    ]


for day_code in tqdm(day):
    url = f'https://comic.naver.com/api/webtoon/titlelist/weekday?week={day_code}&order=user'

    try:
        # Without a timeout a single stalled connection blocks the cell forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Same best-effort policy as for HTTP errors: report and move on.
        print("에러:", exc)
        continue

    if response.status_code != 200:
        print("에러:", response.status_code)
        continue

    data = response.json()

    for webtoon in data["titleList"]:
        # Shared id that links the main record to the credit tables.
        t_id = int(webtoon["titleId"])

        # 1. Main title record. The flat "author" / "novelOriginAuthors"
        #    fields are left out; credits go to their own tables below.
        webtoon_list.append({
            "titleId": t_id,
            "titleName": webtoon["titleName"],
            "thumbnailUrl": webtoon["thumbnailUrl"],
            "up": webtoon["up"],
            "rest": webtoon["rest"],
            "bm": webtoon["bm"],
            "adult": webtoon["adult"],
            "starScore": float(webtoon["starScore"]),
            "viewCount": int(webtoon["viewCount"]),
            "openToday": webtoon["openToday"],
            "potenUp": webtoon["potenUp"],
            "bestChallengeLevelUp": webtoon["bestChallengeLevelUp"],
            "finish": webtoon["finish"],
            "new": webtoon["new"]
        })

        # 2-4. Credit tables; every row carries titleId so it can be joined
        #      back to the main record.
        writers_list.extend(
            _credit_rows(t_id, webtoon.get("writers"), "writerId", "Writer")
        )
        painters_list.extend(
            _credit_rows(t_id, webtoon.get("painters"), "painterId", "Painter")
        )
        novel_origin_authors_list.extend(
            _credit_rows(
                t_id, webtoon.get("novelOriginAuthors"), "originAuthorId", "Original"
            )
        )

print(f"웹툰 수집: {len(webtoon_list)}개")
print(f"글 작가 정보: {len(writers_list)}개")

100%|██████████| 7/7 [00:04<00:00,  1.63it/s]

웹툰 수집: 758개
글 작가 정보: 792개





In [87]:
# SQLAlchemy engine for the SQLite database file `mydatabase.db`.
db_url = "sqlite:///mydatabase.db"
engine = create_engine(db_url)

In [130]:
# Turn the scraped record lists into DataFrames.
#
# titleId is the key the credit tables join on, so the title table is
# de-duplicated on that column alone. A whole-row comparison can keep two
# copies of the same title when any other field differs between listings
# (presumably volatile fields such as the score -- TODO confirm).
webtoon_raw_df = pd.DataFrame(webtoon_list)
if "titleId" in webtoon_raw_df.columns:
    webtoon_df = webtoon_raw_df.drop_duplicates(subset="titleId")
else:
    # Nothing was scraped: keep the original empty-frame behaviour.
    webtoon_df = webtoon_raw_df.drop_duplicates()

# Link tables: a row is a duplicate only when every column matches.
writers_df = pd.DataFrame(writers_list).drop_duplicates()
painters_df = pd.DataFrame(painters_list).drop_duplicates()
novelOriginAuthors_df = pd.DataFrame(novel_origin_authors_list).drop_duplicates()

In [136]:
# Write each table to its own CSV file (paths are relative to the working
# directory); index=False leaves the DataFrame row labels out of the files.
csv_exports = {
    "naver_webtoon.csv": webtoon_df,
    "naver_writers.csv": writers_df,
    "naver_painters.csv": painters_df,
    "naver_novelOriginAuthors.csv": novelOriginAuthors_df,
}
for export_name, export_df in csv_exports.items():
    export_df.to_csv(export_name, index=False)

In [144]:
# Show the title table as this cell's output.
webtoon_df

Unnamed: 0,titleId,titleName,thumbnailUrl,up,rest,bm,adult,starScore,viewCount,openToday,potenUp,bestChallengeLevelUp,finish,new
0,839004,만남어플 중독,https://image-comic.pstatic.net/webtoon/839004...,False,False,False,True,9.80955,0,False,False,False,False,False
1,844058,신체,https://image-comic.pstatic.net/webtoon/844058...,False,False,False,True,9.83567,0,False,False,False,False,True
2,758037,참교육,https://image-comic.pstatic.net/webtoon/758037...,False,False,False,False,9.89173,0,False,False,False,False,False
3,822657,환생천마,https://image-comic.pstatic.net/webtoon/822657...,False,False,False,False,9.95018,0,False,False,False,False,False
4,796075,절대검감,https://image-comic.pstatic.net/webtoon/796075...,False,False,False,False,9.94959,0,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,837417,요괴삼월,https://image-comic.pstatic.net/webtoon/837417...,False,False,False,False,9.91174,0,False,False,False,False,False
754,822774,밤필드의 아이들 by DARK MOON,https://image-comic.pstatic.net/webtoon/822774...,False,False,False,False,9.84213,0,False,False,False,False,False
755,836948,우리의 공백,https://image-comic.pstatic.net/webtoon/836948...,False,False,False,False,9.81728,0,False,False,False,False,False
756,800101,헬스던전,https://image-comic.pstatic.net/webtoon/800101...,False,False,False,False,9.72215,0,False,False,False,False,False


In [145]:
# Show the title-to-writer link table as this cell's output.
writers_df

Unnamed: 0,titleId,writerId,name,type
0,839004,332797,루즌아,Writer
1,844058,355269,엄세윤,Writer
2,758037,301243,채용택,Writer
3,822657,348256,JP,Writer
4,796075,356299,김두루미,Writer
...,...,...,...,...
787,837417,282041,이윤희,Writer
788,822774,352114,HYBE,Writer
789,836948,387060,조예빈,Writer
790,800101,358218,도베도베,Writer


In [146]:
# Show the title-to-painter link table as this cell's output.
painters_df

Unnamed: 0,titleId,painterId,name,type
0,839004,332797,루즌아,Painter
1,844058,344285,정썸머,Painter
2,758037,314484,한가람,Painter
3,822657,323121,부겸,Painter
4,796075,356300,티아이,Painter
...,...,...,...,...
770,837417,282041,이윤희,Painter
771,822774,352114,HYBE,Painter
772,836948,387060,조예빈,Painter
773,800101,358219,채종,Painter


In [147]:
# Show the title-to-original-author link table as this cell's output.
novelOriginAuthors_df

Unnamed: 0,titleId,originAuthorId,name,type
0,822657,376586,장영훈,Original
1,796075,347598,한중월야,Original
2,832703,383659,청시소,Original
3,821195,375336,커피라임,Original
4,833702,376063,에르훗,Original
...,...,...,...,...
211,809622,366492,유예랑,Original
212,837643,352277,carbo(도효원),Original
213,835066,385561,혜녹,Original
214,842363,359027,람글,Original


In [129]:
# Store the title table in the database behind `engine`, replacing any
# existing "mytable". index=False keeps the DataFrame's row labels (left over
# from de-duplication) out of the table, consistent with the CSV exports.
webtoon_df.to_sql("mytable", con=engine, if_exists="replace", index=False)

726

In [None]:
import requests
from tqdm import tqdm
import pandas as pd

# 1. Accumulators for the per-title detail scrape; each becomes one table.
detail_list = []        # synopsis and other basic info (one row per title)
genre_list = []         # genres (one row per title/genre pair)
keyword_tag_list = []   # plain-text tags taken from gfpAdCustomParam["tags"]
weekday_list = []       # publication weekdays (one row per title/day pair)
curation_tag_list = []  # curation tag details (one row per title/tag pair)

# 2. Request each title only once, even if webtoon_df repeats an id.
unique_ids = webtoon_df['titleId'].unique()

# 3. One shared session reuses connections across requests; the `with` block
#    closes it when the loop finishes.
with requests.Session() as session:
    session.headers.update(headers)

    for title_id in tqdm(unique_ids):
        url = f'https://comic.naver.com/api/article/list/info?titleId={title_id}'

        try:
            # Without a timeout one stalled connection blocks the whole run.
            response = session.get(url, timeout=10)

            if response.status_code != 200:
                print(f"ID {title_id} 에러: {response.status_code}")
                continue

            data = response.json()
            t_id = int(data["titleId"])

            # [1] Main detail record (synopsis defaults to "" when absent).
            detail_list.append({
                "titleId": t_id,
                "synopsis": data.get("synopsis", ""),
            })

            # gfpAdCustomParam may be absent or null; `or {}` covers both, so
            # the lookups below never run against None.
            gfp_data = data.get("gfpAdCustomParam") or {}

            # [2] Genres: one row per entry, e.g. ['DRAMA', 'ROMANCE'].
            for genre in gfp_data.get("genreTypes") or []:
                genre_list.append({
                    "titleId": t_id,
                    "genre": genre
                })

            # [3] Plain-text tags: one row per entry.
            for tag in gfp_data.get("tags") or []:
                keyword_tag_list.append({
                    "titleId": t_id,
                    "tag": tag
                })

            # [4] Weekdays: one row per entry. The loop variable must not be
            #     called `day`: that name holds the module-level weekday-code
            #     list used by the title-list scraper.
            for weekday in gfp_data.get("weekdays") or []:
                weekday_list.append({
                    "titleId": t_id,
                    "day": weekday
                })

            # [5] Curation tags: titleId is added so rows can be joined back.
            for tag_obj in data.get("curationTagList") or []:
                curation_tag_list.append({
                    "titleId": t_id,
                    "tagId": tag_obj.get("id"),
                    "tagName": tag_obj.get("tagName"),
                    "urlPath": tag_obj.get("urlPath"),
                    "curationType": tag_obj.get("curationType")
                })

        except Exception as e:
            # Deliberately best-effort: report the failing id and keep going.
            print(f"ID {title_id} 접속 중 예외 발생: {e}")

# --- Convert the collected rows to DataFrames and export them ---
df_detail = pd.DataFrame(detail_list)
df_genre = pd.DataFrame(genre_list)
df_keyword = pd.DataFrame(keyword_tag_list)
df_weekday = pd.DataFrame(weekday_list)
df_curation = pd.DataFrame(curation_tag_list)

df_detail.to_csv("naver_detail.csv", index=False)
df_genre.to_csv("naver_genre.csv", index=False)
df_keyword.to_csv("naver_keyword.csv", index=False)
df_weekday.to_csv("naver_weekday.csv", index=False)
df_curation.to_csv("naver_curation.csv", index=False)

print("수집 완료!")
print(f"상세정보: {len(df_detail)}개")
print(f"장르정보: {len(df_genre)}개")
print(f"태그정보: {len(df_keyword)}개")

100%|██████████| 725/725 [00:26<00:00, 27.83it/s]

수집 완료!
상세정보: 725개
장르정보: 725개
태그정보: 6531개





In [150]:
# Show the per-title synopsis table as this cell's output.
df_detail

Unnamed: 0,titleId,synopsis
0,839004,20살 비인기 스트리머 초롱. 어플로 만난 정체불명의 남자와의 관계 속에서 주변 사...
1,844058,"사채업자, 재벌, 꿈에 그리던 첫사랑까지... 모두가 내 몸을 원한다. 대체 왜?"
2,758037,무너진 교권을 지키기 위해 교권보호국 소속 나화진의 참교육이 시작된다!\n<부활남>...
3,822657,"철혈의 맹주, 강호의 절대자 '천하진'. 가문의 수치라 불리는 망나니 '벽리단'의 ..."
4,796075,단전이 부숴졌다는 이유로 집에서는 내놓은 자식 취급을 받던 소운휘는 혈교에 납치되어...
...,...,...
720,837417,전 태권도 국가대표 선수 이은비. 슬럼프에 빠져 운동을 그만두고 그런대로 살아가던 ...
721,822774,밤필드 하우스 보육원에서 원장 마지의 보호를 받으며 살고 있던 일곱 소년들. \n그...
722,836948,누군가 떠나는것이 두려워 회피형으로 자란 '희진'\n자신과 정반대의 모습이 된 소꿉...
723,800101,"인류 최강의 헬스남 한솔은 데드리프트 도중 정신을 잃었는데, 눈을 떠보니 이세계에 ..."


In [151]:
# Show the title-to-genre table as this cell's output.
df_genre

Unnamed: 0,titleId,genre
0,839004,DRAMA
1,844058,THRILL
2,758037,ACTION
3,822657,HISTORICAL
4,796075,HISTORICAL
...,...,...
720,837417,FANTASY
721,822774,FANTASY
722,836948,PURE
723,800101,ACTION


In [152]:
# Show the title-to-keyword-tag table as this cell's output.
df_keyword

Unnamed: 0,titleId,tag
0,839004,컷툰
1,839004,나쁜남자
2,839004,치명적인
3,839004,현대
4,839004,평범녀
...,...,...
6526,837993,소꿉친구
6527,837993,액션판타지
6528,837993,액션아포칼립스
6529,837993,성장드라마


In [153]:
# Show the title-to-weekday table as this cell's output.
df_weekday

Unnamed: 0,titleId,day
0,839004,월
1,839004,목
2,844058,월
3,758037,월
4,822657,월
...,...,...
753,837417,일
754,822774,일
755,836948,일
756,800101,일


In [154]:
# Show the title-to-curation-tag table as this cell's output.
df_curation

Unnamed: 0,titleId,tagId,tagName,urlPath,curationType
0,839004,839004,드라마,/webtoon?tab=genre&genre=DRAMA,GENRE_DRAMA
1,839004,401,컷툰,/curation/list?type=CUSTOM_TAG&id=401,CUSTOM_TAG
2,839004,380,고자극스릴러,/curation/list?type=CUSTOM_TAG&id=380,CUSTOM_TAG
3,839004,375,고자극드라마,/curation/list?type=CUSTOM_TAG&id=375,CUSTOM_TAG
4,839004,362,자극적인,/curation/list?type=CUSTOM_TAG&id=362,CUSTOM_TAG
...,...,...,...,...,...
6695,837993,260,드라마,/curation/list?type=CUSTOM_TAG&id=260,CUSTOM_TAG
6696,837993,185,소꿉친구,/curation/list?type=CUSTOM_TAG&id=185,CUSTOM_TAG
6697,837993,122,아포칼립스,/curation/list?type=CUSTOM_TAG&id=122,CUSTOM_TAG
6698,837993,65,성장드라마,/curation/list?type=CUSTOM_TAG&id=65,CUSTOM_TAG


In [50]:
# 완결 웹툰 (completed webtoons) -- disabled draft: the scraper below is entirely commented out.

# idx = 1
# finished_webtoon_dict = []
# finished_writers_dict = []
# finished_painters_dict = []
# finished_novelOriginAuthors_dict = []

# while True:
#     url = f'https://comic.naver.com/api/webtoon/titlelist/finished?page={idx}&order=UPDATE'
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0 Safari/537.36'
#     }

#     response = requests.get(url, headers=headers)

#     # 3. 데이터 확인
#     if response.status_code == 200:
#         data = response.json() # .text가 아니라 .json()으로 받습니다
#         for i in data["titleList"]:
#             novelOriginAuthors = None
#             for j in i["novelOriginAuthors"]:
#                 novelOriginAuthors = int(j.get("id"))
#             finished_webtoon_dict.append({
#                 "titleId": int(i["titleId"]),
#                 "titleName":i["titleName"],
#                 "author":i["author"],
#                 "novelOriginAuthors":novelOriginAuthors,
#                 "thumbnailUrl":i["thumbnailUrl"],
#                 "up":i["up"],
#                 "rest":i["rest"],
#                 "bm":i["bm"],
#                 "adult":i["adult"],
#                 "starScore":float(i["starScore"]),
#                 "viewCount":int(i["viewCount"]),
#                 "openToday":i["openToday"],
#                 "potenUp":i["potenUp"],
#                 "bestChallengeLevelUp":i["bestChallengeLevelUp"],
#                 "finish":i["finish"],
#                 "new":i["new"]
#             })
#             for j in i["writers"]:
#                 finished_writers_dict.append({
#                     "id": int(j["id"]),
#                     "name": j["name"]
#                 })
#             for j in i["painters"]:
#                 finished_painters_dict.append({
#                     "id": int(j["id"]),
#                     "name": j["name"]
#                 })
#             for j in i["novelOriginAuthors"]:
#                 finished_novelOriginAuthors_dict.append({
#                     "id": int(j["id"]),
#                     "name": j["name"]
#                 })
            
#     else:
#         print("에러:", response.status_code)
#         break
#     idx += 1