- 의존성 관리

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
from datetime import datetime
from typing import Dict, List, Optional

from urllib.parse import unquote # URL 디코딩 -> ')' 의 경우 %29로 인코딩 되어있어서 href로 가져올 때 디코딩 필요
from urllib.parse import quote # URL 인코딩 -> ' '의 경우 %20으로 인코딩 필요

## Red Velvet

### Fandom API 사용하기

In [5]:
# api url
url = "https://kpop.fandom.com/api.php"

In [8]:
# api  parameters
params = {
    "action": "parse",
    "page": "Red_Velvet_-_Irene_&_Seulgi",  # 페이지명
    "format": "json"
}

In [9]:
res = requests.get(url, params=params)
data = res.json()

# 파싱된 HTML 가져오기
html = data["parse"]["text"]["*"]
soup = BeautifulSoup(html, "html.parser")

- group_name

In [46]:
# group name
# data-source="hangul" 부분 추출
group_name = soup.select_one('[data-source="hangul"] .pi-data-value')
if group_name:
    print(group_name.get_text(strip=True)) 
else:
    print("값을 찾을 수 없음")

에스파


- group_debut_date

In [47]:
# group_debut_date

# debut 블록 찾기
debut_block = soup.select_one('[data-source="debut"] .pi-data-value')

if debut_block:
    debut_parts = debut_block.get_text("||", strip=True).split("||")
    first_debut = debut_parts[0]
    first_debut_date_str = first_debut.split("(")[0].strip()
    
    # datetime 변환
    debut_date = datetime.strptime(first_debut_date_str, "%B %d, %Y")
    print(debut_date)           # 2014-08-01 00:00:00
    print(debut_date.date())    # 2014-08-01
else:
    print("Debut 정보 없음")

2020-11-17 00:00:00
2020-11-17


- entertainment

In [48]:
# enter_name
label_block = soup.select_one('[data-source="label"] .pi-data-value')

if label_block:
    labels = label_block.get_text("||", strip=True).split("||") # "||": 태그 사이 구분자로 "||" 문자열을 넣으라는 의미
    # <br> 같은 줄바꿈이나 여러 하위 태그가 있을 때, 각각의 텍스트를 "||"로 이어붙입니다.
    first_label = labels[0].strip() # 엔터테인먼트의 이름 추출
    print(first_label) 
    
    # label_block 안에서 <a> 태그 중  텍스트가 first_label과 같은 것을 찾기
    a_tag = label_block.find('a', string= lambda text: text and text.strip() == first_label)
    
    if a_tag and a_tag.has_attr('href'):
        href = a_tag['href'] # 엔터테인먼트의 링크 추출
        print(f"Href: https://kpop.fandom.com{href}")
    else :
        print("Href 정보 없음")
        
else :
    print("Label 정보 없음")

KR:
Href 정보 없음


- member

In [None]:
# member
members_current = soup.select_one('[data-source="current"] .pi-data-value > ul')   # 현재 활동 멤버
members_inactive = soup.select_one('[data-source="inactive"] .pi-data-value > ul')   # 비활동기 멤버 (ex:군백기) , 탈퇴의 경우 Former로 분류됨
members_former = soup.select_one('[data-source="former"] .pi-data-value > ul')   # 탈퇴 멤버

members = []
if members_current:
    for a in members_current.select('a'):
        name = a.get_text(strip=True)
        href = a.get("href")
        # 1. 전체 href를 다시 안전하게 인코딩 (/, :, _, () 등은 유지)
        # safe_href = quote(href, safe="/:()_") 
        # 2. URL 디코딩해서 괄호 문제 해결
        # href = unquote(href) 
        link = f"https://kpop.fandom.com{href}"
        members.append({"name": name, "href": link})

if members_inactive:
    for a in members_inactive.select('a'):
        name = a.get_text(strip=True)
        href = a.get("href") 
        link = f"https://kpop.fandom.com{href}"
        members.append({"name": name, "href": link})
        
print(members)


[{'name': 'Irene', 'href': 'https://kpop.fandom.com/wiki/Irene_(Red_Velvet)'}, {'name': 'Seulgi', 'href': 'https://kpop.fandom.com/wiki/Seulgi_(Red_Velvet)'}, {'name': 'Wendy', 'href': 'https://kpop.fandom.com/wiki/Wendy'}, {'name': 'Joy', 'href': 'https://kpop.fandom.com/wiki/Joy_(Red_Velvet)'}, {'name': 'Yeri', 'href': 'https://kpop.fandom.com/wiki/Yeri_(Red_Velvet)'}]


- Fandom

In [24]:
# fandom
fandom_block = soup.select_one('[data-source="fandom"] .pi-data-value')

if fandom_block:
    fandom_name = fandom_block.get_text(strip=True)
    print(fandom_name)
else:
    print("Fandom 정보 없음")

ReVeluv (레베럽)


- sns
    - <b>KR:</b>, <b>JP:</b>, <b>CH:</b> 같은 국가 구분자가 있을 수도 있고
	- 아예 <b> 태그가 없을 수도 있음 (→ 이 경우 기본값 “KR”). => 한국 sns만 생성된 경우, 해당 사이트에서 구분자 없음
    - <a> 태그 안에 있는 href를 모으기
    - <img> 태그의 data-image-name 같은 속성에서 SNS 플랫폼명 (Instagram, TikTok, YouTube 등) 을 뽑기

In [25]:
# sns
sns_block = soup.select_one('[data-source="sns"]')
sns_data = {}
current_country = "KR"  # 기본값 설정

if sns_block:
    for elem in sns_block.children: # elem은 <b>, <a>, NavigableString 등 다양한 타입이 될 수 있음. 자식태그 돌기
        # 국가 구분 (b 태그 발견 시 변경)
        if elem.name == 'b':
            country_text = elem.get_text(strip=True).rstrip(':')  # 'KR:', 'JP:' 등에서 ':' 제거
            current_country = country_text  # 현재 국가 설정
            if current_country not in sns_data:
                sns_data[current_country] = []  # 새로운 국가면 리스트 초기화
                
        # SNS 링크 (a 태그 발견 시 추가)
        if elem.name == 'span':
            a = elem.find('a')
            img = elem.find('img')
            if a and img:
                href = a.get('href')
                platform = img.get('data-image-name', "").replace(" Icon.png", "")  # "Instagram Icon.png" -> "Instagram"
                if current_country not in sns_data:
                    sns_data[current_country] = []
                sns_data[current_country].append({"platform": platform, "href": href})

print(sns_data)

{'KR': [{'platform': 'Daum Cafe', 'href': 'http://cafe.daum.net/SM004'}, {'platform': 'Facebook', 'href': 'https://www.facebook.com/RedVelvet'}, {'platform': 'Instagram', 'href': 'https://www.instagram.com/redvelvet.smtown/'}, {'platform': 'TikTok', 'href': 'https://www.tiktok.com/@redvelvet_smtown'}, {'platform': 'X', 'href': 'https://twitter.com/RVsmtown'}, {'platform': 'Weverse', 'href': 'https://www.weverse.io/redvelvet'}, {'platform': 'YouTube', 'href': 'https://www.youtube.com/channel/UCk9GmdlDTBfgGRb7vXeRMoQ'}], 'JP': [{'platform': 'Instagram', 'href': 'https://www.instagram.com/redvelvet_jp/'}, {'platform': 'X', 'href': 'https://twitter.com/Red_VelvetJP'}], 'CH': [{'platform': 'Weibo', 'href': 'https://www.weibo.com/RedVelvetofficial'}]}


In [8]:
# group_type

---

### 파라미터로 모든 데이터 한번에 가져와서 csv로 내보내기!!!

In [2]:
class KpopGroupCrawler:
    def __init__(self):
        self.base_url = "https://kpop.fandom.com/api.php"
        self.base_wiki_url = "https://kpop.fandom.com"
    
    def get_group_info(self, group_name: str) -> Dict:
        """
        특정 K-pop 그룹의 정보를 크롤링합니다.
        
        Args:
            group_name (str): 그룹명 (예: "Red_Velvet", "BTS", "BLACKPINK")
        
        Returns:
            Dict: 그룹 정보가 담긴 딕셔너리
        """
        params = {
            "action": "parse",
            "page": group_name,
            "format": "json"
        }
        
        try:
            res = requests.get(self.base_url, params=params)
            res.raise_for_status()
            data = res.json()
            
            if 'parse' not in data:
                print(f"페이지를 찾을 수 없습니다: {group_name}")
                return {}
            
            html = data["parse"]["text"]["*"]
            soup = BeautifulSoup(html, "html.parser")
            
            group_info = {
                'group_name_en': group_name.replace('_', ' '),
                'group_name_hangul': self._get_hangul_name(soup),
                'debut_date': self._get_debut_date(soup),
                'entertainment': self._get_entertainment(soup),
                'entertainment_link': self._get_entertainment_link(soup),
                'members': self._get_members(soup),
                'fandom_name': self._get_fandom_name(soup),
                'sns_links': self._get_sns_links(soup)
            }
            
            return group_info
            
        except requests.RequestException as e:
            print(f"네트워크 오류: {e}")
            return {}
        except Exception as e:
            print(f"크롤링 오류: {e}")
            return {}
    
    def _get_hangul_name(self, soup) -> str:
        """한글 그룹명 추출"""
        group_name = soup.select_one('[data-source="hangul"] .pi-data-value')
        return group_name.get_text(strip=True) if group_name else ""
    
    def _get_debut_date(self, soup) -> str:
        """데뷔 날짜 추출"""
        debut_block = soup.select_one('[data-source="debut"] .pi-data-value')
        if debut_block:
            debut_parts = debut_block.get_text("||", strip=True).split("||")
            first_debut = debut_parts[0]
            first_debut_date_str = first_debut.split("(")[0].strip()
            
            try:
                debut_date = datetime.strptime(first_debut_date_str, "%B %d, %Y")
                return debut_date.strftime("%Y-%m-%d")
            except ValueError:
                return first_debut_date_str
        return ""
    
    
    def _get_entertainment(self, soup) -> str:
        """소속사명 추출 - 첫 번째 엔터테인먼트 회사"""
        label_block = soup.select_one('[data-source="label"] .pi-data-value')
        if not label_block:
            return ""
        
        # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그나 텍스트 찾기
        first_b = label_block.find('b')
        if first_b:
            # b 태그 다음의 첫 번째 a 태그 찾기
            a_tag = first_b.find_next('a')
            if a_tag:
                return a_tag.get_text(strip=True)
        
        # b 태그가 없으면 기존 방식
        labels = label_block.get_text("||", strip=True).split("||")
        return labels[0].strip() if labels else ""
    
    def _get_entertainment_link(self, soup) -> str:
        """소속사 링크 추출 - 첫 번째 엔터테인먼트 회사 링크"""
        label_block = soup.select_one('[data-source="label"] .pi-data-value')
        if not label_block:
            return ""
        
        # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그 찾기
        first_b = label_block.find('b')
        if first_b:
            a_tag = first_b.find_next('a', href=True)
            if a_tag:
                href = a_tag['href']
                return href if href.startswith('http') else f"{self.base_wiki_url}{href}"
        
        # b 태그가 없으면 기존 방식
        labels = label_block.get_text("||", strip=True).split("||")
        if labels:
            first_label = labels[0].strip()
            a_tag = label_block.find('a', string=lambda text: text and text.strip() == first_label)
            if a_tag and a_tag.has_attr('href'):
                href = a_tag['href']
                return href if href.startswith('http') else f"{self.base_wiki_url}{href}"
        
        return ""
    
    def _get_members(self, soup) -> List[Dict]:
        """멤버 정보 추출"""
        members = []
        
        # 현재 활동 멤버
        members_current = soup.select_one('[data-source="current"] .pi-data-value > ul')
        if members_current:
            for a in members_current.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "current"})
        
        # 비활동 멤버
        members_inactive = soup.select_one('[data-source="inactive"] .pi-data-value > ul')
        if members_inactive:
            for a in members_inactive.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "inactive"})
        
        # 탈퇴 멤버
        members_former = soup.select_one('[data-source="former"] .pi-data-value > ul')
        if members_former:
            for a in members_former.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "former"})
        
        return members
    
    def _get_fandom_name(self, soup) -> str:
        """팬덤명 추출"""
        fandom_block = soup.select_one('[data-source="fandom"] .pi-data-value')
        return fandom_block.get_text(strip=True) if fandom_block else ""
    
    def _get_sns_links(self, soup) -> Dict:
        """SNS 링크 추출"""
        sns_block = soup.select_one('[data-source="sns"]')
        sns_data = {}
        current_country = "KR"
        
        if sns_block:
            for elem in sns_block.children:
                if elem.name == 'b':
                    country_text = elem.get_text(strip=True).rstrip(':')
                    current_country = country_text
                    if current_country not in sns_data:
                        sns_data[current_country] = []
                
                if elem.name == 'span':
                    a = elem.find('a')
                    img = elem.find('img')
                    if a and img:
                        href = a.get('href')
                        platform = img.get('data-image-name', "").replace(" Icon.png", "")
                        if current_country not in sns_data:
                            sns_data[current_country] = []
                        sns_data[current_country].append({"platform": platform, "href": href})
        
        return sns_data
    
    def create_csv_from_groups(self, group_names: List[str], output_filename: str = "kpop_groups_info.csv"):
        """
        여러 그룹의 정보를 크롤링하여 CSV 파일로 저장합니다.
        
        Args:
            group_names (List[str]): 크롤링할 그룹명 리스트
            output_filename (str): 출력할 CSV 파일명
        """
        all_data = []
        
        for group_name in group_names:
            print(f"크롤링 중: {group_name}")
            group_info = self.get_group_info(group_name)
            
            # 크롤링 실패 시에도 null 값으로 데이터 추가
            if not group_info:
                print(f"정보를 가져올 수 없습니다: {group_name}")
                group_info = {
                    'group_name_en': group_name.replace('_', ' '),
                    'group_name_hangul': None,
                    'debut_date': None,
                    'entertainment': None,
                    'entertainment_link': None,
                    'members': [],
                    'fandom_name': None,
                    'sns_links': {},
                    'update_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
            
            # 멤버를 상태별로 분리
            current_members = []
            inactive_members = []
            former_members = []
            
            for member in group_info['members']:
                if member['status'] == 'current':
                    current_members.append(member['name'])
                elif member['status'] == 'inactive':
                    inactive_members.append(member['name'])
                elif member['status'] == 'former':
                    former_members.append(member['name'])
            
            # 멤버 정보를 문자열로 변환 (null 처리)
            current_members_str = "; ".join(current_members) if current_members else None
            inactive_members_str = "; ".join(inactive_members) if inactive_members else None
            former_members_str = "; ".join(former_members) if former_members else None
            
            # SNS 링크를 문자열로 변환 (null 처리)
            sns_str = json.dumps(group_info['sns_links'], ensure_ascii=False) if group_info['sns_links'] else None
            
            row_data = {
                'group_name_en': group_info['group_name_en'],
                'group_name_kr': group_info['group_name_hangul'] or None,
                'debut_date': group_info['debut_date'] or None,
                'entertainment_name': group_info['entertainment'] or None,
                'entertainment_link': group_info['entertainment_link'] or None,
                'member_current': current_members_str,
                'member_inactive': inactive_members_str,
                'member_former': former_members_str,
                'fandom_name': group_info['fandom_name'] or None,
                'sns': sns_str,
                'update_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
            
            all_data.append(row_data)
        
        # 데이터가 있든 없든 항상 DataFrame 생성
        df = pd.DataFrame(all_data)
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"CSV 파일이 생성되었습니다: {output_filename}")
        return df

In [2]:
class KpopGroupCrawler_byType:
    def __init__(self):
        self.base_url = "https://kpop.fandom.com/api.php"
        self.base_wiki_url = "https://kpop.fandom.com"
    
    def get_group_info(self, group_name: str) -> Dict:
        """
        특정 K-pop 그룹의 정보를 크롤링합니다.
        
        Args:
            group_name (str): 그룹명 (예: "Red_Velvet", "BTS", "BLACKPINK")
        
        Returns:
            Dict: 그룹 정보가 담긴 딕셔너리
        """
        params = {
            "action": "parse",
            "page": group_name,
            "format": "json"
        }
        
        try:
            res = requests.get(self.base_url, params=params)
            res.raise_for_status()
            data = res.json()
            
            if 'parse' not in data:
                print(f"페이지를 찾을 수 없습니다: {group_name}")
                return {}
            
            html = data["parse"]["text"]["*"]
            soup = BeautifulSoup(html, "html.parser")
            
            group_info = {
                'group_name_en': group_name.replace('_', ' '),
                'group_name_hangul': self._get_hangul_name(soup),
                'debut_date': self._get_debut_date(soup),
                'entertainment': self._get_entertainment(soup),
                'entertainment_link': self._get_entertainment_link(soup),
                'members': self._get_members(soup),
                'fandom_name': self._get_fandom_name(soup),
                'sns_links': self._get_sns_links(soup)
            }
            
            return group_info
            
        except requests.RequestException as e:
            print(f"네트워크 오류: {e}")
            return {}
        except Exception as e:
            print(f"크롤링 오류: {e}")
            return {}
    
    def _get_hangul_name(self, soup) -> str:
        """한글 그룹명 추출"""
        group_name = soup.select_one('[data-source="hangul"] .pi-data-value')
        return group_name.get_text(strip=True) if group_name else ""
    
    def _get_debut_date(self, soup) -> str:
        """데뷔 날짜 추출"""
        debut_block = soup.select_one('[data-source="debut"] .pi-data-value')
        if debut_block:
            debut_parts = debut_block.get_text("||", strip=True).split("||")
            first_debut = debut_parts[0]
            first_debut_date_str = first_debut.split("(")[0].strip()
            
            try:
                debut_date = datetime.strptime(first_debut_date_str, "%B %d, %Y")
                return debut_date.strftime("%Y-%m-%d")
            except ValueError:
                return first_debut_date_str
        return ""
    
    def _get_entertainment(self, soup) -> str:
        """소속사명 추출 - 첫 번째 엔터테인먼트 회사"""
        label_block = soup.select_one('[data-source="label"] .pi-data-value')
        if not label_block:
            return ""
        
        # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그나 텍스트 찾기
        first_b = label_block.find('b')
        if first_b:
            # b 태그 다음의 첫 번째 a 태그 찾기
            a_tag = first_b.find_next('a')
            if a_tag:
                return a_tag.get_text(strip=True)
        
        # b 태그가 없으면 기존 방식
        labels = label_block.get_text("||", strip=True).split("||")
        return labels[0].strip() if labels else ""
    
    def _get_entertainment_link(self, soup) -> str:
        """소속사 링크 추출 - 첫 번째 엔터테인먼트 회사 링크"""
        label_block = soup.select_one('[data-source="label"] .pi-data-value')
        if not label_block:
            return ""
        
        # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그 찾기
        first_b = label_block.find('b')
        if first_b:
            a_tag = first_b.find_next('a', href=True)
            if a_tag:
                href = a_tag['href']
                return href if href.startswith('http') else f"{self.base_wiki_url}{href}"
        
        # b 태그가 없으면 기존 방식
        labels = label_block.get_text("||", strip=True).split("||")
        if labels:
            first_label = labels[0].strip()
            a_tag = label_block.find('a', string=lambda text: text and text.strip() == first_label)
            if a_tag and a_tag.has_attr('href'):
                href = a_tag['href']
                return href if href.startswith('http') else f"{self.base_wiki_url}{href}"
        
        return ""
    
    def _get_members(self, soup) -> List[Dict]:
        """멤버 정보 추출"""
        members = []
        
        # 현재 활동 멤버
        members_current = soup.select_one('[data-source="current"] .pi-data-value > ul')
        if members_current:
            for a in members_current.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "current"})
        
        # 비활동 멤버
        members_inactive = soup.select_one('[data-source="inactive"] .pi-data-value > ul')
        if members_inactive:
            for a in members_inactive.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "inactive"})
        
        # 탈퇴 멤버
        members_former = soup.select_one('[data-source="former"] .pi-data-value > ul')
        if members_former:
            for a in members_former.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "former"})
        
        return members
    
    def _get_fandom_name(self, soup) -> str:
        """팬덤명 추출"""
        fandom_block = soup.select_one('[data-source="fandom"] .pi-data-value')
        return fandom_block.get_text(strip=True) if fandom_block else ""
    
    def _get_sns_links(self, soup) -> Dict:
        """SNS 링크 추출"""
        sns_block = soup.select_one('[data-source="sns"]')
        sns_data = {}
        current_country = "KR"
        
        if sns_block:
            for elem in sns_block.children:
                if elem.name == 'b':
                    country_text = elem.get_text(strip=True).rstrip(':')
                    current_country = country_text
                    if current_country not in sns_data:
                        sns_data[current_country] = []
                
                if elem.name == 'span':
                    a = elem.find('a')
                    img = elem.find('img')
                    if a and img:
                        href = a.get('href')
                        platform = img.get('data-image-name', "").replace(" Icon.png", "")
                        if current_country not in sns_data:
                            sns_data[current_country] = []
                        sns_data[current_country].append({"platform": platform, "href": href})
        
        return sns_data
    
    def create_csv_from_groups(self, group_names: List[str], output_filename: str = "kpop_groups_info.csv", group_type: str = "Unknown"):
        """
        여러 그룹의 정보를 크롤링하여 CSV 파일로 저장합니다.
        
        Args:
            group_names (List[str]): 크롤링할 그룹명 리스트
            output_filename (str): 출력할 CSV 파일명
            group_type (str): 그룹 타입 (예: "Girl Group", "Boy Group", "Co-ed", "Solo" 등)
        """
        all_data = []
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # 현재 시간
        
        for group_name in group_names:
            print(f"크롤링 중: {group_name}")
            group_info = self.get_group_info(group_name)
            
            # 크롤링 실패 시에도 null 값으로 데이터 추가
            if not group_info:
                print(f"정보를 가져올 수 없습니다: {group_name}")
                group_info = {
                    'group_name_en': group_name.replace('_', ' '),
                    'group_name_hangul': None,
                    'debut_date': None,
                    'entertainment': None,
                    'entertainment_link': None,
                    'members': [],
                    'fandom_name': None,
                    'sns_links': {}
                }
            
            # 멤버를 상태별로 분리
            current_members = []
            inactive_members = []
            former_members = []
            
            for member in group_info['members']:
                if member['status'] == 'current':
                    current_members.append(member['name'])
                elif member['status'] == 'inactive':
                    inactive_members.append(member['name'])
                elif member['status'] == 'former':
                    former_members.append(member['name'])
            
            # 멤버 정보를 문자열로 변환 (null 처리)
            current_members_str = "; ".join(current_members) if current_members else None
            inactive_members_str = "; ".join(inactive_members) if inactive_members else None
            former_members_str = "; ".join(former_members) if former_members else None
            
            # SNS 링크를 문자열로 변환 (null 처리)
            sns_str = json.dumps(group_info['sns_links'], ensure_ascii=False) if group_info['sns_links'] else None
            
            row_data = {
                'group_name_en': group_info['group_name_en'],
                'group_name_kr': group_info['group_name_hangul'] or None,
                'debut_date': group_info['debut_date'] or None,
                'entertainment_name': group_info['entertainment'] or None,
                'entertainment_link': group_info['entertainment_link'] or None,
                'member_current': current_members_str,
                'member_inactive': inactive_members_str,
                'member_former': former_members_str,
                'fandom_name': group_info['fandom_name'] or None,
                'sns': sns_str,
                'group_type': group_type,
                'update_at': current_time
            }
            
            all_data.append(row_data)
        
        # 데이터가 있든 없든 항상 DataFrame 생성
        df = pd.DataFrame(all_data)
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"CSV 파일이 생성되었습니다: {output_filename}")
        return df

In [None]:
# # 사용 예제
# def main():
#     crawler = KpopGroupCrawler()
    
#     # 단일 그룹 정보 조회
#     group_info = crawler.get_group_info("Red_Velvet")
#     print("Red Velvet 정보:")
#     print(json.dumps(group_info, indent=2, ensure_ascii=False))
    
#     # 여러 그룹 정보를 CSV로 저장 (존재하지 않는 그룹도 null 값으로 포함)
#     group_list = ["Red_Velvet", "BLACKPINK", "BTS", "TWICE", "NonExistentGroup"]
#     df = crawler.create_csv_from_groups(group_list, "kpop_groups_data.csv")
    
#     if not df.empty:
#         print("\n생성된 CSV 데이터:")
#         print(df.head())

# if __name__ == "__main__":
#     main()

In [None]:
# # 간단한 사용법
# # 1. 클래스 인스턴스 생성 (괄호 필수!!!)
# crawler = KpopGroupCrawler()

# # 2. 원하는 그룹 리스트 작성
# group_list = ["Red_Velvet", "BLACKPINK", "BTS", "TWICE"]

# # 3. CSV 파일 생성 (끝!) -> 첫 번째 매개변수는 꼭 리스트!
# crawler.create_csv_from_groups(group_list, "my_kpop_data.csv")

In [39]:
KpopGroupCrawler().create_csv_from_groups(["SEVENTEEN", "Red_Velvet", "DAY6"], "kpop_data.csv")

크롤링 중: SEVENTEEN
크롤링 중: Red_Velvet
크롤링 중: DAY6
CSV 파일이 생성되었습니다: kpop_data.csv


Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name,sns,group_type,update_at
0,SEVENTEEN,세븐틴,2015-05-26,Pledis Entertainment,https://kpop.fandom.com/wiki/Pledis_Entertainment,S.Coups; Joshua; Jun; DK; Mingyu; The8; Seungk...,Jeonghan; Hoshi; Wonwoo; Woozi,,CARAT (캐럿),"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Unknown,2025-09-18 14:12:09
1,Red Velvet,레드벨벳,2014-08-01,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Irene; Seulgi; Wendy; Joy; Yeri,,,ReVeluv (레베럽),"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Unknown,2025-09-18 14:12:09
2,DAY6,데이식스,2015-09-07,JYP Entertainment,https://kpop.fandom.com/wiki/JYP_Entertainment,Sungjin; Young K; Wonpil; Dowoon,,Jae; Junhyeok,My Day (마이 데이),"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",Unknown,2025-09-18 14:12:09


In [None]:
# crawler = KpopGroupCrawler_byType()

# # 걸그룹 데이터
# girl_groups = ["Red_Velvet", "BLACKPINK", "TWICE"]
# crawler.create_csv_from_groups(girl_groups, "girl_groups.csv", "Girl Group")

# # 보이그룹 데이터
# boy_groups = ["BTS", "Stray_Kids", "SEVENTEEN"]
# crawler.create_csv_from_groups(boy_groups, "boy_groups.csv", "Boy Group")

# # 솔로 아티스트
# solo_artists = ["IU", "G-Dragon"]
# crawler.create_csv_from_groups(solo_artists, "solo_artists.csv", "Solo")

In [4]:
# 걸그룹 내보내기
girl_groups = ["Red_Velvet", "BLACKPINK", "TWICE", "ITZY", "Aespa", "MAMAMOO", "CrazAngel", "IVE", "ILLIT", "BABYMONSTER", "NJZ"]
KpopGroupCrawler_byType().create_csv_from_groups(girl_groups, "girl_groups.csv", "Girl Group")

크롤링 중: Red_Velvet
크롤링 중: BLACKPINK
크롤링 중: TWICE
크롤링 중: ITZY
크롤링 중: Aespa
크롤링 중: MAMAMOO
크롤링 중: CrazAngel
크롤링 중: IVE
크롤링 중: ILLIT
크롤링 중: BABYMONSTER
크롤링 중: NJZ
CSV 파일이 생성되었습니다: girl_groups.csv


Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name,sns,group_type,update_at
0,Red Velvet,레드벨벳,2014-08-01,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Irene; Seulgi; Wendy; Joy; Yeri,,,ReVeluv (레베럽),"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Girl Group,2025-09-19 14:06:06
1,BLACKPINK,블랙핑크,2016-08-08,YG Entertainment,https://kpop.fandom.com/wiki/YG_Entertainment,Jisoo; Jennie; Rosé; Lisa,,,BLINK (블링크),"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Girl Group,2025-09-19 14:06:06
2,TWICE,트와이스,2015-10-20,JYP Entertainment,https://kpop.fandom.com/wiki/JYP_Entertainment,Nayeon; Jeongyeon; Momo; Sana; Jihyo; Mina; Da...,,,ONCE (원스),"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",Girl Group,2025-09-19 14:06:06
3,ITZY,있지,2019-02-12,JYP Entertainment,https://kpop.fandom.com/wiki/JYP_Entertainment,Yeji; Lia; Ryujin; Chaeryeong; Yuna,,,MIDZY (믿지)[2],"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",Girl Group,2025-09-19 14:06:06
4,Aespa,에스파,2020-11-17,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Karina; Giselle; Winter; Ningning,,,MY (마이),"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",Girl Group,2025-09-19 14:06:06
5,MAMAMOO,마마무,2014-06-19,RBW,https://kpop.fandom.com/wiki/RBW,Solar; Moon Byul; Whee In; Hwa Sa,,,MooMoo (무무),"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Girl Group,2025-09-19 14:06:06
6,CrazAngel,크레이즈엔젤,2025-07-10,Forbest Entertainment,,Solmi; Daze; Shannie; Ahon,,,,"{""KR"": [{""platform"": ""Instagram"", ""href"": ""htt...",Girl Group,2025-09-19 14:06:06
7,IVE,아이브,2021-12-01,Starship Entertainment,https://kpop.fandom.com/wiki/Starship_Entertai...,Gaeul; An Yujin; Rei; Jang Wonyoung; Liz; Leeseo,,,DIVE (다이브)[1],"{""KR"": [{""platform"": ""Berriz app icon.png"", ""h...",Girl Group,2025-09-19 14:06:06
8,ILLIT,아일릿,2024-03-25,Belift Lab,https://kpop.fandom.com/wiki/Belift_Lab,Yunah; Minju; Moka; Wonhee; Iroha,,,GLLIT (글릿)(current)[1]LILLY (릴리즈)(former)[2],"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Girl Group,2025-09-19 14:06:06
9,BABYMONSTER,베이비몬스터,2023-11-27,YG Entertainment,https://kpop.fandom.com/wiki/YG_Entertainment,Ruka; Pharita; Asa; Ahyeon; Rora; Chiquita,Rami,,MONSTIEZ (‪몬스티즈‬)[1],"{""KR"": [{""platform"": ""Discord"", ""href"": ""https...",Girl Group,2025-09-19 14:06:06


In [5]:
# boy 그룹 내보내기
boy_groups = ["BTS", "Stray_Kids", "SEVENTEEN", "ENHYPEN", "TXT", "NCT", "ATEEZ", "EXO", "P1Harmony", "SF9", "VICTON", "CRAVITY"]
KpopGroupCrawler_byType().create_csv_from_groups(boy_groups, "boy_groups.csv", "Boy Group")

크롤링 중: BTS
크롤링 중: Stray_Kids
크롤링 중: SEVENTEEN
크롤링 중: ENHYPEN
크롤링 중: TXT
크롤링 중: NCT
크롤링 중: ATEEZ
크롤링 중: EXO
크롤링 중: P1Harmony
크롤링 중: SF9
크롤링 중: VICTON
크롤링 중: CRAVITY
CSV 파일이 생성되었습니다: boy_groups.csv


Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name,sns,group_type,update_at
0,BTS,방탄소년단,2013-06-13,BigHit Music,https://kpop.fandom.com/wiki/BigHit_Music,Jin; Suga; J-Hope; RM; Jimin; V; Jung Kook,,,ARMY (아미),"{""KR"": [{""platform"": ""Blog"", ""href"": ""http://b...",Boy Group,2025-09-19 14:11:53
1,Stray Kids,스트레이 키즈,2018-03-25,JYP Entertainment,https://kpop.fandom.com/wiki/JYP_Entertainment,Bang Chan; Lee Know; Changbin; Hyunjin; Han; F...,,Woojin,STAY,"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Boy Group,2025-09-19 14:11:53
2,SEVENTEEN,세븐틴,2015-05-26,Pledis Entertainment,https://kpop.fandom.com/wiki/Pledis_Entertainment,S.Coups; Joshua; Jun; DK; Mingyu; The8; Seungk...,Jeonghan; Hoshi; Wonwoo; Woozi,,CARAT (캐럿),"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Boy Group,2025-09-19 14:11:53
3,ENHYPEN,엔하이픈,2020-11-30,Belift Lab,https://kpop.fandom.com/wiki/Belift_Lab,Heeseung; Jay; Jake; Sunghoon; Sunoo; Jungwon;...,,,ENGENE (엔진),"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",Boy Group,2025-09-19 14:11:53
4,TXT,투모로우바이투게더,2019-03-04,BigHit Music,https://kpop.fandom.com/wiki/BigHit_Music,Yeonjun; Soobin; Beomgyu; Taehyun; Hueningkai,,,MOA (모아),"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Boy Group,2025-09-19 14:11:53
5,NCT,엔시티,,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Johnny; Yuta; Kun; Doyoung; Ten; Jungwoo; Mark...,Taeyong; Jaehyun; Winwin,Taeil; Lucas; Shotaro; Sungchan,NCTzen (엔시티즌),"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",Boy Group,2025-09-19 14:11:53
6,ATEEZ,에이티즈,2018-10-24,KQ Entertainment,https://kpop.fandom.com/wiki/KQ_Entertainment,Seonghwa; Hongjoong; Yunho; Yeosang; San; Ming...,,,ATINY (에이티니)[2],"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Boy Group,2025-09-19 14:11:53
7,EXO,엑소,2012-04-08,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Xiumin; Suho; Baekhyun; Chen; Chanyeol; D.O.; Kai,Lay; Sehun,Luhan; Kris; Tao,EXO-L (엑소엘),"{""KR"": [{""platform"": ""Fanpage"", ""href"": ""https...",Boy Group,2025-09-19 14:11:53
8,P1Harmony,피원하모니,2020-10-28,FNC Entertainment,https://kpop.fandom.com/wiki/FNC_Entertainment,Theo; Keeho; Jiung; Intak; Soul; Jongseob,,,P1ece (피스),"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",Boy Group,2025-09-19 14:11:53
9,SF9,에스에프나인,2016-10-05,FNC Entertainment,https://kpop.fandom.com/wiki/FNC_Entertainment,In Seong; Young Bin; Jae Yoon; Zu Ho; Yoo Tae ...,Da Won,Ro Woon,FANTASY (판타지),"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Boy Group,2025-09-19 14:11:53


---

### 그룹 내 개인 멤버 Artist csv 생성하기

In [3]:
class KpopGroupCrawler_memberDict:
    def __init__(self):
        self.base_url = "https://kpop.fandom.com/api.php"
        self.base_wiki_url = "https://kpop.fandom.com"
    
    def get_group_info(self, group_name: str) -> Dict:
        """
        특정 K-pop 그룹의 정보를 크롤링합니다.
        
        Args:
            group_name (str): 그룹명 (예: "Red_Velvet", "BTS", "BLACKPINK")
        
        Returns:
            Dict: 그룹 정보가 담긴 딕셔너리
        """
        params = {
            "action": "parse",
            "page": group_name,
            "format": "json"
        }
        
        try:
            res = requests.get(self.base_url, params=params)
            res.raise_for_status()
            data = res.json()
            
            if 'parse' not in data:
                print(f"페이지를 찾을 수 없습니다: {group_name}")
                return {}
            
            html = data["parse"]["text"]["*"]
            soup = BeautifulSoup(html, "html.parser")
            
            group_info = {
                'group_name_en': group_name.replace('_', ' '),
                'group_name_hangul': self._get_hangul_name(soup),
                'debut_date': self._get_debut_date(soup),
                'entertainment': self._get_entertainment(soup),
                'entertainment_link': self._get_entertainment_link(soup),
                'members': self._get_members(soup),
                'fandom_name': self._get_fandom_name(soup),
                'sns_links': self._get_sns_links(soup)
            }
            
            return group_info
            
        except requests.RequestException as e:
            print(f"네트워크 오류: {e}")
            return {}
        except Exception as e:
            print(f"크롤링 오류: {e}")
            return {}
    
    def _get_hangul_name(self, soup) -> str:
        """한글 그룹명 추출"""
        group_name = soup.select_one('[data-source="hangul"] .pi-data-value')
        return group_name.get_text(strip=True) if group_name else ""
    
    def _get_debut_date(self, soup) -> str:
        """데뷔 날짜 추출"""
        debut_block = soup.select_one('[data-source="debut"] .pi-data-value')
        if debut_block:
            debut_parts = debut_block.get_text("||", strip=True).split("||")
            first_debut = debut_parts[0]
            first_debut_date_str = first_debut.split("(")[0].strip()
            
            try:
                debut_date = datetime.strptime(first_debut_date_str, "%B %d, %Y")
                return debut_date.strftime("%Y-%m-%d")
            except ValueError:
                return first_debut_date_str
        return ""
    
    def _get_entertainment(self, soup) -> str:
        """소속사명 추출 - 첫 번째 엔터테인먼트 회사"""
        label_block = soup.select_one('[data-source="label"] .pi-data-value')
        if not label_block:
            return ""
        
        # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그나 텍스트 찾기
        first_b = label_block.find('b')
        if first_b:
            # b 태그 다음의 첫 번째 a 태그 찾기
            a_tag = first_b.find_next('a')
            if a_tag:
                return a_tag.get_text(strip=True)
        
        # b 태그가 없으면 기존 방식
        labels = label_block.get_text("||", strip=True).split("||")
        return labels[0].strip() if labels else ""
    
    def _get_entertainment_link(self, soup) -> str:
        """소속사 링크 추출 - 첫 번째 엔터테인먼트 회사 링크"""
        label_block = soup.select_one('[data-source="label"] .pi-data-value')
        if not label_block:
            return ""
        
        # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그 찾기
        first_b = label_block.find('b')
        if first_b:
            a_tag = first_b.find_next('a', href=True)
            if a_tag:
                href = a_tag['href']
                return href if href.startswith('http') else f"{self.base_wiki_url}{href}"
        
        # b 태그가 없으면 기존 방식
        labels = label_block.get_text("||", strip=True).split("||")
        if labels:
            first_label = labels[0].strip()
            a_tag = label_block.find('a', string=lambda text: text and text.strip() == first_label)
            if a_tag and a_tag.has_attr('href'):
                href = a_tag['href']
                return href if href.startswith('http') else f"{self.base_wiki_url}{href}"
        
        return ""
    
    def _get_members(self, soup) -> List[Dict]:
        """멤버 정보 추출"""
        members = []
        
        # 현재 활동 멤버
        members_current = soup.select_one('[data-source="current"] .pi-data-value > ul')
        if members_current:
            for a in members_current.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "current"})
        
        # 비활동 멤버
        members_inactive = soup.select_one('[data-source="inactive"] .pi-data-value > ul')
        if members_inactive:
            for a in members_inactive.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "inactive"})
        
        # 탈퇴 멤버
        members_former = soup.select_one('[data-source="former"] .pi-data-value > ul')
        if members_former:
            for a in members_former.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "former"})
        
        return members
    
    def _get_fandom_name(self, soup) -> str:
        """팬덤명 추출"""
        fandom_block = soup.select_one('[data-source="fandom"] .pi-data-value')
        return fandom_block.get_text(strip=True) if fandom_block else ""
    
    def _get_sns_links(self, soup) -> Dict:
        """SNS 링크 추출"""
        sns_block = soup.select_one('[data-source="sns"]')
        sns_data = {}
        current_country = "KR"
        
        if sns_block:
            for elem in sns_block.children:
                if elem.name == 'b':
                    country_text = elem.get_text(strip=True).rstrip(':')
                    current_country = country_text
                    if current_country not in sns_data:
                        sns_data[current_country] = []
                
                if elem.name == 'span':
                    a = elem.find('a')
                    img = elem.find('img')
                    if a and img:
                        href = a.get('href')
                        platform = img.get('data-image-name', "").replace(" Icon.png", "")
                        if current_country not in sns_data:
                            sns_data[current_country] = []
                        sns_data[current_country].append({"platform": platform, "href": href})
        
        return sns_data
    
    def create_csv_from_groups(self, group_names: List[str], output_filename: str = "kpop_groups_info.csv", group_type: str = "Unknown"):
        """
        여러 그룹의 정보를 크롤링하여 CSV 파일로 저장하고, 멤버 페이지명 리스트도 반환합니다.
        
        Args:
            group_names (List[str]): 크롤링할 그룹명 리스트
            output_filename (str): 출력할 CSV 파일명
            group_type (str): 그룹 타입 (예: "Girl Group", "Boy Group", "Co-ed", "Solo" 등)
        
        Returns:
            tuple: (DataFrame, List[Dict]) - CSV DataFrame과 멤버 페이지명 리스트
        """
        all_data = []
        member_pages = []  # API 호출용 멤버 페이지명 리스트
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # 현재 시간
        
        for group_name in group_names:
            print(f"크롤링 중: {group_name}")
            group_info = self.get_group_info(group_name)
            
            # 크롤링 실패 시에도 null 값으로 데이터 추가
            if not group_info:
                print(f"정보를 가져올 수 없습니다: {group_name}")
                group_info = {
                    'group_name_en': group_name.replace('_', ' '),
                    'group_name_hangul': None,
                    'debut_date': None,
                    'entertainment': None,
                    'entertainment_link': None,
                    'members': [],
                    'fandom_name': None,
                    'sns_links': {}
                }
            
            # 멤버 페이지명 추출 (current, inactive만 포함, former 제외)
            for member in group_info['members']:
                if member['status'] in ['current', 'inactive']:  # former 제외
                    page_name = member['href'].split('/wiki/')[-1] if '/wiki/' in member['href'] else member['name']
                    member_pages.append({
                        'group_name': group_name.replace('_', ' '),
                        'page_name': page_name
                    })
            
            # 멤버를 상태별로 분리
            current_members = []
            inactive_members = []
            former_members = []
            
            for member in group_info['members']:
                if member['status'] == 'current':
                    current_members.append(member['name'])
                elif member['status'] == 'inactive':
                    inactive_members.append(member['name'])
                elif member['status'] == 'former':
                    former_members.append(member['name'])
            
            # 멤버 정보를 문자열로 변환 (null 처리)
            current_members_str = "; ".join(current_members) if current_members else None
            inactive_members_str = "; ".join(inactive_members) if inactive_members else None
            former_members_str = "; ".join(former_members) if former_members else None
            
            # SNS 링크를 문자열로 변환 (null 처리)
            sns_str = json.dumps(group_info['sns_links'], ensure_ascii=False) if group_info['sns_links'] else None
            
            row_data = {
                'group_name_en': group_info['group_name_en'],
                'group_name_kr': group_info['group_name_hangul'] or None,
                'debut_date': group_info['debut_date'] or None,
                'entertainment_name': group_info['entertainment'] or None,
                'entertainment_link': group_info['entertainment_link'] or None,
                'member_current': current_members_str,
                'member_inactive': inactive_members_str,
                'member_former': former_members_str,
                'fandom_name': group_info['fandom_name'] or None,
                'sns': sns_str,
                'group_type': group_type,
                'update_at': current_time
            }
            
            all_data.append(row_data)
        
        # 데이터가 있든 없든 항상 DataFrame 생성
        df = pd.DataFrame(all_data)
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"CSV 파일이 생성되었습니다: {output_filename}")
        print(f"총 {len(member_pages)}명의 멤버 페이지명이 수집되었습니다.")
        
        return df, member_pages

## 최종 Group DB

- girl group

In [11]:
crawler = KpopGroupCrawler_memberDict()

girl_groups = ["Red_Velvet", "BLACKPINK", "TWICE", "ITZY", "Aespa", "MAMAMOO", "CrazAngel", "IVE", "ILLIT", "BABYMONSTER", "NJZ"]

df, girl_groups_member_pages = crawler.create_csv_from_groups(girl_groups, "girl_groups_info.csv", "Girl Group")

# member_pages 예시:
# [
#   {'group_name': 'Red Velvet', 'page_name': 'Irene'},
#   {'group_name': 'Red Velvet', 'page_name': 'Seulgi'},
#   {'group_name': 'BLACKPINK', 'page_name': 'Jennie'},
#   ...
# ]

크롤링 중: Red_Velvet
크롤링 중: BLACKPINK
크롤링 중: TWICE
크롤링 중: ITZY
크롤링 중: Aespa
크롤링 중: MAMAMOO
크롤링 중: CrazAngel
크롤링 중: IVE
크롤링 중: ILLIT
크롤링 중: BABYMONSTER
크롤링 중: NJZ
CSV 파일이 생성되었습니다: girl_groups_info.csv
총 58명의 멤버 페이지명이 수집되었습니다.


In [13]:
girl_groups_member_pages

[{'group_name': 'Red Velvet', 'page_name': 'Irene_(Red_Velvet)'},
 {'group_name': 'Red Velvet', 'page_name': 'Seulgi_(Red_Velvet)'},
 {'group_name': 'Red Velvet', 'page_name': 'Wendy'},
 {'group_name': 'Red Velvet', 'page_name': 'Joy_(Red_Velvet)'},
 {'group_name': 'Red Velvet', 'page_name': 'Yeri_(Red_Velvet)'},
 {'group_name': 'BLACKPINK', 'page_name': 'Jisoo_(BLACKPINK)'},
 {'group_name': 'BLACKPINK', 'page_name': 'Jennie'},
 {'group_name': 'BLACKPINK', 'page_name': 'Ros%C3%A9'},
 {'group_name': 'BLACKPINK', 'page_name': 'Lisa_(BLACKPINK)'},
 {'group_name': 'TWICE', 'page_name': 'Nayeon_(TWICE)'},
 {'group_name': 'TWICE', 'page_name': 'Jeongyeon_(TWICE)'},
 {'group_name': 'TWICE', 'page_name': 'Momo'},
 {'group_name': 'TWICE', 'page_name': 'Sana'},
 {'group_name': 'TWICE', 'page_name': 'Jihyo_(TWICE)'},
 {'group_name': 'TWICE', 'page_name': 'Mina_(TWICE)'},
 {'group_name': 'TWICE', 'page_name': 'Dahyun_(TWICE)'},
 {'group_name': 'TWICE', 'page_name': 'Chaeyoung_(TWICE)'},
 {'group_n

In [40]:
# 설정값
base_url = "https://kpop.fandom.com/api.php"
base_wiki_url = "https://kpop.fandom.com"

def get_birth_name(soup):
    """본명 추출"""
    selectors = [
        '[data-source="birth_name"] .pi-data-value',
        '[data-source="birthname"] .pi-data-value',
        '[data-source="real_name"] .pi-data-value',
        '[data-source="realname"] .pi-data-value'
    ]
    
    for selector in selectors:
        birth_name = soup.select_one(selector)
        if birth_name:
            return birth_name.get_text(strip=True)
    return ""

def get_birth_date_artist(soup):
    """생년월일 추출"""
    birth_block = soup.select_one('[data-source="birth_date"] .pi-data-value')
    if birth_block:
        birth_text = birth_block.get_text(strip=True)
        birth_date_str = birth_text.split("(")[0].strip()
        
        try:
            birth_date = datetime.strptime(birth_date_str, "%B %d, %Y")
            return birth_date.strftime("%Y-%m-%d")
        except ValueError:
            return birth_date_str
    return ""

def _get_entertainment(soup) -> str:
    """소속사명 추출 - 첫 번째 엔터테인먼트 회사"""
    label_block = soup.select_one('[data-source="agency"] .pi-data-value')
    if not label_block:
        return ""
        
    # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그나 텍스트 찾기
    first_b = label_block.find('b')
    if first_b:
        # b 태그 다음의 첫 번째 a 태그 찾기
        a_tag = first_b.find_next('a')
        if a_tag:
            return a_tag.get_text(strip=True)
        
    # b 태그가 없으면 기존 방식
    labels = label_block.get_text("||", strip=True).split("||")
    return labels[0].strip() if labels else ""
    
def _get_entertainment_link(soup) -> str:
    """소속사 링크 추출 - 첫 번째 엔터테인먼트 회사 링크"""
    label_block = soup.select_one('[data-source="agency"] .pi-data-value')
    if not label_block:
        return ""
        
    # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그 찾기
    first_b = label_block.find('b')
    if first_b:
        a_tag = first_b.find_next('a', href=True)
        if a_tag:
            href = a_tag['href']
            return href if href.startswith('http') else f"{base_wiki_url}{href}"
        
    # b 태그가 없으면 기존 방식
    labels = label_block.get_text("||", strip=True).split("||")
    if labels:
        first_label = labels[0].strip()
        a_tag = label_block.find('a', string=lambda text: text and text.strip() == first_label)
        if a_tag and a_tag.has_attr('href'):
            href = a_tag['href']
            return href if href.startswith('http') else f"{base_wiki_url}{href}"
        
    return ""
#

def create_artist_csv(member_pages, output_filename="artists_info.csv"):
    """
    멤버 페이지 리스트를 받아서 개별 아티스트 정보를 크롤링하여 CSV로 저장합니다.
    
    Args:
        member_pages (List[Dict]): [{'group_name': '그룹명', 'page_name': '페이지명'}, ...] 형태의 리스트
        output_filename (str): 출력할 CSV 파일명
    
    Returns:
        DataFrame: 아티스트 정보가 담긴 DataFrame
    """
    all_artist_data = []
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    for member_info in member_pages:
        group_name = member_info['group_name']
        page_name = member_info['page_name']
        
        print(f"크롤링 중: {group_name} - {page_name}")
        
        try:
            params = {
                "action": "parse",
                "page": page_name,
                "format": "json"
            }
            
            res = requests.get(base_url, params=params)
            res.raise_for_status()
            data = res.json()
            
            if 'parse' not in data:
                try:
                    # API 호출용으로 디코딩
                    decoded_page_name = unquote(page_name)
                    
                    params = {
                        "action": "parse",
                        "page": decoded_page_name,
                        "format": "json"
                    }
                    res = requests.get(base_url, params=params)
                    res.raise_for_status()
                    data = res.json()
                except Exception as e:
                    print(f"페이지를 찾을 수 없습니다: {page_name}")
                    row_data = {
                        'group_name': group_name,
                        'page_name': page_name,
                        'birth_name': None,
                        'birth_date': None,
                        'agency_name': None,
                        'agency_href': None,
                        'update_at': current_time
                    }
                    all_artist_data.append(row_data)
                    continue
            
            html = data["parse"]["text"]["*"]
            soup = BeautifulSoup(html, "html.parser")
            
            # 정보 추출
            birth_name = get_birth_name(soup)
            birth_date = get_birth_date_artist(soup)
            agency_name = _get_entertainment(soup)
            agency_href = _get_entertainment_link(soup)
            
            row_data = {
                'group_name': group_name,
                'page_name': page_name,
                'birth_name': birth_name or None,
                'birth_date': birth_date or None,
                'agency_name': agency_name or None,
                'agency_href': agency_href or None,
                'update_at': current_time
            }
            
            all_artist_data.append(row_data)
            
        except Exception as e:
            print(f"오류 ({page_name}): {e}")
            row_data = {
                'group_name': group_name,
                'page_name': page_name,
                'birth_name': None,
                'birth_date': None,
                'agency_name': None,
                'agency_href': None,
                'update_at': current_time
            }
            all_artist_data.append(row_data)
    
    # CSV 저장
    df = pd.DataFrame(all_artist_data)
    df.to_csv(output_filename, index=False, encoding='utf-8-sig')
    print(f"Artist CSV 파일이 생성되었습니다: {output_filename}")
    print(f"총 {len(all_artist_data)}명의 아티스트 정보가 수집되었습니다.")
    
    return df

In [41]:
artist_df = create_artist_csv(girl_groups_member_pages, "girl_groups_artists.csv")

크롤링 중: Red Velvet - Irene_(Red_Velvet)
크롤링 중: Red Velvet - Seulgi_(Red_Velvet)
크롤링 중: Red Velvet - Wendy
크롤링 중: Red Velvet - Joy_(Red_Velvet)
크롤링 중: Red Velvet - Yeri_(Red_Velvet)
크롤링 중: BLACKPINK - Jisoo_(BLACKPINK)
크롤링 중: BLACKPINK - Jennie
크롤링 중: BLACKPINK - Ros%C3%A9
크롤링 중: BLACKPINK - Lisa_(BLACKPINK)
크롤링 중: TWICE - Nayeon_(TWICE)
크롤링 중: TWICE - Jeongyeon_(TWICE)
크롤링 중: TWICE - Momo
크롤링 중: TWICE - Sana
크롤링 중: TWICE - Jihyo_(TWICE)
크롤링 중: TWICE - Mina_(TWICE)
크롤링 중: TWICE - Dahyun_(TWICE)
크롤링 중: TWICE - Chaeyoung_(TWICE)
크롤링 중: TWICE - Tzuyu
크롤링 중: ITZY - Yeji_(ITZY)
크롤링 중: ITZY - Lia_(ITZY)
크롤링 중: ITZY - Ryujin
크롤링 중: ITZY - Chaeryeong
크롤링 중: ITZY - Yuna_(ITZY)
크롤링 중: Aespa - Karina_(aespa)
크롤링 중: Aespa - Giselle
크롤링 중: Aespa - Winter_(aespa)
크롤링 중: Aespa - Ningning
크롤링 중: MAMAMOO - Solar
크롤링 중: MAMAMOO - Moon_Byul
크롤링 중: MAMAMOO - Whee_In
크롤링 중: MAMAMOO - Hwa_Sa
크롤링 중: CrazAngel - Solmi_(CrazAngel)
크롤링 중: CrazAngel - Daze
크롤링 중: CrazAngel - Shannie
크롤링 중: CrazAngel - Ahon
크롤링 중: 

In [42]:
artist_df.head()

Unnamed: 0,group_name,page_name,birth_name,birth_date,agency_name,agency_href,update_at
0,Red Velvet,Irene_(Red_Velvet),Bae Joo-hyun (배주현),1991-03-29,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-19 16:16:51
1,Red Velvet,Seulgi_(Red_Velvet),Kang Seul-gi (강슬기),1994-02-10,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-19 16:16:51
2,Red Velvet,Wendy,Shon Seung-wan (손승완),1994-02-21,ASND,https://kpop.fandom.com/wiki/ASND,2025-09-19 16:16:51
3,Red Velvet,Joy_(Red_Velvet),Park Soo-young (박수영),1996-09-03,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-19 16:16:51
4,Red Velvet,Yeri_(Red_Velvet),Kim Ye-rim (김예림),1999-03-05,Blitzway Entertainment,,2025-09-19 16:16:51


In [43]:
artist_df.isnull().sum()

group_name     0
page_name      0
birth_name     0
birth_date     0
agency_name    0
agency_href    5
update_at      0
dtype: int64

In [44]:
artist_df[artist_df['group_name'] == 'CrazAngel']

Unnamed: 0,group_name,page_name,birth_name,birth_date,agency_name,agency_href,update_at
31,CrazAngel,Solmi_(CrazAngel),Bae Sol-mi (배솔미),2002-05-18,Forbest Entertainment,,2025-09-19 16:16:51
32,CrazAngel,Daze,Jeong Seo-hyeon (정서현),2003-05-09,Forbest Entertainment,,2025-09-19 16:16:51
33,CrazAngel,Shannie,Chun En Shannon Hoo,2003-12-16,Forbest Entertainment,,2025-09-19 16:16:51
34,CrazAngel,Ahon,Lee Ga-eun (이가은),2005-09-04,Forbest Entertainment,,2025-09-19 16:16:51


In [45]:
crawler = KpopGroupCrawler_memberDict()

boy_groups = ["BTS", "Stray_Kids", "SEVENTEEN", "ENHYPEN", "TXT", "NCT", "ATEEZ", "EXO", "P1Harmony", "SF9", "VICTON", "CRAVITY"]

df, boy_groups_member_pages = crawler.create_csv_from_groups(boy_groups, "boy_groups_info.csv", "Boy Group")

크롤링 중: BTS
크롤링 중: Stray_Kids
크롤링 중: SEVENTEEN
크롤링 중: ENHYPEN
크롤링 중: TXT
크롤링 중: NCT
크롤링 중: ATEEZ
크롤링 중: EXO
크롤링 중: P1Harmony
크롤링 중: SF9
크롤링 중: VICTON
크롤링 중: CRAVITY
CSV 파일이 생성되었습니다: boy_groups_info.csv
총 111명의 멤버 페이지명이 수집되었습니다.


In [46]:
boy_groups_member_pages

[{'group_name': 'BTS', 'page_name': 'Jin_(BTS)'},
 {'group_name': 'BTS', 'page_name': 'Suga'},
 {'group_name': 'BTS', 'page_name': 'J-Hope'},
 {'group_name': 'BTS', 'page_name': 'RM'},
 {'group_name': 'BTS', 'page_name': 'Jimin_(BTS)'},
 {'group_name': 'BTS', 'page_name': 'V_(BTS)'},
 {'group_name': 'BTS', 'page_name': 'Jung_Kook'},
 {'group_name': 'Stray Kids', 'page_name': 'Bang_Chan'},
 {'group_name': 'Stray Kids', 'page_name': 'Lee_Know'},
 {'group_name': 'Stray Kids', 'page_name': 'Changbin_(Stray_Kids)'},
 {'group_name': 'Stray Kids', 'page_name': 'Hyunjin_(Stray_Kids)'},
 {'group_name': 'Stray Kids', 'page_name': 'Han_(Stray_Kids)'},
 {'group_name': 'Stray Kids', 'page_name': 'Felix'},
 {'group_name': 'Stray Kids', 'page_name': 'Seungmin_(Stray_Kids)'},
 {'group_name': 'Stray Kids', 'page_name': 'I.N_(Stray_Kids)'},
 {'group_name': 'SEVENTEEN', 'page_name': 'S.Coups'},
 {'group_name': 'SEVENTEEN', 'page_name': 'Joshua'},
 {'group_name': 'SEVENTEEN', 'page_name': 'Jun_(SEVENTEEN)

In [47]:
artist_boy_df = create_artist_csv(boy_groups_member_pages, "boy_groups_artists.csv")

크롤링 중: BTS - Jin_(BTS)
크롤링 중: BTS - Suga
크롤링 중: BTS - J-Hope
크롤링 중: BTS - RM
크롤링 중: BTS - Jimin_(BTS)
크롤링 중: BTS - V_(BTS)
크롤링 중: BTS - Jung_Kook
크롤링 중: Stray Kids - Bang_Chan
크롤링 중: Stray Kids - Lee_Know
크롤링 중: Stray Kids - Changbin_(Stray_Kids)
크롤링 중: Stray Kids - Hyunjin_(Stray_Kids)
크롤링 중: Stray Kids - Han_(Stray_Kids)
크롤링 중: Stray Kids - Felix
크롤링 중: Stray Kids - Seungmin_(Stray_Kids)
크롤링 중: Stray Kids - I.N_(Stray_Kids)
크롤링 중: SEVENTEEN - S.Coups
크롤링 중: SEVENTEEN - Joshua
크롤링 중: SEVENTEEN - Jun_(SEVENTEEN)
크롤링 중: SEVENTEEN - DK_(SEVENTEEN)
크롤링 중: SEVENTEEN - Mingyu_(SEVENTEEN)
크롤링 중: SEVENTEEN - The8
크롤링 중: SEVENTEEN - Seungkwan
크롤링 중: SEVENTEEN - Vernon
크롤링 중: SEVENTEEN - Dino_(SEVENTEEN)
크롤링 중: SEVENTEEN - Jeonghan
크롤링 중: SEVENTEEN - Hoshi
크롤링 중: SEVENTEEN - Wonwoo_(SEVENTEEN)
크롤링 중: SEVENTEEN - Woozi
크롤링 중: ENHYPEN - Heeseung
크롤링 중: ENHYPEN - Jay_(ENHYPEN)
크롤링 중: ENHYPEN - Jake_(ENHYPEN)
크롤링 중: ENHYPEN - Sunghoon_(ENHYPEN)
크롤링 중: ENHYPEN - Sunoo
크롤링 중: ENHYPEN - Jungwon_(ENHYP

In [48]:
artist_boy_df.head()

Unnamed: 0,group_name,page_name,birth_name,birth_date,agency_name,agency_href,update_at
0,BTS,Jin_(BTS),Kim Seok-jin (김석진),1992-12-04,BigHit Music,https://kpop.fandom.com/wiki/BigHit_Music,2025-09-19 16:19:47
1,BTS,Suga,Min Yoon-gi (민윤기),1993-03-09,BigHit Music,https://kpop.fandom.com/wiki/BigHit_Music,2025-09-19 16:19:47
2,BTS,J-Hope,Jung Ho-seok (정호석),1994-02-18,BigHit Music,https://kpop.fandom.com/wiki/BigHit_Music,2025-09-19 16:19:47
3,BTS,RM,Kim Nam-joon (김남준),1994-09-12,BigHit Music,https://kpop.fandom.com/wiki/BigHit_Music,2025-09-19 16:19:47
4,BTS,Jimin_(BTS),Park Ji-min (박지민),1995-10-13,BigHit Music,https://kpop.fandom.com/wiki/BigHit_Music,2025-09-19 16:19:47


In [49]:
artist_boy_df.isnull().sum()


group_name     0
page_name      0
birth_name     0
birth_date     0
agency_name    0
agency_href    6
update_at      0
dtype: int64

In [50]:
artist_boy_df[artist_boy_df['agency_href'].isnull()]

Unnamed: 0,group_name,page_name,birth_name,birth_date,agency_name,agency_href,update_at
80,EXO,Lay,Zhāng Jiāshuài (张加帅),1991-10-07,Zhang Yixing Studio,,2025-09-19 16:19:47
91,SF9,Zu_Ho_(SF9),Baek Ju-ho (백주호),1996-07-04,Haewadal Entertainment,,2025-09-19 16:19:47
96,VICTON,Han_Seung_Woo,Han Seung-woo (한승우),1994-12-24,AURA Entertainment,,2025-09-19 16:19:47
99,VICTON,Do_Han_Se,Do Han-se (도한세),1997-09-25,The Dial Music,,2025-09-19 16:19:47
100,VICTON,Choi_Byung_Chan,Choi Byung-chan (최병찬),1997-11-12,New Way Company,,2025-09-19 16:19:47
101,VICTON,Jung_Su_Bin,Jung Su-bin (정수빈),1999-04-05,Echo Global Group,,2025-09-19 16:19:47


### url로 가져오기 (BeautifulSoup)

In [2]:
artist = "Red_Velvet"
response = requests.get(f"https://kpop.fandom.com/wiki/{artist}/")
rating_page = response.text
soup = BeautifulSoup(rating_page, 'html.parser')
print(soup.prettify())


<!DOCTYPE html>
<html class="client-nojs sse-other l2u-other odyssey-noads" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Red Velvet/ | Kpop Wiki | Fandom
  </title>
  <script>
   document.documentElement.className="client-js sse-other l2u-other odyssey-noads";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"49f8c654b3b5e952372f88e7354183a4","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Red_Velvet/","wgTitle":"Red Velvet/","wgCurRevisionId":0,"wgRevisionId":0,"wgArticleId":0,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"R