In [67]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
from datetime import datetime
from typing import Dict, List, Optional

from urllib.parse import unquote # URL 디코딩 -> ')' 의 경우 %29로 인코딩 되어있어서 href로 가져올 때 디코딩 필요
from urllib.parse import quote # URL 인코딩 -> ' '의 경우 %20으로 인코딩 필요

## K-POP Fandom Wiki API

```
- 그룹명 중복 잡아내고 데이터 쌓음 -> Group Table
- 그룹 크롤링하면서 멤버 정보 자동으로 추출 -> Artist Table
- 한 멤버가 여러 그룹내 존재해도 Artist 테이블에서 중복되지 않음
- 그룹만 입력해도 유닛 리스트 추가되서 크롤링됨
```

> 참고

> ?action=query&prop=images&titles=BTS 이미지도 가져올 수 있음

- Group Crawling

In [177]:
class KpopGroupCrawler_memberDict:
    def __init__(self):
        self.base_url = "https://kpop.fandom.com/api.php"
        self.base_wiki_url = "https://kpop.fandom.com"
    
    def get_group_info(self, group_name: str) -> Dict:
        """
        특정 K-pop 그룹의 정보를 크롤링합니다.
        
        Args:
            group_name (str): 그룹명 (예: "Red_Velvet", "BTS", "BLACKPINK")
        
        Returns:
            Dict: 그룹 정보가 담긴 딕셔너리
        """
        params = {
            "action": "parse",
            "page": group_name,
            "format": "json"
        }
        
        try:
            res = requests.get(self.base_url, params=params)
            res.raise_for_status()
            data = res.json()
            
            if 'parse' not in data:
                print(f"페이지를 찾을 수 없습니다: {group_name}")
                return {}
            
            html = data["parse"]["text"]["*"]
            soup = BeautifulSoup(html, "html.parser")
            
            eng_name, kor_name = self._get_fandom_name(soup) # 팬덤명 추출 (영문/한글 분리)
            
            group_info = {
                'group_name_en': group_name.replace('_', ' '),
                'group_name_hangul': self._get_hangul_name(soup),
                'debut_date': self._get_debut_date(soup),
                'entertainment': self._get_entertainment(soup),
                'entertainment_link': self._get_entertainment_link(soup),
                'members': self._get_members(soup),
                'fandom_name_en': eng_name,
                'fandom_name_kr': kor_name,
                'sns_links': self._get_sns_links(soup),
                'units': self.units_check(soup),  # 유닛 정보 추가
                'disbanded' : self._get_disbanded_date(soup)
            }
            
            return group_info
            
        except requests.RequestException as e:
            print(f"네트워크 오류: {e}")
            return {}
        except Exception as e:
            print(f"크롤링 오류: {e}")
            return {}
    
    def _get_hangul_name(self, soup) -> str:
        """한글 그룹명 추출"""
        group_name = soup.select_one('[data-source="hangul"] .pi-data-value')
        return group_name.get_text(strip=True) if group_name else ""
    
    def _get_debut_date(self, soup) -> str:
        """데뷔 날짜 추출"""
        debut_block = soup.select_one('[data-source="debut"] .pi-data-value')
        if debut_block:
            debut_parts = debut_block.get_text("||", strip=True).split("||")
            first_debut = debut_parts[0]
            first_debut_date_str = first_debut.split("(")[0].strip()
            
            try:
                debut_date = datetime.strptime(first_debut_date_str, "%B %d, %Y")
                return debut_date.strftime("%Y-%m-%d")
            except ValueError:
                return first_debut_date_str
        return ""
    
    def _get_entertainment(self, soup) -> str:
        """소속사명 추출 - 첫 번째 엔터테인먼트 회사"""
        label_block = soup.select_one('[data-source="label"] .pi-data-value')
        if not label_block:
            return ""
        
        # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그나 텍스트 찾기
        first_b = label_block.find('b')
        if first_b:
            # b 태그 다음의 첫 번째 a 태그 찾기
            a_tag = first_b.find_next('a')
            if a_tag:
                return a_tag.get_text(strip=True)
        
        # b 태그가 없으면 기존 방식
        labels = label_block.get_text("||", strip=True).split("||")
        return labels[0].strip() if labels else ""
    
    def _get_entertainment_link(self, soup) -> str:
        """소속사 링크 추출 - 첫 번째 엔터테인먼트 회사 링크"""
        label_block = soup.select_one('[data-source="label"] .pi-data-value')
        if not label_block:
            return ""
        
        # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그 찾기
        first_b = label_block.find('b')
        if first_b:
            a_tag = first_b.find_next('a', href=True)
            if a_tag:
                href = a_tag['href']
                return href if href.startswith('http') else f"{self.base_wiki_url}{href}"
        
        # b 태그가 없으면 기존 방식
        labels = label_block.get_text("||", strip=True).split("||")
        if labels:
            first_label = labels[0].strip()
            a_tag = label_block.find('a', string=lambda text: text and text.strip() == first_label)
            if a_tag and a_tag.has_attr('href'):
                href = a_tag['href']
                return href if href.startswith('http') else f"{self.base_wiki_url}{href}"
        
        return ""
    
    def _get_members(self, soup) -> List[Dict]:
        """멤버 정보 추출"""
        members = []
        
        # 현재 활동 멤버
        members_current = soup.select_one('[data-source="current"] .pi-data-value > ul')
        if members_current:
            for a in members_current.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "current"})
        
        # 비활동 멤버
        members_inactive = soup.select_one('[data-source="inactive"] .pi-data-value > ul')
        if members_inactive:
            for a in members_inactive.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "inactive"})
        
        # 탈퇴 멤버
        members_former = soup.select_one('[data-source="former"] .pi-data-value > ul')
        if members_former:
            for a in members_former.select('a'):
                name = a.get_text(strip=True)
                href = a.get("href")
                link = f"{self.base_wiki_url}{href}"
                members.append({"name": name, "href": link, "status": "former"})
        
        return members
    
    def _get_fandom_name(self, soup) -> str:
        """팬덤명 추출 (영문/한글 분리)"""
        fandom_block = soup.select_one('[data-source="fandom"] .pi-data-value')
        if not fandom_block :
            return None, None # 팬덤명이 없을 경우 None 반환
        
        fandom_text = fandom_block.get_text(strip=True)
        
        # 괄호가 잇는 경우 : ex) "ReVeluv (레벨럽)"
        if "(" in fandom_text and ")" in fandom_text:
            eng_name = fandom_text.split("(")[0].strip()
            kor_name = fandom_text.split("(")[1].rstrip(")").strip()
        else: # 괄호가 없는 경우 : ex) "Blinks"
            eng_name = fandom_text.strip()
            kor_name = None
        
        return eng_name or None, kor_name or None
    
    def _get_sns_links(self, soup) -> Dict:
        """SNS 링크 추출"""
        sns_block = soup.select_one('[data-source="sns"]')
        sns_data = {}
        current_country = "KR"
        
        if sns_block:
            for elem in sns_block.children:
                if elem.name == 'b':
                    country_text = elem.get_text(strip=True).rstrip(':')
                    current_country = country_text
                    if current_country not in sns_data:
                        sns_data[current_country] = []
                
                if elem.name == 'span':
                    a = elem.find('a')
                    img = elem.find('img')
                    if a and img:
                        href = a.get('href')
                        platform = img.get('data-image-name', "").replace(" Icon.png", "")
                        if current_country not in sns_data:
                            sns_data[current_country] = []
                        sns_data[current_country].append({"platform": platform, "href": href})
        
        return sns_data
    
    def units_check(self, soup) -> List[str]:
        """유닛 그룹명 추출 (wiki page_name 형태)"""
        units = []
        # h2 > span의 id가 "Sub-units"인 요소 찾기
        span_with_id = soup.select_one("h2 > span#Sub-units, h2 > span#Units")
        
        if span_with_id:
            # span의 부모인 h2 태그를 가져옴
            units_title = span_with_id.parent
            
            # "Sub-units" 제목 다음의 ul 요소 찾기
            units_block = units_title.find_next_sibling("ul")
            
            if units_block:
                # units_block의 직계 자식 li들만 선택 (중첩된 ul의 li 제외)
                for li in units_block.find_all('li', recursive=False): # recursive=True 가 디폴트, 모든 하위 요소를 재귀적으로 탐색
                    a = li.find('a')  # 각 li의 첫 번째 a 태그만 선택
                    if a:
                        href = a.get("href")
                        if href and href.startswith("/wiki/"):
                            page_name = href.split("/wiki/")[-1]  # "/wiki/" 제거
                            page_name = unquote(page_name)  # URL 디코딩
                            if page_name not in units:  # 중복 제거
                                units.append(page_name)
        
        return units
    
    def _get_disbanded_date(self, soup) -> str:
        disbanded_block = soup.select_one('[data-source="disbanded"]')
        if disbanded_block : # disbanded_block 존재하면 True
                value_block = disbanded_block.select_one('.pi-data-value')
                return value_block.get_text(strip=True) if value_block else "" # 텍스트 값을 반환하거나, 없으면 공란 반환
    
    def create_csv_from_groups(self, group_names: List[str], output_filename: str = "kpop_groups_info.csv", group_type: str = "Unknown"):
        """
        여러 그룹의 정보를 크롤링하여 CSV 파일로 저장하고, 멤버 페이지명 리스트도 반환합니다.
        유닛 그룹이 발견되면 자동으로 추가하여 크롤링합니다.
        
        Args:
            group_names (List[str]): 크롤링할 그룹명 리스트
            output_filename (str): 출력할 CSV 파일명
            group_type (str): 그룹 타입 (예: "Girl Group", "Boy Group", "Co-ed", "Solo" 등)
        
        Returns:
            tuple: (DataFrame, List[Dict]) - CSV DataFrame과 멤버 페이지명 리스트
        """
        # 중복 제거 + 순서 유지
        group_names = list(dict.fromkeys(group_names))
        
        all_data = []
        member_pages = []
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        processed_groups = set()  # 이미 처리된 그룹 추적
        groups_to_process = group_names.copy()  # 처리할 그룹 리스트
        
        while groups_to_process:
            group_name = groups_to_process.pop(0)
            
            # 이미 처리된 그룹은 건너뛰기
            if group_name in processed_groups:
                continue
                
            # print(f"크롤링 중: {group_name}")
            group_info = self.get_group_info(group_name)
            processed_groups.add(group_name)
            
            # 크롤링 실패 시에도 null 값으로 데이터 추가
            if not group_info:
                print(f"정보를 가져올 수 없습니다: {group_name}")
                group_info = {
                    'group_name_en': group_name.replace('_', ' '),
                    'group_name_hangul': None,
                    'debut_date': None,
                    'entertainment': None,
                    'entertainment_link': None,
                    'members': [],
                    'fandom_name_en': None,
                    'fandom_name_kr': None,
                    'units': [],
                    'sns_links': {},
                    'disbanded' : None
                }
            
            # 유닛 그룹이 있으면 처리할 리스트에 추가
            if group_info.get('units'):
                print(f"{group_name}의 유닛 그룹 발견: {group_info['units']}")
                for unit in group_info['units']:
                    if unit not in processed_groups and unit not in groups_to_process:
                        groups_to_process.append(unit)
                        print(f"유닛 그룹 추가: {unit}")
            
            # 멤버 페이지명 추출 (current, inactive만 포함, former 제외)
            for member in group_info['members']:
                if member['status'] in ['current', 'inactive']:
                    page_name = member['href'].split('/wiki/')[-1] if '/wiki/' in member['href'] else member['name']
                    
                    # 중복 체크
                    if not any(mp['page_name'] == page_name for mp in member_pages):
                        member_pages.append({
                            'group_name': group_name.replace('_', ' '),
                            'page_name': page_name,
                            'member_name': member['name']
                        })
            
            # 멤버를 상태별로 분리
            current_members = []
            inactive_members = []
            former_members = []
            
            for member in group_info['members']:
                if member['status'] == 'current':
                    current_members.append(member['name'])
                elif member['status'] == 'inactive':
                    inactive_members.append(member['name'])
                elif member['status'] == 'former':
                    former_members.append(member['name'])
            
            # 멤버 정보를 문자열로 변환 (null 처리)
            current_members_str = "; ".join(current_members) if current_members else None
            inactive_members_str = "; ".join(inactive_members) if inactive_members else None
            former_members_str = "; ".join(former_members) if former_members else None
            
            # SNS 링크를 문자열로 변환 (null 처리)
            sns_str = json.dumps(group_info['sns_links'], ensure_ascii=False) if group_info['sns_links'] else None
            
            # 유닛 정보를 문자열로 변환
            units_str = "; ".join(group_info['units']) if group_info['units'] else None
            
            row_data = {
                'group_name_en': group_info['group_name_en'],
                'group_name_kr': group_info['group_name_hangul'] or None,
                'debut_date': group_info['debut_date'] or None,
                'entertainment_name': group_info['entertainment'] or None,
                'entertainment_link': group_info['entertainment_link'] or None,
                'member_current': current_members_str,
                'member_inactive': inactive_members_str,
                'member_former': former_members_str,
                'fandom_name_en': group_info['fandom_name_en'] or None,
                'fandom_name_kr': group_info['fandom_name_kr'] or None,
                'units': units_str,  # 유닛 정보 추가
                'sns': sns_str,
                'disbanded' : group_info['disbanded'],
                'group_type': group_type,
                'update_at': current_time
            }
            
            all_data.append(row_data)
        
        # 데이터가 있든 없든 항상 DataFrame 생성
        df = pd.DataFrame(all_data)
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"CSV 파일이 생성되었습니다: {output_filename}")
        print(f"총 {len(all_data)}개의 그룹이 처리되었습니다.")
        print(f"총 {len(member_pages)}명의 멤버 페이지명이 수집되었습니다.")
        
        return df, member_pages

In [178]:
# 크롤러 객체 생성
crawler = KpopGroupCrawler_memberDict()

- Boy Group

In [192]:
# boy 그룹 내보내기
boy_groups = ["NCT", "BTS", "Stray_Kids", "SEVENTEEN", "NEEZ"]
# 유닛 포함해서 그룹 내 멤버 중복되게 내보내기

df_boy_group, boy_groups_member_pages = crawler.create_csv_from_groups(boy_groups, "./csv_data/groups_info_boy.csv", "Boy Group")

NCT의 유닛 그룹 발견: ['NCT_U', 'NCT_127', 'NCT_DREAM', 'WayV', 'NCT_DOJAEJUNG', 'NCT_WISH']
유닛 그룹 추가: NCT_U
유닛 그룹 추가: NCT_127
유닛 그룹 추가: NCT_DREAM
유닛 그룹 추가: WayV
유닛 그룹 추가: NCT_DOJAEJUNG
유닛 그룹 추가: NCT_WISH
Stray_Kids의 유닛 그룹 발견: ['3RACHA']
유닛 그룹 추가: 3RACHA
SEVENTEEN의 유닛 그룹 발견: ['BSS', 'Jeonghan_X_Wonwoo', 'Hoshi_X_Woozi', 'S.Coups_X_Mingyu']
유닛 그룹 추가: BSS
유닛 그룹 추가: Jeonghan_X_Wonwoo
유닛 그룹 추가: Hoshi_X_Woozi
유닛 그룹 추가: S.Coups_X_Mingyu
WayV의 유닛 그룹 발견: ['WayV-KUN&XIAOJUN', 'WayV-TEN&YANGYANG', 'WayV-LUCAS&HENDERY']
유닛 그룹 추가: WayV-KUN&XIAOJUN
유닛 그룹 추가: WayV-TEN&YANGYANG
유닛 그룹 추가: WayV-LUCAS&HENDERY
CSV 파일이 생성되었습니다: ./csv_data/groups_info_boy.csv
총 19개의 그룹이 처리되었습니다.
총 56명의 멤버 페이지명이 수집되었습니다.


In [193]:
# NCT 멤버 중복되었는지 확인

df_duplicated_check = pd.DataFrame(boy_groups_member_pages)
print(df_duplicated_check.duplicated(subset=['page_name']).sum())  # 중복 개수
print(df_duplicated_check[df_duplicated_check.duplicated(subset=['page_name'], keep=False)])  # 중복된 행 보기

0
Empty DataFrame
Columns: [group_name, page_name, member_name]
Index: []


In [194]:
df_duplicated_check['group_name'].unique()

array(['NCT', 'BTS', 'Stray Kids', 'SEVENTEEN', 'NEEZ'], dtype=object)

In [195]:
df_boy_group.head()

Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name_en,fandom_name_kr,units,sns,disbanded,group_type,update_at
0,NCT,엔시티,,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Johnny; Yuta; Kun; Doyoung; Ten; Jungwoo; Mark...,Taeyong; Jaehyun; Winwin,Taeil; Lucas; Shotaro; Sungchan,NCTzen,엔시티즌,NCT_U; NCT_127; NCT_DREAM; WayV; NCT_DOJAEJUNG...,"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",,Boy Group,2025-09-23 15:31:13
1,BTS,방탄소년단,2013-06-13,BigHit Music,https://kpop.fandom.com/wiki/BigHit_Music,Jin; Suga; J-Hope; RM; Jimin; V; Jung Kook,,,ARMY,아미,,"{""KR"": [{""platform"": ""Blog"", ""href"": ""http://b...",,Boy Group,2025-09-23 15:31:13
2,Stray Kids,스트레이 키즈,2018-03-25,JYP Entertainment,https://kpop.fandom.com/wiki/JYP_Entertainment,Bang Chan; Lee Know; Changbin; Hyunjin; Han; F...,,Woojin,STAY,,3RACHA,"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",,Boy Group,2025-09-23 15:31:13
3,SEVENTEEN,세븐틴,2015-05-26,Pledis Entertainment,https://kpop.fandom.com/wiki/Pledis_Entertainment,S.Coups; Joshua; Jun; DK; Mingyu; The8; Seungk...,Jeonghan; Hoshi; Wonwoo; Woozi,,CARAT,캐럿,BSS; Jeonghan_X_Wonwoo; Hoshi_X_Woozi; S.Coups...,"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",,Boy Group,2025-09-23 15:31:13
4,NEEZ,니즈,2025-02-15,Startree Entertainment,https://kpop.fandom.com/wiki/Startree_Entertai...,g_me; K; Min Seong,,,,,,"{""KR"": [{""platform"": ""Instagram"", ""href"": ""htt...",2025.0,Boy Group,2025-09-23 15:31:13


In [196]:
df_boy_group['group_name_en'].unique()

array(['NCT', 'BTS', 'Stray Kids', 'SEVENTEEN', 'NEEZ', 'NCT U',
       'NCT 127', 'NCT DREAM', 'WayV', 'NCT DOJAEJUNG', 'NCT WISH',
       '3RACHA', 'BSS', 'Jeonghan X Wonwoo', 'Hoshi X Woozi',
       'S.Coups X Mingyu', 'WayV-KUN&XIAOJUN', 'WayV-TEN&YANGYANG',
       'WayV-LUCAS&HENDERY'], dtype=object)

In [197]:
df_boy_group[df_boy_group['group_name_en'] == 'NCT 127']

Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name_en,fandom_name_kr,units,sns,disbanded,group_type,update_at
6,NCT 127,엔시티 127,2016-07-07,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Johnny; Yuta; Doyoung; Jungwoo; Mark; Haechan,Taeyong; Jaehyun; Winwin,Taeil,,,,"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",,Boy Group,2025-09-23 15:31:13


In [198]:
df_boy_group[df_boy_group['group_name_en'] == 'NEEZ']

Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name_en,fandom_name_kr,units,sns,disbanded,group_type,update_at
4,NEEZ,니즈,2025-02-15,Startree Entertainment,https://kpop.fandom.com/wiki/Startree_Entertai...,g_me; K; Min Seong,,,,,,"{""KR"": [{""platform"": ""Instagram"", ""href"": ""htt...",2025,Boy Group,2025-09-23 15:31:13


In [199]:
df_boy_group.isnull().sum()

group_name_en          0
group_name_kr          3
debut_date             2
entertainment_name     0
entertainment_link     0
member_current         3
member_inactive       10
member_former         13
fandom_name_en        12
fandom_name_kr        14
units                 15
sns                    8
disbanded             17
group_type             0
update_at              0
dtype: int64

In [200]:
df_boy_group[df_boy_group['member_current'].isnull()]

Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name_en,fandom_name_kr,units,sns,disbanded,group_type,update_at
13,Jeonghan X Wonwoo,정한X원우,2024-06-17,Pledis Entertainment,https://kpop.fandom.com/wiki/Pledis_Entertainment,,Jeonghan; Wonwoo,,,,,,,Boy Group,2025-09-23 15:31:13
14,Hoshi X Woozi,호시X우지,2025-03-10,Pledis Entertainment,https://kpop.fandom.com/wiki/Pledis_Entertainment,,Hoshi; Woozi,,,,,,,Boy Group,2025-09-23 15:31:13
18,WayV-LUCAS&HENDERY,,,Label V,https://kpop.fandom.com/wiki/Label_V,,,Lucas; Hendery,,,,,c. 2021,Boy Group,2025-09-23 15:31:13


- Girl Group

In [201]:
# girl 그룹 내보내기
girl_groups = ["Red_Velvet", "BLACKPINK", "Aespa", "CrazAngel", "TWICE", "APRIL"]
# 유닛 포함해서 그룹 내 멤버 중복되게 내보내기

df_girl_group, girl_groups_member_pages = crawler.create_csv_from_groups(girl_groups, "./csv_data/groups_info_girl.csv", "Girl Group")

Red_Velvet의 유닛 그룹 발견: ['Red_Velvet_-_Irene_&_Seulgi']
유닛 그룹 추가: Red_Velvet_-_Irene_&_Seulgi
TWICE의 유닛 그룹 발견: ['MISAMO']
유닛 그룹 추가: MISAMO
CSV 파일이 생성되었습니다: ./csv_data/groups_info_girl.csv
총 8개의 그룹이 처리되었습니다.
총 26명의 멤버 페이지명이 수집되었습니다.


In [None]:
df_girl_group.head()

Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name_en,fandom_name_kr,units,sns,group_type,update_at
0,Red Velvet,레드벨벳,2014-08-01,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Irene; Seulgi; Wendy; Joy; Yeri,,,ReVeluv,레베럽,Red_Velvet_-_Irene_&_Seulgi,"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Girl Group,2025-09-23 14:11:54
1,BLACKPINK,블랙핑크,2016-08-08,YG Entertainment,https://kpop.fandom.com/wiki/YG_Entertainment,Jisoo; Jennie; Rosé; Lisa,,,BLINK,블링크,,"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...",Girl Group,2025-09-23 14:11:54
2,Aespa,에스파,2020-11-17,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Karina; Giselle; Winter; Ningning,,,MY,마이,,"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",Girl Group,2025-09-23 14:11:54
3,CrazAngel,크레이즈엔젤,2025-07-10,Forbest Entertainment,,Solmi; Daze; Shannie; Ahon,,,,,,"{""KR"": [{""platform"": ""Instagram"", ""href"": ""htt...",Girl Group,2025-09-23 14:11:54
4,TWICE,트와이스,2015-10-20,JYP Entertainment,https://kpop.fandom.com/wiki/JYP_Entertainment,Nayeon; Jeongyeon; Momo; Sana; Jihyo; Mina; Da...,,,ONCE,원스,MISAMO,"{""KR"": [{""platform"": ""Facebook"", ""href"": ""http...",Girl Group,2025-09-23 14:11:54


In [202]:
df_girl_group['group_name_en'].unique()

array(['Red Velvet', 'BLACKPINK', 'Aespa', 'CrazAngel', 'TWICE', 'APRIL',
       'Red Velvet - Irene & Seulgi', 'MISAMO'], dtype=object)

In [203]:
df_girl_group[df_girl_group['group_name_en'] == 'Red Velvet - Irene & Seulgi']

Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name_en,fandom_name_kr,units,sns,disbanded,group_type,update_at
6,Red Velvet - Irene & Seulgi,레드벨벳-아이린&슬기,2020-07-06,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,Irene; Seulgi,,,,,,,,Girl Group,2025-09-23 15:32:06


In [204]:
df_girl_group[df_girl_group['group_name_en'] == 'APRIL']

Unnamed: 0,group_name_en,group_name_kr,debut_date,entertainment_name,entertainment_link,member_current,member_inactive,member_former,fandom_name_en,fandom_name_kr,units,sns,disbanded,group_type,update_at
5,APRIL,,2015-08-24,DSP Media,https://kpop.fandom.com/wiki/DSP_Media,,,Chaekyung; Somin; Chaewon; Hyunjoo; Naeun; Yen...,FiNEAPPLE,파인에플,,"{""KR"": [{""platform"": ""Daum Cafe"", ""href"": ""htt...","January 28, 2022",Girl Group,2025-09-23 15:32:06


In [205]:
df_girl_group.isnull().sum()

group_name_en         0
group_name_kr         1
debut_date            0
entertainment_name    0
entertainment_link    1
member_current        1
member_inactive       8
member_former         7
fandom_name_en        3
fandom_name_kr        3
units                 6
sns                   2
disbanded             7
group_type            0
update_at             0
dtype: int64

---

- Artist Table

In [207]:
# 설정값
base_url = "https://kpop.fandom.com/api.php"
base_wiki_url = "https://kpop.fandom.com"

def get_birth_name(soup):
    """본명 추출"""
    selectors = [
        '[data-source="birth_name"] .pi-data-value',
        '[data-source="birthname"] .pi-data-value',
        '[data-source="real_name"] .pi-data-value',
        '[data-source="realname"] .pi-data-value'
    ]
    
    for selector in selectors:
        birth_name = soup.select_one(selector)
        if birth_name:
            return birth_name.get_text(strip=True)
    return ""

def get_birth_date_artist(soup):
    """생년월일 추출"""
    birth_block = soup.select_one('[data-source="birth_date"] .pi-data-value')
    if birth_block:
        birth_text = birth_block.get_text(strip=True)
        birth_date_str = birth_text.split("(")[0].strip()
        
        try:
            birth_date = datetime.strptime(birth_date_str, "%B %d, %Y")
            return birth_date.strftime("%Y-%m-%d")
        except ValueError:
            return birth_date_str
    return ""

def _get_entertainment(soup) -> str:
    """소속사명 추출 - 첫 번째 엔터테인먼트 회사"""
    label_block = soup.select_one('[data-source="agency"] .pi-data-value')
    if not label_block:
        return ""
        
    # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그나 텍스트 찾기
    first_b = label_block.find('b')
    if first_b:
        # b 태그 다음의 첫 번째 a 태그 찾기
        a_tag = first_b.find_next('a')
        if a_tag:
            return a_tag.get_text(strip=True)
        
    # b 태그가 없으면 기존 방식
    labels = label_block.get_text("||", strip=True).split("||")
    return labels[0].strip() if labels else ""
    
def _get_entertainment_link(soup) -> str:
    """소속사 링크 추출 - 첫 번째 엔터테인먼트 회사 링크"""
    label_block = soup.select_one('[data-source="agency"] .pi-data-value')
    if not label_block:
        return ""
        
    # 첫 번째 b 태그(국가 라벨) 다음의 첫 번째 a 태그 찾기
    first_b = label_block.find('b')
    if first_b:
        a_tag = first_b.find_next('a', href=True)
        if a_tag:
            href = a_tag['href']
            return href if href.startswith('http') else f"{base_wiki_url}{href}"
        
    # b 태그가 없으면 기존 방식
    labels = label_block.get_text("||", strip=True).split("||")
    if labels:
        first_label = labels[0].strip()
        a_tag = label_block.find('a', string=lambda text: text and text.strip() == first_label)
        if a_tag and a_tag.has_attr('href'):
            href = a_tag['href']
            return href if href.startswith('http') else f"{base_wiki_url}{href}"
        
    return ""
#

def create_artist_csv(member_pages, output_filename="artists_info.csv"):
    """
    멤버 페이지 리스트를 받아서 개별 아티스트 정보를 크롤링하여 CSV로 저장합니다.
    
    Args:
        member_pages (List[Dict]): [{'group_name': '그룹명', 'page_name': '페이지명'}, ...] 형태의 리스트
        output_filename (str): 출력할 CSV 파일명
    
    Returns:
        DataFrame: 아티스트 정보가 담긴 DataFrame
    """
    all_artist_data = []
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    for member_info in member_pages:
        group_name = member_info['group_name']
        page_name = member_info['page_name']
        member_name = member_info['member_name']
        
        print(f"크롤링 중: {group_name} - {page_name}")
        
        try:
            params = {
                "action": "parse",
                "page": page_name,
                "format": "json"
            }
            
            res = requests.get(base_url, params=params)
            res.raise_for_status()
            data = res.json()
            
            if 'parse' not in data:
                try:
                    # API 호출용으로 디코딩
                    decoded_page_name = unquote(page_name)
                    
                    params = {
                        "action": "parse",
                        "page": decoded_page_name,
                        "format": "json"
                    }
                    res = requests.get(base_url, params=params)
                    res.raise_for_status()
                    data = res.json()
                except Exception as e:
                    print(f"페이지를 찾을 수 없습니다: {page_name}")
                    row_data = {
                        'group_name': group_name,
                        'page_name': page_name,
                        'member_name': member_name,
                        'birth_name': None,
                        'birth_date': None,
                        'agency_name': None,
                        'agency_href': None,
                        'update_at': current_time
                    }
                    all_artist_data.append(row_data)
                    continue
            
            html = data["parse"]["text"]["*"]
            soup = BeautifulSoup(html, "html.parser")
            
            # 정보 추출
            birth_name = get_birth_name(soup)
            birth_date = get_birth_date_artist(soup)
            agency_name = _get_entertainment(soup)
            agency_href = _get_entertainment_link(soup)
            
            row_data = {
                'group_name': group_name,
                'page_name': page_name,
                'member_name': member_name,
                'birth_name': birth_name or None,
                'birth_date': birth_date or None,
                'agency_name': agency_name or None,
                'agency_href': agency_href or None,
                'update_at': current_time
            }
            
            all_artist_data.append(row_data)
            
        except Exception as e:
            print(f"오류 ({page_name}): {e}")
            row_data = {
                'group_name': group_name,
                'page_name': page_name,
                'member_name': member_name,
                'birth_name': None,
                'birth_date': None,
                'agency_name': None,
                'agency_href': None,
                'update_at': current_time
            }
            all_artist_data.append(row_data)
    
    # CSV 저장
    df = pd.DataFrame(all_artist_data)
    df.to_csv(output_filename, index=False, encoding='utf-8-sig')
    print(f"Artist CSV 파일이 생성되었습니다: {output_filename}")
    print(f"총 {len(all_artist_data)}명의 아티스트 정보가 수집되었습니다.")
    
    return df

In [208]:
artist_df_boy = create_artist_csv(boy_groups_member_pages, "./csv_data/artists_boy_group.csv")

크롤링 중: NCT - Johnny_(NCT)
크롤링 중: NCT - Yuta
크롤링 중: NCT - Kun_(NCT)
크롤링 중: NCT - Doyoung_(NCT)
크롤링 중: NCT - Ten_(NCT)
크롤링 중: NCT - Jungwoo_(NCT)
크롤링 중: NCT - Mark_(NCT)
크롤링 중: NCT - Xiao_Jun
크롤링 중: NCT - Hendery
크롤링 중: NCT - Renjun
크롤링 중: NCT - Jeno_(NCT)
크롤링 중: NCT - Haechan
크롤링 중: NCT - Jaemin_(NCT)
크롤링 중: NCT - Yangyang
크롤링 중: NCT - Chenle
크롤링 중: NCT - Jisung_(NCT)
크롤링 중: NCT - Sion_(NCT)
크롤링 중: NCT - Riku_(NCT)
크롤링 중: NCT - Yushi
크롤링 중: NCT - Jaehee_(NCT)
크롤링 중: NCT - Ryo_(NCT)
크롤링 중: NCT - Sakuya
크롤링 중: NCT - Taeyong_(NCT)
크롤링 중: NCT - Jaehyun_(NCT)
크롤링 중: NCT - Winwin
크롤링 중: BTS - Jin_(BTS)
크롤링 중: BTS - Suga
크롤링 중: BTS - J-Hope
크롤링 중: BTS - RM
크롤링 중: BTS - Jimin_(BTS)
크롤링 중: BTS - V_(BTS)
크롤링 중: BTS - Jung_Kook
크롤링 중: Stray Kids - Bang_Chan
크롤링 중: Stray Kids - Lee_Know
크롤링 중: Stray Kids - Changbin_(Stray_Kids)
크롤링 중: Stray Kids - Hyunjin_(Stray_Kids)
크롤링 중: Stray Kids - Han_(Stray_Kids)
크롤링 중: Stray Kids - Felix
크롤링 중: Stray Kids - Seungmin_(Stray_Kids)
크롤링 중: Stray Kids - I.N_(St

In [209]:
artist_df_girl = create_artist_csv(girl_groups_member_pages, "./csv_data/artists_girl_group.csv")

크롤링 중: Red Velvet - Irene_(Red_Velvet)
크롤링 중: Red Velvet - Seulgi_(Red_Velvet)
크롤링 중: Red Velvet - Wendy
크롤링 중: Red Velvet - Joy_(Red_Velvet)
크롤링 중: Red Velvet - Yeri_(Red_Velvet)
크롤링 중: BLACKPINK - Jisoo_(BLACKPINK)
크롤링 중: BLACKPINK - Jennie
크롤링 중: BLACKPINK - Ros%C3%A9
크롤링 중: BLACKPINK - Lisa_(BLACKPINK)
크롤링 중: Aespa - Karina_(aespa)
크롤링 중: Aespa - Giselle
크롤링 중: Aespa - Winter_(aespa)
크롤링 중: Aespa - Ningning
크롤링 중: CrazAngel - Solmi_(CrazAngel)
크롤링 중: CrazAngel - Daze
크롤링 중: CrazAngel - Shannie
크롤링 중: CrazAngel - Ahon
크롤링 중: TWICE - Nayeon_(TWICE)
크롤링 중: TWICE - Jeongyeon_(TWICE)
크롤링 중: TWICE - Momo
크롤링 중: TWICE - Sana
크롤링 중: TWICE - Jihyo_(TWICE)
크롤링 중: TWICE - Mina_(TWICE)
크롤링 중: TWICE - Dahyun_(TWICE)
크롤링 중: TWICE - Chaeyoung_(TWICE)
크롤링 중: TWICE - Tzuyu
Artist CSV 파일이 생성되었습니다: ./csv_data/artists_girl_group.csv
총 26명의 아티스트 정보가 수집되었습니다.


- test

In [210]:
artist_df_boy.head()

Unnamed: 0,group_name,page_name,member_name,birth_name,birth_date,agency_name,agency_href,update_at
0,NCT,Johnny_(NCT),Johnny,John Jun Suh[1],1995-02-09,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-23 15:32:50
1,NCT,Yuta,Yuta,Nakamoto Yuta (中本 悠太),1995-10-26,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-23 15:32:50
2,NCT,Kun_(NCT),Kun,Qián Kūn (錢錕),1996-01-01,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-23 15:32:50
3,NCT,Doyoung_(NCT),Doyoung,Kim Dong-yeong (김동영),1996-02-01,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-23 15:32:50
4,NCT,Ten_(NCT),Ten,Chittaphon Leechaiyapornkul(ชิตพล ลี้ชัยพรกุล),1996-02-27,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-23 15:32:50


In [211]:
artist_df_boy['group_name'].unique()

array(['NCT', 'BTS', 'Stray Kids', 'SEVENTEEN', 'NEEZ'], dtype=object)

In [212]:
artist_df_girl.head()

Unnamed: 0,group_name,page_name,member_name,birth_name,birth_date,agency_name,agency_href,update_at
0,Red Velvet,Irene_(Red_Velvet),Irene,Bae Joo-hyun (배주현),1991-03-29,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-23 15:37:49
1,Red Velvet,Seulgi_(Red_Velvet),Seulgi,Kang Seul-gi (강슬기),1994-02-10,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-23 15:37:49
2,Red Velvet,Wendy,Wendy,Shon Seung-wan (손승완),1994-02-21,ASND,https://kpop.fandom.com/wiki/ASND,2025-09-23 15:37:49
3,Red Velvet,Joy_(Red_Velvet),Joy,Park Soo-young (박수영),1996-09-03,SM Entertainment,https://kpop.fandom.com/wiki/SM_Entertainment,2025-09-23 15:37:49
4,Red Velvet,Yeri_(Red_Velvet),Yeri,Kim Ye-rim (김예림),1999-03-05,Blitzway Entertainment,,2025-09-23 15:37:49


In [213]:
artist_df_girl['group_name'].unique()

array(['Red Velvet', 'BLACKPINK', 'Aespa', 'CrazAngel', 'TWICE'],
      dtype=object)