In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re


def get_decade_links(url, timeout=10):
    """
    Retrieve links to the per-decade pages referenced from a Wikipedia page.

    Every ``<div class="hatnote">`` element (the "Main article: ..." notes) is
    inspected. The first link inside each one is kept when its text contains a
    decade token such as '1990s'. If two hatnotes name the same decade, the
    later one wins.

    Args:
        url (str): The URL of the Wikipedia page.
        timeout (float): Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        dict: A dictionary where keys are decades (e.g., '1990s') and values
        are their corresponding absolute URLs. An empty dict is returned when
        the request fails (the error is printed, not raised).
    """
    try:
        # Only the network call can raise RequestException, so keep the try narrow.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Handle HTTP-related errors (including timeouts)
        print(f"Error: {e}")
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')
    decade_links = {}

    for hatnote in soup.find_all('div', class_='hatnote'):
        link = hatnote.find('a', href=True)  # first anchor that carries an href
        if not link:
            continue

        # Keep only the decade token (e.g., '1990s') from the link text
        decade_match = re.search(r'\b\d{4}s\b', link.text.strip())
        if decade_match:
            # hrefs are joined onto the English Wikipedia host as-is
            decade_links[decade_match.group()] = f"https://en.wikipedia.org{link['href']}"

    return decade_links

In [5]:
# Collect the per-decade list pages linked from the main idol-group list.
url = 'https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups'
decade_links_info = get_decade_links(url)
decade_links_info  # cell output: mapping of decade -> page URL

{'1990s': 'https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(1990s)',
 '2000s': 'https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(2000s)',
 '2010s': 'https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(2010s)',
 '2020s': 'https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(2020s)'}

In [None]:
def clean_text_footnotes(text):
    """
    Strip bracketed annotations such as ``[1]`` or ``(info)`` from a string.

    Square-bracketed spans are removed first, then parenthesised spans; each
    removal is non-greedy and limited to a single line. Surrounding whitespace
    is trimmed afterwards (whitespace left inside the text is not collapsed).

    Args:
        text (str): The input text to clean.

    Returns:
        str: The cleaned text. Falsy input (``None`` or ``''``) is returned unchanged.
    """
    if not text:
        return text

    cleaned = text
    # Order matters: brackets are stripped before parentheses.
    for bracket_pattern in (r'\[.*?\]', r'\(.*?\)'):
        cleaned = re.sub(bracket_pattern, '', cleaned)
    return cleaned.strip()


def _records_from_list_items(container, year):
    """Build one record per non-empty <li> found under ``container`` for ``year``."""
    records = []
    for li in container.find_all('li'):
        group_name = clean_text_footnotes(li.text.strip())  # drop footnotes / brackets
        if not group_name:
            continue  # skip items that are empty once cleaned
        link_tag = li.find('a', href=True)  # first link in the item, if any
        wiki_url = f"https://en.wikipedia.org{link_tag['href']}" if link_tag else None
        records.append({'start_year': year, 'group': group_name, 'wiki_url': wiki_url})
    return records


def parse_kpop_groups_cleaned(url: str, timeout: float = 10.0) -> pd.DataFrame:
    """
    Parses K-pop idol groups by year from a Wikipedia page, removing footnotes and brackets.

    The page is scanned for level-2 headings (class ``mw-heading mw-heading2``)
    whose ``<h2>`` id is all digits; that id is taken as the year. The sibling
    elements after such a heading are walked until the next ``mw-heading``
    element, and every ``<li>`` inside a ``<ul>`` or a ``div-col`` block becomes
    one row.

    Args:
        url (str): Wikipedia page URL.
        timeout (float): Seconds to wait for the server before giving up
            (``requests`` has no default timeout).

    Returns:
        pd.DataFrame: A DataFrame containing 'start_year' (as the string taken
        from the heading id), 'group', and 'wiki_url' (``None`` when the list
        item has no link). An empty DataFrame is returned when the request or
        the parsing fails; the error is printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        records = []
        for heading in soup.find_all(['h2', 'div'], class_='mw-heading mw-heading2'):
            # The heading is either the <h2> itself or a wrapper <div> around it.
            h2_tag = heading if heading.name == 'h2' else heading.find('h2')
            year = h2_tag.get('id') if h2_tag is not None else None
            if not year or not year.isdigit():
                continue

            sibling = heading.find_next_sibling()
            while sibling:
                sibling_classes = sibling.get('class') or []
                # Any following heading element closes the current year's section.
                if sibling.name in ('h2', 'div') and 'mw-heading' in sibling_classes:
                    break

                is_plain_list = sibling.name == 'ul'
                is_column_list = sibling.name == 'div' and 'div-col' in sibling_classes
                if is_plain_list or is_column_list:
                    records.extend(_records_from_list_items(sibling, year))

                sibling = sibling.find_next_sibling()

        return pd.DataFrame(records)

    except requests.RequestException as e:
        print(f"Error fetching the page: {e}")
        return pd.DataFrame()
    except Exception as e:
        # Best-effort scraper: report unexpected markup problems and return nothing.
        print(f"Error parsing the page: {e}")
        return pd.DataFrame()

In [17]:
# Try the parser on the 2010s page on its own before looping over every decade.
url = 'https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(2010s)'
titles = parse_kpop_groups_cleaned(url)
titles  # cell output: one row per listed group (start_year, group, wiki_url)

Unnamed: 0,start_year,group,wiki_url
0,2010,Coed School,https://en.wikipedia.org/wiki/Coed_School
1,2010,DMTN,https://en.wikipedia.org/wiki/DMTN
2,2010,F.Cuz,https://en.wikipedia.org/wiki/F.Cuz
3,2010,Girl's Day,https://en.wikipedia.org/wiki/Girl%27s_Day
4,2010,GD & TOP,https://en.wikipedia.org/wiki/GD_%26_TOP
...,...,...,...
309,2019,Vanner,https://en.wikipedia.org/wiki/Vanner_(band)
310,2019,Verivery,https://en.wikipedia.org/wiki/Verivery
311,2019,We in the Zone,https://en.wikipedia.org/wiki/We_in_the_Zone
312,2019,Wooseok x Kuanlin,https://en.wikipedia.org/wiki/Wooseok_x_Kuanlin


In [18]:
# Scrape every decade page and stack the per-decade tables into one DataFrame.
decade_frames = []
for decade, url in decade_links_info.items():
    print(f"Processing {decade} from {url}...")

    # Get the groups listed for this decade
    decade_df = parse_kpop_groups_cleaned(url)

    if decade_df.empty:
        print(f"No data found for {decade}.")
        continue

    decade_df['decade'] = decade
    decade_frames.append(decade_df)

# Concatenate once at the end instead of growing the frame inside the loop;
# fall back to an empty frame when nothing was scraped.
all_decades_df = pd.concat(decade_frames, ignore_index=True) if decade_frames else pd.DataFrame()
all_decades_df

Processing 1990s from https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(1990s)...
Processing 2000s from https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(2000s)...
Processing 2010s from https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(2010s)...
Processing 2020s from https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(2020s)...


Unnamed: 0,start_year,group,wiki_url,decade
0,1992,Seo Taiji and Boys,https://en.wikipedia.org/wiki/Seo_Taiji_and_Boys,1990s
1,1993,Deux,https://en.wikipedia.org/wiki/Deux_(band),1990s
2,1994,Cool,https://en.wikipedia.org/wiki/Cool_(band),1990s
3,1994,Roo'ra,https://en.wikipedia.org/wiki/Roo%27ra,1990s
4,1994,Two Two,https://en.wikipedia.org/wiki/Two_Two,1990s
...,...,...,...,...
536,2024,Say My Name,https://en.wikipedia.org/wiki/Say_My_Name_(group),2020s
537,2024,TWS,https://en.wikipedia.org/wiki/TWS_(group),2020s
538,2024,Unis,https://en.wikipedia.org/wiki/Unis_(group),2020s
539,2024,Waker,https://en.wikipedia.org/wiki/Waker,2020s


In [27]:
from urllib.parse import unquote

# Derive the page title from the last path segment of the article URL and
# percent-decode it (e.g. 'Roo%27ra' -> "Roo'ra"). List items without a link
# have wiki_url set to None by the parser, so those rows get None here instead
# of raising AttributeError on None.split.
all_decades_df['group_wiki_name'] = all_decades_df['wiki_url'].apply(
    lambda url: unquote(url.split('/')[-1]) if isinstance(url, str) else None
)
all_decades_df

Unnamed: 0,start_year,group,wiki_url,decade,group_wiki_name
0,1992,Seo Taiji and Boys,https://en.wikipedia.org/wiki/Seo_Taiji_and_Boys,1990s,Seo_Taiji_and_Boys
1,1993,Deux,https://en.wikipedia.org/wiki/Deux_(band),1990s,Deux_(band)
2,1994,Cool,https://en.wikipedia.org/wiki/Cool_(band),1990s,Cool_(band)
3,1994,Roo'ra,https://en.wikipedia.org/wiki/Roo%27ra,1990s,Roo'ra
4,1994,Two Two,https://en.wikipedia.org/wiki/Two_Two,1990s,Two_Two
...,...,...,...,...,...
536,2024,Say My Name,https://en.wikipedia.org/wiki/Say_My_Name_(group),2020s,Say_My_Name_(group)
537,2024,TWS,https://en.wikipedia.org/wiki/TWS_(group),2020s,TWS_(group)
538,2024,Unis,https://en.wikipedia.org/wiki/Unis_(group),2020s,Unis_(group)
539,2024,Waker,https://en.wikipedia.org/wiki/Waker,2020s,Waker


In [37]:
# The heading ids scraped from the pages are strings; convert each one to an int.
all_decades_df['start_year'] = all_decades_df['start_year'].map(int)

In [None]:
import os

# to_csv does not create missing folders, so make sure ./data exists first.
os.makedirs('./data', exist_ok=True)
all_decades_df.to_csv('./data/all_kpop_group_name_and_page.csv', index=False, encoding='utf-8')

In [89]:
_YEARS_ACTIVE_PATTERN = re.compile(r"(\d{4})(?:\s*(?:–|-|to)\s*(\d{4}|present))?")


def _parse_year_range(year_text):
    """Parse text such as '2007–2017' or '2021–present' into ``(start, end)``.

    ``end`` is ``None`` when it is absent or not a four-digit year (e.g. 'present').
    Returns ``None`` when the text does not begin with a four-digit year.
    """
    match = _YEARS_ACTIVE_PATTERN.match(year_text)
    if not match:
        return None
    start_year = int(match.group(1))
    end_text = match.group(2)
    end_year = int(end_text) if end_text and end_text.isdigit() else None
    return start_year, end_year


def _find_infobox_cell(infobox, label):
    """Return the <td> beside the first <th> whose string matches ``label`` (case-insensitive), else None."""
    header = infobox.find('th', string=re.compile(label, re.IGNORECASE))
    return header.find_next_sibling('td') if header else None


def fetch_group_background(page_title: str, timeout: float = 10.0) -> dict:
    """
    Fetch the background information of a K-pop group from a Wikipedia page Infobox using the MediaWiki API.

    Two API calls are made: an ``action=query`` call with ``redirects=1`` to
    resolve the final page title, then an ``action=parse`` call to obtain the
    rendered HTML, from which the first ``<table class="infobox">`` is read.

    Args:
        page_title (str): The title of the Wikipedia page for the group.
        timeout (float): Seconds to wait for each API call before giving up
            (``requests`` has no default timeout).

    Returns:
        dict: A dict containing 'start_year', 'end_year', 'company', 'hanja',
            'hangul', and 'members'. Fields that are not present in the infobox
            stay ``None``. 'start_year' is the earliest start among the listed
            active ranges; 'end_year' is the end of the last listed range
            (``None`` for 'present' or an open range).
            If the page or infobox is missing, or any request/parsing error
            occurs, the problem is printed and an empty dict is returned;
            no exception propagates to the caller.
    """
    base_url = 'https://en.wikipedia.org/w/api.php'

    # Resolve redirects to find the actual page title
    query_params = {
        'action': 'query',
        'titles': page_title,
        'redirects': 1,  # Automatically resolve redirects
        'format': 'json'
    }

    try:
        query_response = requests.get(base_url, params=query_params, timeout=timeout)
        query_response.raise_for_status()
        query_data = query_response.json()

        # A single title was requested, so only the first returned page is inspected
        pages = query_data['query']['pages']
        page_id, page_info = next(iter(pages.items()))
        if page_id == '-1':
            raise ValueError(f"Page '{page_title}' not found.")
        resolved_title = page_info.get('title', page_title)  # Fallback to original title if no redirect

        # Fetch and parse the page using the resolved title
        parse_params = {
            'action': 'parse',
            'page': resolved_title,
            'prop': 'text',
            'format': 'json'
        }
        response = requests.get(base_url, params=parse_params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # Parse the returned HTML content
        html_content = data['parse']['text']['*']
        soup = BeautifulSoup(html_content, 'html.parser')
        infobox = soup.find('table', class_='infobox')
        if not infobox:
            raise ValueError("No infobox found on the page.")

        group_info = {
            'start_year': None,
            'end_year': None,
            'company': None,
            'hanja': None,
            'hangul': None,
            'members': None
        }

        # Years active: prefer one range per <li>, otherwise parse the whole cell
        years_data = _find_infobox_cell(infobox, r'Years active')
        if years_data is not None:
            parsed_items = (_parse_year_range(li.get_text(strip=True)) for li in years_data.find_all('li'))
            year_ranges = [year_range for year_range in parsed_items if year_range is not None]

            if not year_ranges:
                single_range = _parse_year_range(years_data.get_text(strip=True))
                if single_range is not None:
                    year_ranges.append(single_range)

            if year_ranges:
                group_info['start_year'] = min(start for start, _ in year_ranges)
                group_info['end_year'] = year_ranges[-1][-1]

        # Company: text of the first link in the "Labels" cell
        label_data = _find_infobox_cell(infobox, r'Labels')
        if label_data is not None:
            first_label = label_data.find('a')
            if first_label:
                group_info['company'] = clean_text_footnotes(first_label.get_text(strip=True))

        # Hanja: all whitespace is removed from the cleaned cell text
        chinese_data = _find_infobox_cell(infobox, r'Hanja')
        if chinese_data is not None:
            group_info['hanja'] = ''.join(clean_text_footnotes(chinese_data.get_text()).split())

        # Hangul name
        hangul_data = _find_infobox_cell(infobox, r'Hangul')
        if hangul_data is not None:
            group_info['hangul'] = clean_text_footnotes(hangul_data.get_text(strip=True))

        # Members: one entry per <li>; a cell without list items yields []
        members_data = _find_infobox_cell(infobox, r'Members')
        if members_data is not None:
            group_info['members'] = [
                clean_text_footnotes(li.get_text(strip=True)) for li in members_data.find_all('li')
            ]

        return group_info

    # Every failure path returns an empty dict so callers can always use .get().
    except requests.RequestException as e:
        print(f"Error fetching the Wikipedia page: {e}")
        return {}
    except ValueError as e:
        print(f"Error parsing the Infobox: {e}")
        return {}
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return {}

In [91]:
# test
print(fetch_group_background('Aespa'))
print(fetch_group_background('Girls\' Generation')) # test 2 stage active years
print(fetch_group_background('Big_Bang_(band)')) # test need redirct

{'start_year': 2020, 'end_year': None, 'company': 'SM', 'hanja': None, 'hangul': None, 'members': ['Karina', 'Giselle', 'Winter', 'Ningning']}
{'start_year': 2007, 'end_year': None, 'company': 'SM', 'hanja': '少女時代', 'hangul': '소녀시대', 'members': ['Taeyeon', 'Sunny', 'Tiffany', 'Hyoyeon', 'Yuri', 'Sooyoung', 'Yoona', 'Seohyun']}
{'start_year': 2006, 'end_year': None, 'company': 'YG', 'hanja': None, 'hangul': None, 'members': ['Taeyang', 'G-Dragon', 'Daesung']}


In [92]:
import ast
import time
import pandas as pd

background_csv_path = 'background_info.csv'


def _parse_cached_members(raw_value):
    """Turn the CSV text of a cached ``bg_members`` cell back into a list.

    ``to_csv`` stores each members list as its Python repr (e.g. "['A', 'B']"),
    so a plain ``read_csv`` would return strings and resumed rows would differ
    in type from freshly fetched ones. Empty cells become ``None``; text that
    is not a Python literal is returned unchanged.
    """
    if not raw_value:
        return None
    try:
        return ast.literal_eval(raw_value)
    except (ValueError, SyntaxError):
        return raw_value


# Resume from a previous run when a cache file exists.
try:
    existing_background_df = pd.read_csv(
        background_csv_path, converters={'bg_members': _parse_cached_members}
    )
    processed_groups = set(existing_background_df['group_wiki_name'])
    background_info_list = existing_background_df.to_dict('records')
    cache_has_header = True
    print(f"Loaded existing background info for {len(processed_groups)} groups.")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # A missing file and a zero-byte file are both treated as "no cache yet".
    processed_groups = set()
    background_info_list = []
    cache_has_header = False
    print("No existing background info found. Starting fresh.")

filtered_df = all_decades_df[all_decades_df['start_year'] >= 2000].copy()

for group_name in filtered_df['group_wiki_name']:
    # Skip rows without a page title and groups already stored in the cache.
    if pd.isna(group_name) or group_name in processed_groups:
        continue

    print(f'Fetching {group_name}...')
    try:
        background = fetch_group_background(group_name)

        group_background = {
            'group_wiki_name': group_name,
            'bg_start_year': background.get('start_year'),
            'bg_end_year': background.get('end_year'),
            'bg_company': background.get('company'),
            'bg_hanja': background.get('hanja'),
            'bg_hangul': background.get('hangul'),
            'bg_members': background.get('members'),
        }
        background_info_list.append(group_background)

        # Append each result immediately so an interrupted run can be resumed.
        # The header is written only if the file does not have one yet; this is
        # tracked explicitly because a header-only cache has zero processed groups.
        pd.DataFrame([group_background]).to_csv(
            background_csv_path, mode='a', index=False, header=not cache_has_header
        )
        cache_has_header = True
        processed_groups.add(group_name)

        # Optional throttle between API calls (currently disabled):
        # time.sleep(0.5)
    except Exception as e:
        print(f"Error fetching background for {group_name}: {e}")


background_df = pd.DataFrame(background_info_list)
# Left join keeps every listed group, even when no background was fetched.
group_info_df = pd.merge(filtered_df, background_df, on='group_wiki_name', how='left')


No existing background info found. Starting fresh.
Fetching Chakra_(group)...
Fetching Papaya_(group)...
Fetching UN_(band)...
Fetching 5tion...
Fetching Jewelry_(group)...
Fetching JtL...
Fetching K'Pop_(band)...
Fetching Kiss_(South_Korean_group)...
Fetching Milk_(South_Korean_group)...
Fetching Black_Beat...
Fetching F-iV...
Fetching Isak_N_Jiyeon...
Fetching Luv_(group)...
Fetching MC_the_Max...
Fetching Noel_(band)...
Fetching Shinvi...
Error parsing the Infobox: No infobox found on the page.
Fetching Sugar_(South_Korean_group)...
Fetching Brown_Eyed_Soul_(band)...
Fetching Take_(band)...
Fetching TVXQ...
Fetching TraxX...
Fetching V.O.S_(band)...
Fetching Gavy_NJ...
Fetching LPG_(South_Korean_group)...
Fetching Paran_(band)...
Fetching SS501...
Fetching Super_Junior...
Fetching The_Grace_(group)...
Fetching 2NB...
Fetching Big_Bang_(band)...
Fetching Brown_Eyed_Girls...
Fetching SeeYa...
Fetching Super_Junior-K.R.Y....
Fetching Untouchable_(band)...
Fetching Baby_Vox_Re.V...
Fetc

In [93]:
# Display the merged table: the listing columns plus the bg_* infobox columns.
group_info_df

Unnamed: 0,start_year,group,wiki_url,decade,group_wiki_name,bg_start_year,bg_end_year,bg_company,bg_hanja,bg_hangul,bg_members
0,2000,Chakra,https://en.wikipedia.org/wiki/Chakra_(group),2000s,Chakra_(group),2000.0,2006.0,,,,"[Hwangbo, Bona, Eun, Ryeowon, Eani]"
1,2000,Papaya,https://en.wikipedia.org/wiki/Papaya_(group),2000s,Papaya_(group),2000.0,,,,,"[Kang Kyoung-ah, Joo Yeun-jung, Cho Hye-kyung,..."
2,2000,UN,https://en.wikipedia.org/wiki/UN_(band),2000s,UN_(band),2000.0,2005.0,NH Planning,,,[]
3,2001,5tion,https://en.wikipedia.org/wiki/5tion,2000s,5tion,2001.0,,,,,"[Il Kwon, Chang Woo, Ju Ho, Jun Young, Jun Ho]"
4,2001,Jewelry,https://en.wikipedia.org/wiki/Jewelry_(group),2000s,Jewelry_(group),2001.0,,Star Empire,,,"[Kim Ye-won, Baby J, Kim Eunjung, Jung Yoo-jin..."
...,...,...,...,...,...,...,...,...,...,...,...
507,2024,Say My Name,https://en.wikipedia.org/wiki/Say_My_Name_(group),2020s,Say_My_Name_(group),2024.0,,,,,"[Hitomi, Mei, Kanny, Soha, Dohee, Junhwi, Seun..."
508,2024,TWS,https://en.wikipedia.org/wiki/TWS_(group),2020s,TWS_(group),2024.0,,Pledis,,,"[Shinyu, Dohoon, Youngjae, Hanjin, Jihoon, Kyu..."
509,2024,Unis,https://en.wikipedia.org/wiki/Unis_(group),2020s,Unis_(group),2024.0,,,,,"[Hyeonju, Nana, Gehlee, Kotoko, Yunha, Elisia,..."
510,2024,Waker,https://en.wikipedia.org/wiki/Waker,2020s,Waker,2024.0,,,,,"[Kohyeon, Kwon Hyeop, Ijun, Leo, Saebyeol, Sebum]"


In [94]:
# Store both infobox year columns as nullable integers: values that cannot be
# read as numbers become missing, and the Int64 dtype keeps the rest as whole
# numbers rather than floats.
for year_column in ('bg_start_year', 'bg_end_year'):
    group_info_df[year_column] = pd.to_numeric(group_info_df[year_column], errors='coerce').astype('Int64')
group_info_df

Unnamed: 0,start_year,group,wiki_url,decade,group_wiki_name,bg_start_year,bg_end_year,bg_company,bg_hanja,bg_hangul,bg_members
0,2000,Chakra,https://en.wikipedia.org/wiki/Chakra_(group),2000s,Chakra_(group),2000,2006,,,,"[Hwangbo, Bona, Eun, Ryeowon, Eani]"
1,2000,Papaya,https://en.wikipedia.org/wiki/Papaya_(group),2000s,Papaya_(group),2000,,,,,"[Kang Kyoung-ah, Joo Yeun-jung, Cho Hye-kyung,..."
2,2000,UN,https://en.wikipedia.org/wiki/UN_(band),2000s,UN_(band),2000,2005,NH Planning,,,[]
3,2001,5tion,https://en.wikipedia.org/wiki/5tion,2000s,5tion,2001,,,,,"[Il Kwon, Chang Woo, Ju Ho, Jun Young, Jun Ho]"
4,2001,Jewelry,https://en.wikipedia.org/wiki/Jewelry_(group),2000s,Jewelry_(group),2001,,Star Empire,,,"[Kim Ye-won, Baby J, Kim Eunjung, Jung Yoo-jin..."
...,...,...,...,...,...,...,...,...,...,...,...
507,2024,Say My Name,https://en.wikipedia.org/wiki/Say_My_Name_(group),2020s,Say_My_Name_(group),2024,,,,,"[Hitomi, Mei, Kanny, Soha, Dohee, Junhwi, Seun..."
508,2024,TWS,https://en.wikipedia.org/wiki/TWS_(group),2020s,TWS_(group),2024,,Pledis,,,"[Shinyu, Dohoon, Youngjae, Hanjin, Jihoon, Kyu..."
509,2024,Unis,https://en.wikipedia.org/wiki/Unis_(group),2020s,Unis_(group),2024,,,,,"[Hyeonju, Nana, Gehlee, Kotoko, Yunha, Elisia,..."
510,2024,Waker,https://en.wikipedia.org/wiki/Waker,2020s,Waker,2024,,,,,"[Kohyeon, Kwon Hyeop, Ijun, Leo, Saebyeol, Sebum]"


In [None]:
import os

# to_csv does not create missing folders, so make sure ./data exists first.
os.makedirs('./data', exist_ok=True)
group_info_df.to_csv('./data/group_info_from_2000.csv', index=False, encoding='utf-8')