# Process the HTML code into fields

In [6]:
import json
import re
from bs4 import BeautifulSoup

# Bracketed markers that appear in skill text, e.g. "[ホロパワー：-2]" and
# "[ターンに1回]". Compiled once because they are applied to every card.
_HOLO_POWER_COST_RE = re.compile(r'\[ホロパワー：(-\d+)\]')
_SKILL_TIMING_RE = re.compile(r'\[(ターンに1回|ゲームに1回)\]')
# First run of digits in the damage part of an arts title, with an optional
# trailing ASCII '+'.
_ARTS_DAMAGE_RE = re.compile(r'(\d+)([+]?)')


def _image_attrs(img):
    """Return the ``alt`` and ``src`` attributes of an ``<img>`` tag as a dict."""
    return {'alt': img.get('alt', ''), 'src': img.get('src', '')}


def _parse_info_value(key, dd):
    """Convert one ``<dd>`` of the info section into its structured value.

    Returns a dict when the ``<dd>`` contains images, a list of
    ``{'name', 'href'}`` dicts for the タグ entry, and the stripped text
    otherwise.
    """
    imgs = dd.find_all('img')
    if imgs:
        raw_html = str(dd)
        if key == 'バトンタッチ':
            # The number of icons is the information here, so keep the count
            # next to the per-icon details.
            return {
                'count': len(imgs),
                'images': [_image_attrs(img) for img in imgs],
                'raw_html': raw_html,
            }
        if key == '色':
            # Only the first image is used; its alt text is the colour name.
            first = imgs[0]
            return {
                'value': first.get('alt', ''),
                'image': _image_attrs(first),
                'raw_html': raw_html,
            }
        return {
            'images': [_image_attrs(img) for img in imgs],
            'raw_html': raw_html,
        }

    if key == 'タグ':
        return [
            {'name': link.text.strip(), 'href': link.get('href', '')}
            for link in dd.find_all('a')
        ]

    return dd.text.strip()


def _parse_arts(arts_div):
    """Parse one arts ``<div>``; return ``None`` if it has no second ``<p>``."""
    paragraphs = arts_div.find_all('p')
    if len(paragraphs) < 2:
        return None

    content = paragraphs[1]
    name_span = content.find('span')

    # Images nested inside the name span are excluded: the tokkou icons live
    # there and are reported separately under 'tokkou'. Images elsewhere in
    # the paragraph are all kept.
    cost_icons = [
        _image_attrs(img)
        for img in content.find_all('img')
        if name_span is None
        or not any(parent is name_span for parent in img.parents)
    ]

    arts = {
        'full_text': content.text.strip(),
        'cost_icons': cost_icons,
        'raw_html': str(content),
    }

    if name_span is not None:
        # The span text is "<name><full-width space><damage>".
        parts = name_span.text.strip().split('　')
        arts['name'] = parts[0].strip()
        if len(parts) > 1:
            damage_match = _ARTS_DAMAGE_RE.search(parts[1].strip())
            if damage_match:
                arts['damage'] = damage_match.group(0)

        tokkou_span = name_span.find('span', class_='tokkou')
        if tokkou_span is not None:
            tokkou_imgs = tokkou_span.find_all('img')
            if tokkou_imgs:
                arts['tokkou'] = [_image_attrs(img) for img in tokkou_imgs]

    # Effect text is everything in the paragraph except its direct <span>
    # children; the remaining markup is re-parsed to reduce it to plain text.
    effect_html = ''.join(
        str(node) for node in content.contents if node.name != 'span'
    )
    if effect_html.strip():
        arts['effect'] = BeautifulSoup(effect_html, 'html.parser').text.strip()

    return arts


def _parse_keyword(keyword_div):
    """Parse the keyword ``<div>``; return ``None`` if it has no second ``<p>``."""
    paragraphs = keyword_div.find_all('p')
    if len(paragraphs) < 2:
        return None

    content = paragraphs[1]
    keyword = {
        'raw_html': str(content),
        'full_text': content.text.strip(),
    }

    icon = content.find('img')
    if icon is not None:
        keyword['icon'] = _image_attrs(icon)

    name_span = content.find('span')
    if name_span is not None:
        keyword['name'] = name_span.text.strip()
        # Effect is the paragraph text with the first occurrence of the
        # span's text removed.
        keyword['effect'] = content.text.replace(name_span.text, '', 1).strip()

    return keyword


def _parse_skill(skill_div):
    """Parse an oshi / SP oshi skill ``<div>``.

    Returns ``None`` if the div has no second ``<p>``. Name, cost, timing and
    effect are only filled in when the paragraph contains a ``<span>``.
    """
    paragraphs = skill_div.find_all('p')
    if len(paragraphs) < 2:
        return None

    content = paragraphs[1]
    text = content.text.strip()
    skill = {'full_text': text, 'raw_html': str(content)}

    name_span = content.find('span')
    if name_span is None:
        return skill

    name = name_span.text.strip()
    skill['name'] = name

    cost_match = _HOLO_POWER_COST_RE.search(text)
    if cost_match:
        skill['cost'] = cost_match.group(1)

    timing_match = _SKILL_TIMING_RE.search(text)
    if timing_match:
        skill['timing'] = timing_match.group(1)

    # Effect is everything after the first occurrence of the name. partition
    # keeps the whole remainder even when the name is repeated later in the
    # text, and the emptiness guard avoids the ValueError an empty separator
    # would raise.
    if name:
        _, found, remainder = text.partition(name)
        if found:
            skill['effect'] = remainder.strip()

    return skill


def _first_span_text(parent, css_class):
    """Return the stripped text of the first ``<span>`` in ``<p class=css_class>``.

    Returns ``None`` when either the paragraph or the span is missing.
    """
    paragraph = parent.find('p', class_=css_class)
    if paragraph is None:
        return None
    span = paragraph.find('span')
    if span is None:
        return None
    return span.text.strip()


def extract_structured_data(raw_html):
    """Extract structured data from the raw HTML of a card.

    Args:
        raw_html: HTML markup of a single card, parsed with ``html.parser``.

    Returns:
        A dict keyed by the labels found in the markup. Depending on which
        sections are present it contains:

        * one entry per ``<dt>``/``<dd>`` pair inside ``div.info`` (plain
          text, an image dict, or a list of tag links);
        * ``'アーツ'``: list of arts dicts;
        * ``'エクストラ'``: text of the extra section;
        * ``'キーワード'``: keyword dict;
        * ``'推しスキル'`` / ``'SP推しスキル'``: skill dicts;
        * ``'イラストレーター'`` / ``'カードナンバー'``: text values;
        * ``'能力テキスト'``: ability text as plain text.

        Sections that are missing from the markup are simply omitted.

    Note:
        The multi-word class lookups (``'sp arts'``, ``'oshi skill'``,
        ``'sp skill'``) match only when the element's ``class`` attribute is
        exactly that string.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')
    data = {}

    # Basic info: every <dl> under div.info, pairing <dt> labels with <dd>
    # values by position. zip stops at the shorter list, so a label without
    # a value is ignored.
    info_div = soup.find('div', class_='info')
    if info_div is not None:
        for dl in info_div.find_all('dl'):
            for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
                key = dt.text.strip()
                data[key] = _parse_info_value(key, dd)

    # Arts: one entry per div; divs without a content paragraph are skipped
    # rather than contributing an empty dict.
    arts_list = []
    for arts_div in soup.find_all('div', class_='sp arts'):
        arts = _parse_arts(arts_div)
        if arts is not None:
            arts_list.append(arts)
    if arts_list:
        data['アーツ'] = arts_list

    # Extra: the text of the second <p>.
    extra_div = soup.find('div', class_='extra')
    if extra_div is not None:
        extra_paragraphs = extra_div.find_all('p')
        if len(extra_paragraphs) > 1:
            data['エクストラ'] = extra_paragraphs[1].text.strip()

    # Keyword ability.
    keyword_div = soup.find('div', class_='keyword')
    if keyword_div is not None:
        keyword = _parse_keyword(keyword_div)
        if keyword is not None:
            data['キーワード'] = keyword

    # Oshi skill and SP oshi skill share the same markup layout.
    for key, css_class in (('推しスキル', 'oshi skill'),
                           ('SP推しスキル', 'sp skill')):
        skill_div = soup.find('div', class_=css_class)
        if skill_div is not None:
            skill = _parse_skill(skill_div)
            if skill is not None:
                data[key] = skill

    # Illustrator name and card number.
    illustrator_div = soup.find('div', class_='illustrator')
    if illustrator_div is not None:
        illustrator = _first_span_text(illustrator_div, 'ill-name')
        if illustrator is not None:
            data['イラストレーター'] = illustrator

        card_number = _first_span_text(illustrator_div, 'number')
        if card_number is not None:
            data['カードナンバー'] = card_number

    # Ability text: always stored as plain text. This deliberately replaces
    # whatever the generic info loop stored for the same label (for example
    # an image dict when the text contains icons).
    if info_div is not None:
        dt_ability = info_div.find('dt', string='能力テキスト')
        if dt_ability is not None:
            dd_ability = dt_ability.find_next_sibling('dd')
            if dd_ability is not None:
                data['能力テキスト'] = dd_ability.text.strip()

    return data

def process_cards(input_path='cards_raw_html.json',
                  output_path='cards_structured.json'):
    """Process all cards from a raw-HTML JSON file into structured data.

    Args:
        input_path: Path of the JSON file to read. It must contain a list of
            card dicts. Defaults to ``cards_raw_html.json`` in the current
            working directory.
        output_path: Path of the JSON file to write. Defaults to
            ``cards_structured.json`` in the current working directory.

    Returns:
        The list of processed card dicts, in input order. Each dict carries
        over ``id``, ``name``, ``image_url`` and ``image_path`` when present,
        merged with the result of ``extract_structured_data`` when the card
        has a ``raw_html`` entry.

    Side effects:
        Prints one progress line per card and a final summary, and writes the
        result to ``output_path`` as UTF-8 JSON.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        cards_data = json.load(file)

    # Fields copied verbatim from the input card, in output order.
    passthrough_keys = ('id', 'name', 'image_url', 'image_path')

    total = len(cards_data)
    structured_cards = []

    for position, card in enumerate(cards_data, start=1):
        print(f"Processing card {position}/{total}")

        processed_card = {
            key: card[key] for key in passthrough_keys if key in card
        }

        # Extracted fields are merged last, so they win on a key clash with
        # the pass-through fields.
        if 'raw_html' in card:
            processed_card.update(extract_structured_data(card['raw_html']))

        structured_cards.append(processed_card)

    # ensure_ascii=False keeps the Japanese keys and values readable.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(structured_cards, outfile, ensure_ascii=False, indent=2)

    print(f"Processed {len(structured_cards)} cards. Output saved to '{output_path}'")

    return structured_cards

# Run the conversion. As the last expression of the notebook cell, the
# returned list of structured cards is also displayed below the progress log.
process_cards()

Processing card 1/845
Processing card 2/845
Processing card 3/845
Processing card 4/845
Processing card 5/845
Processing card 6/845
Processing card 7/845
Processing card 8/845
Processing card 9/845
Processing card 10/845
Processing card 11/845
Processing card 12/845
Processing card 13/845
Processing card 14/845
Processing card 15/845
Processing card 16/845
Processing card 17/845
Processing card 18/845
Processing card 19/845
Processing card 20/845
Processing card 21/845
Processing card 22/845
Processing card 23/845
Processing card 24/845
Processing card 25/845
Processing card 26/845
Processing card 27/845
Processing card 28/845
Processing card 29/845
Processing card 30/845
Processing card 31/845
Processing card 32/845
Processing card 33/845
Processing card 34/845
Processing card 35/845
Processing card 36/845
Processing card 37/845
Processing card 38/845
Processing card 39/845
Processing card 40/845
Processing card 41/845
Processing card 42/845
Processing card 43/845
Processing card 44/8

[{'id': '565',
  'name': '姫森ルーナ',
  'image_url': 'https://hololive-official-cardgame.com/wp-content/images/cardlist/hBP03/hBP03-001_OSR.png',
  'image_path': 'card_images/hBP03-001_OSR.png',
  'カードタイプ': '推しホロメン',
  'レアリティ': 'OSR',
  '収録商品': 'ブースターパック「エリートスパーク」',
  '色': {'value': '白',
   'image': {'alt': '白', 'src': '/wp-content/images/texticon/type_white.png'},
   'raw_html': '<dd><img alt="白" class="" src="/wp-content/images/texticon/type_white.png"/></dd>'},
  'LIFE': '5',
  '推しスキル': {'full_text': '[ホロパワー：-2]パソコンならわかるのら[ターンに1回]自分のデッキから、カード名に「パソコン」を含むアイテム1枚を公開し、手札に加える。そしてデッキをシャッフルする。',
   'raw_html': '<p>[ホロパワー：-2]<span>パソコンならわかるのら</span>[ターンに1回]自分のデッキから、カード名に「パソコン」を含むアイテム1枚を公開し、手札に加える。そしてデッキをシャッフルする。</p>',
   'name': 'パソコンならわかるのら',
   'cost': '-2',
   'timing': 'ターンに1回',
   'effect': '[ターンに1回]自分のデッキから、カード名に「パソコン」を含むアイテム1枚を公開し、手札に加える。そしてデッキをシャッフルする。'},
  'SP推しスキル': {'full_text': '[ホロパワー：-2]ルーナイト集合[ゲームに1回]自分のセンターホロメンが〈姫森ルーナ〉の時に使える：自分のデッキから、〈ルーナイト〉1～4枚を公開し、自分のホロメンに割り振って付ける。そしてデッキをシャッフルす