In [None]:
import pandas as pd
import json
import os
import sys

# Show full cell contents (e.g. long URLs) instead of truncating them.
pd.set_option('display.max_colwidth', None)

# ROOT is the parent of the current working directory; the data folder lives under it.
ROOT = os.path.dirname(os.path.abspath(os.getcwd()))
data_path = os.path.join(ROOT, 'data')
valid_characters_url = os.path.join(data_path, 'one_piece_characters_urls.txt')

# One URL per line. Blank lines are skipped so that random.choice() can never
# hand an empty string to requests later on. The encoding is explicit so the
# result does not depend on the platform's locale default.
with open(valid_characters_url, 'r', encoding='utf-8') as file:
    valid_characters = [line.strip() for line in file if line.strip()]

print(f"Number of valid characters: {len(valid_characters)}")

In [None]:
import random 

# Pick one character page at random to run the parsing cells against.
url = random.choice(valid_characters)
print("Random URL:", url)

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from dateutil.parser import parse
import json


# Custom User-Agent sent with every request made by this notebook.
scraper_headers = {
    'User-Agent': 'OnePieceRAGBot/1.0 Character Parser - jfcastaneda.led@gmail.com'
}

# requests applies no timeout unless one is given, so an unresponsive server
# would block this cell indefinitely. 30 seconds is a generous upper bound.
response = requests.get(url, headers=scraper_headers, timeout=30)
if response.status_code != 200:
    # Raise a regular exception rather than calling sys.exit(): the failure is
    # reported with a traceback and the message travels with the error.
    raise RuntimeError(f"Failed to retrieve page. Status code: {response.status_code}")

# Parse the page once; the cells below read from `soup` and fill `character_data`.
soup = BeautifulSoup(response.content, 'html.parser')
character_data = {'url': url}
character_data

#### Parse infobox

In [None]:
# Locate the <aside class="portable-infobox"> element of the page.
infobox = soup.find('aside', class_='portable-infobox')
if infobox is None:
    print(f"No infobox found for {url}")
    

#### Name

In [None]:
# The character name is taken from the infobox's main title (<h2 class="pi-title">).
# A missing infobox or missing title surfaces as AttributeError and maps to None.
try:
    title_tag = infobox.find('h2', class_='pi-title')
    name = title_tag.get_text(strip=True)
except AttributeError:
    name = None

character_data['name'] = name

character_data

#### Affiliations

In [None]:
# Extract the character's affiliations as a comma-separated string (or None).
try:
    affiliations = None

    # Pattern 1: an <h3> label followed by a <div> whose links are the affiliation names.
    label_tag = infobox.find('h3', string=re.compile("Affiliations"))
    if label_tag:
        value_tag = label_tag.find_next_sibling('div')
        if value_tag:
            # Drop footnote markers first (same clean-up the Occupation cell does);
            # otherwise their <a> links such as "[1]" are collected as affiliations.
            for sup in value_tag.find_all('sup'):
                sup.decompose()
            # Keep only links that carry visible text (image-only links have none).
            affiliation_names = [
                name
                for name in (link.get_text(strip=True) for link in value_tag.find_all('a'))
                if name
            ]
            affiliations = ", ".join(affiliation_names) if affiliation_names else None

    # Pattern 2 (fallback): the label is a <b> tag and the value is the plain
    # text that follows it, up to the next <b> label.
    if not affiliations:
        label_tag = infobox.find('b', string=re.compile("Affiliations:"))
        if label_tag:
            value_parts = []
            for sibling in label_tag.next_siblings:
                if getattr(sibling, 'name', None) == 'b':
                    break
                if isinstance(sibling, str):
                    cleaned = sibling.strip().replace(':', '').strip()
                    if cleaned:
                        value_parts.append(cleaned)
            # Store None, not "", when the label exists but no text follows it.
            affiliations = " ".join(value_parts) or None

    character_data['affiliations'] = affiliations
except Exception:
    # Best-effort field: any parsing problem (including a missing infobox)
    # leaves it empty, but KeyboardInterrupt/SystemExit are no longer swallowed.
    character_data['affiliations'] = None

character_data

#### Occupation

In [None]:
# Extract the character's occupations as a single string (or None).
# Relies on `infobox` and `character_data` from the cells above.
try:
    occupations = None

    # Pattern 1: an <h3> label followed by a <div> holding the values.
    label_tag = infobox.find('h3', string=re.compile("Occupation"))
    if label_tag:
        value_tag = label_tag.find_next_sibling('div')
        if value_tag:
            # Remove reference markers (e.g. [1], [3]) before reading any text.
            for sup in value_tag.find_all('sup'):
                sup.decompose()

            # Collect every text fragment (including link text). Semicolons are
            # separators, so a fragment that is only ";" becomes empty and must
            # be dropped, otherwise the joined result contains ", , ".
            occupation_list = []
            for text in value_tag.stripped_strings:
                cleaned = text.replace(';', '').strip()
                if cleaned:
                    occupation_list.append(cleaned)

            # Attach qualifiers such as "(former)" to the occupation before them.
            final_list = []
            for item in occupation_list:
                if item.startswith('(') and final_list:
                    final_list[-1] += f" {item}"
                else:
                    final_list.append(item)

            occupations = ", ".join(final_list) if final_list else None

    # Pattern 2 (fallback): the label is a <b> tag and the value is the plain
    # text that follows it, up to the next <b> label.
    if not occupations:
        label_tag = infobox.find('b', string=re.compile("Occupation(s)?:"))
        if label_tag:
            value_parts = []
            for sibling in label_tag.next_siblings:
                if getattr(sibling, 'name', None) == 'b':
                    break  # Stop at the next label
                if isinstance(sibling, str):
                    cleaned = sibling.strip().replace(':', '').strip()
                    if cleaned:
                        value_parts.append(cleaned)
            # Store None, not "", when the label exists but no text follows it.
            occupations = " ".join(value_parts) or None

    character_data['occupations'] = occupations
except Exception:
    # Best-effort field: any parsing problem leaves it empty, but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    character_data['occupations'] = None

character_data

#### Origin

In [None]:

# Extract the character's origin (or None).
try:
    origin = None
    # Pattern 1: dedicated "Origin" section header; take the first data value after it.
    origin_section = infobox.find('h2', string='Origin')
    if origin_section:
        origin_div = origin_section.find_next('div', class_='pi-data-value')
        if origin_div:
            origin = origin_div.get_text(strip=True)

    # Pattern 2 (fallback): standard <h3> label followed by a <div> value.
    if not origin:
        label_tag = infobox.find('h3', string=re.compile("Origin"))
        if label_tag:
            value_tag = label_tag.find_next_sibling('div')
            if value_tag:
                origin = value_tag.get_text(strip=True)

    character_data['origin'] = origin
except Exception:
    # Best-effort field: any parsing problem (including a missing infobox)
    # leaves it empty, but KeyboardInterrupt/SystemExit are no longer swallowed.
    character_data['origin'] = None

character_data

#### Residence

In [None]:
# Extract the character's residence (or None).
try:
    residence = None
    # Pattern 1: dedicated "Residence" section header; take the first data value
    # after it. The result must be stored in `residence`; writing it to `origin`
    # (as the copy-pasted version did) both lost the value and clobbered the
    # Origin cell's variable.
    residence_section = infobox.find('h2', string='Residence')
    if residence_section:
        residence_div = residence_section.find_next('div', class_='pi-data-value')
        if residence_div:
            residence = residence_div.get_text(strip=True)

    # Pattern 2 (fallback): standard <h3> label followed by a <div> value.
    if not residence:
        label_tag = infobox.find('h3', string=re.compile("Residence"))
        if label_tag:
            value_tag = label_tag.find_next_sibling('div')
            if value_tag:
                residence = value_tag.get_text(strip=True)

    character_data['residence'] = residence
except Exception:
    # Best-effort field: any parsing problem leaves it empty.
    character_data['residence'] = None

character_data

#### Birthday

In [None]:
# Extract the character's birthday (or None).
try:
    # Initialise the variable this cell actually reports. Previously `birthday`
    # was only bound when the fallback matched, so a page without the label
    # raised NameError (hidden by the bare except) or reused a stale value left
    # over from an earlier run of the cell.
    birthday = None
    # Pattern 1: dedicated "Birthday" section header; take the first data value after it.
    birthday_section = infobox.find('h2', string='Birthday')
    if birthday_section:
        birthday_div = birthday_section.find_next('div', class_='pi-data-value')
        if birthday_div:
            birthday = birthday_div.get_text(strip=True)

    # Pattern 2 (fallback): standard <h3> label followed by a <div> value.
    if not birthday:
        label_tag = infobox.find('h3', string=re.compile("Birthday"))
        if label_tag:
            value_tag = label_tag.find_next_sibling('div')
            if value_tag:
                birthday = value_tag.get_text(strip=True)

    character_data['birthday'] = birthday
except Exception:
    # Best-effort field: any parsing problem leaves it empty.
    character_data['birthday'] = None

character_data

#### Devil Fruit

In [None]:
# Extract Devil Fruit details as a dict of four fields, or None when no
# English name could be found.
try:
    # Every field starts out unknown.
    devil_fruit_data = dict.fromkeys(('english_name', 'japanese_name', 'meaning', 'type'))

    df_section = infobox.find('h2', class_='pi-header', string='Devil Fruit')

    if df_section:
        # (result key, exact <h3> label text), looked up in this order after the section header.
        field_labels = (
            ('english_name', 'English Name:'),
            ('japanese_name', 'Japanese Name:'),
            ('meaning', 'Meaning:'),
            ('type', 'Type:'),
        )
        for field, label_text in field_labels:
            field_tag = df_section.find_next('h3', string=label_text)
            if field_tag:
                devil_fruit_data[field] = field_tag.find_next_sibling('div').get_text(strip=True)

    # Fallback: a single "Devil Fruit Name" line, used only when the dedicated
    # section did not provide an English name.
    if not devil_fruit_data['english_name']:
        name_label = infobox.find('h3', string=re.compile("Devil Fruit Name"))
        if name_label:
            name_value = name_label.find_next_sibling('div')
            if name_value:
                devil_fruit_data['english_name'] = name_value.get_text(strip=True)

    # Keep the dict only if a name was actually found.
    character_data['devil_fruit'] = devil_fruit_data if devil_fruit_data['english_name'] else None
except Exception as e:
    print(f"An error occurred: {e}")
    character_data['devil_fruit'] = None

character_data

#### Bounty

In [None]:
# Extract the character's bounty as a string of digits (or None).
# Relies on `infobox` and `character_data` from the cells above.
try:
    bounty = None
    # The bounty lives in the element tagged data-source="bounty".
    bounty_container = infobox.find('div', attrs={'data-source': 'bounty'})

    if bounty_container:
        full_text = bounty_container.get_text()

        # First number in the text, with optional thousands separators, e.g.
        # "1,374,000,000". The match must START with a digit: the looser
        # pattern [\d,]+ also matches a lone comma (as in "Unknown, ...") and
        # would store an empty string as the bounty.
        match = re.search(r'(\d[\d,]*)', full_text)

        if match:
            bounty = match.group(1).replace(',', '')

    character_data['bounty'] = bounty
except Exception:
    # Best-effort field: any parsing problem leaves it empty, but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    character_data['bounty'] = None

character_data

In [None]:
# Extract the manga ("Chapter ...") and anime ("Episode ...") debut (each may be None).
try:
    manga_debut, anime_debut = None, None
    # Find the <h3> label whose text contains "Debut".
    label_tag = infobox.find('h3', string=lambda text: text and "Debut" in text.strip())

    if label_tag:
        value_tag = label_tag.find_next_sibling('div')
        if value_tag:
            # Remove reference markers (like [1]) before reading the text.
            for sup in value_tag.find_all('sup'):
                sup.decompose()

            debut_text = value_tag.get_text(strip=True)

            # The two debuts are separated by either a semicolon or a comma.
            for part in re.split(r'[;,]', debut_text):
                part = part.strip()
                if part.startswith("Chapter"):
                    manga_debut = part
                elif part.startswith("Episode"):
                    anime_debut = part

    character_data['manga_debut'] = manga_debut
    character_data['anime_debut'] = anime_debut

except Exception:
    # Best-effort fields: any parsing problem leaves both empty, but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    character_data['manga_debut'] = None
    character_data['anime_debut'] = None

character_data

#### Status

In [None]:
# Extract the character's status (or None).
try:
    status = None
    # Pattern 1: dedicated "Status" section header; take the first data value
    # after it. The result must be stored in `status` (the copy-pasted version
    # wrote it to `origin`).
    status_section = infobox.find('h2', string='Status')
    if status_section:
        status_div = status_section.find_next('div', class_='pi-data-value')
        if status_div:
            status = status_div.get_text(strip=True)

    # Pattern 2 (fallback): standard <h3> label followed by a <div> value.
    # Guard on `status`, not on the unrelated `residence` variable left behind
    # by another cell, which could skip this lookup entirely.
    if not status:
        label_tag = infobox.find('h3', string=re.compile("Status"))
        if label_tag:
            value_tag = label_tag.find_next_sibling('div')
            if value_tag:
                status = value_tag.get_text(strip=True)

    character_data['status'] = status
except Exception:
    # Best-effort field: any parsing problem leaves it empty.
    character_data['status'] = None

character_data

### Test `parse_infobox`

In [None]:
from src.parse_characters import parse_infobox

# Pick a fresh random URL to exercise parse_infobox on.
url = random.choice(valid_characters)
print(f"Random URL: {url}")

# Keep `soup` in sync with `url`. The "main content" cells further down read
# from `soup` and write into `character_data`; without this refresh they would
# attach text from the previously fetched page to the newly chosen character.
response = requests.get(url, headers=scraper_headers, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve page. Status code: {response.status_code}")
soup = BeautifulSoup(response.content, 'html.parser')

url

In [None]:
# Run the imported parse_infobox on the URL chosen above. NOTE: this rebinds
# character_data, replacing the dict built field-by-field in the earlier cells.
# NOTE(review): presumably parse_infobox fetches the page itself — confirm.
character_data = parse_infobox(url)
character_data

### Main content

#### General description

In [None]:
# Extract the introductory description that precedes the first section heading.
main_content = soup.find('div', class_='mw-parser-output')

try:
    description = None

    # --- STRATEGY 1: a dedicated introductory table (class "cs-begin-story") ---
    intro_table = main_content.find('table', class_='cs-begin-story')

    if intro_table:
        # get_text(strip=True) strips every text fragment and concatenates them
        # with no separator, gluing words together around links and bold text
        # ("Luffyis a"). Reading the raw text and collapsing whitespace runs
        # keeps the original word spacing.
        description = " ".join(intro_table.get_text().split()) or None
    else:
        # --- STRATEGY 2 (fallback): paragraphs before the first heading ---
        first_heading = main_content.find(re.compile(r'h[2-6]'))

        if first_heading:
            intro_paragraphs = []
            # find_previous_siblings walks backwards from the heading.
            for p_tag in first_heading.find_previous_siblings('p'):
                # Skip a <p> that wraps the infobox itself.
                if p_tag.find('aside', class_='portable-infobox'):
                    continue
                paragraph_text = " ".join(p_tag.get_text().split())
                if paragraph_text:  # ignore empty spacer paragraphs
                    intro_paragraphs.append(paragraph_text)

            # Restore document order.
            intro_paragraphs.reverse()
            description = " ".join(intro_paragraphs) if intro_paragraphs else None

    character_data['general_description'] = description

except AttributeError:
    # main_content is None when the page has no parser-output container.
    character_data['general_description'] = None

character_data

In [None]:
def extract_section_text(section_id, content=None):
    """
    Return the paragraph text of the section whose heading anchor has the given id.

    Args:
        section_id: The ``id`` of the ``<span>`` inside the section heading
            (e.g. "Appearance").
        content: Optional parsed element to search in. Defaults to the
            notebook-level ``main_content``, so existing calls are unchanged.

    Returns:
        The section's ``<p>`` texts joined by single spaces, or None when the
        section does not exist, has no non-empty paragraphs, or the markup is
        not in the expected shape.

    The section ends at the next heading of the same or a higher level, so an
    <h2> section still includes the paragraphs of its <h3> subsections, while
    an <h3> section no longer runs on into its sibling <h3> sections.
    """
    root = main_content if content is None else content
    try:
        anchor = root.find('span', id=section_id)
        if not anchor:
            return None  # Section does not exist on this page

        # The anchor <span> sits inside the actual heading tag (h2, h3, ...).
        parent_heading = anchor.find_parent(re.compile(r'h[1-6]'))
        if parent_heading is None:
            return None
        section_level = int(parent_heading.name[1])

        paragraphs = []
        for sibling in parent_heading.find_next_siblings():
            tag_name = sibling.name or ''
            if re.fullmatch(r'h[1-6]', tag_name):
                # A heading at the same or a higher level closes the section;
                # deeper headings are subsections and are skipped over.
                if int(tag_name[1]) <= section_level:
                    break
                continue

            if tag_name == 'p':
                # get_text(strip=True) would glue words together around inline
                # tags; collapse whitespace on the raw text instead.
                text = " ".join(sibling.get_text().split())
                if text:
                    paragraphs.append(text)

        return " ".join(paragraphs) if paragraphs else None
    except AttributeError:
        # e.g. `root` is None because the page has no main content container.
        return None


In [None]:
# Pull the "Appearance" section, store it on the record, and display it.
appearance_text = extract_section_text("Appearance")
character_data['appearance'] = appearance_text

appearance_text

In [None]:
# Pull the "Personality" section and store it on the record.
personality_text = extract_section_text("Personality")
character_data['personality'] = personality_text
character_data

In [None]:

# --- Now, use the helper to get each section ---

character_data['appearance'] = extract_section_text("Appearance")
character_data['personality'] = extract_section_text("Personality")
character_data['relationships'] = extract_section_text("Relationships")
character_data['history'] = extract_section_text("History")

# The keys above always exist, holding None when a section is absent, so the
# default of .get(key, '') never applies and None[:100] would raise TypeError.
# Coalesce None to '' before slicing for the preview.
print(f"Appearance: {(character_data.get('appearance') or '')[:100]}...")
print(f"Personality: {(character_data.get('personality') or '')[:100]}...")