In [None]:
import pandas as pd
import json
import os
import sys

# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)

# ROOT is the parent directory of the current working directory; the data
# folder and the URL list are resolved relative to it.
ROOT = os.path.dirname(os.path.abspath(os.getcwd()))
data_path = os.path.join(ROOT, 'data')
valid_characters_url = os.path.join(data_path, 'one_piece_characters_urls.txt')

# Read one URL per line. The encoding is explicit so the result does not depend
# on the platform default, and blank lines are dropped so that an empty string
# can never be picked by random.choice() or handed to the scraper later on.
with open(valid_characters_url, 'r', encoding='utf-8') as file:
    valid_characters = [line.strip() for line in file if line.strip()]

print(f"Number of valid characters: {len(valid_characters)}")

In [None]:
import random 

# Draw one URL at random from valid_characters; the cells below fetch and
# parse this page. No seed is set, so the choice differs between runs.
url = random.choice(valid_characters)
print(f"Random URL: {url}")

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from dateutil.parser import parse
import json


# Identify the scraper to the server with a descriptive User-Agent that
# includes a contact address.
scraper_headers = {
    'User-Agent': 'OnePieceRAGBot/1.0 Character Parser - jfcastaneda.led@gmail.com'
}

# requests has no default timeout: without one a stalled connection would block
# the kernel indefinitely. 30 seconds applies to both connect and read.
response = requests.get(url, headers=scraper_headers, timeout=30)
if response.status_code != 200:
    print(f"Failed to retrieve page. Status code: {response.status_code}")
    sys.exit(1)

# Parse the raw bytes so BeautifulSoup can detect the page encoding itself.
soup = BeautifulSoup(response.content, 'html.parser')

# Record that the infobox cells below fill in field by field.
character_data = {'url': url}
character_data

#### Parse infobox

In [None]:
# Locate the <aside class="portable-infobox"> element; every infobox cell
# below reads from it. A missing infobox is only reported, not raised.
infobox = soup.find('aside', class_='portable-infobox')
if infobox is None:
    print(f"No infobox found for {url}")
    

#### Name

In [None]:
# The infobox's main title (<h2 class="pi-title">) is used as the name.
# A missing infobox or a missing title both result in name = None.
title_tag = infobox.find('h2', class_='pi-title') if infobox is not None else None
character_data['name'] = None if title_tag is None else title_tag.get_text(strip=True)

character_data

#### Affiliations

In [None]:
# Extract the character's affiliations into character_data['affiliations']
# as a comma-separated string, or None when nothing usable is found.
try:
    affiliations = None

    # Pattern 1: an <h3> label followed by a sibling <div> holding the values.
    label_tag = infobox.find('h3', string=re.compile("Affiliations"))
    if label_tag:
        value_tag = label_tag.find_next_sibling('div')
        if value_tag:
            # Drop footnote markers first (as the Occupation cell does);
            # otherwise their links would be collected as affiliations.
            for sup in value_tag.find_all('sup'):
                sup.decompose()
            # The <a> tags carry the affiliation names; skip links without text.
            affiliation_names = [
                name
                for name in (link.get_text(strip=True) for link in value_tag.find_all('a'))
                if name
            ]
            affiliations = ", ".join(affiliation_names) if affiliation_names else None

    # Pattern 2 (fallback): the label sits in a <b> tag and the value is the
    # plain text that follows it, up to the next <b> label.
    if not affiliations:
        label_tag = infobox.find('b', string=re.compile("Affiliations:"))
        if label_tag:
            value_parts = []
            for sibling in label_tag.next_siblings:
                if getattr(sibling, 'name', None) == 'b':
                    break
                if isinstance(sibling, str):
                    cleaned = sibling.strip().replace(':', '').strip()
                    if cleaned:
                        value_parts.append(cleaned)
            # An empty join means "nothing found": store None, not "".
            affiliations = " ".join(value_parts) or None

    character_data['affiliations'] = affiliations
except Exception:
    # Best effort: any parsing problem (e.g. infobox is None) yields None.
    # `Exception` keeps KeyboardInterrupt/SystemExit from being swallowed.
    character_data['affiliations'] = None

character_data

#### Occupation

In [None]:
# Extract the character's occupations into character_data['occupations'] as a
# comma-separated string, or None when nothing usable is found.
# Relies on `infobox` and `character_data` from the cells above.
try:
    occupations = None

    # Pattern 1: an <h3> label followed by a sibling <div> holding the values.
    label_tag = infobox.find('h3', string=re.compile("Occupation"))
    if label_tag:
        value_tag = label_tag.find_next_sibling('div')
        if value_tag:
            # Remove footnote markers (e.g. [1], [3]) before reading the text.
            for sup in value_tag.find_all('sup'):
                sup.decompose()

            # Collect every text piece (including link text). Separator-only
            # pieces such as ";" become empty after cleaning and are skipped,
            # so they cannot produce blank entries like "A, , B".
            occupation_list = []
            for text in value_tag.stripped_strings:
                cleaned = text.replace(';', '').strip()
                if cleaned:
                    occupation_list.append(cleaned)

            # Attach a parenthesised qualifier such as "(former)" to the
            # occupation that precedes it.
            final_list = []
            for item in occupation_list:
                if item.startswith('(') and final_list:
                    final_list[-1] += f" {item}"
                else:
                    final_list.append(item)

            occupations = ", ".join(final_list) if final_list else None

    # Pattern 2 (fallback): the label sits in a <b> tag and the value is the
    # plain text that follows it, up to the next <b> label.
    if not occupations:
        label_tag = infobox.find('b', string=re.compile("Occupation(s)?:"))
        if label_tag:
            value_parts = []
            for sibling in label_tag.next_siblings:
                if getattr(sibling, 'name', None) == 'b':
                    break  # Stop at the next label
                if isinstance(sibling, str):
                    cleaned = sibling.strip().replace(':', '').strip()
                    if cleaned:
                        value_parts.append(cleaned)
            # An empty join means "nothing found": store None, not "".
            occupations = " ".join(value_parts) or None

    character_data['occupations'] = occupations
except Exception:
    # Best effort: any parsing problem (e.g. infobox is None) yields None.
    # `Exception` keeps KeyboardInterrupt/SystemExit from being swallowed.
    character_data['occupations'] = None

character_data

#### Origin

In [None]:

# Extract the character's origin into character_data['origin'] (None if absent).
try:
    origin = None

    # Pattern 1: a dedicated <h2>Origin</h2> section; take the first value
    # <div class="pi-data-value"> that follows it.
    origin_section = infobox.find('h2', string='Origin')
    if origin_section:
        origin_div = origin_section.find_next('div', class_='pi-data-value')
        if origin_div:
            origin = origin_div.get_text(strip=True)

    # Pattern 2 (fallback): an <h3> label followed by a sibling value <div>.
    if not origin:
        label_tag = infobox.find('h3', string=re.compile("Origin"))
        if label_tag:
            value_tag = label_tag.find_next_sibling('div')
            if value_tag:
                origin = value_tag.get_text(strip=True)

    character_data['origin'] = origin
except Exception:
    # Best effort: any parsing problem (e.g. infobox is None) yields None.
    # `Exception` keeps KeyboardInterrupt/SystemExit from being swallowed.
    character_data['origin'] = None

character_data

#### Residence

In [None]:
# Extract the character's residence into character_data['residence']
# (None if absent).
try:
    residence = None

    # Pattern 1: a dedicated <h2>Residence</h2> section; take the first value
    # <div class="pi-data-value"> that follows it. The result is stored in
    # `residence` (it was previously written to `origin`, which discarded the
    # value and clobbered the variable of the Origin cell).
    residence_section = infobox.find('h2', string='Residence')
    if residence_section:
        residence_div = residence_section.find_next('div', class_='pi-data-value')
        if residence_div:
            residence = residence_div.get_text(strip=True)

    # Pattern 2 (fallback): an <h3> label followed by a sibling value <div>.
    if not residence:
        label_tag = infobox.find('h3', string=re.compile("Residence"))
        if label_tag:
            value_tag = label_tag.find_next_sibling('div')
            if value_tag:
                residence = value_tag.get_text(strip=True)

    character_data['residence'] = residence
except Exception:
    # Best effort: any parsing problem (e.g. infobox is None) yields None.
    character_data['residence'] = None

character_data

#### Birthday

In [None]:
# Extract the character's birthday into character_data['birthday']
# (None if absent).
try:
    # Initialise the variable this cell actually stores. Without this, a page
    # with no birthday either raised NameError or silently reused the value
    # left in the kernel by a previously parsed character.
    birthday = None

    # Pattern 1: a dedicated <h2>Birthday</h2> section; take the first value
    # <div class="pi-data-value"> that follows it.
    birthday_section = infobox.find('h2', string='Birthday')
    if birthday_section:
        birthday_div = birthday_section.find_next('div', class_='pi-data-value')
        if birthday_div:
            birthday = birthday_div.get_text(strip=True)

    # Pattern 2 (fallback): an <h3> label followed by a sibling value <div>.
    if not birthday:
        label_tag = infobox.find('h3', string=re.compile("Birthday"))
        if label_tag:
            value_tag = label_tag.find_next_sibling('div')
            if value_tag:
                birthday = value_tag.get_text(strip=True)

    character_data['birthday'] = birthday
except Exception:
    # Best effort: any parsing problem (e.g. infobox is None) yields None.
    character_data['birthday'] = None

character_data

#### Devil Fruit

In [None]:
# Extract Devil Fruit details into character_data['devil_fruit']: a dict with
# english_name / japanese_name / meaning / type, or None when no name is found.
try:
    devil_fruit_data = {
        'english_name': None,
        'japanese_name': None,
        'meaning': None,
        'type': None
    }

    # Primary method: a dedicated "Devil Fruit" header followed by labelled rows.
    df_section = infobox.find('h2', class_='pi-header', string='Devil Fruit')

    if df_section:
        # Output key -> exact <h3> label text looked up after the header.
        field_labels = {
            'english_name': 'English Name:',
            'japanese_name': 'Japanese Name:',
            'meaning': 'Meaning:',
            'type': 'Type:',
        }
        for field, label in field_labels.items():
            label_tag = df_section.find_next('h3', string=label)
            # A label without a value <div> leaves only that field empty
            # instead of raising and discarding the fields already found.
            value_tag = label_tag.find_next_sibling('div') if label_tag else None
            if value_tag:
                devil_fruit_data[field] = value_tag.get_text(strip=True)

    # Fallback method: a single "Devil Fruit Name" row (the original author
    # notes this layout on pages such as Luffy's). Only used when the primary
    # method found no English name.
    if not devil_fruit_data['english_name']:
        label_tag = infobox.find('h3', string=re.compile("Devil Fruit Name"))
        if label_tag:
            value_tag = label_tag.find_next_sibling('div')
            if value_tag:
                devil_fruit_data['english_name'] = value_tag.get_text(strip=True)

    # Only keep the dictionary if a name was actually found.
    character_data['devil_fruit'] = (
        devil_fruit_data if devil_fruit_data['english_name'] else None
    )
except Exception as e:
    print(f"An error occurred: {e}")
    character_data['devil_fruit'] = None

character_data

#### Bounty

In [None]:
# Extract the first bounty figure into character_data['bounty'] as a string of
# digits (e.g. "3000000000"), or None when no figure is present.
# Relies on `infobox` and `character_data` from the cells above.
try:
    bounty = None

    # The infobox marks the bounty row with data-source="bounty".
    bounty_container = infobox.find('div', attrs={'data-source': 'bounty'})

    if bounty_container:
        full_text = bounty_container.get_text()

        # First run of digits with optional thousands separators, e.g.
        # "1,374,000,000". The pattern must start with a digit: the previous
        # `[\d,]+` could match a lone comma in the text and yield an empty
        # bounty string.
        match = re.search(r'\d[\d,]*', full_text)

        if match:
            bounty = match.group(0).replace(',', '')

    character_data['bounty'] = bounty
except Exception:
    # Best effort: any parsing problem (e.g. infobox is None) yields None.
    # `Exception` keeps KeyboardInterrupt/SystemExit from being swallowed.
    character_data['bounty'] = None

character_data

In [None]:
# Extract the manga and anime debut into character_data['manga_debut'] and
# character_data['anime_debut'] (each None when not found).
try:
    manga_debut, anime_debut = None, None

    # Find the <h3> label whose text contains "Debut".
    label_tag = infobox.find('h3', string=lambda text: text and "Debut" in text.strip())

    if label_tag:
        value_tag = label_tag.find_next_sibling('div')
        if value_tag:
            # Remove footnote markers (like [1]) before reading the text.
            for sup in value_tag.find_all('sup'):
                sup.decompose()

            debut_text = value_tag.get_text(strip=True)

            # Entries are separated by either a semicolon or a comma.
            for part in re.split(r'[;,]', debut_text):
                part = part.strip()
                if part.startswith("Chapter"):
                    manga_debut = part
                elif part.startswith("Episode"):
                    anime_debut = part

    character_data['manga_debut'] = manga_debut
    character_data['anime_debut'] = anime_debut

except Exception:
    # Best effort: any parsing problem (e.g. infobox is None) yields None.
    # `Exception` keeps KeyboardInterrupt/SystemExit from being swallowed.
    character_data['manga_debut'] = None
    character_data['anime_debut'] = None

character_data

#### Status

In [None]:
# Extract the character's status into character_data['status'] (None if absent).
try:
    status = None

    # Pattern 1: a dedicated <h2>Status</h2> section; take the first value
    # <div class="pi-data-value"> that follows it. The result is stored in
    # `status` (it was previously written to `origin` and therefore lost).
    status_section = infobox.find('h2', string='Status')
    if status_section:
        status_div = status_section.find_next('div', class_='pi-data-value')
        if status_div:
            status = status_div.get_text(strip=True)

    # Pattern 2 (fallback): an <h3> label followed by a sibling value <div>.
    # Gated on this cell's own result rather than on `residence`, a variable
    # left over from another cell.
    if not status:
        label_tag = infobox.find('h3', string=re.compile("Status"))
        if label_tag:
            value_tag = label_tag.find_next_sibling('div')
            if value_tag:
                status = value_tag.get_text(strip=True)

    character_data['status'] = status
except Exception:
    # Best effort: any parsing problem (e.g. infobox is None) yields None.
    character_data['status'] = None

character_data

### Test `parse_infobox`

In [None]:
from src.parse_characters import parse_infobox

# Draw a new random URL from valid_characters; the following cells fetch this
# page and pass it to parse_infobox.
url = random.choice(valid_characters)
print(f"Random URL: {url}")

# Last expression of the cell: echo the chosen URL as the cell output.
url

In [None]:
# Identify the scraper to the server with a descriptive User-Agent that
# includes a contact address.
scraper_headers = {
    'User-Agent': 'OnePieceRAGBot/1.0 Character Parser - jfcastaneda.led@gmail.com'
}

# requests has no default timeout: without one a stalled connection would block
# the kernel indefinitely. 30 seconds applies to both connect and read.
response = requests.get(url, headers=scraper_headers, timeout=30)
if response.status_code != 200:
    print(f"Failed to retrieve page. Status code: {response.status_code}")
    sys.exit(1)

# Parse the raw bytes so BeautifulSoup can detect the page encoding itself.
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Run the imported parse_infobox on the page fetched above and display its result.
character_data = parse_infobox(soup)
character_data

### Main content

In [None]:
# Identify the scraper to the server with a descriptive User-Agent that
# includes a contact address.
scraper_headers = {
    'User-Agent': 'OnePieceRAGBot/1.0 Character Parser - jfcastaneda.led@gmail.com'
}

# Fetch the page for `url` again so the main-content cells work on a fresh
# soup. requests has no default timeout: without one a stalled connection
# would block the kernel indefinitely.
response = requests.get(url, headers=scraper_headers, timeout=30)
if response.status_code != 200:
    print(f"Failed to retrieve page. Status code: {response.status_code}")
    sys.exit(1)

# Parse the raw bytes so BeautifulSoup can detect the page encoding itself.
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Locate the <div class="mw-parser-output"> element that the section cells
# below read from. A missing container is only reported, not raised.
main_content = soup.find('div', class_='mw-parser-output')
if main_content is None:
    print(f"No main content found for {url}")
    

#### General info

In [None]:
# Start the main-content record with the page URL; later cells add one key per section.
character_main_data = {'url': url}

In [None]:
# Collect the introductory paragraphs: the direct <p> children of the main
# content that appear before the first <h2> section header.
general_info_texts = []

# main_content is None when the page has no parser-output container; in that
# case there is nothing to collect and general_info stays None.
if main_content is not None:
    for element in main_content.find_all(recursive=False):
        if element.name == 'h2':
            break  # Stop at the first main section

        if element.name == 'p':
            # Remove footnote markers before reading the text.
            for sup in element.find_all('sup'):
                sup.decompose()
            # get_text(strip=True) strips every text node and joins them with
            # no separator, gluing words across links ("a" + "pirate" ->
            # "apirate"). Collapsing whitespace on the raw text keeps the
            # original word boundaries.
            paragraph = " ".join(element.get_text().split())
            if paragraph:
                general_info_texts.append(paragraph)

general_info = " ".join(general_info_texts) if general_info_texts else None
character_main_data['general_info'] = general_info
character_main_data

#### Appearance

In [None]:
def parse_section(main_content, section_id_pattern):
    """
    Extract the text of one top-level section from the main content area.

    The section is located through the <span> whose ``id`` matches
    ``section_id_pattern``; its enclosing <h2> is the section heading. Text is
    collected from the <p> and <ul> siblings that follow the heading, up to
    (not including) the next <h2>. Footnote markers (<sup>) are removed from
    the collected elements, which modifies the parsed document in place.

    Args:
        main_content (bs4.element.Tag): The BeautifulSoup tag for the main content area.
            ``None`` is accepted and yields ``None``.
        section_id_pattern (str or re.Pattern): The ID to find in the section's span tag.
            Can be a string or a compiled regex.

    Returns:
        str: The section text with whitespace collapsed to single spaces, or
        None when the section (or its <h2> heading) is missing or empty.
    """
    if main_content is None:
        return None

    # Find the header span tag using the provided ID or regex pattern
    section_header = main_content.find('span', id=section_id_pattern)
    if not section_header:
        return None

    # The span may match outside an <h2> (for example inside a sub-heading);
    # treat that as "section not found" instead of failing on None.
    heading = section_header.find_parent('h2')
    if heading is None:
        return None

    section_texts = []
    for sibling in heading.find_next_siblings():
        # The next <h2> tag marks the end of the current section
        if sibling.name == 'h2':
            break

        if sibling.name == 'p':
            blocks = [sibling]
        elif sibling.name == 'ul':
            # Only the top-level items: a nested <li> is already part of its
            # parent's text, so visiting it separately would emit it twice.
            blocks = sibling.find_all('li', recursive=False)
        else:
            continue

        for block in blocks:
            # Clean out reference tags (e.g., [1], [3]) before getting text
            for sup in block.find_all('sup'):
                sup.decompose()
            # Collapse whitespace on the raw text. get_text(strip=True) would
            # strip every text node and join them with no separator, gluing
            # words that are split across links or other inline tags.
            text = " ".join(block.get_text().split())
            if text:
                section_texts.append(text)

    return " ".join(section_texts) if section_texts else None

In [None]:
# Pull each named section out of the article body. The "Abilities" section id
# carries a suffix, so it is matched by prefix with a regex.
appearance = parse_section(main_content, "Appearance")
personality = parse_section(main_content, "Personality")
history = parse_section(main_content, "History")
abilities = parse_section(main_content, re.compile(r'^Abilities_and'))
relationships = parse_section(main_content, "Relationships")

# Store the results on the record (note the key for relationships is 'relations').
character_main_data.update({
    'appearance': appearance,
    'personality': personality,
    'relations': relationships,
    'history': history,
    'abilities': abilities,
})



In [None]:
# Display the record with the section texts added above.
character_main_data

#### Trivia

In [None]:
# Extract the Trivia section into character_main_data['trivia']: the text of
# the bullet items that follow the "Trivia" <h2>, or None when there are none.
trivia = None

# Locate the section's <span id="Trivia"> and its enclosing <h2>. Either may be
# missing (as may main_content itself); all of these cases leave trivia as None.
trivia_header = main_content.find('span', id='Trivia') if main_content is not None else None
trivia_heading = trivia_header.find_parent('h2') if trivia_header else None

if trivia_heading is not None:
    trivia_texts = []
    for sibling in trivia_heading.find_next_siblings():
        # The next <h2> tag marks the end of the trivia section
        if sibling.name == 'h2':
            break
        # Only unordered lists carry trivia items
        if sibling.name != 'ul':
            continue
        # Only the top-level items: a nested <li> is already part of its
        # parent's text, so visiting it separately would emit it twice.
        for li in sibling.find_all('li', recursive=False):
            # Clean out reference tags (e.g., [81])
            for sup in li.find_all('sup'):
                sup.decompose()
            # Collapse whitespace on the raw text; get_text(strip=True) would
            # glue words that are split across links or other inline tags.
            item = " ".join(li.get_text().split())
            if item:
                trivia_texts.append(item)
    trivia = " ".join(trivia_texts) if trivia_texts else None

character_main_data['trivia'] = trivia

In [None]:
# Display the record, now including the 'trivia' key.
character_main_data

### Combine both into functions and test

In [None]:
from src.parse_characters import parse_infobox, parse_main_content

# Draw a new random URL from valid_characters for the end-to-end check below.
url = random.choice(valid_characters)
print(f"Random URL: {url}")

# Fresh record; the next two cells merge the infobox and main-content results into it.
character_main_data = {'url': url}

# Identify the scraper to the server with a descriptive User-Agent that
# includes a contact address.
scraper_headers = {
    'User-Agent': 'OnePieceRAGBot/1.0 Character Parser - jfcastaneda.led@gmail.com'
}

# requests has no default timeout: without one a stalled connection would block
# the kernel indefinitely. 30 seconds applies to both connect and read.
response = requests.get(url, headers=scraper_headers, timeout=30)
if response.status_code != 200:
    print(f"Failed to retrieve page. Status code: {response.status_code}")
    sys.exit(1)

# Parse the raw bytes so BeautifulSoup can detect the page encoding itself.
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Parse the infobox with the imported function and merge its keys into the record.
infobox_data = parse_infobox(soup)
character_main_data.update(infobox_data)
character_main_data

In [None]:
# Parse the article body with the imported function and merge its keys into the record.
main_content_data = parse_main_content(soup)
character_main_data.update(main_content_data)
character_main_data

### Combined

In [None]:
# NOTE(review): parse_character is imported here but not called in this cell;
# the cell only selects and prints a URL. Presumably a parse_character(url)
# call was intended — confirm.
from src.parse_characters import parse_character

url = random.choice(valid_characters)
print(f"Random URL: {url}")


### Parse characters

In [None]:
from tqdm import tqdm
import time

# Output file of the scrape: one JSON object per line (JSON Lines).
JSON_FILE = os.path.join(data_path, 'one_piece_characters_data.jsonl')

# Resume support: load the records written by earlier runs so their URLs can be
# skipped. all_characters stays empty when the file does not exist yet.
all_characters = []
if os.path.exists(JSON_FILE):
    # Records are written as UTF-8 with ensure_ascii=False, so they must be read
    # back as UTF-8 as well; the platform default encoding may differ.
    with open(JSON_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # Skip empty lines
                try:
                    all_characters.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # A damaged line is reported and skipped, not fatal.
                    print(f"Error parsing line: {line[:100]}... Error: {e}")
                    continue

    # Only dict records with a 'url' key count as already scraped.
    scraped_characters = {
        char['url'] for char in all_characters
        if isinstance(char, dict) and 'url' in char
    }
    print(f"Already scraped {len(scraped_characters)} characters.")
else:
    scraped_characters = set()
    print("Starting fresh scrape.")

# Get characters that haven't been scraped yet
characters_to_scrape = [url for url in valid_characters if url not in scraped_characters]
print(f"Characters left to scrape: {len(characters_to_scrape)}")

In [None]:
from src.parse_characters import parse_character

# NOTE(review): scraper_headers is defined here but not passed to
# parse_character in this cell — confirm whether parse_character sets its own.
scraper_headers = {
    'User-Agent': 'OnePieceRAGBot/1.0 (Learning Project; contact: jfcastaneda.led@gmail.com)'
}

# (url, message) pairs for every failure in this run, kept in memory for inspection.
errors = []
ERROR_FILE = os.path.join(data_path, 'character_scraping_errors.jsonl')

# Pause between characters, in seconds, to be respectful to the server.
REQUEST_DELAY_SECONDS = 0.3


def _record_scrape_error(character_url, message, error_type):
    """Append one failure record for `character_url` to ERROR_FILE as a JSON line."""
    error_data = {
        'url': character_url,
        'error': message,
        'error_type': error_type,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }
    with open(ERROR_FILE, 'a', encoding='utf-8') as f:
        f.write(json.dumps(error_data, ensure_ascii=False) + '\n')


print(f"Starting to parse {len(characters_to_scrape)} remaining characters...")
for character_url in tqdm(characters_to_scrape):
    try:
        # Check if already scraped (extra safety check, e.g. duplicate URLs in the list)
        if character_url in scraped_characters:
            print(f"Skipping already scraped: {character_url}")
            continue

        character_data = parse_character(character_url)

        # parse_character signals failure through an 'error' key in its result.
        if 'error' in character_data:
            error_message = character_data['error']
            print(f"❌ Failed to parse {character_url}: {error_message}")
            errors.append((character_url, error_message))
            _record_scrape_error(character_url, error_message, 'parsing_failed')
        else:
            print(f"✅ Successfully parsed: {character_data.get('name', 'Unknown')} - {character_url}")

            # Save ONLY successful character data. Each record is appended
            # immediately so an interrupted run keeps everything scraped so far.
            with open(JSON_FILE, 'a', encoding='utf-8') as f:
                f.write(json.dumps(character_data, ensure_ascii=False) + '\n')

            # Mark as scraped only after the record is on disk, so a failed
            # write does not leave the URL flagged as done.
            scraped_characters.add(character_url)

    except Exception as e:
        # One bad page must not abort the whole run: log it and move on.
        print(f"❌ Exception parsing {character_url}: {e}")
        errors.append((character_url, str(e)))
        _record_scrape_error(character_url, f'Exception: {str(e)}', 'exception')

    # Be respectful to the server
    time.sleep(REQUEST_DELAY_SECONDS)

