## Goal
- Scrape the chapter pages of the One Piece Fandom wiki
- Store the extracted data in a JSON structure

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Page used for the exploratory cells below.
url = "https://onepiece.fandom.com/wiki/Chapter_1"

print(f"Scraping {url}...")

try:
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server and the cell never finishes.
    response = requests.get(url, timeout=10)
except requests.exceptions.RequestException as e:
    # Only network-level failures are handled here; anything else is a real
    # bug and should surface as a traceback.
    print(f"An error occurred: {e}")
else:
    if response.status_code == 200:
        print("Successfully fetched the page.")
        soup = BeautifulSoup(response.content, 'html.parser')

        # Dump the whole document to inspect its structure.
        print("HTML content:")
        print(soup.prettify())
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

### Use infobox

In [None]:
try:
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server and the cell never finishes.
    response = requests.get(url, timeout=10)
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
else:
    if response.status_code == 200:
        print("Successfully fetched the page.")
        soup = BeautifulSoup(response.content, 'html.parser')

        # The two containers every later cell reads from.
        infobox = soup.find('aside', class_='portable-infobox')
        main_content = soup.find('div', class_='mw-parser-output')

        print("Infobox content:")
        if infobox:
            print(infobox.prettify())
        else:
            print("No infobox found.")

        print("\nMain content:")
        if main_content:
            print(main_content.prettify())
        else:
            print("No main content found.")
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

### Extract key fields

#### From infobox

In [None]:
# Accumulator for the fields that the following cells extract from the page.
chapter_data = {}
# Bare expression: shows the (still empty) dict as the cell's output.
chapter_data

In [None]:
import re
from dateutil.parser import parse

# Chapter title: the <h2 class="pi-title"> of the infobox.
# Look the tag up first so a missing title does not raise AttributeError.
title_tag = infobox.find('h2', class_='pi-title')
if title_tag is not None:
    chapter_data['chapter_title'] = title_tag.get_text(strip=True)

# Chapter number: the <div> that follows the <h3> label containing "Chapter".
chapter_label = infobox.find('h3', string=re.compile("Chapter"))
chapter_num_div = chapter_label.find_next_sibling('div') if chapter_label is not None else None

if chapter_num_div is not None:
    raw_chapter_str = chapter_num_div.get_text(strip=True)
    try:
        chapter_data['chapter_number'] = int(raw_chapter_str)
    except ValueError:
        print(f"Could not parse chapter number from string: {raw_chapter_str}")

# Release date: the <div> that follows the <h3> label "Release Date:".
release_label = infobox.find('h3', string=re.compile("Release Date:"))
release_date_div = release_label.find_next_sibling('div') if release_label is not None else None

if release_date_div is not None:
    # get raw text
    raw_date_str = release_date_div.get_text(strip=True)
    try:
        # fuzzy=True lets the parser ignore text around the date.
        date_obj = parse(raw_date_str, fuzzy=True)
        chapter_data['release_date'] = date_obj.strftime("%Y-%m-%d")
    except (ValueError, OverflowError):
        # dateutil raises ValueError (ParserError) for unparseable text and
        # OverflowError for out-of-range numbers.
        print(f"Could not parse date from string: {raw_date_str}")
        chapter_data['release_date'] = raw_date_str  # fallback to raw string

chapter_data

#### From main content

In [None]:
main_content = soup.find('div', class_='mw-parser-output')
if main_content:
    # Short summary: first <p> after the <h2> that wraps <span id="Short_Summary">.
    short_summary_heading = main_content.find('span', id='Short_Summary')
    short_summary_h2 = short_summary_heading.find_parent('h2') if short_summary_heading else None
    if short_summary_h2:
        short_summary_paragraph = short_summary_h2.find_next_sibling('p')
        if short_summary_paragraph:
            # get_text(strip=True) strips every text fragment before joining
            # them, which glues link text to the neighbouring words. Collapse
            # whitespace on the full text instead.
            chapter_data['short_summary'] = " ".join(short_summary_paragraph.get_text().split())

    # Long summary: every <p> between the "Long_Summary" <h2> and the next <h2>.
    long_summary_heading = main_content.find('span', id='Long_Summary')
    long_summary_h2 = long_summary_heading.find_parent('h2') if long_summary_heading else None
    if long_summary_h2:
        # initialize list to hold paragraphs
        long_summary_text = []
        for sibling in long_summary_h2.find_next_siblings():
            if sibling.name in ('h1', 'h2'):
                # The next section starts here. find_next_siblings('p') would
                # skip over this heading and keep collecting paragraphs that
                # belong to later sections.
                break
            if sibling.name != 'p':
                # Images, galleries, etc. inside the section carry no summary text.
                continue
            paragraph_text = " ".join(sibling.get_text().split())
            if paragraph_text:
                long_summary_text.append(paragraph_text)
        chapter_data['long_summary'] = " ".join(long_summary_text)

chapter_data

In [None]:
# Show the extracted short summary under its heading (prints None if absent).
print("Short Summary:", chapter_data.get('short_summary'), sep="\n")

In [None]:
# Show the extracted long summary under its heading (prints None if absent).
print("\nLong Summary:", chapter_data.get('long_summary'), sep="\n")

### Notes

In [None]:
try:
    notes_heading = main_content.find('span', id='Chapter_Notes')
    notes_h3 = notes_heading.find_parent('h3') if notes_heading else None
    # Find the <ul> that follows the heading
    ul_tag = notes_h3.find_next_sibling('ul') if notes_h3 else None

    if ul_tag:
        # recursive=False: only top-level items. A nested <li> is already part
        # of its parent's text and would otherwise be listed twice.
        # Whitespace is collapsed on the full text because get_text(strip=True)
        # would glue link text to the neighbouring words.
        notes = [" ".join(li.get_text().split()) for li in ul_tag.find_all('li', recursive=False)]
        chapter_data['notes'] = "\n".join(notes)
    else:
        # Heading or list missing: always set the key so the schema is stable.
        chapter_data['notes'] = None
except Exception as e:
    print(f"An error occurred while extracting notes: {e}")
    chapter_data['notes'] = None

In [None]:
# Bare expression: shows everything collected in chapter_data so far.
chapter_data

### Characters

In [None]:
try:
    chars_heading = main_content.find('span', id='Characters')
    chars_h3 = chars_heading.find_parent('h3') if chars_heading else None
    table_tag = chars_h3.find_next_sibling('table', class_='CharTable') if chars_h3 else None

    if table_tag:
        character_subgroups = {}

        # html.parser does not insert a <tbody> that is absent from the markup,
        # so fall back to the table itself.
        rows = (table_tag.find('tbody') or table_tag).find_all('tr')

        if len(rows) >= 2:  # Check for at least 2 rows (header and data)
            # Get headers from the first row
            headers = [" ".join(th.get_text().split()) for th in rows[0].find_all('th')]

            # Get character data from the second row's td tags
            data_cells = rows[1].find_all('td')

            # Pair each header with its data cell; zip stops at the shorter list.
            for header, cell in zip(headers, data_cells):
                # Collapse whitespace on the full text: get_text(strip=True)
                # would glue a name to any annotation that follows it.
                characters = [" ".join(li.get_text().split()) for li in cell.find_all('li')]
                character_subgroups[header] = characters

        chapter_data['characters'] = character_subgroups
    else:
        # Heading or table missing: always set the key so the schema is stable.
        chapter_data['characters'] = None
except Exception as e:
    print(f"An error occurred while extracting characters: {e}")
    chapter_data['characters'] = None

### Trivia

In [None]:
try:
    trivia_heading = main_content.find('span', id='Trivia')
    trivia_h2 = trivia_heading.find_parent('h2') if trivia_heading else None
    # The trivia entries are the items of the <ul> that follows the heading.
    ul_tag = trivia_h2.find_next_sibling('ul') if trivia_h2 else None

    if ul_tag:
        # recursive=False: only top-level items. A nested <li> is already part
        # of its parent's text and would otherwise be listed twice.
        # Whitespace is collapsed on the full text because get_text(strip=True)
        # would glue link text to the neighbouring words.
        trivia = [" ".join(li.get_text().split()) for li in ul_tag.find_all('li', recursive=False)]
        chapter_data['trivia'] = "\n".join(trivia)
    else:
        # Heading or list missing: always set the key so the schema is stable.
        chapter_data['trivia'] = None
except AttributeError:
    chapter_data['trivia'] = None

chapter_data

### Move parsing into a function

In [None]:
def _clean_text(tag):
    """Return the text of `tag` with every whitespace run collapsed to one space.

    `get_text(strip=True)` strips each text fragment before joining them, which
    glues link text to the words around it. Normalising the untouched text
    keeps the spaces between words.
    """
    return " ".join(tag.get_text().split())


def _infobox_value(infobox, label):
    """Return the <div> following the infobox <h3> whose text matches `label`, or None."""
    label_tag = infobox.find('h3', string=re.compile(label))
    return label_tag.find_next_sibling('div') if label_tag is not None else None


def _parse_infobox(infobox):
    """Return title, number and release date from the infobox; missing values are None."""
    fields = {
        'chapter_title': None,
        'chapter_number': None,
        'release_date': None,
    }
    if infobox is None:
        return fields

    title_tag = infobox.find('h2', class_='pi-title')
    if title_tag is not None:
        fields['chapter_title'] = _clean_text(title_tag) or None

    number_div = _infobox_value(infobox, "Chapter")
    if number_div is not None:
        try:
            fields['chapter_number'] = int(number_div.get_text(strip=True))
        except ValueError:
            pass  # not a plain integer: keep the None default

    date_div = _infobox_value(infobox, "Release Date:")
    if date_div is not None:
        try:
            # fuzzy=True lets the parser ignore text around the date.
            date_obj = parse(_clean_text(date_div), fuzzy=True)
            fields['release_date'] = date_obj.strftime("%Y-%m-%d")
        except (ValueError, OverflowError):
            pass  # unparseable date: keep the None default

    return fields


def _section_heading(main_content, span_id, heading_tag):
    """Return the `heading_tag` element wrapping <span id=span_id>, or None."""
    span = main_content.find('span', id=span_id)
    return span.find_parent(heading_tag) if span is not None else None


def _section_paragraphs(main_content, span_id):
    """Return the joined <p> texts of an <h2> section, or None if it has none."""
    heading = _section_heading(main_content, span_id, 'h2')
    if heading is None:
        return None

    paragraphs = []
    for sibling in heading.find_next_siblings():
        if sibling.name in ('h1', 'h2'):
            break  # the next section starts here
        if sibling.name != 'p':
            # Images and similar elements inside the section carry no summary
            # text, and must not end the section early.
            continue
        text = _clean_text(sibling)
        if text:
            paragraphs.append(text)
    return " ".join(paragraphs) or None


def _section_list(main_content, span_id, heading_tag):
    """Return the top-level items of the <ul> after a heading, one per line, or None."""
    heading = _section_heading(main_content, span_id, heading_tag)
    if heading is None:
        return None
    ul = heading.find_next_sibling('ul')
    if ul is None:
        return None
    # recursive=False: a nested <li> is already part of its parent's text.
    items = [_clean_text(li) for li in ul.find_all('li', recursive=False)]
    return "\n".join(items) or None


def _parse_characters(main_content):
    """Return {column header: {subgroup title: [names]}} from the CharTable, or None."""
    heading = _section_heading(main_content, 'Characters', 'h3')
    if heading is None:
        return None
    table = heading.find_next_sibling('table', class_='CharTable')
    if table is None:
        return None

    # html.parser does not insert a <tbody> that is absent from the markup.
    rows = (table.find('tbody') or table).find_all('tr')
    character_groups = {}
    if len(rows) < 2:  # need a header row and a data row
        return character_groups

    column_titles = [_clean_text(th) for th in rows[0].find_all('th')]
    # zip stops at the shorter list, so headers without a cell are ignored.
    for column_title, cell in zip(column_titles, rows[1].find_all('td')):
        subgroups_in_cell = {}
        subgroup_dls = cell.find_all('dl')

        if subgroup_dls:
            for dl in subgroup_dls:
                dt = dl.find('dt')
                if dt is None:
                    continue  # a <dl> without a <dt> has no subgroup title
                # The member list is either inside the <dl> or the next <ul> sibling.
                character_ul = dl.find('ul') or dl.find_next_sibling('ul')
                if character_ul is not None:
                    subgroups_in_cell[_clean_text(dt)] = [
                        _clean_text(li) for li in character_ul.find_all('li')
                    ]
        else:
            # Simple table without <dl> subgroups: file the names under the header.
            names = [_clean_text(li) for li in cell.find_all('li')]
            if names:
                subgroups_in_cell[column_title] = names

        character_groups[column_title] = subgroups_in_cell

    return character_groups


def parse_chapter(url, headers=None):
    """Fetch one chapter page of the One Piece Fandom wiki and extract its fields.

    Relies on `requests`, `BeautifulSoup`, `re` and `dateutil.parser.parse`
    having been imported by earlier cells.

    Args:
        url: Address of the chapter page.
        headers: Optional dict of HTTP headers passed to `requests.get`.

    Returns:
        None if the request fails or the status code is not 200. Otherwise a
        dict with the keys 'url', 'chapter_title', 'chapter_number',
        'release_date', 'short_summary', 'long_summary', 'chapter_notes',
        'characters' and 'trivia'. All keys are always present; a field that
        cannot be extracted is None.
    """
    # SAFEGUARD: Handle network errors and bad HTTP responses upfront.
    try:
        response = requests.get(url, timeout=10, headers=headers)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred fetching {url}: {e}")
        return None
    if response.status_code != 200:
        print(f"Failed to retrieve the page {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    chapter_data = {
        'url': url,
    }

    # Extract from infobox
    infobox = soup.find('aside', class_='portable-infobox')
    if infobox is not None:
        print("Extracting infobox data...")
    else:
        print("No infobox found.")
    chapter_data.update(_parse_infobox(infobox))

    main_content = soup.find('div', class_='mw-parser-output')
    if main_content is None:
        print("No main content found.")
        # Same keys as the success path ('chapter_notes', not 'notes'), so
        # every record has one schema.
        chapter_data.update(
            short_summary=None,
            long_summary=None,
            chapter_notes=None,
            characters=None,
            trivia=None,
        )
        return chapter_data

    print("Extracting main content data...")
    chapter_data['short_summary'] = _section_paragraphs(main_content, 'Short_Summary')
    chapter_data['long_summary'] = _section_paragraphs(main_content, 'Long_Summary')
    chapter_data['chapter_notes'] = _section_list(main_content, 'Chapter_Notes', 'h3')

    try:
        chapter_data['characters'] = _parse_characters(main_content)
    except (AttributeError, IndexError) as e:
        print(f"An error occurred while extracting characters: {e}")
        chapter_data['characters'] = None

    chapter_data['trivia'] = _section_list(main_content, 'Trivia', 'h2')

    return chapter_data

# Run the function once on the `url` currently in scope (no custom headers).
chapter_data = parse_chapter(url)
# Bare expression: shows the parsed result as the cell's output.
chapter_data

### Test run on 10 chapters

In [None]:
import numpy as np

BASE_URL = "https://onepiece.fandom.com/wiki/"
scraper_headers = {'User-Agent': 'OnePieceRAGBot/1.0 (Learning Project; contact: jfcastaneda.led@gmail.com)'}

# Fixed seed: the same 10 distinct chapter numbers (from 1..1155) on every run.
np.random.seed(42)
CHAPTER_NUMBERS_TO_TEST = np.random.choice(range(1, 1156), size=10, replace=False)

# One wiki URL per sampled chapter number.
test_urls = [
    BASE_URL + f"Chapter_{chapter_num}"
    for chapter_num in CHAPTER_NUMBERS_TO_TEST
]
test_urls



In [None]:
from tqdm import tqdm
import time

# Results of the trial run; one dict per chapter that could be fetched.
all_chapters_data = []

# tqdm wraps the URL list to show a progress bar while scraping.
for url in tqdm(test_urls, desc="Scraping chapters"):
    chapter_data = parse_chapter(url, headers=scraper_headers)

    # parse_chapter returns None when the request fails; keep only real results.
    # NOTE(review): a page that was fetched but yielded chapter_number=None is
    # still appended here — confirm that is intended for the trial run.
    if chapter_data:
        all_chapters_data.append(chapter_data)

    time.sleep(0.5)  # be polite and avoid overwhelming the server


### Save

In [None]:
import os
import sys

import json

# ROOT is the parent of the current working directory; data lives in ROOT/data.
ROOT = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '.'))
DATA_PATH = os.path.join(ROOT, 'data')

# open(..., 'w') does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(DATA_PATH, exist_ok=True)

# ensure_ascii=False keeps non-ASCII characters readable in the file.
with open(os.path.join(DATA_PATH, 'one_piece_chapters.json'), 'w', encoding='utf-8') as f:
    json.dump(all_chapters_data, f, ensure_ascii=False, indent=4)

### Do a FULL RUN

In [None]:
JSON_FILE = os.path.join(DATA_PATH, 'one_piece_chapters.json')
ALL_CHAPTERS = range(0, 1156)  # chapter numbers 0 through 1155

# load existing data to avoid re-scraping
if os.path.exists(JSON_FILE):
    with open(JSON_FILE, 'r', encoding='utf-8') as f:
        all_chapters_data = json.load(f)
    # URLs already on disk (kept for inspection; resuming is keyed on chapter number).
    scraped_urls = {entry['url'] for entry in all_chapters_data if 'url' in entry}
    print(f"Loaded {len(all_chapters_data)} existing entries from {JSON_FILE}.")
else:
    all_chapters_data = []
    scraped_urls = set()
    print("No existing data found. Starting fresh.")


# Create set of chapters scraped so far.
# Entries whose chapter number could not be parsed are stored with None; they
# are left out because sorted() cannot compare None with int.
scraped_chapter_numbers = {
    ch['chapter_number']
    for ch in all_chapters_data
    if ch.get('chapter_number') is not None
}
print(f"Already scraped chapter numbers: {sorted(scraped_chapter_numbers)}")

chapters_to_scrape = [num for num in ALL_CHAPTERS if num not in scraped_chapter_numbers]
print(f"Chapters left to scrape: {len(chapters_to_scrape)}")

In [None]:
# Bare expression: shows how many chapter records are currently held in memory.
len(all_chapters_data)

In [None]:
print("Starting full scrape...")
BASE_URL = "https://onepiece.fandom.com/wiki/"
scraper_headers = {
        'User-Agent': 'OnePieceRAGBot/1.0 (Learning Project; contact: jfcastaneda.led@gmail.com)'
}

# Checkpoints are written here first and then swapped in, so an interrupted
# write can never leave a truncated JSON_FILE that breaks the next resume.
tmp_json_file = JSON_FILE + ".tmp"

for chapter_num in tqdm(chapters_to_scrape, desc="Scraping chapters"):
    url = f"{BASE_URL}Chapter_{chapter_num}"

    chapter_data = parse_chapter(url, headers=scraper_headers)

    # Keep only pages that were fetched AND yielded a chapter number; anything
    # else stays out of the file and is retried on the next run.
    if chapter_data and chapter_data.get('chapter_number') is not None:
        all_chapters_data.append(chapter_data)

        # Save progress after each successful scrape
        with open(tmp_json_file, 'w', encoding='utf-8') as f:
            json.dump(all_chapters_data, f, ensure_ascii=False, indent=4)
        # os.replace overwrites the destination in a single rename step.
        os.replace(tmp_json_file, JSON_FILE)

    time.sleep(0.5)  # be polite and avoid overwhelming the server

print(f"\nScraping complete. Total chapters in file: {len(all_chapters_data)}.")
print(f"Data saved to {JSON_FILE}.")