### To parse

- url
- episode number
- episode title (under Japanese information)
- air date (under Japanese information)
- source chapters (under statistics)
- short summary
- long summary
- characters in order of appearance
- anime notes
- Trivia (if available)


In [None]:
import requests
from bs4 import BeautifulSoup
import re
from dateutil.parser import parse
import json

### Request from one piece wiki

In [None]:
import random
# Highest episode number that can be sampled.
MAX_EP = 1142

# Pick one episode number uniformly from 1..MAX_EP (both ends included).
ep_num = random.randrange(1, MAX_EP + 1)
ep_num

In [None]:
# Build the wiki URL for the sampled episode and send an identifying User-Agent.
url = f'https://onepiece.fandom.com/wiki/Episode_{ep_num}'
scraper_headers = {
        'User-Agent': 'OnePieceRAGBot/1.0 (Learning Project; contact: jfcastaneda.led@gmail.com)'
    }

# fetch page
# An explicit timeout keeps the cell from hanging on a stalled connection;
# requests applies no timeout unless one is given.
response = requests.get(url, headers=scraper_headers, timeout=10)
if response.status_code != 200:
    # Keep going so the cells below can still be run, but make it obvious
    # that the parsed document is not a normal episode page.
    print(f"Warning: {url} returned HTTP {response.status_code}")
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Start the record with the page URL; the cells below add one key per parsed field.
episode_data = dict(url=url)
episode_data

### Parse infobox

In [None]:
# The episode metadata is read from the <aside class="portable-infobox"> element.
infobox = soup.find(name='aside', class_='portable-infobox')
if infobox is None:
    print("No infobox found on the page.")

In [None]:
# Display the infobox element found above for manual inspection.
infobox

#### Num div

In [None]:
# episode number
# Some pages do not label the value with an 'Episode #' div, so fall back to
# the first run of digits in the infobox navigation header.
episode_number = None  # stays None when neither lookup yields a number

num_div_standard = infobox.find('div', string = 'Episode #')

if num_div_standard:
    value_div = num_div_standard.find_next_sibling('div')
    if value_div:
        try:
            episode_number = int(value_div.get_text(strip=True))
        except ValueError:
            print("Episode # value is not a plain integer.")
else:
    print("Standard method failed, trying fallback.")
    nav_tag = infobox.find('nav', class_ = 'pi-navigation')
    # Either lookup may come back empty, so guard before reading text.
    span_tag = nav_tag.find('span', class_ = 'nomobile') if nav_tag else None
    if span_tag:
        # extract digits from text
        text = span_tag.get_text(strip=True)
        num_match = re.search(r'\d+', text)
        if num_match:
            episode_number = int(num_match.group())

episode_data['episode_number'] = episode_number

#### Episode title: English

In [None]:
# English title: text of the infobox's <h2 class="pi-title"> element.
# AttributeError covers both a missing infobox and a missing title tag.
try:
    title_tag = infobox.find('h2', class_='pi-title')
    title = title_tag.get_text(separator=' ', strip=True)
except AttributeError:
    print("No title found in the infobox.")
    episode_data['english_title'] = None
else:
    episode_data['english_title'] = title

episode_data

#### Episode title: Japanese

In [None]:
# Romaji title: text of the <div> that follows the <h3> labelled 'Romaji'.
try:
    romaji_label = infobox.find('h3', string='Romaji')
    romaji_div = romaji_label.find_next_sibling('div')
    romaji_text = romaji_div.get_text(separator=' ', strip=True)
except AttributeError:
    # Raised when the infobox, the label, or the value <div> is missing.
    print("No Romaji title found in the infobox.")
    episode_data['romaji_title'] = None
else:
    episode_data['romaji_title'] = romaji_text
episode_data

#### Date 

In [None]:
# Unguarded airdate extraction: raises if the label or value <div> is missing
# or the text is not a parseable date.
airdate_label = infobox.find('h3', string='Airdate')
date_div = airdate_label.find_next_sibling('div')
# Keep only the text before the first '[' (presumably a footnote marker - verify).
raw_date = date_div.get_text(strip=True).partition('[')[0]
episode_data['air_date'] = parse(raw_date).strftime('%Y-%m-%d')


In [None]:
# Guarded airdate extraction: any lookup or parsing failure stores None.
try:
    airdate_label = infobox.find('h3', string='Airdate')
    date_div = airdate_label.find_next_sibling('div')
    # Keep only the text before the first '['.
    raw_date = date_div.get_text(strip=True).partition('[')[0]
    air_date = parse(raw_date).strftime('%Y-%m-%d')
except (AttributeError, ValueError, TypeError):
    air_date = None

episode_data['air_date'] = air_date
episode_data

#### Source chapters

In [None]:
# Source chapters: text of the <div> that follows the <h3> labelled 'Chapters'.
try:
    chapters_label = infobox.find('h3', string='Chapters')
    source_div = chapters_label.find_next_sibling('div')
    # ', ' joins the separate text fragments inside the <div>
    source_chapters = source_div.get_text(separator=', ', strip=True)
except AttributeError:
    source_chapters = None

episode_data['source_chapters'] = source_chapters
episode_data

#### Staff data

In [None]:
# Staff credits: one entry per role under the infobox's 'Episode Credits' section.
# Each role is looked up independently so that one missing label does not
# discard the roles that come after it.
staff_data = {}

credits_heading = infobox.find('h2', string='Episode Credits') if infobox else None
# Find the parent section for all credits
credits_section = credits_heading.find_parent('section') if credits_heading else None

if credits_section is None:
    # Leave staff_data empty when the page has no credits section at all.
    print("No 'Episode Credits' section found in the infobox.")
else:
    # (record key, <h3> label text in the infobox)
    staff_roles = (
        ('screenplay', 'Screenplay'),
        ('art', 'Art'),
        ('animation', 'Animation'),
        ('direction', 'Direction'),
    )
    for role_key, role_label in staff_roles:
        label_tag = credits_section.find('h3', string=role_label)
        value_div = label_tag.find_next_sibling('div') if label_tag else None
        staff_data[role_key] = value_div.get_text(strip=True) if value_div else None

episode_data['staff'] = staff_data
episode_data

## Main content

In [None]:
# Article body container; the section cells below search inside this element.
main_content = soup.find(name='div', class_='mw-parser-output')
main_content

#### Short summary

In [None]:
# Short summary: the <p> elements between the 'Short_Summary' heading and the
# next <h2>, joined with single spaces. Stays None when nothing is found.
short_summary = None
try:
    heading = main_content.find('span', id='Short_Summary')
    if heading:
        parent_heading = heading.find_parent('h2')
        summary_ps = []

        for sibling in parent_heading.find_next_siblings():
            if sibling.name == 'h2':
                break
            # Only collect text from paragraph tags
            if sibling.name == 'p':
                summary_ps.append(sibling.get_text(strip=True))

        # Join once after the loop instead of rebuilding the string per sibling.
        short_summary = " ".join(summary_ps) if summary_ps else None
except AttributeError:
    # main_content or the enclosing <h2> was missing.
    short_summary = None

episode_data['short_summary'] = short_summary

episode_data

#### Long Summary

In [None]:
# Long summary: the <p> elements between the 'Long_Summary' heading and the
# next <h2>, joined with single spaces. None when nothing is found.
long_summary = None
try:
    heading = main_content.find('span', id='Long_Summary')
    if heading:
        section_heading = heading.find_parent(re.compile(r'h[1-6]'))
        paragraphs = []
        for node in section_heading.find_next_siblings():
            tag_name = node.name
            if tag_name == 'h2':  # next top-level section starts here
                break
            if tag_name != 'p':
                continue
            paragraphs.append(node.get_text(strip=True))
        long_summary = ' '.join(paragraphs) if paragraphs else None
except AttributeError:
    # main_content or the enclosing heading tag was missing.
    long_summary = None

episode_data['long_summary'] = long_summary

In [None]:
# Display the record collected so far.
episode_data

### Characters

In [None]:
# Characters: text of the <ul> directly after the section heading, or of the
# first <ul> inside a <div> directly after it; one line per text fragment.
characters = None
try:
    heading = main_content.find('span', id='Characters_in_Order_of_Appearance')
    if heading:
        # The span sits inside the real heading tag (h1-h6).
        section_heading = heading.find_parent(re.compile(r'h[1-6]'))
        # Only the element immediately following the heading is considered.
        following = section_heading.find_next_sibling()

        if not following:
            ul_tag = None
        elif following.name == 'ul':
            ul_tag = following
        elif following.name == 'div':
            ul_tag = following.find('ul')
        else:
            ul_tag = None

        if ul_tag:
            characters = ul_tag.get_text(separator='\n', strip=True)
except AttributeError:
    # main_content or the enclosing heading tag was missing.
    characters = None

episode_data['characters'] = characters

episode_data

#### Anime notes

In [None]:
# Anime notes: top-level <li> items of the first <ul> sibling after the
# 'Anime_Notes' heading, one per line.
anime_notes = None
try:
    heading = main_content.find('span', id='Anime_Notes')
    if heading:
        section_heading = heading.find_parent(re.compile(r'h[1-6]'))
        ul_tag = section_heading.find_next_sibling('ul')
        if ul_tag:
            notes = []
            # recursive=False keeps nested list items inside their parent item
            for li in ul_tag.find_all('li', recursive=False):
                notes.append(li.get_text(strip=True))
            anime_notes = "\n".join(notes)
except AttributeError:
    # main_content or the enclosing heading tag was missing.
    anime_notes = None

episode_data['anime_notes'] = anime_notes

episode_data

### Trivia

In [None]:
# Trivia: top-level <li> items of the first <ul> sibling after the 'Trivia'
# <h2>, with <sup> elements removed, one item per line.
trivia = None
try:
    heading = main_content.find('span', id='Trivia')
    ul_tag = heading.find_parent('h2').find_next_sibling('ul') if heading else None
    if ul_tag:
        # clean out reference tag
        for sup in ul_tag.find_all('sup'):
            sup.decompose()
        trivia_items = []
        for li in ul_tag.find_all('li', recursive=False):
            trivia_items.append(li.get_text(strip=True))
        trivia = '\n'.join(trivia_items) if trivia_items else None
except AttributeError:
    # main_content or the enclosing <h2> was missing.
    trivia = None

episode_data['trivia'] = trivia
episode_data

In [None]:
# Tag names of HTML section headings (h1-h6).
_HEADING_TAG = re.compile(r'^h[1-6]$')

# Record keys filled from the infobox and from the article body; used to blank
# the record when the corresponding container is missing from the page.
_INFOBOX_KEYS = ('episode_number', 'episode_title', 'romaji_title', 'air_date', 'source_chapters')
_CONTENT_KEYS = ('short_summary', 'long_summary', 'characters', 'anime_notes', 'trivia')


def _infobox_field(infobox, label, separator=''):
    """Return the text of the <div> after the <h3> whose string is *label*, or None."""
    label_tag = infobox.find('h3', string=label)
    value_div = label_tag.find_next_sibling('div') if label_tag else None
    if value_div is None:
        return None
    return value_div.get_text(strip=True, separator=separator)


def _parse_episode_number(infobox):
    """Return the episode number from the infobox as an int, or None."""
    num_div = infobox.find('div', string='Episode #')
    if num_div:
        value_div = num_div.find_next_sibling('div')
        if value_div is None:
            return None
        try:
            return int(value_div.get_text(strip=True))
        except ValueError:
            return None

    # Fallback: first run of digits in the infobox navigation header.
    nav_tag = infobox.find('nav', class_='pi-navigation')
    span_tag = nav_tag.find('span', class_='nomobile') if nav_tag else None
    if span_tag is None:
        return None
    num_match = re.search(r'\d+', span_tag.get_text(strip=True))
    return int(num_match.group(0)) if num_match else None


def _parse_air_date(infobox):
    """Return the 'Airdate' infobox value formatted as YYYY-MM-DD, or None."""
    raw_date = _infobox_field(infobox, 'Airdate')
    if raw_date is None:
        return None
    try:
        # Only the text before the first '[' is handed to the date parser.
        return parse(raw_date.split('[')[0]).strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError):
        return None


def _section_heading(main_content, section_id):
    """Return the h1-h6 tag enclosing <span id=section_id>, or None."""
    anchor = main_content.find('span', id=section_id)
    return anchor.find_parent(_HEADING_TAG) if anchor else None


def _section_paragraphs(main_content, section_id):
    """Join the <p> texts that follow a section heading, up to the next h2/h3."""
    heading = _section_heading(main_content, section_id)
    if heading is None:
        return None
    paragraphs = []
    for node in heading.find_next_siblings():
        if node.name in ('h2', 'h3'):
            break
        if node.name == 'p':
            paragraphs.append(node.get_text(strip=True))
    return " ".join(paragraphs) if paragraphs else None


def _section_characters(main_content):
    """Return the character list text (one fragment per line), or None."""
    heading = _section_heading(main_content, 'Characters_in_Order_of_Appearance')
    following = heading.find_next_sibling() if heading else None
    if following is None:
        return None
    # The list is either the next element itself or wrapped in a <div>.
    if following.name == 'ul':
        ul_tag = following
    elif following.name == 'div':
        ul_tag = following.find('ul')
    else:
        ul_tag = None
    return ul_tag.get_text(separator='\n', strip=True) if ul_tag else None


def _section_bullets(main_content, section_id, strip_refs=False):
    """Return the top-level <li> texts of a section's first <ul>, one per line.

    The search stops at the next heading of the same or a higher level, so a
    section without a list does not pick up the list of a later section.
    """
    heading = _section_heading(main_content, section_id)
    if heading is None:
        return None

    level = int(heading.name[1])
    ul_tag = None
    for node in heading.find_next_siblings():
        if _HEADING_TAG.match(node.name) and int(node.name[1]) <= level:
            break
        if node.name == 'ul':
            ul_tag = node
            break
    if ul_tag is None:
        return None

    if strip_refs:
        # Drop <sup> elements so their text does not end up in the items.
        for sup in ul_tag.find_all('sup'):
            sup.decompose()
    items = [li.get_text(strip=True) for li in ul_tag.find_all('li', recursive=False)]
    return "\n".join(items) if items else None


def parse_anime(url, headers=None):
    """
    Fetch and parse a single anime episode page.

    Args:
        url: Address of the episode page.
        headers: Optional HTTP headers passed to ``requests.get``.

    Returns:
        None when the request fails or the response status is not 200.
        Otherwise a dict with the keys ``url``, ``episode_number``,
        ``episode_title``, ``romaji_title``, ``air_date`` (YYYY-MM-DD),
        ``source_chapters``, ``short_summary``, ``long_summary``,
        ``characters``, ``anime_notes`` and ``trivia``. Any field that cannot
        be found on the page is set to None.
    """
    try:
        response = requests.get(url, timeout=10, headers=headers)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred fetching {url}: {e}")
        return None
    if response.status_code != 200:
        # Return None for pages that don't exist (like future episodes)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    episode_data = {'url': url}

    # From infobox
    infobox = soup.find('aside', class_='portable-infobox')
    if infobox:
        episode_data['episode_number'] = _parse_episode_number(infobox)
        title_tag = infobox.find('h2', class_='pi-title')
        episode_data['episode_title'] = (
            title_tag.get_text(strip=True, separator=' ') if title_tag else None
        )
        episode_data['romaji_title'] = _infobox_field(infobox, 'Romaji', separator=' ')
        episode_data['air_date'] = _parse_air_date(infobox)
        episode_data['source_chapters'] = _infobox_field(infobox, 'Chapters', separator=', ')
    else:
        episode_data.update(dict.fromkeys(_INFOBOX_KEYS))

    # From main content
    main_content = soup.find('div', class_='mw-parser-output')
    if main_content:
        episode_data['short_summary'] = _section_paragraphs(main_content, 'Short_Summary')
        episode_data['long_summary'] = _section_paragraphs(main_content, 'Long_Summary')
        episode_data['characters'] = _section_characters(main_content)
        episode_data['anime_notes'] = _section_bullets(main_content, 'Anime_Notes')
        episode_data['trivia'] = _section_bullets(main_content, 'Trivia', strip_refs=True)
    else:
        episode_data.update(dict.fromkeys(_CONTENT_KEYS))

    return episode_data

### Parse episodes and save to JSON

In [None]:
import os

# Name of the output file holding every scraped episode.
JSON_FILE = 'one_piece_episodes.json'

# ROOT is the parent of the current working directory; the data folder sits
# directly beneath it.
ROOT = os.path.split(os.path.abspath(os.getcwd()))[0]
data_path = os.path.join(ROOT, 'data')
JSON_PATH = os.path.join(data_path, JSON_FILE)

In [None]:
# Episode numbers to cover: 1 through 1142 inclusive.
EPISODE_RANGE = range(1, 1143)

# load existing data to avoid re-scraping
# Start from an empty list so both names below are bound on every path
# (file missing, unreadable, or not valid JSON).
all_episodes_data = []
try:
    if os.path.exists(JSON_PATH):
        with open(JSON_PATH, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
        if isinstance(loaded, list):
            all_episodes_data = loaded
        else:
            print(f"Error loading existing data: expected a list in {JSON_PATH}")
except (json.JSONDecodeError, IOError) as e:
    print(f"Error loading existing data: {e}")

# URLs already present in the loaded records; empty when nothing was loaded.
existing_urls = {
    entry['url']
    for entry in all_episodes_data
    if isinstance(entry, dict) and 'url' in entry
}

In [None]:
# Episode numbers already present in the loaded records.
scraped_episode_numbers = {record.get('episode_number') for record in all_episodes_data}
# Remaining numbers, in ascending order.
episodes_to_scrape = sorted(set(EPISODE_RANGE) - scraped_episode_numbers)

print(f"Total episodes to scrape: {len(episodes_to_scrape)}")

In [None]:
import time
from tqdm import tqdm

BASE_URL = 'https://onepiece.fandom.com/wiki/Episode_{}'
scraper_headers = {
            'User-Agent': 'OnePieceRAGBot/1.0 (Learning Project; jfcastaneda.led@gmail.com)'
        }

# Derive the already-scraped URLs from the loaded records here, so the loop
# never depends on a name that may not have been bound by an earlier cell.
existing_urls = {
    entry['url']
    for entry in all_episodes_data
    if isinstance(entry, dict) and 'url' in entry
}

# Make sure the output folder exists before any request is made.
os.makedirs(data_path, exist_ok=True)
# Progress is written to a temporary file and then moved into place, so an
# interrupted write cannot leave a truncated JSON file behind.
tmp_json_path = JSON_PATH + '.tmp'

for ep_num in tqdm(episodes_to_scrape, desc="Scraping Episodes", unit="episode"):
    url = BASE_URL.format(ep_num)
    if url in existing_urls:
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"Skipping already scraped URL: {url}")
        continue

    episode_data = parse_anime(url, headers=scraper_headers)

    # only add if data was successfully fetched
    if episode_data and episode_data.get('episode_number') is not None:
        all_episodes_data.append(episode_data)
        existing_urls.add(url)
        tqdm.write(f"Scraped episode {ep_num}: {url}")

        # Save progress after each successful scrape
        try:
            with open(tmp_json_path, 'w', encoding='utf-8') as f:
                json.dump(all_episodes_data, f, ensure_ascii=False, indent=4)
            os.replace(tmp_json_path, JSON_PATH)
        except OSError as e:
            tqdm.write(f"Error saving data: {e}")
    else:
        # Report skipped episodes instead of dropping them silently.
        tqdm.write(f"No usable data for episode {ep_num}: {url}")

    # Be polite with a short delay
    time.sleep(1)  # 1 second delay between requests

print("Scraping complete.")