In [None]:
import pandas as pd
import json
import os
import sys

# Resolve project paths: the project root is taken to be the parent of the
# current working directory, and all data files live under <root>/data.
ROOT = os.path.split(os.path.abspath(os.getcwd()))[0]
data_path = os.path.join(ROOT, 'data')
one_piece_episodes_path = os.path.join(data_path, 'one_piece_episodes.json')

# Show full cell contents when displaying DataFrames (no column-width truncation).
pd.set_option('display.max_colwidth', None)

In [None]:
import re

# Matches a parenthesised note together with any whitespace directly before it.
_PARENTHETICAL_NOTE = re.compile(r'\s*\(.*?\)')


def clean_character_name(name):
    """Return ``name`` without parenthetical notes and surrounding whitespace.

    Every ``(...)`` group, such as ``(flashback)`` or ``(cover)``, is removed
    along with the whitespace immediately preceding it; the result is then
    stripped of leading and trailing whitespace.
    """
    without_notes = _PARENTHETICAL_NOTE.sub('', name)
    return without_notes.strip()

clean_character_name("Monkey D. Luffy (flashback)")  # Example usage

### Get all names

In [None]:
names = set()

episodes_df = pd.read_json(one_piece_episodes_path)

# Each non-null 'characters' entry is split on "\n" into individual names,
# and every piece is cleaned before being considered.
cleaned_candidates = (
    clean_character_name(raw_name)
    for character_block in episodes_df['characters'].dropna()
    for raw_name in character_block.split("\n")
)
# Keep only names that are still non-empty after cleaning.
names.update(candidate for candidate in cleaned_candidates if candidate)

print(f"Total unique character names: {len(names)}")

### Generate URLs and validate that they exist

In [None]:
import requests
import time
from tqdm import tqdm

base_url = "https://onepiece.fandom.com/wiki/"
# Seconds to wait for a response before giving up; without a timeout a single
# stalled connection would block the whole cell indefinitely.
request_timeout = 10
# Pause between requests, in seconds, to avoid overwhelming the server.
request_delay = 0.2

valid_urls = []       # URLs that answered with HTTP 200
invalid_names = []    # names whose URL answered with any other status code
failed_requests = []  # names whose request raised (connection error, timeout, ...)

scraper_headers = {
    'User-Agent': 'OnePieceRAGBot/1.0 (URL Validator)'
}

print("Validating URLs...")
# A single Session reuses the underlying connection across requests instead of
# opening a new one for every name.
with requests.Session() as session:
    # Iterate in sorted order so the run (and the order of the result lists)
    # is reproducible; plain set iteration order varies between kernels.
    for name in tqdm(sorted(names), desc="Checking valid URLs"):
        # Replace spaces with underscores for the URL
        url_name = name.replace(" ", "_")
        url = f"{base_url}{url_name}"

        try:
            response = session.head(
                url,
                headers=scraper_headers,
                allow_redirects=True,
                timeout=request_timeout,
            )
            if response.status_code == 200:
                valid_urls.append(url)
            else:
                invalid_names.append(name)
                # tqdm.write prints without breaking the progress bar.
                tqdm.write(f"Invalid URL for character: {name} (Status Code: {response.status_code})")
        except requests.RequestException as e:
            # Timeouts are RequestException subclasses, so they land here too.
            failed_requests.append(name)
            tqdm.write(f"Error checking URL for character: {name} (Error: {e})")
        finally:
            # Be polite and avoid overwhelming the server
            time.sleep(request_delay)

print(
    f"Valid URLs: {len(valid_urls)} | "
    f"Invalid names: {len(invalid_names)} | "
    f"Failed requests: {len(failed_requests)}"
)


In [None]:
# Save the valid URLs, one per line.
valid_urls_path = os.path.join(data_path, 'one_piece_characters_urls.txt')
# Write as UTF-8 explicitly: URLs are built from character names, which may
# contain non-ASCII characters, and the platform default encoding may not be
# able to represent them.
with open(valid_urls_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{url}\n" for url in valid_urls)

In [None]:
# Save the names whose URL did not resolve, one per line.
invalid_names_path = os.path.join(data_path, 'one_piece_invalid_character_names.txt')
# Write as UTF-8 explicitly: character names may contain non-ASCII characters,
# and the platform default encoding may not be able to represent them.
with open(invalid_names_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{name}\n" for name in invalid_names)