In [None]:
import json
import os
import sys
from pathlib import Path


# Resolve the project root (two levels above the current working directory)
# and the folder that holds the raw input files. Nothing is created here.
ROOT = Path.cwd().parent.parent
raw_data_path = ROOT.joinpath("data", "raw")


In [None]:
# Peek at the raw chapter records to see which fields are available.
chapters_filename = "one_piece_chapters.json"
# Decode explicitly as UTF-8 (as the characters loader further down does) so
# the result does not depend on the platform's default text encoding.
with open(raw_data_path / chapters_filename, "r", encoding="utf-8") as f:
    chapters = json.load(f)

# keep the first two chapters as a sample
sample_chapters = chapters[:2]
sample_chapters

In [None]:
# Peek at the raw episode records to see which fields are available.
episodes_filename = "one_piece_episodes.json"
# Decode explicitly as UTF-8 (as the characters loader further down does) so
# the result does not depend on the platform's default text encoding.
with open(raw_data_path / episodes_filename, "r", encoding="utf-8") as f:
    episodes = json.load(f)

# keep the first two episodes as a sample
sample_episodes = episodes[:2]
sample_episodes

In [None]:
# Peek at the raw character records (JSON Lines: one JSON object per line).
characters_filename = "one_piece_characters_data.jsonl"
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(raw_data_path / characters_filename, "r", encoding="utf-8") as f:
    # Blank lines are not valid JSON documents; skip them instead of raising.
    characters = [json.loads(line) for line in f if line.strip()]

# keep the first two characters as a sample
sample_characters = characters[:2]
sample_characters

### Recommended format

#### Chapter

```JSON
{
  "text": "# Title: The Kurozumi Family's Plot\\n## Chapter Number: 965\\n## Release Date: 2019-12-16\\n\\n## Summary\\nAs they sailed with theWhitebeard Piratesfor four years and learned more about the world,OdenandTokifell in love... (and so on, combining short_summary and long_summary)\\n\\n## Chapter Notes\\nTokireveals that her parents were fromWano Country... (and so on)\\n\\n## Characters Appearing\\n- Pirates: Edward Newgate, Marco, Jozu, Vista...\\n- Citizens: Gol D. Roger, Silvers Rayleigh, Scopper Gaban...\\n- Other: Kouzuki Sukiyaki(flashback), Kurozumi Orochi...\\n\\n## Trivia\\nN/A",
  "metadata": {
    "source_type": "chapter",
    "title": "The Kurozumi Family's Plot",
    "number": 965,
    "url": "https://onepiece.fandom.com/wiki/Chapter_965"
  }
}
```

#### Episode

```JSON
{
  "text": "# Title: I'm Luffy! The Man Who Will Become the Pirate King!\\n## Episode Number: 1\\n## Air Date: 1999-10-20\\n## Source Chapters: Chapter 2, (p. 4-23)\\n\\n## Summary\\nLuffyis found floating at sea by acruise ship... (and so on, combining short_summary and long_summary)\\n\\n## Characters Appearing\\nNami, Alvida, Koby, Heppoko, Peppoko, Poppoko, Monkey D. Luffy, Roronoa Zoro\\n\\n## Anime Notes\\nThis andthe second episodeuse a unique end card... (and so on)\\n\\n## Trivia\\nN/A",
  "metadata": {
    "source_type": "episode",
    "title": "I'm Luffy! The Man Who Will Become the Pirate King!",
    "number": 1,
    "url": "https://onepiece.fandom.com/wiki/Episode_1"
  }
}
```

#### Character

```JSON
{
  "text": "# Name: Oide\\n## Affiliations: N/A\\n## Occupations: N/A\\n## Residence: Sphinx\\n## Status: Alive\\n## Manga Debut: Chapter 909\\n## Anime Debut: Episode 890\\n\\n## General Info\\nOideis a little girl who lives inSphinx. She is the owner ofTama.\\n\\n## Appearance\\nOide is a little girl with blushy cheeks, brown pigtails and brown eyes... (and so on)\\n\\n## Personality\\nOide is a cheerful girl who loves her pet, Tama... (and so on)\\n\\n## History\\nIn Sphinx, Oide brought her pet Tama to Marco so it could be treated... (and so on)\\n\\n## Abilities\\nN/A\\n\\n## Trivia\\nOide's name comes from the verb 'come over'(おいで,oide).",
  "metadata": {
    "source_type": "character",
    "name": "Oide",
    "url": "https://onepiece.fandom.com/wiki/Oide"
  }
}
```

## Preprocessing

### Chapters preprocessing

In [None]:
import json
import pandas as pd

def format_chapter_characters(char_data):
    """
    Format the nested character mapping of a chapter into a readable string.

    Args:
        char_data: Mapping of category -> group. A group is either a mapping
            of subcategory -> members, or the members themselves. Members are
            an iterable of names or a single name string.

    Returns:
        One entry per non-empty group, written as
        "category - subcategory: a, b" (nested) or "category: a, b" (flat).
        Entries are joined with the two-character sequence backslash + "n"
        (not a real line break). Returns "N/A" when char_data is not a dict
        or contains no members.
    """

    if not isinstance(char_data, dict):
        return "N/A"

    def _join_members(members):
        # A bare string is a single name; joining it directly would split it
        # into individual characters.
        if isinstance(members, str):
            return members
        return ", ".join(str(member) for member in members)

    full_character_list = []

    for category, subcategories in char_data.items():
        if isinstance(subcategories, dict):
            for subcat, members in subcategories.items():
                if members:  # skip empty groups
                    full_character_list.append(
                        f"{category} - {subcat}: {_join_members(members)}"
                    )
        elif subcategories:
            # Category maps straight to its members (no subcategory level);
            # keep them instead of silently dropping the whole category.
            full_character_list.append(f"{category}: {_join_members(subcategories)}")

    return "\\n".join(full_character_list) if full_character_list else "N/A"


In [None]:
# Load the raw chapters file into a DataFrame and preview the first rows.
chapters_df = pd.read_json(raw_data_path.joinpath("one_piece_chapters.json"))
chapters_df.head(5)

### Process chapters

In [None]:
from tqdm import tqdm

processed_chapters = []
max_str_len = 0

for index, row in tqdm(chapters_df.iterrows(), total=chapters_df.shape[0]):
    # Series.get() only falls back to its default when the column itself is
    # absent, so a null cell would otherwise be rendered as "nan"/"None".
    # Dropping null scalar cells makes every .get() default below apply.
    record = {
        key: value
        for key, value in row.items()
        if not (pd.api.types.is_scalar(value) and pd.isna(value))
    }

    # Combine the relevant fields into a single markdown-like text block.
    # NOTE(review): "\\n" is a literal backslash + "n", not a line break; this
    # matches the documented recommended format — confirm it is intended.
    text_content = (
            f"# Title: {record.get('chapter_title', 'N/A')}\\n"
            f"## Chapter Number: {record.get('chapter_number', 'N/A')}\\n"
            f"## Release Date: {record.get('release_date', 'N/A')}\\n\\n"
            f"## Summary\\n{record.get('short_summary', '')}\\n{record.get('long_summary', '')}\\n\\n"
            f"## Chapter Notes\\n{record.get('chapter_notes', 'N/A')}\\n\\n"
            f"## Characters Appearing\\n{format_chapter_characters(record.get('characters', {}))}\\n\\n"
            f"## Trivia\\n{record.get('trivia', 'N/A')}").strip()

    # Report each new longest document so oversized chapters are easy to spot.
    if len(text_content) > max_str_len:
        max_str_len = len(text_content)
        print(f"New max length {max_str_len} at index {index}")

    metadata = {
        "source_type": "chapter",
        "title": record.get('chapter_title', 'N/A'),
        "number": record.get('chapter_number', 'N/A'),
        "url": record.get('url', 'N/A')
    }

    processed_chapters.append({
        "text": text_content,
        "metadata": metadata
    })

In [None]:
# Spot-check the first two processed chapter records (text + metadata).
processed_chapters[:2]

In [None]:
# Write the processed chapters to data/processed as JSON Lines
# (one JSON object per line), creating the folder if it is missing.
processed_data_path = ROOT.joinpath("data", "processed")
os.makedirs(processed_data_path, exist_ok=True)

chapters_out_path = processed_data_path / "processed_chapters.jsonl"
with open(chapters_out_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(json.dumps(entry) + "\n" for entry in processed_chapters)

### Episodes Preprocessing

In [None]:
# Load the raw episodes file into a DataFrame and preview the first rows.
episodes_df = pd.read_json(raw_data_path.joinpath("one_piece_episodes.json"))
episodes_df.head(5)

In [None]:
processed_episodes = []
for index, row in episodes_df.iterrows():
    # Series.get() only falls back to its default when the column itself is
    # absent, so a null cell would otherwise be rendered as "nan"/"None".
    # Dropping null scalar cells makes every .get() default below apply.
    record = {
        key: value
        for key, value in row.items()
        if not (pd.api.types.is_scalar(value) and pd.isna(value))
    }

    # Flatten the character field (one name per line when it is a string)
    # into a single comma-separated list.
    character_string = record.get('characters', 'N/A')
    if isinstance(character_string, str):
        lines = [line.strip() for line in character_string.splitlines() if line.strip()]
        # An empty or whitespace-only string would leave the section blank.
        formatted_characters = ", ".join(lines) or 'N/A'
    else:
        formatted_characters = 'N/A'

    # NOTE(review): "\\n" is a literal backslash + "n", not a line break; this
    # matches the documented recommended format — confirm it is intended.
    text_content = (
        f"# Title: {record.get('episode_title', 'N/A')}\\n"
        f"## Episode Number: {record.get('episode_number', 'N/A')}\\n"
        f"## Air Date: {record.get('air_date', 'N/A')}\\n"
        f"## Source Chapters: {record.get('source_chapters', 'N/A')}\\n\\n"
        f"## Summary\\n{record.get('short_summary', '')}\\n{record.get('long_summary', '')}\\n\\n"
        f"## Characters Appearing\\n{formatted_characters}\\n\\n"
        f"## Anime Notes\\n{record.get('anime_notes', 'N/A')}\\n\\n"
        f"## Trivia\\n{record.get('trivia', 'N/A')}"
    ).strip()

    metadata = {
        'source_type': 'episode',
        'title': record.get('episode_title', 'N/A'),
        'number': record.get('episode_number', 'N/A'),
        'url': record.get('url', 'N/A')
    }

    processed_episodes.append({'text': text_content, 'metadata': metadata})

In [None]:
# Write the processed episodes to data/processed as JSON Lines
# (one JSON object per line).
episodes_out_path = processed_data_path / "processed_episodes.jsonl"
with open(episodes_out_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(json.dumps(entry) + "\n" for entry in processed_episodes)

### Characters preprocessing

In [None]:
# Load the raw character records (JSON Lines) into a DataFrame.
try:
    with open(raw_data_path / "one_piece_characters_data.jsonl", "r", encoding="utf-8") as f:
        # Skip blank lines: a single empty line would otherwise raise inside
        # this try block and replace the whole dataset with an empty frame.
        characters = [json.loads(line) for line in f if line.strip()]

    characters_df = pd.DataFrame(characters)
    display(characters_df.head())
except Exception as e:
    # Best-effort load: report the problem and continue with an empty frame.
    # NOTE(review): later cells index columns such as 'devil_fruit' and will
    # fail on the empty frame — confirm that continuing is really wanted.
    print(f"Error loading characters data: {e}")
    characters_df = pd.DataFrame()  # create an empty DataFrame in case of error

In [None]:
# Inspect one non-null devil_fruit entry to see its structure.
characters_df['devil_fruit'].dropna().iloc[12]

In [None]:
def format_devil_fruit(fruit_data):
    """
    Render a devil fruit dictionary as a one-line description.

    The result is "<english_name> (<japanese_name>), Type: <type>"; the
    parenthesised Japanese name is left out when it is missing or empty.
    Missing 'english_name' or 'type' keys fall back to "N/A". Anything that
    is not a dict yields "N/A".
    """
    if isinstance(fruit_data, dict):
        english = fruit_data.get('english_name', 'N/A')
        japanese = fruit_data.get('japanese_name', '')
        kind = fruit_data.get('type', 'N/A')

        label = f"{english} ({japanese})" if japanese else f"{english}"
        return f"{label}, Type: {kind}"

    return "N/A"

In [None]:
# Try the formatter on one non-null devil_fruit entry.
format_devil_fruit(characters_df['devil_fruit'].dropna().iloc[20])

In [None]:
processed_characters = []
for index, row in characters_df.iterrows():
    # Series.get() only falls back to its default when the column itself is
    # absent, so a null cell would otherwise be rendered as "nan"/"None".
    # Dropping null scalar cells makes every .get() default below apply.
    record = {
        key: value
        for key, value in row.items()
        if not (pd.api.types.is_scalar(value) and pd.isna(value))
    }

    # NOTE(review): "\\n" is a literal backslash + "n", not a line break; this
    # matches the documented recommended format — confirm it is intended.
    text_content = (
        f"# Name: {record.get('name', 'N/A')}\\n"
        f"## Affiliations: {record.get('affiliations', 'N/A')}\\n"
        f"## Occupations: {record.get('occupations', 'N/A')}\\n"
        f"## Devil Fruit: {format_devil_fruit(record.get('devil_fruit'))}\\n"
        f"## Residence: {record.get('residence', 'N/A')}\\n"
        f"## Status: {record.get('status', 'N/A')}\\n"
        f"## Bounty: {record.get('bounty', 'N/A')}\\n"
        f"## Manga Debut: {record.get('manga_debut', 'N/A')}\\n"
        f"## Anime Debut: {record.get('anime_debut', 'N/A')}\\n\\n"
        f"## General Info\\n{record.get('general_info', 'N/A')}\\n\\n"
        f"## Appearance\\n{record.get('appearance', 'N/A')}\\n\\n"
        f"## Personality\\n{record.get('personality', 'N/A')}\\n\\n"
        f"## History\\n{record.get('history', 'N/A')}\\n\\n"
        f"## Abilities\\n{record.get('abilities', 'N/A')}\\n\\n"
        f"## Relationships\\n{record.get('relationships', 'N/A')}\\n\\n"
        f"## Trivia\\n{record.get('trivia', 'N/A')}"
    ).strip()

    metadata = {
        "source_type": "character",
        "name": record.get('name', 'N/A'),
        "url": record.get('url', 'N/A')
    }

    processed_characters.append({
        "text": text_content,
        "metadata": metadata
    })

# Spot-check one processed record (hard-coded index).
processed_characters[99]

In [None]:
# Print the text of one processed character to eyeball the final layout.
# NOTE(review): index 975 is hard-coded; indexing the list raises IndexError
# if fewer records were processed.
print(processed_characters[975]['text'])

In [None]:
# Write the processed characters to data/processed as JSON Lines
# (one JSON object per line).
characters_out_path = processed_data_path / "processed_characters.jsonl"
with open(characters_out_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(json.dumps(entry) + "\n" for entry in processed_characters)