In [None]:
import json
import os
import sys
import time

import pandas as pd
from tqdm import tqdm

# import parser
from src.scraping.parse_chapter import parse_chapter

# Show full cell contents instead of truncating long text columns.
pd.set_option('display.max_colwidth', None)

# Paths are resolved relative to the parent of the current working directory
# (presumably the notebook is launched from a subfolder of the project - confirm).
ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_path = os.path.join(ROOT, 'data')
one_piece_chapters_path = os.path.join(ROOT, 'data', 'one_piece_chapters.json')

In [None]:
# Load the scraped chapter records into a DataFrame for inspection.
df = pd.read_json(path_or_buf=one_piece_chapters_path)

In [None]:
# Peek at the first record to see which fields each chapter carries.
df.head(n=1)

### Overview of missing values

In [None]:
# Print each column's dtype and non-null count; a non-null count below the
# number of rows means that column has missing values.
df.info()

In [None]:
# Both the chapter number and the URL should identify a chapter unambiguously,
# so report whether either column contains duplicates.
for question, column_name in (
    ("Are chapter numbers unique?", 'chapter_number'),
    ("Are URLs unique?", 'url'),
):
    print(question, df[column_name].is_unique)

In [None]:
# A few entries have no long summary: count them and list their URLs.
missing_long_summaries = df.loc[df['long_summary'].isna()]

print("Number of entries with no long summary:", len(missing_long_summaries))
print(missing_long_summaries['url'].tolist())

In [None]:
# Re-run the parser on every chapter whose long summary is missing to see
# whether the new parser recovers it. `time` is imported in the notebook's
# top import cell.
for sample_missing_url in missing_long_summaries['url'].to_list():
    print(f"Parsing {sample_missing_url}...")
    parsed = parse_chapter(sample_missing_url)

    # The fix-up loop further down treats a falsy parse result as a failure,
    # so guard here as well instead of crashing on parsed['url'].
    if not parsed:
        print("No data returned for", sample_missing_url)
    else:
        print("URL:", parsed['url'])
        print("Long Summary:", parsed['long_summary'])

    # Pause for 0.5 seconds between requests.
    time.sleep(0.5)

In [None]:
# To check robustness, run the parser on a handful of chapters that already
# have long summaries. `time` is imported in the notebook's top import cell.
sample_urls = [
    "https://onepiece.fandom.com/wiki/Chapter_1000",
    "https://onepiece.fandom.com/wiki/Chapter_500",
    "https://onepiece.fandom.com/wiki/Chapter_750",
    "https://onepiece.fandom.com/wiki/Chapter_250",
    "https://onepiece.fandom.com/wiki/Chapter_1",
    "https://onepiece.fandom.com/wiki/Chapter_1050"
]

for sample_url in sample_urls:
    print(f"Parsing {sample_url}...")
    parsed = parse_chapter(sample_url)

    # The fix-up loop further down treats a falsy parse result as a failure,
    # so guard here as well instead of crashing on parsed['url'].
    if not parsed:
        print("No data returned for", sample_url)
    else:
        print("URL:", parsed['url'])
        print("Long Summary:", parsed['long_summary'])

    # Same 0.5 second pause the other request loops in this notebook use.
    time.sleep(0.5)

### Fix missing long summaries

In [None]:
# Load the raw JSON with the json module (rather than reusing the DataFrame)
# so the records can be patched in place and written back unchanged otherwise.
# The context manager closes the file handle, and the encoding is pinned so
# the read does not depend on the platform default.
with open(one_piece_chapters_path, 'r', encoding='utf-8') as chapters_file:
    all_chapters_data = json.load(chapters_file)

all_chapters_data[0]

In [None]:
# Record the list position and URL of every chapter whose long summary is
# None, so the record can be patched in place after re-parsing.
chapters_to_fix = [
    {'index': position, 'url': entry.get('url')}
    for position, entry in enumerate(all_chapters_data)
    if entry.get('long_summary') is None
]

chapters_to_fix

In [None]:
# Re-parse every chapter that is missing a long summary and patch the
# in-memory chapter list with whatever the parser recovers.
# `tqdm` and `time` are imported in the notebook's top import cell.
scraper_headers = {
    'User-Agent': 'OnePieceRAGBot/1.0 (Updating Script; contact: jfcastaneda.led@gmail.com)'
}
request_delay_seconds = 0.5  # be polite and avoid overwhelming the server

successful_fixes = 0
failed_urls = []  # URLs for which the parser still returned no long summary
for item in tqdm(chapters_to_fix, desc="Fixing long summaries"):
    index = item['index']
    url = item['url']

    # tqdm.write keeps log lines from breaking up the progress bar.
    tqdm.write(f"Re-parsing {url}...")
    reparsed = parse_chapter(url, headers=scraper_headers)

    if reparsed and reparsed.get('long_summary'):
        all_chapters_data[index]['long_summary'] = reparsed['long_summary']
        tqdm.write(f"Updated long summary for index {index}.")
        successful_fixes += 1
    else:
        failed_urls.append(url)

    time.sleep(request_delay_seconds)

# Report the outcome so silent failures are visible before saving.
print(f"Fixed {successful_fixes} of {len(chapters_to_fix)} long summaries.")
if failed_urls:
    print("Still missing a long summary:", failed_urls)

    

In [None]:
# Of the chapters queued for fixing, collect the list positions that now
# carry a long summary.
fixed_indices = []
for item in chapters_to_fix:
    position = item['index']
    if all_chapters_data[position].get('long_summary') is not None:
        fixed_indices.append(position)

fixed_indices


In [None]:
# Save the patched records back to the JSON file.
# ensure_ascii=False emits raw non-ASCII characters, so the encoding is pinned
# to UTF-8 instead of relying on the platform default. The data is written to
# a temporary sibling file first and then swapped in, so a failure during
# serialisation cannot leave the original file truncated.
tmp_chapters_path = one_piece_chapters_path + '.tmp'

with open(tmp_chapters_path, 'w', encoding='utf-8') as f:
    json.dump(all_chapters_data, f, indent=4, ensure_ascii=False)

os.replace(tmp_chapters_path, one_piece_chapters_path)