In [7]:
from pathlib import Path
import easyocr
import numpy as np
from sklearn.cluster import DBSCAN
import os
from pathlib import Path
from paddleocr import PaddleOCR
from PIL import Image, ImageDraw, ImageFont
import string
import pytesseract
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import json, re, cv2

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import re
import json
import numpy as np
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import nltk
from nltk.corpus import words

# --------------------------
# CONFIG (Partially moved to top for replacements loading)
# --------------------------
# Input tree that is scanned for page images, and the directory that receives
# the redrawn pages (created on first run).
input_folder = Path(r"input")
output_folder = Path("output")
output_folder.mkdir(exist_ok=True)

# Characters the OCR reader is allowed to emit (passed as `allowlist=`).
# All ten digits are listed; the previous value skipped '4', which made that
# digit unrecognisable.
allow_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!?.,-…'\""

# Load replacements from JSON file.
# The parsed value is used below as a mapping: words are looked up with
# `key in replacements` / `replacements[key]`.
with open("replacements.json", "r", encoding="utf-8") as f:
    replacements = json.load(f)

# TrueType font used to render the replacement text.
font_path = "fonts/CCMeanwhileBoldItalic.ttf"
# Handed to draw_text_on_image() as `base_font_size`.
font_size = 16

# --- NLTK Setup (run nltk.download('words') once beforehand) ---
# The lexicon is the upper-cased NLTK word list plus every replacements key.
try:
    VALID_WORDS = {entry.upper() for entry in words.words()}
except LookupError:
    print("WARNING: NLTK 'words' corpus not found. Run 'import nltk; nltk.download(\"words\")'")
    # Without the corpus only the replacement keys remain known.
    VALID_WORDS = set()
VALID_WORDS.update(replacements.keys())
# -----------------------------------------------------------------


# --------------------------
# INITIALIZE OCR
# --------------------------
# Build the module-level `reader`. When the easyocr package cannot be
# imported, a stand-in is used whose readtext() reports no text at all, so the
# rest of the cell still runs (every page is then saved without changes).
class EasyOCRMock: # Mock class for environment-independent testing
    """Fallback reader exposing the one method this cell calls on `reader`."""

    def readtext(self, img_array, **kwargs):
        # Arguments are accepted and ignored; no OCR boxes are ever returned.
        return []
try:
    import easyocr
    reader = easyocr.Reader(['en'], gpu=True)
except ImportError:
    print("WARNING: easyocr not found. Using mock reader.")
    reader = EasyOCRMock()

# --------------------------
# HELPER FUNCTIONS
# --------------------------
def natural_sort_key(path):
    """Build a sort key in which embedded digit runs compare numerically.

    ``str(path)`` is split on runs of digits; digit pieces become ``int`` and
    all other pieces are lower-cased, so ``page_2`` sorts before ``page_10``.
    """
    key = []
    for chunk in re.split(r'(\d+)', str(path)):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key

def apply_correction_rules(ocr_word, known_lexicon):
    """Repair two common OCR misreads when the result is a known word.

    Args:
        ocr_word: Upper-case word core (no surrounding punctuation).
        known_lexicon: Container of accepted words; only ``in`` lookups are
            performed on it.

    Returns:
        ``ocr_word`` unchanged when it is already in the lexicon, is not purely
        alphabetic, or no rule produces a known word. Otherwise the corrected
        word: one 'L' replaced by 'U' (Rule 1), or a trailing 'I' replaced by
        '!' (Rule 2). Rule 1 takes precedence over Rule 2.
    """
    if ocr_word in known_lexicon or not ocr_word.isalpha():
        return ocr_word

    # --- RULE 1: a single 'U' was read as 'L' ---
    # Every 'L' position is tried from left to right, because the misread
    # letter is not necessarily the first 'L' in the word (CLLB -> CLUB).
    for index, char in enumerate(ocr_word):
        if char == 'L':
            candidate = ocr_word[:index] + 'U' + ocr_word[index + 1:]
            if candidate in known_lexicon:
                return candidate

    # --- RULE 2: a terminal '!' was read as 'I' ---
    # '!' is punctuation and never part of a lexicon entry, so the word
    # without its final 'I' is what gets checked.
    if ocr_word.endswith('I'):
        root_word = ocr_word[:-1]
        if root_word in known_lexicon:
            return root_word + '!'

    # No rule produced a known word.
    return ocr_word


def get_replacement(word, replacements):
    """Return ``word`` with its core corrected and/or replaced.

    The token is split into leading punctuation, a core, and trailing
    punctuation. The upper-cased core is passed through
    ``apply_correction_rules`` (against the module-level ``VALID_WORDS``) and
    then looked up in ``replacements``.

    Args:
        word: One whitespace-free OCR token.
        replacements: Mapping from upper-case core word to replacement text.

    Returns:
        The token with its surrounding punctuation preserved and the core
        swapped for the matching replacement; otherwise the capitalised
        corrected core when a correction applied; otherwise ``word`` itself.
    """
    punct = '!?.,-…\'\"'
    core = word.strip(punct)
    core_upper = core.upper()
    corrected_core = apply_correction_rules(core_upper, VALID_WORDS)

    # Punctuation around the core is re-attached to whatever is returned.
    core_start = word.find(core)
    pre = word[:core_start]
    post = word[core_start + len(core):]

    if corrected_core in replacements:
        return pre + replacements[corrected_core] + post
    if core_upper in replacements:
        return pre + replacements[core_upper] + post
    if corrected_core == core_upper:
        # Neither a replacement nor a correction applies.
        return word

    # The core never ends in '!' (it was stripped), so a corrected core that
    # does must come from the terminal-'I' rule. The replacement keys hold
    # bare words, hence the lookup without the '!', which is then restored.
    if corrected_core.endswith('!') and corrected_core[:-1] in replacements:
        return pre + replacements[corrected_core[:-1]] + '!' + post

    return pre + corrected_core.capitalize() + post

def filter_and_replace(results, replacements):
    """Keep only the OCR boxes whose text is altered by ``get_replacement``.

    Args:
        results: Iterable of ``(bbox, text, conf)`` tuples.
        replacements: Mapping forwarded to ``get_replacement``.

    Returns:
        A list of ``(bbox, new_text, conf)`` for every box in which at least
        one whitespace-separated token came back different; ``new_text`` is
        the rewritten tokens joined by single spaces.
    """
    kept = []
    for box, raw_text, confidence in results:
        tokens = raw_text.split()
        rewritten = [get_replacement(token, replacements) for token in tokens]
        if rewritten != tokens:
            kept.append((box, " ".join(rewritten), confidence))
    return kept

def draw_text_on_image(img, results, font_path, base_font_size, padding_ratio=0.1):
    """Paint white boxes and replacement text onto a copy of ``img``.

    All white rectangles are drawn first and all text second, so a rectangle
    can never cover text that belongs to another box.

    Args:
        img: PIL image; it is copied, the original is not drawn on.
        results: Iterable of ``(bbox, text, conf)``; ``bbox`` is a sequence of
            ``(x, y)`` points.
        font_path: Path of the TrueType font to render with.
        base_font_size: Unused; the size is derived from each box height.
            Kept so existing calls stay valid.
        padding_ratio: Rectangle margin as a fraction of the rendered text
            width/height.

    Returns:
        The annotated copy of ``img``.
    """
    pil_img = img.copy()
    draw = ImageDraw.Draw(pil_img)

    # Fonts are loaded once per distinct size instead of once per box.
    fonts_by_size = {}
    pending_text = []

    # --- PASS 1: white rectangles ---
    for bbox, text, _conf in results:
        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        x_center = int(np.mean(x_coords))
        y_center = int(np.mean(y_coords))
        bbox_height = max(y_coords) - min(y_coords)

        # Font size follows the OCR box height, with a floor of 8.
        est_font_size = max(8, int(bbox_height * 0.73))
        if est_font_size not in fonts_by_size:
            fonts_by_size[est_font_size] = ImageFont.truetype(font_path, est_font_size)
        font = fonts_by_size[est_font_size]

        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        text_width = right - left
        text_height = bottom - top

        padding_x = int(text_width * padding_ratio)
        padding_y = int(text_height * padding_ratio)

        # The horizontal half-extent uses the divisor 2.2, i.e. slightly less
        # than half of the rendered width, before padding is added.
        x0 = x_center - text_width // 2.2 - padding_x
        y0 = y_center - text_height // 2 - padding_y
        x1 = x_center + text_width // 2.2 + padding_x
        y1 = y_center + text_height // 2 + padding_y

        draw.rectangle([x0, y0, x1, y1], fill="white")
        pending_text.append((x_center, y_center, text, font))

    # --- PASS 2: black text, centred on each box ("mm" anchor) ---
    for x_center, y_center, text, font in pending_text:
        draw.text((x_center, y_center), text, fill="black", font=font, anchor="mm")

    return pil_img


# --------------------------
# PROCESS FOLDERS
# --------------------------
# Every directory below input_folder, plus input_folder itself.
folders = [f for f in input_folder.rglob('*') if f.is_dir()] + [input_folder]

for folder in sorted(set(folders)):
    image_files = sorted(
        list(folder.glob("*.webp")) +
        list(folder.glob("*.jpg")) +
        list(folder.glob("*.jpeg")),
        key=natural_sort_key
    )

    if not image_files:
        continue

    print(f"\nProcessing folder: {folder}")

    # All pages of a folder go to the same mirrored directory under
    # output_folder, so it is resolved once per folder.
    output_subfolder = output_folder / folder.relative_to(input_folder)
    output_subfolder.mkdir(parents=True, exist_ok=True)

    for img_path in image_files:
        # The context manager releases the file handle deterministically;
        # copy()/convert() load the pixel data before the file is closed.
        with Image.open(img_path) as src:
            # CMYK pixels cannot be written to PNG, so they are normalised
            # before OCR, drawing and saving.
            img = src.convert("RGB") if src.mode == "CMYK" else src.copy()

        results = reader.readtext(np.array(img), allowlist=allow_chars, width_ths=0.05, detail=1)

        # Option 1: rewrite the text of EVERY box, and remember whether any
        # token actually came back different.
        filtered_results = []
        text_changed = False
        for bbox, text, conf in results:
            tokens = text.split()
            new_tokens = [get_replacement(w, replacements) for w in tokens]
            text_changed = text_changed or new_tokens != tokens
            filtered_results.append((bbox, " ".join(new_tokens), conf))
        # Option 2 (only boxes with a changed word) would instead be:
        #   filtered_results = filter_and_replace(results, replacements)
        #   text_changed = bool(filtered_results)

        modified_img = draw_text_on_image(img, filtered_results, font_path, font_size)

        output_path = output_subfolder / f"{img_path.stem}.png"
        modified_img.save(output_path)

        # Report what really happened, not merely whether OCR returned boxes.
        if text_changed:
            print(f"  Replacements/Corrections found → saved: {output_path}")
        elif filtered_results:
            print(f"  Text redrawn without changes → saved: {output_path}")
        else:
            print(f"  No recognized text → saved copy: {output_path}")


Processing folder: input\Chainsawman\Chapter_16
  Replacements found → saved: output\Chainsawman\Chapter_16\page_1.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_2.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_3.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_4.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_5.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_6.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_7.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_8.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_9.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_10.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_11.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_12.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_13.png
  Replacements found → s

In [16]:
import os
import re
import json
import numpy as np
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
# Imports needed for the new correction step
import nltk
from nltk.corpus import words
# Upper-cased English word list used for fast membership tests.
# A missing NLTK corpus surfaces as LookupError; the cell then continues with
# an empty lexicon (no corrections are applied) instead of aborting.
try:
    VALID_WORDS = {entry.upper() for entry in words.words()}
except LookupError:
    print("WARNING: NLTK 'words' corpus not found. Run 'import nltk; nltk.download(\"words\")'")
    VALID_WORDS = set()

# --------------------------
# CONFIG
# --------------------------
# Input tree that is scanned for page images, and the directory that receives
# the redrawn pages (created on first run).
input_folder = Path(r"input")
output_folder = Path("output")
output_folder.mkdir(exist_ok=True)

# Characters the OCR reader is allowed to emit (passed as `allowlist=`).
# All 26 capitals and all ten digits are listed; the previous value skipped
# '4', which made that digit unrecognisable.
allow_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!?.,-…'\""

# Load replacements from JSON file.
# The parsed value is used below as a mapping: words are looked up with
# `key in replacements` / `replacements[key]`.
with open("replacements.json", "r", encoding="utf-8") as f:
    replacements = json.load(f)

# TrueType font used to render the replacement text.
font_path = "fonts/CCMeanwhileBoldItalic.ttf"
# Handed to draw_text_on_image() as `base_font_size`.
font_size = 16

# --------------------------
# INITIALIZE OCR
# --------------------------
# NOTE(review): `easyocr` is not imported in this cell; this line relies on an
# import executed by an earlier cell.
reader = easyocr.Reader(['en'], gpu=True)

# --------------------------
# HELPER FUNCTIONS
# --------------------------
def natural_sort_key(path):
    """Build a sort key in which embedded digit runs compare numerically.

    ``str(path)`` is split on runs of digits; digit pieces become ``int`` and
    all other pieces are lower-cased, so ``page_2`` sorts before ``page_10``.
    """
    key = []
    for chunk in re.split(r'(\d+)', str(path)):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key

def fix_l_to_u_error(ocr_word, known_lexicon):
    """Turn an unknown OCR word into a known one by reading 'L' as 'U'.

    Args:
        ocr_word: Upper-case word core from OCR.
        known_lexicon: Container of accepted words; only ``in`` lookups are
            performed on it.

    Returns:
        ``ocr_word`` unchanged when it is already known, is not purely
        alphabetic, contains no 'L', or no substitution yields a known word.
        Otherwise the first known candidate, trying each single 'L' from left
        to right and finally all 'L's at once.
    """
    # A word that is already valid must not be "corrected" (HALL must not
    # become HAUL just because HAUL is also a word).
    if ocr_word in known_lexicon or not ocr_word.isalpha() or 'L' not in ocr_word:
        return ocr_word

    # 1. One misread letter is the most likely error; it need not be the
    #    first 'L', so every position is tried.
    for index, char in enumerate(ocr_word):
        if char == 'L':
            candidate = ocr_word[:index] + 'U' + ocr_word[index + 1:]
            if candidate in known_lexicon:
                return candidate

    # 2. Less likely: every 'L' is really a 'U'.
    all_swapped = ocr_word.replace('L', 'U')
    if all_swapped in known_lexicon:
        return all_swapped

    return ocr_word

def get_replacement(word, replacements):
    """Return ``word`` with its core corrected and/or replaced.

    The token is split into leading punctuation, a core, and trailing
    punctuation. The upper-cased core is passed through ``fix_l_to_u_error``
    (against the module-level ``VALID_WORDS``) and then looked up in
    ``replacements``.

    Args:
        word: One whitespace-free OCR token.
        replacements: Mapping from upper-case core word to replacement text.

    Returns:
        The token with its surrounding punctuation preserved and the core
        swapped for the matching replacement; otherwise the capitalised
        corrected core when a correction applied; otherwise ``word`` itself.
    """
    punct = '!?.,-…\'\"'
    core = word.strip(punct)
    core_upper = core.upper()
    corrected_core = fix_l_to_u_error(core_upper, VALID_WORDS)

    # Punctuation around the core is re-attached to whatever is returned.
    core_start = word.find(core)
    pre = word[:core_start]
    post = word[core_start + len(core):]

    if corrected_core in replacements:
        return pre + replacements[corrected_core] + post
    # The corrector may rewrite a core that is itself a replacement key; the
    # configured replacement for the word as read must still win over a mere
    # spelling correction.
    if core_upper in replacements:
        return pre + replacements[core_upper] + post
    if corrected_core != core_upper:
        # No replacement, but a correction: capitalised for a better look.
        return pre + corrected_core.capitalize() + post
    return word

def filter_and_replace(results, replacements):
    """Keep only the OCR boxes whose text is altered by ``get_replacement``.

    Args:
        results: Iterable of ``(bbox, text, conf)`` tuples.
        replacements: Mapping forwarded to ``get_replacement``.

    Returns:
        A list of ``(bbox, new_text, conf)`` for every box in which at least
        one whitespace-separated token came back different; ``new_text`` is
        the rewritten tokens joined by single spaces.
    """
    kept = []
    for box, raw_text, confidence in results:
        tokens = raw_text.split()
        rewritten = [get_replacement(token, replacements) for token in tokens]
        if rewritten != tokens:
            kept.append((box, " ".join(rewritten), confidence))
    return kept

def draw_text_on_image(img, results, font_path, base_font_size, padding_ratio=0.1):
    """Paint white boxes and replacement text onto a copy of ``img``.

    All white rectangles are drawn first and all text second. Drawing each
    box and its text in one pass let the rectangle of a later box erase text
    already drawn for an earlier, overlapping box.

    Args:
        img: PIL image; it is copied, the original is not drawn on.
        results: Iterable of ``(bbox, text, conf)``; ``bbox`` is a sequence of
            ``(x, y)`` points.
        font_path: Path of the TrueType font to render with.
        base_font_size: Unused; the size is derived from each box height.
            Kept so existing calls stay valid.
        padding_ratio: Rectangle margin as a fraction of the rendered text
            width/height.

    Returns:
        The annotated copy of ``img``.
    """
    pil_img = img.copy()
    draw = ImageDraw.Draw(pil_img)

    # Fonts are loaded once per distinct size instead of once per box.
    fonts_by_size = {}
    pending_text = []

    # --- PASS 1: white rectangles ---
    for bbox, text, _conf in results:
        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        x_center = int(np.mean(x_coords))
        y_center = int(np.mean(y_coords))
        bbox_height = max(y_coords) - min(y_coords)

        # Font size follows the OCR box height, with a floor of 8.
        est_font_size = max(8, int(bbox_height * 0.73))
        if est_font_size not in fonts_by_size:
            fonts_by_size[est_font_size] = ImageFont.truetype(font_path, est_font_size)
        font = fonts_by_size[est_font_size]

        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        text_width = right - left
        text_height = bottom - top

        padding_x = int(text_width * padding_ratio)
        padding_y = int(text_height * padding_ratio)

        # The horizontal half-extent uses the divisor 2.2, i.e. slightly less
        # than half of the rendered width, before padding is added.
        x0 = x_center - text_width // 2.2 - padding_x
        y0 = y_center - text_height // 2 - padding_y
        x1 = x_center + text_width // 2.2 + padding_x
        y1 = y_center + text_height // 2 + padding_y

        draw.rectangle([x0, y0, x1, y1], fill="white")
        pending_text.append((x_center, y_center, text, font))

    # --- PASS 2: black text, centred on each box ("mm" anchor) ---
    for x_center, y_center, text, font in pending_text:
        draw.text((x_center, y_center), text, fill="black", font=font, anchor="mm")

    return pil_img


# --------------------------
# PROCESS FOLDERS
# --------------------------
# Every directory below input_folder, plus input_folder itself.
folders = [f for f in input_folder.rglob('*') if f.is_dir()] + [input_folder]

for folder in sorted(set(folders)):
    image_files = sorted(
        list(folder.glob("*.webp")) +
        list(folder.glob("*.jpg")) +
        list(folder.glob("*.jpeg")),
        key=natural_sort_key
    )

    if not image_files:
        continue

    print(f"\nProcessing folder: {folder}")

    # All pages of a folder go to the same mirrored directory under
    # output_folder, so it is resolved once per folder.
    output_subfolder = output_folder / folder.relative_to(input_folder)
    output_subfolder.mkdir(parents=True, exist_ok=True)

    for img_path in image_files:
        # The context manager releases the file handle deterministically;
        # copy()/convert() load the pixel data before the file is closed.
        with Image.open(img_path) as src:
            # CMYK pixels cannot be written to PNG, so they are normalised
            # before OCR, drawing and saving.
            img = src.convert("RGB") if src.mode == "CMYK" else src.copy()

        results = reader.readtext(np.array(img), allowlist=allow_chars, width_ths=0.05, detail=1)

        # Option 1: rewrite the text of EVERY box, and remember whether any
        # token actually came back different.
        filtered_results = []
        text_changed = False
        for bbox, text, conf in results:
            tokens = text.split()
            new_tokens = [get_replacement(w, replacements) for w in tokens]
            text_changed = text_changed or new_tokens != tokens
            filtered_results.append((bbox, " ".join(new_tokens), conf))
        # Option 2 (only boxes with a changed word) would instead be:
        #   filtered_results = filter_and_replace(results, replacements)
        #   text_changed = bool(filtered_results)

        modified_img = draw_text_on_image(img, filtered_results, font_path, font_size)

        output_path = output_subfolder / f"{img_path.stem}.png"
        modified_img.save(output_path)

        # Report what really happened, not merely whether OCR returned boxes.
        if text_changed:
            print(f"  Replacements/Corrections found → saved: {output_path}")
        elif filtered_results:
            print(f"  Text redrawn without changes → saved: {output_path}")
        else:
            print(f"  No recognized text → saved copy: {output_path}")


Processing folder: input\Chainsawman\Chapter_16
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_1.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_2.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_3.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_4.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_5.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_6.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_7.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_8.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_9.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_10.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_11.png
  Replacements/Corrections found → s

In [21]:
import os
import re
import json
import numpy as np
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import nltk
from nltk.corpus import words

# --- NLTK Setup (run nltk.download('words') once beforehand) ---
# Upper-cased NLTK word list; membership in it decides whether a correction
# is accepted.
try:
    VALID_WORDS = {entry.upper() for entry in words.words()}
except LookupError:
    print("WARNING: NLTK 'words' corpus not found. Run 'import nltk; nltk.download(\"words\")'")
    # An empty lexicon keeps the cell running; no correction can match it.
    VALID_WORDS = set()
# -----------------------------------------------------------------

# --------------------------
# CONFIG
# --------------------------
# Input tree that is scanned for page images, and the directory that receives
# the redrawn pages (created on first run).
input_folder = Path(r"input")
output_folder = Path("output")
output_folder.mkdir(exist_ok=True)

# Characters the OCR reader is allowed to emit (passed as `allowlist=`).
# All ten digits are listed; the previous value skipped '4', which made that
# digit unrecognisable.
allow_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!?.,-…'\""

# Load replacements from JSON file.
# The parsed value is used below as a mapping: words are looked up with
# `key in replacements` / `replacements[key]`.
with open("replacements.json", "r", encoding="utf-8") as f:
    replacements = json.load(f)

# TrueType font used to render the replacement text.
font_path = "fonts/CCMeanwhileBoldItalic.ttf"
# Handed to draw_text_on_image() as `base_font_size`.
font_size = 16

# --------------------------
# INITIALIZE OCR
# --------------------------
# NOTE(review): `easyocr` is not imported in this cell; this line relies on an
# import executed by an earlier cell.
reader = easyocr.Reader(['en'], gpu=True)

# --------------------------
# HELPER FUNCTIONS
# --------------------------
def natural_sort_key(path):
    """Build a sort key in which embedded digit runs compare numerically.

    ``str(path)`` is split on runs of digits; digit pieces become ``int`` and
    all other pieces are lower-cased, so ``page_2`` sorts before ``page_10``.
    """
    key = []
    for chunk in re.split(r'(\d+)', str(path)):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key

def apply_correction_rules(ocr_word, known_lexicon):
    """Repair two common OCR misreads when the result is a known word.

    Args:
        ocr_word: Upper-case word core (no surrounding punctuation).
        known_lexicon: Container of accepted words; only ``in`` lookups are
            performed on it.

    Returns:
        ``ocr_word`` unchanged when it is already in the lexicon, is not purely
        alphabetic, or no rule produces a known word. Otherwise the corrected
        word: one 'L' replaced by 'U' (Rule 1), or a trailing 'I' replaced by
        '!' (Rule 2). Rule 1 takes precedence over Rule 2.
    """
    if ocr_word in known_lexicon or not ocr_word.isalpha():
        return ocr_word

    # --- RULE 1: a single 'U' was read as 'L' ---
    # Every 'L' position is tried from left to right, because the misread
    # letter is not necessarily the first 'L' in the word (CLLB -> CLUB).
    for index, char in enumerate(ocr_word):
        if char == 'L':
            candidate = ocr_word[:index] + 'U' + ocr_word[index + 1:]
            if candidate in known_lexicon:
                return candidate

    # --- RULE 2: a terminal '!' was read as 'I' (MINISTERI -> MINISTER!) ---
    if ocr_word.endswith('I'):
        root_word = ocr_word[:-1]
        if root_word in known_lexicon:
            return root_word + '!'

    # No rule produced a known word.
    return ocr_word


def get_replacement(word, replacements):
    """Return ``word`` with its core corrected and/or replaced.

    The token is split into leading punctuation, a core, and trailing
    punctuation. The upper-cased core is passed through
    ``apply_correction_rules`` (against the module-level ``VALID_WORDS``) and
    then looked up in ``replacements``.

    Args:
        word: One whitespace-free OCR token.
        replacements: Mapping from upper-case core word to replacement text.

    Returns:
        The token with its surrounding punctuation preserved and the core
        swapped for the matching replacement; otherwise the capitalised
        corrected core when a correction applied; otherwise ``word`` itself.
    """
    punct = '!?.,-…\'\"'
    core = word.strip(punct)
    core_upper = core.upper()
    corrected_core = apply_correction_rules(core_upper, VALID_WORDS)

    # Punctuation around the core is re-attached to whatever is returned.
    core_start = word.find(core)
    pre = word[:core_start]
    post = word[core_start + len(core):]

    if corrected_core in replacements:
        return pre + replacements[corrected_core] + post
    # The lexicon does not contain the replacement keys, so a key can be
    # rewritten by the corrector; the word as read is therefore checked too.
    if core_upper in replacements:
        return pre + replacements[core_upper] + post
    if corrected_core == core_upper:
        # Neither a replacement nor a correction applies.
        return word

    # The core never ends in '!' (it was stripped), so a corrected core that
    # does must come from the terminal-'I' rule. The replacement keys hold
    # bare words, hence the lookup without the '!', which is then restored.
    if corrected_core.endswith('!') and corrected_core[:-1] in replacements:
        return pre + replacements[corrected_core[:-1]] + '!' + post

    return pre + corrected_core.capitalize() + post

def filter_and_replace(results, replacements):
    """Keep only the OCR boxes whose text is altered by ``get_replacement``.

    Args:
        results: Iterable of ``(bbox, text, conf)`` tuples.
        replacements: Mapping forwarded to ``get_replacement``.

    Returns:
        A list of ``(bbox, new_text, conf)`` for every box in which at least
        one whitespace-separated token came back different; ``new_text`` is
        the rewritten tokens joined by single spaces.
    """
    kept = []
    for box, raw_text, confidence in results:
        tokens = raw_text.split()
        rewritten = [get_replacement(token, replacements) for token in tokens]
        if rewritten != tokens:
            kept.append((box, " ".join(rewritten), confidence))
    return kept

def draw_text_on_image(img, results, font_path, base_font_size, padding_ratio=0.1):
    """Paint white boxes and replacement text onto a copy of ``img``.

    All white rectangles are drawn first and all text second. Drawing each
    box and its text in one pass let the rectangle of a later box erase text
    already drawn for an earlier, overlapping box.

    Args:
        img: PIL image; it is copied, the original is not drawn on.
        results: Iterable of ``(bbox, text, conf)``; ``bbox`` is a sequence of
            ``(x, y)`` points.
        font_path: Path of the TrueType font to render with.
        base_font_size: Unused; the size is derived from each box height.
            Kept so existing calls stay valid.
        padding_ratio: Rectangle margin as a fraction of the rendered text
            width/height.

    Returns:
        The annotated copy of ``img``.
    """
    pil_img = img.copy()
    draw = ImageDraw.Draw(pil_img)

    # Fonts are loaded once per distinct size instead of once per box.
    fonts_by_size = {}
    pending_text = []

    # --- PASS 1: white rectangles ---
    for bbox, text, _conf in results:
        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        x_center = int(np.mean(x_coords))
        y_center = int(np.mean(y_coords))
        bbox_height = max(y_coords) - min(y_coords)

        # Font size follows the OCR box height, with a floor of 8.
        est_font_size = max(8, int(bbox_height * 0.73))
        if est_font_size not in fonts_by_size:
            fonts_by_size[est_font_size] = ImageFont.truetype(font_path, est_font_size)
        font = fonts_by_size[est_font_size]

        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        text_width = right - left
        text_height = bottom - top

        padding_x = int(text_width * padding_ratio)
        padding_y = int(text_height * padding_ratio)

        # The horizontal half-extent uses the divisor 2.2, i.e. slightly less
        # than half of the rendered width, before padding is added.
        x0 = x_center - text_width // 2.2 - padding_x
        y0 = y_center - text_height // 2 - padding_y
        x1 = x_center + text_width // 2.2 + padding_x
        y1 = y_center + text_height // 2 + padding_y

        draw.rectangle([x0, y0, x1, y1], fill="white")
        pending_text.append((x_center, y_center, text, font))

    # --- PASS 2: black text, centred on each box ("mm" anchor) ---
    for x_center, y_center, text, font in pending_text:
        draw.text((x_center, y_center), text, fill="black", font=font, anchor="mm")

    return pil_img


# --------------------------
# PROCESS FOLDERS
# --------------------------
# Every directory below input_folder, plus input_folder itself.
folders = [f for f in input_folder.rglob('*') if f.is_dir()] + [input_folder]

for folder in sorted(set(folders)):
    image_files = sorted(
        list(folder.glob("*.webp")) +
        list(folder.glob("*.jpg")) +
        list(folder.glob("*.jpeg")),
        key=natural_sort_key
    )

    if not image_files:
        continue

    print(f"\nProcessing folder: {folder}")

    # All pages of a folder go to the same mirrored directory under
    # output_folder, so it is resolved once per folder.
    output_subfolder = output_folder / folder.relative_to(input_folder)
    output_subfolder.mkdir(parents=True, exist_ok=True)

    for img_path in image_files:
        # The context manager releases the file handle deterministically;
        # copy()/convert() load the pixel data before the file is closed.
        with Image.open(img_path) as src:
            # CMYK pixels cannot be written to PNG, so they are normalised
            # before OCR, drawing and saving.
            img = src.convert("RGB") if src.mode == "CMYK" else src.copy()

        results = reader.readtext(np.array(img), allowlist=allow_chars, width_ths=0.05, detail=1)

        # Option 1: rewrite the text of EVERY box, and remember whether any
        # token actually came back different.
        filtered_results = []
        text_changed = False
        for bbox, text, conf in results:
            tokens = text.split()
            new_tokens = [get_replacement(w, replacements) for w in tokens]
            text_changed = text_changed or new_tokens != tokens
            filtered_results.append((bbox, " ".join(new_tokens), conf))
        # Option 2 (only boxes with a changed word) would instead be:
        #   filtered_results = filter_and_replace(results, replacements)
        #   text_changed = bool(filtered_results)

        modified_img = draw_text_on_image(img, filtered_results, font_path, font_size)

        output_path = output_subfolder / f"{img_path.stem}.png"
        modified_img.save(output_path)

        # Report what really happened, not merely whether OCR returned boxes.
        if text_changed:
            print(f"  Replacements/Corrections found → saved: {output_path}")
        elif filtered_results:
            print(f"  Text redrawn without changes → saved: {output_path}")
        else:
            print(f"  No recognized text → saved copy: {output_path}")


Processing folder: input\Chainsawman\Chapter_16
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_1.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_2.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_3.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_4.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_5.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_6.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_7.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_8.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_9.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_10.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_11.png
  Replacements/Corrections found → s

In [31]:
import os
import re
import json
import numpy as np
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import nltk
from nltk.corpus import words

# --- NLTK Setup (run nltk.download('words') once beforehand) ---
# Upper-cased NLTK word list; membership in it decides whether a correction
# is accepted.
try:
    VALID_WORDS = {entry.upper() for entry in words.words()}
except LookupError:
    print("WARNING: NLTK 'words' corpus not found. Run 'import nltk; nltk.download(\"words\")'")
    # An empty lexicon keeps the cell running; no correction can match it.
    VALID_WORDS = set()
# -----------------------------------------------------------------

# --------------------------
# CONFIG
# --------------------------
# Input tree that is scanned for page images, and the directory that receives
# the redrawn pages (created on first run).
input_folder = Path(r"input")
output_folder = Path("output")
output_folder.mkdir(exist_ok=True)

# Characters the OCR reader is allowed to emit.
# All ten digits are listed; the previous value skipped '4', which made that
# digit unrecognisable.
allow_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!?.,-…'\""

# Load replacements from JSON file.
# The parsed value is used below as a mapping: words are looked up with
# `key in replacements` / `replacements[key]`.
with open("replacements.json", "r", encoding="utf-8") as f:
    replacements = json.load(f)

# TrueType font file and a nominal font size for rendering.
font_path = "fonts/CCMeanwhileBoldItalic.ttf"
font_size = 16

# --------------------------
# INITIALIZE OCR
# --------------------------
# NOTE(review): `easyocr` is not imported in this cell; this line relies on an
# import executed by an earlier cell.
reader = easyocr.Reader(['en'], gpu=True)

# --------------------------
# HELPER FUNCTIONS
# --------------------------
def natural_sort_key(path):
    """Build a sort key in which embedded numbers compare numerically.

    The string form of ``path`` is split into alternating non-digit and digit
    runs; digit runs become ``int`` and every other run is lower-cased, so
    "page_2" sorts before "page_10".
    """
    key = []
    for chunk in re.split(r'(\d+)', str(path)):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

def apply_correction_rules(ocr_word, known_lexicon):
    """
    Apply lexicon-backed post-OCR fixes to a single token.

    A token that is already in ``known_lexicon``, or that is not purely
    alphabetic (this includes the empty string), is returned untouched.
    Otherwise the rules below are tried in order and the first hit wins.
    Both rules match the upper-case letters 'L' and 'I' only.

    1. L -> U: replace exactly one 'L' with 'U' and accept the result if it
       is in the lexicon. Every 'L' position is tried, left to right, so
       "LALGH" can become "LAUGH" (previously only the first 'L' was tried).
    2. Trailing I -> !: if dropping a final 'I' leaves a lexicon word, return
       that word followed by '!'.

    Args:
        ocr_word: Token produced by OCR.
        known_lexicon: Container of accepted words; only ``in`` is used.

    Returns:
        The corrected token, or ``ocr_word`` unchanged when no rule applies.
    """
    if ocr_word in known_lexicon or not ocr_word.isalpha():
        return ocr_word

    # --- RULE 1: single L -> U substitution, at any position ---
    for position, letter in enumerate(ocr_word):
        if letter != 'L':
            continue
        candidate = ocr_word[:position] + 'U' + ocr_word[position + 1:]
        if candidate in known_lexicon:
            return candidate

    # --- RULE 2: terminal 'I' that was really a '!' ---
    if ocr_word.endswith('I'):
        root_word = ocr_word[:-1]
        if root_word in known_lexicon:
            return root_word + '!'

    return ocr_word


def get_replacement(word, replacements):
    """
    Return ``word`` with its core replaced or OCR-corrected.

    Punctuation at either edge of ``word`` is set aside, the remaining core
    is upper-cased and passed through ``apply_correction_rules`` (using the
    module-level ``VALID_WORDS``), and the edge punctuation is re-attached
    around whatever is returned. Lookup order:

    1. The corrected core, if it is a key of ``replacements``. A correction
       that ends in '!' (the terminal-I rule) is looked up without the '!',
       and the '!' is re-appended after the replacement text; previously
       such a key could never match.
    2. The uncorrected core, if it is a key of ``replacements``. This keeps a
       replacement key from being "corrected" into a different word and lost.
    3. The corrected core itself (via ``str.capitalize``) when a correction
       was made but no replacement exists.
    4. Otherwise ``word`` unchanged.

    Args:
        word: One whitespace-delimited token of OCR text.
        replacements: Mapping from upper-case word to replacement text.

    Returns:
        The rewritten token, or ``word`` itself when nothing applies.
    """
    punct = '!?.,-…\'\"'
    core = word.strip(punct)
    core_upper = core.upper()

    # Everything before/after the core is edge punctuation to re-attach.
    core_start = word.find(core)
    pre = word[:core_start]
    post = word[core_start + len(core):]

    corrected_core = apply_correction_rules(core_upper, VALID_WORDS)

    # The core itself never ends in '!' (it was stripped above), so a trailing
    # '!' here can only come from the correction step.
    key = corrected_core.rstrip('!')
    restored_punct = corrected_core[len(key):]

    if key in replacements:
        return pre + replacements[key] + restored_punct + post
    if core_upper in replacements:
        return pre + replacements[core_upper] + post
    if corrected_core != core_upper:
        return pre + corrected_core.capitalize() + post
    return word

def filter_and_replace(results, replacements):
    """Keep only the OCR boxes whose text is changed by ``get_replacement``.

    Each result's text is split on whitespace and every token is passed
    through ``get_replacement``. A box is kept, with its tokens re-joined by
    single spaces, only when at least one token came back different.
    """
    kept = []
    for bbox, text, conf in results:
        tokens = text.split()
        rewritten = [get_replacement(token, replacements) for token in tokens]
        # Same length on both sides, so inequality means some token changed.
        if rewritten != tokens:
            kept.append((bbox, " ".join(rewritten), conf))
    return kept

# --------------------------
# MODIFIED DRAWING FUNCTION
# --------------------------
def draw_text_on_image(img, results, font_path, base_font_size, padding_ratio=0.1):
    """
    Return a copy of ``img`` with each OCR result repainted as black text on white.

    All white backing rectangles are painted before any text, so a rectangle
    belonging to one result cannot cover the text of another.

    Args:
        img: PIL image to annotate; it is copied, not modified.
        results: Iterable of ``(bbox, text, conf)`` tuples, where ``bbox`` is
            a sequence of ``(x, y)`` points. ``conf`` is not used.
        font_path: Path of the TrueType font used for rendering.
        base_font_size: Not used; kept so existing callers keep working. The
            size is derived per result from the bounding-box height.
        padding_ratio: Extra margin around the text, as a fraction of the
            measured text width/height.

    Returns:
        The annotated copy of ``img``.
    """
    pil_img = img.copy()
    draw = ImageDraw.Draw(pil_img)

    # Boxes frequently share a size; load each font size once instead of
    # re-reading the font file for every box.
    fonts_by_size = {}

    # (text, font, x_center, y_center) tuples consumed by the second pass.
    text_to_draw = []

    # --- PASS 1: white rectangles ---
    for bbox, text, _conf in results:
        if not text:
            # Nothing to render for empty text, so no box is painted either.
            continue

        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        x_center = int(np.mean(x_coords))
        y_center = int(np.mean(y_coords))
        bbox_height = max(y_coords) - min(y_coords)

        # Font size follows the box height, with a floor of 8.
        est_font_size = max(8, int(bbox_height * 0.73))
        font = fonts_by_size.get(est_font_size)
        if font is None:
            font = ImageFont.truetype(font_path, est_font_size)
            fonts_by_size[est_font_size] = font

        # Measure the rendered text with the chosen font.
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        text_width = right - left
        text_height = bottom - top

        padding_x = int(text_width * padding_ratio)
        padding_y = int(text_height * padding_ratio)

        # Rectangle centred on the box centre. The horizontal half-extent
        # divides by 2.2 rather than 2 (tuning value kept as-is).
        x0 = x_center - text_width // 2.2 - padding_x
        y0 = y_center - text_height // 2 - padding_y
        x1 = x_center + text_width // 2.2 + padding_x
        y1 = y_center + text_height // 2 + padding_y

        draw.rectangle([x0, y0, x1, y1], fill="white")

        text_to_draw.append((text, font, x_center, y_center))

    # --- PASS 2: black text, centred on the same point via the "mm" anchor ---
    for text, font, x_center, y_center in text_to_draw:
        draw.text((x_center, y_center), text, fill="black", font=font, anchor="mm")

    return pil_img


# --------------------------
# PROCESS FOLDERS
# --------------------------
# Every directory below input_folder, plus input_folder itself.
folders = [f for f in input_folder.rglob('*') if f.is_dir()] + [input_folder]

for folder in sorted(set(folders)):
    image_files = sorted(
        list(folder.glob("*.webp")) +
        list(folder.glob("*.jpg")) +
        list(folder.glob("*.jpeg")),
        key=natural_sort_key
    )

    if not image_files:
        continue

    print(f"\nProcessing folder: {folder}")

    # The output directory mirrors the input tree and is the same for every
    # image in this folder, so it is resolved and created once.
    relative_path = folder.relative_to(input_folder)
    output_subfolder = output_folder / relative_path
    output_subfolder.mkdir(parents=True, exist_ok=True)

    for img_path in image_files:
        # Image.open is lazy and holds the file open; take an in-memory copy
        # inside a context manager so the handle is released right away.
        with Image.open(img_path) as source_img:
            if source_img.mode in ("RGB", "RGBA", "L"):
                img = source_img.copy()
            else:
                # Palette, CMYK and similar modes: convert so the array passed
                # to OCR holds real pixel values and the PNG save succeeds.
                img = source_img.convert("RGB")

        # detail=1 yields (bbox, text, confidence) triples, unpacked below.
        results = reader.readtext(np.array(img), allowlist=allow_chars, width_ths=0.05, detail=1)

        # Replace ALL words             (Option 1)
        filtered_results = [
            (bbox, " ".join(get_replacement(w, replacements) for w in text.split()), conf)
            for bbox, text, conf in results
        ]

        modified_img = draw_text_on_image(img, filtered_results, font_path, font_size)

        output_path = output_subfolder / f"{img_path.stem}.png"
        modified_img.save(output_path)
        # NOTE: with Option 1 every OCR box is kept, so this branch reports
        # "found" whenever any text was detected, changed or not.
        if filtered_results:
            print(f"  Replacements/Corrections found → saved: {output_path}")
        else:
            print(f"  No recognized text → saved copy: {output_path}")


Processing folder: input\Chainsawman\Chapter_16
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_1.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_2.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_3.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_4.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_6.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_7.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_8.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_9.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_10.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_11.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_12.png
  Replacements/Corrections found → 

In [33]:
import os
import re
import json
import numpy as np
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import nltk
from nltk.corpus import words

# --- Lexicon (needs a one-time nltk.download('words')) ---
# Every corpus entry is stored upper-cased.
try:
    VALID_WORDS = {entry.upper() for entry in words.words()}
except LookupError:
    print("WARNING: NLTK 'words' corpus not found. Run 'import nltk; nltk.download(\"words\")'")
    # An empty lexicon keeps the script running, but no correction rule can match.
    VALID_WORDS = set()
# -----------------------------------------------------------------

# --------------------------
# CONFIG
# --------------------------
input_folder = Path("input")
output_folder = Path("output")
output_folder.mkdir(exist_ok=True)

# Characters handed to the OCR reader as its `allowlist`.
# NOTE(review): the digit '4' is absent from this list — confirm that is intentional.
allow_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ012356789!?.,-…'\""

# Replacement table, read once from replacements.json.
with open("replacements.json", "r", encoding="utf-8") as f:
    replacements = json.load(f)

# Font used to repaint text onto the pages.
font_path = "fonts/CCMeanwhileBoldItalic.ttf"
font_size = 16

# --------------------------
# INITIALIZE OCR
# --------------------------
# NOTE(review): `easyocr` is not imported in this cell; it relies on an earlier import.
reader = easyocr.Reader(['en'], gpu=True)

# --------------------------
# HELPER FUNCTIONS
# --------------------------
def natural_sort_key(path):
    """Return a key that sorts paths with their numeric parts in numeric order.

    ``str(path)`` is cut into digit and non-digit runs. Digit runs are turned
    into ``int`` and the remaining runs are lower-cased.
    """
    def as_key_part(chunk):
        if chunk.isdigit():
            return int(chunk)
        return chunk.lower()

    return [as_key_part(chunk) for chunk in re.split(r'(\d+)', str(path))]

def apply_correction_rules(ocr_word, known_lexicon):
    """
    Apply lexicon-backed post-OCR fixes to a single token.

    A token that is already in ``known_lexicon``, or that is not purely
    alphabetic (this includes the empty string), is returned untouched.
    Otherwise the rules below are tried in order and the first hit wins.
    Both rules match the upper-case letters 'L' and 'I' only.

    1. L -> U: replace exactly one 'L' with 'U' and accept the result if it
       is in the lexicon. Every 'L' position is tried, left to right, so
       "LALGH" can become "LAUGH" (previously only the first 'L' was tried).
    2. Trailing I -> !: if dropping a final 'I' leaves a lexicon word
       (e.g. MINISTERI -> MINISTER), return that word followed by '!'.

    Args:
        ocr_word: Token produced by OCR.
        known_lexicon: Container of accepted words; only ``in`` is used.

    Returns:
        The corrected token, or ``ocr_word`` unchanged when no rule applies.
    """
    # Known words and non-alphabetic tokens are never rewritten.
    if ocr_word in known_lexicon or not ocr_word.isalpha():
        return ocr_word

    # --- RULE 1: single L -> U substitution, at any position ---
    for position, letter in enumerate(ocr_word):
        if letter != 'L':
            continue
        candidate = ocr_word[:position] + 'U' + ocr_word[position + 1:]
        if candidate in known_lexicon:
            return candidate

    # --- RULE 2: terminal 'I' that was really a '!' ---
    if ocr_word.endswith('I'):
        root_word = ocr_word[:-1]
        if root_word in known_lexicon:
            return root_word + '!'

    # No rule produced a known word.
    return ocr_word


def get_replacement(word, replacements):
    """
    Return ``word`` with its core replaced or OCR-corrected.

    Punctuation at either edge of ``word`` is set aside, the remaining core
    is upper-cased and passed through ``apply_correction_rules`` (using the
    module-level ``VALID_WORDS``), and the edge punctuation is re-attached
    around whatever is returned. Lookup order:

    1. The corrected core, if it is a key of ``replacements``. A correction
       that ends in '!' (the terminal-I rule, MINISTERI -> MINISTER!) is
       looked up without the '!', and the '!' is re-appended after the
       replacement text; previously such a key could never match.
    2. The uncorrected core, if it is a key of ``replacements``. This keeps a
       replacement key from being "corrected" into a different word and lost.
    3. The corrected core itself (via ``str.capitalize``) when a correction
       was made but no replacement exists.
    4. Otherwise ``word`` unchanged.

    Args:
        word: One whitespace-delimited token of OCR text.
        replacements: Mapping from upper-case word to replacement text.

    Returns:
        The rewritten token, or ``word`` itself when nothing applies.
    """
    punct = '!?.,-…\'\"'
    core = word.strip(punct)
    core_upper = core.upper()

    # Everything before/after the core is edge punctuation to re-attach.
    core_start = word.find(core)
    pre = word[:core_start]
    post = word[core_start + len(core):]

    corrected_core = apply_correction_rules(core_upper, VALID_WORDS)

    # The core itself never ends in '!' (it was stripped above), so a trailing
    # '!' here can only come from the correction step.
    key = corrected_core.rstrip('!')
    restored_punct = corrected_core[len(key):]

    if key in replacements:
        return pre + replacements[key] + restored_punct + post
    if core_upper in replacements:
        return pre + replacements[core_upper] + post
    if corrected_core != core_upper:
        return pre + corrected_core.capitalize() + post
    return word

def filter_and_replace(results, replacements):
    """Return the OCR boxes in which ``get_replacement`` changed at least one token.

    The text of each result is split on whitespace and each token is run
    through ``get_replacement``. Boxes with no changed token are dropped;
    the others are returned with their tokens re-joined by single spaces.
    """
    selected = []
    for bbox, text, conf in results:
        originals = text.split()
        updated = [get_replacement(token, replacements) for token in originals]
        changed = any(new != old for new, old in zip(updated, originals))
        if changed:
            selected.append((bbox, " ".join(updated), conf))
    return selected

def draw_text_on_image(img, results, font_path, base_font_size, padding_ratio=0.1):
    """
    Return a copy of ``img`` with each OCR result repainted as black text on white.

    All white backing rectangles are painted before any text, so a rectangle
    belonging to one result cannot cover the text of another.

    Args:
        img: PIL image to annotate; it is copied, not modified.
        results: Iterable of ``(bbox, text, conf)`` tuples, where ``bbox`` is
            a sequence of ``(x, y)`` points. ``conf`` is not used.
        font_path: Path of the TrueType font used for rendering.
        base_font_size: Not used; kept so existing callers keep working. The
            size is derived per result from the bounding-box height.
        padding_ratio: Extra margin around the text, as a fraction of the
            measured text width/height.

    Returns:
        The annotated copy of ``img``.
    """
    pil_img = img.copy()
    draw = ImageDraw.Draw(pil_img)

    # Boxes frequently share a size; load each font size once instead of
    # re-reading the font file for every box.
    fonts_by_size = {}

    # (text, font, x_center, y_center) tuples consumed by the second pass.
    text_to_draw = []

    # --- PASS 1: white rectangles ---
    for bbox, text, _conf in results:
        if not text:
            # Nothing to render for empty text, so no box is painted either.
            continue

        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        x_center = int(np.mean(x_coords))
        y_center = int(np.mean(y_coords))
        bbox_height = max(y_coords) - min(y_coords)

        # Font size follows the box height, with a floor of 8.
        est_font_size = max(8, int(bbox_height * 0.73))
        font = fonts_by_size.get(est_font_size)
        if font is None:
            font = ImageFont.truetype(font_path, est_font_size)
            fonts_by_size[est_font_size] = font

        # Measure the rendered text with the chosen font.
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        text_width = right - left
        text_height = bottom - top

        padding_x = int(text_width * padding_ratio)
        padding_y = int(text_height * padding_ratio)

        # Rectangle centred on the box centre. The horizontal half-extent
        # divides by 2.2 rather than 2 (tuning value kept as-is).
        x0 = x_center - text_width // 2.2 - padding_x
        y0 = y_center - text_height // 2 - padding_y
        x1 = x_center + text_width // 2.2 + padding_x
        y1 = y_center + text_height // 2 + padding_y

        draw.rectangle([x0, y0, x1, y1], fill="white")

        text_to_draw.append((text, font, x_center, y_center))

    # --- PASS 2: black text, centred on the same point via the "mm" anchor ---
    for text, font, x_center, y_center in text_to_draw:
        draw.text((x_center, y_center), text, fill="black", font=font, anchor="mm")

    return pil_img



# --------------------------
# PROCESS FOLDERS
# --------------------------
# Every directory below input_folder, plus input_folder itself.
folders = [f for f in input_folder.rglob('*') if f.is_dir()] + [input_folder]

for folder in sorted(set(folders)):
    image_files = sorted(
        list(folder.glob("*.webp")) +
        list(folder.glob("*.jpg")) +
        list(folder.glob("*.jpeg")),
        key=natural_sort_key
    )

    if not image_files:
        continue

    print(f"\nProcessing folder: {folder}")

    # The output directory mirrors the input tree and is the same for every
    # image in this folder, so it is resolved and created once.
    relative_path = folder.relative_to(input_folder)
    output_subfolder = output_folder / relative_path
    output_subfolder.mkdir(parents=True, exist_ok=True)

    for img_path in image_files:
        # Image.open is lazy and holds the file open; take an in-memory copy
        # inside a context manager so the handle is released right away.
        with Image.open(img_path) as source_img:
            if source_img.mode in ("RGB", "RGBA", "L"):
                img = source_img.copy()
            else:
                # Palette, CMYK and similar modes: convert so the array passed
                # to OCR holds real pixel values and the PNG save succeeds.
                img = source_img.convert("RGB")

        # detail=1 yields (bbox, text, confidence) triples, unpacked below.
        results = reader.readtext(np.array(img), allowlist=allow_chars, width_ths=0.05, detail=1)

        # Replace ALL words             (Option 1)
        # get_replacement applies the L->U and I->! corrections as well as replacements.
        filtered_results = [
            (bbox, " ".join(get_replacement(w, replacements) for w in text.split()), conf)
            for bbox, text, conf in results
        ]
        # Replace ONLY FILTERED words   (Option 2)
        # filtered_results = filter_and_replace(results, replacements)

        modified_img = draw_text_on_image(img, filtered_results, font_path, font_size)

        output_path = output_subfolder / f"{img_path.stem}.png"
        modified_img.save(output_path)
        # NOTE: with Option 1 every OCR box is kept, so this branch reports
        # "found" whenever any text was detected, changed or not.
        if filtered_results:
            print(f"  Replacements/Corrections found → saved: {output_path}")
        else:
            print(f"  No recognized text → saved copy: {output_path}")


Processing folder: input\Chainsawman\Chapter_16
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_1.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_2.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_3.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_4.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_6.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_7.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_8.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_9.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_10.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_11.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_12.png
  Replacements/Corrections found → 

In [35]:
import os
import re
import json
import numpy as np
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import nltk
from nltk.corpus import words

# --------------------------
# CONFIG (comes first: the lexicon below needs `replacements`)
# --------------------------
input_folder = Path("input")
output_folder = Path("output")
output_folder.mkdir(exist_ok=True)

# Characters handed to the OCR reader as its `allowlist`.
# NOTE(review): the digit '4' is absent from this list — confirm that is intentional.
allow_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ012356789!?.,-…'\""

# Replacement table, read once from replacements.json.
with open("replacements.json", "r", encoding="utf-8") as f:
    replacements = json.load(f)

# Font used to repaint text onto the pages.
font_path = "fonts/CCMeanwhileBoldItalic.ttf"
font_size = 16

# --- Lexicon (needs a one-time nltk.download('words')) ---
# Upper-cased NLTK entries plus every key of the replacement table.
try:
    VALID_WORDS = {entry.upper() for entry in words.words()}
    VALID_WORDS.update(replacements.keys())
except LookupError:
    print("WARNING: NLTK 'words' corpus not found. Run 'import nltk; nltk.download(\"words\")'")
    # Without the corpus, the replacement keys are the only known words.
    VALID_WORDS = set(replacements.keys())
# -----------------------------------------------------------------


# --------------------------
# INITIALIZE OCR
# --------------------------
# Use the real EasyOCR reader when the package can be imported; otherwise fall
# back to a mock reader that reports no text, so the rest of the cell still runs.
class EasyOCRMock:
    """Stand-in reader used when easyocr cannot be imported."""

    def readtext(self, img_array, **kwargs):
        """Accept any ``readtext`` arguments and report that no text was found."""
        no_detections = []
        return no_detections


# Both the import and the reader construction stay inside the try, so an
# ImportError raised by either one selects the mock.
try:
    import easyocr
    reader = easyocr.Reader(['en'], gpu=True)
except ImportError:
    print("WARNING: easyocr not found. Using mock reader.")
    reader = EasyOCRMock()

# --------------------------
# HELPER FUNCTIONS
# --------------------------
def natural_sort_key(path):
    """Produce a natural-order sort key for ``path``.

    The path's string form is split around runs of digits; each digit run is
    converted to ``int`` and each other run is lower-cased, so numbered files
    sort as 1, 2, 10 rather than 1, 10, 2.
    """
    pieces = re.split(r'(\d+)', str(path))
    return list(map(lambda piece: int(piece) if piece.isdigit() else piece.lower(), pieces))

def apply_correction_rules(ocr_word, known_lexicon):
    """
    Apply lexicon-backed post-OCR fixes to a single token.

    A token that is already in ``known_lexicon``, or that is not purely
    alphabetic (this includes the empty string), is returned untouched.
    Otherwise the rules below are tried in order and the first hit wins.
    Both rules match the upper-case letters 'L' and 'I' only.

    1. L -> U: replace exactly one 'L' with 'U' and accept the result if it
       is in the lexicon. Every 'L' position is tried, left to right, so
       "LALGH" can become "LAUGH" (previously only the first 'L' was tried).
    2. Trailing I -> !: if dropping a final 'I' leaves a lexicon word, return
       that word followed by '!'. The lexicon holds no punctuation, so the
       root is what gets checked.

    Args:
        ocr_word: Token produced by OCR.
        known_lexicon: Container of accepted words; only ``in`` is used.

    Returns:
        The corrected token, or ``ocr_word`` unchanged when no rule applies.
    """
    # Known words and non-alphabetic tokens are never rewritten.
    if ocr_word in known_lexicon or not ocr_word.isalpha():
        return ocr_word

    # --- RULE 1: single L -> U substitution, at any position ---
    for position, letter in enumerate(ocr_word):
        if letter != 'L':
            continue
        candidate = ocr_word[:position] + 'U' + ocr_word[position + 1:]
        if candidate in known_lexicon:
            return candidate

    # --- RULE 2: terminal 'I' that was really a '!' ---
    if ocr_word.endswith('I'):
        root_word = ocr_word[:-1]
        if root_word in known_lexicon:
            return root_word + '!'

    # No rule produced a known word.
    return ocr_word


def get_replacement(word, replacements):
    """
    Return ``word`` with its core replaced or OCR-corrected.

    Punctuation at either edge of ``word`` is set aside, the remaining core
    is upper-cased and passed through ``apply_correction_rules`` (using the
    module-level ``VALID_WORDS``), and the edge punctuation is re-attached
    around whatever is returned. Lookup order:

    1. The corrected core, if it is a key of ``replacements``. A correction
       that ends in '!' (the terminal-I rule) is looked up without the '!',
       and the '!' is re-appended after the replacement text; previously
       such a key could never match.
    2. The uncorrected core, if it is a key of ``replacements``.
    3. The corrected core itself (via ``str.capitalize``) when a correction
       was made but no replacement exists.
    4. Otherwise ``word`` unchanged.

    Args:
        word: One whitespace-delimited token of OCR text.
        replacements: Mapping from upper-case word to replacement text.

    Returns:
        The rewritten token, or ``word`` itself when nothing applies.
    """
    punct = '!?.,-…\'\"'
    core = word.strip(punct)
    core_upper = core.upper()

    # Everything before/after the core is edge punctuation to re-attach. The
    # same slices apply to every branch below, so they are computed once.
    core_start = word.find(core)
    pre = word[:core_start]
    post = word[core_start + len(core):]

    corrected_core = apply_correction_rules(core_upper, VALID_WORDS)

    # The core itself never ends in '!' (it was stripped above), so a trailing
    # '!' here can only come from the correction step.
    key = corrected_core.rstrip('!')
    restored_punct = corrected_core[len(key):]

    if key in replacements:
        return pre + replacements[key] + restored_punct + post
    if core_upper in replacements:
        return pre + replacements[core_upper] + post
    if corrected_core != core_upper:
        return pre + corrected_core.capitalize() + post
    return word

def filter_and_replace(results, replacements):
    """Select the OCR boxes whose text ``get_replacement`` actually alters.

    For every ``(bbox, text, conf)`` result, the text is split on whitespace
    and each token goes through ``get_replacement``. Only boxes where some
    token changed are returned, with the new tokens joined by single spaces.
    """
    output = []
    for bbox, text, conf in results:
        pairs = [(token, get_replacement(token, replacements)) for token in text.split()]
        if all(before == after for before, after in pairs):
            continue
        output.append((bbox, " ".join(after for _, after in pairs), conf))
    return output

def draw_text_on_image(img, results, font_path, base_font_size, padding_ratio=0.1):
    """
    Return a copy of ``img`` with each OCR result repainted as black text on white.

    All white backing rectangles are painted before any text, so a rectangle
    belonging to one result cannot cover the text of another.

    Args:
        img: PIL image to annotate; it is copied, not modified.
        results: Iterable of ``(bbox, text, conf)`` tuples, where ``bbox`` is
            a sequence of ``(x, y)`` points. ``conf`` is not used.
        font_path: Path of the TrueType font used for rendering.
        base_font_size: Not used; kept so existing callers keep working. The
            size is derived per result from the bounding-box height.
        padding_ratio: Extra margin around the text, as a fraction of the
            measured text width/height.

    Returns:
        The annotated copy of ``img``.
    """
    pil_img = img.copy()
    draw = ImageDraw.Draw(pil_img)

    # Boxes frequently share a size; load each font size once instead of
    # re-reading the font file for every box.
    fonts_by_size = {}

    # (text, font, x_center, y_center) tuples consumed by the second pass.
    text_to_draw = []

    # --- PASS 1: white rectangles ---
    for bbox, text, _conf in results:
        if not text:
            # Nothing to render for empty text, so no box is painted either.
            continue

        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        x_center = int(np.mean(x_coords))
        y_center = int(np.mean(y_coords))
        bbox_height = max(y_coords) - min(y_coords)

        # Font size follows the box height, with a floor of 8.
        est_font_size = max(8, int(bbox_height * 0.73))
        font = fonts_by_size.get(est_font_size)
        if font is None:
            font = ImageFont.truetype(font_path, est_font_size)
            fonts_by_size[est_font_size] = font

        # Measure the rendered text with the chosen font.
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        text_width = right - left
        text_height = bottom - top

        padding_x = int(text_width * padding_ratio)
        padding_y = int(text_height * padding_ratio)

        # Rectangle centred on the box centre. The horizontal half-extent
        # divides by 2.2 rather than 2 (tuning value kept as-is).
        x0 = x_center - text_width // 2.2 - padding_x
        y0 = y_center - text_height // 2 - padding_y
        x1 = x_center + text_width // 2.2 + padding_x
        y1 = y_center + text_height // 2 + padding_y

        draw.rectangle([x0, y0, x1, y1], fill="white")

        text_to_draw.append((text, font, x_center, y_center))

    # --- PASS 2: black text, centred on the same point via the "mm" anchor ---
    for text, font, x_center, y_center in text_to_draw:
        draw.text((x_center, y_center), text, fill="black", font=font, anchor="mm")

    return pil_img


# --------------------------
# PROCESS FOLDERS
# --------------------------
# Every directory below input_folder, plus input_folder itself.
folders = [f for f in input_folder.rglob('*') if f.is_dir()] + [input_folder]

for folder in sorted(set(folders)):
    image_files = sorted(
        list(folder.glob("*.webp")) +
        list(folder.glob("*.jpg")) +
        list(folder.glob("*.jpeg")),
        key=natural_sort_key
    )

    if not image_files:
        continue

    print(f"\nProcessing folder: {folder}")

    # The output directory mirrors the input tree and is the same for every
    # image in this folder, so it is resolved and created once.
    relative_path = folder.relative_to(input_folder)
    output_subfolder = output_folder / relative_path
    output_subfolder.mkdir(parents=True, exist_ok=True)

    for img_path in image_files:
        # Image.open is lazy and holds the file open; take an in-memory copy
        # inside a context manager so the handle is released right away.
        with Image.open(img_path) as source_img:
            if source_img.mode in ("RGB", "RGBA", "L"):
                img = source_img.copy()
            else:
                # Palette, CMYK and similar modes: convert so the array passed
                # to OCR holds real pixel values and the PNG save succeeds.
                img = source_img.convert("RGB")

        # detail=1 yields (bbox, text, confidence) triples, unpacked below.
        results = reader.readtext(np.array(img), allowlist=allow_chars, width_ths=0.05, detail=1)

        # Replace ALL words              (Option 1)
        # get_replacement applies the L->U and I->! corrections as well as replacements.
        filtered_results = [
            (bbox, " ".join(get_replacement(w, replacements) for w in text.split()), conf)
            for bbox, text, conf in results
        ]
        # Replace ONLY FILTERED words    (Option 2)
        # filtered_results = filter_and_replace(results, replacements)

        modified_img = draw_text_on_image(img, filtered_results, font_path, font_size)

        output_path = output_subfolder / f"{img_path.stem}.png"
        modified_img.save(output_path)

        # NOTE: with Option 1 every OCR box is kept, so `filtered_results` is
        # non-empty whenever any text was detected. The first message therefore
        # does not prove that a word actually changed; the original print
        # structure is kept on purpose.
        if filtered_results:
            print(f"  Replacements/Corrections found → saved: {output_path}")
        else:
            print(f"  No recognized text → saved copy: {output_path}")


Processing folder: input\Chainsawman\Chapter_16
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_1.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_2.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_3.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_4.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_6.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_7.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_8.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_9.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_10.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_11.png
  Replacements/Corrections found → saved: output\Chainsawman\Chapter_16\page_12.png
  Replacements/Corrections found → 