In [None]:
from pathlib import Path
import easyocr
import numpy as np
from sklearn.cluster import DBSCAN
import os
from pathlib import Path
from paddleocr import PaddleOCR
from PIL import Image, ImageDraw, ImageFont
import string
import pytesseract
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import json, re, cv2

In [62]:
# --------------------------
# CONFIG
# --------------------------
# Root of the pages to process and the root the results are written under.
input_folder = Path(r"input")
output_folder = Path("output")
output_folder.mkdir(exist_ok=True)

# Characters EasyOCR is allowed to emit (passed as `allowlist` to readtext):
# uppercase letters, all ten digits and a small punctuation set.
# The digit run previously read "012356789", silently dropping "4".
allow_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!?.,-…'\""

# Load replacements from JSON file
# (get_replacement looks words up by their upper-cased form)
with open("replacements.json", "r", encoding="utf-8") as f:
    replacements = json.load(f)

# Font used to render the replacement text. font_size is handed to
# draw_text_on_image as base_font_size.
font_path = "fonts/CCMeanwhileBoldItalic.ttf"
font_size = 16

# --------------------------
# INITIALIZE OCR
# --------------------------
# English-only EasyOCR reader, created once and reused for every page.
reader = easyocr.Reader(['en'], gpu=True)

# --------------------------
# HELPER FUNCTIONS
# --------------------------
def natural_sort_key(path):
    """Build a sort key that orders embedded numbers numerically.

    The string form of *path* is split on runs of digits; digit runs become
    ints and everything else is lower-cased, so "page_2" sorts before
    "page_10".
    """
    def _normalise(chunk):
        if chunk.isdigit():
            return int(chunk)
        return chunk.lower()

    return list(map(_normalise, re.split(r'(\d+)', str(path))))

def get_replacement(word, replacements):
    """Return *word* with its core swapped for the mapped replacement.

    Leading/trailing punctuation is stripped to obtain the core, which is
    looked up upper-cased in *replacements*. On a hit the surrounding
    punctuation is kept around the replacement; on a miss *word* is returned
    unchanged.
    """
    trimmed = word.strip('!?.,-…\'\"')
    lookup = trimmed.upper()
    if lookup not in replacements:
        return word

    # The stripped core is always a substring of word, so index() cannot fail.
    begin = word.index(trimmed)
    end = begin + len(trimmed)
    return word[:begin] + replacements[lookup] + word[end:]

def filter_and_replace(results, replacements):
    """Keep only OCR boxes in which at least one word gets replaced.

    Each kept entry is (bbox, new_text, conf), where new_text is the
    whitespace-split words of the original text, after replacement, joined
    with single spaces.
    """
    kept = []
    for bbox, text, conf in results:
        originals = text.split()
        swapped = [get_replacement(token, replacements) for token in originals]
        # Same length on both sides, so the lists differ exactly when some
        # word was actually replaced.
        if swapped != originals:
            kept.append((bbox, " ".join(swapped), conf))
    return kept

def draw_text_on_image(img, results, font_path, base_font_size, padding_ratio=0.1):
    """Paint a white box and the given text over each OCR detection.

    Args:
        img: PIL image to annotate. It is copied; the original is not modified.
        results: iterable of (bbox, text, conf) tuples where bbox is a
            sequence of (x, y) corner points.
        font_path: path of the TrueType font used for the text.
        base_font_size: kept for interface compatibility; unused, because the
            font size is derived from each box's height instead.
        padding_ratio: extra margin around the text, as a fraction of the
            rendered text's width and height.

    Returns:
        The annotated copy of *img*.
    """
    pil_img = img.copy()
    draw = ImageDraw.Draw(pil_img)
    # Many boxes share the same estimated size; load each size only once
    # instead of re-reading the font file for every box.
    fonts_by_size = {}

    for bbox, text, _conf in results:
        # Centre and height of the detected box
        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        x_center = int(np.mean(x_coords))
        y_center = int(np.mean(y_coords))
        bbox_height = max(y_coords) - min(y_coords)

        # Estimate font size proportional to OCR box height (never below 8)
        est_font_size = max(8, int(bbox_height * 0.73))  # Adjust scaling as needed
        font = fonts_by_size.get(est_font_size)
        if font is None:
            font = ImageFont.truetype(font_path, est_font_size)
            fonts_by_size[est_font_size] = font

        # Measure text size using the selected font
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        text_width = right - left
        text_height = bottom - top

        # Add padding proportional to text size
        padding_x = int(text_width * padding_ratio)
        padding_y = int(text_height * padding_ratio)

        # Rectangle centred on the OCR box. The 2.2 divisor makes the box
        # slightly narrower than half the text width before padding is added.
        x0 = x_center - text_width // 2.2 - padding_x
        y0 = y_center - text_height // 2 - padding_y
        x1 = x_center + text_width // 2.2 + padding_x
        y1 = y_center + text_height // 2 + padding_y

        # Draw white rectangle, then the text centred on the same point
        draw.rectangle([x0, y0, x1, y1], fill="white")
        draw.text((x_center, y_center), text, fill="black", font=font, anchor="mm")

    return pil_img


# --------------------------
# PROCESS FOLDERS
# --------------------------
# Every sub-directory of input_folder plus input_folder itself.
folders = [f for f in input_folder.rglob('*') if f.is_dir()] + [input_folder]

for folder in sorted(set(folders)):
    image_files = sorted(
        list(folder.glob("*.webp")) +
        list(folder.glob("*.jpg")) +
        list(folder.glob("*.jpeg")),
        key=natural_sort_key
    )

    if not image_files:
        continue

    print(f"\nProcessing folder: {folder}")

    # Mirror the input tree under output_folder. This depends only on the
    # folder, so do it once here rather than once per image.
    relative_path = folder.relative_to(input_folder)
    output_subfolder = output_folder / relative_path
    output_subfolder.mkdir(parents=True, exist_ok=True)

    for img_path in image_files:
        # The context manager closes the source file once the annotated copy
        # has been made, so handles do not pile up across a long run.
        with Image.open(img_path) as img:
            results = reader.readtext(np.array(img), allowlist=allow_chars, width_ths=0.05)
            # Replace ALL words             (Option 1)
            filtered_results = [
                (bbox, " ".join([get_replacement(w, replacements) for w in text.split()]), conf)
                for bbox, text, conf in results
            ]
            # Replace ONLY FILTERED words   (Option 2)
            # filtered_results = filter_and_replace(results, replacements)

            modified_img = draw_text_on_image(img, filtered_results, font_path, font_size)

        output_path = output_subfolder / f"{img_path.stem}.png"
        modified_img.save(output_path)
        # With Option 1 this list holds every OCR box, so it is non-empty
        # whenever any text was detected on the page.
        if filtered_results:
            print(f"  Replacements found → saved: {output_path}")
        else:
            print(f"  No replacements → saved copy: {output_path}")


Processing folder: input\Chainsawman\Chapter_16
  Replacements found → saved: output\Chainsawman\Chapter_16\page_1.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_2.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_3.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_4.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_5.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_6.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_7.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_8.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_9.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_10.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_11.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_12.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_13.png
  Replacements found → s

In [None]:


# --------------------------
# CONFIG
# --------------------------
# Tree that is scanned for pages, and the tree the rendered pages go to.
input_folder = Path("input")
output_folder = Path("output")
output_folder.mkdir(exist_ok=True)

# Character whitelist handed to the OCR engine, grouped by character class.
allow_chars = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "!?.,-…'\""
)

# Word-replacement table loaded from the JSON file in the working directory.
replacements = json.loads(Path("replacements.json").read_text(encoding="utf-8"))

# Font used to render the replacement text.
font_path = "fonts/CCMeanwhileBoldItalic.ttf"
font_size = 16

# --------------------------
# HELPER FUNCTIONS
# --------------------------
def natural_sort_key(path):
    """Return a key that sorts digit runs in *path* by numeric value.

    Non-digit pieces are compared case-insensitively.
    """
    key = []
    for piece in re.split(r'(\d+)', str(path)):
        if piece.isdigit():
            key.append(int(piece))
        else:
            key.append(piece.lower())
    return key

def get_replacement(word, replacements):
    """Swap the punctuation-stripped core of *word* for its replacement.

    The core is matched upper-cased against the keys of *replacements*.
    Punctuation around the core is preserved; words without a match are
    returned as-is.
    """
    edge_punct = '!?.,-…\'\"'
    stem = word.strip(edge_punct)
    wanted = stem.upper()
    if wanted not in replacements:
        return word

    # stem came from word.strip(), so it is guaranteed to be found.
    offset = word.index(stem)
    prefix = word[:offset]
    suffix = word[offset + len(stem):]
    return prefix + replacements[wanted] + suffix

def filter_and_replace(results, replacements):
    """Return the OCR boxes whose text contains at least one replaced word.

    Each returned tuple is (bbox, new_text, conf); new_text is the box's
    words after replacement, joined with single spaces.
    """
    matches = []
    for bbox, text, conf in results:
        before = text.split()
        after = [get_replacement(item, replacements) for item in before]
        if any(new != old for new, old in zip(after, before)):
            matches.append((bbox, " ".join(after), conf))
    return matches

def draw_text_on_image(img, results, font_path, base_font_size, padding_ratio=0.1):
    """Paint a white box and the given text over each OCR detection.

    Args:
        img: PIL image to annotate. It is copied; the original is not modified.
        results: iterable of (bbox, text, conf) tuples where bbox is
            (x0, y0, x1, y1).
        font_path: path of the TrueType font used for the text.
        base_font_size: kept for interface compatibility; unused, because the
            font size is derived from each box's height instead.
        padding_ratio: extra margin around the text, as a fraction of the
            rendered text's width and height.

    Returns:
        The annotated copy of *img*.
    """
    pil_img = img.copy()
    draw = ImageDraw.Draw(pil_img)
    # Many boxes share the same estimated size; load each size only once
    # instead of re-reading the font file for every box.
    fonts_by_size = {}

    for bbox, text, _conf in results:
        x0, y0, x1, y1 = bbox
        x_center = (x0 + x1) // 2
        y_center = (y0 + y1) // 2
        bbox_height = y1 - y0

        # Estimate font size proportional to OCR box height (never below 8)
        est_font_size = max(8, int(bbox_height * 0.73))
        font = fonts_by_size.get(est_font_size)
        if font is None:
            font = ImageFont.truetype(font_path, est_font_size)
            fonts_by_size[est_font_size] = font

        # Measure text size using the selected font
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        text_width = right - left
        text_height = bottom - top

        # Add padding proportional to text size
        padding_x = int(text_width * padding_ratio)
        padding_y = int(text_height * padding_ratio)

        # Rectangle centred on the OCR box. The 2.2 divisor makes the box
        # slightly narrower than half the text width before padding is added.
        rect_x0 = x_center - text_width // 2.2 - padding_x
        rect_y0 = y_center - text_height // 2 - padding_y
        rect_x1 = x_center + text_width // 2.2 + padding_x
        rect_y1 = y_center + text_height // 2 + padding_y

        # Draw white rectangle, then the text centred on the same point
        draw.rectangle([rect_x0, rect_y0, rect_x1, rect_y1], fill="white")
        draw.text((x_center, y_center), text, fill="black", font=font, anchor="mm")

    return pil_img

# --------------------------
# OCR FUNCTION (PYTESSERACT)
# --------------------------
def run_ocr_with_pytesseract(img):
    """Run OCR with pytesseract and return (bbox, text, conf) tuples.

    Args:
        img: PIL image of the page, in any mode.

    Returns:
        A list of ((x0, y0, x1, y1), text, conf) for every non-empty
        detection whose confidence is above 20.
    """
    # Normalise to 3-channel RGB first. Without this a grayscale page gives a
    # 2-D array that the colour conversion rejects, and a CMYK page would be
    # read with the wrong channel meaning.
    rgb = np.array(img.convert("RGB"))

    # Optional preprocessing (helps with stylized manga text):
    # grayscale -> light blur -> Otsu binarisation.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # Run pytesseract OCR with bounding boxes
    # NOTE(review): allow_chars itself contains ' and " characters, so this
    # quoted config string may not tokenise the same way on every platform;
    # confirm against pytesseract's config parsing.
    data = pytesseract.image_to_data(
        gray,
        config=f'--psm 6 -c tessedit_char_whitelist="{allow_chars}"',
        output_type=pytesseract.Output.DICT
    )

    results = []
    rows = zip(
        data['text'], data['conf'],
        data['left'], data['top'], data['width'], data['height'],
    )
    for raw_text, raw_conf, x, y, w, h in rows:
        text = raw_text.strip()
        # Rows reported with confidence -1 fall below the threshold and are
        # dropped along with other low-confidence detections.
        conf = float(raw_conf)
        if text and conf > 20:  # Skip empty or low-confidence detections
            results.append(((x, y, x + w, y + h), text, conf))
    return results

# --------------------------
# PROCESS FOLDERS
# --------------------------
# Every sub-directory of input_folder plus input_folder itself.
folders = [f for f in input_folder.rglob('*') if f.is_dir()] + [input_folder]

for folder in sorted(set(folders)):
    image_files = sorted(
        list(folder.glob("*.webp")) +
        list(folder.glob("*.jpg")) +
        list(folder.glob("*.jpeg")),
        key=natural_sort_key
    )

    if not image_files:
        continue

    print(f"\nProcessing folder: {folder}")

    # Mirror the input tree under output_folder. This depends only on the
    # folder, so do it once here rather than once per image.
    relative_path = folder.relative_to(input_folder)
    output_subfolder = output_folder / relative_path
    output_subfolder.mkdir(parents=True, exist_ok=True)

    for img_path in image_files:
        # The context manager closes the source file once the annotated copy
        # has been made, so handles do not pile up across a long run.
        with Image.open(img_path) as img:
            # Run pytesseract OCR instead of EasyOCR
            results = run_ocr_with_pytesseract(img)

            # Replace ALL words
            filtered_results = [
                (bbox, " ".join([get_replacement(w, replacements) for w in text.split()]), conf)
                for bbox, text, conf in results
            ]

            modified_img = draw_text_on_image(img, filtered_results, font_path, font_size)

        output_path = output_subfolder / f"{img_path.stem}.png"
        modified_img.save(output_path)
        # This list holds every OCR box that passed the confidence filter, so
        # it is non-empty whenever any text was detected on the page.
        if filtered_results:
            print(f"  Replacements found → saved: {output_path}")
        else:
            print(f"  No replacements → saved copy: {output_path}")



Processing folder: input\Chainsawman\Chapter_16
  Replacements found → saved: output\Chainsawman\Chapter_16\page_1.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_2.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_3.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_4.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_5.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_6.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_7.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_8.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_9.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_10.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_11.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_12.png
  Replacements found → saved: output\Chainsawman\Chapter_16\page_13.png
  Replacements found → s