In [50]:
from pathlib import Path
import easyocr
import numpy as np
from sklearn.cluster import DBSCAN
import os
from pathlib import Path
from paddleocr import PaddleOCR
from PIL import Image, ImageDraw, ImageFont
import string
import pytesseract
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import json, re, cv2
from nltk.corpus import wordnet

In [None]:
import os
import re
import json
import numpy as np
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import nltk
from nltk.corpus import words

# --------------------------
# CONFIG
# --------------------------
# Source images are read from `input_folder` (recursively, see the processing
# loop); results are written under `output_folder`.
input_folder = Path(r"input")
output_folder = Path("output")
output_folder.mkdir(exist_ok=True)

# Characters passed to the OCR reader as its allow-list.
# NOTE(review): the digit '0' and all lowercase letters are absent -- confirm
# this is intentional.
allow_chars = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "123456789"
    "!?.,-…'\""
)

# Word -> replacement-text table loaded from disk.
# NOTE(review): lookups elsewhere use upper-cased words, so keys are presumably
# expected to be upper case -- verify against replacements.json.
with open("replacements.json", "r", encoding="utf-8") as f:
    replacements = json.load(f)

font_path = "fonts/wild-words-bold.ttf"
font_size = 16

# Build known lexicon: the upper-cased NLTK word list (when the corpus is
# available) plus every key of the replacement table.
try:
    VALID_WORDS = {entry.upper() for entry in words.words()}
except LookupError:
    print("WARNING: NLTK 'words' corpus not found. Run nltk.download('words').")
    VALID_WORDS = set()
VALID_WORDS.update(replacements.keys())

# --------------------------
# OCR
# --------------------------
class EasyOCRMock:
    """Stand-in OCR reader that never detects any text."""

    def readtext(self, img_array, **kwargs):
        """Ignore the image and every keyword option; return an empty list."""
        detections = []
        return detections

# Build the module-level OCR `reader`. If easyocr cannot be imported or its
# Reader cannot be constructed, fall back to EasyOCRMock so the rest of the
# script still runs (the mock reports no detections for every image).
try:
    import easyocr
    reader = easyocr.Reader(['en'], gpu=True)
except ImportError:
    # An import failed (typically easyocr is not installed).
    print("WARNING: easyocr not found. Using mock reader.")
    reader = EasyOCRMock()
except Exception as e:
    # Any other error raised while importing easyocr or creating the Reader.
    print(f"WARNING: easyocr initialization failed ({e}). Using mock reader.")
    reader = EasyOCRMock()


# --------------------------
# HELPERS
# --------------------------
def natural_sort_key(path):
    """Return a sort key that orders embedded numbers numerically.

    The string form of *path* is split into alternating text and digit runs,
    so that e.g. ``page_2`` sorts before ``page_10``.

    Args:
        path: Any object whose ``str()`` is the name to sort by (a ``str`` or
            ``pathlib.Path``).

    Returns:
        A list alternating lower-cased text pieces (even positions, possibly
        empty strings) and ``int`` values (odd positions).
    """
    # With a capturing group, re.split returns [text, match, text, match, ...],
    # so every odd position is a `\d+` match and is always safe for int().
    # Testing `str.isdigit()` instead would also accept characters such as
    # '²' that `\d` does not match and int() cannot parse.
    pieces = re.split(r'(\d+)', str(path))
    return [
        int(piece) if index % 2 else piece.lower()
        for index, piece in enumerate(pieces)
    ]

def apply_correction_rules(word, lexicon):
    """Try a fixed set of character-level fixes to turn *word* into a known word.

    Args:
        word: The word to check, compared against *lexicon* exactly as given.
        lexicon: Container of known words (anything supporting ``in``).

    Returns:
        ``(result, changed)``. ``changed`` is ``False`` when *word* is already
        in *lexicon* or no rule produces a known word; in both cases *result*
        is *word* itself.
    """
    if word in lexicon:
        return word, False

    # (misread character, intended character, max replacements; -1 = all).
    # Rules are tried in this order and the first hit wins.
    substitutions = (
        ('L', 'U', 1),
        ('P', 'R', 1),
        ('4', 'U', -1),
    )
    for misread, intended, limit in substitutions:
        if misread not in word:
            continue
        candidate = word.replace(misread, intended, limit)
        if candidate in lexicon:
            return candidate, True

    # A trailing 'I' is dropped when the remaining stem is known, either bare
    # or with a '!' appended. The '!' itself is not added to the result.
    if word.endswith('I'):
        stem = word[:-1]
        if stem in lexicon or stem + '!' in lexicon:
            return stem, True

    return word, False

def get_replacement(word, replacements):
    """Return the substituted form of a single OCR token, if any.

    The token is stripped of surrounding punctuation, upper-cased and run
    through ``apply_correction_rules`` against ``VALID_WORDS``. A match in
    *replacements* (tried first with the corrected word, then with the plain
    upper-cased core) takes priority; otherwise a successful rule correction
    is used, written with ``str.capitalize()``. Surrounding punctuation is
    re-attached unchanged.

    Args:
        word: One whitespace-delimited token of OCR text.
        replacements: Mapping from upper-cased word to replacement text.

    Returns:
        ``(new_word, True)`` when the token changed, else ``(word, False)``.
    """
    core = word.strip('!?.,-…\'\"')
    upper_core = core.upper()
    corrected, was_rule_corrected = apply_correction_rules(upper_core, VALID_WORDS)

    # Split the token into the punctuation before and after its core.
    start = word.find(core)
    prefix, suffix = word[:start], word[start + len(core):]

    # 1. Replacement table lookup; the corrected spelling is tried first.
    matched_key = next(
        (key for key in (corrected, upper_core) if key in replacements),
        None,
    )
    if matched_key is not None:
        candidate = prefix + replacements[matched_key] + suffix
    # 2. Otherwise fall back to the rule-based correction.
    elif was_rule_corrected:
        candidate = prefix + corrected.capitalize() + suffix
    else:
        return word, False

    # A substitution that reproduces the input does not count as a change.
    if candidate == word:
        return word, False
    return candidate, True

def process_ocr_results(results, replacements):
    """Rewrite OCR lines word by word and keep only the lines that changed.

    Args:
        results: Iterable of ``(bbox, text, conf)`` triples.
        replacements: Mapping handed through to ``get_replacement``.

    Returns:
        A list of ``(bbox, new_text, conf)`` triples, one per input line in
        which at least one word was changed. ``new_text`` is the line's words
        re-joined with single spaces.
    """
    kept = []

    for bbox, text, conf in results:
        outcomes = [get_replacement(token, replacements) for token in text.split()]

        # Only lines with at least one changed word are redrawn. To redraw
        # every detected line instead, drop this guard.
        if not any(changed for _, changed in outcomes):
            continue

        rewritten = " ".join(new_word for new_word, _ in outcomes)
        kept.append((bbox, rewritten, conf))

    return kept

def draw_text_on_image(img, results, font_path, base_font_size, padding_ratio=0.1):
    """Paint each result's text over a white patch on a copy of *img*.

    Args:
        img: Image to annotate; it is copied, not modified.
        results: Iterable of ``(bbox, text, conf)`` triples, where ``bbox`` is
            a sequence of points indexable as ``p[0]`` (x) and ``p[1]`` (y).
        font_path: Path of the TrueType font file to load.
        base_font_size: Accepted for interface compatibility; not used, the
            font size is derived from each bounding box's height instead.
        padding_ratio: Fraction of the measured text width/height added as
            padding around the white patch.

    Returns:
        The annotated copy of *img*.
    """
    canvas = img.copy()
    painter = ImageDraw.Draw(canvas)
    pending = []

    # Pass 1: blank out every region first, so that a later patch can never
    # cover text that was already drawn.
    for bbox, text, conf in results:
        xs = [point[0] for point in bbox]
        ys = [point[1] for point in bbox]
        x_center = int(np.mean(xs))
        y_center = int(np.mean(ys))

        # Font size follows the detected box height, with a floor of 8.
        box_height = max(ys) - min(ys)
        font = ImageFont.truetype(font_path, max(8, int(box_height * 0.73)))

        left, top, right, bottom = painter.textbbox((0, 0), text, font=font, anchor="mm")
        w = right - left
        h = bottom - top
        pad_x = int(w * padding_ratio)
        pad_y = int(h * padding_ratio)

        patch = [
            x_center - w // 2.25 - pad_x,
            y_center - h // 2 - pad_y,
            x_center + w // 2.25 + pad_x,
            y_center + h // 2 + pad_y,
        ]
        painter.rectangle(patch, fill="white")

        pending.append((text, font, x_center, y_center))

    # Pass 2: draw the text centred on each box.
    for text, font, x_center, y_center in pending:
        painter.text((x_center, y_center), text, fill="black", font=font, anchor="mm")

    return canvas

# --------------------------
# PROCESS IMAGES
# --------------------------
# Every directory below the input folder, plus the input folder itself.
folders = [f for f in input_folder.rglob('*') if f.is_dir()] + [input_folder]

for folder in sorted(set(folders)):
    # Images directly inside this folder, in natural (numeric-aware) order.
    image_files = sorted(
        (
            candidate
            for pattern in ("*.webp", "*.jpg", "*.jpeg")
            for candidate in folder.glob(pattern)
        ),
        key=natural_sort_key,
    )
    if not image_files:
        continue
    print(f"\nProcessing folder: {folder}")

    for img_path in image_files:
        try:
            img = Image.open(img_path).convert("RGB")
        except Exception as e:
            print(f"  ERROR: Could not open {img_path}. Skipping. Error: {e}")
            continue

        img_array = np.array(img)

        results = reader.readtext(img_array, allowlist=allow_chars, width_ths=0.05, detail=1)

        filtered_results = process_ocr_results(results, replacements)

        # Redraw only when something changed; otherwise save the page as-is.
        modified_img = (
            draw_text_on_image(img, filtered_results, font_path, font_size)
            if filtered_results
            else img
        )

        # Mirror the input folder layout under the output folder; every page
        # is written as PNG regardless of its source format.
        out_sub = output_folder / folder.relative_to(input_folder)
        out_sub.mkdir(parents=True, exist_ok=True)
        out_path = out_sub / f"{img_path.stem}.png"
        modified_img.save(out_path)

        if not filtered_results:
            print(f"  No corrections found (copied original) → saved: {out_path}")
        else:
            print(f"  Correction(s) found → saved: {out_path} ({len(filtered_results)} line(s) replaced)")


Processing folder: input\Chainsawman\Chapter_20
  Correction(s) found → saved: output\Chainsawman\Chapter_20\page_13.png (8 line(s) replaced)
  Correction(s) found → saved: output\Chainsawman\Chapter_20\page_15.png (3 line(s) replaced)
