# 🧠🔎 TIF & IMAGE WORKFLOW: TEXT SEARCH + PHOTO EXTRACTION

Version: 3.0 (Refactored API, YOLO and Bounding Box modes)

This notebook demonstrates a complete pipeline:
- OCR-based page detection in multi-page TIFs
- Photo extraction from detected pages using YOLO
- Alternate extraction using predefined bounding boxes (no YOLO)

The notebook uses the refactored vendored libraries:
- external.tif_searcher.TifTextSearcher
- external.photo_extractor.PhotoExtractor

Configure once in Section 1, then run each section in order.

## SECTION 1: ⚙️ MASTER CONFIGURATION

In [1]:
import logging
import sys
from pathlib import Path
from typing import Any, Dict, List

# =============================================================================
# --- 1. Core Path Configurations ---
# =============================================================================
# Every location below is derived from the kernel's current working directory.
PROJECT_ROOT = Path.cwd()
EXTERNAL_DIR = PROJECT_ROOT / 'external'
TIF_INPUT_DIRECTORY = PROJECT_ROOT / 'data_real'
OUTPUT_DIRECTORY_FOR_CROPS = PROJECT_ROOT.joinpath('pipeline_outputs', 'extracted_crops')

# Create the crop output folder (and any missing parents) up front
OUTPUT_DIRECTORY_FOR_CROPS.mkdir(parents=True, exist_ok=True)

# =============================================================================
# --- 2. System Path & Vendored Library Imports ---
# =============================================================================
# Prepend the project root first, then the vendored 'external' folder (only
# when that folder exists on disk). Entries already on sys.path are left alone.
for _path_entry, _requires_dir in ((PROJECT_ROOT, False), (EXTERNAL_DIR, True)):
    if _requires_dir and not _path_entry.is_dir():
        continue
    if str(_path_entry) not in sys.path:
        sys.path.insert(0, str(_path_entry))

# Vendored (external) engines used by the rest of the notebook
from external.photo_extractor import PhotoExtractor
from external.tif_searcher import TifTextSearcher

# =============================================================================
# --- 3. Logging Configuration ---
# =============================================================================
# Install a default console configuration only when the root logger has no
# handlers yet, so an existing logging setup is not overridden.
_root_logger = logging.getLogger()
if not _root_logger.handlers:
    logging.basicConfig(
        format='%(asctime)s - [%(levelname)s] - %(message)s',
        level=logging.INFO,
    )
logging.info('Project root: %s', PROJECT_ROOT)

# =============================================================================
# --- 4. Application Configurations ---
# =============================================================================
# Plain-data settings only; nothing in this section touches the filesystem or
# loads a model.

# --- A. Photo Extractor (YOLO) Configuration ---
# Passed to PhotoExtractor as `config_override` when the engines are created.
# gpu_inference: 'true' (GPU-only), 'false' (CPU-only), 'auto' (GPU with CPU fallback)
photo_extractor_config: Dict[str, Any] = {
    'yolo_object_detection': {
        # Weights file location, resolved relative to PROJECT_ROOT
        'model_path': str(PROJECT_ROOT / 'trained_model' / 'yolo_photo_extractor' / 'weights' / 'best.pt'),
        'inference': {
            'target_object_names': ['photo'],
            'confidence_threshold': 0.25,
            'iou_threshold': 0.45,
            'imgsz': 640,
            'gpu_inference': 'auto',
        }
    }
}

# --- B. Manual Bounding Box Configuration ---
# Consumed by the bounding-box extraction section (no YOLO involved).
# Each inner list is one box; with BBOX_FORMAT='xyxy' that is [x1, y1, x2, y2].
BBOX_LIST: List[List[int]] = [[171,236,1480,1100],[171,1168,1480,2032]]  # Two example boxes in 'xyxy' form
BBOX_FORMAT = 'xyxy'  # one of: 'xyxy', 'xywh', 'cxcywh'
BBOX_NORMALIZED = False  # Set to True if coordinates are in [0, 1] range

# --- C. TifTextSearcher (OCR) Configuration Overrides ---
# These settings mirror the structure in 'external/tif_searcher/tif_searcher_core/config.py'
# The dict is unpacked as keyword arguments into TifTextSearcher(...).
searcher_overrides: Dict[str, Any] = {
    # Search parameters
    'search_text': '가맹점 실사 사진',
    'language': 'ko',
    'ocr_backend': 'paddleocr',           # Options: 'paddleocr', 'easyocr', 'tesseract'
    'search_mode': 'exact_phrase',        # Options: 'exact_phrase', 'all_words'
    'allow_normalization': True,          # Normalizes recognized text for better matching
    'remove_spaces_in_normalization': True,
    'recognized_text_debug': True,

    # Zonal OCR (define a search area; set to {} to scan the whole page)
    # NOTE(review): this line was previously described as "top 50% of the page",
    # but 0.05 presumably selects only the top 5% -- confirm the intended fraction.
    'search_location': {'top': 0.05},

    # Offline/engine behavior
    # NOTE(review): the offline_* directories below are machine-specific absolute
    # paths; presumably they only matter when this flag is True -- confirm.
    'use_offline_models': False,

    # PaddleOCR-specific settings
    'use_angle_cls': True,
    'use_gpu_for_paddle': True,           # Requires CUDA/cuDNN for PaddleOCR
    'paddle_batch_size': 6,
    'download_path_det_model': '',
    'download_path_rec_model': '',
    'download_path_cls_model': '',
    'offline_paddle_det_model_dir': r"C:\Users\jeeb\.paddlex\official_models\PP-OCRv3_det",
    'offline_paddle_rec_model_dir': r"C:\Users\jeeb\.paddlex\official_models\korean_PP-OCRv3_rec",
    'offline_paddle_cls_model_dir': r"C:\Users\jeeb\.paddlex\official_models\ch_ppocr_mobile_v2.0_cls_infer",

    # EasyOCR-specific settings (used only if ocr_backend='easyocr' and use_offline_models=True)
    'offline_easyocr_model_dir': r"C:\ocr_models\.EasyOCR",

    # Tesseract-specific settings (used only if ocr_backend='tesseract')
    'tesseract_cmd_path': 'tesseract',    # e.g., r"C:\Program Files\Tesseract-OCR\tesseract.exe"
}

# =============================================================================
# --- 5. Initialization and Validation ---
# =============================================================================
# Engine handles start unset; they are assigned when the engines are
# instantiated later in the notebook.
searcher_instance = None
extractor_instance = None

# Warn (without stopping) when the input location is missing. is_dir() is used
# rather than exists() so that a regular file at this path is reported too.
if not TIF_INPUT_DIRECTORY.is_dir():
    logging.warning("TIF input directory not found: %s", TIF_INPUT_DIRECTORY)

##### Logging configuration

In [2]:
LOG_TO_FILE = True  # False -> log to the console only

import logging
import sys
from pathlib import Path

# --- Settings ---
LOG_LEVEL = logging.INFO
LOG_FORMAT = '%(asctime)s - [%(levelname)s] - %(message)s'
LOG_DIR = Path('./notebook_logs')
LOG_FILE = LOG_DIR / 'ocr_section2.log'

# --- Handlers: stdout always, plus an optional UTF-8 log file ---
handlers = [logging.StreamHandler(sys.stdout)]
if LOG_TO_FILE:
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    # mode='w' starts a fresh log file each time this cell runs
    file_handler = logging.FileHandler(LOG_FILE, mode='w', encoding='utf-8')
    file_handler.setLevel(LOG_LEVEL)
    handlers.append(file_handler)

# force=True replaces whatever handlers are already attached to the root
# logger, which is what makes re-running this cell in Jupyter take effect.
logging.basicConfig(
    level=LOG_LEVEL,
    format=LOG_FORMAT,
    handlers=handlers,
    force=True,
)

# Make sure the vendored OCR package logs at the chosen level
for _logger_name in ('external.tif_searcher', 'external.tif_searcher.tif_searcher_core'):
    logging.getLogger(_logger_name).setLevel(LOG_LEVEL)

# Quiet noisy third-party loggers (remove an entry to see its INFO output)
for _logger_name in ('PIL.PngImagePlugin', 'ultralytics'):
    logging.getLogger(_logger_name).setLevel(logging.WARNING)

# Confirm which destinations are active
if not LOG_TO_FILE:
    logging.info('Logging configured: console only')
else:
    logging.info('Logging configured: console + file at %s', str(LOG_FILE.resolve()))

2025-09-04 15:49:15,032 - [INFO] - Logging configured: console + file at D:\frm_git\hyundai_document_authenticator\hyundai_document_authenticator\notebook_logs\ocr_section2.log


## SECTION 2: 📄 OCR — Find Pages Containing Target Text

In [3]:
# Build both engines from the Section 1 settings. A construction failure is
# logged with its traceback and then re-raised so the notebook stops here.
try:
    searcher_instance = TifTextSearcher(**searcher_overrides)
    extractor_instance = PhotoExtractor(config_override=photo_extractor_config)
except Exception as e:
    logging.exception('A critical error occurred during initialization: %s', e)
    raise
else:
    logging.info("Successfully initialized TifTextSearcher and PhotoExtractor.")

--- 🚀 OCR ENGINE INITIALIZATION (PaddleOCR) ---
  - INFO: Mapped language code 'ko' to 'korean' for PaddleOCR compatibility via config.

🔎 Checking for PaddleOCR models...
- INFO: Model 'Detection' not resolved in provided path. Will use PaddleOCR default models (auto-download or cached).
- INFO: Model 'Recognition' not resolved in provided path. Will use PaddleOCR default models (auto-download or cached).
- INFO: Model 'Classification' not resolved in provided path. Will use PaddleOCR default models (auto-download or cached).

🔧 Assembling PaddleOCR Parameters:
  - Language: 'korean'
  - GPU Acceleration: Enabled
  - Text Angle Classification: Enabled

⚙️ Initializing PaddleOCR engine with parameters:
  - use_angle_cls: True
  - lang: korean
  - rec_batch_num: 6
  - show_log: False
  - INFO: PaddleOCR resolved Detection model directory to: C:\Users\jeeb\.paddleocr\whl\det\ml\Multilingual_PP-OCRv3_det_infer
  - INFO: PaddleOCR resolved Recognition model directory to: C:\Users\jeeb\.pad

In [4]:
# =================================================================================================
# SECTION 2: OCR page detection
# =================================================================================================
from PIL import Image, ImageSequence

# Result list: one {'path': Path, 'pages': <matched pages>} entry per TIF in
# which the searcher reported at least one matching page.
all_files_to_process = []
root = Path(TIF_INPUT_DIRECTORY)
supported = {'.tif', '.tiff'}

# Recursive scan. Only regular files are kept (a directory whose name ends in
# '.tif' would otherwise be handed to the OCR engine), and the list is sorted
# so files are processed in a reproducible order.
tif_files = sorted(
    p for p in root.rglob('*')
    if p.is_file() and p.suffix.lower() in supported
)
logging.info('Found %d TIF files under %s', len(tif_files), root)

for tif_path in tif_files:
    try:
        pages = searcher_instance.find_text_pages(tif_path)
        if pages:
            logging.info('%s: matched pages %s', tif_path.name, pages)
            all_files_to_process.append({'path': tif_path, 'pages': pages})
        else:
            logging.info('%s: no matched pages', tif_path.name)
    except Exception as e:
        # Best effort: one unreadable file must not stop the scan, but keep the
        # traceback so the failure can be diagnosed from the log.
        logging.warning('OCR failed for %s: %s', tif_path, e, exc_info=True)

logging.info('Total files with matched pages: %d', len(all_files_to_process))


2025-09-04 15:49:27,717 - [INFO] - Found 10 TIF files under d:\frm_git\hyundai_document_authenticator\hyundai_document_authenticator\data_real
  - INFO: PaddleOCR resolved Recognition model directory to: C:\Users\jeeb\.paddleocr\whl\rec\korean\korean_PP-OCRv4_rec_infer
  - INFO: PaddleOCR resolved Classification model directory to: C:\Users\jeeb\.paddleocr\whl\cls\ch_ppocr_mobile_v2.0_cls_infer
2025-09-04 15:49:29,489 - [INFO] - [OCR DEBUG] N2023100400003THA00100001.tif page 1 zone 1 (top, 1688.9 ms) RAW: 가맹점 실사 사진
2025-09-04 15:49:29,490 - [INFO] - [OCR DEBUG] N2023100400003THA00100001.tif page 1 zone 1 (top, 1688.9 ms) NORM: 가맹점실사사진
2025-09-04 15:49:29,493 - [INFO] - N2023100400003THA00100001.tif: matched pages [1]
2025-09-04 15:49:29,585 - [INFO] - [OCR DEBUG] N2023100400119THA00100001 - Copy.tif page 1 zone 1 (top, 70.8 ms) RAW: 가맹점 실사 사진
2025-09-04 15:49:29,585 - [INFO] - [OCR DEBUG] N2023100400119THA00100001 - Copy.tif page 1 zone 1 (top, 70.8 ms) NORM: 가맹점실사사진
2025-09-04 15:49:2

## SECTION 3: 🖼️ YOLO — Extract Photos from Detected Pages

In [5]:
# Fail fast with an actionable message when the configured YOLO weights file
# does not exist (or is not a regular file).
_yolo_model_path = Path(photo_extractor_config['yolo_object_detection']['model_path'])
if not _yolo_model_path.is_file():
    _missing_model_message = "\n".join((
        f"YOLO model not found at: {_yolo_model_path}",
        "Please update 'photo_extractor_config[\"yolo_object_detection\"][\"model_path\"]' to a valid .pt file.",
    ))
    raise FileNotFoundError(_missing_model_message)

In [6]:
# =================================================================================================
# SECTION 3: YOLO extraction
# =================================================================================================
# For every page matched by the OCR step, ask the extractor for photo crops and
# write each one as a JPEG named '<tif stem>_page<N>_crop<K>.jpg'.
output_dir = Path(OUTPUT_DIRECTORY_FOR_CROPS)
output_dir.mkdir(parents=True, exist_ok=True)
count = 0  # running total of crops written to disk
for file_info in all_files_to_process:
    tif_path, matched_pages = file_info['path'], file_info['pages']
    for page_num in matched_pages:
        try:
            crops = extractor_instance.extract_photos(tif_path, page_num)
            for crop_no, img in enumerate(crops, start=1):
                crop_name = f'{tif_path.stem}_page{page_num}_crop{crop_no}.jpg'
                img.save(output_dir / crop_name, 'JPEG', quality=95)
                count += 1
            logging.info(f'{tif_path.name} p{page_num}: saved {len(crops)} crop(s)')
        except Exception as e:
            # Best effort: a failing page is reported and the loop moves on
            logging.warning(f'YOLO extract failed for {tif_path} p{page_num}: {e}')

logging.info(f'Total YOLO crops saved: {count}')


2025-09-04 15:49:30,357 - [INFO] - Total YOLO crops saved: 0


## SECTION 4B: 🧭 Bounding Box — Extract Photos from Predefined Regions

Use this mode when you already know the regions of interest. The boxes come from `BBOX_LIST` (Section 1) and are interpreted according to `BBOX_FORMAT`; coordinates can be absolute pixels or normalized to [0, 1], depending on `BBOX_NORMALIZED`.

In [7]:
# =================================================================================================
# SECTION 4B: Bounding-box extraction
# =================================================================================================
# For every page matched by the OCR step, load that page from the TIF, crop the
# predefined boxes (BBOX_LIST / BBOX_FORMAT / BBOX_NORMALIZED) and write each
# crop as a JPEG named '<tif stem>_page<N>_bbox<K>.jpg'.
from PIL import Image  # imported once here instead of on every loop iteration

bbox_output_dir = Path(OUTPUT_DIRECTORY_FOR_CROPS) / 'bbox_crops'
bbox_output_dir.mkdir(parents=True, exist_ok=True)
bbox_count = 0  # running total of crops written to disk
for file_info in all_files_to_process:
    tif_path = file_info['path']
    for page_num in file_info['pages']:
        try:
            # Load page (page numbers are 1-based; PIL frame indices are 0-based)
            with Image.open(tif_path) as tif_img:
                frame_count = tif_img.n_frames
                if not (1 <= page_num <= frame_count):
                    # Report the skip instead of dropping the page silently
                    logging.warning(
                        '%s: page %s is outside the valid range 1..%d; skipped',
                        tif_path.name, page_num, frame_count,
                    )
                    continue
                tif_img.seek(page_num - 1)
                # convert() returns a separate image, so it stays usable after
                # the TIF file is closed at the end of the with-block
                page_img = tif_img.convert('RGB')
            # Crop with predefined boxes
            crops = extractor_instance.extract_photos_from_bboxes(
                page_img, bboxes=BBOX_LIST, bbox_format=BBOX_FORMAT, normalized=BBOX_NORMALIZED
            )
            for i, img in enumerate(crops):
                save_path = bbox_output_dir / f'{tif_path.stem}_page{page_num}_bbox{i+1}.jpg'
                img.save(save_path, 'JPEG', quality=95)
                bbox_count += 1
            logging.info('%s p%s: saved %d bbox crop(s)', tif_path.name, page_num, len(crops))
        except Exception as e:
            # Best effort: report the failing page with its traceback and continue
            logging.warning('BBox extract failed for %s p%s: %s', tif_path, page_num, e, exc_info=True)

logging.info('Total BBox crops saved: %d', bbox_count)


2025-09-04 15:49:30,440 - [INFO] - N2023100400003THA00100001.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:30,498 - [INFO] - N2023100400119THA00100001 - Copy.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:30,555 - [INFO] - N2023100400119THA00100001.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:30,640 - [INFO] - N2023100400119THA00100001_3.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:30,694 - [INFO] - N2023100400122THA00100001.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:30,749 - [INFO] - N2024030602077THA00100001_12.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:30,798 - [INFO] - N2024030602091THA00100001_13.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:30,876 - [INFO] - N2024030602100THA00100001_12.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:30,937 - [INFO] - N2024030602100THA00100001_13.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:31,000 - [INFO] - N2024030602104THA00100001_12.tif p1: saved 2 bbox crop(s)
2025-09-04 15:49:31,000 - [INFO] - Total BBox crops saved: 20
