# 🔑 Key-Driven TIF Pipeline Validation

Validate the new optional helper module (external/key_input) that batches TIF documents based on a key table.

This notebook demonstrates:
- Loading the key_input_config.yaml
- Previewing the key table (CSV/Excel/JSON)
- Name mapping (파일명 → storage convention)
- Local and Database fetch resolution (a few batches)
- Summaries with sample resolved paths
- Optional: Triggering the existing pipeline via run_key_input_pipeline


In [1]:
from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import Any, Dict, List

import yaml

# Ensure imports from project
PROJECT_ROOT = Path.cwd()
EXTERNAL_DIR = PROJECT_ROOT / 'external'
import sys
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
if EXTERNAL_DIR.is_dir() and str(EXTERNAL_DIR) not in sys.path:
    sys.path.append(str(EXTERNAL_DIR))

from external.key_input.key_input_orchestrator import (
    KeyInputConfig, KeyInputLoader, LocalFetchConfig, LocalFolderFetcher,
    DatabaseConfig, DatabaseFetcher, NameMappingConfig, run_key_input_pipeline,
    _ensure_iterable_chunks,
)

LOGGER = logging.getLogger('key_input_notebook')
if not LOGGER.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s | %(levelname)s | %(name)s | %(message)s'))
    LOGGER.addHandler(handler)
LOGGER.setLevel(logging.INFO)

# Notebook-level debug controls
# Set DEBUG_OVERRIDE to True/False to force debug on/off, or None to respect YAML (name_mapping.debug_log)
DEBUG_OVERRIDE = True #None
# Limit number of sample paths shown per batch and in final summary
DEBUG_SAMPLE_LIMIT = 1000000

def load_yaml(path: Path) -> Dict[str, Any]:
    with path.open('r', encoding='utf-8') as f:
        return yaml.safe_load(f) or {}


## 📄 Load configuration and preview
Provide the path to key_input_config.yaml and (optionally) to image_similarity_config.yaml for pipeline execution.

In [2]:
key_cfg_path = Path('external/key_input/key_input_config.yaml')
main_config_path = Path('configs/image_similarity_config.yaml')
cfg = load_yaml(key_cfg_path)
cfg


{'key_input': {'input_table_path': 'D:/mock_api_TEST/filtered_rows.xlsx',
  'file_name_column': '파일명',
  'format': 'auto',
  'key_in_xlsx_to_csv': True,
  'excel_sheet_name': None,
  'csv_chunk_size': 50000,
  'json_array_field': None,
  'json_records_is_lines': False,
  'batch_size': 8000,
  'deduplicate': True,
  'strip_whitespace': True,
  'case_insensitive_match': True,
  'columns_for_results': ['사업자등록번호', '수신일자', '업종명']},
 'name_mapping': {'enabled': True,
  'debug_log': False,
  'tail_len': 5,
  'insert_token': '001',
  'glob_suffix': '_*.tif',
  'use_rglob_any_depth': True,
  'db_like_template': '{prefix}{insert}{suffix}_%.tif'},
 'data_source': {'mode': 'api',
  'key_mode': {'use_tmp_for_indexing_input': False},
  'local': {'search_roots': ['D:/mock_api_TEST/mock_api_server_images',
    'data_real'],
   'recursive': True,
   'allowed_extensions': ['.tif', '.tiff'],
   'resolve_without_extension': True,
   'stop_on_first_match': True},
  'api': {'api_endpoint': 'http://127.0.0.1

## 🧭 Build configs
Construct the loader, mapping, and fetchers from YAML.

In [3]:
ki = cfg.get('key_input', {})
nm = cfg.get('name_mapping', {})
mode = cfg.get('data_source', {}).get('mode', 'local')

ki_cfg = KeyInputConfig(
    input_table_path=Path(ki.get('input_table_path')),
    file_name_column=str(ki.get('file_name_column', '파일명')),
    format=str(ki.get('format', 'auto')),
    json_array_field=ki.get('json_array_field'),
    json_records_is_lines=bool(ki.get('json_records_is_lines', False)),
    batch_size=int(ki.get('batch_size', 200)),
    deduplicate=bool(ki.get('deduplicate', True)),
    strip_whitespace=bool(ki.get('strip_whitespace', True)),
)
nm_cfg = NameMappingConfig(
    enabled=bool(nm.get('enabled', True)),
    debug_log=bool(nm.get('debug_log', False)),
    tail_len=int(nm.get('tail_len', 5)),
    insert_token=str(nm.get('insert_token', '001')),
    glob_suffix=str(nm.get('glob_suffix', '_*.tif')),
    use_rglob_any_depth=bool(nm.get('use_rglob_any_depth', True)),
    db_like_template=str(nm.get('db_like_template', '{prefix}{insert}{suffix}_%.tif')),
)
# Notebook-level debug override
if DEBUG_OVERRIDE is not None:
    nm_cfg.debug_log = bool(DEBUG_OVERRIDE)
    LOGGER.setLevel(logging.DEBUG if nm_cfg.debug_log else logging.INFO)

loader = KeyInputLoader(ki_cfg, LOGGER)

src_local = cfg.get('data_source', {}).get('local', {})
local_cfg = LocalFetchConfig(
    search_roots=[Path(p) for p in src_local.get('search_roots', [])],
    recursive=bool(src_local.get('recursive', True)),
    allowed_extensions=tuple(src_local.get('allowed_extensions', ['.tif', '.tiff'])),
    resolve_without_extension=bool(src_local.get('resolve_without_extension', True)),
    case_insensitive_match=bool(ki.get('case_insensitive_match', True)),
    stop_on_first_match=bool(src_local.get('stop_on_first_match', True)),
)

src_db = cfg.get('data_source', {}).get('database', {})
db_cfg = DatabaseConfig(
    driver=str(src_db.get('driver', 'postgresql')),
    host=str(src_db.get('host', '127.0.0.1')),
    port=int(src_db.get('port', 5432)),
    database=str(src_db.get('database', 'postgres')),
    user=str(src_db.get('user', 'postgres')),
    password_env_var=str(src_db.get('password_env_var', 'POSTGRES_PASSWORD')),
    sslmode=str(src_db.get('sslmode', 'prefer')),
    fetch_mode=str(src_db.get('fetch_mode', 'path')).lower(),
    query_template=str(src_db.get('query_template', 'SELECT file_path FROM tif_documents WHERE file_name = %(file_name)s')),
    path_column=str(src_db.get('path_column', 'file_path')),
    blob_column=str(src_db.get('blob_column', 'file_blob')),
    blob_temp_dir=Path(src_db.get('blob_temp_dir')) if src_db.get('blob_temp_dir') else None,
)

print(f"mode:\n{mode}\n")
print(f"ki_cfg:\n{ki_cfg}\n")
print(f"nm_cfg:\n{nm_cfg}\n")
print(f"local_cfg:\n{local_cfg}\n")
print(f"db_cfg:\n{db_cfg}\n")


mode:
api

ki_cfg:
KeyInputConfig(input_table_path=WindowsPath('D:/mock_api_TEST/filtered_rows.xlsx'), file_name_column='파일명', format='auto', json_array_field=None, json_records_is_lines=False, batch_size=8000, deduplicate=True, strip_whitespace=True, key_in_xlsx_to_csv=True, excel_sheet_name=None, csv_chunk_size=50000, columns_for_results=None)

nm_cfg:
NameMappingConfig(enabled=True, debug_log=True, tail_len=5, insert_token='001', glob_suffix='_*.tif', use_rglob_any_depth=True, db_like_template='{prefix}{insert}{suffix}_%.tif')

local_cfg:
LocalFetchConfig(search_roots=[WindowsPath('D:/mock_api_TEST/mock_api_server_images'), WindowsPath('data_real')], recursive=True, allowed_extensions=('.tif', '.tiff'), resolve_without_extension=True, case_insensitive_match=True, stop_on_first_match=True)

db_cfg:
DatabaseConfig(driver='postgresql', host='127.0.0.1', port=5432, database='postgres', user='postgres', password_env_var='POSTGRES_PASSWORD', sslmode='prefer', fetch_mode='path', query_temp

## 🔍 Preview key file (first 10 filenames)

In [4]:
from itertools import islice
list(islice(loader.iter_filenames(), 10))


2025-09-03 14:27:03,356 | INFO | key_input_notebook | Converting Excel to CSV for streaming: 'C:\Users\jeeb\AppData\Local\Temp\tmpsdgy669w.csv'
[2025-09-03 14:27:03,356] [    INFO] key_input_orchestrator.py:156 - Converting Excel to CSV for streaming: 'C:\Users\jeeb\AppData\Local\Temp\tmpsdgy669w.csv'


['N2023100400119THA00001',
 'N2023100400122THA00001',
 'N2023100400003THA00001',
 'N2023100401020THA00001',
 'N2023100400802THA00001',
 'N2023100400324THA00001',
 'N2023100401111THA00001',
 'N2023100401038THA00001',
 'N2023100400605THA00001',
 'N2023100400195THA00001']

## 🧪 Test resolution for a couple batches
This demonstrates local and database resolution without running the pipeline.

Where filenames are matched from:
- Local resolution:
  - Search roots: values from key_input_config.yaml -> data_source.local.search_roots.
  - Recursive search: data_source.local.recursive; glob search may use name_mapping.use_rglob_any_depth.
  - Allowed extensions: data_source.local.allowed_extensions; resolve_without_extension controls matching on stems.
  - If name_mapping.enabled, a mapped core is built: mapped_core = prefix + insert_token + suffix, where prefix = file_name[:-tail_len], suffix = file_name[-tail_len:]. Files are then searched with pattern f'{mapped_core}{glob_suffix}'.
- Database resolution:
  - Connection coordinates: driver, host, port, database, user (password is read via the env var indicated by data_source.database.password_env_var); sslmode governs transport.
  - Query: data_source.database.query_template, parameterized with %(file_name)s.
  - Fetch mode: 'path' uses data_source.database.path_column; 'blob' uses blob_column and optional blob_temp_dir.
  - If no exact match and name mapping is enabled, a LIKE fallback pattern is built using name_mapping.db_like_template.


In [5]:
import os, json
from typing import Optional
from itertools import islice

def _map_core(file_name: str, tail_len: int, insert_token: str) -> str:
    if not file_name:
        return file_name
    tail_len = max(0, int(tail_len))
    prefix = file_name[:-tail_len] if tail_len else file_name
    suffix = file_name[-tail_len:] if tail_len else ''
    return f'{prefix}{insert_token}{suffix}'

def redact_secret(value: Optional[str], show_last: int = 3) -> str:
    if not value:
        return ''
    if len(value) <= show_last:
        return '*' * len(value)
    return '*' * (len(value) - show_last) + value[-show_last:]

def build_db_dsn(cfg) -> str:
    pw_env = getattr(cfg, 'password_env_var', '') or ''
    pw = os.environ.get(pw_env, '')
    pw_red = redact_secret(pw)
    return f'{cfg.driver}://{cfg.user}:{pw_red}@{cfg.host}:{cfg.port}/{cfg.database}?sslmode={cfg.sslmode}'

def summarize_local(local_cfg, name_map_cfg) -> dict:
    return {
        'search_roots': [str(p) for p in local_cfg.search_roots],
        'recursive': local_cfg.recursive,
        'allowed_extensions': list(local_cfg.allowed_extensions),
        'resolve_without_extension': local_cfg.resolve_without_extension,
        'case_insensitive_match': getattr(local_cfg, 'case_insensitive_match', True),
        'stop_on_first_match': local_cfg.stop_on_first_match,
        'name_mapping': {
            'enabled': name_map_cfg.enabled,
            'tail_len': name_map_cfg.tail_len,
            'insert_token': name_map_cfg.insert_token,
            'glob_suffix': name_map_cfg.glob_suffix,
            'use_rglob_any_depth': name_map_cfg.use_rglob_any_depth,
        },
    }

def summarize_db(db_cfg, name_map_cfg) -> dict:
    return {
        'dsn_redacted': build_db_dsn(db_cfg),
        'fetch_mode': db_cfg.fetch_mode,
        'query_template': db_cfg.query_template,
        'path_column': db_cfg.path_column,
        'blob_column': db_cfg.blob_column,
        'blob_temp_dir': str(db_cfg.blob_temp_dir) if db_cfg.blob_temp_dir else None,
        'name_mapping_fallback_like': {
            'enabled': name_map_cfg.enabled and '%' in (name_map_cfg.db_like_template or ''),
            'db_like_template': name_map_cfg.db_like_template,
        },
    }

def explain_local_match(file_name: str, local_cfg, name_map_cfg) -> dict:
    mapped = _map_core(file_name, name_map_cfg.tail_len, name_map_cfg.insert_token) if name_map_cfg.enabled else file_name
    patterns = []
    if name_map_cfg.enabled and name_map_cfg.glob_suffix:
        patterns.append(f'{mapped}{name_map_cfg.glob_suffix}')
        # also consider exact .tif without underscore
        patterns.append(f'{mapped}.tif')
    else:
        patterns.extend([f'{file_name}{ext}' for ext in local_cfg.allowed_extensions])
        if local_cfg.resolve_without_extension:
            patterns.append(file_name)
    return {
        'file_name': file_name,
        'mapped_core': mapped if name_map_cfg.enabled else None,
        'candidate_patterns': patterns,
        'search_roots': [str(p) for p in local_cfg.search_roots],
        'use_rglob_any_depth': name_map_cfg.use_rglob_any_depth if name_map_cfg.enabled else local_cfg.recursive,
    }

def explain_db_match(file_name: str, db_cfg, name_map_cfg) -> dict:
    params = {'file_name': file_name}
    like_pattern = None
    if name_map_cfg.enabled and '%' in (name_map_cfg.db_like_template or ''):
        mapped = _map_core(file_name, name_map_cfg.tail_len, name_map_cfg.insert_token)
        prefix = mapped[:-name_map_cfg.tail_len] if name_map_cfg.tail_len else mapped
        suffix = mapped[-name_map_cfg.tail_len:] if name_map_cfg.tail_len else ''
        like_pattern = (name_map_cfg.db_like_template
                        .replace('{prefix}', prefix)
                        .replace('{insert}', name_map_cfg.insert_token)
                        .replace('{suffix}', suffix))
    return {
        'file_name': file_name,
        'primary_query': db_cfg.query_template,
        'primary_params': params,
        'fallback_like_pattern': like_pattern,
    }

print('Local resolution sources:')
print(json.dumps(summarize_local(local_cfg, nm_cfg), indent=2, ensure_ascii=False))
print()
print('Database resolution sources:')
print(json.dumps(summarize_db(db_cfg, nm_cfg), indent=2, ensure_ascii=False))

samples = list(islice(loader.iter_filenames(), 3))
print()
print('Sample filename explanations (local):')
for s in samples:
    print(json.dumps(explain_local_match(s, local_cfg, nm_cfg), indent=2, ensure_ascii=False))

print()
print('Sample filename explanations (database):')
for s in samples:
    print(json.dumps(explain_db_match(s, db_cfg, nm_cfg), indent=2, ensure_ascii=False))

2025-09-03 14:27:03,393 | INFO | key_input_notebook | Converting Excel to CSV for streaming: 'C:\Users\jeeb\AppData\Local\Temp\tmpz9di7w06.csv'
[2025-09-03 14:27:03,393] [    INFO] key_input_orchestrator.py:156 - Converting Excel to CSV for streaming: 'C:\Users\jeeb\AppData\Local\Temp\tmpz9di7w06.csv'


Local resolution sources:
{
  "search_roots": [
    "D:\\mock_api_TEST\\mock_api_server_images",
    "data_real"
  ],
  "recursive": true,
  "allowed_extensions": [
    ".tif",
    ".tiff"
  ],
  "resolve_without_extension": true,
  "case_insensitive_match": true,
  "stop_on_first_match": true,
  "name_mapping": {
    "enabled": true,
    "tail_len": 5,
    "insert_token": "001",
    "glob_suffix": "_*.tif",
    "use_rglob_any_depth": true
  }
}

Database resolution sources:
{
  "dsn_redacted": "postgresql://postgres:@127.0.0.1:5432/postgres?sslmode=prefer",
  "fetch_mode": "path",
  "query_template": "SELECT file_path FROM tif_documents WHERE file_name = %(file_name)s",
  "path_column": "file_path",
  "blob_column": "file_blob",
  "blob_temp_dir": null,
  "name_mapping_fallback_like": {
    "enabled": true,
    "db_like_template": "{prefix}{insert}{suffix}_%.tif"
  }
}

Sample filename explanations (local):
{
  "file_name": "N2023100400119THA00001",
  "mapped_core": "N2023100400119THA

In [6]:
from collections import defaultdict

batches_to_test = None #2
resolved_summary: List[Dict[str, Any]] = []

local = True
batch_mode_str = 'local' if local else 'database'

# Aggregation counters
total_requested = 0
total_resolved = 0
samples_agg: List[str] = []
batches_processed = 0

# Local resolution
if local:    
    lfetch = LocalFolderFetcher(local_cfg, LOGGER, name_map=nm_cfg)
    for i, batch in enumerate(_ensure_iterable_chunks(loader.iter_filenames(), max(1, ki_cfg.batch_size)), 1):
        if (batches_to_test is not None) and (i > batches_to_test): break
        rp = lfetch.fetch_batch(batch)
        batches_processed += 1
        total_requested += len(batch)
        total_resolved += len(rp)
        samples_agg.extend([str(p) for p in rp])
        samples_agg = samples_agg[:DEBUG_SAMPLE_LIMIT]
        display({
            'batch': i, 'mode': 'local',
            'requested': len(batch), 'resolved': len(rp), 'missing': len(batch) - len(rp),
            'samples': [str(p) for p in rp[:DEBUG_SAMPLE_LIMIT]],
        })

# Database resolution (if configured)
if not local:    
    dbfetch = DatabaseFetcher(db_cfg, LOGGER, name_map=nm_cfg)
    for i, batch in enumerate(_ensure_iterable_chunks(loader.iter_filenames(), max(1, ki_cfg.batch_size)), 1):
        if (batches_to_test is not None) and (i > batches_to_test): break
        rp = dbfetch.fetch_batch(batch)
        batches_processed += 1
        total_requested += len(batch)
        total_resolved += len(rp)
        samples_agg.extend([str(p) for p in rp])
        samples_agg = samples_agg[:DEBUG_SAMPLE_LIMIT]
        display({
            'batch': i, 'mode': 'database',
            'requested': len(batch), 'resolved': len(rp), 'missing': len(batch) - len(rp),
            'samples': [str(p) for p in rp[:DEBUG_SAMPLE_LIMIT]],
        })

# Final aggregated summary across processed batches
display({
    'total_batches': batches_processed,
    'mode': batch_mode_str,
    'requested': total_requested,
    'resolved': total_resolved,
    'missing': total_requested - total_resolved,
    'samples': samples_agg,
})


2025-09-03 14:27:03,434 | INFO | key_input_notebook | Converting Excel to CSV for streaming: 'C:\Users\jeeb\AppData\Local\Temp\tmpwoczj4v2.csv'
[2025-09-03 14:27:03,434] [    INFO] key_input_orchestrator.py:156 - Converting Excel to CSV for streaming: 'C:\Users\jeeb\AppData\Local\Temp\tmpwoczj4v2.csv'
2025-09-03 14:27:03,451 | INFO | key_input_notebook | query image filename from table: N2023100400119THA00001
[2025-09-03 14:27:03,451] [    INFO] key_input_orchestrator.py:677 - query image filename from table: N2023100400119THA00001
2025-09-03 14:27:03,452 | INFO | key_input_notebook | modified query image filename for search: N2023100400119THA00100001
[2025-09-03 14:27:03,452] [    INFO] key_input_orchestrator.py:678 - modified query image filename for search: N2023100400119THA00100001
2025-09-03 14:27:03,454 | INFO | key_input_notebook | query image filename from table: N2023100400122THA00001
[2025-09-03 14:27:03,454] [    INFO] key_input_orchestrator.py:677 - query image filename fro

{'batch': 1,
 'mode': 'local',
 'requested': 26,
 'resolved': 3,
 'missing': 23,
 'samples': ['D:\\mock_api_TEST\\mock_api_server_images\\N2023100400119THA00100001.tif',
  'D:\\mock_api_TEST\\mock_api_server_images\\N2023100400122THA00100001.tif',
  'D:\\mock_api_TEST\\mock_api_server_images\\N2023100400003THA00100001.tif']}

{'total_batches': 1,
 'mode': 'local',
 'requested': 26,
 'resolved': 3,
 'missing': 23,
 'samples': ['D:\\mock_api_TEST\\mock_api_server_images\\N2023100400119THA00100001.tif',
  'D:\\mock_api_TEST\\mock_api_server_images\\N2023100400122THA00100001.tif',
  'D:\\mock_api_TEST\\mock_api_server_images\\N2023100400003THA00100001.tif']}

## ▶️ Optional: Run the pipeline via run_key_input_pipeline
Ensure your image_similarity_config.yaml has:

```yaml
input_mode:
  doc_input_start: "key"
  key_input_config_path: "external/key_input/key_input_config.yaml"
```


In [8]:
result = run_key_input_pipeline(main_config_path)
print(json.dumps(result, ensure_ascii=False, indent=2))



--- 🚀 OCR ENGINE INITIALIZATION (PaddleOCR) ---
  - INFO: Mapped language code 'ko' to 'korean' for PaddleOCR compatibility via config.

🔎 Checking for PaddleOCR models...
  - Resolved Detection path: C:\Users\jeeb\.paddlex\official_models\PP-OCRv3_det\Multilingual_PP-OCRv3_det_infer
  - Resolved Recognition path: C:\Users\jeeb\.paddlex\official_models\korean_PP-OCRv3_rec\korean_PP-OCRv3_rec_infer
- INFO: Model 'Classification' not resolved in provided path. Will use PaddleOCR default models (auto-download or cached).

🔧 Assembling PaddleOCR Parameters:
  - Language: 'korean'
  - GPU Acceleration: Enabled
  - Text Angle Classification: Enabled

⚙️ Initializing PaddleOCR engine with parameters:
  - use_angle_cls: True
  - lang: korean
  - det_model_dir: C:\Users\jeeb\.paddlex\official_models\PP-OCRv3_det\Multilingual_PP-OCRv3_det_infer
  - rec_model_dir: C:\Users\jeeb\.paddlex\official_models\korean_PP-OCRv3_rec\korean_PP-OCRv3_rec_infer
  - rec_batch_num: 6
  - show_log: False
  - INFO

Batch Searching TIFs:  33%|███▎      | 1/3 [00:00<00:00,  2.28it/s]

  - INFO: PaddleOCR resolved Classification model directory to: C:\Users\jeeb\.paddleocr\whl\cls\ch_ppocr_mobile_v2.0_cls_infer
CONSOLE DEBUG: N2023100400003THA00100001_9.tif p1: crops=2
CONSOLE DEBUG: TIF 'N2023100400003THA00100001_9.tif': extracted 2 photo(s).


Batch Searching TIFs:  67%|██████▋   | 2/3 [00:00<00:00,  2.69it/s]

CONSOLE DEBUG: N2023100400119THA00100001_9.tif p1: crops=2
CONSOLE DEBUG: TIF 'N2023100400119THA00100001_9.tif': extracted 2 photo(s).


Batch Searching TIFs: 100%|██████████| 3/3 [00:01<00:00,  2.80it/s]

CONSOLE DEBUG: N2023100400122THA00100001_9.tif p1: crops=2
CONSOLE DEBUG: TIF 'N2023100400122THA00100001_9.tif': extracted 2 photo(s).
CONSOLE DEBUG: N2023100400003THA00100001_9.tif total crops=2
CONSOLE DEBUG: N2023100400119THA00100001_9.tif total crops=2
CONSOLE DEBUG: N2023100400122THA00100001_9.tif total crops=2
CONSOLE DEBUG: Total crops across batch: 6
{
  "status": "success",
  "exit_code": 0,
  "total_files_requested": 26,
  "total_files_resolved": 3,
  "total_batches": 1,
  "batch_results": [
    {
      "mode": "api",
      "requested": 26,
      "resolved": 3,
      "result": {
        "status": "success",
        "exit_code": 0,
        "message": "Computed top 7 documents (global).",
        "top_documents": [
          {
            "document": "N2023100400119THA00100001.tif",
            "score": 0.9999999999997318
          },
          {
            "document": "N2023100400122THA00100001.tif",
            "score": 0.999999999999658
          },
          {
            


