# OCR Data Processor

This notebook handles data preprocessing, organization, and preparation for the OCR pipeline.

## Tasks:
1. **Organize raw dataset** - Sort images by train/val/test
2. **Validate images** - Check format and integrity
3. **Generate labels** - Create annotation files
4. **Crop text regions** - Save detected text areas
5. **Build metadata** - Create index files

# OCR Data Processor

This notebook handles data processing for the OCR pipeline.

In [None]:
import os
import json
import shutil
import cv2
import numpy as np
from pathlib import Path
from datetime import datetime
import easyocr

In [None]:
def generate_dataset_report(base_dir):
    """Generate dataset statistics report.

    For each of the ``train``, ``val`` and ``test`` sub-directories of
    *base_dir* that exists, count the image files directly inside it and
    sum their on-disk size. PNG files under ``crops`` are summarised the
    same way when that directory exists.

    Args:
        base_dir: Root directory of the processed dataset (str or Path).

    Returns:
        dict: ``'timestamp'`` (ISO-8601 string), ``'directories'`` mapping
        each existing split name to ``{'count', 'total_size_mb'}``, and an
        optional ``'crops'`` entry with the same two keys.
    """
    base_dir = Path(base_dir)

    # Same extension set scan_dataset() accepts, compared case-insensitively
    # so that files such as "IMG_001.JPG" or "page.bmp" are not left out.
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}
    bytes_per_mb = 1024 * 1024

    def summarize(files):
        """Return the count and total size (MB, 2 decimals) of *files*."""
        total_bytes = sum(f.stat().st_size for f in files)
        return {
            'count': len(files),
            'total_size_mb': round(total_bytes / bytes_per_mb, 2),
        }

    report = {
        'timestamp': datetime.now().isoformat(),
        'directories': {},
    }

    for split_name in ('train', 'val', 'test'):
        split_dir = base_dir / split_name
        if not split_dir.is_dir():
            continue
        image_files = [
            f for f in split_dir.iterdir()
            if f.is_file() and f.suffix.lower() in image_suffixes
        ]
        report['directories'][split_name] = summarize(image_files)

    # Total crops (crops are always written as PNG)
    crop_root = base_dir / 'crops'
    if crop_root.is_dir():
        crop_files = [
            f for f in crop_root.iterdir()
            if f.is_file() and f.suffix.lower() == '.png'
        ]
        report['crops'] = summarize(crop_files)

    return report

def print_dataset_report():
    """Print the report for the module-level ``base_dir`` as a text summary.

    Builds the report with ``generate_dataset_report`` and writes one
    section per dataset split, followed by a crops section when the report
    contains one.
    """
    report = generate_dataset_report(base_dir)
    rule = "=" * 60

    print("\n" + rule)
    print("üìä DATASET REPORT")
    print(rule)

    per_split = report.get('directories', {})
    for split_name, stats in per_split.items():
        image_count = stats['count']
        size_mb = stats['total_size_mb']
        print(f"\n{split_name.upper()}:")
        print(f"  Images: {image_count}")
        print(f"  Size: {size_mb} MB")

    if 'crops' in report:
        crop_stats = report['crops']
        print("\nCROPS:")
        print(f"  Total: {crop_stats['count']}")
        print(f"  Size: {crop_stats['total_size_mb']} MB")

    print("\n" + rule + "\n")

print("‚úÖ Report functions loaded")

## 5. Generate Dataset Statistics

In [None]:
def generate_crops_and_labels(image_path, reader, crop_save_dir=None):
    """
    Generate cropped text regions and labels from an image

    Each detection returned by ``reader.readtext`` is reduced to its
    axis-aligned bounding rectangle, padded by 5 px (clamped to the image
    borders), written to ``crop_save_dir`` as a PNG and recorded in the
    returned dictionary.

    Args:
        image_path: Path to image file (str or Path)
        reader: EasyOCR reader object; ``readtext`` must yield
            ``(bbox, text, confidence)`` triples
        crop_save_dir: Directory to save crops. ``None`` (the default)
            means the module-level ``crops_dir``, looked up at call time.

    Returns:
        dict: ``{'image_name', 'image_path', 'crops': [...]}`` where each
        crop entry has ``crop_id``, ``text``, ``confidence``, ``bbox``
        (``[x_min, y_min, x_max, y_max]``) and ``crop_file``; ``None`` if
        the image could not be read.
    """
    image_path = Path(image_path)
    # Resolve the default lazily: binding ``crops_dir`` in the signature
    # would require it to exist when this function is *defined*.
    crop_save_dir = Path(crops_dir if crop_save_dir is None else crop_save_dir)

    image = cv2.imread(str(image_path))
    if image is None:
        return None

    # cv2.imwrite does not create missing directories.
    crop_save_dir.mkdir(parents=True, exist_ok=True)

    results = reader.readtext(image)
    image_height, image_width = image.shape[:2]
    padding = 5

    crops_info = {
        'image_name': image_path.name,
        'image_path': str(image_path),
        'crops': []
    }

    for idx, (bbox, text, confidence) in enumerate(results):
        points = np.array(bbox, dtype=np.int32)

        # Bounding rectangle of the detection, padded and clamped to the image
        x_min = max(0, int(points[:, 0].min()) - padding)
        y_min = max(0, int(points[:, 1].min()) - padding)
        x_max = min(image_width, int(points[:, 0].max()) + padding)
        y_max = min(image_height, int(points[:, 1].max()) + padding)

        crop = image[y_min:y_max, x_min:x_max]
        if crop.size == 0:
            continue

        crop_filename = f"{image_path.stem}_crop_{idx:03d}.png"
        crop_path = crop_save_dir / crop_filename
        # imwrite reports failure through its return value; only record
        # crops that actually exist on disk.
        if not cv2.imwrite(str(crop_path), crop):
            print(f"Error saving crop {crop_path}")
            continue

        crops_info['crops'].append({
            'crop_id': idx,
            'text': text,
            'confidence': float(confidence),
            'bbox': [x_min, y_min, x_max, y_max],
            'crop_file': crop_filename
        })

    return crops_info

def process_all_images(directory, reader):
    """Process all images in a directory to generate crops and labels.

    Runs ``generate_crops_and_labels`` on every image file directly inside
    *directory*, collects the per-image results and writes them as JSON to
    the module-level ``labels_file``.

    Args:
        directory: Directory containing the images (str or Path).
        reader: EasyOCR reader object passed through to
            ``generate_crops_and_labels``.

    Returns:
        dict: ``{'images': [...]}`` with one entry per image that produced
        a result.
    """
    directory = Path(directory)
    all_labels = {'images': []}

    # Same extension set scan_dataset() accepts, matched case-insensitively
    # so that e.g. "SCAN.JPG" is not skipped; sorted so that the processing
    # order (and the label file) is deterministic.
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}
    image_files = sorted(
        path for path in directory.glob('*')
        if path.is_file() and path.suffix.lower() in image_suffixes
    )

    print(f"üì∏ Processing {len(image_files)} images...")

    for i, img_path in enumerate(image_files, 1):
        print(f"  [{i}/{len(image_files)}] {img_path.name}...", end='')

        crops_info = generate_crops_and_labels(img_path, reader)
        if crops_info:
            all_labels['images'].append(crops_info)
            print(f" ‚úÖ ({len(crops_info['crops'])} crops)")
        else:
            print(" ‚ùå (failed)")

    # Save labels; parents=True so a missing intermediate directory does
    # not abort the run after all the OCR work has been done.
    labels_file.parent.mkdir(parents=True, exist_ok=True)
    with open(labels_file, 'w', encoding='utf-8') as f:
        json.dump(all_labels, f, indent=2)

    print(f"\n‚úÖ Labels saved to: {labels_file}")
    return all_labels

print("‚úÖ Crop and label functions loaded")

## 4. Crop and Label Generation

In [None]:
def split_dataset(image_paths, train_ratio=0.7, val_ratio=0.2, seed=None):
    """
    Split images into train/val/test sets

    The input is copied before shuffling, so the caller's sequence is left
    untouched. Whatever is not assigned to train or val ends up in test.

    Args:
        image_paths: Sequence of image file paths
        train_ratio: Ratio for training set (default 0.7)
        val_ratio: Ratio for validation set (default 0.2)
        seed: Optional integer seed for a reproducible shuffle. When
            ``None`` the global NumPy random state is used.

    Returns:
        dict: Contains train, val, test image lists

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Small tolerance so that sums such as 0.7 + 0.3 are not rejected
    # because of floating-point rounding.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1 "
            f"(got {train_ratio} and {val_ratio})"
        )

    shuffled = list(image_paths)  # never reorder the caller's list
    n_images = len(shuffled)
    n_train = int(n_images * train_ratio)
    n_val = int(n_images * val_ratio)

    if seed is None:
        np.random.shuffle(shuffled)
    else:
        np.random.RandomState(seed).shuffle(shuffled)

    return {
        'train': shuffled[:n_train],
        'val': shuffled[n_train:n_train + n_val],
        'test': shuffled[n_train + n_val:]
    }

def organize_images(splits):
    """
    Copy every image of each split into that split's directory

    ``train`` and ``val`` go to the module-level ``train_dir`` and
    ``val_dir``; any other split name is stored under ``base_dir / 'test'``,
    which is created on demand. A failed copy is reported and skipped.

    Args:
        splits: Dict with train/val/test image lists
    """
    def destination_for(name):
        """Return the directory that receives the images of split *name*."""
        if name == 'train':
            return train_dir
        if name == 'val':
            return val_dir
        # Everything else is treated as the test split
        test_dir = base_dir / 'test'
        test_dir.mkdir(exist_ok=True)
        return test_dir

    for split_name, split_images in splits.items():
        destination = destination_for(split_name)

        for source in split_images:
            try:
                shutil.copy2(source, destination / source.name)
            except Exception as exc:
                print(f"Error copying {source}: {exc}")

        print(f"  ‚úÖ {split_name}: {len(split_images)} images")

def organize_raw_dataset():
    """Scan ``raw_dir``, split the valid images and copy them into place.

    Returns:
        dict: The train/val/test split that was organised, or ``None``
        when the scan found no valid image.
    """
    print("\nüîÑ Organizing dataset...")

    valid_images, _invalid_images = scan_dataset(raw_dir)

    if len(valid_images) > 0:
        splits = split_dataset(valid_images)
        organize_images(splits)
        print("‚úÖ Dataset organization complete")
        return splits

    print("‚ö†Ô∏è  No valid images found in raw directory")
    return None

print("‚úÖ Organization functions loaded")

## 3. Data Organization Functions

In [None]:
def validate_image(image_path):
    """
    Check whether OpenCV can decode the given image file

    Args:
        image_path: Path to image file

    Returns:
        bool: True if the image was read, False if reading produced
        nothing or raised an error (the error is printed)
    """
    try:
        return cv2.imread(str(image_path)) is not None
    except Exception as exc:
        print(f"Error validating {image_path}: {exc}")
        return False

def get_image_stats(image_path):
    """Get image statistics.

    Args:
        image_path: Path to image file (str or Path).

    Returns:
        dict: ``width`` and ``height`` in pixels, ``size_kb`` (file size on
        disk, rounded to 2 decimals) and ``channels``; ``None`` if the
        image could not be read.
    """
    # Accept plain strings as well: the file size below needs Path.stat().
    image_path = Path(image_path)

    img = cv2.imread(str(image_path))
    if img is None:
        return None

    height, width = img.shape[:2]
    size_kb = image_path.stat().st_size / 1024

    return {
        'width': width,
        'height': height,
        'size_kb': round(size_kb, 2),
        # A 2-D array has no channel axis, i.e. a single channel.
        'channels': img.shape[2] if img.ndim > 2 else 1
    }

def scan_dataset(directory):
    """Walk a directory tree and sort its image files into valid and invalid.

    Every file below *directory* whose extension (case-insensitive) is one
    of the supported formats is checked with ``validate_image``. A short
    summary is printed.

    Returns:
        tuple: ``(valid_images, invalid_images)``, two lists of paths
    """
    supported_formats = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}
    root = Path(directory)

    valid_images, invalid_images = [], []

    for candidate in root.rglob('*'):
        if candidate.suffix.lower() not in supported_formats:
            continue
        bucket = valid_images if validate_image(candidate) else invalid_images
        bucket.append(candidate)

    print("üìä Dataset Scan Results:")
    print(f"  ‚úÖ Valid images: {len(valid_images)}")
    print(f"  ‚ùå Invalid images: {len(invalid_images)}")

    return valid_images, invalid_images

print("‚úÖ Validation functions loaded")

## 2. Image Validation Functions

In [None]:
# Every location used by the pipeline hangs off a single root directory
base_dir = Path('processed_data')
raw_dir, train_dir, val_dir, crops_dir = (
    base_dir / subdir for subdir in ('raw', 'train', 'val', 'crops')
)
labels_file = base_dir / 'labels.json'

# Make sure the working directories exist (including missing parents)
for directory in (raw_dir, train_dir, val_dir, crops_dir):
    directory.mkdir(parents=True, exist_ok=True)
    print(f"‚úÖ Directory ready: {directory}")

print("\nüìÅ Directory structure initialized")

## 1. Setup Directory Structure

# 6. EasyOCR Performance Optimization Guide

## Quick Reference: When to Use Each Strategy

| Image Type | Preprocessing | Threshold | Resize | Notes |
|---|---|---|---|---|
| Clear documents | Light | 0.6 | 1200px | Fastest, highest accuracy |
| Normal photos | Light | 0.5 | 1200px | Balanced quality/speed |
| Noisy/blurry | Aggressive | 0.3 | 1500px | Slower, detects more |
| Small text | Aggressive | 0.4 | 2000-3000px | Very slow, necessary for tiny text |
| Skewed docs | Aggressive + deskew | 0.5 | 1500px | Rotation correction first |
| Colored background | Aggressive | 0.4 | 1500px | Focus on brightness channel |

In [None]:
## Example: Processing Different Image Types
#
# Walks through three preprocessing recipes on one image and prints the
# settings that go with each. No OCR reader is called in this cell: the
# intermediate arrays (resized / gray / enhanced / final) are only computed
# to show the preprocessing steps, and each strategy overwrites the
# variables left by the previous one.

# Load an image
test_image_path = "path/to/your/image.jpg"  # Change this
test_image = cv2.imread(test_image_path)

# The load is checked for None below; the else branch reports the failure.
if test_image is not None:
    # STRATEGY 1: Clear Documents
    print("=" * 60)
    print("STRATEGY 1: Clear Documents (Fast, Accurate)")
    print("=" * 60)
    
    # Resize moderately: fixed width of 1200 px, height scaled by the
    # original height/width ratio (shape[0] / shape[1])
    resized = cv2.resize(test_image, (1200, int(test_image.shape[0] * 1200 / test_image.shape[1])))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    
    # Minimal denoising: one small bilateral filter, then CLAHE contrast
    filtered = cv2.bilateralFilter(gray, 5, 50, 50)
    enhanced = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(filtered)
    
    # Use high confidence threshold
    print(f"‚úÖ Preprocessing: Light")
    print(f"‚úÖ Resize: 1200px")
    print(f"‚úÖ Confidence Threshold: 0.6")
    print(f"‚úÖ Speed: Fast | Accuracy: High")
    
    print("\n" + "=" * 60)
    print("STRATEGY 2: Noisy/Blurry Images (Slower, Better Detection)")
    print("=" * 60)
    
    # Aggressive resizing: same aspect-preserving resize, 1500 px wide
    resized = cv2.resize(test_image, (1500, int(test_image.shape[0] * 1500 / test_image.shape[1])))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    
    # Multi-pass denoising: a larger bilateral filter followed by a smaller one
    denoised = cv2.bilateralFilter(gray, 9, 75, 75)
    denoised = cv2.bilateralFilter(denoised, 5, 50, 50)
    
    # Enhance contrast
    enhanced = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(denoised)
    
    # Sharpen: 3x3 kernel whose weights sum to 1, then blend 70 % of the
    # contrast-enhanced image with 30 % of the sharpened one
    kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    sharpened = cv2.filter2D(enhanced, -1, kernel)
    final = cv2.addWeighted(enhanced, 0.7, sharpened, 0.3, 0)
    
    print(f"‚úÖ Preprocessing: Aggressive")
    print(f"‚úÖ Resize: 1500px")
    print(f"‚úÖ Denoising: 2-pass bilateral filter")
    print(f"‚úÖ Sharpening: Applied")
    print(f"‚úÖ Confidence Threshold: 0.3")
    print(f"‚úÖ Speed: Slow | Accuracy: Better")
    
    print("\n" + "=" * 60)
    print("STRATEGY 3: Small Text (Very Slow, Necessary)")
    print("=" * 60)
    
    # Extreme resizing for small text: 2500 px wide
    resized = cv2.resize(test_image, (2500, int(test_image.shape[0] * 2500 / test_image.shape[1])))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    
    # Aggressive preprocessing (single bilateral pass, no sharpening here)
    denoised = cv2.bilateralFilter(gray, 9, 75, 75)
    enhanced = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(denoised)
    
    # NOTE(review): the quick-reference table above lists a threshold of
    # 0.4 for small text, while the message below says 0.35 - confirm which
    # value is intended.
    print(f"‚úÖ Resize: 2500px (Huge!)")
    print(f"‚úÖ Preprocessing: Aggressive")
    print(f"‚úÖ Confidence Threshold: 0.35")
    print(f"‚úÖ Speed: Very Slow (~30-60s per image)")
    print(f"‚ö†Ô∏è Use only when text is < 50px height")
else:
    print("‚ùå Could not load test image. Update path above.")

# 7. Implementation Checklist for Your Project

## ✅ What's Been Implemented in app.py

### Preprocessing Functions (✅ DONE)
- `resize_for_ocr()` - Optimal resizing for different text sizes
- `enhance_contrast()` - CLAHE contrast enhancement
- `sharpen_image()` - Edge sharpening for blurry images
- `deskew_image()` - Rotation correction for tilted documents
- `preprocess_image_light()` - Fast preprocessing for high-quality images
- `preprocess_image_aggressive()` - Enhanced preprocessing for poor-quality images

### Smart Filtering (✅ DONE)
- `filter_by_confidence()` - Intelligent threshold filtering with text quality checks
- `remove_overlapping_detections()` - Removes duplicate detections (IoU-based)
- `improve_bbox()` - Tightens bounding boxes for better visualization

### OCR Extraction (✅ DONE)
- `extract_text_with_ocr()` - Complete optimized pipeline with:
  - ✅ Confidence filtering
  - ✅ Duplicate removal
  - ✅ BBox tightening
  - ✅ Position sorting
  - ✅ Deduplication

### UI Enhancements (✅ DONE)
- **Image Quality Selector** - Choose preprocessing based on image type
- **Auto-Threshold Adjustment** - Recommended settings per quality level
- **Debug Mode** - Shows all optimizations applied

---

## 🎯 How to Use the Optimizations

### For Clear Documents:
```
1. Select: "High (Clear Documents)"
2. Auto Settings: Light preprocessing, threshold=0.6
3. Speed: ~2-3 seconds per image
4. Accuracy: Very high
```

### For Normal Photos:
```
1. Select: "Medium (Normal Photos)"
2. Auto Settings: Light preprocessing, threshold=0.5
3. Speed: ~3-5 seconds per image
4. Accuracy: High
```

### For Noisy/Blurry Images:
```
1. Select: "Low (Noisy/Blurry)"
2. Auto Settings: Aggressive preprocessing, threshold=0.3
3. Speed: ~10-15 seconds per image
4. Accuracy: Good (gets more text including noise)
```

### For Small Text:
```
1. Select: "Low (Noisy/Blurry)"
2. Manually change Preprocessing to: "Aggressive (Enhanced Denoise)"
3. May need to run locally and modify resize target to 2500-3000px
4. Speed: ~30-60 seconds per image
5. Note: Requires code modification - see notebook section below
```

---

## 🔧 If You Want to Fine-Tune Further

### To Enlarge Further for Tiny Text:
Pass a larger `target_width` argument when calling `resize_for_ocr()`.

In [None]:
## Direct Function Usage Example (For Jupyter or Scripts)

# If you want to use the optimizations directly without Streamlit:

import cv2
import numpy as np
import easyocr

# Load reader
reader = easyocr.Reader(['en'])

# Load image
image = cv2.imread("your_image.jpg")
# cv2.imread returns None instead of raising when the file cannot be read;
# stop here with a clear message rather than failing inside cv2.cvtColor.
if image is None:
    raise FileNotFoundError("Could not load 'your_image.jpg'. Update the path above.")

# Both examples work on the same grayscale image and share one CLAHE
# (contrast enhancement) configuration, so build them once.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

# ---- EXAMPLE 1: Process High-Quality Document ----
print("Processing high-quality document...")

# Step 1: Light preprocessing (one small bilateral filter + CLAHE)
filtered = cv2.bilateralFilter(gray, 5, 50, 50)
enhanced = clahe.apply(filtered)

# Step 2: Extract with high confidence
results = reader.readtext(enhanced)
high_conf_results = [(bbox, text, conf) for bbox, text, conf in results if conf >= 0.6]

print(f"Found {len(high_conf_results)} high-confidence detections")

# ---- EXAMPLE 2: Process Noisy Image ----
print("\nProcessing noisy image...")

# Step 1: Aggressive preprocessing
# Multi-pass denoising
denoised = cv2.bilateralFilter(gray, 9, 75, 75)
denoised = cv2.bilateralFilter(denoised, 5, 50, 50)

# Enhance and sharpen: blend 70 % of the contrast-enhanced image with
# 30 % of its sharpened version
enhanced = clahe.apply(denoised)

kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
sharpened = cv2.filter2D(enhanced, -1, kernel)
final = cv2.addWeighted(enhanced, 0.7, sharpened, 0.3, 0)

# Step 2: Extract with lower confidence
results = reader.readtext(final)
filtered_results = [(bbox, text, conf) for bbox, text, conf in results if conf >= 0.3]

print(f"Found {len(filtered_results)} detections (confidence >= 0.3)")

# ---- EXAMPLE 3: Remove Duplicates ----
print("\nRemoving duplicate detections...")

def calculate_iou(box1, box2):
    """Return the Intersection over Union of two boxes.

    Each box is a sequence of points; only the axis-aligned rectangle that
    encloses the points is considered. The result is 0 when the union has
    no area.
    """
    def extent(box):
        """Return (left, right, top, bottom) of the rectangle around *box*."""
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        return min(xs), max(xs), min(ys), max(ys)

    left1, right1, top1, bottom1 = extent(box1)
    left2, right2, top2, bottom2 = extent(box2)

    # Overlap along each axis, clamped to zero when the boxes are disjoint
    overlap_w = max(0, min(right1, right2) - max(left1, left2))
    overlap_h = max(0, min(bottom1, bottom2) - max(top1, top2))
    intersection = overlap_w * overlap_h

    area1 = (right1 - left1) * (bottom1 - top1)
    area2 = (right2 - left2) * (bottom2 - top2)
    union = area1 + area2 - intersection

    return intersection / union if union != 0 else 0

# Remove overlapping detections
#
# Greedy suppression: visit detections from highest to lowest confidence and
# keep one only if it does not overlap (IoU > 0.3) a detection that is
# already kept. Working on indices avoids removing items from a list while
# it is being iterated, which would silently skip comparisons.
iou_threshold = 0.3
by_confidence = sorted(
    range(len(filtered_results)),
    key=lambda i: filtered_results[i][2],
    reverse=True,  # stable: on equal confidence the earlier detection wins
)

kept_indices = []
for candidate in by_confidence:
    candidate_bbox = filtered_results[candidate][0]
    overlaps_kept = any(
        calculate_iou(candidate_bbox, filtered_results[kept][0]) > iou_threshold
        for kept in kept_indices
    )
    if not overlaps_kept:
        kept_indices.append(candidate)

# Report the survivors in their original detection order
unique_results = [filtered_results[i] for i in sorted(kept_indices)]

print(f"After deduplication: {len(unique_results)} unique detections")
print("\nDone! Use unique_results for visualization or export.")