# 🔍 OCR Text Extractor

Extract text from images using Optical Character Recognition! Supports:
- Single images
- Multiple images (creates one text report per image, bundled into a ZIP file)
- ZIP files containing images
- Folders with images
- **Advanced features:** Language detection, confidence scores, text positioning

**Supported formats:** PNG, JPG, JPEG, WEBP, TIFF, BMP

**Languages:** English, Vietnamese, Chinese, Japanese, Korean, and more!


## 🚀 How to Use

- **Option 1:** Set `path_arg` to your file/folder path
- **Option 2:** Leave `path_arg` as `None` and upload files directly when prompted (Google Colab only)

### Examples:
```python
# Single image
path_arg = "screenshot.png"

# Folder with images  
path_arg = "/path/to/images/"

# ZIP file
path_arg = "documents.zip"
```


In [None]:
# Import libraries
import os, zipfile, shutil, uuid, json
from pathlib import Path
import cv2
import numpy as np

# Check if running in Google Colab. The `files` helper is only importable
# there; IS_COLAB records whether it is available.
try:
    from google.colab import files
    IS_COLAB = True
    print("🔧 Running in Google Colab")
except ImportError:
    IS_COLAB = False
    print("🔧 Running locally")

# Import the OCR dependencies, installing them on first use if missing.
try:
    import pytesseract
    from PIL import Image
    print("✅ Required packages available")
except ImportError:
    print("📦 Installing required packages...")
    import subprocess
    import sys
    # Run pip through the current interpreter so the packages are installed
    # into the environment this kernel uses, not into whichever `pip`
    # happens to be first on PATH.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "pytesseract", "opencv-python", "Pillow"]
    )
    import pytesseract
    from PIL import Image
    print("✅ Packages installed successfully")

# Locate the Tesseract binary. Only point pytesseract at the default Windows
# install location when that file actually exists; otherwise keep the command
# pytesseract was already configured with (normally resolved through PATH on
# Linux/Mac). Assigning the Windows path unconditionally would leave a
# non-existent command configured on every other system, so every later OCR
# call would fail.
WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = WINDOWS_TESSERACT_CMD

try:
    pytesseract.get_tesseract_version()
    print("✅ Tesseract found")
except (Exception, SystemExit):
    # Best-effort probe: report the problem instead of aborting the cell.
    # NOTE(review): SystemExit is caught because some pytesseract releases
    # appear to raise it for an unparseable version string - confirm.
    print("⚠️ Tesseract not found. Please install Tesseract OCR:")
    print("   Windows: https://github.com/UB-Mannheim/tesseract/wiki")
    print("   Linux: sudo apt-get install tesseract-ocr")
    print("   Mac: brew install tesseract")


In [None]:
# File extensions (lower-case) that are treated as images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.webp', '.tiff', '.bmp')


def _find_images(root_dir):
    """Return the sorted paths of all image files found under root_dir."""
    return sorted(
        os.path.join(folder, name)
        for folder, _, names in os.walk(root_dir)
        for name in names
        if name.lower().endswith(_IMAGE_EXTENSIONS)
    )


def get_images(path):
    """Collect the image files referenced by path.

    Args:
        path: A ZIP archive, a folder, or a single file.

    Returns:
        A tuple ``(images, base, tmp_dir)``:
        images is the list of image paths (sorted for ZIPs and folders, so
        the processing order is reproducible); base is the input's name
        without extension, used to name the output; tmp_dir is the directory
        a ZIP was extracted into (the caller is responsible for deleting it)
        or None when nothing was extracted.

    Raises:
        FileNotFoundError: If path does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ Path not found: {path}")

    if zipfile.is_zipfile(path):
        print(f"📦 Extracting ZIP: {os.path.basename(path)}")
        base = os.path.splitext(os.path.basename(path))[0]
        tmp_dir = f"ext_{uuid.uuid4().hex[:6]}"
        os.makedirs(tmp_dir, exist_ok=True)
        try:
            # The context manager closes the archive handle after extraction.
            with zipfile.ZipFile(path) as archive:
                archive.extractall(tmp_dir)
        except Exception:
            # The caller never learns about tmp_dir when extraction fails,
            # so remove it here before re-raising.
            shutil.rmtree(tmp_dir, ignore_errors=True)
            raise
        return _find_images(tmp_dir), base, tmp_dir

    if os.path.isdir(path):
        # normpath strips a trailing separator, which would otherwise make
        # basename() return an empty string.
        base = os.path.basename(os.path.normpath(path))
        print(f"📁 Scanning folder: {base}")
        return _find_images(path), base, None

    # Anything else is treated as a single image; no extension check is done.
    print(f"🖼️ Processing single image: {os.path.basename(path)}")
    return [path], os.path.splitext(os.path.basename(path))[0], None


In [None]:
def preprocess_image(image_path):
    """Load an image and binarize it for OCR.

    The image is converted to grayscale, smoothed with a median blur of
    aperture 3, thresholded with Otsu's method, and finally passed through a
    morphological closing.

    Returns the processed image, or None when cv2.imread() yields None for
    image_path.
    """
    source = cv2.imread(image_path)
    if source is None:
        return None

    grayscale = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.medianBlur(grayscale, 3)

    # Otsu picks the threshold itself, so the 0 passed here is a placeholder.
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    binary = cv2.threshold(smoothed, 0, 255, otsu_flags)[1]

    # NOTE(review): closing with a 1x1 structuring element presumably leaves
    # the image unchanged - confirm whether a larger kernel was intended.
    structuring_element = np.ones((1, 1), np.uint8)
    return cv2.morphologyEx(binary, cv2.MORPH_CLOSE, structuring_element)

def _parse_confidence(raw):
    """Convert one confidence cell to int; return -1 when it is unusable."""
    try:
        # Go through float() first: the cell may be an int, or a string such
        # as '96.12' / '-1', and int('96.12') would raise ValueError.
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return -1


def extract_text_with_details(image_path, language='eng+vie'):
    """Run OCR on an image and return the text plus per-word details.

    Args:
        image_path: Path of the image to read.
        language: Tesseract language code(s), e.g. 'eng' or 'eng+vie'.

    Returns:
        A dict with the keys:
        'text' - the stripped OCR text, or 'Error: <message>' on failure;
        'confidence' - mean confidence of the words scoring above 0, rounded
        to two decimals (0 when there are none);
        'word_count' - number of whitespace-separated tokens in the text;
        'details' - one dict per word scoring above 0, holding its 'text',
        'confidence' and 'bbox' (x, y, width, height).

        The function does not raise for OCR problems: an unreadable image
        yields an empty result and any other exception yields the
        'Error: ...' result.
    """
    try:
        processed_img = preprocess_image(image_path)
        if processed_img is None:
            return {
                'text': '',
                'confidence': 0,
                'word_count': 0,
                'details': []
            }

        # Word-level data (confidence and bounding boxes).
        data = pytesseract.image_to_data(
            processed_img, lang=language, output_type=pytesseract.Output.DICT
        )

        # Full text as laid out by Tesseract.
        text = pytesseract.image_to_string(processed_img, lang=language)

        confidences = [_parse_confidence(raw) for raw in data['conf']]

        # Keep only rows with a confidence above 0; rows that do not score
        # above 0 are left out of both the details and the average.
        rows = zip(
            data['text'], confidences,
            data['left'], data['top'], data['width'], data['height'],
        )
        word_details = [
            {
                'text': word,
                'confidence': conf,
                'bbox': {
                    'x': left,
                    'y': top,
                    'width': width,
                    'height': height
                }
            }
            for word, conf, left, top, width, height in rows
            if conf > 0
        ]

        scored = [conf for conf in confidences if conf > 0]
        avg_confidence = sum(scored) / len(scored) if scored else 0

        return {
            'text': text.strip(),
            'confidence': round(avg_confidence, 2),
            'word_count': len(text.split()),
            'details': word_details
        }

    except Exception as e:
        # Report the failure inside the result so a batch run can continue
        # with the next image.
        return {
            'text': f'Error: {str(e)}',
            'confidence': 0,
            'word_count': 0,
            'details': []
        }

def detect_language(image_path):
    """Pick the candidate language that OCRs the image most confidently.

    The image is run through Tesseract once per candidate ('eng', 'vie',
    'chi_sim', 'jpn', 'kor') and the candidate with the highest mean word
    confidence wins.

    Args:
        image_path: Path of the image to inspect.

    Returns:
        The winning language code. Falls back to 'eng' when the image cannot
        be read, when no candidate produces a word with confidence above 0,
        or when an unexpected error occurs.
    """
    try:
        processed_img = preprocess_image(image_path)
        if processed_img is None:
            return 'eng'

        best_lang = 'eng'
        best_confidence = 0

        for lang in ('eng', 'vie', 'chi_sim', 'jpn', 'kor'):
            try:
                data = pytesseract.image_to_data(
                    processed_img, lang=lang, output_type=pytesseract.Output.DICT
                )
                # float() first: a confidence cell may be an int or a string
                # such as '96.12', which int() alone cannot parse. Without
                # this every candidate would fail and 'eng' would always win.
                parsed = (int(float(raw)) for raw in data['conf'])
                confidences = [conf for conf in parsed if conf > 0]
            except Exception:
                # A candidate that cannot be evaluated is skipped so the
                # remaining ones can still be tried.
                continue

            if not confidences:
                continue

            avg_conf = sum(confidences) / len(confidences)
            if avg_conf > best_confidence:
                best_confidence = avg_conf
                best_lang = lang

        return best_lang
    except Exception:
        # Detection is best-effort; `except Exception` (not a bare except)
        # lets KeyboardInterrupt through so a long batch can be interrupted.
        return 'eng'


In [None]:
def _write_text_report(txt_path, image_name, lang, result, include_details):
    """Write the human-readable OCR report for one image to txt_path."""
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(f"=== OCR Results for {image_name} ===\n")
        f.write(f"Language: {lang}\n")
        f.write(f"Confidence: {result['confidence']}%\n")
        f.write(f"Word Count: {result['word_count']}\n")
        f.write("\n--- Extracted Text ---\n")
        f.write(result['text'])

        if include_details and result['details']:
            f.write("\n\n--- Word Details ---\n")
            for detail in result['details'][:20]:  # Limit to first 20 words
                f.write(f"'{detail['text']}' (confidence: {detail['confidence']}%)\n")


def _write_json_report(json_path, image_name, lang, result):
    """Write the full OCR result for one image to json_path as JSON."""
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'filename': image_name,
            'language': lang,
            'confidence': result['confidence'],
            'word_count': result['word_count'],
            'text': result['text'],
            'word_details': result['details']
        }, f, ensure_ascii=False, indent=2)


def _write_summary(summary_path, results):
    """Write the batch summary (totals plus one entry per image)."""
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("=== OCR Processing Summary ===\n\n")
        f.write(f"Total images processed: {len(results)}\n")
        f.write(f"Average confidence: {sum(r['confidence'] for r in results) / len(results):.2f}%\n")
        f.write(f"Total words extracted: {sum(r['word_count'] for r in results)}\n\n")

        f.write("Individual results:\n")
        for entry in results:
            f.write(f"- {entry['filename']}: {entry['confidence']}% confidence, {entry['word_count']} words\n")
            f.write(f"  Preview: {entry['text_preview']}\n\n")


def process(path, language='auto', include_details=True):
    """Process images and extract text using OCR.

    Every image found at path is OCR'd and written to a text report (plus a
    JSON file when include_details is true) in a temporary output folder. A
    single image produces ``<base>_ocr.txt`` in the working directory;
    several images produce ``<base>_ocr.zip`` containing all reports and a
    summary. In Colab the result is downloaded, otherwise its path is
    printed.

    Args:
        path: Image file, folder, or ZIP archive to process.
        language: Tesseract language code(s), or 'auto' to detect the
            language of each image separately.
        include_details: When true, add per-word confidences to the text
            report and write a JSON file per image.

    Returns:
        None. Errors are printed rather than raised.
    """
    # Bound before the try block so the cleanup in `finally` also works when
    # get_images() raises or finds no images (neither directory exists yet).
    out_dir = None
    tmp = None
    try:
        imgs, base, tmp = get_images(path)
        if not imgs:
            print("❌ No valid images found.")
            return

        print(f"📊 Found {len(imgs)} image(s)")

        out_dir = f"ocr_{uuid.uuid4().hex[:6]}"
        os.makedirs(out_dir, exist_ok=True)

        results = []
        report_paths = []

        for i, img in enumerate(imgs, 1):
            image_name = os.path.basename(img)
            stem = os.path.splitext(image_name)[0]
            print(f"🔄 Processing {i}/{len(imgs)}: {image_name}")

            # Detect language if auto mode
            if language == 'auto':
                detected_lang = detect_language(img)
                print(f"  🔍 Detected language: {detected_lang}")
            else:
                detected_lang = language

            result = extract_text_with_details(img, detected_lang)

            txt_path = os.path.join(out_dir, stem + ".txt")
            _write_text_report(txt_path, image_name, detected_lang, result, include_details)
            report_paths.append(txt_path)

            if include_details:
                json_path = os.path.join(out_dir, stem + ".json")
                _write_json_report(json_path, image_name, detected_lang, result)

            results.append({
                'filename': image_name,
                'confidence': result['confidence'],
                'word_count': result['word_count'],
                'text_preview': result['text'][:100] + '...' if len(result['text']) > 100 else result['text']
            })

            print(f"  ✅ Confidence: {result['confidence']}%, Words: {result['word_count']}")

        _write_summary(os.path.join(out_dir, "summary.txt"), results)

        if len(results) == 1:
            # Single image: deliver its text report directly.
            final_file = f"{base}_ocr.txt"
            shutil.move(report_paths[0], final_file)
            print(f"✅ Final output: {final_file}")
        else:
            # Multiple images: bundle everything in the output folder.
            final_file = f"{base}_ocr.zip"
            with zipfile.ZipFile(final_file, 'w', zipfile.ZIP_DEFLATED) as z:
                # The loop variable must not be called `files`: assigning to
                # that name anywhere in this function would make it local
                # and hide the Colab `files` module used below.
                for root, _dirs, names in os.walk(out_dir):
                    for name in names:
                        z.write(os.path.join(root, name), name)
            print(f"✅ Created ZIP: {final_file} ({len(results)} files)")

        if IS_COLAB:
            files.download(final_file)
            print("📥 Download started!")
        else:
            print(f"📁 Output: {os.path.abspath(final_file)}")

    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        # Remove the working folders; the final file lives outside them.
        if out_dir:
            shutil.rmtree(out_dir, ignore_errors=True)
        if tmp:
            shutil.rmtree(tmp, ignore_errors=True)


## ⚙️ Configuration

Set your file path and options here:


In [None]:
# Configuration

# File, folder, or ZIP to process. Leave as None to be prompted for an
# upload when running in Colab.
path_arg = None

# OCR language: 'auto', 'eng', 'vie', 'chi_sim', 'jpn', 'kor', or a
# combination such as 'eng+vie'.
language = 'auto'

# Set to False to skip the per-image JSON file and the word-details section
# of the text report.
include_details = True


## 🎯 Run OCR Extraction

Execute the OCR process:


In [None]:
# Entry point: use the configured path when one is set, otherwise fall back
# to an interactive upload (Colab only).
if path_arg:
    print(f"🚀 Processing: {path_arg}")
    process(path_arg, language, include_details)
elif IS_COLAB:
    print("📤 Upload your image files...")
    uploaded = files.upload()
    # Each key of `uploaded` is the name of one uploaded file.
    # NOTE(review): this assumes the upload is saved under that name in the
    # working directory, as the process() call below already requires.
    for fname in uploaded:
        is_image = fname.lower().endswith(('.png', '.jpg', '.jpeg', '.webp', '.tiff', '.bmp'))
        # ZIP archives are accepted as well: process() can extract them, so
        # rejecting them here would contradict the documented ZIP support.
        if is_image or zipfile.is_zipfile(fname):
            print(f"🚀 Processing: {fname}")
            process(fname, language, include_details)
        else:
            print(f"⚠️ Skipping non-image file: {fname}")
else:
    print("❗ Please set path_arg or upload in Colab.")
