In [2]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from scipy import ndimage
from skimage import measure
import os
import pytesseract
from PIL import Image
import tempfile
import shutil
from pdf2image import convert_from_path
import pymupdf as fitz
from docx import Document
from docx.shared import Inches
import aspose.words as aw
from PIL import Image, ImageDraw, ImageFont
import re
from datetime import datetime
from collections import Counter

class UniversalOCRProcessor:
    """Turn a supported input file into a list of page images for OCR.

    Supported inputs are raster images, PDFs, Word documents and plain-text
    files; ``supported_extensions`` lists the exact suffixes per type.
    """

    def __init__(self):
        # It supports image, pdfs, docs and txt files.
        # Maps a logical file type to the lower-case extensions selecting it.
        self.supported_extensions = {
            'image': ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.gif'],
            'pdf': ['.pdf'],
            'docx': ['.docx', '.doc'],
            'text': ['.txt'],
        }

    # detecting the uploaded file type
    def detect_file_type(self, file_path):
        """Return the logical file type for ``file_path``.

        The decision is made from the (case-insensitive) extension only.

        Returns:
            One of the keys of ``supported_extensions`` or ``'unsupported'``.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        # os.fspath lets path-like objects through as well as plain strings.
        ext = os.path.splitext(os.fspath(file_path))[1].lower()
        for file_type, exts in self.supported_extensions.items():
            if ext in exts:
                return file_type
        return 'unsupported'

    # pdf to images
    def pdf_to_images(self, pdf_path, dpi=300):
        """Render the PDF via pdf2image's ``convert_from_path`` at ``dpi``."""
        return convert_from_path(pdf_path, dpi=dpi)

    # docs to images
    def docx_to_images(self, docx_path, dpi=300):
        """Render every page of a Word document to an in-memory PIL image.

        Each page is saved as a PNG into a temporary directory with Aspose,
        loaded back, and copied so it survives the directory's removal.
        """
        doc = aw.Document(docx_path)
        images = []
        # Rendering each page to PNG.
        with tempfile.TemporaryDirectory() as temp_dir:
            for i in range(doc.page_count):
                out = os.path.join(temp_dir, f"page_{i}.png")
                opts = aw.saving.ImageSaveOptions(aw.SaveFormat.PNG)
                opts.page_set = aw.saving.PageSet(i)
                opts.resolution = dpi
                doc.save(out, opts)
                # Close the file before the temp dir is deleted; an open
                # handle can make the cleanup fail on some platforms.
                with Image.open(out) as page:
                    images.append(page.copy())
        return images

    # create an image from text.
    def create_text_image(self, text, width=800, height=1000):
        """Draw ``text`` line by line, black on a white RGB canvas.

        NOTE: lines are not wrapped, and drawing stops once the next line
        would start below ``height - 50``; overflowing text is not rendered.
        """
        img = Image.new('RGB', (width, height), color='white')
        draw = ImageDraw.Draw(img)

        try:
            font = ImageFont.truetype("arial.ttf", 20)
        except (OSError, ImportError):
            # Font file missing/unreadable or FreeType support unavailable:
            # fall back to Pillow's built-in font.
            font = ImageFont.load_default()

        y_position = 20
        line_height = 25

        for line in text.split('\n'):
            if y_position > height - 50:
                break
            draw.text((20, y_position), line, fill='black', font=font)
            y_position += line_height

        return img

    @staticmethod
    def _paginate_text(text, max_chars=2000):
        """Split ``text`` into page-sized strings of roughly ``max_chars``.

        Text that already fits is returned as a single page. Longer text is
        split on single spaces; every word is followed by one space, and a
        page is closed as soon as adding the next word would reach the limit.
        """
        if len(text) <= max_chars:
            return [text]

        pages = []
        current_page = ""
        for word in text.split(' '):
            if len(current_page + word) < max_chars:
                current_page += word + " "
            else:
                # Skip the empty page an oversized first word would create.
                if current_page:
                    pages.append(current_page)
                current_page = word + " "
        if current_page:
            pages.append(current_page)
        return pages

    # txt to image.
    def txt_to_images(self, txt_path):
        """Read a UTF-8 text file and render it as one image per page."""
        with open(txt_path, 'r', encoding='utf-8') as file:
            text_content = file.read()

        pages = self._paginate_text(text_content, max_chars=2000)
        return [self.create_text_image(page_text) for page_text in pages]

# Image preprocessing: every page image produced from the supported input files
# is cleaned up with OpenCV (rescaling, denoising, binarization, etc.) before OCR.
class OCRImageProcessor:
    """OpenCV preprocessing steps applied to a page image before OCR.

    Call ``set_image`` first; the other methods default to working on the
    grayscale version it stores when no explicit image is passed.
    """

    def __init__(self):
        # Populated by set_image().
        self.original_image = None
        self.processed_image = None
        self.gray_image = None

    # converts PIL into OpenCV BGR and generates gray scale version.
    def set_image(self, image):
        """Store ``image`` and derive its grayscale version.

        Args:
            image: a PIL image (any mode) or an OpenCV-style NumPy array,
                either 2-D grayscale or BGR(A).
        """
        if isinstance(image, Image.Image):
            # Normalise palette/grayscale/bilevel modes to RGB first: the
            # RGB->BGR conversion below needs a colour array, not a 2-D one.
            rgb = np.array(image.convert('RGB'))
            self.original_image = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
        else:
            self.original_image = image

        self.processed_image = self.original_image.copy()
        if self.original_image.ndim == 2:
            # Already single-channel; BGR2GRAY would reject it.
            self.gray_image = self.original_image.copy()
        else:
            # gray scale version of original image.
            self.gray_image = cv2.cvtColor(self.original_image, cv2.COLOR_BGR2GRAY)

    def rescale_image(self, image=None, scale_factor=2.0, interpolation=cv2.INTER_CUBIC):
        """Return ``image`` resized by ``scale_factor`` in both dimensions."""
        if image is None:
            image = self.gray_image
        height, width = image.shape[:2]
        new_size = (int(width * scale_factor), int(height * scale_factor))
        return cv2.resize(image, new_size, interpolation=interpolation)

    # Binarize the image.
    def binarize_image(self, image=None, method='otsu', threshold_value=127):
        """Return a black/white version of ``image``.

        Only ``method='otsu'`` is implemented; ``threshold_value`` is accepted
        but not used by it.

        Raises:
            ValueError: for any other ``method``.
        """
        if image is None:
            image = self.gray_image
        if method != 'otsu':
            raise ValueError("Invalid binarization method")
        _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary

    # denoising the image.
    def remove_noise(self, image=None, method='median', kernel_size=5):
        """Return ``image`` denoised with a median blur of ``kernel_size``.

        Raises:
            ValueError: for any ``method`` other than ``'median'``.
        """
        if image is None:
            image = self.gray_image
        if method != 'median':
            raise ValueError("Invalid noise removal method")
        return cv2.medianBlur(image, kernel_size)

    def remove_borders(self, image=None, border_size=10):
        """Crop ``border_size`` pixels from every side.

        The image is returned unchanged when it is too small to crop.
        """
        if image is None:
            image = self.gray_image
        h, w = image.shape[:2]
        if h > border_size * 2 and w > border_size * 2:
            return image[border_size:h - border_size, border_size:w - border_size]
        return image

    def add_borders(self, image=None, border_size=20, border_color=255):
        """Pad every side with a constant-colour border of ``border_size`` pixels."""
        if image is None:
            image = self.gray_image
        return cv2.copyMakeBorder(
            image, border_size, border_size, border_size, border_size,
            cv2.BORDER_CONSTANT, value=border_color,
        )

    # full preprocessing
    def preprocess_complete(self, scale_factor=2.0, binarization_method='otsu', noise_removal_method='median', remove_border=True, add_border=True):
        """Run the whole pipeline on the stored grayscale image.

        Order: rescale, denoise, optional border crop, binarize, optional
        border padding. The result is also stored in ``processed_image``.
        """
        processed = self.gray_image.copy()
        # Step 1: Rescale
        processed = self.rescale_image(processed, scale_factor)

        # Step 2: Noise removal
        processed = self.remove_noise(processed, method=noise_removal_method)

        # Step 3: Remove borders if needed
        if remove_border:
            processed = self.remove_borders(processed)

        # Step 4: Binarization
        processed = self.binarize_image(processed, method=binarization_method)

        # Step 5: Add borders if needed
        if add_border:
            processed = self.add_borders(processed)

        self.processed_image = processed
        return processed
        
class OCRMetadataExtractor:
    """Derive simple, heuristic metadata from OCR-extracted text."""

    # Title-like line shapes, tried in this order for each candidate line.
    _TITLE_PATTERNS = (
        re.compile(r'^[A-Z][A-Za-z\s]{10,80}$'),    # Capitalized sentences
        re.compile(r'^[A-Z\s]{5,}$'),               # ALL CAPS titles
        re.compile(r'^\d+\.\s*[A-Za-z\s]{5,80}$'),  # Numbered titles
    )

    # Checked top to bottom; the first label with a matching keyword wins.
    _DOCUMENT_TYPES = (
        ('Invoice/Bill', ('invoice', 'bill', 'payment')),
        ('Certificate', ('certificate', 'certification')),
        ('Contract', ('contract', 'agreement')),
        ('Report', ('report', 'analysis')),
        ('Letter', ('letter', 'dear')),
    )

    def __init__(self):
        # Common words ignored when ranking keywords.
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'this',
            'that', 'these', 'those',
        }

    def extract_metadata(self, extracted_text, file_path=None, total_pages=1):
        """Build the metadata dictionary for ``extracted_text``.

        Args:
            extracted_text: the OCR text; ``None`` is treated as empty text.
            file_path: stored verbatim under ``'file_path'``.
            total_pages: stored verbatim under ``'total_pages'``.

        Returns:
            dict with keys ``file_path``, ``extraction_date``, ``total_pages``,
            ``total_words``, ``total_characters``, ``title``, ``primary_date``,
            ``keywords``, ``summary``, ``emails``, ``phone_numbers`` and
            ``document_type``.
        """
        text = "" if extracted_text is None else extracted_text
        metadata = {
            'file_path': file_path,
            'extraction_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'total_pages': total_pages,
            'total_words': len(text.split()),
            'total_characters': len(text),
        }
        # Extract title
        metadata['title'] = self._extract_title(text)

        # Extract primary date
        metadata['primary_date'] = self._extract_primary_date(text)

        # Extract keywords
        metadata['keywords'] = self._extract_keywords(text)

        # Generate summary
        metadata['summary'] = self._generate_summary(text)

        # Extract contact information
        metadata['emails'] = self._extract_emails(text)
        metadata['phone_numbers'] = self._extract_phone_numbers(text)

        # Document type
        metadata['document_type'] = self._classify_document_type(text)
        return metadata

    def _extract_title(self, text):
        """Enhanced title extraction with multiple strategies.

        Returns ``"No title found"`` when the text has no non-blank line.
        """
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        if not lines:
            return "No title found"

        # Strategy 1: Look for title-like patterns in the first 5 lines.
        for line in lines[:5]:
            if 5 <= len(line) <= 100 and any(p.match(line) for p in self._TITLE_PATTERNS):
                return line

        # Strategy 2: Longest meaningful line among the first 8 lines,
        # ignoring lines that look like page markers or contact details.
        skip_markers = ('page', 'www.', 'http', '@', 'tel:', 'fax:')
        candidates = [
            line for line in lines[:8]
            if 5 <= len(line) <= 150
            and not any(marker in line.lower() for marker in skip_markers)
        ]
        if candidates:
            # max() keeps the earliest line when several share the top length.
            return max(candidates, key=len)

        # Strategy 3: Fallback to first substantial line
        for line in lines[:5]:
            if 5 <= len(line) <= 200:
                return line

        # Strategy 4: Last resort - first line, truncated to 100 characters.
        first = lines[0]
        return first[:100] + "..." if len(first) > 100 else first

    def _extract_primary_date(self, text):
        """Return the first date-like substring, or ``None``.

        Patterns are tried in order (numeric, "12 Mar 2021", "March 12, 2021"),
        so an earlier pattern wins even if its match appears later in the text.
        """
        date_patterns = [
            r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',
            r'\b\d{1,2}\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4}\b',
            r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{2,4}\b',
        ]

        for pattern in date_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group()

        return None

    def _extract_keywords(self, text, max_keywords=10):
        """Return up to ``max_keywords`` most frequent non-stop words.

        Only purely alphabetic words of four or more letters are counted,
        case-insensitively.
        """
        words = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())
        word_freq = Counter(word for word in words if word not in self.stop_words)
        return [word for word, _ in word_freq.most_common(max_keywords)]

    def _generate_summary(self, text, max_sentences=2):
        """Return a short extractive summary of at most ``max_sentences``.

        Sentences are split on ``.``, ``!`` and ``?``; fragments of 30
        characters or fewer are ignored. When more sentences are available
        than allowed, the leading ones plus the final one are kept (only the
        first sentence when ``max_sentences`` is below 2).
        """
        fragments = (s.strip() for s in re.split(r'[.!?]+', text))
        sentences = [s for s in fragments if len(s) > 30]

        if not sentences:
            return "No summary available"

        if len(sentences) <= max_sentences:
            return '. '.join(sentences) + '.'

        if max_sentences < 2:
            chosen = sentences[:1]
        else:
            # Lead sentences plus the closing one; default 2 -> first and last.
            chosen = sentences[:max_sentences - 1] + [sentences[-1]]

        return '. '.join(chosen) + '.'

    def _extract_emails(self, text):
        """Return up to three e-mail addresses found in ``text``."""
        # The TLD class is letters only; a '|' inside the brackets would be
        # matched literally rather than acting as alternation.
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        return re.findall(email_pattern, text)[:3]

    def _extract_phone_numbers(self, text):
        """Return up to three North-American style phone numbers."""
        # A look-behind replaces a leading \b so that numbers starting with
        # '(' or '+' keep that first character in the match.
        phone_pattern = r'(?<!\w)(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b'
        return re.findall(phone_pattern, text)[:3]

    def _classify_document_type(self, text):
        """Classify the document by the first keyword group found in it.

        Matching is a case-insensitive substring test, so a keyword also
        matches inside longer words.
        """
        text_lower = text.lower()
        for label, keywords in self._DOCUMENT_TYPES:
            if any(word in text_lower for word in keywords):
                return label
        return 'General Document'
         
         
def process_and_view_ocr(images, show_images=False, save_results=False, output_dir="ocr_results"):
    """Preprocess each page image, OCR it, and return the combined text.

    Args:
        images: sequence of page images accepted by ``OCRImageProcessor.set_image``.
        show_images: accepted for interface compatibility; not used here.
        save_results: when true, write each processed image, each page's text
            and the combined text into ``output_dir``.
        output_dir: target directory, created if missing when saving.

    Returns:
        The text of all pages; with more than one page, every page is
        preceded by a ``PAGE n`` banner.
    """
    image_processor = OCRImageProcessor()

    if save_results:
        os.makedirs(output_dir, exist_ok=True)

    multi_page = len(images) > 1
    rule = '=' * 60
    chunks = []

    for page_number, page in enumerate(images, start=1):
        # Preprocess, then hand the cleaned-up page to Tesseract.
        image_processor.set_image(page)
        cleaned = image_processor.preprocess_complete()
        page_text = pytesseract.image_to_string(cleaned)

        # Banner only when there is something to tell pages apart.
        if multi_page:
            chunks.append(f"\n{rule}\n")
            chunks.append(f"PAGE {page_number}\n")
            chunks.append(f"{rule}\n\n")
        chunks.append(page_text + "\n")

        if save_results:
            image_target = os.path.join(output_dir, f"page_{page_number:03d}_processed.png")
            text_target = os.path.join(output_dir, f"page_{page_number:03d}_text.txt")
            cv2.imwrite(image_target, cleaned)
            with open(text_target, 'w', encoding='utf-8') as handle:
                handle.write(page_text)

    complete_text = "".join(chunks)

    if save_results:
        combined_target = os.path.join(output_dir, "complete_extracted_text.txt")
        with open(combined_target, 'w', encoding='utf-8') as handle:
            handle.write(complete_text)

    return complete_text

def universal_ocr(file_path, show_images=False, save_results=False, output_dir="ocr_results"):
    """Extract text from an image, PDF, Word document or text file.

    The file is converted to page images according to its detected type and
    the pages are passed to ``process_and_view_ocr``.

    Args:
        file_path: path of the input file.
        show_images, save_results, output_dir: forwarded unchanged to
            ``process_and_view_ocr``.

    Returns:
        The extracted text of all pages as one string.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        ValueError: if the file extension is not supported.
    """
    # Initialize processor
    processor = UniversalOCRProcessor()

    # Detect file type
    file_type = processor.detect_file_type(file_path)

    if file_type == 'unsupported':
        raise ValueError(f"Unsupported file type: {os.path.splitext(file_path)[1]}")

    # Convert to images based on file type
    images = []
    if file_type == 'image':
        # Copy the pixels and close the file straight away so no handle
        # stays open (Pillow keeps multi-frame files open until closed).
        with Image.open(file_path) as img:
            images = [img.copy()]
    elif file_type == 'pdf':
        images = processor.pdf_to_images(file_path)
    elif file_type == 'docx':
        images = processor.docx_to_images(file_path)
    elif file_type == 'text':
        images = processor.txt_to_images(file_path)

    # Process images and extract text
    extracted_text = process_and_view_ocr(
        images,
        show_images=show_images,
        save_results=save_results,
        output_dir=output_dir,
    )
    return extracted_text
def universal_ocr_with_metadata(file_path, show_images=False, save_results=False, output_dir="ocr_results"):
    """Extract text from a supported file and derive metadata from it.

    Args:
        file_path: path of the input file.
        show_images, save_results, output_dir: forwarded unchanged to
            ``process_and_view_ocr``.

    Returns:
        ``(extracted_text, metadata)`` where ``metadata`` is the dictionary
        built by ``OCRMetadataExtractor.extract_metadata``; its
        ``total_pages`` is the number of page images that were processed.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        ValueError: if the file extension is not supported.
    """
    # Initialize processors
    processor = UniversalOCRProcessor()
    metadata_extractor = OCRMetadataExtractor()

    # Detect file type
    file_type = processor.detect_file_type(file_path)

    if file_type == 'unsupported':
        raise ValueError(f"Unsupported file type: {os.path.splitext(file_path)[1]}")

    # Convert to images based on file type
    images = []
    if file_type == 'image':
        # Copy the pixels and close the file straight away so no handle
        # stays open (Pillow keeps multi-frame files open until closed).
        with Image.open(file_path) as img:
            images = [img.copy()]
    elif file_type == 'pdf':
        images = processor.pdf_to_images(file_path)
    elif file_type == 'docx':
        images = processor.docx_to_images(file_path)
    elif file_type == 'text':
        images = processor.txt_to_images(file_path)

    # Extract text
    extracted_text = process_and_view_ocr(
        images,
        show_images=show_images,
        save_results=save_results,
        output_dir=output_dir,
    )

    # Extract metadata
    metadata = metadata_extractor.extract_metadata(
        extracted_text,
        file_path=file_path,
        total_pages=len(images),
    )

    return extracted_text, metadata

def print_metadata(metadata):
    """Print a human-readable report of an OCR metadata dictionary.

    Missing header fields and a missing title are shown as ``N/A``; the
    date, keywords, summary and contact sections are printed only when
    their value is present and non-empty.
    """
    print("DOCUMENT METADATA")

    # Fixed header: label -> metadata key, printed in this order.
    header_fields = (
        ("File", 'file_path'),
        ("Extracted", 'extraction_date'),
        ("Type", 'document_type'),
        ("Pages", 'total_pages'),
        ("Words", 'total_words'),
        ("Characters", 'total_characters'),
    )
    for label, key in header_fields:
        print(f"{label}: {metadata.get(key, 'N/A')}")

    print("\nTitle:")
    print(f"{metadata.get('title', 'N/A')}")

    primary_date = metadata.get('primary_date')
    if primary_date:
        print(f"\nDate Found: {primary_date}")

    keywords = metadata.get('keywords')
    if keywords:
        print("\nKeywords:")
        print(f"   {', '.join(keywords)}")

    summary = metadata.get('summary')
    if summary:
        print("\nSummary:")
        print(f"{summary}")

    # Contact info: one comma-separated line per non-empty kind.
    contact_lines = [
        ', '.join(metadata[key])
        for key in ('emails', 'phone_numbers')
        if metadata.get(key)
    ]
    if contact_lines:
        print("\nContact Info:")
        for entry in contact_lines:
            print(f"{entry}")

# Usage Examples
# if __name__ == "__main__":
#     file_path = "Books_CFD.jpg"
    
#     # Option 1: Just extract text
#     # print("="*80)
#     # print("OPTION 1: TEXT EXTRACTION ONLY")
#     # print("="*80)
#     # result_text = universal_ocr(file_path, show_images=False, save_results=False)
#     # print(result_text)
    
#     # Option 2: Extract text with metadata
#     print("OPTION 2: TEXT EXTRACTION WITH METADATA")
#     # print("="*80)
#     result_text, metadata = universal_ocr_with_metadata(
#         file_path, 
#         show_images=False, 
#         save_results=False
#     )
    
#     print("EXTRACTED TEXT:")
#     print(result_text)
    
#     # Print metadata
#     print_metadata(metadata)

if __name__ == "__main__":
    # Demo run: OCR a sample image and print only its metadata report.
    # The path is relative, so the image must be in the working directory.
    file_path = "Books_CFD.jpg"
    # The extracted text (first element of the returned tuple) is discarded.
    _, metadata = universal_ocr_with_metadata(file_path)
    print_metadata(metadata)

DOCUMENT METADATA
File: Books_CFD.jpg
Extracted: 2025-06-25 21:32:41
Type: General Document
Pages: 1
Words: 79
Characters: 535

Title:
¢ P. S. Ghoshdastidar, “Computational Fluld

Keywords:
   computational, fluid, dynamics, heat, transfer, delhi, flow, suggested, books, ghoshdastidar

Summary:
Ghoshdastidar, “Computational Fluld
Dynamics and Heat Transfer’, Cengage
Learning India Pvt. Date, “Introduction to Computational Fluid
Dynamics’, Cambridge Univ.
