In [None]:
import cv2
import pytesseract
from PIL import Image
import fitz  # PyMuPDF
import docx
import json
import numpy as np
import io
import os
import shutil
from typing import List, Tuple, Dict
import pdb
# Function to clear the output directory
def clear_output_directory(output_dir: str) -> None:
    """Leave ``output_dir`` as an existing, empty directory.

    If something already exists at ``output_dir`` the whole tree is removed
    first; the directory is then created again from scratch.

    Args:
        output_dir: Path of the directory to reset.
    """
    already_present = os.path.exists(output_dir)
    if already_present:
        # Drop the previous tree together with everything inside it.
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

# Function to preprocess documents and convert to images
def preprocess_document(file_path: str) -> List[Image.Image]:
    """Turn a document on disk into a list of PIL images.

    The handler is picked from the extension of ``file_path``. The comparison
    is case-insensitive, so ``SCAN.PDF`` is treated like ``scan.pdf``:

    * ``.pdf`` is passed to ``extract_images_from_pdf``.
    * ``.doc`` / ``.docx`` are passed to ``extract_images_from_doc``.
    * Anything else is opened directly as a single image.

    Args:
        file_path: Path of the input file.

    Returns:
        The images produced by the selected handler; for the plain-image
        case, a one-element list holding the image converted to RGB.
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return extract_images_from_pdf(file_path)
    # NOTE(review): legacy ``.doc`` files go to the same reader as ``.docx``;
    # confirm the reader can actually open the old binary format.
    if lowered.endswith(('.doc', '.docx')):
        return extract_images_from_doc(file_path)
    return [Image.open(file_path).convert("RGB")]

# Function to extract images from PDF
def extract_images_from_pdf(file_path: str) -> List[Image.Image]:
    """Render every page of a PDF into an RGB PIL image.

    Each page is rasterised with ``page.get_pixmap()`` using its default
    settings and the resulting samples are wrapped in a PIL image.

    Args:
        file_path: Path of the PDF file.

    Returns:
        One image per page, in page order.
    """
    images = []
    # Use the document as a context manager so the file handle is released
    # even when rendering a page raises.
    with fitz.open(file_path) as doc:
        for page in doc:
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images

# Function to extract images from DOC/DOCX
def extract_images_from_doc(file_path: str) -> List[Image.Image]:
    """Collect the pictures embedded in a Word document as RGB PIL images.

    The relationships of the main document part are scanned; every internal
    relationship whose target reference contains ``"image"`` is decoded from
    the target part's raw bytes.

    Args:
        file_path: Path of the Word document.

    Returns:
        The decoded images, in the order the relationships are iterated.
    """
    images = []
    doc = docx.Document(file_path)
    for rel in doc.part.rels.values():
        # Linked (external) pictures have no embedded part to read bytes
        # from; asking for their target part raises, so skip them.
        if rel.is_external:
            continue
        if "image" in rel.target_ref:
            image_data = rel.target_part.blob
            img = Image.open(io.BytesIO(image_data)).convert("RGB")
            images.append(img)
    return images

# Function to resize images
def resize_image(
    image: Image.Image,
    max_size: Tuple[int, int] = (2000, 2000),
    min_size: int = 800,
) -> Tuple[Image.Image, float]:
    """Rescale an image so its dimensions fall in a fixed size window.

    * If the image is taller than ``max_size[0]`` or wider than
      ``max_size[1]`` it is shrunk, keeping the aspect ratio, until it fits.
    * Otherwise, if either side is shorter than ``min_size`` it is enlarged,
      keeping the aspect ratio. The enlargement is capped so that neither
      side grows past ``max_size``; without the cap a long, thin image would
      be blown up far beyond the limit.
    * Otherwise the pixel data is returned unscaled.

    Args:
        image: Image to rescale.
        max_size: ``(max_height, max_width)`` upper bounds in pixels.
        min_size: Lower bound in pixels applied to both height and width.

    Returns:
        ``(resized_image, scaling_factor)`` where ``scaling_factor`` is the
        factor applied to both axes (``1.0`` when nothing was resized).
    """
    img = np.array(image)
    height, width = img.shape[:2]
    max_height, max_width = max_size

    scaling_factor = 1.0
    interpolation = None
    if height > max_height or width > max_width:
        scaling_factor = min(max_height / height, max_width / width)
        interpolation = cv2.INTER_AREA
    elif height < min_size or width < min_size:
        wanted = max(min_size / height, min_size / width)
        # Never enlarge past the max_size window.
        scaling_factor = min(wanted, max_height / height, max_width / width)
        interpolation = cv2.INTER_LINEAR

    if interpolation is not None and scaling_factor != 1:
        img = cv2.resize(img, None, fx=scaling_factor, fy=scaling_factor, interpolation=interpolation)
    else:
        scaling_factor = 1.0  # No resizing needed
    return Image.fromarray(img), scaling_factor

# Function to extract text lines using OCR
def extract_text_lines(image: Image.Image) -> List[Dict[str, Dict[str, int]]]:
    """Run OCR on an image and return the detected text with bounding boxes.

    The image is first rescaled with ``resize_image``, converted to
    grayscale, binarised with an adaptive Gaussian threshold and then passed
    to ``pytesseract.image_to_data``. Box coordinates are divided by the
    scaling factor so they refer to the image that was passed in, not to the
    rescaled copy.

    Args:
        image: Image to run OCR on.

    Returns:
        One dict per detection with a confidence above zero, of the form
        ``{"text": str, "location": {"x", "y", "width", "height"}}`` with
        integer pixel values.
    """
    image, scaling_factor = resize_image(image)
    img = np.array(image)

    # Check if the image is already in grayscale
    if len(img.shape) == 2:
        gray = img
    else:
        # The images built elsewhere in this file are RGB, so convert from
        # RGB (not BGR) to get correct grayscale channel weights.
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    adaptive_threshold = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    d = pytesseract.image_to_data(adaptive_threshold, output_type=pytesseract.Output.DICT)
    n_boxes = len(d['level'])
    text_lines = []
    for i in range(n_boxes):
        # Confidence may arrive as an int, a float, or a string such as
        # '96.54'; float() accepts all of them where int() would raise.
        if float(d['conf'][i]) > 0:  # Filter out low confidence detections
            (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
            # Adjust the coordinates according to the scaling factor
            text_lines.append({
                "text": d['text'][i],
                "location": {"x": int(x / scaling_factor), "y": int(y / scaling_factor),
                             "width": int(w / scaling_factor), "height": int(h / scaling_factor)}
            })
    return text_lines

# Function to save output images and JSON
def save_output(images: List[Image.Image], text_lines: List[List[Dict[str, Dict[str, int]]]], output_prefix: str) -> None:
    """Write annotated page images and a JSON dump of the OCR results.

    The directory named ``output_prefix`` is reset, then each image is saved
    there as ``page_<n>.png`` (numbered from 1) with a green rectangle drawn
    around every entry of the matching ``text_lines`` element. All of
    ``text_lines`` is also written to ``<output_prefix>_output.json``, next
    to the directory rather than inside it.

    Args:
        images: Page images; converted from RGB to BGR before drawing.
        text_lines: Per-page lists of detections, indexed like ``images``.
        output_prefix: Name of the output directory and prefix of the JSON file.
    """
    output_dir = f"{output_prefix}"
    clear_output_directory(output_dir)

    for page_index, image in enumerate(images):
        # OpenCV drawing and writing work on BGR arrays.
        canvas = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        for entry in text_lines[page_index]:
            box = entry['location']
            left, top = box['x'], box['y']
            box_width, box_height = box['width'], box['height']
            cv2.rectangle(canvas, (left, top), (left + box_width, top + box_height), (0, 255, 0), 2)

        page_path = os.path.join(output_dir, f"page_{page_index + 1}.png")
        cv2.imwrite(page_path, canvas)

    with open(f"{output_prefix}_output.json", 'w') as json_file:
        json.dump(text_lines, json_file, indent=4)

# Function to process the document and extract text lines
def process_document(file_path: str, output_prefix: str) -> None:
    """Run the whole pipeline for one input file.

    The file is converted to images with ``preprocess_document``, every image
    is passed through ``extract_text_lines``, and the images together with
    their per-image results are handed to ``save_output``.

    Args:
        file_path: Path of the document or image to process.
        output_prefix: Output prefix forwarded to ``save_output``.
    """
    pages = preprocess_document(file_path)
    # One list of detections per page, in page order.
    per_page_lines = [extract_text_lines(page) for page in pages]
    save_output(pages, per_page_lines, output_prefix)

# Example usage. Guarded so that importing this file does not start an OCR
# run; it still executes when run as a script or inside a notebook cell.
if __name__ == "__main__":
    file_path = "/home/vanellope/Pictures/448646825_3690120521261666_1678735826989084568_n.png"  # Change this to your file path
    output_prefix = 'output'  # Change this to your desired output prefix
    process_document(file_path, output_prefix)

> /tmp/ipykernel_52478/3350174162.py(68)extract_text_lines()
     66     img = np.array(image)
     67     pdb.set_trace()
---> 68     print(img.shape)
     69
     70     # Check if the image is already in grayscale



ipdb>  img.size


3036000


ipdb>  img.shape


(800, 1265, 3)
