# In [None]:
import pytesseract
from PIL import Image
import os
import re
import cv2
from pdf2image import convert_from_path
import concurrent.futures

# In [None]:
# Render one PDF to page images and OCR every page into its own text file
def convert_pdf(pdf_file):
    """OCR a single PDF, producing one text file per page.

    A sub-directory named after the PDF (extension stripped) is created inside
    the module-level ``pdf_directory``. Each page image returned by
    ``convert_from_path`` is then passed, in order, to
    ``process_image_to_text`` together with its 1-based page number.

    Args:
        pdf_file: File name of the PDF, relative to ``pdf_directory``.

    Returns:
        None. Any exception raised along the way is caught and reported on
        stdout instead of propagating to the caller.
    """
    try:
        # The output folder shares the PDF's base name
        stem, _ext = os.path.splitext(pdf_file)
        pdf_output_dir = os.path.join(pdf_directory, stem)
        os.makedirs(pdf_output_dir, exist_ok=True)

        # Turn the PDF into a list of page images
        source_path = os.path.join(pdf_directory, pdf_file)
        pages = convert_from_path(source_path)

        # Pages are OCR'd one after another by a direct (synchronous) call
        for page_number, page in enumerate(pages, start=1):
            process_image_to_text(
                page,
                page_number=page_number,
                pdf_output_dir=pdf_output_dir,
            )

    except Exception as e:
        print(f"Convert_pdf an error occurred: {str(e)}")

# In [None]:
# Function to OCR a single page image and save the extracted text
def process_image_to_text(image_file, page_number, pdf_output_dir):
    """Run OCR on one page image and write the text to ``page_<n>.txt``.

    Args:
        image_file: Image object to OCR; it is passed unchanged to
            ``pytesseract.image_to_string``.
        page_number: Number used to build the output file name
            (``page_{page_number}.txt``).
        pdf_output_dir: Directory the text file is written into.

    Returns:
        None. Failures during OCR or while writing the file are caught and
        reported on stdout rather than raised.
    """
    # Pillow only sets ``filename`` on images created through ``Image.open``;
    # other image objects lack the attribute. Fall back to an empty string so
    # this diagnostic print can never raise outside the error handling below
    # (which would otherwise abort the caller's remaining pages).
    print(getattr(image_file, "filename", ""))
    try:
        # Perform OCR on the image
        extracted_text = pytesseract.image_to_string(image_file)

        # The output file is named after the page number, not the image
        text_file = os.path.join(pdf_output_dir, f"page_{page_number}.txt")
        print("extracted text from " + pdf_output_dir + " to " + text_file)

        # Save the extracted text as UTF-8 inside the PDF's output directory
        with open(text_file, 'w', encoding='utf-8') as file:
            file.write(extracted_text)

    except Exception as e:
        print(f"process_image_to_text an error occurred: {str(e)}")

# In [None]:
# Script entry point: OCR every PDF found in the input directory.
if __name__ == "__main__":
    # Module-level on purpose: convert_pdf reads ``pdf_directory`` as a global.
    pdf_directory = '2024trends'

    # Keep only directory entries whose name ends with the '.pdf' suffix
    pdf_files = list(
        filter(lambda name: name.endswith('.pdf'), os.listdir(pdf_directory))
    )

    # Fan the PDFs out over a pool of up to 10 worker threads; leaving the
    # ``with`` block waits for all submitted conversions to finish.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(convert_pdf, pdf_files)