In [29]:
import pytesseract
import concurrent.futures
import os
import re

In [None]:
# Function to count tokens in a text
def count_tokens(text):
    try:
        # Split text into tokens based on spaces and punctuation
        tokens = re.findall(r'\b\w+\b', text)
        return len(tokens)
    except Exception as e:
        print(f"An error occurred while counting tokens: {str(e)}")
        return 0  # Return 0 tokens in case of an error

In [33]:
# Function to process text files
def process_text_file(text_file):

    # Read the text from the file
    with open(text_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Count tokens in the text
    num_tokens = count_tokens(text)
    
    # Print the file name and number of tokens
    print(f'File: {text_file}, Number of Tokens: {num_tokens}')

    return num_tokens

In [34]:
def main():
    # Directory containing the saved text files (same directory as images)
    text_dir = 'extracted_images'

    # List of text files with '.txt' extension
    text_files = [os.path.join(text_dir, text_file) for text_file in os.listdir(text_dir) if text_file.endswith('.txt')]

    # Number of maximum workers
    max_workers = 10

    # Create a ThreadPoolExecutor with the specified maximum workers
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit the text file processing tasks in parallel and get the token counts
        token_counts = list(executor.map(process_text_file, text_files))

    # Calculate the total tokens
    total_tokens = sum(token_counts)
    print(f'Total Tokens Across All Text Files: {total_tokens}')


if __name__ == "__main__":
    main()

File: extracted_images/image_63.txt, Number of Tokens: 185File: extracted_images/image_48.txt, Number of Tokens: 88

File: extracted_images/image_8.txt, Number of Tokens: 72
File: extracted_images/image_49.txt, Number of Tokens: 180
File: extracted_images/image_60.txt, Number of Tokens: 183
File: extracted_images/image_71.txt, Number of Tokens: 246
File: extracted_images/image_61.txt, Number of Tokens: 166
File: extracted_images/image_62.txt, Number of Tokens: 108
File: extracted_images/image_9.txt, Number of Tokens: 149
File: extracted_images/image_65.txt, Number of Tokens: 97
File: extracted_images/image_74.txt, Number of Tokens: 261
File: extracted_images/image_64.txt, Number of Tokens: 263
File: extracted_images/image_58.txt, Number of Tokens: 250
File: extracted_images/image_59.txt, Number of Tokens: 245
File: extracted_images/image_70.txt, Number of Tokens: 262
File: extracted_images/image_14.txt, Number of Tokens: 108
File: extracted_images/image_12.txt, Number of Tokens: 140
Fi