In [13]:
import pytesseract
import concurrent.futures
import os
import re
import sys
import json
import requests

In [14]:
def generate(prompt, model='llama2', timeout=None):
    """Send ``prompt`` to the local LLM HTTP endpoint and return the generated text.

    The request is made with ``stream=True`` and the reply is consumed line by
    line; every non-empty line is parsed as a JSON object and the values of its
    ``'response'`` key are concatenated in arrival order.

    Args:
        prompt: Full prompt text to send to the model.
        model: Name of the model to request. Defaults to ``'llama2'``.
        timeout: Optional timeout forwarded to ``requests.post``. ``None``
            (the default) waits indefinitely, as before.

    Returns:
        The concatenated ``'response'`` fragments as a single string.

    Raises:
        requests.exceptions.RequestException: If the request cannot be made,
            the server answers with an HTTP error status, or the stream breaks.
            The error is printed and then re-raised so callers never receive a
            partial or bogus result.
        json.JSONDecodeError: If a non-empty line of the reply is not valid JSON.
    """
    fragments = []
    try:
        # The context manager guarantees the streamed connection is released
        # even if parsing fails halfway through the reply.
        with requests.post('http://localhost:11434/api/generate',
                           json={
                               'model': model,
                               'prompt': prompt,
                           },
                           stream=True,
                           timeout=timeout) as r:
            r.raise_for_status()

            for line in r.iter_lines():
                # iter_lines() may yield empty keep-alive lines; json.loads
                # would choke on them, so skip them.
                if not line:
                    continue
                body = json.loads(line)
                fragments.append(body.get('response', ''))
    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")
        # Re-raise: without a usable response there is nothing to return.
        raise

    return ''.join(fragments)

In [15]:
# Summarize one text file with the LLM and store the summary next to it
def process_text_file(text_file_path):
    """Summarize a text file via ``generate`` and write the result to disk.

    The file is read as UTF-8, its contents are appended to a fixed
    summarization prompt, and the model's answer is written to a sibling file
    named ``summary_<original filename>`` in the same directory. The summary
    file starts with a ``Source : <path>`` line.

    Any exception is caught and reported with ``print`` so that one bad file
    does not abort a batch run; in that case no summary file is guaranteed to
    exist.

    Args:
        text_file_path: Path of the UTF-8 text file to summarize.

    Returns:
        None.
    """
    try:
        with open(text_file_path, 'r', encoding='utf-8') as file:
            text_content = file.read()

        # Instruction that precedes the page text in the request
        prompt = "You are a succint researcher, your job is to objectively summarize the key points from this page in 80 words."

        # Ask the model for the summary and prefix it with its provenance
        response = generate(prompt + " " + text_content)
        summary = "Source : " + text_file_path + "\n" + response

        # Write the summary next to the source file
        filename = os.path.basename(text_file_path)
        new_file_path = os.path.join(os.path.dirname(text_file_path), "summary_" + filename)

        # Write with the same encoding used for reading; the platform default
        # may be unable to represent characters present in the text.
        with open(new_file_path, 'w', encoding='utf-8') as summary_file:
            summary_file.write(summary)

    except Exception as e:
        print(f"Error processing text file: {text_file_path}, Error: {str(e)}")

In [16]:
def main(root_directory='trends2024', max_workers=4):
    """Summarize every ``.txt`` file under ``root_directory`` using a thread pool.

    The directory tree is walked first to collect the files, then all of them
    are submitted to a single ``ThreadPoolExecutor`` so that up to
    ``max_workers`` files are processed concurrently. Files whose name starts
    with ``summary_`` are skipped: ``process_text_file`` writes its output
    under that prefix in the same directory, and re-running would otherwise
    summarize the summaries.

    Args:
        root_directory: Root of the tree to scan. Defaults to ``'trends2024'``.
        max_workers: Maximum number of worker threads. Defaults to 4.

    Returns:
        None.
    """
    try:
        # Collect the work list up front so the walk is not interleaved with
        # worker threads creating new files in the same directories.
        text_paths = [
            os.path.join(root, file)
            for root, _, files in os.walk(root_directory)
            for file in files
            if file.endswith('.txt') and not file.startswith('summary_')
        ]

        # One shared pool for all files; leaving the `with` block waits for
        # every submitted task to finish.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_path = {
                executor.submit(process_text_file, path): path
                for path in text_paths
            }
            for future in concurrent.futures.as_completed(future_to_path):
                try:
                    # Surface anything a worker raised instead of dropping it.
                    future.result()
                except Exception as e:
                    print(f"Error processing text file: {future_to_path[future]}, Error: {str(e)}")
    except Exception as e:
        print(f"Error processing directory: {root_directory}, Error: {str(e)}")

# Entry point: start the batch summarization only when this code runs as the
# top-level program, not when it is imported as a module.
if __name__ == "__main__":
    main()