In [84]:
import os
import shutil
from pathlib import Path
import nltk
from openai import OpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [86]:
client = OpenAI(
    api_key="Put_your_api_key / or use .env",
)

In [88]:
# Initialize cache to store previously classified chunks
category_cache = {}

In [90]:
def sanitize_folder_name(name):
    # Replace invalid characters for Windows folder names
    name = ''.join(char for char in name if char.isalnum() or char in (' ', '-')).strip()
    return name.replace('\n', '_')  # Replace newlines with underscores

In [92]:
def classify_with_openai(text):
    if text in category_cache:
        return category_cache[text]  # Return cached category if it exists
    try:
        # Call OpenAI to classify the text
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": f"Classify the following text into a single category name (one word): {text}. Provide a name like 'Insurance', 'Finance', etc."}
            ]
        )
         # Extract the category from the response
        category = response.choices[0].message.content.strip()
        if category:
            # Return the first word found, sanitized
            return sanitize_folder_name(category.split()[0])  # Get only the first word
        else:
            return "uncategorized"
    except Exception as e:
        print(f"Error during OpenAI classification: {e}")
        return "uncategorized"

In [94]:
def split_and_classify(content, max_chunks=5):
    """Split large content and classify up to max_chunks."""
    # Split the document into smaller chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(content)

    # Limit the number of chunks to classify (e.g., first 5)
    classified_chunks = []
    for chunk in chunks[:max_chunks]:  # Classify only first few chunks
        category = classify_with_openai(chunk)
        classified_chunks.append(category)

    # Return the most common category among the classified chunks
    return max(set(classified_chunks), key=classified_chunks.count)

In [114]:
def classify_pdfs(input_dir):
    input_dir = Path(input_dir)

    # Ensure the input directory exists
    if not input_dir.exists():
        print(f"Directory {input_dir} not found. Please create it.")
        return

    # Collect all PDF files to process in a list
    files_to_process = list(input_dir.rglob('*.pdf'))

    # Iterate through the collected PDF files
    for file in files_to_process:  # Only look for PDF files
        if file.is_file():
            print(f"Processing {file.name}")
            try:
                # Load PDF content using PyPDFLoader without splitting
                loader = PyPDFLoader(str(file))
                document = loader.load()  # Load the entire document

                # Combine the content of the loaded document into a single string for classification
                full_content = document[0].page_content if document else ""

                # Split and classify the content (handle large files by classifying only part of it)
                final_category = split_and_classify(full_content)

                # Determine the category folder
                if final_category:
                    category_folder = input_dir / final_category
                    print(f"Attempting to create/move to folder: {category_folder}")  # Debug output
                    category_folder.mkdir(parents=True, exist_ok=True)

                    # Move the file to the corresponding category folder
                    shutil.move(str(file), category_folder / file.name)
                    print(f"Classified {file.name} into {category_folder}")
                else:
                    # If no category matched, move to 'uncategorized'
                    uncategorized_folder = input_dir / 'uncategorized'
                    uncategorized_folder.mkdir(parents=True, exist_ok=True)
                    shutil.move(str(file), uncategorized_folder / file.name)
                    print(f"Classified {file.name} into {uncategorized_folder}")

            except Exception as e:
                print(f"Error processing {file.name}: {e}")

In [118]:
if __name__ == "__main__":
    input_dir = r"your_path"  # Specify your input directory here
    classify_pdfs(input_dir)

Processing Attestation CVEC 2023-2024.pdf
Error processing Attestation CVEC 2023-2024.pdf: max() iterable argument is empty
Processing CARNIVORE DIET.pdf
Attempting to create/move to folder: C:\Users\paule\OneDrive\Bureau\2024suce\Diet
Classified CARNIVORE DIET.pdf into C:\Users\paule\OneDrive\Bureau\2024suce\Diet
Processing Etude biblique.pdf
Attempting to create/move to folder: C:\Users\paule\OneDrive\Bureau\2024suce\Religion
Classified Etude biblique.pdf into C:\Users\paule\OneDrive\Bureau\2024suce\Religion
Processing Gym Super-Héro ( à jour sur Drive ).pdf
Attempting to create/move to folder: C:\Users\paule\OneDrive\Bureau\2024suce\Fitness
Classified Gym Super-Héro ( à jour sur Drive ).pdf into C:\Users\paule\OneDrive\Bureau\2024suce\Fitness
Processing Plan épargne 2024.pdf
Attempting to create/move to folder: C:\Users\paule\OneDrive\Bureau\2024suce\Finance
Classified Plan épargne 2024.pdf into C:\Users\paule\OneDrive\Bureau\2024suce\Finance
Processing Programme Musculation ( à jou