In [5]:
# Import Dependencies
import os
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler  # Import FileSystemEventHandler
from PyPDF2 import PdfFileReader

In [8]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_file: Path of the PDF to read; it is opened in binary mode.

    Returns:
        str: The text of all pages, joined in page order with no separator.
            A document without pages yields an empty string.
    """
    with open(pdf_file, 'rb') as f:
        reader = PdfFileReader(f)
        # Iterate `reader.pages` instead of the deprecated numPages/getPage
        # pair, and join once rather than growing a string page by page.
        # `or ''` keeps a page that yields no text from breaking the join.
        # The join runs inside the `with` so pages are read while `f` is open.
        return ''.join(page.extract_text() or '' for page in reader.pages)

In [9]:
# Function to chunk text
def chunk_text(text, chunk_size=500):
    """Split text into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        list: The chunks in their original order. Every chunk except possibly
            the last has exactly ``chunk_size`` characters. Empty ``text``
            yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop all text,
    # and a zero step raises an unhelpful range() error; fail clearly instead.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)]

In [10]:
# Define a handler for file system events (new file creation)
class NewFileHandler(FileSystemEventHandler):
    """File system event handler that processes newly created PDF files."""

    def on_created(self, event):
        """Run ``process_new_pdf`` on a newly created PDF file.

        Directory events and files whose name does not end in ``.pdf``
        (compared case-insensitively) are ignored.

        Args:
            event: The creation event; ``is_directory`` and ``src_path`` are read.
        """
        if event.is_directory:
            return
        # Compare the extension case-insensitively so 'REPORT.PDF' is not skipped.
        if not event.src_path.lower().endswith('.pdf'):
            return
        try:
            process_new_pdf(event.src_path)
        except Exception as exc:
            # NOTE(review): an exception escaping a handler appears to end
            # watchdog's dispatch thread, which would stop monitoring silently
            # -- confirm against the watchdog docs. Report the failure and
            # keep watching so one unreadable file does not halt the pipeline.
            print(f"Failed to process PDF file {event.src_path}: {exc}")

In [12]:
# Function to process a new PDF file
def process_new_pdf(pdf_file):
    """Extract, chunk, and print the text of a single PDF file.

    Args:
        pdf_file: Path of the PDF to process.
    """
    print(f"Processing new PDF file: {pdf_file}")
    # Extraction feeds straight into chunking (default chunk size).
    chunks = chunk_text(extract_text_from_pdf(pdf_file))
    # Placeholder for further processing (indexing, etc.); for now each
    # chunk is echoed with a 1-based number.
    for number, piece in enumerate(chunks, start=1):
        print(f"Chunk {number}: {piece}")

In [13]:
# Main function to start monitoring
def start_monitoring(path='path/to/your/Data/folder'):
    """Watch a folder for new PDF files until interrupted.

    Schedules a ``NewFileHandler`` on ``path`` (non-recursively), starts the
    observer, and then blocks in a one-second sleep loop. A
    ``KeyboardInterrupt`` ends the loop normally; any other exception
    propagates. In both cases the observer is stopped and joined first.

    Args:
        path: Folder to monitor. Defaults to the placeholder path, which
            should be replaced with the real 'Data' folder.
    """
    event_handler = NewFileHandler()
    observer = Observer()
    observer.schedule(event_handler, path, recursive=False)
    observer.start()
    print(f"Monitoring folder '{path}' for new PDF files...")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C / kernel interrupt is the expected way to end monitoring.
        pass
    finally:
        # Always shut the observer down, not only on KeyboardInterrupt, so its
        # thread is not left running if the loop exits for any other reason.
        observer.stop()
        observer.join()


In [14]:
# Start monitoring the 'Data' folder.
# NOTE: start_monitoring() loops until it receives a KeyboardInterrupt, so
# this call blocks; interrupt it to stop monitoring.
if __name__ == "__main__":
    start_monitoring()

Monitoring folder 'path/to/your/Data/folder' for new PDF files...
