In [2]:
import os
import json
import hashlib
from datetime import datetime
from PyPDF2 import PdfFileReader
#from langchain.embeddings import LlamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

In [3]:
# Directory containing PDF files. The path is machine-specific, so it can be
# overridden with the RAG_PDF_DIRECTORY environment variable; when the variable
# is unset the original location is used.
pdf_directory = os.environ.get('RAG_PDF_DIRECTORY', '/mnt/c/ML/DU/local_rag_llm/data')
# JSON file (relative path, so resolved against the current working directory)
# that records the hash and modification time of each PDF already processed.
metadata_file = 'processed_files_metadata.json'

In [4]:
def compute_file_hash(filepath, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in fixed-size chunks so that a large PDF is never held
    in memory all at once. MD5 serves here only as a change-detection
    fingerprint, not as a security measure.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration; must be positive.

    Returns:
        The MD5 digest of the file contents as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) would return b'' immediately and silently hash nothing.
        raise ValueError('chunk_size must be a positive number of bytes')

    hasher = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel keeps calling f.read until it returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

In [5]:
def load_and_chunk_new_pdfs(directory, metadata_file):
    """Load every PDF in ``directory`` that is new or changed since the last run.

    A JSON metadata file maps each processed filename to the content hash and
    modification time recorded for it. A PDF is (re)loaded when it has no
    record, or when either its hash or its modification time differs from the
    record. The metadata file is rewritten at the end of every call.

    Despite the name, this function only loads documents; splitting them into
    chunks is left to the caller.

    Args:
        directory: Directory to scan (not recursive) for PDF files.
        metadata_file: Path of the JSON file holding the per-file records.
            It is created if it does not exist.

    Returns:
        A list with the documents returned by ``PyPDFLoader(...).load()`` for
        each new or changed PDF, in sorted filename order.
    """
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
    else:
        metadata = {}

    new_documents = []
    # Sorted so the order of the returned documents does not depend on the
    # arbitrary order in which os.listdir reports entries.
    for filename in sorted(os.listdir(directory)):
        # Match the extension case-insensitively so 'REPORT.PDF' is picked up.
        if not filename.lower().endswith('.pdf'):
            continue
        filepath = os.path.join(directory, filename)
        # A directory that merely has a '.pdf' name cannot be hashed or loaded.
        if not os.path.isfile(filepath):
            continue

        file_hash = compute_file_hash(filepath)
        last_modified = os.path.getmtime(filepath)
        previous = metadata.get(filename)
        is_new_or_changed = (
            previous is None
            or previous['hash'] != file_hash
            or previous['last_modified'] != last_modified
        )
        if is_new_or_changed:
            loader = PyPDFLoader(filepath)
            new_documents.extend(loader.load())
            # Recorded only after a successful load; a loader error propagates
            # before the record for this file is updated.
            metadata[filename] = {
                'hash': file_hash,
                'last_modified': last_modified
            }

    # Save updated metadata
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f)

    return new_documents

In [6]:
# Load the new/updated PDFs and split them into overlapping chunks.
# NOTE: RecursiveCharacterTextSplitter splits on character separators up to a
# size limit; it is not a semantic (embedding-based) chunker.
CHUNK_SIZE = 512     # upper bound on chunk length handed to the splitter
CHUNK_OVERLAP = 50   # amount of text shared between consecutive chunks

new_documents = load_and_chunk_new_pdfs(pdf_directory, metadata_file)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
new_chunks = text_splitter.split_documents(new_documents)