In [1]:
import os
import re
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# === CONFIGURATION ===
# Folder that is scanned for the input PDF files. Raw string so the Windows
# backslashes are taken literally.
pdf_folder = r"F:\Prashant Project\Python AI ML Projects\ClassifiedDocs\New"
# Folder the FAISS vector index is saved to.
index_folder = r"F:\Prashant Project\Python AI ML Projects\ClassifiedDocsVectorIndex"

# === TEXT CLEANING FUNCTION ===
def preprocess_paragraphs(text):
    """Re-flow hard-wrapped text into paragraphs.

    Runs of two or more spaces are collapsed to one. The text is then read
    line by line: each line is stripped, blank lines are dropped, and
    consecutive lines are glued together with a single space until a line
    ending in ``.``, ``?`` or ``:`` closes the paragraph. Any trailing lines
    that never reach such an ending form a final paragraph.

    Returns the paragraphs joined by a blank line (``"\\n\\n"``); an input
    with no non-blank lines yields an empty string.
    """
    collapsed = re.sub(r"[ ]{2,}", " ", text)

    paragraphs = []
    pending = []  # lines of the paragraph currently being assembled

    for raw_line in collapsed.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            # Blank lines are ignored; they do not end a paragraph.
            continue

        pending.append(stripped)
        if re.search(r'[.?:]$', stripped):
            # Terminal punctuation closes the paragraph.
            paragraphs.append(' '.join(pending))
            pending = []

    # Flush whatever is left without closing punctuation.
    if pending:
        paragraphs.append(' '.join(pending))

    return "\n\n".join(paragraphs)

# === TEXT SPLITTER ===
# Chunks of up to 1000 characters, with 200 characters of overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_chunks = []

# === LOAD, PREPROCESS, SPLIT ===
# Match the extension case-insensitively so files named "*.PDF" are not
# silently skipped, and sort because os.listdir returns entries in arbitrary
# order (sorting keeps the chunk order reproducible between runs).
pdf_files = sorted(
    f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')
)

for file in pdf_files:
    pdf_path = os.path.join(pdf_folder, file)
    pages = PyPDFLoader(pdf_path).load()

    for page in pages:
        cleaned_text = preprocess_paragraphs(page.page_content)

        # Keep the metadata the loader attached to the page (PyPDFLoader is
        # documented to record the page number there) instead of discarding
        # it, but keep "source" as the bare file name, as before.
        cleaned_doc = Document(
            page_content=cleaned_text,
            metadata={**page.metadata, "source": file},
        )

        # split_documents copies the document's metadata onto every chunk, so
        # the chunks need no separate "source" assignment.
        all_chunks.extend(splitter.split_documents([cleaned_doc]))

print(f"✅ Processed {len(all_chunks)} chunks from {len(pdf_files)} PDF files.")

# === EMBEDDINGS & FAISS VECTOR STORE ===
# Stop early with a clear message when there is nothing to index: building a
# FAISS store from an empty list otherwise fails inside the library with an
# unhelpful error, and only after the embedding model has been loaded.
if not all_chunks:
    raise ValueError(
        "No text chunks were produced from the input PDFs; nothing to index."
    )

# Sentence-transformers MiniLM model, run on the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={"device": "cpu"},
)

# Embed every chunk, build the index in memory, then save it to disk.
vectorstore = FAISS.from_documents(all_chunks, embeddings)
vectorstore.save_local(index_folder)

print(f"✅ FAISS vector store created and saved at: {index_folder}")



✅ Processed 51 chunks from 1 PDF files.


  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


✅ FAISS vector store created and saved at: F:\Prashant Project\Python AI ML Projects\ClassifiedDocsVectorIndex
