In [26]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [27]:
def process_all_pdfs(pdf_directory):
    """Load every PDF found under ``pdf_directory`` into a flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    read with ``PyPDFLoader`` and every document it yields is tagged with two
    extra metadata keys: ``source_file`` (the file's base name) and
    ``file_type`` (always ``'pdf'``).

    Files are visited in sorted path order so that the returned list -- and
    anything derived from its ordering downstream -- is the same on every run
    and on every filesystem.

    Loading is best effort: a file that raises while being read or tagged is
    reported on stdout and skipped; the remaining files are still processed.

    Args:
        pdf_directory: Directory to search (anything ``pathlib.Path`` accepts).

    Returns:
        list: Documents from all successfully loaded PDFs, in sorted file order.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # glob() yields matches in an arbitrary, filesystem-dependent order;
    # sorting makes the document order reproducible across runs and machines.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nprocessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Tag each document so its origin survives later splitting/storage.
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f" Loaded {len(documents)} pages")

        except Exception as e:
            # Deliberately non-fatal: one unreadable PDF must not abort the batch.
            print(f"Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Load every PDF below the project's data directory (path is relative to the notebook).
all_pdf_documents = process_all_pdfs(pdf_directory="../data")


Found 9 PDF files to process

processing: 1524838021991306.pdf
 Loaded 13 pages

processing: 18.Dr.AV LoRA.pdf
 Loaded 4 pages

processing: 2272.pdf
 Loaded 21 pages

processing: Application_for_Women_Safety_Spark_Women.pdf
 Loaded 7 pages

processing: doc.pdf
 Loaded 6 pages

processing: IJRESM_V3_I4_5.pdf
 Loaded 6 pages

processing: iot-based-wearable-safety-device-for-women-IJERTV9IS050697.pdf
 Loaded 4 pages

processing: PerceivedRiskStreetHarassmentandEdChoicesofWomen_Borker.pdf
 Loaded 69 pages

processing: “SafeSteps” A Flutter -Based Application to provide security to Women’s.pdf
 Loaded 2 pages

Total documents loaded: 132


In [28]:
# Preview only the first few documents; displaying the whole list floods the
# notebook with the repr of every loaded page.
all_pdf_documents[:3]

[Document(metadata={'producer': 'Acrobat Distiller 10.0.0 (Windows); modified using iText 4.2.0 by 1T3XT', 'creator': 'Arbortext Advanced Print Publisher 9.0.114/W', 'creationdate': '2021-02-19T12:14:30+05:30', 'keywords': 'safe spaces,evaluation,conflict,gender-based violence,systematic review', 'moddate': '2021-02-25T03:55:31-08:00', 'subject': 'Trauma, Violence, & Abuse 0.0:1524838021991306', 'author': 'Lindsay Stark, Mackenzie V. Robinson, Ilana Seff, Alli Gillespie, Jonathan Colarelli, and Debbie Landis', 'title': 'The Effectiveness of Women and Girls Safe Spaces: A Systematic Review of Evidence to Address Violence Against Women and Girls in Humanitarian Contexts', 'source': '..\\data\\pdf\\1524838021991306.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': '1524838021991306.pdf', 'file_type': 'pdf'}, page_content='Review Manuscript\nThe Effectiveness of Women and Girls Safe\nSpaces: A Systematic Review of Evidence\nto Address Violence Against Women\nand Girls i

In [29]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping character chunks.

    Builds a ``RecursiveCharacterTextSplitter`` that measures length with
    ``len`` and tries the separators ``"\\n\\n"``, ``"\\n"``, ``" "`` and ``""``
    in that order, then runs it over ``documents``. A one-line summary is
    printed, followed by a preview of the first chunk when there is one.

    Args:
        documents: Documents to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )
    chunker = RecursiveCharacterTextSplitter(**splitter_options)

    pieces = chunker.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not pieces:
        return pieces

    first_piece = pieces[0]
    preview = first_piece.page_content[:200]
    print("\nExample chunk: ")
    print(f"Content: {preview}...")
    print(f"Metadata: {first_piece.metadata}")

    return pieces

In [30]:
chunks = split_documents(all_pdf_documents)
# Show a short preview rather than the repr of every chunk.
chunks[:3]

Split 132 documents into 428 chunks

Example chunk: 
Content: Review Manuscript
The Effectiveness of Women and Girls Safe
Spaces: A Systematic Review of Evidence
to Address Violence Against Women
and Girls in Humanitarian Contexts
Lindsay Stark1 , Mackenzie V. R...
Metadata: {'producer': 'Acrobat Distiller 10.0.0 (Windows); modified using iText 4.2.0 by 1T3XT', 'creator': 'Arbortext Advanced Print Publisher 9.0.114/W', 'creationdate': '2021-02-19T12:14:30+05:30', 'keywords': 'safe spaces,evaluation,conflict,gender-based violence,systematic review', 'moddate': '2021-02-25T03:55:31-08:00', 'subject': 'Trauma, Violence, & Abuse 0.0:1524838021991306', 'author': 'Lindsay Stark, Mackenzie V. Robinson, Ilana Seff, Alli Gillespie, Jonathan Colarelli, and Debbie Landis', 'title': 'The Effectiveness of Women and Girls Safe Spaces: A Systematic Review of Evidence to Address Violence Against Women and Girls in Humanitarian Contexts', 'source': '..\\data\\pdf\\1524838021991306.pdf', 'total_pages': 

[Document(metadata={'producer': 'Acrobat Distiller 10.0.0 (Windows); modified using iText 4.2.0 by 1T3XT', 'creator': 'Arbortext Advanced Print Publisher 9.0.114/W', 'creationdate': '2021-02-19T12:14:30+05:30', 'keywords': 'safe spaces,evaluation,conflict,gender-based violence,systematic review', 'moddate': '2021-02-25T03:55:31-08:00', 'subject': 'Trauma, Violence, & Abuse 0.0:1524838021991306', 'author': 'Lindsay Stark, Mackenzie V. Robinson, Ilana Seff, Alli Gillespie, Jonathan Colarelli, and Debbie Landis', 'title': 'The Effectiveness of Women and Girls Safe Spaces: A Systematic Review of Evidence to Address Violence Against Women and Girls in Humanitarian Contexts', 'source': '..\\data\\pdf\\1524838021991306.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': '1524838021991306.pdf', 'file_type': 'pdf'}, page_content='Review Manuscript\nThe Effectiveness of Women and Girls Safe\nSpaces: A Systematic Review of Evidence\nto Address Violence Against Women\nand Girls i

In [None]:

#%pip install tf-keras

import numpy as np 
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store the model name and load the model immediately.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate ``SentenceTransformer`` for ``self.model_name``.

        Progress and the embedding dimension are printed; a failure is printed
        and then re-raised unchanged.
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Sized collection of strings to embed.

        Returns:
            The array returned by ``model.encode``. For empty input the model
            is not called and an empty ``(0, dim)`` float32 array is returned,
            where ``dim`` is the model's reported embedding dimension (0 if
            the model reports none).

        Raises:
            ValueError: If no model has been loaded.
        """
        # Identity check, not truthiness: a loaded model object may define
        # __len__/__bool__ and evaluate as falsy even though it is usable.
        if self.model is None:
            raise ValueError("Model not loaded.")

        print(f'Generating embeddings for {len(texts)} texts...')
        if len(texts) == 0:
            # Skip the encoder and still hand back a 2-D array so callers can
            # rely on ``.shape`` and ``len()``.
            dimension = self.model.get_sentence_embedding_dimension() or 0
            embeddings = np.empty((0, dimension), dtype=np.float32)
        else:
            embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f'Generated embeddings with shape: {embeddings.shape}')
        return embeddings
    
# Build the embedder once; the model name is spelled out so the choice is visible here.
embedding_manager = EmbeddingManager(model_name="all-MiniLM-L6-v2")
embedding_manager

    

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x267556aa850>

In [21]:
class VectorStore:
    """Persistent ChromaDB collection holding document chunks and their embeddings."""

    def __init__(self, collection_name: str="pdf_documents", persist_directory: str="../data/vector_store"):
        """Remember the configuration and open (or create) the collection.

        Args:
            collection_name: Name of the Chroma collection to use.
            persist_directory: Directory in which Chroma persists its data.

        Raises:
            Exception: Whatever directory creation or Chroma raises during setup.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persist directory, open the client and get/create the collection.

        A failure is printed and then re-raised unchanged.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                # Kept verbatim: this text is stored with the collection.
                metadata={"description": "PDF document embeddingd for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    @staticmethod
    def _sanitize_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``metadata`` restricted to str/int/float/bool values.

        ``None`` values are dropped and any other value is stringified, since
        Chroma rejects metadata values outside those primitive types.
        """
        cleaned = {}
        for key, value in metadata.items():
            if value is None:
                continue
            if isinstance(value, (str, int, float, bool)):
                cleaned[str(key)] = value
            else:
                cleaned[str(key)] = str(value)
        return cleaned

    @staticmethod
    def _make_id(doc: Any, index: int) -> str:
        """Derive a stable id from the document's source, page, position and text."""
        meta = doc.metadata
        key = "|".join([
            str(meta.get('source', '')),
            str(meta.get('page', '')),
            str(index),
            doc.page_content,
        ])
        return f"doc_{uuid.uuid5(uuid.NAMESPACE_URL, key).hex}_{index}"

    def add_documents(self, documents: List[Any], embeddings: np.ndarray, batch_size: int = 1000):
        """Store documents and their embeddings in the collection.

        Ids are derived deterministically from each document's ``source`` and
        ``page`` metadata, its position in ``documents`` and its text, and the
        rows are written with ``upsert``. Re-running the same pipeline
        therefore overwrites the existing rows instead of appending a second
        copy of the corpus to the persistent store.

        Each stored metadata dict is a sanitized copy of ``doc.metadata`` plus
        ``doc_index`` (position within this call) and ``content_length``
        (``len(doc.page_content)``). The input documents are not modified.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order.
            batch_size: Maximum number of rows sent to Chroma per request.

        Raises:
            ValueError: If the lengths differ or ``batch_size`` is below 1.
            Exception: Whatever the collection raises while writing (re-raised
                after being printed).
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if len(documents) == 0:
            # Avoid sending an empty request to the collection.
            print("No documents to add to vector store")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            ids.append(self._make_id(doc, i))

            metadata = self._sanitize_metadata(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            # asarray() lets plain lists work as well as numpy rows.
            embeddings_list.append(np.asarray(embedding).tolist())

        try:
            # Write in bounded batches so large corpora stay within the
            # per-request size the backend accepts.
            for start in range(0, len(ids), batch_size):
                stop = start + batch_size
                self.collection.upsert(
                    ids=ids[start:stop],
                    embeddings=embeddings_list[start:stop],
                    metadatas=metadatas[start:stop],
                    documents=documents_text[start:stop]
                )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Open (or create) the persistent collection; both settings are spelled out for visibility.
vectorstore = VectorStore(
    collection_name="pdf_documents",
    persist_directory="../data/vector_store",
)
vectorstore



Vector store initalized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x267556b09d0>

In [31]:
# Preview a few chunks only; the full list is hundreds of long reprs.
chunks[:3]

[Document(metadata={'producer': 'Acrobat Distiller 10.0.0 (Windows); modified using iText 4.2.0 by 1T3XT', 'creator': 'Arbortext Advanced Print Publisher 9.0.114/W', 'creationdate': '2021-02-19T12:14:30+05:30', 'keywords': 'safe spaces,evaluation,conflict,gender-based violence,systematic review', 'moddate': '2021-02-25T03:55:31-08:00', 'subject': 'Trauma, Violence, & Abuse 0.0:1524838021991306', 'author': 'Lindsay Stark, Mackenzie V. Robinson, Ilana Seff, Alli Gillespie, Jonathan Colarelli, and Debbie Landis', 'title': 'The Effectiveness of Women and Girls Safe Spaces: A Systematic Review of Evidence to Address Violence Against Women and Girls in Humanitarian Contexts', 'source': '..\\data\\pdf\\1524838021991306.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': '1524838021991306.pdf', 'file_type': 'pdf'}, page_content='Review Manuscript\nThe Effectiveness of Women and Girls Safe\nSpaces: A Systematic Review of Evidence\nto Address Violence Against Women\nand Girls i

In [33]:
# Pull the raw text out of every chunk, embed it, then store chunk + vector together.
texts = [chunk.page_content for chunk in chunks]

embeddings = embedding_manager.generate_embeddings(texts=texts)

vectorstore.add_documents(documents=chunks, embeddings=embeddings)

Generating embeddings for 428 texts...


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Generated embeddings with shape: (428, 384)
Adding 428 documents to vector store...


: 