In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

### Read all the PDFs inside the directory
def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into one flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    loaded with ``PyMuPDFLoader``; every document it yields is tagged with
    ``source`` (the file path as a string) and ``file_type`` (``'pdf'``)
    metadata before being collected.

    Args:
        pdf_directory: Directory to scan (anything accepted by ``Path``).

    Returns:
        list: All loaded documents, grouped per file in sorted path order.
        Empty when no PDF files are found.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)
    print("Pdf directory : ", pdf_dir)

    # Find all PDF files recursively. The matches are sorted because glob
    # does not promise any particular order, and a stable order keeps the
    # resulting document list reproducible between runs and machines.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))
    print("List of PDF files found: ", pdf_files)

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        # Load the PDF file.
        documents = PyMuPDFLoader(str(pdf_file)).load()

        # Tag each document so later stages can trace it back to its file.
        for doc in documents:
            doc.metadata['source'] = str(pdf_file)
            doc.metadata['file_type'] = 'pdf'

        print(f"Loaded {len(documents)} documents from file - {pdf_file}")

        all_documents.extend(documents)

    return all_documents

# Process all PDFs in the data directory ("../data" is a relative path).
all_pdf_documents = process_all_pdfs("../data")

# Bare expression: the whole document list is rendered as this cell's output.
all_pdf_documents

Pdf directory :  ..\data
List of PDF files found:  [WindowsPath('../data/pdf/Edurica.pdf'), WindowsPath('../data/pdf/Ilinois_Pgp.pdf'), WindowsPath('../data/pdf/MicrosoftAIEndProgram.pdf')]
Found 3 PDF files to process
Loaded 34 documentsfrom file - ..\data\pdf\Edurica.pdf
Loaded 16 documentsfrom file - ..\data\pdf\Ilinois_Pgp.pdf
Loaded 26 documentsfrom file - ..\data\pdf\MicrosoftAIEndProgram.pdf


[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-05T19:11:08+05:30', 'source': '..\\data\\pdf\\Edurica.pdf', 'file_path': '..\\data\\pdf\\Edurica.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': 'Devyanshi Shukla', 'subject': '', 'keywords': '', 'moddate': '2025-08-05T19:11:08+05:30', 'trapped': '', 'modDate': "D:20250805191108+05'30'", 'creationDate': "D:20250805191108+05'30'", 'page': 0, 'file_type': 'pdf'}, page_content='www.edureka.co \n© Brain4ce Education Solutions Pvt. Ltd. All rights Reserved.\nedureka! \nAbout Edureka \nEdureka is one of the world’s largest and most effective online education platform for \ntechnology professionals. In a span of 10 years, 100,000+ students from over 176 \ncountries have upskilled themselves with the help of our online courses. Since our \ninception, we have been dedicated to helping technology professionals from all corners \nof the

In [4]:
def split_documents(documents, chunk_size=100, chunk_overlap=50) -> list:
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``; lengths are
            measured with ``len``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        list: The chunks produced by the splitter's ``split_documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators are handed over in this order: paragraph break, line
        # break, space, then the empty string.
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

    # Report how many chunks the input produced.
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")

    return chunks

In [None]:
# Split the loaded PDF documents into chunks (default chunk size and overlap).
split_pdf_documents = split_documents(all_pdf_documents)
# Bare expression: the chunk list is rendered as this cell's output.
split_pdf_documents

Split 76 documents into 1037 chunks


[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-05T19:11:08+05:30', 'source': '..\\data\\pdf\\Edurica.pdf', 'file_path': '..\\data\\pdf\\Edurica.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': 'Devyanshi Shukla', 'subject': '', 'keywords': '', 'moddate': '2025-08-05T19:11:08+05:30', 'trapped': '', 'modDate': "D:20250805191108+05'30'", 'creationDate': "D:20250805191108+05'30'", 'page': 0, 'file_type': 'pdf'}, page_content='www.edureka.co \n© Brain4ce Education Solutions Pvt. Ltd. All rights Reserved.\nedureka!'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-05T19:11:08+05:30', 'source': '..\\data\\pdf\\Edurica.pdf', 'file_path': '..\\data\\pdf\\Edurica.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': 'Devyanshi Shukla', 'subject': '', 'keywords': '', 'moddate

In [9]:
### Create Embeddings 
from sentence_transformers import SentenceTransformer

class EmbeddingManager:
    """Load a SentenceTransformer model once and use it to embed documents.

    Attributes:
        model: Name of the model handed to ``SentenceTransformer``.
        embeddingModel: The loaded model instance used for encoding.
        embeddingsModel: Alias of ``embeddingModel``; both spellings are kept
            in sync so existing code using either one keeps working.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store the model name and load the model immediately.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading.
        """
        self.model = model_name
        self.embeddingModel = None
        self.embeddingsModel = None
        self.loadModel()

    ### Load model.
    def loadModel(self):
        """Instantiate the model named by ``self.model`` and store it.

        Both ``embeddingModel`` and ``embeddingsModel`` are updated, so calling
        this again (e.g. after changing ``self.model``) also changes the model
        used by ``createEmbeddings``.

        Returns:
            The loaded model instance.

        Raises:
            Exception: Re-raised unchanged after the failure is printed.
        """
        try:
            print("Loading model : ", self.model)
            loaded_model = SentenceTransformer(self.model)
        except Exception as e:
            print("Exception in loading model : ", e)
            print("Failed to load model.")
            # Bare raise preserves the original exception and traceback.
            raise

        self.embeddingModel = loaded_model
        self.embeddingsModel = loaded_model
        return loaded_model

    ### Create embeddings.
    def createEmbeddings(self, documents, chunk_size=100, chunk_overlap=50) -> list:
        """Encode the ``page_content`` of each document with the loaded model.

        Args:
            documents: Iterable of objects exposing ``page_content``.
            chunk_size: Unused; retained so existing callers keep working.
            chunk_overlap: Unused; retained so existing callers keep working.

        Returns:
            Whatever the model's ``encode`` returns for the list of texts
            (it must expose ``.shape``, which is printed).

        Raises:
            RuntimeError: If no model has been loaded.
        """
        # Compare against None rather than relying on truthiness: a model
        # object may define __len__/__bool__, which says nothing about
        # whether it was loaded.
        if getattr(self, "embeddingModel", None) is None:
            print("Embeddings model is not loaded. Please load the model first.")
            raise RuntimeError("Embeddings model is not loaded. Please load the model first.")

        texts = [doc.page_content for doc in documents]
        embeddings = self.embeddingModel.encode(texts, show_progress_bar=True)
        print("Generated Embeddings with shape: ", embeddings.shape)
        return embeddings

In [10]:
# Build the manager with its default model name; the model is loaded here.
embeddingManager = EmbeddingManager()
# The returned embeddings are not assigned; they appear only as this cell's output.
embeddingManager.createEmbeddings(split_pdf_documents)

Loading model :  all-MiniLM-L6-v2


Batches: 100%|██████████| 33/33 [00:02<00:00, 11.92it/s]

Generated Embeddings with shape:  (1037, 384)





array([[ 0.07163063,  0.05930459, -0.01320795, ...,  0.03590067,
        -0.02970807,  0.00240865],
       [ 0.00079836,  0.07922199,  0.01025009, ...,  0.01030861,
        -0.01477767,  0.03909266],
       [-0.00877913,  0.00748089, -0.04733979, ..., -0.01977468,
         0.01310607,  0.06669142],
       ...,
       [-0.05545438, -0.00760634, -0.06299905, ...,  0.01518496,
        -0.10769875,  0.0537996 ],
       [-0.08911707, -0.01486699, -0.05484404, ...,  0.00894779,
        -0.07535298,  0.01340303],
       [-0.12109854,  0.0203612 , -0.05038118, ..., -0.04037002,
        -0.03239254, -0.02835743]], shape=(1037, 384), dtype=float32)