In [26]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path


In [27]:

### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    print("Pdf directory : ", pdf_dir)

    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print("List of PDF files found: ", pdf_files)
    
    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files: 
        ## Load the pdf file. 
        pdf_loader = PyMuPDFLoader(str(pdf_file)) 
        documents = pdf_loader.load() 

        for doc in documents:
            doc.metadata['source'] = str(pdf_file) 
            doc.metadata['file_type'] = 'pdf' 

        print(f"Loaded {len(documents)} documentsfrom file - {pdf_file}")

        all_documents.extend(documents)

    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

all_pdf_documents

Pdf directory :  ..\data
List of PDF files found:  [WindowsPath('../data/pdf/Edurica.pdf'), WindowsPath('../data/pdf/Ilinois_Pgp.pdf'), WindowsPath('../data/pdf/MicrosoftAIEndProgram.pdf')]
Found 3 PDF files to process
Loaded 34 documentsfrom file - ..\data\pdf\Edurica.pdf
Loaded 16 documentsfrom file - ..\data\pdf\Ilinois_Pgp.pdf
Loaded 26 documentsfrom file - ..\data\pdf\MicrosoftAIEndProgram.pdf


[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-05T19:11:08+05:30', 'source': '..\\data\\pdf\\Edurica.pdf', 'file_path': '..\\data\\pdf\\Edurica.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': 'Devyanshi Shukla', 'subject': '', 'keywords': '', 'moddate': '2025-08-05T19:11:08+05:30', 'trapped': '', 'modDate': "D:20250805191108+05'30'", 'creationDate': "D:20250805191108+05'30'", 'page': 0, 'file_type': 'pdf'}, page_content='www.edureka.co \n© Brain4ce Education Solutions Pvt. Ltd. All rights Reserved.\nedureka! \nAbout Edureka \nEdureka is one of the world’s largest and most effective online education platform for \ntechnology professionals. In a span of 10 years, 100,000+ students from over 176 \ncountries have upskilled themselves with the help of our online courses. Since our \ninception, we have been dedicated to helping technology professionals from all corners \nof the

In [28]:
def split_documents(documents, chunk_size=100, chunk_overlap=50) -> list:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)

    print (f"Split {len(documents)} documents into {len(split_docs)} chunks") 

    return split_docs

In [29]:
#from langchain_core.documents import Document  

## Text loader 
#from langchain.document_loaders import TextLoader 
#loader = TextLoader("../data/textfiles/python_programming_instruction.txt", encoding="utf8")
#documents = loader.load() 

#print(documents)

#all_pdf_documents
#print(len(all_pdf_documents))

## Split the documents into smaller chunks 

split_pdf_documents = split_documents(all_pdf_documents)
split_pdf_documents


Split 76 documents into 1037 chunks


[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-05T19:11:08+05:30', 'source': '..\\data\\pdf\\Edurica.pdf', 'file_path': '..\\data\\pdf\\Edurica.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': 'Devyanshi Shukla', 'subject': '', 'keywords': '', 'moddate': '2025-08-05T19:11:08+05:30', 'trapped': '', 'modDate': "D:20250805191108+05'30'", 'creationDate': "D:20250805191108+05'30'", 'page': 0, 'file_type': 'pdf'}, page_content='www.edureka.co \n© Brain4ce Education Solutions Pvt. Ltd. All rights Reserved.\nedureka!'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-05T19:11:08+05:30', 'source': '..\\data\\pdf\\Edurica.pdf', 'file_path': '..\\data\\pdf\\Edurica.pdf', 'total_pages': 34, 'format': 'PDF 1.7', 'title': '', 'author': 'Devyanshi Shukla', 'subject': '', 'keywords': '', 'moddate

In [30]:
### Create Embeddings 
from sentence_transformers import SentenceTransformer

def performEmbeddings(documents: list): 
    ## Load embeddings model. 
    embeddingsModel = SentenceTransformer("all-MiniLM-L6-v2")

    ## Feed chunks and create embeddings. 
    embeddings = embeddingsModel.encode([doc.page_content for doc in documents], show_progress_bar=True)

    ## Return embeddings object. 

    print("Count of embeddings created : ", len(embeddings))
    print(f"Generated embeddings with shape: {embeddings.shape}")

    return embeddings

In [31]:
embeddings = performEmbeddings(split_pdf_documents)


Batches: 100%|██████████| 33/33 [00:03<00:00, 10.21it/s]

Count of embeddings created :  1037
Generated embeddings with shape: (1037, 384)





In [32]:
class EmbeddingManager: 
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = model_name 
        self.embeddingModel = self.loadModel()
        return 
    
    ### Load model.  
    def loadModel(self):
        try: 
            print("Loading model : ", self.model)
            self.embeddingsModel = SentenceTransformer(self.model)
        except Exception as e:
            print("Exception in loading model : ", e)
            print("Failed to load model.")
            raise e

        return self.embeddingsModel
    
    ### Create embeddings model. 
    def createEmbeddings(self, documents, chunk_size=100, chunk_overlap=50) -> list: 

        if not self.embeddingModel: 
            print("Embeddings model is not loaded. Please load the model first.")
            raise Exception("Embeddings model is not loaded. Please load the model first.")

        embeddings = self.embeddingModel.encode([doc.page_content for doc in documents], show_progress_bar=True)
        print("Generated Embeddings with shape: ", embeddings.shape)
        return embeddings


In [33]:
embeddingManager = EmbeddingManager()
embeddings = embeddingManager.createEmbeddings(split_pdf_documents)


Loading model :  all-MiniLM-L6-v2


Batches: 100%|██████████| 33/33 [00:02<00:00, 11.20it/s]

Generated Embeddings with shape:  (1037, 384)





In [34]:
print(embeddings.shape)
print(len(embeddings))
print(len(split_pdf_documents))

embeddings
#split_pdf_documents

(1037, 384)
1037
1037


array([[ 0.07163063,  0.05930459, -0.01320795, ...,  0.03590067,
        -0.02970807,  0.00240865],
       [ 0.00079836,  0.07922199,  0.01025009, ...,  0.01030861,
        -0.01477767,  0.03909266],
       [-0.00877913,  0.00748089, -0.04733979, ..., -0.01977468,
         0.01310607,  0.06669142],
       ...,
       [-0.05545438, -0.00760634, -0.06299905, ...,  0.01518496,
        -0.10769875,  0.0537996 ],
       [-0.08911707, -0.01486699, -0.05484404, ...,  0.00894779,
        -0.07535298,  0.01340303],
       [-0.12109854,  0.0203612 , -0.05038118, ..., -0.04037002,
        -0.03239254, -0.02835743]], shape=(1037, 384), dtype=float32)

In [38]:
#doc_texts = [doc.page_content for doc in split_pdf_documents] 
#doc_ids = [f"doc_{i}" for i in range(len(doc_texts))] 

## Prepare Metadatas. 
metadatas = [] 
i=1 

for doc in split_pdf_documents:
    metadata = {
        "doc_id": f"doc_{i}", 
        "source": doc.metadata.get("source", "N/A"), 
        "file_type": doc.metadata.get("file_type", "N/A")
    }
    i += 1
    metadatas.append(metadata)

metadatas


[{'doc_id': 'doc_1',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_2',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_3',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_4',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_5',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_6',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_7',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_8',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_9',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_10',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_11',
  'source': '..\\data\\pdf\\Edurica.pdf',
  'file_type': 'pdf'},
 {'doc_id': 'doc_12',
  'source': '..\\data\\pdf\\Ed

In [None]:
document_texts = [doc.page_content for doc in split_pdf_documents]
document_ids = [metadata["doc_id"] for metadata in metadatas]

document_ids


['doc_1',
 'doc_2',
 'doc_3',
 'doc_4',
 'doc_5',
 'doc_6',
 'doc_7',
 'doc_8',
 'doc_9',
 'doc_10',
 'doc_11',
 'doc_12',
 'doc_13',
 'doc_14',
 'doc_15',
 'doc_16',
 'doc_17',
 'doc_18',
 'doc_19',
 'doc_20',
 'doc_21',
 'doc_22',
 'doc_23',
 'doc_24',
 'doc_25',
 'doc_26',
 'doc_27',
 'doc_28',
 'doc_29',
 'doc_30',
 'doc_31',
 'doc_32',
 'doc_33',
 'doc_34',
 'doc_35',
 'doc_36',
 'doc_37',
 'doc_38',
 'doc_39',
 'doc_40',
 'doc_41',
 'doc_42',
 'doc_43',
 'doc_44',
 'doc_45',
 'doc_46',
 'doc_47',
 'doc_48',
 'doc_49',
 'doc_50',
 'doc_51',
 'doc_52',
 'doc_53',
 'doc_54',
 'doc_55',
 'doc_56',
 'doc_57',
 'doc_58',
 'doc_59',
 'doc_60',
 'doc_61',
 'doc_62',
 'doc_63',
 'doc_64',
 'doc_65',
 'doc_66',
 'doc_67',
 'doc_68',
 'doc_69',
 'doc_70',
 'doc_71',
 'doc_72',
 'doc_73',
 'doc_74',
 'doc_75',
 'doc_76',
 'doc_77',
 'doc_78',
 'doc_79',
 'doc_80',
 'doc_81',
 'doc_82',
 'doc_83',
 'doc_84',
 'doc_85',
 'doc_86',
 'doc_87',
 'doc_88',
 'doc_89',
 'doc_90',
 'doc_91',
 'doc_92

In [None]:
import chromadb

## Create a collection. 
persist_directory = '../data/vectorstore' 
os.makedirs(persist_directory, exist_ok=True) 

vectordbclient = chromadb.PersistentClient(path=persist_directory)
collection = vectordbclient.get_or_create_collection(name="pdf_rag", metadata={"description":"RAG store for pdf documents"})

print("Available collections:", vectordbclient.list_collections())

### Store embeddings inside collection. 
## Prepare document list 

collection.add(
    embeddings=embeddings.tolist(), 
    documents=document_texts, 
    metadatas=metadatas, 
    ids=document_ids
)







Available collections: [Collection(name=pdf_rag)]


Collection(name=pdf_rag)

In [58]:
### Query the collection. 

inputQuery = "Which programs are available to learn Generative and Agentic AI"

results = collection.query(
    query_texts=[inputQuery],
    n_results=3
)

results 



C:\Users\jsonchhatra\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [01:13<00:00, 1.13MiB/s]


{'ids': [['doc_16', 'doc_356', 'doc_364']],
 'embeddings': None,
 'documents': [['implement them in real-world applications. This generative AI online course includes',
   '• AI Agents vs. Agentic AI \n• Comparison: Agentic AI, Generative AI, and Traditional AI',
   '• AI Agents vs. Agentic AI \n• Comparison: Agentic AI, Generative AI, and Traditional AI']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'doc_id': 'doc_16',
    'file_type': 'pdf',
    'source': '..\\data\\pdf\\Edurica.pdf'},
   {'file_type': 'pdf',
    'source': '..\\data\\pdf\\Edurica.pdf',
    'doc_id': 'doc_356'},
   {'file_type': 'pdf',
    'source': '..\\data\\pdf\\Edurica.pdf',
    'doc_id': 'doc_364'}]],
 'distances': [[0.518720805644989, 0.5390955805778503, 0.5390955805778503]]}

In [66]:
### Query the collection using Embeddings. 

inputQuery = "Which programs are available to learn Generative and Agentic AI"

queryEmbedding = embeddingManager.embeddingModel.encode([inputQuery]) 
queryEmbedding

results = collection.query(
    query_embeddings=queryEmbedding.tolist(), 
    n_results=50, 
    include=["metadatas", "documents", "distances"]
) 

#results

print("Query results:")
for i, (doc, metadata, distance) in enumerate(zip(results['documents'][0], results['metadatas'][0], results['distances'][0])):
    print(f"\n--- Result {i+1} (Distance: {distance:.4f}) ---")
    print(f"Source: {metadata['source']}")
    print(f"Content: {doc}...")  # Show first 200 characters  





Query results:

--- Result 1 (Distance: 0.5187) ---
Source: ..\data\pdf\Edurica.pdf
Content: implement them in real-world applications. This generative AI online course includes...

--- Result 2 (Distance: 0.5391) ---
Source: ..\data\pdf\Edurica.pdf
Content: • AI Agents vs. Agentic AI 
• Comparison: Agentic AI, Generative AI, and Traditional AI...

--- Result 3 (Distance: 0.5391) ---
Source: ..\data\pdf\Edurica.pdf
Content: • AI Agents vs. Agentic AI 
• Comparison: Agentic AI, Generative AI, and Traditional AI...

--- Result 4 (Distance: 0.5657) ---
Source: ..\data\pdf\MicrosoftAIEndProgram.pdf
Content: Master Class on Agentic AI Solutions 
Using Copilot Studio and AutoGen
Advanced 
Generative AI...

--- Result 5 (Distance: 0.5657) ---
Source: ..\data\pdf\MicrosoftAIEndProgram.pdf
Content: Master Class on Agentic AI Solutions Using Copilot Studio and AutoGen
Advanced Generative AI...

--- Result 6 (Distance: 0.5869) ---
Source: ..\data\pdf\MicrosoftAIEndProgram.pdf
Content: Attend indu