RAG Piplelines - Data Ingestion to Vector DB Pipleline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


In [12]:

def process_all_pdfs(pdf_directory):
    """ Process all the pdf files in a directory"""

    all_document = [ ]
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively
    pdf_files = list (pdf_dir.glob('**/*.pdf'))

    print(f"Found {len(pdf_files)} PDF files to process.")

    for pdf_file in pdf_files:
        print(f"\nProcessing {pdf_file.name}")

        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            #Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_document.extend(documents)
            print(f" Loaded {len(documents)} pages")
        except Exception as e:
            print(f" Error: {e}")

    print(f"\n Total documents loaded:{len(all_document)}")
    return all_document

# Process all PDF's in the data directory
all_pdf_documents = process_all_pdfs("../data")


Found 5 PDF files to process.

Processing Research_Paper_on_Artificial_Intelligence.pdf
 Loaded 3 pages

Processing AI_ITS_Application.pdf


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)


 Loaded 5 pages

Processing USA_CHINA_AI.pdf
 Loaded 1 pages

Processing start_hamburg.pdf
 Loaded 4 pages

Processing AI_Applications_in_CyberSecurity.pdf
 Loaded 11 pages

 Total documents loaded:24


In [21]:
# Checking the all pdfs data.
all_pdf_documents

[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft¬Æ Word 2016', 'creationdate': '2023-02-20T14:13:44+00:00', 'title': 'CASE STUDIES JOURNAL VOL-2, ISSUE 6  ISSN (2305-509X)', 'author': 'any', 'moddate': '2023-02-20T14:13:44+00:00', 'source': '../data/pdf/Research_Paper_on_Artificial_Intelligence.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'Research_Paper_on_Artificial_Intelligence.pdf', 'file_type': 'pdf'}, page_content='www.ijecs.in \nInternational Journal Of Engineering And Computer Science \nVolume 12 Issue 02, February 2023, Page No.25654-25656 \nISSN: 2319-7242 DOI: 10.18535/ijecs/v11i02.4671 \n \n \n \n7  Ijecs 02 February Rajiv Gupta Research Paper on Artificial Intelligence \n \nPage | \n25654 \nResearch Paper on Artificial Intelligence \n \nRajiv Gupta \nChandigarh University, Chandigarh,  Haryana \n \n \nAbstract: This branch of computer science is concerned with making computers behave like humans. \nArtificial intelligence in

In [27]:
# Text splitting get into chunks

def split_document(documents, chunk_size=100, chunk_overlap=20):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]

    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(split_docs)} documents into {len(split_docs)} chunks")

    # Show example of a chunk

    if split_docs:
        print(f"\n Example chunks:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata:\n{split_docs[0].metadata}")

    return split_docs


In [28]:
chunks = split_document(all_pdf_documents)

Split 747 documents into 747 chunks

 Example chunks:
Content: www.ijecs.in 
International Journal Of Engineering And Computer Science...
Metadata:
{'producer': 'www.ilovepdf.com', 'creator': 'Microsoft¬Æ Word 2016', 'creationdate': '2023-02-20T14:13:44+00:00', 'title': 'CASE STUDIES JOURNAL VOL-2, ISSUE 6  ISSN (2305-509X)', 'author': 'any', 'moddate': '2023-02-20T14:13:44+00:00', 'source': '../data/pdf/Research_Paper_on_Artificial_Intelligence.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'Research_Paper_on_Artificial_Intelligence.pdf', 'file_type': 'pdf'}
