In [None]:
# Step 1 :- Partitioning the pdf into atomic elements
# Step 2 :-

In [2]:
%pip install -Uq "unstructured[all-docs]"

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install -Uq langchain_chroma

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install -Uq langchain langchain_community langchain-openai
%pip install -Uq python_dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
import json
from typing import List
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

from langchain_core.documents import Document
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [6]:
#Step 1 :- Partitioning the PDF
def partition_document(file_path:str):
    """Extract Elements from the PDF using unstructured"""
    print(f"Partioning document :: " + file_path)

    elements = partition_pdf(
        filename=file_path,
        strategy="hi_res",
        infer_table_structure=True,
        extract_image_block_types=["Image"],
        extract_image_block_to_payload=True
    )
    print(f"Extracted {len(elements)} elements")
    return elements


file_path = './attention_is_all_you_need.pdf'
elements = partition_document(file_path)


Partioning document :: ./attention_is_all_you_need.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Extracted 173 elements


In [7]:
len(elements)

173

In [8]:
def create_chunks_by_title(elements):
    """Create intelligent chunks using title-based strategy"""
    print("ðŸ”¨ Creating smart chunks...")
    
    chunks = chunk_by_title(
        elements, # The parsed PDF elements from previous step
        max_characters=3000, # Hard limit - never exceed 3000 characters per chunk
        new_after_n_chars=2400, # Try to start a new chunk after 2400 characters
        combine_text_under_n_chars=500 # Merge tiny chunks under 500 chars with neighbors
    )
    
    print(f"âœ… Created {len(chunks)} chunks")
    return chunks

# Create chunks 
chunks = create_chunks_by_title(elements)

ðŸ”¨ Creating smart chunks...
âœ… Created 21 chunks


In [13]:
def separate_content_types(chunk):
    """Analyze what types of content are in a chunk"""
    content_data = {
        'text': chunk.text,
        'tables': [],
        'images': [],
        'types': ['text']
    }
    if hasattr(chunk, 'metadata') and hasattr(chunk.metadata,'orig_elements'):
        for element in chunk.metadata.orig_elements:
            element_type = type(element).__name__

            if(element_type == 'Table'):
                content_data['types'].append('table')
                table_html = getattr(element.metadata, 'text_as_html', element.text)
                content_data['tables'].append(table_html)

            elif (element_type == 'Image'):
                if hasattr(element, 'metadata') and hasattr(element.metadata, 'image_base64'):
                    content_data['types'].append('image')
                    content_data['images'].append(element.metadata.image_base64)    


    content_data['types'] = list(set(content_data['types']))
    return content_data            

def create_ai_enhanced_summary(text: str, tables: List[str], images: List[str]) -> str:
    """Create AI-enhanced summary for mixed content"""
    
    try:
        llm = ChatOpenAI(model="gpt-4o", temperature=0)
        
        prompt_text = f"""You are creating a searchable description for document content retrieval.

        CONTENT TO ANALYZE:
        TEXT CONTENT:
        {text}

        """
        
        if tables:
            prompt_text += "TABLES:\n"
            for i, table in enumerate(tables):
                prompt_text += f"Table {i+1}:\n{table}\n\n"
        
                prompt_text += """
                YOUR TASK:
                Generate a comprehensive, searchable description that covers:

                1. Key facts, numbers, and data points from text and tables
                2. Main topics and concepts discussed  
                3. Questions this content could answer
                4. Visual content analysis (charts, diagrams, patterns in images)
                5. Alternative search terms users might use

                Make it detailed and searchable - prioritize findability over brevity.

                SEARCHABLE DESCRIPTION:"""

       
        message_content = [{"type": "text", "text": prompt_text}]
        
        
        for image_base64 in images:
            message_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
            })
        
        
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])
        
        return response.content
        
    except Exception as e:
        print(f" AI summary failed: {e}")
        summary = f"{text[:300]}..."
        if tables:
            summary += f" [Contains {len(tables)} table(s)]"
        if images:
            summary += f" [Contains {len(images)} image(s)]"
        return summary


  


def summarize_chunks(chunks):
    """ Process all chunks with AI Summaries """
    print(" Processing chunks with AI Summaries ...")

    langchain_documents = []
    total_chunks = len(chunks)

    for i,chunk in enumerate(chunks):
        current_chunk = i + 1
        print(f" Processing chunk {current_chunk}/{total_chunks}")

        content_data = separate_content_types(chunk)

        print(f"     Types found: {content_data['types']}")
        print(f"     Tables: {len(content_data['tables'])}, Images: {len(content_data['images'])}")


        if content_data['tables'] or content_data['images']:
            print(f"   Creating AI summary for mixed content...")
            
            enhanced_content = create_ai_enhanced_summary(content_data['images'], content_data['tables'], content_data['text'])
            print(f"     â†’ AI summary created successfully")
            print(f"     â†’ Enhanced content preview: {enhanced_content[:200]}...") 
            

    doc = Document(
           page_content=enhanced_content,
                                         metadata={
                "original_content": json.dumps({
                    "raw_text": content_data['text'],
                    "tables_html": content_data['tables'],
                    "images_base64": content_data['images']
                })
            }
        )
    langchain_documents.append(doc)
    print(f"Processed {len(langchain_documents)} chunks")
    return langchain_documents        




processed_chunks = summarize_chunks(chunks)


 Processing chunks with AI Summaries ...
 Processing chunk 1/21
     Types found: ['text']
     Tables: 0, Images: 0
 Processing chunk 2/21
     Types found: ['text']
     Tables: 0, Images: 0
 Processing chunk 3/21
     Types found: ['text']
     Tables: 0, Images: 0
 Processing chunk 4/21
     Types found: ['text']
     Tables: 0, Images: 0
 Processing chunk 5/21
     Types found: ['image', 'text']
     Tables: 0, Images: 1
   Creating AI summary for mixed content...
 AI summary failed: Error code: 400 - {'error': {'message': 'Request containted 1671 images, max 1500 images allowed per request.', 'type': 'max_images_per_request', 'param': None, 'code': 'max_images_per_request'}}
     â†’ AI summary created successfully
     â†’ Enhanced content preview: ['/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMj...
 Processing chunk 6/21
     Ty

In [17]:
print(processed_chunks) 

[Document(metadata={'original_content': '{"raw_text": "[31] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google\\u2019s neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144, 2016.\\n\\n[32] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with fast-forward connections for neural machine translation. CoRR, abs/1606.04199, 2016.\\n\\n11", "tables_html": [], "images_base64": []}'}, page_content='[]... [Contains 1 table(s)] [Contains 2147 image(s)]')]


In [18]:
def create_vector_store(documents, persist_directory="dbv1/chroma_db"):
    """Create and persist ChromaDB vector store"""
    print("Creating embeddings and storing in ChromaDB...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    print("--- Creating vector store ---")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory, 
        collection_metadata={"hnsw:space": "cosine"}
    )
    print("--- Finished creating vector store ---")
    
    print(f"Vector store created and saved to {persist_directory}")
    return vectorstore


db = create_vector_store(processed_chunks)

Creating embeddings and storing in ChromaDB...
--- Creating vector store ---
--- Finished creating vector store ---
Vector store created and saved to dbv1/chroma_db


In [19]:
query = "What are the two main components of the Transformer architecture? "
retriever = db.as_retriever(search_kwargs={"k": 3})
chunks = retriever.invoke(query)

print(chunks)

[Document(id='18cc78d1-5dc9-4189-9644-0588db48a684', metadata={'original_content': '{"raw_text": "[31] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google\\u2019s neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144, 2016.\\n\\n[32] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with fast-forward connections for neural machine translation. CoRR, abs/1606.04199, 2016.\\n\\n11", "tables_html": [], "images_base64": []}'}, page_content='[]... [Contains 1 table(s)] [Contains 2147 image(s)]')]


In [20]:
def run_complete_ingestion_pipeline(pdf_path:str):
    """Run the complete RAG ingestion pipeline"""
    print("Start RAG Ingestion Pipeline")

    elements = partition_document(pdf_path)

    chunks = create_chunks_by_title(elements)

    summarize = summarize_chunks(chunks)

    db = create_vector_store(summarize, persist_directory="dbv2/chroma_db")

    return db
