# In [None]:
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

def _line_font_size(element):
    """Return the font size of a text line, or 0.0 when it is unknown.

    A top-level ``"size"`` entry is used when the line has one. Otherwise the
    largest size found among the line's ``"chars"`` entries is used, so a line
    that only carries per-character attributes can still be classified.
    """
    size = element.get("size")
    if size is not None:
        return size
    char_sizes = [
        char["size"]
        for char in element.get("chars") or ()
        if char.get("size") is not None
    ]
    # Unknown size -> 0.0, which falls below any threshold (plain content).
    return max(char_sizes, default=0.0)


def process_page(page, header_size=14, subheader_size=12):
    """Extract headers, subheaders and body text from a single PDF page.

    Every non-blank text line is classified by its font size: lines at or
    above ``header_size`` are headers, lines at or above ``subheader_size``
    are subheaders, and everything else is body content. Body content is
    tagged with the header and subheader seen most recently on this page.

    Args:
        page: Page object exposing ``page_number`` and
            ``extract_text_lines(extra_attrs=...)`` (a pdfplumber page).
        header_size: Minimum font size for a line to count as a header.
            Adjust to the document being processed.
        subheader_size: Minimum font size for a line to count as a subheader.

    Returns:
        A dict with the keys ``page_number``, ``headers`` (list of str),
        ``subheaders`` (list of str), ``content`` (list of dicts with
        ``text``, ``header`` and ``subheader``), and ``current_header`` /
        ``current_subheader`` holding the last header and subheader seen on
        the page (``None`` if there was none).
    """
    structure = {
        "page_number": page.page_number,
        "headers": [],
        "subheaders": [],
        "content": [],
        "current_header": None,
        "current_subheader": None,
    }

    # Extract text lines together with formatting info.
    elements = page.extract_text_lines(extra_attrs=["size", "fontname"])

    for element in elements:
        text = element["text"].strip()
        if not text:
            continue  # blank lines carry no structure

        font_size = _line_font_size(element)

        if font_size >= header_size:
            structure["headers"].append(text)
            structure["current_header"] = text
            # A new header opens a new section; the old subheader no longer applies.
            structure["current_subheader"] = None
        elif font_size >= subheader_size:
            structure["subheaders"].append(text)
            structure["current_subheader"] = text
        else:
            structure["content"].append({
                "text": text,
                "header": structure["current_header"],
                "subheader": structure["current_subheader"],
            })

    return structure

def _inherit_section_context(structure, header, subheader):
    """Apply the section context carried over from earlier pages to a page.

    ``process_page`` starts every page with no header or subheader, so body
    text that continues a section begun on a previous page would otherwise be
    tagged with ``None``. This fills those gaps in place.
    """
    if header is None and subheader is None:
        return

    for item in structure["content"]:
        if item["header"] is not None:
            # Content is in reading order: once the page's own header has
            # appeared, every later item already carries it.
            break
        item["header"] = header
        if item["subheader"] is None:
            item["subheader"] = subheader

    if structure["current_header"] is None:
        # No header on this page at all: the carried section is still open.
        structure["current_header"] = header
        if structure["current_subheader"] is None:
            structure["current_subheader"] = subheader


def process_document(pdf_path):
    """Process an entire PDF document page by page.

    Each page is analysed with ``process_page``. The header and subheader
    that are still open at the end of one page are carried into the next, so
    content that continues a section across a page break keeps its section
    labels.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list with one page-structure dict per page, in page order.
    """
    document = []
    carried_header = None
    carried_subheader = None
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            structure = process_page(page)
            _inherit_section_context(structure, carried_header, carried_subheader)
            carried_header = structure["current_header"]
            carried_subheader = structure["current_subheader"]
            document.append(structure)
    return document

def build_toc(document_data):
    """Build a flat table of contents from the per-page structures.

    For each page, in order, all of its headers are listed first, followed by
    all of its subheaders. Every entry is a dict with the keys ``title``,
    ``page`` and ``type`` (``"header"`` or ``"subheader"``).
    """
    # (key in the page dict, value stored under "type")
    sections = (("headers", "header"), ("subheaders", "subheader"))

    entries = []
    for page_info in document_data:
        for source_key, entry_type in sections:
            entries.extend(
                {"title": title, "page": page_info["page_number"], "type": entry_type}
                for title in page_info[source_key]
            )
    return entries

def prepare_chunks(document_data, chunk_size=1000, chunk_overlap=200):
    """Split page content into chunks tagged with their own section metadata.

    Consecutive content items that share the same header and subheader are
    joined with spaces and split together, so a chunk never mixes text from
    different sections and its metadata names the section the text actually
    belongs to (rather than whichever header happened to be last on the page).

    Args:
        document_data: List of page-structure dicts, each with
            ``page_number`` and a ``content`` list of dicts holding ``text``
            and, optionally, ``header`` / ``subheader``.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared between neighbouring
            chunks of the same section.

    Returns:
        A list of dicts with ``text`` and ``metadata`` (``page``, ``header``,
        ``subheader``).
    """
    from itertools import groupby

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    def section_key(item):
        return (item.get("header"), item.get("subheader"))

    chunks = []
    for page in document_data:
        # groupby only merges *adjacent* items, which preserves reading order
        # even if the same header text shows up twice on a page.
        for (header, subheader), items in groupby(page["content"], key=section_key):
            section_text = " ".join(item["text"] for item in items)
            for chunk in text_splitter.split_text(section_text):
                chunks.append({
                    "text": chunk,
                    "metadata": {
                        "page": page["page_number"],
                        "header": header,
                        "subheader": subheader,
                    },
                })
    return chunks

def save_to_faiss(chunks, index_path="faiss_index", model_name="all-MiniLM-L6-v2"):
    """Embed the chunks, store them in a FAISS index and save it to disk.

    Args:
        chunks: Non-empty sequence of dicts with ``text`` and ``metadata``.
        index_path: Folder the index is written to via ``save_local``.
        model_name: Name of the HuggingFace embedding model to load.

    Returns:
        The FAISS vector store built from the chunks.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early and clearly instead of loading an embedding model and then
    # erroring somewhere inside the index construction with zero texts
    # (e.g. for a scanned PDF without extractable text).
    if not chunks:
        raise ValueError("Cannot build a FAISS index: no chunks were provided")

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    texts = [chunk["text"] for chunk in chunks]
    metadatas = [chunk["metadata"] for chunk in chunks]

    vector_store = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
    vector_store.save_local(index_path)
    return vector_store

# Main execution flow
def process_pdf(pdf_path):
    """Run the whole pipeline for one PDF and return the vector store.

    The document is parsed, a table of contents and the text chunks are
    derived from it, the chunks are stored in FAISS, and finally the table of
    contents is printed as a two-column listing (title, page).
    """
    pages = process_document(pdf_path)
    toc_entries = build_toc(pages)
    vector_db = save_to_faiss(prepare_chunks(pages))

    # One shared row layout for the column titles and every entry.
    format_row = "{:<50} {:<10}".format

    print("Table of Contents:")
    print(format_row("Title", "Page"))
    print("-" * 60)
    for toc_entry in toc_entries:
        print(format_row(toc_entry["title"], toc_entry["page"]))

    return vector_db