In [1]:
import os
import re
import uuid
from typing import List, Dict
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain_huggingface import HuggingFaceEmbeddings
from rank_bm25 import BM25Okapi
import numpy as np
import faiss
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATA_FOLDERS = ["/Users/saurabhjain/Desktop/RAG/PDF_parser/output_extracted", "/Users/saurabhjain/Desktop/RAG/PDF_parser/segmented_reports"]

# -----------------------------
# Load all text & md into Documents
# -----------------------------
def load_documents(folders: List[str]) -> List[Document]:
    docs = []
    for folder in folders:
        for file in os.listdir(folder):
            if file.endswith(".txt") or file.endswith(".md"):
                file_path = os.path.join(folder, file)
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                doc = Document(
                    page_content=content,
                    metadata={"source_file": file, "folder": folder}
                )
                docs.append(doc)
    return docs

# -----------------------------
# Chunk documents
# -----------------------------
def chunk_documents(docs: List[Document], chunk_size: int, chunk_overlap: int = 50) -> List[Document]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len  # by default counts characters, can swap with token counter
    )
    chunks = []
    for doc in docs:
        split_docs = splitter.split_documents([doc])
        for i, chunk in enumerate(split_docs):
            # assign unique ID
            chunk.metadata.update({
                "chunk_id": str(uuid.uuid4()),
                "chunk_index": i,
                "chunk_size": chunk_size,
                "parent_source": doc.metadata["source_file"],
                "folder": doc.metadata["folder"]
            })
            chunks.append(chunk)
    return chunks



In [3]:
# -----------------------------
# Embeddings: MiniLM
# -----------------------------
def embed_documents(docs: List[Document], model_name="all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    embeddings = model.encode([doc.page_content for doc in docs], show_progress_bar=True)
    return embeddings

In [None]:
### Huggingface And OpenAI Models

## Initialize a simple Embedding model(no API Key needed!)
embeddings=HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
embeddings




HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [5]:
# -----------------------------
# Build FAISS Index (Dense Vector)
# -----------------------------
def build_faiss_index(embeddings: np.ndarray, docs: List[Document], faiss_path="faiss_index.bin"):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(embeddings))
    
    # Save FAISS + docs
    faiss.write_index(index, faiss_path)
    with open(faiss_path.replace(".bin", "_docs.pkl"), "wb") as f:
        pickle.dump(docs, f)
    
    print(f"✅ FAISS index built with {len(docs)} docs")
    return index

In [6]:

# -----------------------------
# Build Sparse Index (TF-IDF + BM25)
# -----------------------------
def build_sparse_indexes(docs: List[Document], output_prefix="sparse_index"):
    texts = [doc.page_content for doc in docs]
    
    # TF-IDF
    tfidf = TfidfVectorizer(stop_words="english")
    tfidf_matrix = tfidf.fit_transform(texts)
    
    with open(f"{output_prefix}_tfidf.pkl", "wb") as f:
        pickle.dump((tfidf, tfidf_matrix, docs), f)
    
    # BM25
    tokenized_texts = [text.split() for text in texts]
    bm25 = BM25Okapi(tokenized_texts)
    
    with open(f"{output_prefix}_bm25.pkl", "wb") as f:
        pickle.dump((bm25, docs), f)
    
    print(f"✅ Sparse indexes built: TF-IDF & BM25")
    return tfidf, tfidf_matrix, bm25



In [7]:
# -----------------------------
# MAIN
# -----------------------------
if __name__ == "__main__":
    # Load
    docs = load_documents(DATA_FOLDERS)
    print(f"Loaded {len(docs)} raw documents")

    # Create two granularities
    chunks_100 = chunk_documents(docs, chunk_size=100)
    chunks_400 = chunk_documents(docs, chunk_size=400)

    print(f"✅ Created {len(chunks_100)} chunks (100 chars)")
    print(f"✅ Created {len(chunks_400)} chunks (400 chars)")

    # Example inspection
    sample = chunks_100[0]
    print("\nSample Chunk Metadata:", sample.metadata)
    print("Sample Chunk Text:\n", sample.page_content[:300])


Loaded 39 raw documents
✅ Created 6854 chunks (100 chars)
✅ Created 1293 chunks (400 chars)

Sample Chunk Metadata: {'source_file': 'Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_page5_table0.md', 'folder': '/Users/saurabhjain/Desktop/RAG/PDF_parser/output_extracted', 'chunk_id': '1e8a17c1-ce51-4707-ad1d-c17adda6ff99', 'chunk_index': 0, 'chunk_size': 100, 'parent_source': 'Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_page5_table0.md'}
Sample Chunk Text:
 # Table from Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023


In [10]:
# -----------------------------
# MAIN
# -----------------------------
if __name__ == "__main__":
    # Load + chunk
    docs = load_documents(DATA_FOLDERS)
    chunks_400 = chunk_documents(docs, chunk_size=400)
    print(f"Loaded {len(chunks_400)} chunks")
    all_docs = chunks_400
    # Embeddings
    embeddings = embed_documents(chunks_400)

    # Dense index
    index = build_faiss_index(np.array(embeddings), chunks_400, "faiss_index.bin")

    # Sparse indexes
    build_sparse_indexes(chunks_400, "sparse_index")


Loaded 1293 chunks


Batches: 100%|██████████| 41/41 [00:05<00:00,  7.76it/s]


✅ FAISS index built with 1293 docs
✅ Sparse indexes built: TF-IDF & BM25


In [15]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle

def compare_dense_sparse(query, index, docs, bm25, k=3, model_name="all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    query_emb = model.encode([query])
    
    # Dense (FAISS)
    D, I = index.search(np.array(query_emb), k=k)
    
    # Sparse (BM25)
    scores = bm25.get_scores(query.split())
    top_n = np.argsort(scores)[::-1][:k]
    
    print(f"\n🔎 Query: {query}\n")
    print("=== Dense (FAISS Semantic Search) ===\n")
    for rank, idx in enumerate(I[0], start=1):
        print(f"--- Result {rank} (Score={D[0][rank-1]:.4f}) ---")
        print("Metadata:", docs[idx].metadata)
        print("Answer:\n", docs[idx].page_content[:400], "\n")
    
    print("\n=== Sparse (BM25 Keyword Search) ===\n")
    for rank, idx in enumerate(top_n, start=1):
        print(f"--- Result {rank} (Score={scores[idx]:.4f}) ---")
        print("Metadata:", docs[idx].metadata)
        print("Answer:\n", docs[idx].page_content[:400], "\n")


# Example Usage (after reloading FAISS + BM25)
# reload BM25
with open("sparse_index_bm25.pkl", "rb") as f:
    bm25, bm25_docs = pickle.load(f)

query = "What was Pure Storage’s revenue in FY23?"
compare_dense_sparse(query, index, all_docs, bm25, k=3)



🔎 Query: What was Pure Storage’s revenue in FY23?

=== Dense (FAISS Semantic Search) ===

--- Result 1 (Score=0.4687) ---
Metadata: {'source_file': 'Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results.txt', 'folder': '/Users/saurabhjain/Desktop/RAG/PDF_parser/output_extracted', 'chunk_id': '299e5cd6-da33-4033-8215-240c152096e8', 'chunk_index': 64, 'chunk_size': 400, 'parent_source': 'Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results.txt'}
Answer:
 Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results  RESOURCES SUPPORT PARTNERS CONTACT BLOG $ 10,245 (c) 335 (d) 543 (e) INVESTOR HOME NEWS & EVENTS  FINANCIAL INFORMATION  C1O3R,0P6O3RATE(f )GOVERNAN Gross profit -- product $ 1,222,360 68.2 % $ 24,186 $ 1,246 $ 22,630 (c) 1,210 (d) 575 (e) 135 (g) 88 (h) Gross profit -- subscription services $ 

--- Result 2 (Score=0.5243) ---
Metadata: {'

In [18]:
import numpy as np
from sentence_transformers import SentenceTransformer

def hybrid_search(query, index, dense_docs, bm25, sparse_docs, k=5, alpha=0.5, model_name="all-MiniLM-L6-v2"):
    """
    Hybrid retrieval: weighted combination of FAISS (dense) + BM25 (sparse)
    alpha = weight for dense scores (0.0 = pure BM25, 1.0 = pure FAISS)
    """
    model = SentenceTransformer(model_name)
    query_emb = model.encode([query])

    # Dense search (FAISS)
    D, I = index.search(np.array(query_emb), k=len(dense_docs))
    dense_scores = np.zeros(len(dense_docs))
    dense_scores[I[0]] = 1 / (1 + D[0])  # convert distance to similarity

    # Sparse search (BM25)
    sparse_scores = np.array(bm25.get_scores(query.split()))

    # Normalize both to [0,1]
    if dense_scores.max() > 0:
        dense_scores = dense_scores / dense_scores.max()
    if sparse_scores.max() > 0:
        sparse_scores = sparse_scores / sparse_scores.max()

    # Hybrid score
    hybrid_scores = alpha * dense_scores + (1 - alpha) * sparse_scores

    # Get top-k
    top_idx = np.argsort(hybrid_scores)[::-1][:k]

    print(f"\n🔎 Hybrid Search Results for Query: {query}\n")
    for rank, idx in enumerate(top_idx, start=1):
        print(f"--- Result {rank} (Score={hybrid_scores[idx]:.4f}) ---")
        print("Metadata:", dense_docs[idx].metadata)
        print("Answer:\n", dense_docs[idx].page_content[:400], "\n")

    return [dense_docs[idx] for idx in top_idx]

# Example usage
query = "What was Pure Storage’s revenue in FY23?"
results = hybrid_search(query, index, all_docs, bm25, bm25_docs, k=3, alpha=0.5)



🔎 Hybrid Search Results for Query: What was Pure Storage’s revenue in FY23?

--- Result 1 (Score=0.9458) ---
Metadata: {'source_file': 'Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results.txt', 'folder': '/Users/saurabhjain/Desktop/RAG/PDF_parser/output_extracted', 'chunk_id': '49457068-72f3-4f80-bed7-930e49f2b3ba', 'chunk_index': 14, 'chunk_size': 400, 'parent_source': 'Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results.txt'}
Answer:
 gaming, manufacturing, and many more. Industry Recognition and Accolades: In FY24, Pure Storage was recognized as a leader for the tenth consecutive year in the Gartner Magic Quadrant for Primary Storage, and the third consecutive year in the Gartner Magic Quadrant for Distributed File Systems and Object Storage. Additionally, Pure Storage was named a leader in the inaugural IDC MarketSpace: 

--- Result 2 (Score=0.8483) ---
Metadata: {'source_file': '