In [4]:
import os
import pickle
import pdfplumber
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import faiss


In [6]:
# 1. Text extraction from each pdf
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

# 2. Load and extract all pdfs from a directory
def load_all_pdfs(pdf_directory):
    documents=[]
    
    for filename in os.listdir(pdf_directory):
        if filename.lower().endswith(".pdf"):
            full_path = os.path.join(pdf_directory, filename)
            print(f"Processing {full_path}...")
            text = extract_text_from_pdf(full_path)
            documents.append(text)
            
    return documents

# 3. Chunk into retrievable sections
def chunk_documents(documents, chunk_size=500, chunk_overlap=50):
    #tunable but works fine eitherways
    text_splitter = CharacterTextSplitter(separator = ". " ,chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks=[]
    for doc in documents:
        doc_chunks = text_splitter.split_text(doc)
        chunks.extend(doc_chunks)
    return chunks

# 4. Chunk Caching
def load_or_generate_chunks(pdf_directory, chunks_cache_file="chunks_cache.pkl", chunk_size=350, chunk_overlap=100):
    if os.path.exists(chunks_cache_file):
        print(f"Loading cached chunks from {chunks_cache_file}...")
        with open(chunks_cache_file, "rb") as f:
            chunks = pickle.load(f)
    else:
        print("No cached chunks found. Processing PDFs to generate chunks...")
        documents = load_all_pdfs(pdf_directory)
        chunks = chunk_documents(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        print(f"Caching {len(chunks)} chunks to {chunks_cache_file}...")
        with open(chunks_cache_file, "wb") as f:
            pickle.dump(chunks, f)
    return chunks

# 5. Embeddings generation
def generate_embeddings(text_list, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")
    model = SentenceTransformer(model_name, device=device)
    
    embeddings = model.encode(text_list, batch_size=batch_size, convert_to_tensor=True)
    return embeddings

# 6. Loading cache embeddings for saving time
def load_or_generate_embeddings(chunks, cache_file="embeddings_cache.pkl", model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as f:
            embeddings = pickle.load(f)
            
        if embeddings.shape[0] != len(chunks):
            print("Mismatch between cached embeddings and current chunk count. Regenerating embeddings.")
            embeddings = generate_embeddings(chunks, model_name=model_name, batch_size = batch_size)
            with open (cache_file, "wb") as f:
                pickle.dump(embeddings, f)
    else:
        print("No cached embeddings found, generating new ones...")
        embeddings = generate_embeddings(chunks, model_name = model_name, batch_size = batch_size)
        print(f"Caching embeddings to {cache_file}")
        with open(cache_file, "wb") as f:
            pickle.dump(embeddings, f)
    
    return embeddings
        
# 7. FAISS index build for similarity search
def build_faiss_index(embeddings):
    if torch.is_tensor(embeddings):
        embeddings_np = embeddings.cpu().numpy()
    else:
        embeddings_np = np.array(embeddings)
        
    dim = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings_np)
    return index

In [9]:
def main():
    pdf_directory = "/Users/jananinareshkumar/Desktop/rag/data"
    
    # Load or generate chunks from the PDFs (using cached chunks if available)
    print("Loading or generating chunks")
    chunks = load_or_generate_chunks(pdf_directory, chunks_cache_file="chunks_cache.pkl", chunk_size=350, chunk_overlap=100)
    print(f"{len(chunks)} chunks loaded.")
    
    # Generate or load cached embeddings for these chunks
    print("Embedding Generation")
    embeddings = load_or_generate_embeddings(chunks, cache_file="embeddings_cache.pkl")
    
    # Build the FAISS index
    print("FAISS Index Setup")
    index = build_faiss_index(embeddings)
    print(f"FAISS Index contains {index.ntotal} vectors.")
    
    # Query the index with a sample query
    query_text = "hydraulic failure in left engine"
    query_embedding = generate_embeddings([query_text])
    
    # Convert the query embedding to a NumPy array for FAISS
    if torch.is_tensor(query_embedding):
        query_np = query_embedding.cpu().numpy()
    else:
        query_np = np.array(query_embedding)
    
    k = 5  # Number of nearest neighbors to retrieve
    distances, indices = index.search(query_np, k)
    print("Top 5 matches", indices)
    
    # Optionally, print the text for each retrieved chunk
    for distance, idx in zip(distances[0], indices[0]):
        print(f"\nChunk index {idx} with L2 distance {distance}:")
        print(chunks[idx])
        print("=" * 80)

if __name__ == "__main__":
    main()


Loading or generating chunks
No cached chunks found. Processing PDFs to generate chunks...
Processing /Users/jananinareshkumar/Desktop/rag/data/airlines-erp-checklist.pdf...
Processing /Users/jananinareshkumar/Desktop/rag/data/Fox-Rothschild-Emergency-Response-Handbook-Jan-2020.pdf...
Processing /Users/jananinareshkumar/Desktop/rag/data/Accident-IncidentPreparedness.pdf...
Processing /Users/jananinareshkumar/Desktop/rag/data/78297d73-7754-426a-9294-0b70beefae74.pdf...
Processing /Users/jananinareshkumar/Desktop/rag/data/737-800 Quick Reference Handbook (QRH).pdf...
Processing /Users/jananinareshkumar/Desktop/rag/data/19_afh_ch18.pdf...
Processing /Users/jananinareshkumar/Desktop/rag/data/aviation-emergency-response-guidebook-2021.pdf...
Processing /Users/jananinareshkumar/Desktop/rag/data/cc3.pdf...
Processing /Users/jananinareshkumar/Desktop/rag/data/ndem_vpn_user_manual.pdf...
Processing /Users/jananinareshkumar/Desktop/rag/data/00_afh_full.pdf...
Processing /Users/jananinareshkumar/

Created a chunk of size 571, which is longer than the specified 350
Created a chunk of size 387, which is longer than the specified 350
Created a chunk of size 360, which is longer than the specified 350
Created a chunk of size 364, which is longer than the specified 350
Created a chunk of size 444, which is longer than the specified 350
Created a chunk of size 476, which is longer than the specified 350
Created a chunk of size 3405, which is longer than the specified 350
Created a chunk of size 1196, which is longer than the specified 350
Created a chunk of size 702, which is longer than the specified 350
Created a chunk of size 808, which is longer than the specified 350
Created a chunk of size 859, which is longer than the specified 350
Created a chunk of size 1826, which is longer than the specified 350
Created a chunk of size 1985, which is longer than the specified 350
Created a chunk of size 948, which is longer than the specified 350
Created a chunk of size 3327, which is longe

Caching 12982 chunks to chunks_cache.pkl...
12982 chunks loaded.
Embedding Generation
No cached embeddings found, generating new ones...
Using device: mps
Caching embeddings to embeddings_cache.pkl
FAISS Index Setup
FAISS Index contains 12982 vectors.
Using device: mps
Top 5 matches [[ 7864 10719  7969  2725  2680]]

Chunk index 7864 with L2 distance 1.1568076610565186:
As a result, failure of the left engine will result in the most asymmetrical
thrust (adverse yaw) as the right engine will be providing the remaining thrust. [Figure 13-12]
13-23
Figure 13-12. Forces created during single-engine operation.
Many twins are designed with a counter-rotating right engine

Chunk index 10719 with L2 distance 1.1573790311813354:
This is
the most important condition, which causes brake system failures. It is not possible to
determine exactly when it occurs.
Information about the brake fluid used by the manufacturer is recorded on a placard
at the firewall in the engine compartment.
WHEELS AND BR

the above set up has the following limitations : 
though there is sufficient response for single basic queries that are obvious, there is only a keyword based response for complex issues like hydraulic failure in the left engine, it addresses hydraulic and left engine separately

this could be beacause of lacking input data
improving it : 
1. change in chunking strategy
2. post process retrieved chunk material by re-ranking as a secondary filter
3. imo a more broad input data should fix it