In [2]:
import os
import pickle
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import faiss




In [3]:
# 1. Text extraction from each pdf
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped; every page that is kept is followed by a newline.
    """
    with pdfplumber.open(pdf_path) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Consume the generator while the PDF is still open.
        return "".join(content + "\n" for content in extracted if content)

# 2. Load and extract all pdfs from a directory
def load_all_pdfs(pdf_directory):
    """Extract the text of every PDF located directly inside ``pdf_directory``.

    Args:
        pdf_directory: Directory to scan (sub-directories are not visited).

    Returns:
        A list holding one extracted-text string per file whose name ends in
        ``.pdf`` (case-insensitive), ordered by filename.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order. Sorting keeps the
    # document (and therefore chunk) order reproducible across runs and
    # machines, which matters because chunks are later addressed by position.
    for filename in sorted(os.listdir(pdf_directory)):
        if not filename.lower().endswith(".pdf"):
            continue
        full_path = os.path.join(pdf_directory, filename)
        print(f"Processing {full_path}...")
        documents.append(extract_text_from_pdf(full_path))
    return documents

# 3. Chunk into retrievable sections
def chunk_documents(documents, chunk_size=500, chunk_overlap=50):
    """Split every document into chunks and return them as one flat list.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to
    ``RecursiveCharacterTextSplitter``; the defaults are tunable. Chunks
    appear in document order.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return [piece for document in documents for piece in splitter.split_text(document)]

# 4. Chunk Caching
def load_or_generate_chunks(pdf_directory, chunks_cache_file="chunks_cache.pkl", chunk_size=500, chunk_overlap=50):
    """Return the text chunks, reading them from a pickle cache when one exists.

    If ``chunks_cache_file`` is missing, the PDFs in ``pdf_directory`` are
    extracted and chunked, and the result is pickled to that file before being
    returned.

    NOTE: an existing cache file is used as-is; ``pdf_directory``,
    ``chunk_size`` and ``chunk_overlap`` are not consulted in that case, so
    delete the file after changing any of them. The cache is a pickle, so only
    load files you created yourself.
    """
    if not os.path.exists(chunks_cache_file):
        print("No cached chunks found. Processing PDFs to generate chunks...")
        fresh_chunks = chunk_documents(
            load_all_pdfs(pdf_directory),
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        print(f"Caching {len(fresh_chunks)} chunks to {chunks_cache_file}...")
        with open(chunks_cache_file, "wb") as cache_out:
            pickle.dump(fresh_chunks, cache_out)
        return fresh_chunks

    print(f"Loading cached chunks from {chunks_cache_file}...")
    with open(chunks_cache_file, "rb") as cache_in:
        return pickle.load(cache_in)

# 5. Embeddings generation
# Already-loaded models keyed by (model_name, device), so repeated calls
# (e.g. embedding a single query after embedding the corpus) do not reload
# the model from disk each time.
_EMBEDDING_MODELS = {}


def _select_device():
    """Pick the torch device to run on: CUDA first, then Apple MPS, else CPU."""
    if torch.cuda.is_available():
        return "cuda"
    # torch.backends.mps does not exist on older torch builds; look it up
    # defensively instead of raising AttributeError there.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"


def _get_embedding_model(model_name, device):
    """Return the SentenceTransformer for (model_name, device), loading it once."""
    key = (model_name, device)
    if key not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[key] = SentenceTransformer(model_name, device=device)
    return _EMBEDDING_MODELS[key]


def generate_embeddings(text_list, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
    """Encode ``text_list`` with a sentence-transformers model.

    Args:
        text_list: The texts to embed.
        model_name: Model identifier passed to ``SentenceTransformer``.
        batch_size: Batch size passed to ``model.encode``.

    Returns:
        Whatever ``model.encode(..., convert_to_tensor=True)`` returns for the
        given texts (a tensor living on the selected device).
    """
    device = _select_device()
    print(f"Using device: {device}")
    model = _get_embedding_model(model_name, device)
    return model.encode(text_list, batch_size=batch_size, convert_to_tensor=True)

# 6. Load cached embeddings to save time; regenerate them when missing or stale
def load_or_generate_embeddings(chunks, cache_file="embeddings_cache.pkl", model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
    """Return embeddings for ``chunks``, using a pickle cache when it is usable.

    The cache is reused only if its first dimension equals ``len(chunks)``;
    otherwise (or when the file is missing) embeddings are regenerated and the
    cache is rewritten.

    Args:
        chunks: The texts to embed.
        cache_file: Path of the pickle file holding cached embeddings.
        model_name: Forwarded to ``generate_embeddings``.
        batch_size: Forwarded to ``generate_embeddings``.

    Returns:
        The cached or freshly generated embeddings.

    NOTE: the staleness check compares counts only; a different model or
    edited chunk text with the same count is not detected. The cache is a
    pickle, so only load files you created yourself.
    """
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as f:
            embeddings = pickle.load(f)
        if embeddings.shape[0] == len(chunks):
            return embeddings
        print("Mismatch between cached embeddings and current chunk count. Regenerating embeddings.")
    else:
        print("No cached embeddings found, generating new ones...")

    embeddings = generate_embeddings(chunks, model_name=model_name, batch_size=batch_size)
    print(f"Caching embeddings to {cache_file}")
    # Persist a CPU copy: a tensor pickled while on an accelerator (mps/cuda)
    # may fail to unpickle on a machine that lacks that device.
    to_store = embeddings.cpu() if torch.is_tensor(embeddings) else embeddings
    with open(cache_file, "wb") as f:
        pickle.dump(to_store, f)
    return embeddings
        
# 7. FAISS index build for similarity search
def build_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over ``embeddings``.

    Args:
        embeddings: A 2-D torch tensor or array-like with one row per vector.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not 2-D.
    """
    if torch.is_tensor(embeddings):
        # detach() so tensors that track gradients can still be converted.
        embeddings_np = embeddings.detach().cpu().numpy()
    else:
        embeddings_np = np.asarray(embeddings)

    if embeddings_np.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dim); got shape {embeddings_np.shape}"
        )

    # FAISS only accepts C-contiguous float32 matrices; float64 arrays or
    # plain Python lists would otherwise be rejected by index.add().
    embeddings_np = np.ascontiguousarray(embeddings_np, dtype=np.float32)

    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)
    return index

In [4]:
def main(pdf_directory="/Users/jananinareshkumar/Desktop/rag/fly",
         query_text="hydraulic failure in left engine",
         k=5):
    """Run the retrieval demo: chunk PDFs, embed them, index them, run one query.

    Args:
        pdf_directory: Directory containing the source PDFs.
        query_text: The sample query to search for.
        k: Number of nearest neighbours to retrieve (capped at the number of
            indexed vectors).

    Prints progress, the matched indices, and the text of each retrieved chunk.
    """
    # Load or generate chunks from the PDFs (using cached chunks if available)
    print("Loading or generating chunks")
    chunks = load_or_generate_chunks(pdf_directory, chunks_cache_file="chunks_cache.pkl", chunk_size=500, chunk_overlap=50)
    print(f"{len(chunks)} chunks loaded.")

    # Generate or load cached embeddings for these chunks
    print("Embedding Generation")
    embeddings = load_or_generate_embeddings(chunks, cache_file="embeddings_cache.pkl")

    # Build the FAISS index
    print("FAISS Index Setup")
    index = build_faiss_index(embeddings)
    print(f"FAISS Index contains {index.ntotal} vectors.")

    # Never ask for more neighbours than there are vectors: FAISS pads the
    # missing results with index -1, and chunks[-1] would silently print the
    # last chunk as if it were a match.
    k = min(k, index.ntotal)
    if k <= 0:
        print("Nothing to search: the index is empty or k is not positive.")
        return

    # Embed the sample query
    query_embedding = generate_embeddings([query_text])

    # Convert the query embedding to a float32 NumPy array, as FAISS requires
    if torch.is_tensor(query_embedding):
        query_np = query_embedding.detach().cpu().numpy()
    else:
        query_np = np.asarray(query_embedding)
    query_np = np.ascontiguousarray(query_np, dtype=np.float32)

    distances, indices = index.search(query_np, k)
    print(f"Top {k} matches", indices)

    # Print the text for each retrieved chunk
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # Defensive: -1 marks "no result" in FAISS output.
            continue
        print(f"\nChunk index {idx} with L2 distance {distance}:")
        print(chunks[idx])
        print("=" * 80)


if __name__ == "__main__":
    main()


Loading or generating chunks
Loading cached chunks from chunks_cache.pkl...
13383 chunks loaded.
Embedding Generation
FAISS Index Setup
FAISS Index contains 13383 vectors.
Using device: mps
Top 5 matches [[4414 3635 3629  252 8764]]

Chunk index 4414 with L2 distance 1.0111110210418701:
Hydraulics
737 Flight Crew Operations Manual
Table of Contents
Intentionally
Blank
13.TOC.2 Copyright ©The Boeing Company. See title page for details.
D6-27370-804-BRI(P2) July 17, 2009
13.1
737 Flight Crew Operations Manual
NLONWC. 1P3R NEoSnS-UNRoErm -a Hl Cydhreacuklliics tPs-uHmypd (rsaiunlgicles)
HYDRAULIC PUMP LOW
LOW
PRESSURE PRESSURE
Condition: The hydraulic pump pressure is low.
1 HYD PUMP switch (affected side) . . . . . . . . . OFF
Note: Loss of an engine-driven hydraulic pump and

Chunk index 3635 with L2 distance 1.056025743484497:
 When CAB PR ΔP < 1 psi:
RAM AIR........................................................................................ON
FOR TRAINING PURPOSES ONLY
ABNORMAL A