In [None]:
import fitz  # PyMuPDF

# Open the PDF file with PyMuPDF; `doc` is handed to extract_pdf_content
# further down in this cell.
pdf_path = "reference_guide.pdf"
doc = fitz.open(pdf_path)

# Function to extract per-page text chunks (no images are saved here)
def extract_pdf_content(doc):
    """Collect the text chunks of every page in ``doc``.

    Args:
        doc: An opened document supporting ``len()`` and
            ``load_page(index)``; each loaded page must provide
            ``get_text("text")``.

    Returns:
        A list with one dict per page, in page order. Each dict has
        ``"page_number"`` (1-based) and ``"text_chunks"`` (the page text
        split on double newlines).
    """
    def _chunks_for(page_index):
        # Pull the plain text of one page and cut it at paragraph breaks.
        raw_text = doc.load_page(page_index).get_text("text")
        return raw_text.split("\n\n")

    return [
        {"page_number": page_index + 1, "text_chunks": _chunks_for(page_index)}
        for page_index in range(len(doc))
    ]

# Extract text data: one dict per page with "page_number" and "text_chunks"
pages_data = extract_pdf_content(doc)

In [None]:
import concurrent.futures
from pdf2image import convert_from_path
import os

# Function to process a single page
def convert_single_page(page_number):
    """Render a single page of ``reference_guide.pdf`` with ``convert_from_path``.

    Args:
        page_number: Page to render; used as both ``first_page`` and
            ``last_page`` so only that page is converted.

    Returns:
        Whatever ``convert_from_path`` returns for that one-page range at
        150 DPI.
    """
    render_options = {
        "first_page": page_number,
        "last_page": page_number,
        "dpi": 150,
    }
    return convert_from_path("reference_guide.pdf", **render_options)

# Create a folder to store the images
output_folder = 'reference_guide_images'
os.makedirs(output_folder, exist_ok=True)

# Pages to render, numbered from 1 as expected by convert_single_page
total_pages = 52
page_numbers = range(1, total_pages + 1)

# Use concurrent processing to convert pages
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(convert_single_page, page) for page in page_numbers]
    # as_completed() yields futures in completion order, so remember which
    # page each future was submitted for instead of re-scanning `futures`
    # with futures.index() for every saved image.
    page_for_future = dict(zip(futures, page_numbers))
    for future in concurrent.futures.as_completed(futures):
        page_number = page_for_future[future]
        images = future.result()
        for image in images:
            image.save(f"{output_folder}/page_{page_number}.png", "PNG")
            print(f"Saved: page_{page_number}.png")

In [None]:
from PIL import Image

page_data_images = {}

# pages_data is ordered by page, so page N lives at index N - 1.
for page_number in range(1, 53):  # pages 1..52, matching the rendered images
    # Load the image for the current page
    page_image = Image.open(f"reference_guide_images/page_{page_number}.png")

    # pages_data[...] is a record dict ({"page_number", "text_chunks"}), not
    # text. Re-join its chunks with the same separator they were split on so
    # "text" really holds the page text rather than the whole record.
    page_record = pages_data[page_number - 1]
    page_text = "\n\n".join(page_record["text_chunks"])

    # Store both text and image in a dictionary
    page_data_images[page_number] = {"text": page_text, "image": page_image}

In [None]:
# Cell output: show the page number -> {"text", "image"} mapping
page_data_images

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# One string per page, in page_data_images order. The "text" slot is filled
# from pages_data entries and may not be a plain string, so coerce it
# (None -> "") before handing it to the encoder.
texts = [
    "" if data["text"] is None else str(data["text"])
    for data in page_data_images.values()
]

# Create embeddings for the texts
embeddings = model.encode(texts, convert_to_tensor=True)

# Row i of `embeddings` was encoded from the i-th entry of page_data_images,
# so pair keys with rows directly instead of assuming pages 1..52.
page_embeddings = dict(zip(page_data_images, embeddings))

In [None]:
# Rebuild the per-page strings (None becomes "") and encode them again.
texts = []
for page_record in page_data_images.values():
    raw_text = page_record["text"]
    texts.append(str(raw_text) if raw_text is not None else "")
embeddings = model.encode(texts, convert_to_tensor=True)


In [None]:
# `embeddings` is encoded from page_data_images.values(), so iterating the
# dict's keys yields the matching page number for each row; this avoids the
# hard-coded page count.
page_embeddings = dict(zip(page_data_images, embeddings))

In [None]:
import faiss

# Move the embeddings to a NumPy array for FAISS
embedding_matrix = embeddings.cpu().numpy()

# Create a FAISS IndexFlatL2 sized to the embedding dimension and fill it
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embedding_matrix)

# Function to search for the most relevant page based on a query
def search_query(query, k=3):
    """Return the pages whose embeddings are closest to ``query``.

    Args:
        query: Text to embed with the module-level ``model``.
        k: Maximum number of pages to return.

    Returns:
        A list of ``(page_number, distance)`` tuples in the order returned
        by ``index.search``. ``page_number`` is the hit's row position in
        the module-level ``index`` plus one; ``distance`` is the value
        reported by the search. Fewer than ``k`` tuples are returned when
        the search yields fewer real hits.
    """
    # Generate the embedding for the query
    query_embedding = model.encode([query], convert_to_tensor=True)

    # Search for the top k most similar pages
    distances, indices = index.search(query_embedding.cpu().numpy(), k)

    # FAISS pads missing neighbours with label -1; skipping them avoids
    # reporting a non-existent "page 0". The loop variable is named row_id
    # so it no longer shadows the module-level FAISS `index`.
    return [
        (int(row_id) + 1, distance)
        for row_id, distance in zip(indices[0], distances[0])
        if row_id >= 0
    ]

In [None]:
# Example query
query = "how do I start the windshield wipers?"

# Look up the closest pages (search_query defaults to the top 3)
results = search_query(query)

# Show each hit: its distance, the stored page text, then the page image
for page_number, distance in results:
    print(f"Page {page_number} (Distance: {distance}):")
    page_entry = page_data_images[page_number]
    print(page_entry["text"])
    page_entry["image"].show()