In [2]:
import fitz  # PyMuPDF

# Open the PDF file
pdf_path = "reference_guide.pdf"
doc = fitz.open(pdf_path)

# Function to extract text and save images
def extract_pdf_content(doc):
    pages_data = []
    
    # Loop through each page of the document
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)  # Load the page
        
        # Extract text from the page
        page_text = page.get_text("text")
        
        # Optional: Split text into chunks based on headers or paragraphs
        page_chunks = page_text.split("\n\n")  # Split by paragraphs or use other logic
        
        # Store text chunks along with their page number
        pages_data.append({
            "page_number": page_num + 1,
            "text_chunks": page_chunks
        })
    
    return pages_data

# Extract text data
pages_data = extract_pdf_content(doc)

In [3]:
import concurrent.futures
from pdf2image import convert_from_path
import os

# Function to process a single page
def convert_single_page(page_number):
    return convert_from_path("reference_guide.pdf", first_page=page_number, last_page=page_number, dpi=150)

# Create a folder to store the images
output_folder = 'reference_guide_images'
os.makedirs(output_folder, exist_ok=True)

# Split the pages into ranges (now accounting for 664 pages)
total_pages = 52
page_numbers = range(1, total_pages + 1)

# Use concurrent processing to convert pages
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(convert_single_page, page) for page in page_numbers]
    for future in concurrent.futures.as_completed(futures):
        images = future.result()
        for i, image in enumerate(images):
            page_number = page_numbers[futures.index(future)]
            image.save(f"{output_folder}/page_{page_number}.png", "PNG")
            print(f"Saved: page_{page_number}.png")

Saved: page_2.png
Saved: page_3.png
Saved: page_6.png
Saved: page_7.png
Saved: page_12.png
Saved: page_8.png
Saved: page_10.png
Saved: page_5.png
Saved: page_4.png
Saved: page_11.png
Saved: page_1.png
Saved: page_9.png
Saved: page_14.png
Saved: page_16.png
Saved: page_13.png
Saved: page_20.png
Saved: page_15.png
Saved: page_17.png
Saved: page_19.png
Saved: page_18.png
Saved: page_23.png
Saved: page_21.png
Saved: page_22.png
Saved: page_24.png
Saved: page_28.png
Saved: page_31.png
Saved: page_29.png
Saved: page_26.png
Saved: page_25.png
Saved: page_32.png
Saved: page_27.png
Saved: page_30.png
Saved: page_33.png
Saved: page_34.png
Saved: page_36.png
Saved: page_35.png
Saved: page_38.png
Saved: page_37.png
Saved: page_39.png
Saved: page_41.png
Saved: page_40.png
Saved: page_43.png
Saved: page_44.png
Saved: page_42.png
Saved: page_48.png
Saved: page_46.png
Saved: page_47.png
Saved: page_45.png
Saved: page_51.png
Saved: page_49.png
Saved: page_50.png
Saved: page_52.png


In [7]:
from PIL import Image

page_data_images = {}

# Assuming pages_data is a list where the index corresponds to the page number minus one
for page_number in range(1, 53):  # Assuming you have 664 pages
    # Load the image for the current page
    page_image = Image.open(f"reference_guide_images/page_{page_number}.png")
    
    # Get the text for the current page from pages_data (index is page_number - 1)
    page_text = pages_data[page_number - 1]  # Access the text using index
    
    # Store both text and image in a dictionary
    page_data_images[page_number] = {"text": page_text, "image": page_image}

In [8]:
page_data_images

{1: {'text': {'page_number': 1,
   'text_chunks': ['QUICK\nREFERENCE \nGUIDE\n2020\n']},
  'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=826x1275>},
 2: {'text': {'page_number': 2,
   'text_chunks': ['CAMRY\n2020\nThis Quick Reference Guide is a summary of basic vehicle\noperations. It contains brief descriptions of fundamental\noperations so you can locate and use the vehicle’s main \nequipment quickly and easily.\nThe Quick Reference Guide is not intended as a substitute for \nthe Owner’s Manual located in your vehicle’s glove box. We \nstrongly encourage you to review the Owner’s Manual and \nsupplementary manuals so you will have a better understanding \nof your vehicle’s capabilities and limitations.\nYour dealership and the entire staff of Toyota Motor North \n"NFSJDB\r\x03*OD\x0f\x03XJTI\x03ZPV\x03NBOZ\x03ZFBST\x03PG\x03TBUJTmFE\x03ESJWJOH\x03JO\x03ZPVS\x03\nnew Camry.\nA word about safe vehicle operations\nThis Quick Reference Guide is not a full description of 

In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create a list of texts (each page's text) for embedding
texts = [data["text"] for data in page_data_images.values()]

# Create embeddings for the texts
embeddings = model.encode(texts, convert_to_tensor=True)

# Store embeddings for each page number
page_embeddings = {page_number: embeddings[page_number - 1] for page_number in range(1, 53)}

TypeError: object of type 'int' has no len()

In [10]:
texts = [str(data["text"]) if data["text"] is not None else "" for data in page_data_images.values()]
embeddings = model.encode(texts, convert_to_tensor=True)


In [12]:
page_embeddings = {page_number: embeddings[page_number - 1] for page_number in range(1, 53)}

In [13]:
import faiss

# Create a FAISS index (using L2 distance)
index = faiss.IndexFlatL2(embeddings.shape[1])  # embedding dimension

# Add the embeddings to the FAISS index
index.add(embeddings.cpu().numpy())

# Function to search for the most relevant page based on a query
def search_query(query, k=3):
    # Generate the embedding for the query
    query_embedding = model.encode([query], convert_to_tensor=True)
    
    # Search for the top k most similar pages
    distances, indices = index.search(query_embedding.cpu().numpy(), k)
    
    # Return the top k pages (page numbers) and their distances
    results = [(index + 1, distances[0][i]) for i, index in enumerate(indices[0])]
    return results

In [18]:
# Example query
query = "how do I start the windshield wipers?"

# Retrieve the top 3 most relevant pages
results = search_query(query)

# Display the results
for page_number, distance in results:
    print(f"Page {page_number} (Distance: {distance}):")
    print(page_data_images[page_number]["text"])  # Display text for that page
    page_data_images[page_number]["image"].show()  # Display the image for that page

Page 17 (Distance: 0.9971246719360352):
{'page_number': 17, 'text_chunks': ['15\n* Intermittent windshield wiper frequency adjustment Rotate to increase/decrease \nwipe frequency.\nRefer to the Owner’s Manual for more details.\nWindshield wipers & washers\nOVERVIEW\nFEATURES & OPERATIONS\nTOYOTA SAFETY SENSE\nSAFETY & EMERGENCY FEATURES\n4&"5\x037&/5*-"5034\x03\t*\'\x03&26*11&%\n4&"5\x03)&"5&34\x03\t*\'\x03&26*11&%\nWithout seat ventilators\nWith seat ventilators\nAdjust frequency*\nPull to wash and wipe\nMist\nInterval wipe\nSlow\nFast\nSeat heaters/ventilators (if equipped)\nDriver seat\nDriver seat\nDriver seat\nFront passenger \nseat\nFront passenger \nseat\nFront passenger \nseat\n']}
Page 4 (Distance: 1.2776788473129272):
{'page_number': 4, 'text_chunks': ['Steering wheel controls\n2\nOVERVIEW\nInstrument panel\nTilt and telescopic steering lock release lever \n(below the steering wheel)\nIgnition switch1\nHeadlight/turn signal/fog light control\nAutomatic High Beam (AHB) switch\