In [None]:
# !pip install faiss-cpu sentence-transformers PyMuPDF torchvision
# !pip install git+https://github.com/openai/CLIP.git 
# Note that CLIP can't be downloaded directly. Requires a git clone of the link. 
# It is a multi-modal AI model developed by OpenAI that can understand and relate text and images. 
# It allows us to encode images and text into a shared embedding space, making it useful for image retrieval, classification, and search.


In [1]:
# Imports

import os
import faiss
import numpy as np
import fitz  
from sentence_transformers import SentenceTransformer
import torch
import clip
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration


print("Done")

Done


In [2]:
import subprocess

def check_cuda_memory():
    try:
        output = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.total,memory.used,memory.free", "--format=csv,nounits,noheader"])
        total, used, free = map(int, output.decode("utf-8").strip().split("\n")[0].split(", "))
        print(f"Total GPU Memory: {total} MB")
        print(f"Used GPU Memory: {used} MB")
        print(f"Free GPU Memory: {free} MB")
    except Exception as e:
        print(f"Error: {e}")

check_cuda_memory()


Total GPU Memory: 24576 MB
Used GPU Memory: 9812 MB
Free GPU Memory: 14334 MB


In [3]:
import torch
import gc

# Free up memory before running Qwen
torch.cuda.empty_cache()  
gc.collect()

# Load text embedding model
text_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load CLIP model for image embeddings
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# clip_model, preprocess = clip.load("ViT-B/32", device="cpu")

# Load Qwen2.5-VL-3B model & processor
qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct", 
    torch_dtype=torch.bfloat16,  # bfloat16 is more memory-efficient
    attn_implementation="flash_attention_2",
    device_map="auto"  # Auto-manages CPU/GPU offloading
)

qwen_processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

print("✅ Qwen2.5-VL-3B model loaded successfully!")
print("Done")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✅ Qwen2.5-VL-3B model loaded successfully!
Done


In [4]:
import subprocess 

def check_cuda_memory():
    try:
        output = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.total,memory.used,memory.free", "--format=csv,nounits,noheader"])
        total, used, free = map(int, output.decode("utf-8").strip().split("\n")[0].split(", "))
        print(f"Total GPU Memory: {total} MB")
        print(f"Used GPU Memory: {used} MB")
        print(f"Free GPU Memory: {free} MB")
    except Exception as e:
        print(f"Error: {e}")

check_cuda_memory()


Total GPU Memory: 24576 MB
Used GPU Memory: 18010 MB
Free GPU Memory: 6137 MB


In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file."""
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

# Define directory paths
base_dir = "business_flowcharts"
pdf_dir = os.path.join(base_dir, "documents")
image_dir = os.path.join(base_dir, "flowcharts")

# Read all PDFs and extract text
pdf_texts = {}
for file in sorted(os.listdir(pdf_dir)):  
    if file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_dir, file)
        text = extract_text_from_pdf(pdf_path)
        pdf_texts[file] = text
        print(f"Extracted text from {file}")

print("Done")


In [None]:
# for p in pdf_texts.values():
#     print(p)
#     print("===")

In [None]:
# Convert extracted PDF texts to embeddings
pdf_embeddings = []
pdf_filenames = list(pdf_texts.keys())

for filename in pdf_filenames:
    embedding = text_model.encode(pdf_texts[filename])
    pdf_embeddings.append(embedding)

# Convert list to NumPy array for FAISS
pdf_embeddings = np.array(pdf_embeddings, dtype="float32")

# Create FAISS index for text embeddings
text_index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
text_index.add(pdf_embeddings)

print("Stored document text embeddings in FAISS database.")

print("Done")

In [None]:
def get_image_embedding(image_path):
    """Generate an embedding for a flowchart image using CLIP."""
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = clip_model.encode_image(image).cpu().numpy()
    return embedding.flatten()

# Process and store embeddings for flowchart images
image_embeddings = []
image_filenames = []

for file in sorted(os.listdir(image_dir)):  # Ensure order matches PDFs
    if file.endswith(".png"):
        image_path = os.path.join(image_dir, file)
        embedding = get_image_embedding(image_path)
        image_embeddings.append(embedding)
        image_filenames.append(file)

# Convert list to NumPy array for FAISS
image_embeddings = np.array(image_embeddings, dtype="float32")

# Create FAISS index for image embeddings
image_index = faiss.IndexFlatL2(image_embeddings.shape[1])
image_index.add(image_embeddings)

print("Stored flowchart image embeddings in FAISS database.")

print("Done")


In [5]:
# No RAG
# Function to query Qwen with both text and an image
def query_qwen_with_image(image_path, query_text):
    """Send an image and text query to Qwen2.5-VL-3B."""
    
    # Load the image
    image = Image.open(image_path).convert("RGB")
    
    # Define the user message
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": query_text},
            ],
        }
    ]

    # Format input for Qwen
    text = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = qwen_processor(
        text=[text],
        images=[image],  # Provide the image input
        padding=True,
        return_tensors="pt",
    ).to(qwen_model.device)

    # Generate response
    with torch.no_grad():
        output_ids = qwen_model.generate(**inputs, max_new_tokens=512)

    # Decode response
    response_text = qwen_processor.batch_decode(output_ids, skip_special_tokens=True)[0]

    return response_text

print("Done")

Done


In [9]:
# NO RAG
# query_text = "In this order processing flowchart, who prepares and packs the order upon a successful payment?"
# image_path = "business_flowcharts/flowcharts/4.png"  # Flowchart image here
query_text = "In this Customer Support Ticket flowchart, what happens If further assistance is required?"
image_path = "business_flowcharts/flowcharts/1.png"

qwen_response = query_qwen_with_image(image_path, query_text)
print(f"🤖 Qwen's Response:\n{qwen_response}")

🤖 Qwen's Response:
system
You are a helpful assistant.
user
In this Customer Support Ticket flowchart, what happens If further assistance is required?
assistant
If further assistance is required, the process moves to "Escalate to Senior Support."


In [None]:
# With RAG
# Function to Retrieve Relevant PDFs & Flowcharts
def retrieve_relevant_data(query, top_k=2):
    """Retrieve the most relevant documents & images for the query using FAISS."""
    
    # Convert query to text embedding
    query_embedding = text_model.encode(query).reshape(1, -1)
    
    # Search FAISS text database (Retrieve relevant PDFs)
    _, text_results = text_index.search(query_embedding, top_k)
    retrieved_pdfs = [pdf_filenames[idx] for idx in text_results[0]]
    
    # Convert query to image embedding using CLIP
    text_tokenized = clip.tokenize([query]).to(device)
    with torch.no_grad():
        image_query_embedding = clip_model.encode_text(text_tokenized).cpu().numpy()
    
    # Search FAISS image database (Retrieve relevant flowcharts)
    _, image_results = image_index.search(image_query_embedding, top_k)
    retrieved_images = [image_filenames[idx] for idx in image_results[0]]

    return retrieved_pdfs, retrieved_images

print("Done")

In [None]:
# Function to Query Qwen with RAG (Flowchart Context)
def query_qwen_with_rag(query, flowchart_img, top_k=2):
    """Retrieve relevant flowchart data (text & images) and query Qwen for an AI-generated response."""
    
    # Retrieve relevant PDFs (text) & Flowcharts (images)
    retrieved_pdfs, retrieved_images = retrieve_relevant_data(query, top_k)
    
    # Extract text from retrieved PDFs
    context = "\n".join([pdf_texts[pdf] for pdf in retrieved_pdfs])
    
    # Load the provided flowchart image
    image = Image.open(flowchart_img).convert("RGB")
    
    # Define the user message (Injecting retrieved context)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},  # Provide the provided image
                {"type": "text", "text": f"Attached is document providing context. Based on the provided image and textual information, please analyze the content and generate a response that accurately addresses the user's inquiry. The document contains relevant details that should be considered in forming a well-informed answer. Ensure that your response integrates both visual and textual elements for a comprehensive analysis.\n\nContext:\n{context}\n\nQuery: {query}"},
            ],
        }
    ]
    
    # Format input for Qwen
    text = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = qwen_processor(
        text=[text],
        images=[image],  # Provide the image input
        padding=True,
        return_tensors="pt",
    ).to(qwen_model.device)
    
    # Generate response
    with torch.no_grad():
        output_ids = qwen_model.generate(**inputs, max_new_tokens=512)
    
    # Decode response
    response_text = "========\n\n".join(qwen_processor.batch_decode(output_ids, skip_special_tokens=False))
    
    return response_text, retrieved_pdfs, retrieved_images

print("Done")

In [None]:
# # Define query and flowchart image
# query = "Can you see the flowchart i provided here about customer support? What if further assistance is needed?"
# flowchart_img = "business_flowcharts/flowcharts/1.png"

# # Query Qwen with RAG
# qwen_response_rag, retrieved_pdfs, retrieved_images = query_qwen_with_rag(query, flowchart_img)

# # Print results
# print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
# print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
# print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")

🔍 Retrieved Documents: ['23_Academic_Research.pdf', '21_Student_Enrollment.pdf'] 
⏳ Execution Time: 2.24 seconds

In [None]:
import time

# user_query = "In this Employee Onboarding flowchart, what will the employees learn during orientation? Is additional training provided should it be needed?"
# user_query = "What can you tell me about this flowchart? I want all the details."
# user_query = "In this order processing flowchart, who prepares and packs the order upon a successful payment?"
# flowchart_img = "business_flowcharts/flowcharts/4.png"
user_query = "What can you tell me about this flowchart on academic research? Can you explain each step in great detail?"
flowchart_img = "business_flowcharts/flowcharts/23_academic_research.png"

# Measure execution time
start_time = time.time()

# Query Qwen with RAG
qwen_response_rag, retrieved_pdfs, retrieved_images = query_qwen_with_rag(user_query, flowchart_img)

# Calculate total time taken
end_time = time.time()
execution_time = end_time - start_time

# Print results
print(f"\n🤖 Qwen's Response (With RAG):\n{qwen_response_rag}")
print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
print(f"⏳ Execution Time: {execution_time:.2f} seconds")
