In [49]:
import os
import faiss
import numpy as np
import fitz  
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

### DOCUMENT PROCESSING ###

# Load text embedding model.
# This single SentenceTransformer instance is shared by the cells below: it encodes
# the processed PDF text, the user query, and the OCR text extracted from flowcharts.
text_model = SentenceTransformer("all-MiniLM-L6-v2")

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at `pdf_path`.

    Each page's text is followed by a newline, the pages are concatenated in
    document order, and leading/trailing whitespace is stripped from the result.
    """
    with fitz.open(pdf_path) as document:
        page_chunks = [page.get_text("text") + "\n" for page in document]
    return "".join(page_chunks).strip()

def preprocess_text(text):
    """Reduce `text` to a space-joined list of its distinct non-stop-word terms.

    A TfidfVectorizer (English stop words removed, at most 5000 features) is
    fitted on `text` as a one-document corpus and its fitted feature names are
    joined with single spaces. No TF-IDF weights are kept: the result is the
    vocabulary only, in the order returned by `get_feature_names_out()`, so
    term frequency and original word order are lost.

    Args:
        text: Raw document text (assumed to be a `str`).

    Returns:
        The space-joined vocabulary, or "" when the text yields no vocabulary
        (for example empty text, or text made only of stop words).
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    try:
        # Only the fitted vocabulary is needed, so the dense TF-IDF matrix is
        # no longer materialized and thrown away.
        vectorizer.fit([text])
    except ValueError:
        # The vectorizer raises ValueError ("empty vocabulary") for documents
        # with no usable terms; one such PDF must not abort the indexing loop.
        return ""
    return " ".join(vectorizer.get_feature_names_out())

# Directory setup
base_dir = "business_flowcharts"
pdf_dir = os.path.join(base_dir, "documents")

# Collect PDFs in sorted order so that row i of the FAISS index always
# corresponds to pdf_filenames[i].
pdf_filenames = sorted(f for f in os.listdir(pdf_dir) if f.endswith(".pdf"))
if not pdf_filenames:
    # Without this check the normalization below fails with an opaque
    # axis/shape error on an empty array.
    raise FileNotFoundError(f"No .pdf files found in {pdf_dir!r}; cannot build the text index.")

# Read and process PDFs: keep the raw text for prompting, embed the processed text.
pdf_texts = {}
pdf_embeddings = []
for file in pdf_filenames:
    pdf_path = os.path.join(pdf_dir, file)
    raw_text = extract_text_from_pdf(pdf_path)
    processed_text = preprocess_text(raw_text)
    pdf_texts[file] = raw_text
    embedding = text_model.encode(processed_text)
    pdf_embeddings.append(embedding)

# Convert to FAISS-compatible format (2-D float32 matrix, one row per PDF)
pdf_embeddings = np.array(pdf_embeddings, dtype="float32")

# Normalize rows to unit length (IMPORTANT): on unit vectors, L2 distance ranks
# results the same way cosine similarity does.
pdf_norms = np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
pdf_norms[pdf_norms == 0] = 1.0  # keep an all-zero row as-is instead of turning it into NaN
pdf_embeddings /= pdf_norms

# Create FAISS index
text_index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
text_index.add(pdf_embeddings)

print("✅ Stored document text embeddings in FAISS database.")


✅ Stored document text embeddings in FAISS database.


In [50]:
import torch
import clip
from PIL import Image
from torchvision import transforms

### FLOWCHART IMG PROCESSING ###

# Load CLIP model
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# clip.load also returns a `preprocess` transform; the visible cells do not use it
# and build their own transform in preprocess_image() instead.
clip_model, preprocess = clip.load("ViT-B/32", device=device)

def preprocess_image(image_path, size=224):
    """Load an image and turn it into a square, CLIP-normalized input batch.

    The image is converted to RGB, resized so its longer side equals `size`
    while keeping the aspect ratio, padded with white on the right and bottom
    to `size` x `size`, converted to a tensor and normalized per channel.

    Args:
        image_path: Path of the image file to load.
        size: Side length in pixels of the square output (default 224, the
            value this notebook previously hard-coded).

    Returns:
        A tensor with a leading batch dimension of 1 (added by `unsqueeze(0)`).
    """
    image = Image.open(image_path).convert("RGB")

    # Resize while keeping aspect ratio. The shorter side is clamped to at
    # least 1 pixel: for very elongated images int() would truncate it to 0,
    # which is not a valid resize target.
    aspect_ratio = image.width / image.height
    if aspect_ratio > 1:
        new_width = size
        new_height = max(1, int(size / aspect_ratio))
    else:
        new_height = size
        new_width = max(1, int(size * aspect_ratio))

    transform = transforms.Compose([
        transforms.Resize((new_height, new_width)),  # Maintain aspect ratio
        # Padding tuple is (left, top, right, bottom): the image stays anchored top-left.
        transforms.Pad((0, 0, size - new_width, size - new_height), fill=(255, 255, 255)),  # Pad with white
        transforms.ToTensor(),
        # CLIP normalization (ALSO IMPORTANT): full-precision mean/std published
        # with CLIP, replacing the truncated three-decimal values used before.
        transforms.Normalize(
            mean=[0.48145466, 0.4578275, 0.40821073],
            std=[0.26862954, 0.26130258, 0.27577711],
        ),
    ])

    return transform(image).unsqueeze(0)

def get_image_embedding(image_path):
    """Embed one flowchart image with CLIP and return a flat NumPy vector.

    The image goes through preprocess_image(), is moved to the configured
    device and encoded without gradient tracking. No normalization is applied
    to the returned vector here.
    """
    batch = preprocess_image(image_path)
    batch = batch.to(device)
    with torch.no_grad():
        features = clip_model.encode_image(batch)
        as_array = features.cpu().numpy()
    return as_array.flatten()

# Directory setup
image_dir = os.path.join(base_dir, "flowcharts")

# Sorted so that row i of the FAISS index always corresponds to image_filenames[i].
image_filenames = sorted(f for f in os.listdir(image_dir) if f.endswith(".png"))
if not image_filenames:
    # Without this check the normalization below fails with an opaque
    # axis/shape error on an empty array.
    raise FileNotFoundError(f"No .png files found in {image_dir!r}; cannot build the image index.")

# Process images: one CLIP embedding per flowchart, stacked into a float32 matrix
image_embeddings = np.array(
    [get_image_embedding(os.path.join(image_dir, file)) for file in image_filenames],
    dtype="float32",
)

# Normalize rows to unit length so L2 distance ranks like cosine similarity.
image_norms = np.linalg.norm(image_embeddings, axis=1, keepdims=True)
image_norms[image_norms == 0] = 1.0  # keep an all-zero row as-is instead of turning it into NaN
image_embeddings /= image_norms

# Create FAISS index
image_index = faiss.IndexFlatL2(image_embeddings.shape[1])
image_index.add(image_embeddings)

print("✅ Stored flowchart image embeddings in FAISS database.")


✅ Stored flowchart image embeddings in FAISS database.


In [51]:
import pytesseract
from PIL import Image

def extract_text_from_image(image_path):
    """Run OCR on the image at `image_path` and return the recognized text.

    The image is converted to RGB before being handed to Tesseract, and the
    OCR output is returned with surrounding whitespace stripped.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    return pytesseract.image_to_string(rgb_image).strip()

def get_text_embedding(text):
    """Encode `text` with the shared sentence-embedding model.

    Returns whatever `text_model.encode` produces, unchanged.
    """
    embedding = text_model.encode(text)
    return embedding

print("Done")

Done


In [52]:
def get_query_image_embedding(image_path):
    """Generate a unit-length CLIP embedding for a query image.

    Reuses get_image_embedding() so query images follow exactly the same
    preprocessing/encoding path as the indexed images, instead of duplicating
    that pipeline here.

    Args:
        image_path: Path of the query image.

    Returns:
        A 1-D float32 NumPy vector scaled to L2 norm 1. An all-zero embedding
        is returned unchanged rather than divided by zero.
    """
    # Cast before computing the norm so the arithmetic is done in float32 even
    # if the encoder returned a lower-precision array; float32 is also the
    # dtype the retrieval code converts to before searching.
    embedding = np.asarray(get_image_embedding(image_path), dtype="float32")
    norm = np.linalg.norm(embedding)
    if norm == 0:
        return embedding
    return embedding / norm  # Normalize


# Updated Retrieval Function - Supports Multiple Flowchart Images and PDFs

def _unit_rows(vectors):
    """Return `vectors` as a 2-D float32 array whose rows have unit L2 norm."""
    matrix = np.asarray(vectors, dtype="float32")
    if matrix.ndim == 1:
        matrix = matrix.reshape(1, -1)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave all-zero rows untouched instead of producing NaN
    return matrix / norms


def _search_filenames(index, queries, filenames, top_k):
    """Search `index` and map hits to filenames, in hit order, without duplicates."""
    _, hits = index.search(queries, top_k)
    ordered = {}
    for idx in hits.flatten():
        # FAISS fills missing results with -1 when fewer than top_k vectors
        # exist; indexing a list with -1 would silently return the LAST file.
        if idx >= 0:
            ordered.setdefault(filenames[idx], None)
    return list(ordered)


def retrieve_relevant_data(query, flowchart_imgs=None, top_k=2):
    """Retrieve the most relevant documents & images for the query using FAISS.

    Supports multiple flowchart images for comparison. PDFs are retrieved for
    the text query and, for every flowchart whose OCR text is non-empty, for
    that OCR text as well. Similar indexed flowcharts are retrieved from the
    CLIP embeddings of the provided images.

    Args:
        query: The user's text query.
        flowchart_imgs: Optional iterable of flowchart image paths. `None`
            (the default) and an empty list both mean "no images".
        top_k: Number of nearest neighbours requested per search.

    Returns:
        A tuple `(retrieved_pdfs, retrieved_images, extracted_texts)`:
        de-duplicated PDF filenames and image filenames in the order they were
        first hit (query results first), and the OCR text of each provided
        flowchart in input order.
    """
    flowchart_imgs = list(flowchart_imgs or [])

    # Search FAISS text database with the normalized query embedding
    query_embedding = _unit_rows(text_model.encode(query))
    retrieved_pdfs = _search_filenames(text_index, query_embedding, pdf_filenames, top_k)

    # Process each flowchart image if provided
    extracted_texts = []
    image_vectors = []
    for img_path in flowchart_imgs:
        extracted_text = extract_text_from_image(img_path)
        extracted_texts.append(extracted_text)

        # Use extracted text to retrieve relevant documents. Skipped when OCR
        # found nothing: an embedding of "" carries no signal and would only
        # add arbitrary documents to the context.
        if extracted_text:
            ocr_embedding = _unit_rows(get_text_embedding(extracted_text))
            retrieved_pdfs.extend(
                _search_filenames(text_index, ocr_embedding, pdf_filenames, top_k)
            )

        # Generate CLIP image embedding
        image_vectors.append(get_query_image_embedding(img_path))

    # Search FAISS image database with all query images at once
    retrieved_images = []
    if image_vectors:
        retrieved_images = _search_filenames(
            image_index, _unit_rows(image_vectors), image_filenames, top_k
        )

    # dict.fromkeys de-duplicates while preserving order; a set would make the
    # order (and therefore "the first result") arbitrary.
    return list(dict.fromkeys(retrieved_pdfs)), retrieved_images, extracted_texts

print("✅ Retrieval function is ready.")


✅ Retrieval function is ready.


In [53]:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# Load Qwen model
import importlib.util

QWEN_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

# FlashAttention-2 needs a CUDA device and the optional `flash_attn` package.
# Fall back to PyTorch's built-in scaled-dot-product attention otherwise, so
# this cell also runs on the CPU-only setups the CLIP cell already supports.
use_flash_attention = (
    torch.cuda.is_available()
    and importlib.util.find_spec("flash_attn") is not None
)

qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    QWEN_MODEL_ID,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2" if use_flash_attention else "sdpa",
    device_map="auto",
)

# The processor must come from the same checkpoint as the model.
qwen_processor = AutoProcessor.from_pretrained(QWEN_MODEL_ID)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [55]:
import re


# Updated Qwen RAG Query Function - Multi-Flowchart & Multi-Document Support

def query_qwen_with_rag(query, flowchart_imgs=None, top_k=2, max_new_tokens=512):
    """Retrieve relevant flowchart data (text & images) and query Qwen for comparison-style answers.

    Args:
        query: The user's question.
        flowchart_imgs: Optional iterable of flowchart image paths. `None`
            (the default) and an empty list both mean "no images".
        top_k: Number of nearest neighbours requested per retrieval search.
        max_new_tokens: Generation budget passed to `generate`. The default of
            512 matches the previous hard-coded value; raise it if answers are
            cut off mid-sentence.

    Returns:
        A tuple `(response, retrieved_pdfs, retrieved_images)` where `response`
        is the model's newly generated answer text.
    """
    flowchart_imgs = list(flowchart_imgs or [])

    # Retrieve relevant PDFs (text) & Flowcharts (images)
    retrieved_pdfs, retrieved_images, extracted_texts = retrieve_relevant_data(query, flowchart_imgs, top_k)

    # Extract text from retrieved PDFs
    context = "\n\n".join(f"📄 **{pdf}**\n{pdf_texts[pdf]}" for pdf in retrieved_pdfs)

    # Extract text from all provided flowcharts
    flowchart_texts = "\n\n".join(
        f"🖼️ **Flowchart {i+1}**\n{text}" for i, text in enumerate(extracted_texts)
    )

    # Select primary image for visualization (first one provided or retrieved).
    # Neither may exist for a text-only query, so fall back to a prompt without
    # an image instead of failing on retrieved_images[0].
    if flowchart_imgs:
        primary_image_path = flowchart_imgs[0]
    elif retrieved_images:
        primary_image_path = os.path.join(image_dir, retrieved_images[0])
    else:
        primary_image_path = None

    primary_image = None
    if primary_image_path is not None:
        primary_image = Image.open(primary_image_path).convert("RGB")

    prompt = f"""
Given the attached document and flowcharts, compare the provided flowcharts and documents. 
Analyze differences, improvements, or regressions. If the newer flowchart is better than the old one, explain why.

### **Context from Retrieved Documents**
{context}

### **Extracted Text from Provided Flowcharts**
{flowchart_texts}

### **Query**
{query}
"""

    # Define the user message (Injecting retrieved context)
    content = []
    if primary_image is not None:
        content.append({"type": "image", "image": primary_image})  # Show one image
    content.append({"type": "text", "text": prompt})
    messages = [{"role": "user", "content": content}]

    # Format input for Qwen
    text = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = qwen_processor(
        text=[text],
        # Only sending one image (rest used as context)
        images=[primary_image] if primary_image is not None else None,
        padding=True,
        return_tensors="pt",
    ).to(qwen_model.device)

    # Generate response
    with torch.no_grad():
        output_ids = qwen_model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode only the newly generated tokens. generate() returns the prompt
    # followed by the answer, so slicing off the prompt length is exact, whereas
    # searching the decoded text for "assistant\n" can match inside the injected
    # context and leaves special tokens in the answer.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[:, prompt_length:]
    cleaned_response = qwen_processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0].strip()

    return cleaned_response, retrieved_pdfs, retrieved_images

print("✅ Updated Qwen RAG system is ready.")


✅ Updated Qwen RAG system is ready.


In [57]:
import time

# ✅ Multi-Flowchart Test Query: compare two versions of the same flowchart.

user_query = "Compare these flowcharts. Which one is better and why?"
flowchart_imgs = [
    "business_flowcharts/flowcharts/6_incident_management.png",
    "6_incident_management_v2.png",
]

# Time the full round trip (retrieval + OCR + generation).
start_time = time.time()

# Query Qwen with Multi-Flowchart RAG
qwen_response_rag, retrieved_pdfs, retrieved_images = query_qwen_with_rag(
    user_query,
    flowchart_imgs,
)

end_time = time.time()
execution_time = end_time - start_time

# Print results
print(f"\n🤖 Qwen's Response (With RAG - Multi-Flowchart Comparison):\n{qwen_response_rag}")
print(f"🔍 Retrieved Documents: {retrieved_pdfs}")
print(f"🖼️ Retrieved Flowcharts: {retrieved_images}")
print(f"⏳ Execution Time: {execution_time:.2f} seconds")



🤖 Qwen's Response (With RAG - Multi-Flowchart Comparison):
To compare the two flowcharts and determine which one is better, let's analyze them step by step:

### **Flowchart 1: Incident Management**

**Steps:**
1. Start
2. User Reports Issue
3. Classify Incident
4. High Impact?
   - No: Assign to Support Team
   - Yes: Escalate to Higher Support
5. Assign to Support Team
6. Resolve Issue
7. Issue Resolved?
   - Yes: Close Ticket
   - No: Escalate to Higher Support

**Improvements:**
- The flowchart clearly distinguishes between high-impact and non-high-impact incidents.
- It provides a clear path for escalating issues to higher support levels.
- It includes a step for confirming the issue is resolved before closing the ticket.

### **Flowchart 2: Incident Management**

**Steps:**
1. Start
2. User Reports Issue
3. Initial Triage
4. Classify Incident
5. Assign to Support Team | | Escalate to Higher Support
6. Resolve Issue
7. Issue Resolved?
   - Yes: Close Ticket
   - No: Escalate to H

In [15]:
# # Test saved documents in FAISS database
# def view_stored_pdfs():
#     """Display all stored PDFs and their extracted text."""
#     for filename, text in pdf_texts.items():
#         print(f"📄 PDF: {filename}\n")
#         print(f"Extracted Content:\n{text[:1000]}")  # Show first 1000 characters
#         print("="*80)

# view_stored_pdfs()


In [48]:
# import matplotlib.pyplot as plt

# # Test saved images in FAISS database

# def view_stored_flowcharts():
#     """Display all stored flowchart images and their filenames."""
#     for filename in image_filenames:
#         image_path = os.path.join(image_dir, filename)
#         image = Image.open(image_path)
        
#         plt.figure(figsize=(5, 5))
#         plt.imshow(image)
#         plt.axis("off")
#         plt.title(f"🖼️ Flowchart: {filename}")
#         plt.show()

# view_stored_flowcharts()


In [19]:
# def check_pdf_embedding(index=11):
#     """Check stored text embedding by retrieving the closest match for a given PDF."""
#     query_embedding = pdf_embeddings[index].reshape(1, -1)
#     _, retrieved_indices = text_index.search(query_embedding, 1)
    
#     original_pdf = pdf_filenames[index]
#     matched_pdf = pdf_filenames[retrieved_indices[0][0]]
    
#     print(f"📄 Original PDF: {original_pdf}")
#     print(f"🔍 Closest Match: {matched_pdf}")
#     print(f"Similarity Score: {np.dot(pdf_embeddings[index], pdf_embeddings[retrieved_indices[0][0]])}")
    
# check_pdf_embedding()


In [22]:
# import matplotlib.pyplot as plt

# def check_image_embedding(index=4):
#     """Check stored image embedding by retrieving the closest match for a given flowchart."""
#     query_embedding = image_embeddings[index].reshape(1, -1)
#     _, retrieved_indices = image_index.search(query_embedding, 1)
    
#     original_image = image_filenames[index]
#     matched_image = image_filenames[retrieved_indices[0][0]]
    
#     print(f"🖼️ Original Flowchart: {original_image}")
#     print(f"🔍 Closest Match: {matched_image}")
#     print(f"Similarity Score: {np.dot(image_embeddings[index], image_embeddings[retrieved_indices[0][0]])}")
    
#     # Show both images
#     fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    
#     ax[0].imshow(Image.open(os.path.join(image_dir, original_image)))
#     ax[0].set_title("Original Image")
#     ax[0].axis("off")
    
#     ax[1].imshow(Image.open(os.path.join(image_dir, matched_image)))
#     ax[1].set_title("Closest Match")
#     ax[1].axis("off")
    
#     plt.show()

# check_image_embedding()


In [23]:
# # Check if all image embeddings are unique
# unique_embeddings = np.unique(image_embeddings, axis=0)

# if unique_embeddings.shape[0] == 1:
#     print("⚠️ WARNING: All image embeddings are identical! FAISS cannot differentiate them.")
# else:
#     print(f"✅ FAISS has {unique_embeddings.shape[0]} unique image embeddings.")


✅ FAISS has 25 unique image embeddings.


In [14]:
# # Print shapes of stored image embeddings
# print(f"Stored Image Embeddings Shape: {image_embeddings.shape}")

# # Generate a query embedding for comparison
# query_embedding = get_query_image_embedding(os.path.join(image_dir, image_filenames[0]))  # Use any image as query
# print(f"Query Image Embedding Shape: {query_embedding.shape}")

# # Print first stored embedding vs query embedding
# print(f"\nFirst Stored Embedding:\n{image_embeddings[0][:10]}")  # Print first 10 values
# print(f"\nQuery Embedding:\n{query_embedding[:10]}")  # Print first 10 values


In [15]:
# def debug_faiss_retrieval(index=3):
#     """Check if FAISS is properly differentiating images."""
#     query_embedding = image_embeddings[index].reshape(1, -1)  # Use stored image for retrieval test
#     _, retrieved_indices = image_index.search(query_embedding, 5)  # Top 5 results

#     print(f"🖼️ Original Flowchart: {image_filenames[index]}")
#     print(f"\n🔍 Closest Matches:")
#     for rank, idx in enumerate(retrieved_indices[0]):
#         matched_image = image_filenames[idx]
#         similarity_score = np.dot(image_embeddings[index], image_embeddings[idx])  # Cosine similarity
#         print(f"{rank + 1}. {matched_image} (Score: {similarity_score:.6f})")

# debug_faiss_retrieval()
