In [None]:
!apt-get install poppler-utils

In [None]:
import os

# Names of the entries in the current working directory that end in ".pdf".
pdf_files = list(filter(lambda name: name.endswith('.pdf'), os.listdir()))
print(pdf_files)


In [None]:
# Install the poppler-utils system package; -y accepts apt-get's confirmation prompt automatically.
!apt-get install -y poppler-utils


In [None]:
from google.colab import files
import glob

uploaded = files.upload()  # Upload PDFs through google.colab; the return value is not used again in this cell
pdf_files = glob.glob("*.pdf")  # Every *.pdf in the working directory, not only the files uploaded just now
print("📄 Uploaded PDFs:", pdf_files)



In [None]:
import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from pdf2image import convert_from_path
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from sentence_transformers import SentenceTransformer
import openai

# === STEP 0: Setup ===
pdf_folder = "sample_data"  # Your folder containing PDFs

# Create the directory if it doesn't exist. After this call the folder is guaranteed
# to be present, so no separate "directory missing" branch is needed.
os.makedirs(pdf_folder, exist_ok=True)
print(f"Checking directory: {os.path.abspath(pdf_folder)}")
print(f"✅ Directory exists. Contents: {os.listdir(pdf_folder)}")

# Get PDF files from the folder. Sorted because os.listdir returns entries in
# arbitrary order, which would otherwise make chunk numbering differ between runs.
pdf_files = sorted(os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith('.pdf'))
print(f"Found {len(pdf_files)} PDF files: {pdf_files}")

# Stop early when there is nothing to process
if not pdf_files:
    print("⚠️ No PDF files found in the directory. Please add PDF files to the sample_data folder.")
    import sys
    sys.exit("Please add PDF files to continue.")

# Read the key from the environment instead of embedding it in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    print("⚠️ OPENAI_API_KEY is not set; the answer-generation step will report an error.")

# === STEP 1: Extract text from PDFs ===
# Maps each PDF path to the full text extracted from it
all_text = {}

for pdf_file in pdf_files:
    try:
        # Open through a context manager so the document handle is released
        # even when a page fails to parse.
        with fitz.open(pdf_file) as doc:
            text = "".join(page.get_text() for page in doc)
        all_text[pdf_file] = text

        # Save extracted text next to the PDF. splitext swaps only the final
        # extension, whereas str.replace would rewrite every ".pdf" in the path.
        output_txt = os.path.splitext(pdf_file)[0] + '.txt'
        with open(output_txt, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"✅ Text extracted from {pdf_file} and saved to {output_txt}")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest
        print(f"⚠️ Error processing {pdf_file}: {e}")

print(f"✅ Text extracted from {len(all_text)} PDFs!")

# === STEP 2: Chunk the text ===
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated tokens), and
    each chunk is those words re-joined with single spaces.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size would
            otherwise produce an empty list and silently drop the whole text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Flatten every document into one list of chunks, with a parallel list that
# names the source file of each chunk.
chunks, chunk_sources = [], []
for file, text in all_text.items():
    file_chunks = chunk_text(text)
    chunks += file_chunks
    chunk_sources += [file] * len(file_chunks)

print(f"✅ Total chunks created: {len(chunks)}")

# Nothing to embed or search without at least one chunk, so stop here
if len(chunks) == 0:
    print("⚠️ No text chunks were created. The PDFs might be empty or could not be processed.")
    import sys
    sys.exit("Please add non-empty PDFs to continue.")

# === STEP 3: Text Embeddings ===
# Encode every chunk. Retrieval later uses FAISS result ids to index `chunks`,
# so row i of text_embeddings is relied on to correspond to chunks[i].
text_model = SentenceTransformer('all-MiniLM-L6-v2')
text_embeddings = text_model.encode(chunks, show_progress_bar=True)
print(f"✅ Text Embeddings shape: {text_embeddings.shape}")

# Ensure models directory exists
os.makedirs("models", exist_ok=True)

# Build an L2 index sized to the embedding width, fill it, and persist it to disk
text_index = faiss.IndexFlatL2(text_embeddings.shape[1])
text_index.add(np.array(text_embeddings))
faiss.write_index(text_index, "models/faiss_index_text.idx")
print("✅ FAISS index for text saved!")

# === STEP 4: Extract images from PDFs ===
# NOTE(review): pdf2image's convert_from_path appears to render each page as an
# image rather than pulling out embedded pictures — confirm this is the intent.
all_images = []

for pdf_file in pdf_files:
    try:
        images = convert_from_path(pdf_file)
        all_images.extend(images)
        print(f"✅ Extracted {len(images)} images from {pdf_file}")
    except Exception as e:
        # Best effort: a PDF that cannot be converted is reported and skipped
        print(f"⚠️ Error extracting images from {pdf_file}: {e}")

print(f"✅ Total images extracted: {len(all_images)}")

# Check if we have any images to process
if not all_images:
    print("⚠️ No images were extracted from the PDFs.")
    # Continue with just text processing, or exit if images are required
    # For this example, we'll continue with text only if no images
    print("Continuing with text processing only...")
else:
    # === STEP 5: Image Embeddings ===
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    image_embeddings = []

    for i, img in enumerate(all_images):
        try:
            inputs = clip_processor(images=img, return_tensors="pt")
            outputs = clip_model.get_image_features(**inputs)
            image_embeddings.append(outputs.detach().numpy())
            if (i + 1) % 10 == 0:
                print(f"Processed {i + 1}/{len(all_images)} images")
        except Exception as e:
            # NOTE(review): a failed image is skipped, so from that point on the
            # row number in image_embeddings no longer equals the position in all_images.
            print(f"⚠️ Error processing image {i}: {e}")

    if image_embeddings:
        # Stack the per-image arrays into one 2-D matrix before indexing
        image_embeddings = np.vstack(image_embeddings)
        print(f"✅ Image Embeddings shape: {image_embeddings.shape}")

        image_index = faiss.IndexFlatL2(image_embeddings.shape[1])
        image_index.add(image_embeddings)
        faiss.write_index(image_index, "models/faiss_index_images.idx")
        print("✅ FAISS index for images saved!")
    else:
        # image_index stays undefined on this path; the image-query step checks for that
        print("⚠️ No image embeddings were created.")

# === STEP 6: Text Retrieval ===
def search_text(query, top_k=5):
    """Return the index positions of the chunks nearest to ``query``.

    The query is embedded with the notebook-level ``text_model`` and looked up
    in the notebook-level ``text_index``.

    Args:
        query: The search string.
        top_k: Maximum number of neighbours to return.

    Returns:
        A 1-D array of at most ``top_k`` chunk positions, nearest first.
    """
    # FAISS pads missing neighbours with id -1, which would later select
    # chunks[-1]; never ask for more results than the index holds.
    top_k = min(top_k, text_index.ntotal)
    if top_k < 1:
        return np.empty(0, dtype=np.int64)
    query_vec = text_model.encode([query])
    D, I = text_index.search(np.array(query_vec), top_k)
    return I[0]

def retrieve_chunks(indices, chunks):
    """Return the entries of ``chunks`` at the given positions, in the order listed."""
    selected = []
    for position in indices:
        selected.append(chunks[position])
    return selected

# Example text query: embed the question, look up the nearest chunks, and map
# the returned positions back to their text.
query = "What is the revenue growth in 2022?"
print(f"\nExecuting text query: '{query}'")
top_indices = search_text(query)
retrieved = retrieve_chunks(top_indices, chunks)

print("🔍 Retrieved Text Chunks:")
for i, text in enumerate(retrieved):
    print(f"Chunk {i+1}:\n{text[:200]}...\n")  # Print a preview of each chunk

# === STEP 7: Generate Answer ===
def generate_answer(context_chunks, query):
    """Ask GPT-4 to answer ``query`` using the retrieved chunks as context.

    Args:
        context_chunks: Iterable of text chunks; joined with newlines into the prompt.
        query: The user question.

    Returns:
        The model's answer text, or a string starting with
        "Error generating response: " if the request fails. Failures are
        reported this way rather than raised.
    """
    context = "\n".join(context_chunks)
    prompt = f"Answer the query based on the following extracted context:\n{context}\n\nQuery: {query}"
    messages = [
        {"role": "system", "content": "You are an expert data analyst."},
        {"role": "user", "content": prompt}
    ]

    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed openai.ChatCompletion; requests go through a
            # client object and responses are attribute-based.
            client = openai.OpenAI(api_key=openai.api_key)
            response = client.chat.completions.create(model="gpt-4", messages=messages)
            return response.choices[0].message.content
        # Legacy (<1.0) interface
        response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"⚠️ Error generating answer: {e}")
        return "Error generating response: " + str(e)

# generate_answer returns an error string instead of raising, so something is printed either way
answer = generate_answer(retrieved, query)
print("\n🧠 GPT-4 Answer:")
print(answer)

# === STEP 7B: Image Query Support ===
# image_index is only assigned when at least one image was embedded, so its
# presence is checked by name before the search helper is defined.
if 'image_index' in locals() and all_images:
    # Only run this if we have images and an image index
    def search_image_query(image_pil, top_k=5):
        """Return the ids of the ``top_k`` indexed images nearest to ``image_pil``.

        The query image is embedded with the same CLIP processor/model used to
        build ``image_index``.
        """
        inputs = clip_processor(images=image_pil, return_tensors="pt")
        features = clip_model.get_image_features(**inputs)
        img_vec = features.detach().numpy()
        D, I = image_index.search(img_vec, top_k)
        return I[0]  # return top matching image indices

    print("\n✅ Image query functionality is ready to use.")
else:
    print("\n⚠️ Image query functionality is not available (no images processed).")

In [None]:
# Install the libraries needed to load the model below
!pip install transformers accelerate


from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the microsoft/phi-2 tokenizer and causal-LM weights.
# NOTE(review): `tokenizer` and `model` are not referenced by any other visible cell — confirm they are still needed.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")




In [None]:
# Install required packages (unpinned, so the newest release of each is installed)
!pip install -q PyMuPDF faiss-cpu pdf2image transformers sentence-transformers openai pytesseract scikit-learn matplotlib seaborn nltk rouge-score pandas Pillow torch

# Download NLTK data
# NOTE(review): no NLTK tokenizer call is visible in this notebook (BLEU below is
# fed str.split() tokens) — confirm whether the 'punkt' download is still needed.
import nltk
nltk.download('punkt')

# Install tesseract for OCR, and poppler-utils as a system package
!apt-get update && apt-get install -y tesseract-ocr
!apt-get install -y poppler-utils

import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from pdf2image import convert_from_path
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from sentence_transformers import SentenceTransformer
import openai
import pytesseract
import pickle
import time
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import pandas as pd
from collections import defaultdict

# === STEP 0: Setup ===
pdf_folder = "sample_data"  # Your folder containing PDFs

# Create the directory if it doesn't exist. After this call the folder is guaranteed
# to be present, so no separate "directory missing" branch is needed.
os.makedirs(pdf_folder, exist_ok=True)
print(f"Checking directory: {os.path.abspath(pdf_folder)}")
print(f" Directory exists. Contents: {os.listdir(pdf_folder)}")

# Get PDF files from the folder. Sorted because os.listdir returns entries in
# arbitrary order, which would otherwise make chunk numbering differ between runs.
pdf_files = sorted(os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith('.pdf'))
print(f"Found {len(pdf_files)} PDF files: {pdf_files}")

# Stop early when there is nothing to process
if not pdf_files:
    print("No PDF files found in the directory. Please add PDF files to the sample_data folder.")
    import sys
    sys.exit("Please add PDF files to continue.")

# The original line had no right-hand side, so the whole cell failed to parse.
# Read the key from the environment instead of pasting it into the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    print("OPENAI_API_KEY is not set; the answer-generation step will report an error.")

# === STEP 1: Extract text from PDFs (with OCR for image-based PDFs) ===
all_text = {}

def extract_text_from_pdf_with_ocr(pdf_file):
    """Extract the text of a PDF, falling back to OCR on pages without a text layer.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        The concatenated text of all pages, in page order.
    """
    parts = []
    # The context manager closes the document even when parsing or OCR raises.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            page_text = page.get_text("text")
            # A page whose text layer is empty or only whitespace is treated as
            # image-based and sent through OCR.
            if page_text.strip():
                parts.append(page_text)
            else:
                pixmap = page.get_pixmap()
                pil_image = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
                parts.append(pytesseract.image_to_string(pil_image))
    return "".join(parts)

# Process each PDF and extract text
for pdf_file in pdf_files:
    try:
        text = extract_text_from_pdf_with_ocr(pdf_file)
        all_text[pdf_file] = text

        # Save extracted text next to the PDF. splitext swaps only the final
        # extension, whereas str.replace would rewrite every ".pdf" in the path.
        output_txt = os.path.splitext(pdf_file)[0] + '.txt'
        with open(output_txt, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f" Text extracted from {pdf_file} and saved to {output_txt}")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest
        print(f"Error processing {pdf_file}: {e}")

print(f" Text extracted from {len(all_text)} PDFs!")

# === STEP 2: Chunk the text ===
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated tokens), and
    each chunk is those words re-joined with single spaces.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size would
            otherwise produce an empty list and silently drop the whole text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Flatten every document into one list of chunks, with a parallel list that
# names the source file of each chunk.
chunks, chunk_sources = [], []
for file, text in all_text.items():
    file_chunks = chunk_text(text)
    chunks += file_chunks
    chunk_sources += [file] * len(file_chunks)

print(f" Total chunks created: {len(chunks)}")

# Nothing to embed or search without at least one chunk, so stop here
if len(chunks) == 0:
    print("No text chunks were created. The PDFs might be empty or could not be processed.")
    import sys
    sys.exit("Please add non-empty PDFs to continue.")

# === STEP 3: Text Embeddings ===
# Encode every chunk. Retrieval later uses FAISS result ids to index `chunks`,
# so row i of text_embeddings is relied on to correspond to chunks[i].
text_model = SentenceTransformer('all-MiniLM-L6-v2')
text_embeddings = text_model.encode(chunks, show_progress_bar=True)
print(f"Text Embeddings shape: {text_embeddings.shape}")

# Ensure models directory exists
os.makedirs("models", exist_ok=True)

# Build an L2 index sized to the embedding width, fill it, and persist it to disk
text_index = faiss.IndexFlatL2(text_embeddings.shape[1])
text_index.add(np.array(text_embeddings))
faiss.write_index(text_index, "models/faiss_index_text.idx")
print("FAISS index for text saved!")

# === STEP 4: Extract images from PDFs ===
# NOTE(review): pdf2image's convert_from_path appears to render each page as an
# image rather than pulling out embedded pictures — confirm this is the intent.
all_images = []
image_sources = []  # Track which PDF each image came from (parallel to all_images)

for pdf_file in pdf_files:
    try:
        images = convert_from_path(pdf_file)
        all_images.extend(images)
        image_sources.extend([pdf_file] * len(images))
        print(f" Extracted {len(images)} images from {pdf_file}")
    except Exception as e:
        # Best effort: a PDF that cannot be converted is reported and skipped
        print(f" Error extracting images from {pdf_file}: {e}")

print(f" Total images extracted: {len(all_images)}")

# Check if we have any images to process
if not all_images:
    print("No images were extracted from the PDFs.")
    print("Continuing with text processing only...")
    image_index = None
else:
    # === STEP 5: Image Embeddings ===
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    image_embeddings = []
    embedded_positions = []  # positions in all_images that produced an embedding

    for i, img in enumerate(all_images):
        try:
            inputs = clip_processor(images=img, return_tensors="pt")
            outputs = clip_model.get_image_features(**inputs)
            image_embeddings.append(outputs.detach().numpy())
            embedded_positions.append(i)
            if (i + 1) % 10 == 0:
                print(f"Processed {i + 1}/{len(all_images)} images")
        except Exception as e:
            print(f" Error processing image {i}: {e}")

    if image_embeddings:
        # Drop images that failed to embed so that FAISS row ids, all_images and
        # image_sources stay index-aligned (row i <-> all_images[i] <-> image_sources[i]).
        if len(embedded_positions) != len(all_images):
            all_images = [all_images[pos] for pos in embedded_positions]
            image_sources = [image_sources[pos] for pos in embedded_positions]

        # Stack the per-image arrays into one 2-D matrix before indexing
        image_embeddings = np.vstack(image_embeddings)
        print(f"Image Embeddings shape: {image_embeddings.shape}")

        image_index = faiss.IndexFlatL2(image_embeddings.shape[1])
        image_index.add(image_embeddings)
        faiss.write_index(image_index, "models/faiss_index_images.idx")
        print(" FAISS index for images saved!")
    else:
        print("No image embeddings were created.")
        image_index = None

# === STEP 6: Text Retrieval ===
def search_text(query, top_k=5):
    """Find the chunks nearest to ``query`` and time the lookup.

    The query is embedded with the notebook-level ``text_model`` and looked up
    in the notebook-level ``text_index``.

    Args:
        query: The search string.
        top_k: Maximum number of neighbours to return.

    Returns:
        A tuple ``(indices, distances, elapsed_seconds)`` where ``indices`` and
        ``distances`` are 1-D arrays of equal length (at most ``top_k``),
        nearest first.
    """
    start_time = time.time()
    # FAISS pads missing neighbours with id -1, which would later select
    # chunks[-1]; never ask for more results than the index holds.
    top_k = min(top_k, text_index.ntotal)
    if top_k < 1:
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.float32), time.time() - start_time
    query_vec = text_model.encode([query])
    D, I = text_index.search(np.array(query_vec), top_k)
    elapsed_time = time.time() - start_time
    return I[0], D[0], elapsed_time  # Return indices, distances, and query time

def retrieve_chunks(indices, chunks):
    """Return the entries of ``chunks`` at the given positions, in the order listed."""
    selected = []
    for position in indices:
        selected.append(chunks[position])
    return selected

# Example text query: embed the question, look up the nearest chunks, and map
# the returned positions back to their text.
query = "What is the revenue growth in 2022?"
print(f"\nExecuting text query: '{query}'")
top_indices, distances, query_time = search_text(query)
retrieved = retrieve_chunks(top_indices, chunks)

# query_time covers both embedding the query and the index search
print(f"🔍 Retrieved Text Chunks (in {query_time:.2f} seconds):")
for i, (text, dist) in enumerate(zip(retrieved, distances)):
    print(f"Chunk {i+1} (Distance: {dist:.4f}):\n{text[:200]}...\n")  # Print a preview of each chunk

# === STEP 7: Generate Answer ===
def generate_answer(context_chunks, query):
    """Ask GPT-4 to answer ``query`` using the retrieved chunks as context.

    Args:
        context_chunks: Iterable of text chunks; joined with newlines into the prompt.
        query: The user question.

    Returns:
        A tuple ``(answer, elapsed_seconds)``. If the request fails, the answer
        is a string starting with "Error generating response: " and the elapsed
        time is -1; failures are reported this way rather than raised.
    """
    start_time = time.time()
    context = "\n".join(context_chunks)
    prompt = f"Answer the query based on the following extracted context:\n{context}\n\nQuery: {query}"
    messages = [{"role": "system", "content": "You are an expert data analyst."},
                {"role": "user", "content": prompt}]

    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed openai.ChatCompletion; requests go through a
            # client object and responses are attribute-based.
            client = openai.OpenAI(api_key=openai.api_key)
            response = client.chat.completions.create(model="gpt-4", messages=messages)
            answer = response.choices[0].message.content
        else:
            # Legacy (<1.0) interface
            response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
            answer = response["choices"][0]["message"]["content"]
        elapsed_time = time.time() - start_time
        return answer, elapsed_time
    except Exception as e:
        print(f"Error generating answer: {e}")
        return "Error generating response: " + str(e), -1

# generate_answer reports failures through its return value: on error `answer`
# holds the error text and generation_time is -1.
answer, generation_time = generate_answer(retrieved, query)
print(f"\nGPT-4 Answer (generated in {generation_time:.2f} seconds):")
print(answer)

# === STEP 7B: Image Query Support ===
def search_image_query(image_pil, top_k=5):
    """Find the indexed images nearest to ``image_pil`` and time the lookup.

    The query image is embedded with the notebook-level CLIP processor/model and
    looked up in the notebook-level ``image_index``.

    Args:
        image_pil: The query image, passed to ``clip_processor`` as ``images=``.
        top_k: Maximum number of neighbours to return.

    Returns:
        A tuple ``(indices, distances, elapsed_seconds)``, nearest first.
        ``([], [], -1)`` is returned when no image index is available or no
        neighbours can be requested.
    """
    if image_index is None:
        return [], [], -1

    # FAISS pads missing neighbours with id -1; never ask for more results than
    # the index holds so callers only ever receive valid image positions.
    top_k = min(top_k, image_index.ntotal)
    if top_k < 1:
        return [], [], -1

    start_time = time.time()
    inputs = clip_processor(images=image_pil, return_tensors="pt")
    features = clip_model.get_image_features(**inputs)
    img_vec = features.detach().numpy()
    D, I = image_index.search(img_vec, top_k)
    elapsed_time = time.time() - start_time
    return I[0], D[0], elapsed_time  # return top matching image indices, distances, and time

# image_index is None when no images were extracted or none could be embedded
if image_index is not None:
    print("\n Image query functionality is ready to use.")
else:
    print("\n Image query functionality is not available (no images processed).")

# === STEP 8: Evaluation and Visualization ===
print("\n=== STEP 8: Evaluation and Visualization ===")

# Create results directory (all plots and the metrics CSV below are written here)
os.makedirs("evaluation_results", exist_ok=True)

# === 8.1: Visualize Embedding Space ===
def _scatter_by_label(points, labels):
    """Scatter 2-D ``points`` on the current axes, one colour per distinct label.

    When ``labels`` is None all points are drawn in a single colour and no
    legend is added.
    """
    if labels is None:
        plt.scatter(points[:, 0], points[:, 1], alpha=0.7, s=10)
        return

    # Sorted so that each label gets the same colour on every run and in both panels.
    unique_labels = sorted(set(labels))
    colors = sns.color_palette("husl", len(unique_labels))
    label_array = np.asarray(labels)
    for color, label in zip(colors, unique_labels):
        mask = label_array == label
        plt.scatter(points[mask, 0], points[mask, 1],
                    label=os.path.basename(label), alpha=0.7, s=10, color=color)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')


def visualize_embeddings(embeddings, labels=None, title="Embedding Space Visualization", filename="embedding_viz.png"):
    """Visualize embeddings using PCA and t-SNE.

    Draws a PCA projection (left) and a t-SNE projection (right) side by side
    and saves the figure to ``evaluation_results/<filename>``.

    Args:
        embeddings: 2-D array-like with one row per item.
        labels: Optional sequence with one label per row (file paths in this
            notebook); points are coloured per label and the legend shows each
            label's basename.
        title: Text used in both subplot titles.
        filename: Name of the PNG written inside ``evaluation_results``.

    Returns:
        None. When fewer than two rows are supplied nothing is plotted or saved,
        because neither projection can be computed.
    """
    embeddings = np.asarray(embeddings)
    n_samples = len(embeddings)
    if n_samples < 2:
        print(f" Skipping embedding visualization for {title}: need at least 2 samples, got {n_samples}")
        return

    plt.figure(figsize=(20, 10))

    # PCA for dimensionality reduction
    plt.subplot(1, 2, 1)
    reduced_embeddings = PCA(n_components=2).fit_transform(embeddings)
    _scatter_by_label(reduced_embeddings, labels)
    plt.title(f"PCA of {title}")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")

    # t-SNE for better cluster visualization
    plt.subplot(1, 2, 2)
    # Use a sample if there are too many points for t-SNE
    max_tsne_samples = 5000
    if n_samples > max_tsne_samples:
        # Seeded so the subsample (and therefore the plot) is reproducible.
        rng = np.random.default_rng(42)
        indices = rng.choice(n_samples, max_tsne_samples, replace=False)
        tsne_embeddings = embeddings[indices]
        tsne_labels = [labels[i] for i in indices] if labels is not None else None
    else:
        tsne_embeddings = embeddings
        tsne_labels = labels

    # scikit-learn requires perplexity < n_samples, so shrink it for small inputs.
    # The iteration count is left at the library default (1000, the value used
    # before) because the keyword was renamed from n_iter to max_iter.
    perplexity = min(30, len(tsne_embeddings) - 1)
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    tsne_results = tsne.fit_transform(tsne_embeddings)
    _scatter_by_label(tsne_results, tsne_labels)

    plt.title(f"t-SNE of {title}")
    plt.tight_layout()
    plt.savefig(os.path.join("evaluation_results", filename), dpi=300)
    plt.close()
    print(f" Embedding visualization saved to evaluation_results/{filename}")

# Visualize text embeddings, coloured by the PDF each chunk came from
visualize_embeddings(text_embeddings, labels=chunk_sources,
                    title="Text Chunk Embeddings", filename="text_embeddings_viz.png")

# Visualize image embeddings if available (image_index is None when nothing was embedded)
if image_index is not None:
    visualize_embeddings(image_embeddings, labels=image_sources,
                        title="Image Embeddings", filename="image_embeddings_viz.png")

# === 8.2: Evaluate Retrieval Quality ===
def evaluate_retrieval_quality(query, ground_truth_chunks, retrieved_chunks, distances):
    """Evaluate retrieval quality for a given query.

    Args:
        query: The query string that produced the retrieval.
        ground_truth_chunks: Chunk texts considered relevant.
        retrieved_chunks: Chunk texts returned by the retriever.
        distances: Retrieval distances, one per retrieved chunk (array or list).

    Returns:
        A dict with:
          * ``hit_rate`` – fraction of ground-truth chunks present in the
            retrieved set (0 when no ground truth is given);
          * ``mean_similarity`` – mean cosine similarity between the query and
            the retrieved chunks, using ``text_model`` embeddings;
          * ``similarity_scores`` – the per-chunk cosine similarities;
          * ``retrieval_distances`` – ``distances`` as a plain list.
    """
    # Convert to sets for easier intersection calculation
    ground_truth_set = set(ground_truth_chunks)
    retrieved_set = set(retrieved_chunks)
    hit_rate = len(ground_truth_set & retrieved_set) / len(ground_truth_set) if ground_truth_set else 0

    # Accept lists as well as arrays
    distance_list = np.asarray(distances).tolist()

    # Nothing retrieved: there is nothing to embed or average.
    if len(retrieved_chunks) == 0:
        return {
            "hit_rate": hit_rate,
            "mean_similarity": 0.0,
            "similarity_scores": [],
            "retrieval_distances": distance_list
        }

    # Calculate semantic similarity between query and retrieved chunks
    query_vec = np.asarray(text_model.encode([query]))
    retrieved_vecs = np.asarray(text_model.encode(list(retrieved_chunks)))

    # Cosine similarity between query and each chunk; a zero-norm vector gets
    # similarity 0 instead of triggering a division by zero.
    dots = np.dot(retrieved_vecs, query_vec.T).flatten()
    norms = np.linalg.norm(retrieved_vecs, axis=1) * np.linalg.norm(query_vec)
    similarities = np.divide(dots, norms, out=np.zeros(dots.shape, dtype=float), where=norms > 0)

    return {
        "hit_rate": hit_rate,
        "mean_similarity": float(np.mean(similarities)),
        "similarity_scores": similarities.tolist(),
        "retrieval_distances": distance_list
    }

# For this example, we don't have ground truth, so we'll simulate it
# In a real scenario, you would have actual ground truth data
# Let's just assume the first retrieved chunk is relevant for this example
# (so the reported hit rate is 1.0 by construction, not a real quality measurement)
simulated_ground_truth = [retrieved[0]]
retrieval_metrics = evaluate_retrieval_quality(query, simulated_ground_truth, retrieved, distances)

print("\n📊 Retrieval Evaluation:")
print(f"Hit Rate: {retrieval_metrics['hit_rate']:.2f}")
print(f"Mean Semantic Similarity: {retrieval_metrics['mean_similarity']:.4f}")

# Visualize retrieval metrics: cosine similarity (left) and FAISS distance (right) per retrieved chunk
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.bar(range(len(retrieved)), retrieval_metrics["similarity_scores"])
plt.title("Query-Chunk Semantic Similarity")
plt.xlabel("Retrieved Chunk Index")
plt.ylabel("Cosine Similarity")

plt.subplot(1, 2, 2)
plt.bar(range(len(retrieved)), retrieval_metrics["retrieval_distances"])
plt.title("FAISS Retrieval Distances")
plt.xlabel("Retrieved Chunk Index")
plt.ylabel("L2 Distance (lower is better)")

plt.tight_layout()
plt.savefig(os.path.join("evaluation_results", "retrieval_metrics.png"), dpi=300)
plt.close()  # release the figure once it has been written to disk
print("✅ Retrieval metrics visualization saved")

# === 8.3: Evaluate Generated Response ===
def evaluate_generated_response(reference_answer, generated_answer):
    """Score ``generated_answer`` against ``reference_answer`` with BLEU and ROUGE.

    BLEU is computed on whitespace-split tokens with smoothing method 1; ROUGE-1,
    ROUGE-2 and ROUGE-L F-measures come from ``rouge_scorer`` with stemming on.

    Returns:
        A dict with keys "bleu", "rouge1_f", "rouge2_f" and "rougeL_f". If any
        step raises, a warning is printed and every score is reported as 0.
    """
    rouge_variants = ['rouge1', 'rouge2', 'rougeL']
    try:
        # BLEU on whitespace tokens
        smoother = SmoothingFunction().method1
        references = [reference_answer.split()]
        hypothesis = generated_answer.split()
        bleu_score = sentence_bleu(references, hypothesis, smoothing_function=smoother)

        # ROUGE on the raw strings
        rouge_scores = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True).score(
            reference_answer, generated_answer
        )

        result = {"bleu": bleu_score}
        for key, variant in (("rouge1_f", "rouge1"), ("rouge2_f", "rouge2"), ("rougeL_f", "rougeL")):
            result[key] = rouge_scores[variant].fmeasure
        return result
    except Exception as e:
        print(f"⚠️ Error calculating evaluation metrics: {e}")
        return dict.fromkeys(("bleu", "rouge1_f", "rouge2_f", "rougeL_f"), 0)

# For demonstration purposes, let's use a simulated reference answer
# In a real evaluation, this would be a human-written or known correct answer
# (the scores below therefore only demonstrate the plumbing, not real answer quality)
simulated_reference = "In 2022, the company saw a 15% increase in revenue compared to the previous year."
response_metrics = evaluate_generated_response(simulated_reference, answer)

print("\nResponse Quality Evaluation:")
print(f"BLEU Score: {response_metrics['bleu']:.4f}")
print(f"ROUGE-1 F1: {response_metrics['rouge1_f']:.4f}")
print(f"ROUGE-2 F1: {response_metrics['rouge2_f']:.4f}")
print(f"ROUGE-L F1: {response_metrics['rougeL_f']:.4f}")

# Visualize response metrics: one bar per metric, in the dict's key order
plt.figure(figsize=(8, 6))
metrics = list(response_metrics.keys())
values = list(response_metrics.values())

plt.bar(metrics, values)
plt.title("Generated Response Quality Metrics")
plt.ylabel("Score")
plt.ylim(0, 1)  # fixed 0-1 y-axis range

plt.tight_layout()
plt.savefig(os.path.join("evaluation_results", "response_metrics.png"), dpi=300)
plt.close()  # release the figure once it has been written to disk
print("Response metrics visualization saved")

# === 8.4: Performance Dashboard ===
def create_performance_dashboard(num_queries=1):
    """Plot and persist a one-run summary of corpus size, timings and quality metrics.

    Reads the notebook-level results (``pdf_files``, ``chunks``, ``all_images``,
    ``query_time``, ``generation_time``, ``retrieval_metrics`` and
    ``response_metrics``), draws a 2x2 grid of bar charts saved as
    ``evaluation_results/performance_dashboard.png`` and writes the same numbers
    as a single-row CSV.

    Args:
        num_queries: Accepted but not used by the body.

    Returns:
        The dict of summary values that was plotted and saved.
    """
    image_count = len(all_images) if all_images else 0
    data = {
        "Total PDFs Processed": len(pdf_files),
        "Total Text Chunks": len(chunks),
        "Total Images": image_count,
        "Average Query Time (s)": query_time,
        "Average Generation Time (s)": generation_time,
        "Average Hit Rate": retrieval_metrics["hit_rate"],
        "Average Semantic Similarity": retrieval_metrics["mean_similarity"],
        "Average BLEU Score": response_metrics["bleu"],
        "Average ROUGE-L F1": response_metrics["rougeL_f"]
    }

    out_dir = "evaluation_results"
    fig, ax = plt.subplots(2, 2, figsize=(15, 10))
    stats_ax, timing_ax = ax[0, 0], ax[0, 1]
    retrieval_ax, response_ax = ax[1, 0], ax[1, 1]

    # Top-left: how much material was processed
    stats_heights = [data['Total PDFs Processed'], data['Total Text Chunks'], data['Total Images']]
    stats_ax.bar(['PDFs', 'Text Chunks', 'Images'], stats_heights)
    stats_ax.set_title('Processing Statistics')
    stats_ax.set_ylabel('Count')

    # Top-right: retrieval vs. generation latency
    timing_heights = [data['Average Query Time (s)'], data['Average Generation Time (s)']]
    timing_ax.bar(['Query', 'Generation'], timing_heights)
    timing_ax.set_title('Time Performance')
    timing_ax.set_ylabel('Seconds')

    # Bottom-left: retrieval quality, fixed 0-1 axis
    retrieval_heights = [data['Average Hit Rate'], data['Average Semantic Similarity']]
    retrieval_ax.bar(['Hit Rate', 'Semantic Similarity'], retrieval_heights)
    retrieval_ax.set_title('Retrieval Quality')
    retrieval_ax.set_ylim(0, 1)

    # Bottom-right: answer quality, fixed 0-1 axis
    response_heights = [data['Average BLEU Score'], data['Average ROUGE-L F1']]
    response_ax.bar(['BLEU', 'ROUGE-L'], response_heights)
    response_ax.set_title('Response Quality')
    response_ax.set_ylim(0, 1)

    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "performance_dashboard.png"), dpi=300)
    plt.close()

    # Keep a machine-readable copy of the same numbers
    summary_frame = pd.DataFrame([data])
    summary_frame.to_csv(os.path.join(out_dir, "performance_metrics.csv"), index=False)
    print(" Performance dashboard created")

    return data

dashboard_data = create_performance_dashboard()

# === STEP 9: Save Models and Embeddings ===
print("\n=== STEP 9: Save Models and Embeddings ===")

# Save FAISS indices (the same indices were already written once under
# models/faiss_index_*.idx; these are additional copies under different names)
faiss.write_index(text_index, "models/text_index.faiss")
if image_index is not None:
    faiss.write_index(image_index, "models/image_index.faiss")

# Save embeddings
np.save("models/text_embeddings.npy", text_embeddings)
# image_embeddings is undefined when no images were extracted and an empty list
# when none could be embedded; both cases are skipped here.
if 'image_embeddings' in locals() and len(image_embeddings) > 0:
    np.save("models/image_embeddings.npy", image_embeddings)

# Save text chunks and metadata
with open("models/text_chunks.pkl", "wb") as f:
    pickle.dump({
        "chunks": chunks,
        "sources": chunk_sources,
        "creation_time": time.time()
    }, f)

# Save evaluation results
with open("evaluation_results/evaluation_summary.pkl", "wb") as f:
    pickle.dump({
        "retrieval_metrics": retrieval_metrics,
        "response_metrics": response_metrics,
        "performance_dashboard": dashboard_data,
        "timestamp": time.time()
    }, f)

print(" All models, embeddings, and evaluation results saved to disk!")

# For downloading in Google Colab
# The import doubles as the environment check: outside Colab it raises
# ImportError and the downloads are skipped.
try:
    from google.colab import files
    # Download the saved files
    files.download("models/text_index.faiss")
    if image_index is not None:
        files.download("models/image_index.faiss")
    files.download("models/text_embeddings.npy")
    # Same guard as when saving: only offer the image embeddings if they were written
    if 'image_embeddings' in locals() and len(image_embeddings) > 0:
        files.download("models/image_embeddings.npy")
    files.download("models/text_chunks.pkl")
    files.download("evaluation_results/performance_dashboard.png")
    files.download("evaluation_results/performance_metrics.csv")
    print("All files prepared for download!")
except ImportError:
    print("Files saved locally. Not running in Colab, so no downloads initiated.")

print("\nPDF  RAG System with Evaluation Complete!")
print("Check the 'evaluation_results' folder for visualizations and metrics.")