# ColPali RAG System Demo (Google Colab)

This notebook demonstrates the ColPali-based RAG system with GPU acceleration.

**Before running:** Set Runtime > Change runtime type > Hardware accelerator > GPU (T4 recommended)

## 1. Install Dependencies

In [None]:
# Install system dependencies for PDF processing
!apt-get update -qq
!apt-get install -y -qq poppler-utils

# Install Python packages
!pip install -q colpali-engine pdf2image Pillow torch torchvision openai chromadb

## 2. Mount Google Drive (Recommended)

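Upload the entire `colpali_colab_upload/` folder to the top level of your Google Drive ("My Drive"), then mount it here. If you place it somewhere else, update `BASE_DIR` in the cell below to match.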

In [None]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Change to the uploaded folder
BASE_DIR = '/content/drive/MyDrive/colpali_colab_upload'
os.chdir(BASE_DIR)

print(f"Working directory: {os.getcwd()}")
print(f"\nFiles in directory:")
!ls -la

## 3. Check GPU Availability

In [None]:
import torch

# Ask PyTorch whether a CUDA device is visible; device details are printed only if so
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")

if cuda_ok:
    # Index 0 is the first visible CUDA device
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

## 4. Load and Test System

In [None]:
import json
import os
from colpali_rag_system import ColPaliRAGSystem

# Construct the RAG system with its default settings, bracketed by status
# messages (per the original note, it uses the GPU when one is available)
print("Initializing ColPali RAG System...")
rag = ColPaliRAGSystem()
print("System initialized!")

# Report the store's statistics as they stand before this session adds any posts
stats = rag.get_stats()
print("\nCurrent stats:", stats)

## 5. Load Posts from Dashboard

In [None]:
# Post data lives under one base directory; the dashboard file is derived from it
# so the two paths cannot drift apart.
posts_base_dir = 'posts'
posts_file = f'{posts_base_dir}/dashboard.json'

# JSON text is UTF-8; state it explicitly rather than relying on the locale default
with open(posts_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

posts = data['posts']
print(f"Found {len(posts)} posts")

# Preview what will be processed: only attachments typed 'pdf' or 'image' are listed
visual_types = ('pdf', 'image')
for post in posts:
    # `or []` also covers an explicit null "attachments" value in the JSON
    attachments = post.get('attachments') or []
    image_atts = [att for att in attachments if att.get('file_type') in visual_types]
    if image_atts:
        print(f"\nPost: {post['title']}")
        for att in image_atts:
            print(f"  - {att['file_type']}: {att['filename']}")

## 6. Process Posts

In [None]:
def _summarize_added(result):
    """Return a short summary of what an add_post result reports as added.

    Reads the optional count keys 'texts', 'pdf_count', 'total_pages' and
    'image_file_count' from ``result`` (a missing key counts as 0) and joins the
    non-zero ones, e.g. "1 text, 2 pdf (7 pages), 3 images".
    Returns an empty string when every count is 0.
    """
    texts = result.get('texts', 0)
    pdf_count = result.get('pdf_count', 0)
    image_count = result.get('image_file_count', 0)
    total_pages = result.get('total_pages', 0)

    parts = []
    if texts > 0:
        parts.append(f"{texts} text")
    if pdf_count > 0:
        parts.append(f"{pdf_count} pdf ({total_pages} pages)")
    if image_count > 0:
        parts.append(f"{image_count} image" if image_count == 1 else f"{image_count} images")
    return ', '.join(parts)


print("\nProcessing posts...\n")

# Titles of posts whose result was not marked successful, for the closing summary
failed_titles = []

for i, post in enumerate(posts, 1):
    print(f"[{i}/{len(posts)}] Processing: {post['title']}")

    result = rag.add_post(post, base_dir=posts_base_dir)

    if not result.get('success'):
        failed_titles.append(post['title'])
        print("  → Failed")
    elif result.get('skipped'):
        print("  → Already processed")
    else:
        summary = _summarize_added(result)
        # Avoid a dangling "Added: " when the result reports nothing
        print(f"  → Added: {summary or 'nothing (all reported counts were 0)'}")

# Do not claim full success when some posts failed
if failed_titles:
    print(f"\nProcessed {len(posts) - len(failed_titles)}/{len(posts)} posts; {len(failed_titles)} failed:")
    for title in failed_titles:
        print(f"  - {title}")
else:
    print("\nAll posts processed!")

# Show final stats
stats = rag.get_stats()
print("\nFinal stats:")
print(f"  Total text items: {stats['total_texts']}")
print(f"  Total image items: {stats['total_images']}")
print(f"  Total items: {stats['total_items']}")

## 7. Test Queries

In [None]:
queries = [
    "Where can we get the solution for assignment 2 question 5b",
    "When can we get assignment 2 solutions?",
    "Looking for final project group",
]

# Horizontal rule printed above and below each query heading
rule = '=' * 70

for query_no, query in enumerate(queries, start=1):
    print("\n" + rule)
    print(f'Query {query_no}: "{query}"')
    print(rule)

    results = rag.retrieve(query, n_results=5, similarity_threshold=0.5)

    # Best match, if the retriever reported one
    top = results['top_match']
    if not top:
        print("\n→ No relevant information found")
    else:
        print(f"\n→ TOP MATCH: POST - {top['title']}")
        print(f"  Author: {top.get('author', 'Unknown')}")
        print(f"  Similarity: {top.get('similarity', 0):.4f}")

        # Say whether the hit came from an attached image/PDF or from the post text
        if top.get('matched_via') != 'image':
            print("  Matched via post text")
        else:
            file_type = top.get('matched_file_type', 'unknown')
            print(f"  Matched via {file_type}: {top['matched_image']}")

        # List the attachments of the matched post
        attachments = top.get('attachments', [])
        if attachments:
            print("\n  Attachments in post:")
            for att in attachments:
                print(f"    • {att['filename']} ({att['file_type']})")

    # Up to three further similar posts
    similar_posts = results['similar_posts']
    if similar_posts:
        print("\n  Similar posts:")
        for rank, post in enumerate(similar_posts[:3], start=1):
            print(f"    {rank}. {post['title']} (sim: {post.get('similarity', 0):.4f})")

    # Up to three related course materials
    course_materials = results['course_materials']
    if course_materials:
        print("\n  Related materials:")
        for rank, material in enumerate(course_materials[:3], start=1):
            print(f"    {rank}. {material['filename']} (sim: {material.get('similarity', 0):.4f})")

## 8. Custom Query

Try your own query:

In [None]:
# Replace the placeholder text with the question you want to ask
custom_query = "Your question here"

# Retrieve up to five results; no similarity threshold is passed here,
# so the retriever's own default applies
results = rag.retrieve(custom_query, n_results=5)

# Report the best match, or say that nothing relevant came back
top = results['top_match']
if not top:
    print("No relevant match found")
else:
    print(f"TOP MATCH: {top['title']}")
    print(f"Similarity: {top.get('similarity', 0):.4f}")
    print(f"Matched via: {top.get('matched_via')}")

## 9. Download Results

Download the processed embeddings to use later:

In [None]:
# Zip the database folder
!zip -r colpali_rag_db.zip colpali_rag_db/

# Download
from google.colab import files
files.download('colpali_rag_db.zip')