# Image Search Engine Demo with CLIP and Milvus

This notebook demonstrates how to use the image search engine, which combines CLIP embeddings with the Milvus vector database.

In [None]:
import sys
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image
from datetime import datetime

# Add src to path
sys.path.insert(0, str(Path.cwd().parent))

from src import ImageSearchEngine, CLIPExtractor
from src.utils import load_image_paths

## 1. Initialize the Search Engine with CLIP

In [None]:
# Settings for the primary engine: CLIP ViT-Large-Patch14 as the embedding
# model, plus the collection name and database file path handed to the engine.
engine_settings = {
    "model_name": "openai/clip-vit-large-patch14",
    "collection_name": "image_search",
    "db_path": "../data/index/milvus_lite.db",
}
engine = ImageSearchEngine(**engine_settings)

print("✓ Search engine initialized with CLIP and Milvus")

## 2. Build or Load Index

Milvus persists its data automatically. If an index was already built (detected by the presence of `engine_metadata.pkl` in the index directory), the cell below loads it instead of rebuilding it.

In [None]:
# Where the raw images live and where the index artefacts are kept
image_dir = Path("../data/raw")
index_dir = Path("../data/index")

# A saved metadata file signals that an index was built on an earlier run
metadata_file = index_dir / "engine_metadata.pkl"

if not metadata_file.exists():
    print("Building new index with CLIP embeddings...")
    print("This may take a while for the first run (downloading CLIP model)")
    engine.build_index(
        image_dir=image_dir,
        batch_size=16,  # Adjust based on your GPU/CPU
        save_path=index_dir,
        drop_existing=False
    )
else:
    print("Loading existing index...")
    engine.load(index_dir)

# Summarise the collection; each pair is (printed label, key in the stats dict)
stats = engine.get_stats()
print("\n✓ Index ready")
for label, key in (
    ("Collection", "name"),
    ("Images", "num_entities"),
    ("Model", "model_name"),
    ("Embedding dimension", "feature_dim"),
):
    print(f"  {label}: {stats[key]}")

## 3. Perform a Search

In [None]:
# Collect the images available under the image directory
image_paths = load_image_paths(image_dir)

if len(image_paths) == 0:
    print("No images found. Please add images to data/raw/")
else:
    # The first returned path serves as the query
    query_image_path = image_paths[0]
    print(f"Query image: {query_image_path.name}")

    # Ask the engine for the 10 best matches
    results = engine.search(query_image_path, top_k=10)

    # Report the five best hits together with their file metadata
    print("\nTop 5 similar images:")
    bytes_per_mb = 1024 * 1024
    for result in results[:5]:
        modified = datetime.fromtimestamp(result['modified_time'])
        mod_time = f"{modified:%Y-%m-%d %H:%M}"
        size_mb = result['file_size'] / bytes_per_mb
        print(f"  {result['rank']}. {result['filename']}")
        print(f"     Score: {result['score']:.4f} | Size: {size_mb:.2f} MB | Modified: {mod_time}")

## 4. Visualize Results with Metadata

In [None]:
def visualize_search_results(query_path, results, top_k=5):
    """
    Visualize query image and top search results with metadata.

    Draws a single row of ``top_k + 1`` panels: the query image first,
    followed by up to ``top_k`` results. Each result panel is titled with
    its rank, similarity score (3 decimals) and file size in MB.

    Args:
        query_path: Path of the query image; opened with ``Image.open``.
        results: Sequence of result dicts. Every displayed entry must
            provide the keys 'path', 'rank', 'score' and 'file_size'
            (file size in bytes).
        top_k: Maximum number of results to draw. When fewer results are
            available, the remaining panels are left blank. ``top_k=0``
            shows only the query image.
    """
    # squeeze=False keeps the returned axes container 2-D for any panel
    # count; without it a single panel (top_k == 0) comes back as a bare
    # Axes object and indexing it fails.
    fig, axes_grid = plt.subplots(
        1, top_k + 1, figsize=(3 * (top_k + 1), 4), squeeze=False
    )
    axes = axes_grid[0]

    # Show query; the context manager releases the file handle once the
    # image has been handed to imshow.
    with Image.open(query_path) as query_img:
        axes[0].imshow(query_img)
    axes[0].set_title("Query", fontsize=14, fontweight='bold')

    # Show results with metadata
    for ax, result in zip(axes[1:], results[:top_k]):
        with Image.open(result['path']) as img:
            ax.imshow(img)

        # Format metadata
        size_mb = result['file_size'] / (1024 * 1024)
        title = f"#{result['rank']}\n{result['score']:.3f}\n{size_mb:.1f}MB"
        ax.set_title(title, fontsize=10)

    # Hide ticks and frames on every panel, including panels left empty
    # because fewer than top_k results were returned.
    for ax in axes:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Plot the query next to its five best matches; skipped when no images exist,
# because the search cell then never defined the query or the results
if len(image_paths) != 0:
    visualize_search_results(query_path=query_image_path, results=results, top_k=5)

## 5. Try Different Query Images

In [None]:
# Repeat the search with another query image (needs at least two images)
if len(image_paths) >= 2:
    query_idx = 1  # Change this to try different images
    query_path = image_paths[query_idx]

    print(f"Searching with: {query_path.name}")
    results = engine.search(query_path, top_k=10)

    visualize_search_results(query_path=query_path, results=results, top_k=5)

## 6. Experiment with Different CLIP Models

You can try different CLIP models for different speed/accuracy tradeoffs.

In [None]:
# Second engine using the CLIP base model (faster, smaller), with its own
# collection name and database file
base_settings = {
    "model_name": "openai/clip-vit-base-patch32",
    "collection_name": "image_search_base",
    "db_path": "../data/index/milvus_base.db",
}
engine_base = ImageSearchEngine(**base_settings)

print("Building index with CLIP-Base...")
if len(image_paths) != 0:
    engine_base.build_index(image_dir, batch_size=16)
    print("✓ Index built with CLIP-Base")

## 7. Compare Results Between Models

In [None]:
if len(image_paths) > 0:
    query = image_paths[0]

    # Run the same query through both engines
    results_large = engine.search(query, top_k=5)
    results_base = engine_base.search(query, top_k=5)

    # One shared printing loop; each pair is (heading, hits to list under it)
    for heading, hits in (
        ("CLIP-Large Results:", results_large),
        ("\nCLIP-Base Results:", results_base),
    ):
        print(heading)
        for r in hits:
            print(f"  {r['rank']}. {r['filename']} - {r['score']:.4f}")

## 8. Explore Image Metadata

Milvus stores rich metadata alongside embeddings.

In [None]:
if len(image_paths) > 0:
    # Query with the first image and keep only the three best hits
    query = image_paths[0]
    results = engine.search(query, top_k=3)

    print("Detailed metadata for top 3 results:\n")
    for result in results:
        print(f"Rank {result['rank']}:")
        print(f"  Filename: {result['filename']}")
        print(f"  Path: {result['path']}")
        print(f"  Score: {result['score']:.4f}")
        size_bytes = result['file_size']
        print(f"  Size: {size_bytes:,} bytes ({size_bytes / (1024 * 1024):.2f} MB)")
        # Both timestamp fields are passed through datetime.fromtimestamp
        for label, key in (("Created", "created_time"), ("Modified", "modified_time")):
            print(f"  {label}: {datetime.fromtimestamp(result[key])}")
        print()

## 9. Get Collection Statistics

In [None]:
# Fetch the engine's statistics and print them as an indented list;
# each pair is (printed label, key in the stats dict)
stats = engine.get_stats()

print("Collection Statistics:")
for label, key in (
    ("Name", "name"),
    ("Total images", "num_entities"),
    ("Model", "model_name"),
    ("Embedding dimension", "feature_dim"),
    ("Database exists", "exists"),
):
    print(f"  {label}: {stats[key]}")