# Lucide Icon Loader - Vector Search Indexer

This notebook fetches Lucide icon metadata, generates semantic embeddings, and stores them in Redis for vector search.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/itay-ct/IconLoader/blob/main/IconLoader.ipynb)

## Workflow
0. **Configure** - Enter Redis connection details
1. **Setup** - Install dependencies and connect to Redis
2. **Check Existing Icons** - See what's already indexed
3. **How It Works** - See a demo of the embedding process
4. **Update Icons (Optional)** - Upload new icons.txt to re-index
5. **Configure Tests** - Define test sentences with expected results
6. **Run Tests** - Execute tests and see summary report

## Step 0: Configure - Redis Connection Details

In [None]:
# Prompt for Redis connection details and derive the index name / key prefix.
#
# Produces the globals used by later cells:
#   REDIS_URL  - full redis:// connection URL
#   INDEX_NAME - name of the search index
#   KEY_PREFIX - prefix shared by every icon hash key
from getpass import getpass
from urllib.parse import quote

print("Please provide Redis connection details:\n")
username = input('Username: ').strip()
# getpass keeps the password from being echoed into (and saved with) the
# notebook output, unlike input().
password = getpass('Password: ').strip()
redis_url_port = input('Redis URL:Port, without redis:// prefix (e.g., localhost:6379): ').strip()

# Validate required fields
if not username or not password or not redis_url_port:
    raise ValueError("Username, password, and Redis URL:Port are required!")

# Extract prefix from username if it starts with 'redisboard-'.
# A bare 'redisboard-' (nothing after the dash) is treated as "no prefix"
# so we never build names such as '_lucide_icon_index'.
BOARD_USER_MARKER = 'redisboard-'
prefix = username[len(BOARD_USER_MARKER):] if username.startswith(BOARD_USER_MARKER) else ''
prefix = prefix or None

if prefix:
    INDEX_NAME = f"{prefix}_lucide_icon_index"
    KEY_PREFIX = f"{prefix}:lucide:icon:"
    print(f"\n‚úì Detected prefix '{prefix}' from username")
else:
    INDEX_NAME = "lucide_icon_index"
    KEY_PREFIX = "lucide:icon:"

# Build Redis URL. Credentials are percent-encoded so characters such as
# '@', ':' or '/' in the username/password cannot corrupt the URL structure.
REDIS_URL = f"redis://{quote(username, safe='')}:{quote(password, safe='')}@{redis_url_port}"

print(f"‚úì Index name: {INDEX_NAME}")
print(f"‚úì Key prefix: {KEY_PREFIX}")
print("\n‚úì Configuration complete! Continue to Step 1.")

## Step 1: Setup - Install Dependencies & Connect to Redis

In [None]:
# Install dependencies
!pip install -q sentence-transformers redisvl redis requests numpy

import os
import json
import requests
import redis
import numpy as np
import warnings
from typing import List, Dict, Tuple
from sentence_transformers import SentenceTransformer
from redisvl.index import SearchIndex
from redisvl.query import VectorQuery

# Suppress HuggingFace token warning
warnings.filterwarnings('ignore', message='.*HF_TOKEN.*')

# Configuration
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Vector size declared in the index schema; must match the embedding model's output size.
EMBEDDING_DIM = 384
LUCIDE_RAW_BASE = "https://raw.githubusercontent.com/lucide-icons/lucide/main/icons/"

# Connect to Redis (REDIS_URL comes from Step 0)
print("Connecting to Redis...")
redis_client = redis.from_url(REDIS_URL)
# Ping right away so connection problems surface in this cell rather than
# on the first index operation.
redis_client.ping()
print("‚úì Connected to Redis")

# Index schema: one hash per icon under KEY_PREFIX with a tag field, a text
# field and a float32 vector field compared by cosine distance.
index_config = {
    "index": {
        "name": INDEX_NAME,
        "prefix": KEY_PREFIX,
        "storage_type": "hash",
    },
    "fields": [
        {"name": "name", "type": "tag"},
        {"name": "description", "type": "text"},
        {
            "name": "embedding",
            "type": "vector",
            "attrs": {
                "dims": EMBEDDING_DIM,
                "algorithm": "flat",
                "distance_metric": "cosine",
                "datatype": "float32",
            },
        },
    ],
}

# Hand the client to the constructor instead of calling the deprecated
# index.set_client(); the notebook installs an unpinned redisvl.
index = SearchIndex.from_dict(index_config, redis_client=redis_client)

# Create the index only when it does not exist yet, so re-running this cell
# keeps whatever is already indexed.
if not index.exists():
    index.create()
    print(f"‚úì Created index '{INDEX_NAME}'")
else:
    print(f"‚úì Using existing index '{INDEX_NAME}'")

print("\n‚úì Setup complete!")

## Step 2: Check Existing Icons

See what icons are already indexed in Redis:

In [None]:
# List the icons already stored under KEY_PREFIX.
#
# SCAN is used instead of KEYS (ACL-friendly). SCAN is allowed to return the
# same key more than once during a full iteration, so the keys are collected
# into a set first; otherwise the reported list and count could be inflated.
icon_keys = set(redis_client.scan_iter(match=f"{KEY_PREFIX}*", count=100))

existing_icons = []
for key in icon_keys:
    icon_data = redis_client.hget(key, 'name')
    if icon_data:
        # The client may return bytes or str depending on how it was created.
        icon_name = icon_data.decode('utf-8') if isinstance(icon_data, bytes) else icon_data
        existing_icons.append(icon_name)

if existing_icons:
    existing_icons.sort()
    print(f"Found {len(existing_icons)} icons already indexed in Redis:\n")

    # Show first 30 icons (5 per line)
    display_count = min(30, len(existing_icons))
    for i in range(0, display_count, 5):
        print("  " + ", ".join(existing_icons[i:i+5]))

    if len(existing_icons) > display_count:
        print(f"  ... and {len(existing_icons) - display_count} more")

    print("\n‚úì You can skip to Step 5 to configure tests with existing icons")
    print("‚úì Or continue to Step 3 to see how embedding works")
    print("‚úì Or jump to Step 4 to update/replace icons")
else:
    print("‚ö† No icons found in Redis")
    print("‚úì Continue to Step 3 to see how embedding works")
    print("‚úì Then run Step 4 to upload and index icons")

## Step 3: How It Works - Embedding Demo

Let's see how an icon is processed and stored in Redis:

In [None]:
import json
import warnings

import numpy as np
import requests
from sentence_transformers import SentenceTransformer

# Suppress HuggingFace token warning
warnings.filterwarnings('ignore', message='.*HF_TOKEN.*')

# Walk through the embedding pipeline for a single icon, printing every
# intermediate value. Nothing is written to Redis by this cell.

# Example: Process the 'beer' icon (change the slug to demo another icon)
demo_icon = "beer"

print("=" * 80)
print(f"DEMO: How '{demo_icon}' icon is processed and stored")
print("=" * 80)

# Step 1: Fetch metadata from Lucide GitHub
print(f"\n1Ô∏è‚É£ Fetching metadata from Lucide GitHub...")
url = f"https://raw.githubusercontent.com/lucide-icons/lucide/main/icons/{demo_icon}.json"
response = requests.get(url, timeout=10)
# Fail with a clear HTTP error (e.g. 404 for an unknown slug) instead of a
# JSON decoding error on the error page body.
response.raise_for_status()
metadata = response.json()

print(f"\nüìÑ Icon JSON from GitHub:")
print(json.dumps(metadata, indent=2))

# Step 2: Build description string
print(f"\n2Ô∏è‚É£ Building description string for embedding...")
tags = metadata.get('tags', [])
# Derive the component name from the slug ("beer" -> "Beer",
# "arrow-up" -> "ArrowUp") so the demo stays correct when demo_icon changes.
component_name = ''.join(part.capitalize() for part in demo_icon.split('-'))
tag_str = ", ".join(tags) if tags else "icon"
description = f"{component_name} - {demo_icon}; {tag_str}"

print(f"\nüìù Description string (what gets embedded):")
print(f"   \"{description}\"")

# Step 3: Generate embedding vector
print(f"\n3Ô∏è‚É£ Generating 384-dimensional embedding vector...")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embedding_vector = model.encode(description, show_progress_bar=False).tolist()

# Number of leading dimensions shown; the remainder is computed from the
# actual vector length rather than hard-coded.
preview_dims = 10
remaining_dims = max(len(embedding_vector) - preview_dims, 0)

print(f"\nüî¢ Embedding vector (first 10 dimensions):")
print(f"   {embedding_vector[:preview_dims]}")
print(f"   ... ({remaining_dims} more dimensions)")
print(f"\n   Total dimensions: {len(embedding_vector)}")
print(f"   Vector type: {type(embedding_vector[0]).__name__} (floating point numbers)")

# Step 4: Convert to bytes for Redis storage
print(f"\n4Ô∏è‚É£ Converting to bytes for Redis storage...")
embedding_bytes = np.array(embedding_vector, dtype=np.float32).tobytes()
print(f"   Byte size: {len(embedding_bytes)} bytes")

# Step 5: What gets stored in Redis
print(f"\n5Ô∏è‚É£ What gets stored in Redis:")
print(f"\n   Redis Key: lucide:icon:{demo_icon}")
print(f"   Redis Hash Fields:")
print(f"     - name: \"{demo_icon}\"")
print(f"     - description: \"{description}\"")
print(f"     - embedding: [binary vector data, {len(embedding_bytes)} bytes]")

print(f"\nüí° How vector search works:")
print(f"   1. User query: \"Found 5 places offering a relaxing drink\"")
print(f"   2. Query is converted to a 384-dimensional vector (same process)")
print(f"   3. Redis compares query vector with all icon vectors using cosine similarity")
print(f"   4. Icons with most similar vectors are returned (e.g., 'beer', 'wine', 'coffee')")

print(f"\n" + "=" * 80)
print(f"‚úì Demo complete! Now you understand how icons are embedded and stored.")
print("=" * 80)

## Step 4: Update Icons (Optional - Skip if using existing icons)

⚠️ **Only run this cell if you want to update/replace the indexed icons.**

You can either:
- Upload your own `icons.txt` file, OR
- Press Cancel to use the default icon list from GitHub

In [None]:
from google.colab import files

# Helper functions
def slug_to_component_name(slug: str) -> str:
    """Turn a dash-separated icon slug into a component-style name.

    Every dash-delimited piece goes through ``str.capitalize`` and the pieces
    are concatenated without a separator, e.g. ``"arrow-up"`` -> ``"ArrowUp"``.
    """
    pieces = slug.split('-')
    return ''.join(map(str.capitalize, pieces))

def fetch_icon_metadata(slug: str) -> dict:
    """Download and decode the Lucide metadata JSON for one icon slug.

    The file is requested from ``LUCIDE_RAW_BASE`` as ``<slug>.json`` with a
    10 second timeout. ``raise_for_status`` is called before decoding, so an
    HTTP error response raises instead of being parsed.
    """
    metadata_url = "{}{}.json".format(LUCIDE_RAW_BASE, slug)
    response = requests.get(metadata_url, timeout=10)
    response.raise_for_status()
    metadata = response.json()
    return metadata

def build_description(name: str, slug: str, tags: List[str]) -> str:
    """Compose the text that is embedded for an icon.

    The result has the form ``"<name> - <slug>; <tags>"`` where ``<tags>`` is
    the comma-separated tag list, or the literal word ``icon`` when ``tags``
    is empty.
    """
    if tags:
        joined_tags = ", ".join(tags)
    else:
        joined_tags = "icon"
    return "{} - {}; {}".format(name, slug, joined_tags)

def parse_icon_slugs(text: str) -> List[str]:
    """Extract icon slugs from the contents of an icons.txt file.

    One slug per line. Surrounding whitespace is removed first, then blank
    lines and lines starting with ``#`` are dropped, so indented comments
    are ignored as well.
    """
    slugs = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if line and not line.startswith('#'):
            slugs.append(line)
    return slugs


# Upload icons.txt file (or use default)
print("Upload your icons.txt file (or press Cancel to use default from GitHub):")
uploaded = files.upload()

# Parse uploaded file or fetch default.
# icon_slugs stays None when no list could be obtained; the indexing code
# below is skipped in that case.
icon_slugs = None
if uploaded:
    # Get first uploaded file
    filename = next(iter(uploaded))
    # 'utf-8-sig' drops a leading byte-order mark so it cannot end up glued
    # to the first slug.
    content = uploaded[filename].decode('utf-8-sig')
    icon_slugs = parse_icon_slugs(content)
    print(f"\n‚úì Loaded {len(icon_slugs)} icons from uploaded file")
else:
    # Fetch default icons.txt from GitHub
    print("\n‚ö† No file uploaded. Fetching default icons.txt from GitHub...")
    try:
        default_url = "https://raw.githubusercontent.com/itay-ct/IconLoader/refs/heads/main/icons.txt"
        response = requests.get(default_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Only network/HTTP failures are expected here; anything else is a
        # real bug and should not be hidden.
        print(f"‚úó Error fetching default icons: {e}")
        print("‚úì Skipping icon update. Continue to Step 5 to configure tests")
    else:
        icon_slugs = parse_icon_slugs(response.text)
        print(f"‚úì Loaded {len(icon_slugs)} icons from default list")

# Re-index the icons listed in icon_slugs.
#
# New documents are built BEFORE anything is deleted, so a network outage or
# a bad slug list cannot leave Redis with an empty index.
if icon_slugs:
    # Load embedding model
    print(f"\nLoading embedding model: {EMBEDDING_MODEL_NAME}...")
    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    print("‚úì Model loaded\n")

    # Process icons
    docs = []

    for i, slug in enumerate(icon_slugs, 1):
        print(f"[{i}/{len(icon_slugs)}] Processing '{slug}'...", end=" ")
        try:
            # Fetch metadata and build description
            meta = fetch_icon_metadata(slug)
            tags = meta.get("tags", [])
            component_name = slug_to_component_name(slug)
            description = build_description(component_name, slug, tags)

            # Generate embedding and convert to bytes
            embedding = model.encode(description, show_progress_bar=False)
            embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()

            docs.append({"name": slug, "description": description, "embedding": embedding_bytes})
            print("‚úì")
        except Exception as e:
            # Deliberately best-effort: one bad icon is reported and skipped
            # so the rest of the list still gets indexed.
            print(f"‚úó {e}")

    if docs:
        # Clean existing icons using SCAN (ACL-friendly)
        print("\nCleaning existing icons from Redis...")
        deleted_count = 0
        cursor = 0
        while True:
            cursor, keys = redis_client.scan(cursor, match=f"{KEY_PREFIX}*", count=100)
            if keys:
                # DEL returns how many keys were actually removed; SCAN may
                # yield a key twice, so len(keys) could over-count.
                deleted_count += redis_client.delete(*keys)
            if cursor == 0:
                break
        if deleted_count > 0:
            print(f"‚úì Deleted {deleted_count} existing icons")

        # Index in Redis under explicit KEY_PREFIX + slug keys, so every icon
        # has one predictable key instead of a generated one.
        print(f"\nIndexing {len(docs)} icons into Redis...")
        index.load(docs, keys=[f"{KEY_PREFIX}{doc['name']}" for doc in docs])
        print(f"‚úì Successfully indexed {len(docs)} icons")
    else:
        print("\n‚ö† No icons to index")
        print("Existing icons in Redis were left untouched.")

    print("\n‚úì Icon update complete! Continue to Step 5 to configure tests.")

## Step 5: Configure Test Dataset

Define test sentences with their expected icon results:

In [None]:
# Test dataset: (query, expected_icons)
# Edit this list to add/modify test cases.
# expected_icons is a list - a test passes if the top search result matches
# ANY of the listed icons.
TEST_DATASET = [
    ("Found 5 places offering a relaxing drink for your tour.", ["beer"]),
    ("I found 9 parks to enjoy nature's beauty nearby.", ["trees"]),
    ("There are 3 locations of cultural interest, ready to inspire.", ["building-2", "theater"]),
    ("Found 8 exciting sports and activity locations around.", ["binoculars"]),
    ("Found 17 delicious food spots awaiting your hungry stomach.", ["ice-cream-cone"]),
    ("Discovered 4 historical landmarks worth visiting.", ["landmark"]),
    ("Located 6 shopping centers for your retail therapy.", ["shopping-bag"]),
    ("Found 12 entertainment venues for a fun night out.", ["popcorn", "theater"]),
    ("There are 7 hotels offering comfortable accommodation.", ["bed", "hotel"]),
    ("Spotted 10 scenic viewpoints for amazing photos.", ["camera", "binoculars"]),
]


def preview_query(query: str, width: int = 50) -> str:
    """Return ``query`` shortened for the summary listing.

    Queries of at most ``width`` characters are returned unchanged; longer
    ones are cut to ``width`` characters and get a trailing ``...``.
    """
    if len(query) <= width:
        return query
    return query[:width] + "..."


print(f"‚úì Configured {len(TEST_DATASET)} test cases:\n")
for i, (query, expected_icons) in enumerate(TEST_DATASET, 1):
    # join() covers one, many, and (mis-configured) zero expected icons
    # without indexing into the list.
    expected_str = ", ".join(expected_icons)
    print(f"{i}. \"{preview_query(query)}\" => {expected_str}")

print("\n‚úì Ready to run tests! Continue to Step 6.")

## Step 6: Run Tests & Generate Report

Execute all test cases and generate a summary report:

In [None]:
# Run every TEST_DATASET query against the vector index and print a
# per-test table followed by a summary report.

# Load embedding model if not already loaded (an earlier cell may have
# created `model` already).
try:
    model
except NameError:
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    print("‚úì Model loaded\n")

# Run tests
print("=" * 100)
print("RUNNING VECTOR SEARCH TESTS")
print("=" * 100)
print()
# Print table header
print(f"{'#':<4} {'Status':<8} {'Expected':<20} {'Actual':<20} {'Query':<50}")
print("-" * 100)

passed = 0
total = len(TEST_DATASET)

for i, (query, expected_icons) in enumerate(TEST_DATASET, 1):
    # Encode query and search
    query_embedding = model.encode(query, show_progress_bar=False)
    query_bytes = np.array(query_embedding, dtype=np.float32).tobytes()

    # Only the single nearest icon is requested; the test passes when that
    # icon is one of the expected ones.
    vector_query = VectorQuery(
        vector=query_bytes,
        vector_field_name="embedding",
        return_fields=["name"],
        num_results=1
    )
    results = index.query(vector_query)

    # Check result
    # join() also handles an empty expected list without raising.
    expected_str = ", ".join(expected_icons)
    # Fall back to the placeholder both when nothing was returned and when
    # the hit has no 'name', so the table formatting below never sees None.
    actual_icon = (results[0].get("name") if results else None) or "NO_RESULT"
    success = actual_icon in expected_icons

    if success:
        passed += 1

    # Print result
    status = "‚úì PASS" if success else "‚úó FAIL"
    query_short = query[:47] + '...' if len(query) > 50 else query
    print(f"{i:<4} {status:<8} {expected_str:<20} {actual_icon:<20} {query_short:<50}")

# Summary report
failed = total - passed
# Guard the percentages so an empty TEST_DATASET does not divide by zero.
passed_pct = passed / total * 100 if total else 0.0
failed_pct = failed / total * 100 if total else 0.0
print("\n" + "=" * 100)
print("TEST SUMMARY REPORT")
print("=" * 100)
print(f"\nTotal Tests:  {total}")
print(f"Passed:       {passed} ({passed_pct:.1f}%)")
print(f"Failed:       {failed} ({failed_pct:.1f}%)\n")

if total == 0:
    # Without this branch an empty dataset would report "ALL TESTS PASSED".
    print("No test cases configured - add entries to TEST_DATASET in Step 5")
elif passed == total:
    print("üéâ ALL TESTS PASSED!")
elif passed > 0:
    print("‚ö† PARTIAL SUCCESS - Review failed tests above")
else:
    print("‚ùå ALL TESTS FAILED - Check icon indexing and expected values")

print("=" * 100)