## Lecture 5: Hands-On: Implementing a Vector Database for AI

### Introduction
In this hands-on lecture, we will implement a vector database for an AI application. We'll focus on how to convert unstructured data, such as text or images, into vectors, and store them in a vector database for efficient similarity-based search. By the end of this lecture, you will have a functional vector database that can be used to store, retrieve, and search for similar data points based on their vector representations.

---


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import faiss
import os
import pickle
from typing import List, Dict, Tuple, Any, Optional
import time


class VectorDatabase:
    """
    Small in-memory vector store: transformer embeddings plus a FAISS index.

    Texts are embedded with a pre-trained transformer (mean pooling over the
    last hidden states, ignoring padding) and added to a ``faiss.IndexFlatL2``
    index. Document texts and their metadata are kept in two parallel lists
    whose positions match the row ids of the index.
    """

    def __init__(self, model_name: str = "distilbert-base-uncased"):
        """
        Initialize the vector database with a specific transformer model.

        Args:
            model_name: Name of the pre-trained model to use for embeddings
        """
        self.model_name = model_name
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # The index is created lazily once the embedding dimension is known.
        self.index = None
        self.documents: List[str] = []
        self.metadata: List[Dict[str, Any]] = []

    @staticmethod
    def _mean_pool(last_hidden_state: "torch.Tensor",
                   attention_mask: "torch.Tensor") -> "torch.Tensor":
        """
        Average token vectors per sequence, counting only unmasked positions.

        Args:
            last_hidden_state: Tensor indexed as (sequence, token, feature)
            attention_mask: Tensor indexed as (sequence, token); non-zero
                entries mark the tokens that take part in the average

        Returns:
            Tensor with one pooled vector per sequence
        """
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        # Clamp so a fully masked row yields zeros instead of dividing by zero.
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Padding positions are excluded from the mean so that the embedding of
        a text does not depend on which other texts share its batch.

        Args:
            texts: List of strings to embed

        Returns:
            C-contiguous float32 NumPy array with one row per input text
        """
        # Tokenize and encode the texts
        inputs = self.tokenizer(texts, padding=True, truncation=True,
                                max_length=512, return_tensors="pt")

        # Generate embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)
            hidden = outputs.last_hidden_state
            mask = inputs.get("attention_mask")
            if mask is None:
                # No mask returned by the tokenizer: fall back to a plain mean.
                pooled = hidden.mean(dim=1)
            else:
                pooled = self._mean_pool(hidden, mask)

        # The array is handed straight to FAISS, so normalise dtype and layout.
        return np.ascontiguousarray(pooled.cpu().numpy(), dtype=np.float32)

    def add_documents(self, documents: List[str], metadata: Optional[List[Dict[str, Any]]] = None) -> None:
        """
        Add documents to the database and build or update the index.

        The metadata length is validated before anything is embedded or stored,
        so a mismatch leaves the database unchanged. Metadata dictionaries are
        copied; the caller's objects are not modified.

        Args:
            documents: List of document texts
            metadata: Optional list of metadata dictionaries for each document

        Raises:
            ValueError: If ``metadata`` is given and its length differs from
                the length of ``documents``
        """
        if not documents:
            print("No documents provided")
            return

        # Add metadata if provided, otherwise use empty dictionaries
        if metadata is None:
            metadata = [{} for _ in documents]
        elif len(metadata) != len(documents):
            raise ValueError("Metadata list must match documents list length")

        # Generate embeddings for the documents
        print(f"Generating embeddings for {len(documents)} documents...")
        start_time = time.time()
        embeddings = self.generate_embeddings(documents)
        print(f"Embeddings generated in {time.time() - start_time:.2f} seconds")

        # Create or update the index
        if self.index is None:
            dimension = embeddings.shape[1]
            print(f"Creating new FAISS index with dimension {dimension}")
            self.index = faiss.IndexFlatL2(dimension)

        # Add embeddings to the index
        self.index.add(embeddings)

        # Store the documents and metadata
        start_idx = len(self.documents)
        self.documents.extend(documents)

        # Store a copy of each metadata dict tagged with its document index
        self.metadata.extend(
            {**meta, 'document_idx': start_idx + offset}
            for offset, meta in enumerate(metadata)
        )

        print(f"Added {len(documents)} documents to index. Total documents: {len(self.documents)}")

    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """
        Search for similar documents based on a query string.

        Args:
            query: Query string to search for
            k: Number of results to return

        Returns:
            List of dictionaries containing search results, best match first.
            Each entry has the keys ``rank``, ``document_idx``, ``distance``,
            ``text`` and ``metadata``. Empty when the database has no index or
            no documents, or when ``k`` is not positive.
        """
        if self.index is None or not self.documents or k <= 0:
            return []

        # Generate embedding for the query
        query_embedding = self.generate_embeddings([query])

        # Perform the search
        k = min(k, len(self.documents))  # Ensure k is not larger than number of documents
        distances, indices = self.index.search(query_embedding, k)

        # Format results
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            doc_idx = int(idx)
            # Skip ids that cannot be mapped to a stored document (e.g. the -1
            # filler FAISS uses when it has fewer than k neighbours).
            if doc_idx < 0 or doc_idx >= len(self.documents):
                continue
            results.append({
                'rank': len(results) + 1,
                'document_idx': doc_idx,
                'distance': float(distance),
                'text': self.documents[doc_idx],
                'metadata': self.metadata[doc_idx]
            })

        return results

    def save(self, directory: str) -> None:
        """
        Save the vector database to disk.

        Writes ``index.faiss`` (only when an index exists), ``documents.pkl``,
        ``metadata.pkl`` and ``model_name.txt`` into ``directory``.

        Args:
            directory: Directory to save the database
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(directory, exist_ok=True)

        # Save the index
        index_path = os.path.join(directory, "index.faiss")
        if self.index is not None:
            faiss.write_index(self.index, index_path)
        elif os.path.exists(index_path):
            # Drop an index left by an earlier save so that load() does not
            # pair it with the documents written below.
            os.remove(index_path)

        # Save documents and metadata
        with open(os.path.join(directory, "documents.pkl"), 'wb') as f:
            pickle.dump(self.documents, f)

        with open(os.path.join(directory, "metadata.pkl"), 'wb') as f:
            pickle.dump(self.metadata, f)

        # Save model name
        with open(os.path.join(directory, "model_name.txt"), 'w') as f:
            f.write(self.model_name)

        print(f"Vector database saved to {directory}")

    @classmethod
    def load(cls, directory: str) -> 'VectorDatabase':
        """
        Load a vector database from disk.

        Args:
            directory: Directory containing the saved database

        Returns:
            Loaded VectorDatabase instance
        """
        # Load model name
        with open(os.path.join(directory, "model_name.txt"), 'r') as f:
            model_name = f.read().strip()

        # Create instance with the same model
        db = cls(model_name)

        # Load documents and metadata
        # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
        # file. Only load directories that come from a trusted source.
        with open(os.path.join(directory, "documents.pkl"), 'rb') as f:
            db.documents = pickle.load(f)

        with open(os.path.join(directory, "metadata.pkl"), 'rb') as f:
            db.metadata = pickle.load(f)

        # Load the index
        index_path = os.path.join(directory, "index.faiss")
        if os.path.exists(index_path):
            db.index = faiss.read_index(index_path)

        print(f"Loaded vector database from {directory} with {len(db.documents)} documents")
        return db


def batch_process_documents(documents: List[str], batch_size: int = 8,
                            db: Optional["VectorDatabase"] = None) -> np.ndarray:
    """
    Process a large number of documents in batches to avoid memory issues.

    Args:
        documents: List of document texts
        batch_size: Number of documents to process at once
        db: Optional existing VectorDatabase whose model is used for
            embedding. When omitted, a new VectorDatabase with the default
            model is created for this call.

    Returns:
        2-D NumPy array holding the embeddings of all documents stacked
        row-wise in input order. For an empty ``documents`` list an empty
        array of shape (0, 0) is returned without loading any model.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if not documents:
        # Nothing to embed: np.vstack would fail on an empty list.
        return np.empty((0, 0), dtype=np.float32)

    if db is None:
        db = VectorDatabase()

    total_batches = (len(documents) - 1) // batch_size + 1
    all_embeddings = []

    for batch_number, start in enumerate(range(0, len(documents), batch_size), start=1):
        batch = documents[start:start + batch_size]
        print(f"Processing batch {batch_number}/{total_batches}")
        all_embeddings.append(db.generate_embeddings(batch))

    return np.vstack(all_embeddings)


def main():
    """
    Demonstrate the VectorDatabase class end to end.

    Indexes ten short quotations with source/category metadata, runs three
    sample queries and prints the results, then saves the database, loads it
    back from disk and runs one more query against the loaded copy.
    """
    # Each entry is (text, source, category).
    corpus = [
        ("The quick brown fox jumps over the lazy dog.", "proverb", "animals"),
        ("A journey of a thousand miles begins with a single step.", "Lao Tzu", "philosophy"),
        ("To be or not to be, that is the question.", "Shakespeare", "literature"),
        ("All that glitters is not gold.", "proverb", "wisdom"),
        ("The early bird catches the worm.", "proverb", "animals"),
        ("Actions speak louder than words.", "proverb", "behavior"),
        ("Don't judge a book by its cover.", "proverb", "wisdom"),
        ("The pen is mightier than the sword.", "Edward Bulwer-Lytton", "literature"),
        ("Fortune favors the bold.", "proverb", "courage"),
        ("Knowledge is power.", "Francis Bacon", "wisdom"),
    ]

    # Build a fresh database and index the corpus
    db = VectorDatabase()
    texts = [text for text, _, _ in corpus]
    tags = [{"source": source, "category": category} for _, source, category in corpus]
    db.add_documents(texts, tags)

    # Run the sample queries and show each hit with its metadata
    sample_queries = (
        "What is the meaning of life?",
        "Tell me about courage and boldness",
        "I need some wisdom about appearances",
    )
    for query in sample_queries:
        print(f"\nQuery: {query}")
        hits = db.search(query)

        print("Search Results:")
        for hit in hits:
            print(f"{hit['rank']}. (Distance: {hit['distance']:.4f}) '{hit['text']}'")
            info = hit['metadata']
            source = info.get('source', 'Unknown')
            category = info.get('category', 'Uncategorized')
            print(f"   Source: {source}, Category: {category}")

    # Round-trip the database through disk
    save_dir = "vector_db_example"
    db.save(save_dir)
    loaded_db = VectorDatabase.load(save_dir)

    # Confirm the loaded copy still answers queries
    print("\nTesting loaded database:")
    hits = loaded_db.search("Tell me about wisdom")

    print("Search Results:")
    for hit in hits:
        print(f"{hit['rank']}. (Distance: {hit['distance']:.4f}) '{hit['text']}'")


if __name__ == "__main__":
    main()

### Output:
```
Loading model: distilbert-base-uncased
Generating embeddings for 10 documents...
Embeddings generated in 0.44 seconds
Creating new FAISS index with dimension 768
Added 10 documents to index. Total documents: 10

Query: What is the meaning of life?
Search Results:
1. (Distance: 28.9064) 'Knowledge is power.'
   Source: Francis Bacon, Category: wisdom
2. (Distance: 29.8704) 'To be or not to be, that is the question.'
   Source: Shakespeare, Category: literature
3. (Distance: 35.8834) 'Actions speak louder than words.'
   Source: proverb, Category: behavior
4. (Distance: 37.3288) 'All that glitters is not gold.'
   Source: proverb, Category: wisdom
5. (Distance: 39.9361) 'A journey of a thousand miles begins with a single step.'
   Source: Lao Tzu, Category: philosophy

Query: Tell me about courage and boldness
Search Results:
1. (Distance: 29.0239) 'Knowledge is power.'
   Source: Francis Bacon, Category: wisdom
2. (Distance: 29.1098) 'Actions speak louder than words.'
   Source: proverb, Category: behavior
3. (Distance: 32.2485) 'Fortune favors the bold.'
   Source: proverb, Category: courage
4. (Distance: 33.7360) 'The pen is mightier than the sword.'
   Source: Edward Bulwer-Lytton, Category: literature
5. (Distance: 34.9861) 'Don't judge a book by its cover.'
   Source: proverb, Category: wisdom

Query: I need some wisdom about appearances
Search Results:
1. (Distance: 26.9088) 'Don't judge a book by its cover.'
   Source: proverb, Category: wisdom
2. (Distance: 28.1751) 'Actions speak louder than words.'
   Source: proverb, Category: behavior
3. (Distance: 29.9099) 'Knowledge is power.'
   Source: Francis Bacon, Category: wisdom
4. (Distance: 31.9937) 'Fortune favors the bold.'
   Source: proverb, Category: courage
5. (Distance: 34.1386) 'The pen is mightier than the sword.'
   Source: Edward Bulwer-Lytton, Category: literature
Vector database saved to vector_db_example
Loading model: distilbert-base-uncased
Loaded vector database from vector_db_example with 10 documents

Testing loaded database:
Search Results:
1. (Distance: 30.5364) 'Knowledge is power.'
2. (Distance: 32.7921) 'Don't judge a book by its cover.'
3. (Distance: 33.9872) 'Actions speak louder than words.'
4. (Distance: 35.6948) 'Fortune favors the bold.'
5. (Distance: 35.7062) 'All that glitters is not gold.'

```