# Step 16: In-Memory Vector Store Cache

This notebook demonstrates caching LLM responses in an in-memory vector store, using semantic search to find previously answered prompts.

In [None]:
import asyncio
import time
from collections.abc import Awaitable, Callable
from dataclasses import dataclass, field
from typing import Annotated
from uuid import uuid4

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase
from semantic_kernel.connectors.ai.open_ai import (
    AzureChatCompletion,
    AzureTextEmbedding,
)
from semantic_kernel.connectors.in_memory import InMemoryStore
from semantic_kernel.data.vector import (
    VectorStoreField,
    vectorstoremodel,
    FieldTypes,
    VectorSearchOptions,
    VectorStore,
    VectorStoreCollection,
)
from semantic_kernel.filters import (
    FilterTypes,
    FunctionInvocationContext,
    PromptRenderContext,
)
from semantic_kernel.functions import FunctionResult

## Define Cache Data Model

In [None]:
def _new_record_id() -> str:
    """Return a freshly generated UUID4 rendered as a string."""
    return str(uuid4())


@vectorstoremodel
@dataclass
class CacheRecord:
    """A single prompt/result pair, declared as a vector store data model.

    ``prompt`` and ``result`` are required; ``prompt_embedding`` defaults to
    an empty list and ``id`` defaults to a new random UUID string, so a
    record can be built from just the two text values.
    """

    # Prompt text, declared as an indexed field.
    prompt: Annotated[str, VectorStoreField(is_indexed=True)]

    # Result text, declared as a full-text-indexed field.
    result: Annotated[str, VectorStoreField(is_full_text_indexed=True)]

    # Vector field declared with 1536 dimensions; starts out empty.
    prompt_embedding: Annotated[
        list[float],
        VectorStoreField(field_type=FieldTypes.VECTOR, dimensions=1536),
    ] = field(default_factory=list)

    # Key field; every instance gets its own generated id unless one is given.
    id: Annotated[
        str,
        VectorStoreField(field_type=FieldTypes.KEY),
    ] = field(default_factory=_new_record_id)

The cells below only set up the kernel and run a single, uncached query. Refer to [steps/16.py](../steps/16.py) for the complete implementation, which includes:
- Prompt cache filter
- Function invocation cache
- Vector-based semantic caching
- Performance optimization

In [None]:
# Initialize kernel and services.
# One kernel gets two services: a chat-completion service registered under
# the service id "default" and a text-embedding service registered under
# the service id "embedder".
# NOTE(review): no endpoint, deployment or key is passed here, so the
# connectors presumably read their settings from the environment — confirm
# before running.
kernel = Kernel()
chat = AzureChatCompletion(service_id="default")
embedding = AzureTextEmbedding(service_id="embedder")
kernel.add_service(chat)
kernel.add_service(embedding)

In [None]:
# Create the in-memory vector store.
# NOTE(review): none of the cells shown here create a collection in this
# store or read from it; the caching logic that would use it lives elsewhere.
vector_store = InMemoryStore()
print("Vector store initialized")

In [None]:
# Example query
async def execute_async(title: str, prompt: str):
    """Run ``prompt`` through the module-level ``kernel`` and print the timing.

    Prints ``"<title>: <prompt>"`` before the call and the elapsed time in
    seconds (three decimals) after it.

    Args:
        title: Label printed in front of the prompt.
        prompt: Prompt text handed to ``kernel.invoke_prompt``.

    Returns:
        Whatever ``kernel.invoke_prompt`` returns for ``prompt``.
    """
    print(f"{title}: {prompt}")
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed (or go negative) if the wall clock is
    # adjusted while the request is in flight, as time.time() could be.
    start = time.perf_counter()
    result = await kernel.invoke_prompt(prompt)
    elapsed = time.perf_counter() - start
    print(f"\tElapsed Time: {elapsed:.3f}")
    return result

In [None]:
# Test query: send one prompt through execute_async and print what comes back.
# The top-level `await` is only valid in a notebook cell (or another
# environment that already runs an event loop); in a plain script this call
# would have to be wrapped, e.g. with asyncio.run(...).
result = await execute_async("Test", "What's the tallest building in New York?")
print(f"Result: {result}")