heshinth · heshinth · Apr 24, 2026 · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/.env.sample b/.env.sample
@@ -6,10 +6,14 @@ HF_TOKEN=
 # (Default: BAAI/bge-small-en-v1.5)
 DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
 
+# Maximum number of embedding models to keep in memory.
+# Least recently used model is evicted when the limit is exceeded.
+MODEL_CACHE_LIMIT=2
+
 #Number of threads to use for embedding generation. 
 #Adjust based on your CPU capabilities.
-EMBEDDING_THREADS=8
+# EMBEDDING_THREADS=8
 
 # Batch size for embedding generation. 
 #Adjust based on your system's memory and performance.
-BATCH_SIZE=256
+# BATCH_SIZE=256
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -43,8 +43,31 @@ jobs:
                   push: true
                   tags: ${{ steps.meta.outputs.tags }}
                   labels: ${{ steps.meta.outputs.labels }}
-                  cache-from: type=gha
-                  cache-to: type=gha,mode=max
+                  cache-from: type=gha,scope=build-cpu
+                  cache-to: type=gha,mode=max,scope=build-cpu
+
+            - name: Extract metadata for Docker (GPU)
+              id: meta_gpu
+              uses: docker/metadata-action@v6
+              with:
+                  images: ${{ secrets.DOCKERHUB_USERNAME }}/localembed
+                  flavor: |
+                      latest=false
+                  tags: |
+                      type=raw,value=latest-gpu
+                      type=semver,pattern={{version}}-gpu
+                      type=semver,pattern={{major}}.{{minor}}-gpu
+
+            - name: Build and push Docker image (GPU)
+              uses: docker/build-push-action@v7
+              with:
+                  context: .
+                  file: ./Dockerfile.gpu
+                  push: true
+                  tags: ${{ steps.meta_gpu.outputs.tags }}
+                  labels: ${{ steps.meta_gpu.outputs.labels }}
+                  cache-from: type=gha,scope=build-gpu
+                  cache-to: type=gha,mode=max,scope=build-gpu
 
     draft-release:
         runs-on: ubuntu-latest
@@ -53,6 +76,10 @@ jobs:
             - name: Checkout code
               uses: actions/checkout@v4
 
+            - name: Normalize semver tag for image examples
+              id: version
+              run: echo "semver=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT"
+
             - name: Create Draft Release
               id: create_release
               uses: softprops/action-gh-release@v2
@@ -64,12 +91,18 @@ jobs:
                   body: |
                       For full documentation, supported models, and usage examples, please check the [README](https://github.com/heshinth/LocalEmbed/blob/main/README.md).
 
-                      **Quick Start:**
+                      **CPU Quick Start:**
+                      ```bash
+                      docker run -d --name localembed -p 8000:8000 heshinth/localembed:${{ steps.version.outputs.semver }}
+                      ```
+
+                      **GPU Quick Start:**
                       ```bash
-                      docker run -d --name localembed -p 8000:8000 heshinth/localembed:${{ github.ref_name }}
+                      docker run -d --gpus all --name localembed-gpu -p 8000:8000 heshinth/localembed:${{ steps.version.outputs.semver }}-gpu
                       ```
 
-                      Or for the latest version:
+                      **Latest tags:**
                       ```bash
-                      docker run -d --pull=always --name localembed --env-file .env -p 8000:8000 heshinth/localembed:latest
+                      heshinth/localembed:latest
+                      heshinth/localembed:latest-gpu
                       ```
diff --git a/Dockerfile b/Dockerfile
@@ -9,7 +9,7 @@ WORKDIR /app
 COPY pyproject.toml uv.lock README.md ./
 
 # We generate a standalone virtual environment
-RUN uv sync --locked --no-cache --no-dev --no-install-project
+RUN uv sync --locked --no-dev --no-install-project --extra cpu
 
 # --- Stage 2: Final Runtime Image ---
 FROM python:3.12-slim-trixie

diff --git a/Dockerfile.gpu b/Dockerfile.gpu
@@ -0,0 +1,37 @@
+# We use the lean NVIDIA CUDA runtime as the base so your GPU actually works
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+
+# Python environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# Magic trick: Copy the 'uv' binary directly from Astral's official image
+COPY --from=ghcr.io/astral-sh/uv:0.4.30 /uv /uvx /bin/
+
+WORKDIR /app
+
+# Copy your dependency files first (for Docker layer caching)
+COPY pyproject.toml uv.lock README.md .python-version ./
+
+# Create the virtual environment; uv downloads the Python version requested by .python-version / pyproject.toml if needed
+RUN uv venv
+
+# Install dependencies into a standalone virtual environment at /app/.venv
+RUN uv sync --locked --no-dev --no-install-project --extra gpu
+
+# Copy your application code
+COPY app ./app
+
+# Run a final sync to install your actual project code
+RUN uv sync --locked --no-dev --extra gpu
+
+# Signal the app to use GPU logic
+ENV USE_GPU=True
+
+EXPOSE 8000
+
+# Put the virtual environment on the PATH so it works natively
+ENV PATH="/app/.venv/bin:$PATH"
+
+# Run Uvicorn directly!
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
@@ -3,6 +3,8 @@
 ![GitHub License](https://img.shields.io/github/license/heshinth/LocalEmbed?cacheSeconds=20)
 ![Python](https://img.shields.io/badge/python-3.12-blue.svg?logo=python)
 ![Docker Image Version](https://img.shields.io/docker/v/heshinth/localembed?logo=docker)
+![Docker Pulls](https://img.shields.io/docker/pulls/heshinth/localembed?logo=docker)
+
 
 A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint. 
 
@@ -14,13 +16,15 @@ Built with FastAPI and `fastembed`, LocalEmbed is optimized for running local do
 * **Privacy First:** 100% local execution. No data ever leaves your network.
 * **Zero-Latency Starts:** Automatically pre-loads your default model into memory on server boot.
 * **Container-Native:** Multi-stage Docker build utilizing `uv` for a minimal, highly optimized runtime footprint.
+* **CPU + GPU Ready:** Published Docker images for both CPU (`latest`) and NVIDIA GPU (`latest-gpu`) deployments.
 
 ---
 
 ## Getting Started
 
 ### Prerequisites
 - **Docker** (Recommended)
+- **For GPU deployment:** NVIDIA GPU + drivers + NVIDIA Container Toolkit
 - Python 3.12+ (for local development)
 
 ### Configuration
@@ -33,17 +37,32 @@ LocalEmbed uses optional environment variables for configuration. Create a `.env
    ```
 2. Open the `.env` file and set your desired configurations (like `DEFAULT_EMBEDDING_MODEL` or `HF_TOKEN`).
 
+Environment variables:
+
+- `DEFAULT_EMBEDDING_MODEL`: model to preload on startup
+- `HF_TOKEN`: optional, useful to avoid model download rate limits
+- `MODEL_CACHE_LIMIT`: max number of models kept in memory (LRU eviction)
+- `EMBEDDING_THREADS`: CPU threads for embedding computation
+- `BATCH_SIZE`: number of inputs processed per batch
+- `USE_GPU`: set `true` to request the CUDA provider in local/non-GPU-image runs. This requires the GPU extra to be installed (`uv sync --extra gpu` or `fastembed-gpu`) and a working CUDA-capable environment; otherwise model initialization may fail.
+
 ### Deployment (Docker)
 
 The easiest and recommended way to run LocalEmbed is using the pre-built Docker image from Docker Hub.
 
-#### Option 1: Docker CLI
+#### Option 1: Docker CLI (CPU)
 
 ```bash
 docker run -d --pull=always --name localembed --env-file .env -p 8000:8000 heshinth/localembed:latest
 ```
 
-#### Option 2: Using Docker Compose
+#### Option 2: Docker CLI (GPU)
+
+```bash
+docker run -d --pull=always --gpus all --name localembed-gpu --env-file .env -p 8000:8000 heshinth/localembed:latest-gpu
+```
+
+#### Option 3: Docker Compose (CPU)
 
 The compose file includes environment variables directly within it.
 
@@ -55,23 +74,53 @@ You can edit the file to configure it, then simply run:
 docker compose up -d
 ```
 
-**The API will be available at**: `http://localhost:8000`.
+#### Option 4: Docker Compose (GPU)
+
+Download the `docker-compose.gpu.yml` file from [here](./docker-compose.gpu.yml), then run:
+
+```bash
+docker compose -f docker-compose.gpu.yml up -d
+```
+
+### Docker Tag Scheme
+
+For a release tag like `v0.1.3`, published image tags are:
+
+- CPU: `latest`, `0.1.3`, `0.1`
+- GPU: `latest-gpu`, `0.1.3-gpu`, `0.1-gpu`
+
+**The API will be available at**: `http://localhost:8000/v1`.
 
 ### Local Development
 
 If you want to run the application natively without Docker:
 
-1. Install the dependencies using `uv` (recommended):
+1. Install dependencies for CPU mode:
 
    ```bash
-   uv sync
+   uv sync --extra cpu
    ```
 
 2. Run the FastAPI development server:
+
    ```bash
    fastapi dev app/main.py
    ```
 
+For local GPU mode:
+
+1. Install dependencies for GPU mode:
+
+   ```bash
+   uv sync --extra gpu
+   ```
+
+2. Start with GPU provider enabled:
+
+   ```bash
+   USE_GPU=true fastapi dev app/main.py
+   ```
+
 ## API Endpoints
 
 - `GET /v1/health` — Health check

diff --git a/app/config.py b/app/config.py
@@ -9,12 +9,18 @@ class Settings(BaseSettings):
         env_file=".env", env_file_encoding="utf-8", extra="ignore"
     )
 
+    USE_GPU: bool = False
+    """Toggle to enable CUDA Execution Provider logic"""
+
     HF_TOKEN: SecretStr | None = None
     """Hugging Face API token. Optional"""
 
     DEFAULT_EMBEDDING_MODEL: str = "BAAI/bge-small-en-v1.5"
     """The default embedding model to use."""
 
+    MODEL_CACHE_LIMIT: int = 2
+    """Maximum number of models to keep in memory (LRU eviction)."""
+
     EMBEDDING_THREADS: int = 8
     """Number of threads to use for embedding generation. Adjust based on your CPU capabilities."""
 

diff --git a/app/main.py b/app/main.py
@@ -41,7 +41,10 @@ async def lifespan(app: FastAPI):
 
 @app.get("/")
 def read_root():
-    return {"Project": "LocalEmbed", "description": "LocalEmbed"}
+    return {
+        "Project": "LocalEmbed",
+        "description": "A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint.",
+    }
 
 
 app.include_router(router)
diff --git a/app/services/embedder.py b/app/services/embedder.py
@@ -1,21 +1,68 @@
+from collections import OrderedDict
+from threading import RLock
 from typing import Iterable
 from pydantic import BaseModel
 from fastembed import TextEmbedding
 from loguru import logger
 from app.config import settings
 
-model_cache: dict[str, TextEmbedding] = {}
+model_cache: OrderedDict[str, TextEmbedding] = OrderedDict()
+model_cache_lock = RLock()
+
+
+def _evict_lru_models_if_needed() -> None:
+    cache_limit = max(1, settings.MODEL_CACHE_LIMIT)
+    while len(model_cache) > cache_limit:
+        evicted_model_id, _ = model_cache.popitem(last=False)
+        logger.info(
+            f"Evicting least recently used embedding model from memory: {evicted_model_id}"
+        )
 
 
 def get_model(model_id: str) -> TextEmbedding:
     """Fetch the model from cache, or load it if not present."""
 
-    if model_id not in model_cache:
+    with model_cache_lock:
+        cached_model = model_cache.get(model_id)
+        if cached_model is not None:
+            model_cache.move_to_end(model_id)
+            return cached_model
+
         logger.info(f"Loading embedding model into memory: {model_id}")
-        model_cache[model_id] = TextEmbedding(
-            model_id, threads=settings.EMBEDDING_THREADS
+
+        # Configure providers based on GPU setting
+        providers = None
+        if settings.USE_GPU:
+            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+            logger.info(
+                "GPU acceleration preferred (CUDAExecutionProvider) with CPUExecutionProvider fallback enabled."
+            )
+
+        try:
+            model = TextEmbedding(
+                model_id, threads=settings.EMBEDDING_THREADS, providers=providers
+            )
+        except Exception as e:
+            if settings.USE_GPU:
+                logger.warning(
+                    f"Failed to initialize model {model_id} with GPU-enabled providers {providers}: {e}. Retrying with CPUExecutionProvider only."
+                )
+                model = TextEmbedding(
+                    model_id,
+                    threads=settings.EMBEDDING_THREADS,
+                    providers=["CPUExecutionProvider"],
+                )
+            else:
+                raise
+
+        resolved_providers = model.model.model.get_providers()
+        logger.info(
+            f"Model {model_id} loaded successfully with providers: {resolved_providers}"
         )
-    return model_cache[model_id]
+
+        model_cache[model_id] = model
+        _evict_lru_models_if_needed()
+        return model
 
 
 def preload_default_model():
@@ -47,9 +79,11 @@ def embed_text(
 
     try:
         # model.embed natively batches an iterable of documents giving an iterable of numpy arrays
-        vectors = [vec.tolist() for vec in model.embed(texts,batch_size=settings.BATCH_SIZE)]
+        vectors = [
+            vec.tolist() for vec in model.embed(texts, batch_size=settings.BATCH_SIZE)
+        ]
 
-        # token_count returns an iterator of ints (tokens per document), so we sum them
+        # token_count returns a single int for total tokens
         total_tokens = model.token_count(texts)
 
         return EmbeddingResult(

diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml
@@ -0,0 +1,14 @@
+services:
+  localembed:
+    image: heshinth/localembed:latest-gpu
+    pull_policy: always
+    container_name: localembed-gpu
+    ports:
+      - "8000:8000"
+    environment:
+      - DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
+      # - HF_TOKEN=your_token_here
+      # - MODEL_CACHE_LIMIT=2
+      # - EMBEDDING_THREADS=8
+      # - BATCH_SIZE=256
+    gpus: all
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -8,5 +8,6 @@ services:
     environment:
       - DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
       # - HF_TOKEN=your_token_here
+      # - MODEL_CACHE_LIMIT=2
       # - EMBEDDING_THREADS=8
       # - BATCH_SIZE=256
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,6 @@ authors = [
 requires-python = ">=3.12"
 dependencies = [
     "fastapi[standard]<1.0.0,>=0.116.1",
-    "fastembed>=0.8.0",
     "loguru>=0.7.3",
     "pydantic>=2.12.5",
     "pydantic-settings>=2.13.1",
@@ -19,3 +18,10 @@ dependencies = [
 dev = [
     "poethepoet>=0.44.0",
 ]
+
+[project.optional-dependencies]
+cpu = ["fastembed>=0.8.0"]
+gpu = ["fastembed-gpu>=0.8.0"]
+
+[tool.poe.tasks]
+summary = "npx repomix@latest --style markdown"