Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,14 @@ HF_TOKEN=
# (Default: BAAI/bge-small-en-v1.5)
DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5

# Maximum number of embedding models to keep in memory.
# Least recently used model is evicted when the limit is exceeded.
MODEL_CACHE_LIMIT=2

# Number of threads to use for embedding generation.
# Adjust based on your CPU capabilities.
EMBEDDING_THREADS=8
# EMBEDDING_THREADS=8
Comment on lines 13 to +15
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor formatting: the comment #Number of threads... is missing a space after #, and there are trailing spaces on these comment lines. Cleaning this up improves readability in the sample env file.

Copilot uses AI. Check for mistakes.

# Batch size for embedding generation.
# Adjust based on your system's memory and performance.
BATCH_SIZE=256
# BATCH_SIZE=256
45 changes: 39 additions & 6 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,31 @@ jobs:
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
cache-from: type=gha,scope=build-cpu
cache-to: type=gha,mode=max,scope=build-cpu

- name: Extract metadata for Docker (GPU)
id: meta_gpu
uses: docker/metadata-action@v6
with:
images: ${{ secrets.DOCKERHUB_USERNAME }}/localembed
flavor: |
latest=false
tags: |
type=raw,value=latest-gpu
type=semver,pattern={{version}}-gpu
type=semver,pattern={{major}}.{{minor}}-gpu

- name: Build and push Docker image (GPU)
uses: docker/build-push-action@v7
with:
context: .
file: ./Dockerfile.gpu
push: true
tags: ${{ steps.meta_gpu.outputs.tags }}
labels: ${{ steps.meta_gpu.outputs.labels }}
cache-from: type=gha,scope=build-gpu
cache-to: type=gha,mode=max,scope=build-gpu

draft-release:
runs-on: ubuntu-latest
Expand All @@ -53,6 +76,10 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Normalize semver tag for image examples
id: version
run: echo "semver=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT"

- name: Create Draft Release
id: create_release
uses: softprops/action-gh-release@v2
Expand All @@ -64,12 +91,18 @@ jobs:
body: |
For full documentation, supported models, and usage examples, please check the [README](https://github.com/heshinth/LocalEmbed/blob/main/README.md).

**Quick Start:**
**CPU Quick Start:**
```bash
docker run -d --name localembed -p 8000:8000 heshinth/localembed:${{ steps.version.outputs.semver }}
```

**GPU Quick Start:**
```bash
docker run -d --name localembed -p 8000:8000 heshinth/localembed:${{ github.ref_name }}
docker run -d --gpus all --name localembed-gpu -p 8000:8000 heshinth/localembed:${{ steps.version.outputs.semver }}-gpu
```

Or for the latest version:
**Latest tags:**
```bash
docker run -d --pull=always --name localembed --env-file .env -p 8000:8000 heshinth/localembed:latest
heshinth/localembed:latest
heshinth/localembed:latest-gpu
```
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ WORKDIR /app
COPY pyproject.toml uv.lock README.md ./

# We generate a standalone virtual environment
RUN uv sync --locked --no-cache --no-dev --no-install-project
RUN uv sync --locked --no-dev --no-install-project --extra cpu

# --- Stage 2: Final Runtime Image ---
FROM python:3.12-slim-trixie
Expand Down
37 changes: 37 additions & 0 deletions Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# We use the lean NVIDIA CUDA runtime as the base so your GPU actually works
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

# Python environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Magic trick: Copy the 'uv' binary directly from Astral's official image
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

COPY --from=ghcr.io/astral-sh/uv:latest ... makes the GPU image build non-reproducible and can break unexpectedly when the upstream latest tag changes. Prefer pinning uv to a specific version tag or digest (matching what’s used in the CPU Dockerfile) to keep builds deterministic.

Suggested change
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
COPY --from=ghcr.io/astral-sh/uv:0.4.30 /uv /uvx /bin/

Copilot uses AI. Check for mistakes.

WORKDIR /app

# Copy your dependency files first (for Docker layer caching)
COPY pyproject.toml uv.lock README.md .python-version ./

# uv will automatically download Python 3.12 (based on your pyproject.toml),
RUN uv venv

# and install dependencies into a standalone virtual environment at /app/.venv
RUN uv sync --locked --no-dev --no-install-project --extra gpu

# Copy your application code
COPY app ./app

# Run a final sync to install your actual project code
RUN uv sync --locked --no-dev --extra gpu

# Signal the app to use GPU logic
ENV USE_GPU=True

EXPOSE 8000

# Put the virtual environment on the PATH so it works natively
ENV PATH="/app/.venv/bin:$PATH"

# Run Uvicorn directly!
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
59 changes: 54 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
![GitHub License](https://img.shields.io/github/license/heshinth/LocalEmbed?cacheSeconds=20)
![Python](https://img.shields.io/badge/python-3.12-blue.svg?logo=python)
![Docker Image Version](https://img.shields.io/docker/v/heshinth/localembed?logo=docker)
![Docker Pulls](https://img.shields.io/docker/pulls/heshinth/localembed?logo=docker)


A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint.

Expand All @@ -14,13 +16,15 @@ Built with FastAPI and `fastembed`, LocalEmbed is optimized for running local do
* **Privacy First:** 100% local execution. No data ever leaves your network.
* **Zero-Latency Starts:** Automatically pre-loads your default model into memory on server boot.
* **Container-Native:** Multi-stage Docker build utilizing `uv` for a minimal, highly optimized runtime footprint.
* **CPU + GPU Ready:** Published Docker images for both CPU (`latest`) and NVIDIA GPU (`latest-gpu`) deployments.

---

## Getting Started

### Prerequisites
- **Docker** (Recommended)
- **For GPU deployment:** NVIDIA GPU + drivers + NVIDIA Container Toolkit
- Python 3.12+ (for local development)

### Configuration
Expand All @@ -33,17 +37,32 @@ LocalEmbed uses optional environment variables for configuration. Create a `.env
```
2. Open the `.env` file and set your desired configurations (like `DEFAULT_EMBEDDING_MODEL` or `HF_TOKEN`).

Environment variables:

- `DEFAULT_EMBEDDING_MODEL`: model to preload on startup
- `HF_TOKEN`: optional, useful to avoid model download rate limits
- `MODEL_CACHE_LIMIT`: max number of models kept in memory (LRU eviction)
- `EMBEDDING_THREADS`: CPU threads for embedding computation
- `BATCH_SIZE`: number of inputs processed per batch
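- `USE_GPU`: set `true` to request the CUDA execution provider when running outside the GPU image. This requires the `gpu` extra (`uv sync --extra gpu`) and a working CUDA-capable environment.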
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The README suggests USE_GPU can be set to force CUDA “in local/non-GPU-image runs”, but enabling it without installing the GPU extra (and without a working CUDA runtime) will typically fail model initialization. Clarify that USE_GPU=true requires the gpu extra (uv sync --extra gpu / fastembed-gpu) and a CUDA-capable environment, or document the service’s fallback behavior if CUDA isn’t available.

Suggested change
- `USE_GPU`: set `true` to force CUDA provider in local/non-GPU-image runs
- `USE_GPU`: set `true` to request the CUDA provider in local/non-GPU-image runs. This requires the GPU extra to be installed (`uv sync --extra gpu` or `fastembed-gpu`) and a working CUDA-capable environment; otherwise model initialization may fail.

Copilot uses AI. Check for mistakes.

### Deployment (Docker)

The easiest and recommended way to run LocalEmbed is using the pre-built Docker image from Docker Hub.

#### Option 1: Docker CLI
#### Option 1: Docker CLI (CPU)

```bash
docker run -d --pull=always --name localembed --env-file .env -p 8000:8000 heshinth/localembed:latest
```

#### Option 2: Using Docker Compose
#### Option 2: Docker CLI (GPU)

```bash
docker run -d --pull=always --gpus all --name localembed-gpu --env-file .env -p 8000:8000 heshinth/localembed:latest-gpu
```

#### Option 3: Docker Compose (CPU)

The compose file includes environment variables directly within it.

Expand All @@ -55,23 +74,53 @@ You can edit the file to configure it, then simply run:
docker compose up -d
```

**The API will be available at**: `http://localhost:8000`.
#### Option 4: Docker Compose (GPU)

Download the `docker-compose.gpu.yml` file from [here](./docker-compose.gpu.yml), then run:

```bash
docker compose -f docker-compose.gpu.yml up -d
```

### Docker Tag Scheme

For a release tag like `v0.1.3`, published image tags are:

- CPU: `latest`, `0.1.3`, `0.1`
- GPU: `latest-gpu`, `0.1.3-gpu`, `0.1-gpu`

**The API will be available at**: `http://localhost:8000/v1`.

### Local Development

If you want to run the application natively without Docker:

1. Install the dependencies using `uv` (recommended):
1. Install dependencies for CPU mode:

```bash
uv sync
uv sync --extra cpu
```

2. Run the FastAPI development server:

```bash
fastapi dev app/main.py
```

For local GPU mode:

1. Install dependencies for GPU mode:

```bash
uv sync --extra gpu
```

2. Start with GPU provider enabled:

```bash
USE_GPU=true fastapi dev app/main.py
```

## API Endpoints

- `GET /v1/health` — Health check
Expand Down
6 changes: 6 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@ class Settings(BaseSettings):
env_file=".env", env_file_encoding="utf-8", extra="ignore"
)

USE_GPU: bool = False
"""Toggle to enable CUDA Execution Provider logic"""

HF_TOKEN: SecretStr | None = None
"""Hugging Face API token. Optional"""

DEFAULT_EMBEDDING_MODEL: str = "BAAI/bge-small-en-v1.5"
"""The default embedding model to use."""

MODEL_CACHE_LIMIT: int = 2
"""Maximum number of models to keep in memory (LRU eviction)."""

EMBEDDING_THREADS: int = 8
"""Number of threads to use for embedding generation. Adjust based on your CPU capabilities."""

Expand Down
5 changes: 4 additions & 1 deletion app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ async def lifespan(app: FastAPI):

@app.get("/")
def read_root():
return {"Project": "LocalEmbed", "description": "LocalEmbed"}
return {
"Project": "LocalEmbed",
"description": "A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint. ",
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The description string has a trailing space at the end of the sentence, which will be reflected in the JSON output. Trim the extra whitespace for a cleaner response payload.

Suggested change
"description": "A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint. ",
"description": "A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint.",

Copilot uses AI. Check for mistakes.
}


app.include_router(router)
48 changes: 41 additions & 7 deletions app/services/embedder.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,53 @@
from collections import OrderedDict
from threading import RLock
from typing import Iterable
from pydantic import BaseModel
from fastembed import TextEmbedding
from loguru import logger
from app.config import settings

model_cache: dict[str, TextEmbedding] = {}
model_cache: OrderedDict[str, TextEmbedding] = OrderedDict()
model_cache_lock = RLock()


def _evict_lru_models_if_needed() -> None:
cache_limit = max(1, settings.MODEL_CACHE_LIMIT)
while len(model_cache) > cache_limit:
evicted_model_id, _ = model_cache.popitem(last=False)
logger.info(
f"Evicting least recently used embedding model from memory: {evicted_model_id}"
)


def get_model(model_id: str) -> TextEmbedding:
"""Fetch the model from cache, or load it if not present."""

if model_id not in model_cache:
with model_cache_lock:
cached_model = model_cache.get(model_id)
if cached_model is not None:
model_cache.move_to_end(model_id)
return cached_model

logger.info(f"Loading embedding model into memory: {model_id}")
model_cache[model_id] = TextEmbedding(
model_id, threads=settings.EMBEDDING_THREADS

# Configure providers based on GPU setting
providers = None
if settings.USE_GPU:
providers = ["CUDAExecutionProvider"]
logger.info("GPU acceleration (CUDAExecutionProvider) enabled.")

model = TextEmbedding(
model_id, threads=settings.EMBEDDING_THREADS, providers=providers
)
Comment on lines +25 to +41
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_model() holds model_cache_lock while performing the potentially expensive model download/load (TextEmbedding(...)). This serializes concurrent requests and can stall the whole service during first-load or cache-miss storms. Consider using a double-checked approach (check cache under lock, release lock to load, then re-acquire to insert/move-to-end with a second check) so unrelated requests can continue while one thread loads a model.

Copilot uses AI. Check for mistakes.
Comment on lines +36 to +41
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When USE_GPU is enabled, providers are set to only ['CUDAExecutionProvider']. If CUDA libraries/providers aren’t available (driver/toolkit mismatch, container started without GPUs, etc.), ONNX Runtime typically fails session creation rather than falling back. Consider including CPUExecutionProvider as a fallback (ordered after CUDA), or catching provider init errors and retrying with CPU to keep the service available.

Suggested change
providers = ["CUDAExecutionProvider"]
logger.info("GPU acceleration (CUDAExecutionProvider) enabled.")
model = TextEmbedding(
model_id, threads=settings.EMBEDDING_THREADS, providers=providers
)
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
logger.info(
"GPU acceleration preferred (CUDAExecutionProvider) with CPUExecutionProvider fallback enabled."
)
try:
model = TextEmbedding(
model_id, threads=settings.EMBEDDING_THREADS, providers=providers
)
except Exception as e:
if settings.USE_GPU:
logger.warning(
f"Failed to initialize model {model_id} with GPU-enabled providers {providers}: {e}. Retrying with CPUExecutionProvider only."
)
model = TextEmbedding(
model_id,
threads=settings.EMBEDDING_THREADS,
providers=["CPUExecutionProvider"],
)
else:
raise

Copilot uses AI. Check for mistakes.

resolved_providers = model.model.model.get_providers()
logger.info(
f"Model {model_id} loaded successfully with providers: {resolved_providers}"
)
Comment on lines +43 to 46
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

resolved_providers = model.model.model.get_providers() relies on internal/private attributes of fastembed’s TextEmbedding implementation (multiple nested .model). This is brittle across library versions and can break startup even if embedding works. Prefer a public API for provider reporting (if available), or guard this log line so provider introspection failures don’t prevent the model from loading.

Copilot uses AI. Check for mistakes.
return model_cache[model_id]

model_cache[model_id] = model
_evict_lru_models_if_needed()
return model


def preload_default_model():
Expand Down Expand Up @@ -47,9 +79,11 @@ def embed_text(

try:
# model.embed natively batches an iterable of documents giving an iterable of numpy arrays
vectors = [vec.tolist() for vec in model.embed(texts,batch_size=settings.BATCH_SIZE)]
vectors = [
vec.tolist() for vec in model.embed(texts, batch_size=settings.BATCH_SIZE)
]

# token_count returns an iterator of ints (tokens per document), so we sum them
# token_count returns a single int for total tokens
total_tokens = model.token_count(texts)

return EmbeddingResult(
Expand Down
14 changes: 14 additions & 0 deletions docker-compose.gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
services:
localembed:
image: heshinth/localembed:latest-gpu
pull_policy: always
container_name: localembed-gpu
ports:
- "8000:8000"
environment:
- DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
# - HF_TOKEN=your_token_here
# - MODEL_CACHE_LIMIT=2
# - EMBEDDING_THREADS=8
# - BATCH_SIZE=256
gpus: all
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ services:
environment:
- DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
# - HF_TOKEN=your_token_here
# - MODEL_CACHE_LIMIT=2
# - EMBEDDING_THREADS=8
# - BATCH_SIZE=256
8 changes: 7 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ authors = [
requires-python = ">=3.12"
dependencies = [
"fastapi[standard]<1.0.0,>=0.116.1",
"fastembed>=0.8.0",
"loguru>=0.7.3",
"pydantic>=2.12.5",
"pydantic-settings>=2.13.1",
Expand All @@ -19,3 +18,10 @@ dependencies = [
dev = [
"poethepoet>=0.44.0",
]

[project.optional-dependencies]
cpu = ["fastembed>=0.8.0"]
gpu = ["fastembed-gpu>=0.8.0"]

[tool.poe.tasks]
summary = "npx repomix@latest --style markdown"
Loading