diff --git a/.env.sample b/.env.sample index 43d4245..738c8c5 100644 --- a/.env.sample +++ b/.env.sample @@ -6,10 +6,14 @@ HF_TOKEN= # (Default: BAAI/bge-small-en-v1.5) DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 +# Maximum number of embedding models to keep in memory. +# Least recently used model is evicted when the limit is exceeded. +MODEL_CACHE_LIMIT=2 + #Number of threads to use for embedding generation. #Adjust based on your CPU capabilities. -EMBEDDING_THREADS=8 +# EMBEDDING_THREADS=8 # Batch size for embedding generation. #Adjust based on your system's memory and performance. -BATCH_SIZE=256 +# BATCH_SIZE=256 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5a4b80f..8302c35 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -43,8 +43,31 @@ jobs: push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max + cache-from: type=gha,scope=build-cpu + cache-to: type=gha,mode=max,scope=build-cpu + + - name: Extract metadata for Docker (GPU) + id: meta_gpu + uses: docker/metadata-action@v6 + with: + images: ${{ secrets.DOCKERHUB_USERNAME }}/localembed + flavor: | + latest=false + tags: | + type=raw,value=latest-gpu + type=semver,pattern={{version}}-gpu + type=semver,pattern={{major}}.{{minor}}-gpu + + - name: Build and push Docker image (GPU) + uses: docker/build-push-action@v7 + with: + context: . 
+ file: ./Dockerfile.gpu + push: true + tags: ${{ steps.meta_gpu.outputs.tags }} + labels: ${{ steps.meta_gpu.outputs.labels }} + cache-from: type=gha,scope=build-gpu + cache-to: type=gha,mode=max,scope=build-gpu draft-release: runs-on: ubuntu-latest @@ -53,6 +76,10 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Normalize semver tag for image examples + id: version + run: echo "semver=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT" + - name: Create Draft Release id: create_release uses: softprops/action-gh-release@v2 @@ -64,12 +91,18 @@ jobs: body: | For full documentation, supported models, and usage examples, please check the [README](https://github.com/heshinth/LocalEmbed/blob/main/README.md). - **Quick Start:** + **CPU Quick Start:** + ```bash + docker run -d --name localembed -p 8000:8000 heshinth/localembed:${{ steps.version.outputs.semver }} + ``` + + **GPU Quick Start:** ```bash - docker run -d --name localembed -p 8000:8000 heshinth/localembed:${{ github.ref_name }} + docker run -d --gpus all --name localembed-gpu -p 8000:8000 heshinth/localembed:${{ steps.version.outputs.semver }}-gpu ``` - Or for the latest version: + **Latest tags:** ```bash - docker run -d --pull=always --name localembed --env-file .env -p 8000:8000 heshinth/localembed:latest + heshinth/localembed:latest + heshinth/localembed:latest-gpu ``` diff --git a/Dockerfile b/Dockerfile index 8b87ef2..865c60b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ WORKDIR /app COPY pyproject.toml uv.lock README.md ./ # We generate a standalone virtual environment -RUN uv sync --locked --no-cache --no-dev --no-install-project +RUN uv sync --locked --no-dev --no-install-project --extra cpu # --- Stage 2: Final Runtime Image --- FROM python:3.12-slim-trixie diff --git a/Dockerfile.gpu b/Dockerfile.gpu new file mode 100644 index 0000000..9c3be98 --- /dev/null +++ b/Dockerfile.gpu @@ -0,0 +1,37 @@ +# We use the lean NVIDIA CUDA runtime as the base so your GPU actually works +FROM 
nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 + +# Python environment variables +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +# Magic trick: Copy the 'uv' binary directly from Astral's official image +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +WORKDIR /app + +# Copy your dependency files first (for Docker layer caching) +COPY pyproject.toml uv.lock README.md .python-version ./ + +# uv will automatically download the Python version pinned in the copied .python-version file (within the requires-python range from pyproject.toml), +RUN uv venv + +# and install dependencies into a standalone virtual environment at /app/.venv +RUN uv sync --locked --no-dev --no-install-project --extra gpu + +# Copy your application code +COPY app ./app + +# Run a final sync to install your actual project code +RUN uv sync --locked --no-dev --extra gpu + +# Signal the app to use GPU logic +ENV USE_GPU=True + +EXPOSE 8000 + +# Put the virtual environment on the PATH so it works natively +ENV PATH="/app/.venv/bin:$PATH" + +# Run Uvicorn directly! +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/README.md b/README.md index 9649d31..910aaa4 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ ![GitHub License](https://img.shields.io/github/license/heshinth/LocalEmbed?cacheSeconds=20) ![Python](https://img.shields.io/badge/python-3.12-blue.svg?logo=python) ![Docker Image Version](https://img.shields.io/docker/v/heshinth/localembed?logo=docker) +![Docker Pulls](https://img.shields.io/docker/pulls/heshinth/localembed?logo=docker) + A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint. @@ -14,6 +16,7 @@ Built with FastAPI and `fastembed`, LocalEmbed is optimized for running local do * **Privacy First:** 100% local execution. No data ever leaves your network. * **Zero-Latency Starts:** Automatically pre-loads your default model into memory on server boot.
* **Container-Native:** Multi-stage Docker build utilizing `uv` for a minimal, highly optimized runtime footprint. +* **CPU + GPU Ready:** Published Docker images for both CPU (`latest`) and NVIDIA GPU (`latest-gpu`) deployments. --- @@ -21,6 +24,7 @@ Built with FastAPI and `fastembed`, LocalEmbed is optimized for running local do ### Prerequisites - **Docker** (Recommended) +- **For GPU deployment:** NVIDIA GPU + drivers + NVIDIA Container Toolkit - Python 3.12+ (for local development) ### Configuration @@ -33,17 +37,32 @@ LocalEmbed uses optional environment variables for configuration. Create a `.env ``` 2. Open the `.env` file and set your desired configurations (like `DEFAULT_EMBEDDING_MODEL` or `HF_TOKEN`). +Environment variables: + +- `DEFAULT_EMBEDDING_MODEL`: model to preload on startup +- `HF_TOKEN`: optional, useful to avoid model download rate limits +- `MODEL_CACHE_LIMIT`: max number of models kept in memory (LRU eviction) +- `EMBEDDING_THREADS`: CPU threads for embedding computation +- `BATCH_SIZE`: number of inputs processed per batch +- `USE_GPU`: set to `true` to request the CUDA execution provider when running outside the GPU image (default `false`; the GPU image already sets it) + ### Deployment (Docker) The easiest and recommended way to run LocalEmbed is using the pre-built Docker image from Docker Hub. -#### Option 1: Docker CLI +#### Option 1: Docker CLI (CPU) ```bash docker run -d --pull=always --name localembed --env-file .env -p 8000:8000 heshinth/localembed:latest ``` -#### Option 2: Using Docker Compose +#### Option 2: Docker CLI (GPU) + +```bash +docker run -d --pull=always --gpus all --name localembed-gpu --env-file .env -p 8000:8000 heshinth/localembed:latest-gpu +``` + +#### Option 3: Docker Compose (CPU) The compose file includes environment variables directly within it. @@ -55,23 +74,53 @@ You can edit the file to configure it, then simply run: docker compose up -d ``` -**The API will be available at**: `http://localhost:8000`.
+#### Option 4: Docker Compose (GPU) + +Download the `docker-compose.gpu.yml` file from [here](./docker-compose.gpu.yml), then run: + +```bash +docker compose -f docker-compose.gpu.yml up -d +``` + +### Docker Tag Scheme + +For a release tag like `v0.1.3`, published image tags are: + +- CPU: `latest`, `0.1.3`, `0.1` +- GPU: `latest-gpu`, `0.1.3-gpu`, `0.1-gpu` + +**The API will be available at**: `http://localhost:8000/v1`. ### Local Development If you want to run the application natively without Docker: -1. Install the dependencies using `uv` (recommended): +1. Install dependencies for CPU mode: ```bash - uv sync + uv sync --extra cpu ``` 2. Run the FastAPI development server: + ```bash fastapi dev app/main.py ``` +For local GPU mode: + +1. Install dependencies for GPU mode: + + ```bash + uv sync --extra gpu + ``` + +2. Start with GPU provider enabled: + + ```bash + USE_GPU=true fastapi dev app/main.py + ``` + ## API Endpoints - `GET /v1/health` — Health check diff --git a/app/config.py b/app/config.py index a049c03..d85e01c 100644 --- a/app/config.py +++ b/app/config.py @@ -9,12 +9,18 @@ class Settings(BaseSettings): env_file=".env", env_file_encoding="utf-8", extra="ignore" ) + USE_GPU: bool = False + """Toggle to enable CUDA Execution Provider logic""" + HF_TOKEN: SecretStr | None = None """Hugging Face API token. Optional""" DEFAULT_EMBEDDING_MODEL: str = "BAAI/bge-small-en-v1.5" """The default embedding model to use.""" + MODEL_CACHE_LIMIT: int = 2 + """Maximum number of models to keep in memory (LRU eviction).""" + EMBEDDING_THREADS: int = 8 """Number of threads to use for embedding generation. 
Adjust based on your CPU capabilities.""" diff --git a/app/main.py b/app/main.py index 9607334..1d8a17f 100644 --- a/app/main.py +++ b/app/main.py @@ -41,7 +41,10 @@ async def lifespan(app: FastAPI): @app.get("/") def read_root(): - return {"Project": "LocalEmbed", "description": "LocalEmbed"} + return { + "Project": "LocalEmbed", + "description": "A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint. ", + } app.include_router(router) diff --git a/app/services/embedder.py b/app/services/embedder.py index cff065d..e19b5c4 100644 --- a/app/services/embedder.py +++ b/app/services/embedder.py @@ -1,21 +1,53 @@ +from collections import OrderedDict +from threading import RLock from typing import Iterable from pydantic import BaseModel from fastembed import TextEmbedding from loguru import logger from app.config import settings -model_cache: dict[str, TextEmbedding] = {} +model_cache: OrderedDict[str, TextEmbedding] = OrderedDict() +model_cache_lock = RLock() + + +def _evict_lru_models_if_needed() -> None: + cache_limit = max(1, settings.MODEL_CACHE_LIMIT) + while len(model_cache) > cache_limit: + evicted_model_id, _ = model_cache.popitem(last=False) + logger.info( + f"Evicting least recently used embedding model from memory: {evicted_model_id}" + ) def get_model(model_id: str) -> TextEmbedding: """Fetch the model from cache, or load it if not present.""" - if model_id not in model_cache: + with model_cache_lock: + cached_model = model_cache.get(model_id) + if cached_model is not None: + model_cache.move_to_end(model_id) + return cached_model + logger.info(f"Loading embedding model into memory: {model_id}") - model_cache[model_id] = TextEmbedding( - model_id, threads=settings.EMBEDDING_THREADS + + # Configure providers based on GPU setting + providers = None + if settings.USE_GPU: + providers = ["CUDAExecutionProvider"] + logger.info("GPU acceleration (CUDAExecutionProvider) enabled.") + + model = TextEmbedding( + model_id, 
threads=settings.EMBEDDING_THREADS, providers=providers + ) + + resolved_providers = model.model.model.get_providers() + logger.info( + f"Model {model_id} loaded successfully with providers: {resolved_providers}" ) - return model_cache[model_id] + + model_cache[model_id] = model + _evict_lru_models_if_needed() + return model def preload_default_model(): @@ -47,9 +79,11 @@ def embed_text( try: # model.embed natively batches an iterable of documents giving an iterable of numpy arrays - vectors = [vec.tolist() for vec in model.embed(texts,batch_size=settings.BATCH_SIZE)] + vectors = [ + vec.tolist() for vec in model.embed(texts, batch_size=settings.BATCH_SIZE) + ] - # token_count returns an iterator of ints (tokens per document), so we sum them + # token_count returns a single int for total tokens total_tokens = model.token_count(texts) return EmbeddingResult( diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml new file mode 100644 index 0000000..34dcdee --- /dev/null +++ b/docker-compose.gpu.yml @@ -0,0 +1,14 @@ +services: + localembed: + image: heshinth/localembed:latest-gpu + pull_policy: always + container_name: localembed-gpu + ports: + - "8000:8000" + environment: + - DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 + # - HF_TOKEN=your_token_here + # - MODEL_CACHE_LIMIT=2 + # - EMBEDDING_THREADS=8 + # - BATCH_SIZE=256 + gpus: all diff --git a/docker-compose.yml b/docker-compose.yml index e9621b8..b583ff2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,5 +8,6 @@ services: environment: - DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 # - HF_TOKEN=your_token_here + # - MODEL_CACHE_LIMIT=2 # - EMBEDDING_THREADS=8 # - BATCH_SIZE=256 diff --git a/pyproject.toml b/pyproject.toml index 40064d3..fce32bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ authors = [ requires-python = ">=3.12" dependencies = [ "fastapi[standard]<1.0.0,>=0.116.1", - "fastembed>=0.8.0", "loguru>=0.7.3", "pydantic>=2.12.5", "pydantic-settings>=2.13.1", @@ 
-19,3 +18,10 @@ dependencies = [ dev = [ "poethepoet>=0.44.0", ] + +[project.optional-dependencies] +cpu = ["fastembed>=0.8.0"] +gpu = ["fastembed-gpu>=0.8.0"] + +[tool.poe.tasks] +summary = "npx repomix@latest --style markdown" \ No newline at end of file diff --git a/uv.lock b/uv.lock index 1f8553d..1781b82 100644 --- a/uv.lock +++ b/uv.lock @@ -323,6 +323,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/e8/26b7d78bb8972498c467ca34cb12ee2e60d26ba5eae6d8443189a1af37a5/fastembed-0.8.0-py3-none-any.whl", hash = "sha256:40bee672657574a1009e35ec50030a55f2b426842cb011845379817641bbbbd0", size = 116572, upload-time = "2026-03-23T16:34:40.69Z" }, ] +[[package]] +name = "fastembed-gpu" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "loguru" }, + { name = "mmh3" }, + { name = "numpy" }, + { name = "onnxruntime-gpu" }, + { name = "pillow" }, + { name = "py-rust-stemmers" }, + { name = "requests" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/82/b7/cf536f4c3d6764cba0229e9e4c3fde08c5780c9c5850eef736a21d4500d3/fastembed_gpu-0.8.0.tar.gz", hash = "sha256:465c058a366c8cde536bead27366b969618301c2a2ef2b771f5828d4ae83e561", size = 75246, upload-time = "2026-03-23T16:34:52.233Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/75/25/f952c90695df67d5c4cf088b8ed2f23fb630c13ebd5e84f68bd820d03a53/fastembed_gpu-0.8.0-py3-none-any.whl", hash = "sha256:8674d4d6d41416c02afa51f371bd1f2792daf6267a38f29831b60b1d89ca1c95", size = 116621, upload-time = "2026-03-23T16:34:50.951Z" }, +] + [[package]] name = "filelock" version = "3.25.2" @@ -494,12 +515,19 @@ version = "0.1.3" source = { virtual = "." 
} dependencies = [ { name = "fastapi", extra = ["standard"] }, - { name = "fastembed" }, { name = "loguru" }, { name = "pydantic" }, { name = "pydantic-settings" }, ] +[package.optional-dependencies] +cpu = [ + { name = "fastembed" }, +] +gpu = [ + { name = "fastembed-gpu" }, +] + [package.dev-dependencies] dev = [ { name = "poethepoet" }, @@ -508,11 +536,13 @@ dev = [ [package.metadata] requires-dist = [ { name = "fastapi", extras = ["standard"], specifier = ">=0.116.1,<1.0.0" }, - { name = "fastembed", specifier = ">=0.8.0" }, + { name = "fastembed", marker = "extra == 'cpu'", specifier = ">=0.8.0" }, + { name = "fastembed-gpu", marker = "extra == 'gpu'", specifier = ">=0.8.0" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "pydantic-settings", specifier = ">=2.13.1" }, ] +provides-extras = ["cpu", "gpu"] [package.metadata.requires-dev] dev = [{ name = "poethepoet", specifier = ">=0.44.0" }] @@ -799,6 +829,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6c/1d/1666dc64e78d8587d168fec4e3b7922b92eb286a2ddeebcf6acb55c7dc82/onnxruntime-1.24.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e1cc6a518255f012134bc791975a6294806be9a3b20c4a54cca25194c90cf731", size = 17247021, upload-time = "2026-03-17T22:04:52.377Z" }, ] +[[package]] +name = "onnxruntime-gpu" +version = "1.25.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "flatbuffers" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "protobuf" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/1d/6d/2c13d3eff74caa9e59820a044a75becd34e9cbeeaf7617ad7679cdb1fdb7/onnxruntime_gpu-1.25.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f0c36c63c8b0eb4091f2567067f480f66f0aedc189eb009545c98ce7e919056", size = 270342429, upload-time = "2026-04-22T17:28:10.526Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/2e/9fc303ae59d4caeb85ec3cea6881b7de8ca1d2a07140fade39913cd7ff10/onnxruntime_gpu-1.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:61178cc4d84f59861714554531e01cccbd33ddf13cc0e87a3adea13b24d297ce", size = 220847708, upload-time = "2026-04-22T17:20:47.993Z" }, + { url = "https://files.pythonhosted.org/packages/f5/15/e63fe7b1abad6884bed07e9bb333e9f0ea48fbb8cbc1ea4a67ee6019d5d0/onnxruntime_gpu-1.25.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e462eb13ee9955117baec4f518916c1e7cb1a96001114105632bc6d454c6aee6", size = 270342324, upload-time = "2026-04-22T17:28:21.142Z" }, + { url = "https://files.pythonhosted.org/packages/21/10/b3533243d062b589d4b1f3ae26584af332c5cde618e7f6f5ff6fabbfd5f2/onnxruntime_gpu-1.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:9a3682158e5e911385252eb95d6332b6f525972746c582e10f8a78213b39e624", size = 220848188, upload-time = "2026-04-22T17:20:56.946Z" }, + { url = "https://files.pythonhosted.org/packages/35/6c/d7706dd1d0eaafdba44d5c89f8d952de41e425a1b0cbd3ecfa60f918c249/onnxruntime_gpu-1.25.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8514b92c5929c953850090d823d018770cba2a971efab5f8f69a3c4280cdc632", size = 270364210, upload-time = "2026-04-22T17:28:33.568Z" }, + { url = "https://files.pythonhosted.org/packages/37/01/9f1b16ea857e3a4b5e82a2d70b52ea46a0083569f737d840f74a1b86818f/onnxruntime_gpu-1.25.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ffe9df4016b061ec3a5565a4fc08cdb86808cd8b9c255c42301066c0c24a81b5", size = 270345126, upload-time = "2026-04-22T17:28:44.416Z" }, + { url = "https://files.pythonhosted.org/packages/56/c8/aae22f3c9cea9160d8d969734a1927720fcb4d4ad4abe269c407c1d2b63c/onnxruntime_gpu-1.25.0-cp314-cp314-win_amd64.whl", hash = "sha256:2173b71631208177fe704ce2d92eac3acbf758285327247ea40a31a9f0bcc073", size = 223385369, upload-time = "2026-04-22T17:21:06.026Z" }, + { url = 
"https://files.pythonhosted.org/packages/ed/0a/79fba6a1a32803a2bf8b99187e0ea5d5d69ffe0c5c0f469bde232ceb8327/onnxruntime_gpu-1.25.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8576c721c600cc669717a2ae49af30fdfff230480099653adc7b79d58a240852", size = 270364130, upload-time = "2026-04-22T17:28:54.708Z" }, +] + [[package]] name = "packaging" version = "26.0"