diff --git a/.env.sample b/.env.sample
index 43d4245..738c8c5 100644
--- a/.env.sample
+++ b/.env.sample
@@ -6,10 +6,14 @@ HF_TOKEN=
 # (Default: BAAI/bge-small-en-v1.5)
 DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
 
+# Maximum number of embedding models to keep in memory.
+# The least recently used model is evicted once the limit is exceeded (minimum effective value: 1).
+MODEL_CACHE_LIMIT=2
+
 #Number of threads to use for embedding generation. 
 #Adjust based on your CPU capabilities.
-EMBEDDING_THREADS=8
+# EMBEDDING_THREADS=8
 
 # Batch size for embedding generation. 
 #Adjust based on your system's memory and performance.
-BATCH_SIZE=256
+# BATCH_SIZE=256
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 5a4b80f..8302c35 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -43,8 +43,31 @@ jobs:
                   push: true
                   tags: ${{ steps.meta.outputs.tags }}
                   labels: ${{ steps.meta.outputs.labels }}
-                  cache-from: type=gha
-                  cache-to: type=gha,mode=max
+                  cache-from: type=gha,scope=build-cpu
+                  cache-to: type=gha,mode=max,scope=build-cpu
+
+            - name: Extract metadata for Docker (GPU)
+              id: meta_gpu
+              uses: docker/metadata-action@v6
+              with:
+                  images: ${{ secrets.DOCKERHUB_USERNAME }}/localembed
+                  flavor: |
+                      latest=false
+                  tags: |
+                      type=raw,value=latest-gpu
+                      type=semver,pattern={{version}}-gpu
+                      type=semver,pattern={{major}}.{{minor}}-gpu
+
+            - name: Build and push Docker image (GPU)
+              uses: docker/build-push-action@v7
+              with:
+                  context: .
+                  file: ./Dockerfile.gpu
+                  push: true
+                  tags: ${{ steps.meta_gpu.outputs.tags }}
+                  labels: ${{ steps.meta_gpu.outputs.labels }}
+                  cache-from: type=gha,scope=build-gpu
+                  cache-to: type=gha,mode=max,scope=build-gpu
 
     draft-release:
         runs-on: ubuntu-latest
@@ -53,6 +76,10 @@ jobs:
             - name: Checkout code
               uses: actions/checkout@v4
 
+            - name: Normalize semver tag for image examples
+              id: version
+              run: echo "semver=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT"
+
             - name: Create Draft Release
               id: create_release
               uses: softprops/action-gh-release@v2
@@ -64,12 +91,18 @@ jobs:
                   body: |
                       For full documentation, supported models, and usage examples, please check the [README](https://github.com/heshinth/LocalEmbed/blob/main/README.md).
 
-                      **Quick Start:**
+                      **CPU Quick Start:**
+                      ```bash
+                      docker run -d --name localembed -p 8000:8000 heshinth/localembed:${{ steps.version.outputs.semver }}
+                      ```
+
+                      **GPU Quick Start:**
                       ```bash
-                      docker run -d --name localembed -p 8000:8000 heshinth/localembed:${{ github.ref_name }}
+                      docker run -d --gpus all --name localembed-gpu -p 8000:8000 heshinth/localembed:${{ steps.version.outputs.semver }}-gpu
                       ```
 
-                      Or for the latest version:
+                      **Latest tags:**
                       ```bash
-                      docker run -d --pull=always --name localembed --env-file .env -p 8000:8000 heshinth/localembed:latest
+                      heshinth/localembed:latest
+                      heshinth/localembed:latest-gpu
                       ```
diff --git a/Dockerfile b/Dockerfile
index 8b87ef2..865c60b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,7 +9,7 @@ WORKDIR /app
 COPY pyproject.toml uv.lock README.md ./
 
 # We generate a standalone virtual environment
-RUN uv sync --locked --no-cache --no-dev --no-install-project
+RUN uv sync --locked --no-dev --no-install-project --extra cpu
 
 # --- Stage 2: Final Runtime Image ---
 FROM python:3.12-slim-trixie
diff --git a/Dockerfile.gpu b/Dockerfile.gpu
new file mode 100644
index 0000000..9c3be98
--- /dev/null
+++ b/Dockerfile.gpu
@@ -0,0 +1,37 @@
+# Base image: NVIDIA CUDA 12.4.1 runtime with cuDNN on Ubuntu 22.04, which supplies the GPU runtime libraries.
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+
+# Python environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# Copy the uv and uvx binaries from Astral's published uv image instead of installing them.
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+WORKDIR /app
+
+# Copy your dependency files first (for Docker layer caching)
+COPY pyproject.toml uv.lock README.md .python-version ./
+
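+# Create the virtual environment at /app/.venv; uv selects the interpreter from .python-version / requires-python (presumably downloading it if the base image has none).
+RUN uv venv
+
+# Install the locked runtime dependencies (GPU extra) into that virtual environment, without the project itself.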
+RUN uv sync --locked --no-dev --no-install-project --extra gpu
+
+# Copy your application code
+COPY app ./app
+
+# Run a final sync to install your actual project code
+RUN uv sync --locked --no-dev --extra gpu
+
+# Enable the USE_GPU setting so the app requests the CUDA execution provider.
+ENV USE_GPU=True
+
+EXPOSE 8000
+
+# Put the virtual environment's bin directory first on PATH so the CMD below resolves uvicorn from it.
+ENV PATH="/app/.venv/bin:$PATH"
+
+# Run Uvicorn directly!
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 9649d31..910aaa4 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,8 @@
 ![GitHub License](https://img.shields.io/github/license/heshinth/LocalEmbed?cacheSeconds=20)
 ![Python](https://img.shields.io/badge/python-3.12-blue.svg?logo=python)
 ![Docker Image Version](https://img.shields.io/docker/v/heshinth/localembed?logo=docker)
+![Docker Pulls](https://img.shields.io/docker/pulls/heshinth/localembed?logo=docker)
+
 
 A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint. 
 
@@ -14,6 +16,7 @@ Built with FastAPI and `fastembed`, LocalEmbed is optimized for running local do
 * **Privacy First:** 100% local execution. No data ever leaves your network.
 * **Zero-Latency Starts:** Automatically pre-loads your default model into memory on server boot.
 * **Container-Native:** Multi-stage Docker build utilizing `uv` for a minimal, highly optimized runtime footprint.
+* **CPU + GPU Ready:** Published Docker images for both CPU (`latest`) and NVIDIA GPU (`latest-gpu`) deployments.
 
 ---
 
@@ -21,6 +24,7 @@ Built with FastAPI and `fastembed`, LocalEmbed is optimized for running local do
 
 ### Prerequisites
 - **Docker** (Recommended)
+- **For GPU deployment:** NVIDIA GPU + drivers + NVIDIA Container Toolkit
 - Python 3.12+ (for local development)
 
 ### Configuration
@@ -33,17 +37,32 @@ LocalEmbed uses optional environment variables for configuration. Create a `.env
    ```
 2. Open the `.env` file and set your desired configurations (like `DEFAULT_EMBEDDING_MODEL` or `HF_TOKEN`).
 
+Environment variables:
+
+- `DEFAULT_EMBEDDING_MODEL`: model to preload on startup
+- `HF_TOKEN`: optional, useful to avoid model download rate limits
+- `MODEL_CACHE_LIMIT`: max number of models kept in memory (LRU eviction)
+- `EMBEDDING_THREADS`: CPU threads for embedding computation
+- `BATCH_SIZE`: number of inputs processed per batch
+- `USE_GPU`: set to `true` to request the CUDA execution provider when running outside the GPU image (the GPU image already sets it)
+
 ### Deployment (Docker)
 
 The easiest and recommended way to run LocalEmbed is using the pre-built Docker image from Docker Hub.
 
-#### Option 1: Docker CLI
+#### Option 1: Docker CLI (CPU)
 
 ```bash
 docker run -d --pull=always --name localembed --env-file .env -p 8000:8000 heshinth/localembed:latest
 ```
 
-#### Option 2: Using Docker Compose
+#### Option 2: Docker CLI (GPU)
+
+```bash
+docker run -d --pull=always --gpus all --name localembed-gpu --env-file .env -p 8000:8000 heshinth/localembed:latest-gpu
+```
+
+#### Option 3: Docker Compose (CPU)
 
 The compose file includes environment variables directly within it.
 
@@ -55,23 +74,53 @@ You can edit the file to configure it, then simply run:
 docker compose up -d
 ```
 
-**The API will be available at**: `http://localhost:8000`.
+#### Option 4: Docker Compose (GPU)
+
+Download the `docker-compose.gpu.yml` file from [here](./docker-compose.gpu.yml), then run:
+
+```bash
+docker compose -f docker-compose.gpu.yml up -d
+```
+
+### Docker Tag Scheme
+
+For a release tag like `v0.1.3`, the published image tags are:
+
+- CPU: `latest`, `0.1.3`, `0.1`
+- GPU: `latest-gpu`, `0.1.3-gpu`, `0.1-gpu`
+
+**The API will be available at**: `http://localhost:8000/v1`.
 
 ### Local Development
 
 If you want to run the application natively without Docker:
 
-1. Install the dependencies using `uv` (recommended):
+1. Install dependencies for CPU mode:
 
    ```bash
-   uv sync
+   uv sync --extra cpu
    ```
 
 2. Run the FastAPI development server:
+
    ```bash
    fastapi dev app/main.py
    ```
 
+For local GPU mode:
+
+1. Install dependencies for GPU mode:
+
+   ```bash
+   uv sync --extra gpu
+   ```
+
+2. Start with GPU provider enabled:
+
+   ```bash
+   USE_GPU=true fastapi dev app/main.py
+   ```
+
 ## API Endpoints
 
 - `GET /v1/health` — Health check
diff --git a/app/config.py b/app/config.py
index a049c03..d85e01c 100644
--- a/app/config.py
+++ b/app/config.py
@@ -9,12 +9,18 @@ class Settings(BaseSettings):
         env_file=".env", env_file_encoding="utf-8", extra="ignore"
     )
 
+    USE_GPU: bool = False
+    """Toggle to enable CUDA Execution Provider logic"""
+
     HF_TOKEN: SecretStr | None = None
     """Hugging Face API token. Optional"""
 
     DEFAULT_EMBEDDING_MODEL: str = "BAAI/bge-small-en-v1.5"
     """The default embedding model to use."""
 
+    MODEL_CACHE_LIMIT: int = 2
+    """Maximum number of models to keep in memory (LRU eviction)."""
+
     EMBEDDING_THREADS: int = 8
     """Number of threads to use for embedding generation. Adjust based on your CPU capabilities."""
 
diff --git a/app/main.py b/app/main.py
index 9607334..1d8a17f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -41,7 +41,10 @@ async def lifespan(app: FastAPI):
 
 @app.get("/")
 def read_root():
-    return {"Project": "LocalEmbed", "description": "LocalEmbed"}
+    return {
+        "Project": "LocalEmbed",
+        "description": "A lightweight text embedding API designed as a drop-in replacement for the OpenAI embeddings endpoint. ",
+    }
 
 
 app.include_router(router)
diff --git a/app/services/embedder.py b/app/services/embedder.py
index cff065d..e19b5c4 100644
--- a/app/services/embedder.py
+++ b/app/services/embedder.py
@@ -1,21 +1,53 @@
+from collections import OrderedDict
+from threading import RLock
 from typing import Iterable
 from pydantic import BaseModel
 from fastembed import TextEmbedding
 from loguru import logger
 from app.config import settings
 
-model_cache: dict[str, TextEmbedding] = {}
+model_cache: OrderedDict[str, TextEmbedding] = OrderedDict()
+model_cache_lock = RLock()
+
+
+def _evict_lru_models_if_needed() -> None:
+    cache_limit = max(1, settings.MODEL_CACHE_LIMIT)
+    while len(model_cache) > cache_limit:
+        evicted_model_id, _ = model_cache.popitem(last=False)
+        logger.info(
+            f"Evicting least recently used embedding model from memory: {evicted_model_id}"
+        )
 
 
 def get_model(model_id: str) -> TextEmbedding:
     """Fetch the model from cache, or load it if not present."""
 
-    if model_id not in model_cache:
+    with model_cache_lock:
+        cached_model = model_cache.get(model_id)
+        if cached_model is not None:
+            model_cache.move_to_end(model_id)
+            return cached_model
+
         logger.info(f"Loading embedding model into memory: {model_id}")
-        model_cache[model_id] = TextEmbedding(
-            model_id, threads=settings.EMBEDDING_THREADS
+
+        # Configure providers based on GPU setting
+        providers = None
+        if settings.USE_GPU:
+            providers = ["CUDAExecutionProvider"]
+            logger.info("GPU acceleration (CUDAExecutionProvider) enabled.")
+
+        model = TextEmbedding(
+            model_id, threads=settings.EMBEDDING_THREADS, providers=providers
+        )
+
+        resolved_providers = model.model.model.get_providers()
+        logger.info(
+            f"Model {model_id} loaded successfully with providers: {resolved_providers}"
         )
-    return model_cache[model_id]
+
+        model_cache[model_id] = model
+        _evict_lru_models_if_needed()
+        return model
 
 
 def preload_default_model():
@@ -47,9 +79,11 @@ def embed_text(
 
     try:
         # model.embed natively batches an iterable of documents giving an iterable of numpy arrays
-        vectors = [vec.tolist() for vec in model.embed(texts,batch_size=settings.BATCH_SIZE)]
+        vectors = [
+            vec.tolist() for vec in model.embed(texts, batch_size=settings.BATCH_SIZE)
+        ]
 
-        # token_count returns an iterator of ints (tokens per document), so we sum them
+        # token_count returns a single int for total tokens
         total_tokens = model.token_count(texts)
 
         return EmbeddingResult(
diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml
new file mode 100644
index 0000000..34dcdee
--- /dev/null
+++ b/docker-compose.gpu.yml
@@ -0,0 +1,14 @@
+services:
+  localembed:
+    image: heshinth/localembed:latest-gpu
+    pull_policy: always
+    container_name: localembed-gpu
+    ports:
+      - "8000:8000"
+    environment:
+      - DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
+      # - HF_TOKEN=your_token_here
+      # - MODEL_CACHE_LIMIT=2
+      # - EMBEDDING_THREADS=8
+      # - BATCH_SIZE=256
+    gpus: all
diff --git a/docker-compose.yml b/docker-compose.yml
index e9621b8..b583ff2 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,5 +8,6 @@ services:
     environment:
       - DEFAULT_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
       # - HF_TOKEN=your_token_here
+      # - MODEL_CACHE_LIMIT=2
       # - EMBEDDING_THREADS=8
       # - BATCH_SIZE=256
diff --git a/pyproject.toml b/pyproject.toml
index 40064d3..fce32bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,6 @@ authors = [
 requires-python = ">=3.12"
 dependencies = [
     "fastapi[standard]<1.0.0,>=0.116.1",
-    "fastembed>=0.8.0",
     "loguru>=0.7.3",
     "pydantic>=2.12.5",
     "pydantic-settings>=2.13.1",
@@ -19,3 +18,10 @@ dependencies = [
 dev = [
     "poethepoet>=0.44.0",
 ]
+
+[project.optional-dependencies]
+cpu = ["fastembed>=0.8.0"]
+gpu = ["fastembed-gpu>=0.8.0"]
+
+[tool.poe.tasks]
+summary = "npx repomix@latest --style markdown"
\ No newline at end of file
diff --git a/uv.lock b/uv.lock
index 1f8553d..1781b82 100644
--- a/uv.lock
+++ b/uv.lock
@@ -323,6 +323,27 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2a/e8/26b7d78bb8972498c467ca34cb12ee2e60d26ba5eae6d8443189a1af37a5/fastembed-0.8.0-py3-none-any.whl", hash = "sha256:40bee672657574a1009e35ec50030a55f2b426842cb011845379817641bbbbd0", size = 116572, upload-time = "2026-03-23T16:34:40.69Z" },
 ]
 
+[[package]]
+name = "fastembed-gpu"
+version = "0.8.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "huggingface-hub" },
+    { name = "loguru" },
+    { name = "mmh3" },
+    { name = "numpy" },
+    { name = "onnxruntime-gpu" },
+    { name = "pillow" },
+    { name = "py-rust-stemmers" },
+    { name = "requests" },
+    { name = "tokenizers" },
+    { name = "tqdm" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/82/b7/cf536f4c3d6764cba0229e9e4c3fde08c5780c9c5850eef736a21d4500d3/fastembed_gpu-0.8.0.tar.gz", hash = "sha256:465c058a366c8cde536bead27366b969618301c2a2ef2b771f5828d4ae83e561", size = 75246, upload-time = "2026-03-23T16:34:52.233Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/75/25/f952c90695df67d5c4cf088b8ed2f23fb630c13ebd5e84f68bd820d03a53/fastembed_gpu-0.8.0-py3-none-any.whl", hash = "sha256:8674d4d6d41416c02afa51f371bd1f2792daf6267a38f29831b60b1d89ca1c95", size = 116621, upload-time = "2026-03-23T16:34:50.951Z" },
+]
+
 [[package]]
 name = "filelock"
 version = "3.25.2"
@@ -494,12 +515,19 @@ version = "0.1.3"
 source = { virtual = "." }
 dependencies = [
     { name = "fastapi", extra = ["standard"] },
-    { name = "fastembed" },
     { name = "loguru" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
 ]
 
+[package.optional-dependencies]
+cpu = [
+    { name = "fastembed" },
+]
+gpu = [
+    { name = "fastembed-gpu" },
+]
+
 [package.dev-dependencies]
 dev = [
     { name = "poethepoet" },
@@ -508,11 +536,13 @@ dev = [
 [package.metadata]
 requires-dist = [
     { name = "fastapi", extras = ["standard"], specifier = ">=0.116.1,<1.0.0" },
-    { name = "fastembed", specifier = ">=0.8.0" },
+    { name = "fastembed", marker = "extra == 'cpu'", specifier = ">=0.8.0" },
+    { name = "fastembed-gpu", marker = "extra == 'gpu'", specifier = ">=0.8.0" },
     { name = "loguru", specifier = ">=0.7.3" },
     { name = "pydantic", specifier = ">=2.12.5" },
     { name = "pydantic-settings", specifier = ">=2.13.1" },
 ]
+provides-extras = ["cpu", "gpu"]
 
 [package.metadata.requires-dev]
 dev = [{ name = "poethepoet", specifier = ">=0.44.0" }]
@@ -799,6 +829,27 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6c/1d/1666dc64e78d8587d168fec4e3b7922b92eb286a2ddeebcf6acb55c7dc82/onnxruntime-1.24.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e1cc6a518255f012134bc791975a6294806be9a3b20c4a54cca25194c90cf731", size = 17247021, upload-time = "2026-03-17T22:04:52.377Z" },
 ]
 
+[[package]]
+name = "onnxruntime-gpu"
+version = "1.25.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "flatbuffers" },
+    { name = "numpy" },
+    { name = "packaging" },
+    { name = "protobuf" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1d/6d/2c13d3eff74caa9e59820a044a75becd34e9cbeeaf7617ad7679cdb1fdb7/onnxruntime_gpu-1.25.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f0c36c63c8b0eb4091f2567067f480f66f0aedc189eb009545c98ce7e919056", size = 270342429, upload-time = "2026-04-22T17:28:10.526Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/2e/9fc303ae59d4caeb85ec3cea6881b7de8ca1d2a07140fade39913cd7ff10/onnxruntime_gpu-1.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:61178cc4d84f59861714554531e01cccbd33ddf13cc0e87a3adea13b24d297ce", size = 220847708, upload-time = "2026-04-22T17:20:47.993Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/15/e63fe7b1abad6884bed07e9bb333e9f0ea48fbb8cbc1ea4a67ee6019d5d0/onnxruntime_gpu-1.25.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e462eb13ee9955117baec4f518916c1e7cb1a96001114105632bc6d454c6aee6", size = 270342324, upload-time = "2026-04-22T17:28:21.142Z" },
+    { url = "https://files.pythonhosted.org/packages/21/10/b3533243d062b589d4b1f3ae26584af332c5cde618e7f6f5ff6fabbfd5f2/onnxruntime_gpu-1.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:9a3682158e5e911385252eb95d6332b6f525972746c582e10f8a78213b39e624", size = 220848188, upload-time = "2026-04-22T17:20:56.946Z" },
+    { url = "https://files.pythonhosted.org/packages/35/6c/d7706dd1d0eaafdba44d5c89f8d952de41e425a1b0cbd3ecfa60f918c249/onnxruntime_gpu-1.25.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8514b92c5929c953850090d823d018770cba2a971efab5f8f69a3c4280cdc632", size = 270364210, upload-time = "2026-04-22T17:28:33.568Z" },
+    { url = "https://files.pythonhosted.org/packages/37/01/9f1b16ea857e3a4b5e82a2d70b52ea46a0083569f737d840f74a1b86818f/onnxruntime_gpu-1.25.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ffe9df4016b061ec3a5565a4fc08cdb86808cd8b9c255c42301066c0c24a81b5", size = 270345126, upload-time = "2026-04-22T17:28:44.416Z" },
+    { url = "https://files.pythonhosted.org/packages/56/c8/aae22f3c9cea9160d8d969734a1927720fcb4d4ad4abe269c407c1d2b63c/onnxruntime_gpu-1.25.0-cp314-cp314-win_amd64.whl", hash = "sha256:2173b71631208177fe704ce2d92eac3acbf758285327247ea40a31a9f0bcc073", size = 223385369, upload-time = "2026-04-22T17:21:06.026Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/0a/79fba6a1a32803a2bf8b99187e0ea5d5d69ffe0c5c0f469bde232ceb8327/onnxruntime_gpu-1.25.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8576c721c600cc669717a2ae49af30fdfff230480099653adc7b79d58a240852", size = 270364130, upload-time = "2026-04-22T17:28:54.708Z" },
+]
+
 [[package]]
 name = "packaging"
 version = "26.0"