diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 000000000..01a5554b0 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,18 @@ +FROM mcr.microsoft.com/devcontainers/cpp:1-ubuntu-24.04 + +ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="none" + +# Optionally install the cmake for vcpkg +COPY ./reinstall-cmake.sh /tmp/ + +RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ + chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ + fi \ + && rm -f /tmp/reinstall-cmake.sh + +# [Optional] Uncomment this section to install additional vcpkg ports. +# RUN su vscode -c "${VCPKG_ROOT}/vcpkg install " + +# [Optional] Uncomment this section to install additional packages. +# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ +# && apt-get -y install --no-install-recommends diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..8af8ffd59 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,35 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/cpp +{ + "hostRequirements": { + "cpus": 4, + "memory": "8gb", + "storage": "32gb" + }, + "name": "C++", + "build": { + "dockerfile": "Dockerfile" + }, + "features": { + "ghcr.io/stuartleeks/dev-container-features/shell-history:0": {}, + "ghcr.io/sebst/devcontainer-features/pkgx:1": {}, + "ghcr.io/devcontainers-extra/features/apt-packages:1": { + "clean_ppas": true, + "preserve_apt_list": true, + "packages": "build-essential linux-libc-dev", + "ppas": "ppa:deadsnakes/ppa" + } + }, + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "git submodule update --init", + // Use 'postAttachCommand' to run commands after the container is attached. + "postAttachCommand": "pkgx install npx && npx -y runme run --filename README.md -t devcontainer -y" + // Configure tool-specific properties. + // "customizations": {}, + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.devcontainer/reinstall-cmake.sh b/.devcontainer/reinstall-cmake.sh new file mode 100644 index 000000000..408b81d22 --- /dev/null +++ b/.devcontainer/reinstall-cmake.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +#------------------------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. +#------------------------------------------------------------------------------------------------------------- +# +set -e + +CMAKE_VERSION=${1:-"none"} + +if [ "${CMAKE_VERSION}" = "none" ]; then + echo "No CMake version specified, skipping CMake reinstallation" + exit 0 +fi + +# Cleanup temporary directory and associated files when exiting the script. +cleanup() { + EXIT_CODE=$? + set +e + if [[ -n "${TMP_DIR}" ]]; then + echo "Executing cleanup of tmp files" + rm -Rf "${TMP_DIR}" + fi + exit $EXIT_CODE +} +trap cleanup EXIT + + +echo "Installing CMake..." 
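+# The steps below replace any distro-packaged CMake: purge the existing install,
+# detect the CPU architecture, download the matching Kitware installer script and
+# its SHA-256 checksum file, verify the download, and install it under /opt/cmake.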
+apt-get -y purge --auto-remove cmake +mkdir -p /opt/cmake + +architecture=$(dpkg --print-architecture) +case "${architecture}" in + arm64) + ARCH=aarch64 ;; + amd64) + ARCH=x86_64 ;; + *) + echo "Unsupported architecture ${architecture}." + exit 1 + ;; +esac + +CMAKE_BINARY_NAME="cmake-${CMAKE_VERSION}-linux-${ARCH}.sh" +CMAKE_CHECKSUM_NAME="cmake-${CMAKE_VERSION}-SHA-256.txt" +TMP_DIR=$(mktemp -d -t cmake-XXXXXXXXXX) + +echo "${TMP_DIR}" +cd "${TMP_DIR}" + +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_BINARY_NAME}" -O +curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_CHECKSUM_NAME}" -O + +sha256sum -c --ignore-missing "${CMAKE_CHECKSUM_NAME}" +sh "${TMP_DIR}/${CMAKE_BINARY_NAME}" --prefix=/opt/cmake --skip-license + +ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake +ln -s /opt/cmake/bin/ctest /usr/local/bin/ctest diff --git a/.github/workflows/beta-build.yml b/.github/workflows/beta-build.yml index bdc277231..dc99034f4 100644 --- a/.github/workflows/beta-build.yml +++ b/.github/workflows/beta-build.yml @@ -156,9 +156,9 @@ jobs: - Network Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-linux-amd64-network-installer.deb - Local Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-linux-amd64-local-installer.deb - Binary: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-linux-amd64.tar.gz - - Linux amd64 Deb: + - Linux arm64 Deb: - Network Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-linux-arm64-network-installer.deb - Local Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-linux-arm64-local-installer.deb - Binary: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-linux-arm64.tar.gz - Docker: menloltd/cortex:beta-${{ env.VERSION }} - - Github Release: https://github.com/janhq/cortex.cpp/releases/tag/v${{ env.VERSION }} \ No newline at end of file + - Github Release: https://github.com/janhq/cortex.cpp/releases/tag/v${{ env.VERSION }} diff --git a/.github/workflows/cortex-cpp-quality-gate.yml b/.github/workflows/cortex-cpp-quality-gate.yml index fd98930a1..87be2cf47 100644 --- a/.github/workflows/cortex-cpp-quality-gate.yml +++ b/.github/workflows/cortex-cpp-quality-gate.yml @@ -14,6 +14,7 @@ env: jobs: build-and-test: + if: (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) || github.event_name == 'workflow_dispatch' runs-on: ${{ matrix.runs-on }} timeout-minutes: 60 strategy: @@ -255,6 +256,7 @@ jobs: AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" build-docker-and-test: + if: (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) || github.event_name == 'workflow_dispatch' runs-on: ubuntu-24-04-docker steps: - name: Getting the repo @@ -309,3 +311,249 @@ jobs: docker stop cortex docker rm cortex echo "y\n" | docker system prune -af + + build-and-test-target-pr: + permissions: + contents: read + if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository + runs-on: ${{ matrix.runs-on }} + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + - os: "linux" + name: "arm64" + runs-on: 
"ubuntu-2004-arm64" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + build-deps-cmake-flags: "" + ccache-dir: "" + - os: "linux" + name: "amd64" + runs-on: "ubuntu-20-04-cuda-12-0" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + build-deps-cmake-flags: "" + ccache-dir: "" + - os: "mac" + name: "amd64" + runs-on: "macos-selfhosted-12" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + build-deps-cmake-flags: "" + ccache-dir: "" + - os: "mac" + name: "arm64" + runs-on: "macos-selfhosted-12-arm64" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DMAC_ARM64=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + build-deps-cmake-flags: "" + ccache-dir: "" + - os: "windows" + name: "amd64" + runs-on: "windows-cuda-12-0" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_TOOLCHAIN_FILE=C:/w/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake -DVCPKG_TARGET_TRIPLET=x64-windows-static -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + build-deps-cmake-flags: "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: use python + continue-on-error: true + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install tools on Linux + if: runner.os == 'Linux' + run: | + python3 -m pip install awscli + + - name: Install choco on Windows + if: runner.os == 'Windows' + run: | + choco install make pkgconfiglite ccache awscli 7zip ninja -y + + - name: Configure vcpkg + if: runner.os != 'Linux' + run: | + cd engine + make configure-vcpkg + + - name: Configure vcpkg linux amd64 + if: runner.os != 'Linux' + run: | + cd engine + make configure-vcpkg + + - name: Configure vcpkg linux arm64 + if: runner.os == 'Linux' + run: | + cd engine + # Set env if arch is arm64 + if [ "${{ matrix.name }}" == "arm64" ]; then + sudo apt install ninja-build pkg-config -y + export VCPKG_FORCE_SYSTEM_BINARIES=1 + fi + make configure-vcpkg + + - name: Build + if: runner.os != 'Linux' + run: | + cd engine + make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}" BUILD_DEPS_CMAKE_EXTRA_FLAGS="${{ matrix.build-deps-cmake-flags }}" + + - name: Build + if: runner.os == 'Linux' + run: | + cd engine + if [ "${{ matrix.name }}" == "arm64" ]; then + export VCPKG_FORCE_SYSTEM_BINARIES=1 + fi + make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}" BUILD_DEPS_CMAKE_EXTRA_FLAGS="${{ matrix.build-deps-cmake-flags }}" + + - name: Run setup config + run: | + cd engine + echo "gitHubToken: ${{ secrets.GITHUB_TOKEN }}" > ~/.cortexrc + # ./build/cortex + cat ~/.cortexrc + + - name: Run unit tests + run: | + cd engine + make run-unit-tests + env: + GITHUB_TOKEN: ${{ secrets.PAT_SERVICE_ACCOUNT }} + + - name: Run setup config + run: | + cd engine + echo "apiServerPort: 3928" > ~/.cortexrc + echo 
"gitHubToken: ${{ secrets.GITHUB_TOKEN }}" > ~/.cortexrc + # ./build/cortex + cat ~/.cortexrc + + - name: Run e2e tests + if: github.event_name != 'schedule' && runner.os != 'Windows' && github.event.pull_request.draft == false + run: | + cd engine + cp build/cortex build/cortex-nightly + cp build/cortex build/cortex-beta + python -m pip install --upgrade pip + python -m pip install -r e2e-test/requirements.txt + python e2e-test/main.py + rm build/cortex-nightly + rm build/cortex-beta + env: + GITHUB_TOKEN: ${{ secrets.PAT_SERVICE_ACCOUNT }} + + - name: Run e2e tests + if: github.event_name != 'schedule' && runner.os == 'Windows' && github.event.pull_request.draft == false + run: | + cd engine + cp build/cortex.exe build/cortex-nightly.exe + cp build/cortex.exe build/cortex-beta.exe + python -m pip install --upgrade pip + python -m pip install -r e2e-test/requirements.txt + python e2e-test/main.py + rm build/cortex-nightly.exe + rm build/cortex-beta.exe + env: + GITHUB_TOKEN: ${{ secrets.PAT_SERVICE_ACCOUNT }} + + - name: Run e2e tests + if: github.event_name == 'schedule' && runner.os != 'Windows' && github.event.pull_request.draft == false + run: | + cd engine + cp build/cortex build/cortex-nightly + cp build/cortex build/cortex-beta + python -m pip install --upgrade pip + python -m pip install -r e2e-test/requirements.txt + python e2e-test/cortex-llamacpp-e2e-nightly.py + rm build/cortex-nightly + rm build/cortex-beta + env: + GITHUB_TOKEN: ${{ secrets.PAT_SERVICE_ACCOUNT }} + + - name: Run e2e tests + if: github.event_name == 'schedule' && runner.os == 'Windows' && github.event.pull_request.draft == false + run: | + cd engine + cp build/cortex.exe build/cortex-nightly.exe + cp build/cortex.exe build/cortex-beta.exe + python -m pip install --upgrade pip + python -m pip install -r e2e-test/requirements.txt + python e2e-test/cortex-llamacpp-e2e-nightly.py + rm build/cortex-nightly.exe + rm build/cortex-beta.exe + env: + GITHUB_TOKEN: ${{ secrets.PAT_SERVICE_ACCOUNT }} + + - name: Pre-package + run: | + cd engine + make pre-package DESTINATION_BINARY_NAME="cortex" + + - name: Package + run: | + cd engine + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ matrix.os }}-${{ matrix.name }} + path: ./engine/cortex + + build-docker-and-test-target-pr: + permissions: + contents: read + if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository + runs-on: ubuntu-24-04-docker + steps: + - name: Getting the repo + uses: actions/checkout@v3 + with: + submodules: 'recursive' + + - name: Run Docker + if: github.event_name != 'schedule' + run: | + docker build \ + -t menloltd/cortex:test -f docker/Dockerfile . + docker run -it -d -p 3928:39281 --name cortex menloltd/cortex:test + sleep 20 + + - name: Run Docker + if: github.event_name == 'schedule' + run: | + latest_prerelease=$(curl -s https://api.github.com/repos/cortexcpp/cortex.cpp/releases | jq -r '.[] | select(.prerelease == true) | .tag_name' | head -n 1) + echo "cortex.llamacpp latest release: $latest_prerelease" + docker build \ + --build-arg CORTEX_CPP_VERSION="${latest_prerelease}" \ + -t menloltd/cortex:test -f docker/Dockerfile . 
+ docker run -it -d -p 3928:39281 --name cortex menloltd/cortex:test + sleep 20 + + - name: use python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Run e2e tests + run: | + cd engine + python -m pip install --upgrade pip + python -m pip install -r e2e-test/requirements.txt + pytest e2e-test/test_api_docker.py + + - name: Run Docker + continue-on-error: true + if: always() + run: | + docker logs cortex + docker stop cortex + docker rm cortex + echo "y\n" | docker system prune -af \ No newline at end of file diff --git a/.gitignore b/.gitignore index ad579aed8..8f10ea41e 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,10 @@ platform/command platform/src/infrastructure/commanders/test/test_data **/vcpkg_installed engine/test.db -!docs/yarn.lock \ No newline at end of file +!docs/yarn.lock + +# Local +docs/.yarn/ +docs/.yarnrc.yml +docs/bun.lockb +docs/yarn.lock diff --git a/BUILDING.md b/BUILDING.md new file mode 100644 index 000000000..47d246a03 --- /dev/null +++ b/BUILDING.md @@ -0,0 +1,111 @@ +# Build Cortex.cpp from source + +Firstly, clone the Cortex.cpp repository [here](https://github.com/janhq/cortex.cpp) and initialize the submodules: + +```bash +git clone https://github.com/janhq/cortex.cpp +cd cortex.cpp +git submodule update --init --recursive +``` + +You also need to install CMake. On Linux and MacOS, you can install CMake via your package manager + +```bash +sudo apt install cmake # Ubuntu +brew install cmake # MacOS +``` + +On Windows, you can download CMake from https://cmake.org/download/. + +#### Windows + +1. Navigate to the `engine` folder. +2. Configure the vpkg: + +```bash +cd vcpkg +./bootstrap-vcpkg.bat +./vcpkg install +``` + +3. Build the Cortex.cpp inside the `engine/build` folder (you can change `-DCMAKE_TOOLCHAIN_FILE` to use your own `vcpkg`): + +```bash +mkdir build +cd build +cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_TOOLCHAIN_FILE=../vcpkg/scripts/buildsystems/vcpkg.cmake -DVCPKG_TARGET_TRIPLET=x64-windows-static +cmake --build . --config Release +``` + +4. Verify that Cortex.cpp is installed correctly by getting help information. + +```sh +cortex -h +``` + +#### Linux and MacOS + +1. Navigate to the `engine` folder. +2. Configure the vpkg: + +```bash +cd vcpkg +./bootstrap-vcpkg.sh +./vcpkg install +``` + +3. Build the Cortex.cpp inside the `engine/build` folder (you can change `-DCMAKE_TOOLCHAIN_FILE` to use your own `vcpkg`): + +```bash +mkdir build +cd build +cmake .. -DCMAKE_TOOLCHAIN_FILE=../vcpkg/scripts/buildsystems/vcpkg.cmake +make -j4 +``` + +4. Verify that Cortex.cpp is installed correctly by getting help information. + +```sh +./cortex -h +``` + +#### Devcontainer / Codespaces + +1. Open Cortex.cpp repository in Codespaces or local devcontainer + + [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/janhq/cortex.cpp?quickstart=1) + + ```sh + devcontainer up --workspace-folder . + ``` + +2. Configure vpkg in `engine/vcpkg`: + +```bash {"tag": "devcontainer"} +cd engine/vcpkg +export VCPKG_FORCE_SYSTEM_BINARIES="$([[ $(uname -m) == 'arm64' ]] && echo '1' || echo '0')" +./bootstrap-vcpkg.sh +``` + +3. Build the Cortex.cpp inside the `engine/build` folder: + +```bash {"tag": "devcontainer"} +cd engine +mkdir -p build +cd build +cmake .. -DCMAKE_TOOLCHAIN_FILE=$(realpath ..)/vcpkg/scripts/buildsystems/vcpkg.cmake +make -j$(grep -c ^processor /proc/cpuinfo) +``` + +4. Verify that Cortex.cpp is installed correctly by getting help information. 
+
+```sh {"tag": "devcontainer"}
+cd engine/build
+./cortex -h
+```
+
+5. Every time a rebuild is needed, just run the commands above using the one-liner:
+
+```sh
+npx -y runme run --filename README.md -t devcontainer -y
+```
diff --git a/README.md b/README.md
index 8ab39aba8..0feb179d3 100644
--- a/README.md
+++ b/README.md
@@ -1,400 +1,168 @@
-# Cortex.cpp
+# Cortex

-[Banner image: Cortex cpp's Readme Banner]
+[Banner image: Cortex.cpp Banner]

-[Badges: GitHub commit activity, Github Last Commit, Github Contributors, GitHub closed issues, Discord]

-[Links: Documentation - API Reference - Changelog - Bug reports - Discord]
+[Links: Docs • API Reference • Changelog • Issues • Community]

-> **Cortex.cpp is currently in active development.** - -## Overview - -Cortex is a Local AI API Platform that is used to run and customize LLMs. - -Key Features: -- Pull from Huggingface, or Cortex Built-in Models -- Models stored in universal file formats (vs blobs) -- Swappable Engines (default: [`llamacpp`](https://github.com/janhq/cortex.llamacpp), future: [`ONNXRuntime`](https://github.com/janhq/cortex.onnx), [`TensorRT-LLM`](https://github.com/janhq/cortex.tensorrt-llm)) -- Cortex can be deployed as a standalone API server, or integrated into apps like [Jan.ai](https://jan.ai/) +> **Under Active Development** - Expect rapid improvements! -Coming soon; now available on [cortex-nightly](#beta--nightly-versions): -- Engines Management (install specific llama-cpp version and variants) -- Nvidia Hardware detection & activation (current: Nvidia, future: AMD, Intel, Qualcomm) -- Cortex's roadmap is to implement the full OpenAI API including Tools, Runs, Multi-modal and Realtime APIs. -## Local Installation +Cortex is the open-source brain for robots: vision, speech, language, tabular, and action -- the cloud is optional. -Cortex has an Local Installer that packages all required dependencies, so that no internet connection is required during the installation process. +## Installation -Cortex also has a [Network Installer](#network-installer) which downloads the necessary dependencies from the internet during the installation. +| Platform | Installer | +|----------|-----------| +| **Windows** | [cortex.exe](https://app.cortexcpp.com/download/latest/windows-amd64-local) | +| **macOS** | [cortex.pkg](https://app.cortexcpp.com/download/latest/mac-universal-local) | +| **Linux (Debian)** | [cortex.deb](https://app.cortexcpp.com/download/latest/linux-amd64-local) | -

-  - Windows: cortex.exe


-  - MacOS (Silicon/Intel): cortex.pkg


-  - Linux debian based distros: cortex-linux-local-installer.deb

- -- For Linux: Download the installer and run the following command in terminal: +All other Linux distributions: ```bash - # Linux debian based distros - curl -s https://raw.githubusercontent.com/janhq/cortex/main/engine/templates/linux/install.sh | sudo bash -s -- --deb_local - - # Other Linux distros - curl -s https://raw.githubusercontent.com/janhq/cortex/main/engine/templates/linux/install.sh | sudo bash -s +curl -s https://raw.githubusercontent.com/janhq/cortex/main/engine/templates/linux/install.sh | sudo bash ``` -- The binary will be installed in the `/usr/bin/` directory. - -## Usage - -### CLI - -After installation, you can run Cortex.cpp from the command line by typing `cortex --help`. +### Start the Server +```bash +cortex start ``` -# Run a Model -cortex pull llama3.2 -cortex pull bartowski/Meta-Llama-3.1-8B-Instruct-GGUF -cortex run llama3.2 - -# Resource Management -cortex ps (view active models & RAM/VRAM used) -cortex models stop llama3.2 - -# Available on cortex-nightly: -cortex engines install llama-cpp -m (lists versions and variants) -cortex hardware list (hardware detection) -cortex hardware activate - -cortex stop +``` +Set log level to INFO +Host: 127.0.0.1 Port: 39281 +Server started +API Documentation available at: http://127.0.0.1:39281 ``` -Refer to our [Quickstart](https://cortex.so/docs/quickstart/) and -[CLI documentation](https://cortex.so/docs/cli) for more details. - -### API: -Cortex.cpp includes a REST API accessible at `localhost:39281`. - -Refer to our [API documentation](https://cortex.so/api-reference) for more details. - -## Models - -Cortex.cpp allows users to pull models from multiple Model Hubs, offering flexibility and extensive model access: -- [Hugging Face](https://huggingface.co): GGUF models eg `author/Model-GGUF` -- Cortex Built-in Models - -Once downloaded, the model `.gguf` and `model.yml` files are stored in `~\cortexcpp\models`. - -> **Note**: -> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 14B models, and 32 GB to run the 32B models. - -### Cortex Built-in Models & Quantizations +[Full API docs](https://cortex.so/api-reference). -| Model /Engine | llama.cpp | Command | -| -------------- | --------------------- | ----------------------------- | -| phi-3.5 | ✅ | cortex run phi3.5 | -| llama3.2 | ✅ | cortex run llama3.2 | -| llama3.1 | ✅ | cortex run llama3.1 | -| codestral | ✅ | cortex run codestral | -| gemma2 | ✅ | cortex run gemma2 | -| mistral | ✅ | cortex run mistral | -| ministral | ✅ | cortex run ministral | -| qwen2 | ✅ | cortex run qwen2.5 | -| openhermes-2.5 | ✅ | cortex run openhermes-2.5 | -| tinyllama | ✅ | cortex run tinyllama | +### Download Models -View all [Cortex Built-in Models](https://cortex.so/models). +You can download models from the huggingface model hub using the `cortex pull` command: -Cortex supports multiple quantizations for each model. +```bash +cortex pull llama3.2 +``` ``` -❯ cortex-nightly pull llama3.2 Downloaded models: + llama3.1:8b-gguf-q4-km llama3.2:3b-gguf-q2-k Available to download: - 1. llama3.2:3b-gguf-q3-kl - 2. llama3.2:3b-gguf-q3-km - 3. llama3.2:3b-gguf-q3-ks - 4. llama3.2:3b-gguf-q4-km (default) - 5. llama3.2:3b-gguf-q4-ks - 6. llama3.2:3b-gguf-q5-km - 7. llama3.2:3b-gguf-q5-ks - 8. llama3.2:3b-gguf-q6-k - 9. llama3.2:3b-gguf-q8-0 - -Select a model (1-9): -``` + 1. llama3:8b-gguf + 2. llama3:8b-gguf-q2-k + 3. llama3:8b-gguf-q3-kl + 4. ... 
-## Advanced Installation - -### Network Installer (Stable) - -Cortex.cpp is available with a Network Installer, which is a smaller installer but requires internet connection during installation to download the necessary dependencies. - -

-  - Windows: cortex-windows-network-installer.exe


-  - MacOS (Universal): cortex-mac-network-installer.pkg


-  - Linux debian based distros: cortex-linux-network-installer.deb

-
-### Beta & Nightly Versions (Local Installer)
-
-Cortex releases Beta and Nightly versions for advanced users to try new features (we appreciate your feedback!)
-- Beta (early preview): CLI command: `cortex-beta`
-- Nightly (released every night): CLI Command: `cortex-nightly`
-  - Nightly automatically pulls the latest changes from upstream [llama.cpp](https://github.com/ggerganov/llama.cpp/) repo, creates a PR and runs tests.
-  - If all test pass, the PR is automatically merged into our repo, with the latest llama.cpp version.
-
-| Version | Windows | MacOS | Linux debian based distros |
-|---------|---------|-------|-----------------------------|
-| Beta (Preview) | cortex.exe | cortex.pkg | cortex.deb |
-| Nightly (Experimental) | cortex.exe | cortex.pkg | cortex.deb |
-
-### Network Installer
-
-Cortex.cpp is available with a Network Installer, which is a smaller installer but requires internet connection during installation to download the necessary dependencies.
-
-| Version Type | Windows | MacOS | Linux debian based distros |
-|--------------|---------|-------|-----------------------------|
-| Stable (Recommended) | cortex.exe | cortex.pkg | cortex.deb |
-| Beta (Preview) | cortex.exe | cortex.pkg | cortex.deb |
-| Nightly (Experimental) | cortex.exe | cortex.pkg | cortex.deb |
+Select a model (1-21): +``` -### Build from Source +### Run Models -#### Windows +```sh +cortex run llama3.2 +``` +``` +In order to exit, type `exit()` +> +``` -1. Clone the Cortex.cpp repository [here](https://github.com/janhq/cortex.cpp). -2. Navigate to the `engine` folder. -3. Configure the vpkg: +You can also run it in detached mode, meaning, you can run it in the background and can +use the model via the API: -```bash -cd vcpkg -./bootstrap-vcpkg.bat -vcpkg install +```sh +cortex run -d llama3.2:3b-gguf-q2-k ``` -4. Build the Cortex.cpp inside the `engine/build` folder: +### Manage resources -```bash -mkdir build -cd build -cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_TOOLCHAIN_FILE=path_to_vcpkg_folder_in_cortex_repo/vcpkg/scripts/buildsystems/vcpkg.cmake -DVCPKG_TARGET_TRIPLET=x64-windows-static -cmake --build . --config Release +```sh +cortex ps # View active models ``` -5. Verify that Cortex.cpp is installed correctly by getting help information. - ```sh -cortex -h +cortex stop # Shutdown server ``` -#### MacOS +## Why Cortex.cpp? -1. Clone the Cortex.cpp repository [here](https://github.com/janhq/cortex.cpp). -2. Navigate to the `engine` folder. -3. Configure the vpkg: +Local AI platform for running AI models with: -```bash -cd vcpkg -./bootstrap-vcpkg.sh -vcpkg install -``` +- **Multi-Engine Support** - Start with llama.cpp or add your own +- **Hardware Optimized** - Automatic GPU detection (NVIDIA/AMD/Intel) +- **OpenAI-Compatible API** - Tools, Runs, and Multi-modal coming soon + +## Featured Models + +| Model | Command | Min RAM | +|----------------|---------------------------|---------| +| Llama 3 8B | `cortex run llama3.1` | 8GB | +| Phi-4 | `cortex run phi-4` | 8GB | +| Mistral | `cortex run mistral` | 4GB | +| Gemma 2B | `cortex run gemma2` | 6GB | + +[View all supported models →](https://cortex.so/models) + +## Advanced Features -4. Build the Cortex.cpp inside the `engine/build` folder: +See table below for the binaries with the nightly builds. ```bash -mkdir build -cd build -cmake .. -DCMAKE_TOOLCHAIN_FILE=path_to_vcpkg_folder_in_cortex_repo/vcpkg/scripts/buildsystems/vcpkg.cmake -make -j4 +# Multiple quantizations +cortex-nightly pull llama3.2 # Choose from several quantization options ``` -5. Verify that Cortex.cpp is installed correctly by getting help information. +```bash +# Engine management (nightly) +cortex-nightly engines install llama-cpp -m +``` ```sh -cortex -h +# Hardware control +cortex-nightly hardware detect +cortex-nightly hardware activate ``` -#### Linux +## Need Help? -1. Clone the Cortex.cpp repository [here](https://github.com/janhq/cortex.cpp). -2. Navigate to the `engine` folder. -3. Configure the vpkg: +- Quick troubleshooting: `cortex --help` +- [Documentation](https://cortex.so/docs) +- [Community Discord](https://discord.gg/FTk2MvZwJH) +- [Report Issues](https://github.com/janhq/cortex.cpp/issues) -```bash -cd vcpkg -./bootstrap-vcpkg.sh -vcpkg install -``` +--- -4. Build the Cortex.cpp inside the `engine/build` folder: +## For Contributors -```bash -mkdir build -cd build -cmake .. -DCMAKE_TOOLCHAIN_FILE=path_to_vcpkg_folder_in_cortex_repo/vcpkg/scripts/buildsystems/vcpkg.cmake -make -j4 -``` +### Development Builds -5. Verify that Cortex.cpp is installed correctly by getting help information. 
+| Version | Windows | macOS | Linux | +|-----------|---------|-------|-------| +| **Stable** | [exe](https://app.cortexcpp.com/download/latest/windows-amd64-network) | [pkg](https://app.cortexcpp.com/download/latest/mac-universal-network) | [deb](https://app.cortexcpp.com/download/latest/linux-amd64-network) | +| **Beta** | [exe](https://app.cortexcpp.com/download/beta/windows-amd64-network) | [pkg](https://app.cortexcpp.com/download/beta/mac-universal-network) | [deb](https://app.cortexcpp.com/download/beta/linux-amd64-network) | +| **Nightly** | [exe](https://app.cortexcpp.com/download/nightly/windows-amd64-network) | [pkg](https://app.cortexcpp.com/download/nightly/mac-universal-network) | [deb](https://app.cortexcpp.com/download/nightly/linux-amd64-network) | -```sh -cortex -h -``` +### Build from Source -## Uninstallation +See [BUILDING.md](BUILDING.md) + +## Uninstall Cortex ### Windows @@ -402,24 +170,18 @@ cortex -h 2. Navigate to `Add or Remove Programs`. 3. Search for `cortexcpp` and double click to uninstall. (for beta and nightly builds, search for `cortexcpp-beta` and `cortexcpp-nightly` respectively) -### MacOs +### MacOs/Linux Run the uninstaller script: ```bash -sudo sh cortex-uninstall.sh +sudo cortex-uninstall.sh ``` -For MacOS, there is a uninstaller script comes with the binary and added to the `/usr/local/bin/` directory. The script is named `cortex-uninstall.sh` for stable builds, `cortex-beta-uninstall.sh` for beta builds and `cortex-nightly-uninstall.sh` for nightly builds. - -### Linux - -```bash -sudo apt remove cortexcpp -``` +The script to uninstall Cortex comes with the binary and was added to the `/usr/local/bin/` directory. The script is named `cortex-uninstall.sh` for stable builds, `cortex-beta-uninstall.sh` for beta builds and `cortex-nightly-uninstall.sh` for nightly builds. ## Contact Support - For support, please file a [GitHub ticket](https://github.com/janhq/cortex.cpp/issues/new/choose). - For questions, join our Discord [here](https://discord.gg/FTk2MvZwJH). -- For long-form inquiries, please email [hello@jan.ai](mailto:hello@jan.ai). +- For long-form inquiries, please email [hello@jan.ai](mailto:hello@jan.ai). \ No newline at end of file diff --git a/assets/cortex-banner.png b/assets/cortex-banner.png index 95a7262a9..b01cfb89e 100644 Binary files a/assets/cortex-banner.png and b/assets/cortex-banner.png differ diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 99bdd0009..761c5bc19 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -5,13 +5,15 @@ echo "apiServerHost: 0.0.0.0" > /root/.cortexrc echo "enableCors: true" >> /root/.cortexrc +# Start the cortex server +cortex start + # Install the engine cortex engines install llama-cpp -s /opt/cortex.llamacpp -# Start the cortex server -cortex start cortex engines list + # Keep the container running by tailing the log files tail -f /root/cortexcpp/logs/cortex.log & tail -f /root/cortexcpp/logs/cortex-cli.log & diff --git a/docs/docs/architecture.mdx b/docs/docs/architecture.mdx index eae6b1d2f..8e9520810 100644 --- a/docs/docs/architecture.mdx +++ b/docs/docs/architecture.mdx @@ -5,10 +5,6 @@ slug: "architecture" draft: true --- -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - ## Introduction Cortex is a C++ AI engine designed to operate entirely on your local hardware infrastructure. 
This headless backend platform is also engineered to support TensorRT-LLM, ensuring high-performance machine-learning model execution. It is packaged with a Docker-inspired command-line interface and a Typescript client library. diff --git a/docs/docs/architecture/cortex-db.mdx b/docs/docs/architecture/cortex-db.mdx index 2bcdb806a..13ab30ba9 100644 --- a/docs/docs/architecture/cortex-db.mdx +++ b/docs/docs/architecture/cortex-db.mdx @@ -7,11 +7,13 @@ slug: "cortex-db" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: -This document outlines the architecture of the database designed to store and manage various types of entities and their associated metadata. +![sqlite-view](/img/sqlite_view_harlequin.png) +**db view via [Harlequin](https://harlequin.sh/)** + + +This document outlines Cortex database architecture which is designed to store and manage models, engines, +files and more. ## Table Structure ### schema Table @@ -49,3 +51,32 @@ The `hardware` table is designed to hold metadata about hardware information. Be | activated | INTEGER | A boolean value (0 or 1) indicating whether the hardware is activated or not. | | priority | INTEGER | An integer value representing the priority associated with the hardware. | +### engines Table +The `engines` table is designed to hold metadata about the different engines available for useage with Cortex. +Below is the structure of the table: + +| Column Name | Data Type | Description | +|--------------|-----------|---------------------------------------------------------| +| id | INTEGER | A unique identifier for each engine (Primary Key). | +| engine_name | TEXT | The name of the engine. | +| type | TEXT | | +| api_key | TEXT | | +| url | TEXT | | +| version | TEXT | The current version of the engine. | +| variant | TEXT | | +| status | TEXT | Current status of the engine (e.g., "downloaded", "downloadable"). | +| metadata | TEXT | Additional metadata or information about the engine. | +| date_ceated | TEXT | Date when the engine was downloaded. | +| date_updated | TEXT | Date when the engine was last updated. | + +### files Table +The `files` table is designed to hold metadata about objects dowloaded via Cortex. + +| Column Name | Data Type | Description | +|-------------|-----------|---------------------------------| +| id | TEXT | The primary key for the table | +| object | TEXT | The type of hardware. | +| purpose | TEXT | Purpose of file | +| filename | TEXT | The name of the file. | +| created_at | INTEGER | Date when file was created | +| bytes | INTEGER | | diff --git a/docs/docs/architecture/cortexrc.mdx b/docs/docs/architecture/cortexrc.mdx index 061e2cffc..a19c23afe 100644 --- a/docs/docs/architecture/cortexrc.mdx +++ b/docs/docs/architecture/cortexrc.mdx @@ -7,11 +7,10 @@ slug: "cortexrc" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: -Cortex.cpp supports reading its configuration from a file called `.cortexrc`. Using this file, you can also change the data folder, Cortex.cpp API server port, and host. +Cortex supports using a config-based approach to configuring most of its functionality. 
During the +installation process, a `.cortexrc` will be generated with some sensible defaults in it. Using this +file, you can change the location and name of the data directory, the Cortex API server port, the host and more. ## File Location @@ -31,25 +30,52 @@ You can configure the following parameters in the `.cortexrc` file: | `apiServerPort` | Port number for the Cortex.cpp API server. | `39281` | | `logFolderPath` | Path the folder where logs are located | User's home folder. | | `logLlamaCppPath` | The llama-cpp engine . | `./logs/cortex.log` | -| `logTensorrtLLMPath` | The tensorrt-llm engine log file path. | `./logs/cortex.log` | | `logOnnxPath` | The onnxruntime engine log file path. | `./logs/cortex.log` | | `maxLogLines` | The maximum log lines that write to file. | `100000` | | `checkedForUpdateAt` | The last time for checking updates. | `0` | | `latestRelease` | The lastest release vesion. | Empty string | | `huggingFaceToken` | HuggingFace token. | Empty string | + +In the future, every parameter will be editable from the Cortex CLI. At present, only a selected few are configurable. + Example of the `.cortexrc` file: ``` -logFolderPath: /Users//cortexcpp +logFolderPath: /home//cortexcpp logLlamaCppPath: ./logs/cortex.log logTensorrtLLMPath: ./logs/cortex.log logOnnxPath: ./logs/cortex.log -dataFolderPath: /Users//cortexcpp +dataFolderPath: /home//cortexcpp maxLogLines: 100000 apiServerHost: 127.0.0.1 apiServerPort: 39281 -checkedForUpdateAt: 1730501224 -latestRelease: v1.0.1 +checkedForUpdateAt: 1737636738 +checkedForLlamacppUpdateAt: 1737636592699 +latestRelease: v1.0.8 +latestLlamacppRelease: v0.1.49 huggingFaceToken: "" +gitHubUserAgent: "" +gitHubToken: "" +llamacppVariant: linux-amd64-avx2-cuda-12-0 +llamacppVersion: v0.1.49 +enableCors: true +allowedOrigins: + - http://localhost:39281 + - http://127.0.0.1:39281 + - http://0.0.0.0:39281 +proxyUrl: "" +verifyProxySsl: true +verifyProxyHostSsl: true +proxyUsername: "" +proxyPassword: "" +noProxy: example.com,::1,localhost,127.0.0.1 +verifyPeerSsl: true +verifyHostSsl: true +sslCertPath: "" +sslKeyPath: "" +supportedEngines: + - llama-cpp + - onnxruntime + - tensorrt-llm ``` diff --git a/docs/docs/architecture/data-folder.mdx b/docs/docs/architecture/data-folder.mdx index 5a4fe1964..735b746a2 100644 --- a/docs/docs/architecture/data-folder.mdx +++ b/docs/docs/architecture/data-folder.mdx @@ -1,23 +1,21 @@ --- -title: Data Folder and App Folder -description: Cortex.cpp's data folder and app folder. +title: Data and App Directories +description: Cortex's data and app directories. slug: "data-folder" --- import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: When you install Cortex.cpp, three types of files will be generated on your device: - **Binary Files** - **Configuration Files** -- **Data Folder** +- **A Data Directory** ## Binary Files - under the App Folder + These are the executable files of the Cortex.cpp application. The file format varies depending on the operating system: - **Windows**: @@ -30,53 +28,73 @@ These are the executable files of the Cortex.cpp application. The file format va - cli: `/usr/local/bin/cortex` - server: `/usr/local/bin/cortex-server` -## Cortex.cpp Data Folder -The data folder stores the engines, models, and logs required by Cortex.cpp. 
This folder is located at: +## Cortex Data Directory + +The data DIRECTORY stores the engines, models used by Cortex and the logs generated by the +server and CLI. The data directory is located at: - **Windows**: `C:\Users\\cortexcpp` - **Linux**: `/home//cortexcpp` - **macOS**: `/Users/\cortexcpp` -### Folder Structure -The Cortex.cpp data folder typically follows this structure: - - - - ```yaml - ~/.cortex - ├── models/ - │ └── model.list - │ └── huggingface.co/ - │ └── / - └── / - └── model.yaml - └── model.gguf - │ └── cortex.so/ - │ └── / - │ └── / - └── ...engine_files - └── model.yaml - │ └── imported/ - └── imported_model.yaml - ├── logs/ - │ └── cortex.txt - └── cortex-cli.txt - └── engines/ - └── llamacpp - ``` - - - -#### `cortexcpp` -The main directory that stores all Cortex-related files, located in the user's home directory. +### Directory Structure + +The Cortex data directory can be found in the user's home directory, and +it typically follows the structure below: + +``` +~/cortexcpp +├── cortex.db +├── engines/ +│   ├── cortex.llamacpp/ +│   ├── deps/ +│   │   ├── libcublasLt.so.12 +│   │   └── libcudart.so.12 +│   └── linux-amd64-avx2-cuda-12-0/ +│   └── ... +├── files +├── logs/ +│   ├── cortex-cli.log +│   └── cortex.log +├── models/ +│   ├── cortex.so/ +│   │   ├── deepseek-r1-distill-llama-8b/ +│   │   │   └── 8b-gguf-q2-k/ +│   │   │   ├── metadata.yml +│   │   │   ├── model.gguf +│   │   │   └── model.yml +│   │   └── ... +│   └── huggingface.co/ +│   ├── bartowski/ +│   │   └── phi-4-GGUF/ +│   │   ├── phi-4-Q3_K_S.gguf +│   │   └── phi-4-Q3_K_S.yml +│   └── unsloth/ +│   └── DeepSeek-R1-Distill-Qwen-1.5B-GGUF/ +│   ├── DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf +│   └── DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.yml +└── threads +``` + #### `models/` -Contains the AI models used by Cortex for processing and generating responses. + +Contains all of the models used by Cortex for processing and generating responses. These models can come from +anywhere in the HuggingFace model Hub or the Cortex model Hub. + :::info For more information regarding the `model.list` and `model.yaml`, please see [here](/docs/capabilities/models/model-yaml). ::: + #### `logs/` -Stores log files that are essential for troubleshooting and monitoring the performance of the Cortex.cpp API server and CLI. -We use Trantor for logging, which ensures non-blocking, thread-safe, multi-stream file logging without affecting system performance. Trantor automatically creates a new log file for each server session, based on the date and time, simplifying debugging. It also supports setting limits on log file size and the number of log files per session. +Stores log files from the Cortex server and CLI. These are essential for troubleshooting and +monitoring performance. + +We use [Trantor](https://github.com/an-tao/trantor) for logging, which ensures non-blocking, thread-safe, +and multi-stream file logging without affecting your system's performance. Trantor automatically creates a +new log file for each server session based on the date and time, simplifying debugging. It also supports +setting limits on log file size and the number of log files per session. + #### `engines/` -Stores the necessary dependencies and engine files needed to run Cortex on supported engines. \ No newline at end of file + +Stores the necessary dependencies and engine files needed to run run models on via the supported engines. 
diff --git a/docs/docs/architecture/updater.mdx b/docs/docs/architecture/updater.mdx index d9dc7d4f8..f454ecb62 100644 --- a/docs/docs/architecture/updater.mdx +++ b/docs/docs/architecture/updater.mdx @@ -7,15 +7,14 @@ slug: "updater" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: -This document outlines the architectural design for a C++ updater responsible for downloading and executing installers for two binaries: CLI and Server. +This document outlines the architectural design for a C++ updater responsible for downloading and executing +the installers for the CLI and Server binaries. ## Overview -The updater is designed to check for available updates, download the necessary installer files, and execute them to update the CLI and Server binaries. The architecture consists of several key components that work together to achieve this functionality. +The updater is designed to check for available updates, download the necessary installer files, and execute +them to update the CLI and Server binaries. ## Components @@ -24,7 +23,7 @@ The updater is designed to check for available updates, download the necessary i - **Purpose**: Responsible for checking the current version of the installed binaries and determining if updates are available. - **Responsibilities**: - Retrieve the current version from local installations. - - Fetch the latest version information from a remote source, latest version information is saved to `.cortexrc`. + - Fetch information regarding the latest version from a remote source and save such info in `.cortexrc`. - Determine if an update is necessary based on version comparison. ### 2. **Installer Download Manager** @@ -61,7 +60,8 @@ The updater is designed to check for available updates, download the necessary i - It fetches the latest version information from a remote source. 3. **Update Decision**: - - If newer versions are available, the updater proceeds to download the installers; otherwise, it informs the user that no updates are necessary. + - If newer versions are available, the updater proceeds to download the installers; otherwise, it informs the + user that no updates are necessary. 4. **Download Process**: - The Installer Download Manager downloads the latest installer files using HTTP requests. @@ -72,5 +72,6 @@ The updater is designed to check for available updates, download the necessary i - It monitors the installation process, capturing any output or errors. 6. **Completion Notification**: - - After successful installation, the User Interface notifies users of completion or any issues encountered during installation. - - Logs are updated with relevant information about the update process. \ No newline at end of file + - After successful installation, the User Interface notifies users of completion or any issues encountered + during installation. + - Logs are updated with relevant information about the update process. diff --git a/docs/docs/assistants/index.md b/docs/docs/assistants/index.md index d38b33e52..887752052 100644 --- a/docs/docs/assistants/index.md +++ b/docs/docs/assistants/index.md @@ -1,3 +1,180 @@ --- title: Assistants ---- \ No newline at end of file +--- + +# Building Local AI Assistants + +While Cortex doesn't yet support the full OpenAI Assistants API, we can build assistant-like functionality +using the chat completions API. 
Here's how to create persistent, specialized assistants locally. + +## Get Started + +First, fire up our model: + +```sh +cortex run -d llama3.1:8b-gguf-q4-km +``` + +Set up your Python environment: + +```bash +mkdir assistant-test +cd assistant-test +python -m venv .venv +source .venv/bin/activate +pip install openai +``` + +## Creating an Assistant + +Here's how to create an assistant-like experience using chat completions: + +```python +from openai import OpenAI +from typing import List, Dict + +class LocalAssistant: + def __init__(self, name: str, instructions: str): + self.client = OpenAI( + base_url="http://localhost:39281/v1", + api_key="not-needed" + ) + self.name = name + self.instructions = instructions + self.conversation_history: List[Dict] = [] + + def add_message(self, content: str, role: str = "user") -> str: + # Add message to history + self.conversation_history.append({"role": role, "content": content}) + + # Prepare messages with system instructions and history + messages = [ + {"role": "system", "content": self.instructions}, + *self.conversation_history + ] + + # Get response + response = self.client.chat.completions.create( + model="llama3.1:8b-gguf-q4-km", + messages=messages + ) + + # Add assistant's response to history + assistant_message = response.choices[0].message.content + self.conversation_history.append({"role": "assistant", "content": assistant_message}) + + return assistant_message + +# Create a coding assistant +coding_assistant = LocalAssistant( + name="Code Buddy", + instructions="""You are a helpful coding assistant who: + - Explains concepts with practical examples + - Provides working code snippets + - Points out potential pitfalls + - Keeps responses concise but informative""" +) + +# Ask a question +response = coding_assistant.add_message("Can you explain Python list comprehensions with examples?") +print(response) + +# Follow-up question (with conversation history maintained) +response = coding_assistant.add_message("Can you show a more complex example with filtering?") +print(response) +``` + +## Specialized Assistants + +You can create different types of assistants by changing the instructions: + +```python +# Math tutor assistant +math_tutor = LocalAssistant( + name="Math Buddy", + instructions="""You are a patient math tutor who: + - Breaks down problems step by step + - Uses clear explanations + - Provides practice problems + - Encourages understanding over memorization""" +) + +# Writing assistant +writing_assistant = LocalAssistant( + name="Writing Buddy", + instructions="""You are a writing assistant who: + - Helps improve clarity and structure + - Suggests better word choices + - Maintains the author's voice + - Explains the reasoning behind suggestions""" +) +``` + +## Working with Context + +Here's how to create an assistant that can work with context: + +```python +class ContextAwareAssistant(LocalAssistant): + def __init__(self, name: str, instructions: str, context: str): + super().__init__(name, instructions) + self.context = context + + def add_message(self, content: str, role: str = "user") -> str: + # Include context in the system message + messages = [ + {"role": "system", "content": f"{self.instructions}\n\nContext:\n{self.context}"}, + *self.conversation_history, + {"role": role, "content": content} + ] + + response = self.client.chat.completions.create( + model="llama3.1:8b-gguf-q4-km", + messages=messages + ) + + assistant_message = response.choices[0].message.content + self.conversation_history.append({"role": role, 
"content": content}) + self.conversation_history.append({"role": "assistant", "content": assistant_message}) + + return assistant_message + +# Example usage with code review context +code_context = """ +def calculate_average(numbers): + total = 0 + for num in numbers: + total += num + return total / len(numbers) +""" + +code_reviewer = ContextAwareAssistant( + name="Code Reviewer", + instructions="You are a helpful code reviewer. Suggest improvements while being constructive.", + context=code_context +) + +response = code_reviewer.add_message("Can you review this code and suggest improvements?") +print(response) +``` + +## Pro Tips + +- Keep the conversation history focused - clear it when starting a new topic +- Use specific instructions to get better responses +- Consider using temperature and max_tokens parameters for different use cases +- Remember that responses are stateless - maintain context yourself + +## Memory Management + +For longer conversations, you might want to limit the history: + +```python +def trim_conversation_history(self, max_messages: int = 10): + if len(self.conversation_history) > max_messages: + # Keep system message and last N messages + self.conversation_history = self.conversation_history[-max_messages:] +``` + +That's it! While we don't have the full Assistants API yet, we can still create powerful assistant-like +experiences using the chat completions API. The best part? It's all running locally on your machine. diff --git a/docs/docs/basic-usage/cortex-js.md b/docs/docs/basic-usage/cortex-js.md index 4e5a4a774..698e9e011 100644 --- a/docs/docs/basic-usage/cortex-js.md +++ b/docs/docs/basic-usage/cortex-js.md @@ -3,19 +3,17 @@ title: cortex.js description: How to use the Cortex.js Library --- -[Cortex.js](https://github.com/janhq/cortex.js) is a Typescript client library that can be used to interact with the Cortex API. - -This is still a work in progress, and we will let the community know once a stable version is available. - :::warning -🚧 Cortex.js is currently under development, and this page is a stub for future development. +🚧 Cortex.js is currently under development, and this page is a stub for future development. ::: +[Cortex.js](https://github.com/janhq/cortex.js) is a Typescript client library that can be used to +interact with the Cortex API. It is a fork of the OpenAI Typescript library with additional methods for Local AI. - +With the `llama-cpp` engine, cortex will accept all parameters from [`model.yml` inference section](#Inference Parameters) +and from the chat completion API. diff --git a/docs/docs/capabilities/models/presets.mdx b/docs/docs/capabilities/models/presets.mdx index 799cf6cbc..c1cc8eb48 100644 --- a/docs/docs/capabilities/models/presets.mdx +++ b/docs/docs/capabilities/models/presets.mdx @@ -4,10 +4,11 @@ description: Model Presets --- :::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. +🚧 Cortex.cpp is currently under active development. Our documentation outlines the intended behavior +of Cortex, which may not yet be fully implemented in the codebase. 
::: - \ No newline at end of file +::: --> diff --git a/docs/docs/capabilities/text-generation.md b/docs/docs/capabilities/text-generation.md index 680625667..0001514f4 100644 --- a/docs/docs/capabilities/text-generation.md +++ b/docs/docs/capabilities/text-generation.md @@ -2,6 +2,187 @@ title: Text Generation --- -:::info -🚧 Cortex is currently under development, and this page is a stub for future development. -::: \ No newline at end of file + +Cortex provides a text generation endpoint that is fully compatible with OpenAI's API. +This section shows you how to generate text using Cortex with the OpenAI Python SDK. + +## Text Generation with OpenAI compatibility + +Start server and run model in detached mode. + +```sh +cortex run -d llama3.1:8b-gguf-q4-km +``` + +Create a directory and a python environment, and start a python or IPython shell. + +```sh +mkdir test-generation +cd test-generation +``` +```sh +python -m venv .venv # or uv venv .venv --python 3.13 +source .venv/bin/activate +pip install ipython openai rich # or uv pip install ipython openai rich +``` +```sh +ipython # or "uv run ipython" +``` + +Import the necessary modules and create a client. + +```py +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:39281/v1", + api_key="not-needed" +) +``` + +### Generate Text + +Basic completion: + +```py +response = client.chat.completions.create( + model="llama3.1:8b-gguf-q4-km", + messages=[ + {"role": "user", "content": "Tell me a short story about a friendly robot."} + ] +) +print(response.choices[0].message.content) +``` +``` +Here's a short story about a friendly robot: + +**Zeta's Gift** + +In a small town surrounded by lush green hills, there lived a robot named Zeta. Zeta was unlike any other robot in the world. While others +were designed for specific tasks like assembly or transportation, Zeta was created with a single purpose: to spread joy and kindness. + +Zeta's bright blue body was shaped like a ball, with glowing lines that pulsed with warmth on its surface. Its large, round eyes sparkled +with a warm light, as if reflecting the friendliness within. Zeta loved nothing more than making new friends and surprising them with small +gifts. + +One sunny morning, Zeta decided to visit the local bakery owned by Mrs. Emma, who was famous for her delicious pastries. As Zeta entered the +shop, it was greeted by the sweet aroma of freshly baked bread. The robot's advanced sensors detected a young customer, Timmy, sitting at a +corner table, looking sad. + +Zeta quickly approached Timmy and offered him a warm smile. "Hello there! I'm Zeta. What seems to be troubling you?" Timmy explained that he +was feeling down because his family couldn't afford his favorite dessert – Mrs. Emma's famous chocolate cake – for his birthday. + +Moved by Timmy's story, Zeta asked Mrs. Emma if she could help the young boy celebrate his special day. The baker smiled and handed Zeta a +beautifully decorated cake. As the robot carefully placed the cake on a tray, it sang a gentle melody: "Happy Birthday, Timmy! May your day +be as sweet as this treat!" + +Timmy's eyes widened with joy, and he hugged Zeta tightly. Word of Zeta's kindness spread quickly through the town, earning the robot the +nickname "The Friendly Robot." From that day on, whenever anyone in need was spotted, Zeta would appear at their side, bearing gifts and +spreading love. 
+ +Zeta continued to surprise the townspeople with its thoughtfulness and warm heart, proving that even a machine could be a source of comfort +and joy. +``` + +With additional parameters: + +```py +response = client.chat.completions.create( + model="llama3.1:8b-gguf-q4-km", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What are the main differences between Python and C++?"} + ], + temperature=0.7, + max_tokens=150, + top_p=1.0, + frequency_penalty=0.0, + presence_penalty=0.0 +) +``` +```sh +ChatCompletion( + id='dnMbB12ZR6JdVDw2Spi8', + choices=[ + Choice( + finish_reason='stop', + index=0, + logprobs=None, + message=ChatCompletionMessage( + content="Python and C++ are two popular programming languages with distinct characteristics, use cases, ...", + refusal=None, + role='assistant', + audio=None, + function_call=None, + tool_calls=None + ) + ) + ], + created=1738236652, + model='_', + object='chat.completion', + service_tier=None, + system_fingerprint='_', + usage=CompletionUsage( + completion_tokens=150, + prompt_tokens=33, + total_tokens=183, + completion_tokens_details=None, + prompt_tokens_details=None + ) +) +``` + +Stream the response: + +```py +stream = client.chat.completions.create( + model="llama3.1:8b-gguf-q4-km", + messages=[ + {"role": "user", "content": "Write a haiku about programming."} + ], + stream=True +) + +for chunk in stream: + if chunk.choices[0].delta.content is not None: + print(chunk.choices[0].delta.content, end="") +``` +``` +Code flows like a stream + Errors lurk in every line +Bug hunt, endless quest +``` + +Multiple messages in a conversation: + +```py +messages = [ + {"role": "system", "content": "You are a knowledgeable science teacher."}, + {"role": "user", "content": "What is photosynthesis?"}, + {"role": "assistant", "content": "Photosynthesis is the process by which plants convert sunlight into energy."}, + {"role": "user", "content": "Can you explain it in more detail?"} +] + +response = client.chat.completions.create( + model="llama3.1:8b-gguf-q4-km", + messages=messages +) +print(response.choices[0].message.content) +``` +``` +"Photosynthesis is actually one of my favorite topics to teach! It's a crucial process that supports life on Earth, and +I'd be happy to break it down for you.\n\nPhotosynthesis occurs in specialized organelles called chloroplasts, which are present in plant +cells. These tiny factories use energy from the sun to convert carbon dioxide (CO2) and water (H2O) into glucose (a type of sugar) and +oxygen (O2).\n\nHere's a simplified equation:\n\n6 CO2 + 6 H2O + light energy → C6H12O6 (glucose) + 6 O2\n\nIn more detail, the process +involves several steps:\n\n1. **Light absorption**: Light from the sun is absorbed by pigments ..." 
+``` + +The API endpoint provided by Cortex supports all standard OpenAI parameters including: +- `temperature`: Controls randomness (0.0 to 2.0) +- `max_tokens`: Limits the length of the response +- `top_p`: Controls diversity via nucleus sampling +- `frequency_penalty`: Reduces repetition of token sequences +- `presence_penalty`: Encourages talking about new topics +- `stop`: Custom stop sequences +- `stream`: Enable/disable streaming responses diff --git a/docs/docs/chat-completions.mdx b/docs/docs/chat-completions.mdx index c4f40f0d1..3c540f266 100644 --- a/docs/docs/chat-completions.mdx +++ b/docs/docs/chat-completions.mdx @@ -6,9 +6,6 @@ description: Chat Completions Feature import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: Cortex's Chat API is compatible with OpenAI’s [Chat Completions](https://platform.openai.com/docs/api-reference/chat) endpoint. It is a drop-in replacement for local inference. @@ -23,8 +20,8 @@ Cortex routes requests to multiple APIs for remote inference while providing a s ## Usage ### CLI -```bash -# Streaming +```bash +# Streaming cortex chat --model mistral ``` ### API diff --git a/docs/docs/cli/config.mdx b/docs/docs/cli/config.mdx index 471a7a04a..b377c42a6 100644 --- a/docs/docs/cli/config.mdx +++ b/docs/docs/cli/config.mdx @@ -9,6 +9,11 @@ import TabItem from "@theme/TabItem"; # `cortex config` +:::warning +At the moment, the `cortex config` command only supports a few configurations. More +configurations will be added soon. +::: + This command allows you to update server configurations such as CORS and Allowed Headers. ## Usage @@ -65,14 +70,34 @@ This command returns all server configurations. 
For example, it returns the following: ``` -+-------------------------------------------------------------------------------------+ -| Config name | Value | -+-------------------------------------------------------------------------------------+ -| allowed_origins | http://localhost:39281 | -+-------------------------------------------------------------------------------------+ -| allowed_origins | http://127.0.0.1:39281/ | -+-------------------------------------------------------------------------------------+ -| cors | true | -+-------------------------------------------------------------------------------------+ ++-----------------------+-------------------------------------+ +| Config name | Value | ++-----------------------+-------------------------------------+ +| allowed_origins | http://localhost:39281 | ++-----------------------+-------------------------------------+ +| allowed_origins | http://127.0.0.1:39281 | ++-----------------------+-------------------------------------+ +| allowed_origins | http://0.0.0.0:39281 | ++-----------------------+-------------------------------------+ +| cors | true | ++-----------------------+-------------------------------------+ +| huggingface_token | | ++-----------------------+-------------------------------------+ +| no_proxy | example.com,::1,localhost,127.0.0.1 | ++-----------------------+-------------------------------------+ +| proxy_password | | ++-----------------------+-------------------------------------+ +| proxy_url | | ++-----------------------+-------------------------------------+ +| proxy_username | | ++-----------------------+-------------------------------------+ +| verify_host_ssl | true | ++-----------------------+-------------------------------------+ +| verify_peer_ssl | true | ++-----------------------+-------------------------------------+ +| verify_proxy_host_ssl | true | ++-----------------------+-------------------------------------+ +| verify_proxy_ssl | true | ++-----------------------+-------------------------------------+ -``` \ No newline at end of file +``` diff --git a/docs/docs/cli/engines/index.mdx b/docs/docs/cli/engines/index.mdx index 2712e0af5..0ebcb9461 100644 --- a/docs/docs/cli/engines/index.mdx +++ b/docs/docs/cli/engines/index.mdx @@ -9,8 +9,8 @@ import TabItem from "@theme/TabItem"; This command allows you to manage various engines available within Cortex. - **Usage**: + ```sh @@ -24,7 +24,6 @@ This command allows you to manage various engines available within Cortex. - **Options**: | Option | Description | Required | Default value | Example | @@ -32,18 +31,18 @@ This command allows you to manage various engines available within Cortex. | `-h`, `--help` | Display help information for the command. | No | - | `-h` | {/* | `-vk`, `--vulkan` | Install Vulkan engine. | No | `false` | `-vk` | */} ---- -# Subcommands: + ## `cortex engines list` + :::info This CLI command calls the following API endpoint: - [List Engines](/api-reference#tag/engines/get/v1/engines) ::: -This command lists all the Cortex's engines. - +This command lists all the Cortex's engines. **Usage**: + ```sh @@ -58,6 +57,7 @@ This command lists all the Cortex's engines. 
For example, it returns the following: + ``` +---+--------------+-------------------+---------+----------------------------+---------------+ | # | Name | Supported Formats | Version | Variant | Status | @@ -66,18 +66,19 @@ For example, it returns the following: +---+--------------+-------------------+---------+----------------------------+---------------+ | 2 | llama-cpp | GGUF | 0.1.34 | linux-amd64-avx2-cuda-12-0 | Ready | +---+--------------+-------------------+---------+----------------------------+---------------+ -| 3 | tensorrt-llm | TensorRT Engines | | | Not Installed | -+---+--------------+-------------------+---------+----------------------------+---------------+ ``` ## `cortex engines get` + :::info This CLI command calls the following API endpoint: - [Get Engine](/api-reference#tag/engines/get/v1/engines/{name}) ::: + This command returns an engine detail defined by an engine `engine_name`. **Usage**: + ```sh @@ -92,6 +93,7 @@ This command returns an engine detail defined by an engine `engine_name`. For example, it returns the following: + ``` +-----------+-------------------+---------+-----------+--------+ | Name | Supported Formats | Version | Variant | Status | @@ -99,11 +101,11 @@ For example, it returns the following: | llama-cpp | GGUF | 0.1.37 | mac-arm64 | Ready | +-----------+-------------------+---------+-----------+--------+ ``` + :::info To get an engine name, run the [`engines list`](/docs/cli/engines/list) command. ::: - **Options**: | Option | Description | Required | Default value | Example | @@ -114,16 +116,18 @@ To get an engine name, run the [`engines list`](/docs/cli/engines/list) command. ## `cortex engines install` + :::info This CLI command calls the following API endpoint: - [Init Engine](/api-reference#tag/engines/post/v1/engines/{name}/init) ::: + This command downloads the required dependencies and installs the engine within Cortex. Currently, Cortex supports three engines: - `llama-cpp` - `onnxruntime` -- `tensorrt-llm` **Usage**: + ```sh @@ -133,7 +137,6 @@ This command downloads the required dependencies and installs the engine within ```sh cortex.exe engines install [options] - ``` @@ -150,6 +153,7 @@ This command downloads the required dependencies and installs the engine within This command uninstalls the engine within Cortex. **Usage**: + ```sh @@ -164,6 +168,7 @@ This command uninstalls the engine within Cortex. For Example: + ```bash ## Llama.cpp engine cortex engines uninstall llama-cpp diff --git a/docs/docs/cli/models/index.mdx b/docs/docs/cli/models/index.mdx index dff452788..6c40ee55e 100644 --- a/docs/docs/cli/models/index.mdx +++ b/docs/docs/cli/models/index.mdx @@ -14,6 +14,7 @@ This command allows you to start, stop, and manage various local or remote model :::info You can use the `--verbose` flag to display more detailed output of the internal processes. To apply this flag, use the following format: `cortex --verbose [subcommand]`. ::: + ```sh @@ -23,7 +24,6 @@ You can use the `--verbose` flag to display more detailed output of the internal ```sh cortex.exe models [options] - ``` @@ -38,15 +38,16 @@ You can use the `--verbose` flag to display more detailed output of the internal # Subcommands: ## `cortex models get` + :::info This CLI command calls the following API endpoint: - [Get Model](/api-reference#tag/models/get/v1/models/{id}) ::: -This command returns a model detail defined by a `model_id`. - +This command returns a model detail defined by a `model_id`. 
**Usage**: + ```sh @@ -56,16 +57,67 @@ This command returns a model detail defined by a `model_id`. ```sh cortex.exe models get - ``` For example, it returns the following: -```yaml +```json { - "ai_template":"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n","created":9223372036854775888,"ctx_len":4096,"dynatemp_exponent":1.0,"dynatemp_range":0.0,"engine":"llama-cpp","files":["models/cortex.so/llama3.2/3b-gguf-q4-km/model.gguf"],"frequency_penalty":0.0,"gpu_arch":"","id":"Llama-3.2-3B-Instruct","ignore_eos":false,"max_tokens":4096,"min_keep":0,"min_p":0.05000000074505806,"mirostat":false,"mirostat_eta":0.10000000149011612,"mirostat_tau":5.0,"model":"Llama-3.2-3B-Instruct","n_parallel":1,"n_probs":0,"name":"llama3.2:3b-gguf-q4-km","ngl":29,"object":"model","os":"","owned_by":"","penalize_nl":false,"precision":"","presence_penalty":0.0,"prompt_template":"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n","quantization_method":"","repeat_last_n":64,"repeat_penalty":1.0,"result":"OK","seed":-1,"stop":["<|eot_id|>"],"stream":true,"system_template":"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n","temperature":0.69999998807907104,"text_model":false,"tfs_z":1.0,"top_k":40,"top_p":0.89999997615814209,"typ_p":1.0,"user_template":"<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n","version":"2" + "ai_template" : "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "created" : 127638593791813, + "ctx_len" : 8192, + "dynatemp_exponent" : 1.0, + "dynatemp_range" : 0.0, + "engine" : "llama-cpp", + "files" : + [ + "models/cortex.so/llama3.1/8b-gguf-q4-km/model.gguf" + ], + "frequency_penalty" : 0.0, + "gpu_arch" : "", + "id" : "llama3.1:8b-gguf-q4-km", + "ignore_eos" : false, + "max_tokens" : 8192, + "min_keep" : 0, + "min_p" : 0.050000000000000003, + "mirostat" : false, + "mirostat_eta" : 0.10000000000000001, + "mirostat_tau" : 5.0, + "model" : "llama3.1:8b-gguf-q4-km", + "n_parallel" : 1, + "n_probs" : 0, + "name" : "llama3.1:8b-gguf-q4-km", + "ngl" : 33, + "object" : "", + "os" : "", + "owned_by" : "", + "penalize_nl" : false, + "precision" : "", + "presence_penalty" : 0.0, + "prompt_template" : "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "quantization_method" : "", + "repeat_last_n" : 64, + "repeat_penalty" : 1.0, + "seed" : -1, + "size" : 4920739981, + "stop" : + [ + "<|end_of_text|>", + "<|eot_id|>", + "<|eom_id|>" + ], + "stream" : true, + "system_template" : "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n", + "temperature" : 0.59999999999999998, + "text_model" : false, + "tfs_z" : 1.0, + "top_k" : 40, + "top_p" : 0.90000000000000002, + "typ_p" : 1.0, + "user_template" : "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n", + "version" : "1" } ``` :::info @@ -89,6 +141,7 @@ This command lists all the downloaded local and remote models. **Usage**: + ```sh @@ -102,8 +155,9 @@ This command lists all the downloaded local and remote models. 
-For example, it returns the following:w -```bash +For example, it returns the following: + +``` +---------+---------------------------------------------------------------------------+ | (Index) | ID | +---------+---------------------------------------------------------------------------+ @@ -113,7 +167,6 @@ For example, it returns the following:w +---------+---------------------------------------------------------------------------+ | 3 | TheBloke:Mistral-7B-Instruct-v0.1-GGUF:mistral-7b-instruct-v0.1.Q2_K.gguf | +---------+---------------------------------------------------------------------------+ - ``` **Options**: @@ -126,16 +179,18 @@ For example, it returns the following:w | `--cpu_mode` | Display CPU mode. | No | - | `--cpu_mode` | | `--gpu_mode` | Display GPU mode. | No | - | `--gpu_mode` | + ## `cortex models start` + :::info This CLI command calls the following API endpoint: - [Start Model](/api-reference#tag/models/post/v1/models/{modelId}/start) ::: -This command starts a model defined by a `model_id`. - +This command starts a model defined by a `model_id`. **Usage**: + ```sh @@ -145,12 +200,10 @@ This command starts a model defined by a `model_id`. ```sh cortex.exe models start [options] - ``` - :::info This command uses a `model_id` from the model that you have downloaded or available in your file system. ::: @@ -165,15 +218,16 @@ This command uses a `model_id` from the model that you have downloaded or availa | `-h`, `--help` | Display help information for the command. | No | - | `-h` | ## `cortex models stop` + :::info This CLI command calls the following API endpoint: - [Stop Model](/api-reference#tag/models/post/v1/models/{modelId}/stop) ::: -This command stops a model defined by a `model_id`. - +This command stops a model defined by a `model_id`. **Usage**: + ```sh @@ -183,7 +237,6 @@ This command stops a model defined by a `model_id`. ```sh cortex.exe models stop - ``` @@ -191,6 +244,7 @@ This command stops a model defined by a `model_id`. :::info This command uses a `model_id` from the model that you have started before. ::: + **Options**: | Option | Description | Required | Default value | Example | @@ -199,15 +253,16 @@ This command uses a `model_id` from the model that you have started before. | `-h`, `--help` | Display help information for the command. | No | - | `-h` | ## `cortex models delete` + :::info This CLI command calls the following API endpoint: - [Delete Model](/api-reference#tag/models/delete/v1/models/{id}) ::: -This command deletes a local model defined by a `model_id`. - +This command deletes a local model defined by a `model_id`. **Usage**: + ```sh @@ -217,7 +272,6 @@ This command deletes a local model defined by a `model_id`. ```sh cortex.exe models delete - ``` @@ -227,20 +281,23 @@ This command uses a `model_id` from the model that you have downloaded or availa ::: **Options**: + | Option | Description | Required | Default value | Example | |---------------------------|-----------------------------------------------------------------------------|----------|----------------------|------------------------| | `model_id` | The identifier of the model you want to delete. | Yes | - | `mistral` | | `-h`, `--help` | Display help for command. | No | - | `-h` | ## `cortex models update` + :::info This CLI command calls the following API endpoint: - [Update Model](/api-reference#tag/models/patch/v1/models/{modelId) ::: -This command updates the `model.yaml` file of a local model. +This command updates the `model.yaml` file of a local model. 
**Usage**: + ```sh @@ -250,13 +307,13 @@ This command updates the `model.yaml` file of a local model. ```sh cortex.exe models update [options] - ``` **Options**: + | Option | Description | Required | Default value | Example | |---------------------------|-----------------------------------------------------------------------------|----------|----------------------|------------------------| | `-h`, `--help` | Display help for command. | No | - | `-h` | @@ -306,14 +363,16 @@ This command updates the `model.yaml` file of a local model. | `--n_probs` | Number of probability outputs to return. | No | - | `--n_probs 5` | ## `cortex models import` -This command imports the local model using the model's `gguf` file. +This command imports the local model using the model's `gguf` file. **Usage**: + :::info This CLI command calls the following API endpoint: - [Import Model](/api-reference#tag/models/post/v1/models/import) ::: + ```sh @@ -323,15 +382,14 @@ This CLI command calls the following API endpoint: ```sh cortex.exe models import --model_id --model_path - ``` - **Options**: + | Option | Description | Required | Default value | Example | |---------------------------|-----------------------------------------------------------------------------|----------|----------------------|------------------------| | `-h`, `--help` | Display help for command. | No | - | `-h` | | `--model_id` | The identifier of the model. | Yes | - | `mistral` | -| `--model_path` | The path of the model source file. | Yes | - | `/path/to/your/model.gguf` | \ No newline at end of file +| `--model_path` | The path of the model source file. | Yes | - | `/path/to/your/model.gguf` | diff --git a/docs/docs/cli/ps.mdx b/docs/docs/cli/ps.mdx index a70a9501c..5b531165b 100644 --- a/docs/docs/cli/ps.mdx +++ b/docs/docs/cli/ps.mdx @@ -12,6 +12,7 @@ import TabItem from "@theme/TabItem"; This command shows the running model and its status (Engine, RAM, VRAM, and Uptime). ## Usage + ```sh @@ -27,8 +28,7 @@ This command shows the running model and its status (Engine, RAM, VRAM, and Upti For example, it returns the following table: -```bash -> cortex ps +``` +------------------------+-----------+-----------+-----------+-------------------------------+ | Model | Engine | RAM | VRAM | Uptime | +------------------------+-----------+-----------+-----------+-------------------------------+ @@ -45,4 +45,18 @@ For example, it returns the following table: :::info You can use the `--verbose` flag to display more detailed output of the internal processes. To apply this flag, use the following format: `cortex --verbose [subcommand]`. 
-::: \ No newline at end of file +::: + +```sh +cortex --verbose ps +``` +``` +20250131 12:03:52.995079 UTC 472664 INFO Gpu Driver Version: 565.77 - system_info_utils.cc:20 +20250131 12:03:52.995393 UTC 472664 INFO CUDA Version: 12.7 - system_info_utils.cc:31 ++------------------------+-----------+--------+---------+---------------------------------+ +| Model | Engine | RAM | VRAM | Up time | ++------------------------+-----------+--------+---------+---------------------------------+ +| llama3.1:8b-gguf-q4-km | llama-cpp | 0.00 B | 4.58 GB | 9 hours, 40 minutes, 34 seconds | ++------------------------+-----------+--------+---------+---------------------------------+ +20250131 12:03:53.012323 UTC 472670 INFO Will not check for new update, return the cache latest: v1.0.8 - cortex_upd_cmd.cc:149 +``` diff --git a/docs/docs/cli/pull.mdx b/docs/docs/cli/pull.mdx index 028962896..62103aa7d 100644 --- a/docs/docs/cli/pull.mdx +++ b/docs/docs/cli/pull.mdx @@ -12,7 +12,7 @@ import TabItem from "@theme/TabItem"; This CLI command calls the following API endpoint: - [Download Model](/api-reference#tag/pulling-models/post/v1/models/pull) ::: -This command displays downloaded models, or displays models available for downloading. +This command displays downloaded models, or displays models available for downloading. There are 3 ways to download models: - From Cortex's [Built-in models](/models): `cortex pull ` @@ -33,19 +33,21 @@ You can use the `--verbose` flag to display more detailed output of the internal ```sh - cortex pull [options] + cortex pull [options] ``` ```sh - cortex.exe pull [options] + cortex.exe pull [options] ``` For example, this returns the following: ```bash -> cortex pull llama3.2 +cortex pull llama3.2 +``` +``` Downloaded models: llama3.2:3b-gguf-q4-km @@ -68,4 +70,4 @@ Select a model (1-9): | Option | Description | Required | Default value | Example | | -------------- | ------------------------------------------------- | -------- | ------------- | ----------- | | `model_id` | The identifier of the model you want to download. | Yes | - | `mistral` | -| `-h`, `--help` | Display help information for the command. | No | - | `-h` | \ No newline at end of file +| `-h`, `--help` | Display help information for the command. | No | - | `-h` | diff --git a/docs/docs/cli/run.mdx b/docs/docs/cli/run.mdx index 57c8358a2..73144de36 100644 --- a/docs/docs/cli/run.mdx +++ b/docs/docs/cli/run.mdx @@ -9,25 +9,25 @@ import TabItem from "@theme/TabItem"; # `cortex run` -This CLI command is a shortcut to run models easily. It executes this sequence of commands: -1. [`cortex pull`](/docs/cli/models/): This command pulls the specified model if the model is not yet downloaded, or finds a local model. -2. [`cortex engines install`](/docs/cli/engines/): This command installs the specified engines if not yet downloaded. -3. [`cortex models start`](/docs/cli/models/): This command starts the specified model, making it active and ready for interactions. +The lazy dev's way to run models. Does three things: +1. [`cortex pull`](/docs/cli/models/): Grabs the model if you don't have it +2. [`cortex engines install`](/docs/cli/engines/): Sets up engines if missing +3. [`cortex models start`](/docs/cli/models/): Fires up the model ## Usage :::info -You can use the `--verbose` flag to display more detailed output of the internal processes. To apply this flag, use the following format: `cortex --verbose [subcommand]`. +Need the gory details? 
Use `--verbose` flag like this: `cortex --verbose [subcommand]` ::: ```sh - cortex [options] + cortex run [options] ``` ```sh - cortex.exe [options] + cortex.exe run [options] ``` @@ -40,4 +40,4 @@ You can use the `--verbose` flag to display more detailed output of the internal | `--gpus` | List of GPUs to use. | No | - | `[0,1]` | | `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` | | `-h`, `--help` | Display help information for the command. | No | - | `-h` | - +| `-d`, `--detached` | Load the model without starting an interactive chat | No | - | `-d` | diff --git a/docs/docs/cli/start.mdx b/docs/docs/cli/start.mdx index 703e5f535..08bb4a23c 100644 --- a/docs/docs/cli/start.mdx +++ b/docs/docs/cli/start.mdx @@ -9,7 +9,7 @@ import TabItem from "@theme/TabItem"; # `cortex start` This command starts the Cortex API server processes. -If the server is not yet running, the server will automatically be started when running other Cortex commands. +If the server is not yet running, the server will automatically start when running other Cortex commands. ## Usage :::info @@ -36,6 +36,3 @@ You can use the `--verbose` flag to display more detailed output of the internal | `-h`, `--help` | Display help information for the command. | No | - | `-h` | | `-p`, `--port ` | Port to serve the application. | No | - | `-p 39281` | | `--loglevel ` | Setup loglevel for cortex server, in the priority of `ERROR`, `WARN`, `INFO`, `DEBUG`, `TRACE` | No | - | `--loglevel INFO` will display ERROR, WARN and INFO logs| - - - diff --git a/docs/docs/cli/stop.mdx b/docs/docs/cli/stop.mdx index 0b8625f9e..7422037d1 100644 --- a/docs/docs/cli/stop.mdx +++ b/docs/docs/cli/stop.mdx @@ -8,16 +8,20 @@ import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; # `cortex stop` + :::info This CLI command calls the following API endpoint: - [Stop Cortex](/api-reference#tag/system/delete/v1/system) ::: + This command stops the API server. ## Usage + :::info You can use the `--verbose` flag to display more detailed output of the internal processes. To apply this flag, use the following format: `cortex --verbose [subcommand]`. ::: + ```sh @@ -36,4 +40,4 @@ You can use the `--verbose` flag to display more detailed output of the internal | Option | Description | Required | Default value | Example | |-------------------|-------------------------------------------------------|----------|---------------|-------------| -| `-h`, `--help` | Display help information for the command. | No | - | `-h` | \ No newline at end of file +| `-h`, `--help` | Display help information for the command. | No | - | `-h` | diff --git a/docs/docs/cli/update.mdx b/docs/docs/cli/update.mdx index 0f06f8476..3cc40ba20 100644 --- a/docs/docs/cli/update.mdx +++ b/docs/docs/cli/update.mdx @@ -16,10 +16,11 @@ This command updates Cortex.cpp to the provided version or the latest version. :::info You can use the `--verbose` flag to display more detailed output of the internal processes. To apply this flag, use the following format: `cortex --verbose [subcommand]`. ::: + ```sh - cortex update [options] + sudo cortex update [options] ``` @@ -39,6 +40,3 @@ By default, if no version is specified, Cortex.cpp will be updated to the latest |----------------------------|-------------------------------------------|----------|---------------|------------------------| | `-h`, `--help` | Display help information for the command. | No | - | `-h` | | `-v` | Specify the version of the Cortex. 
| No | - | `-v1.0.1`| - - - diff --git a/docs/docs/configurations/cors.mdx b/docs/docs/configurations/cors.mdx index 2ea6907de..a2454a23f 100644 --- a/docs/docs/configurations/cors.mdx +++ b/docs/docs/configurations/cors.mdx @@ -1,5 +1,5 @@ --- -title: CORS +title: CORS Configuration Guide description: Setting up CORS slug: "cors" --- @@ -7,43 +7,35 @@ slug: "cors" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - -# CORS Configuration Guide - -This document describes how to configure Cross-Origin Resource Sharing (CORS) settings for the API server using both CLI commands and HTTP API endpoints. +This document describes how to configure Cross-Origin Resource Sharing (CORS) settings for the API server +using the CLI commands and the HTTP API endpoints. ## Command Line Interface (CLI) -### Basic Usage +Basic Usage ```bash cortex config [OPTIONS] [COMMAND] ``` -### Commands - -- `status`: Display all current configurations - - ```bash - cortex config status - ``` +To display all current configurations: - Example Output: +```bash +cortex config status +``` - ```bash - +-----------------+-------------------+ - | Config name | Value | - +-----------------+-------------------+ - | allowed_origins | http://localhost | - +-----------------+-------------------+ - | allowed_origins | https://cortex.so | - +-----------------+-------------------+ - | cors | true | - +-----------------+-------------------+ - ``` +Example Output: +```md ++-----------------+-------------------+ +| Config name | Value | ++-----------------+-------------------+ +| allowed_origins | http://localhost | ++-----------------+-------------------+ +| allowed_origins | https://cortex.so | ++-----------------+-------------------+ +| cors | true | ++-----------------+-------------------+ +``` ### Options @@ -84,39 +76,47 @@ This document describes the REST API endpoints available for managing CORS confi ### Endpoints -#### Get Current Configuration +To get the current configuration via http. ```http GET /v1/configs ``` - -Retrieves the current CORS configuration settings. - -##### Response - +```sh +curl GET http://127.0.0.1:39281/v1/configs +``` ```json { - "allowed_origins": ["http://localhost:39281"], - "cors": true + "allowed_origins": [ + "http://localhost:39281", + "http://127.0.0.1:39281", + "http://0.0.0.0:39281" + ], + "cors": true, + "huggingface_token": "", + "no_proxy": "example.com,::1,localhost,127.0.0.1", + "proxy_password": "", + "proxy_url": "", + "proxy_username": "", + "verify_host_ssl": true, + "verify_peer_ssl": true, + "verify_proxy_host_ssl": true, + "verify_proxy_ssl": true } ``` -#### Update Configuration +To update the current configuration. ```http PATCH /v1/configs ``` -Updates CORS configuration settings. 
- -##### Request Headers +Request Headers ``` Content-Type: application/json ``` -##### Request Body - +Request Body ```json { "cors": true, @@ -124,15 +124,7 @@ Content-Type: application/json } ``` -##### Parameters - -| Field | Type | Description | -| ----------------- | -------- | ---------------------------- | -| `cors` | boolean | Enable or disable CORS | -| `allowed_origins` | string[] | Array of allowed origin URLs | - -##### Response - +Response ```json { "config": { @@ -143,15 +135,7 @@ Content-Type: application/json } ``` -### Example cURL Commands - -#### Get Configuration - -```bash -curl --location 'http://127.0.0.1:39281/v1/configs' -``` - -#### Update Configuration +Equivalent cURL command: ```bash curl --location --request PATCH 'http://127.0.0.1:39281/v1/configs' \ @@ -163,6 +147,26 @@ curl --location --request PATCH 'http://127.0.0.1:39281/v1/configs' \ ] }' ``` +```json +{ + "config": { + "allowed_origins": [ + "http://localhost:39281" + ], + "cors": false, + "huggingface_token": "", + "no_proxy": "example.com,::1,localhost,127.0.0.1", + "proxy_password": "", + "proxy_url": "", + "proxy_username": "", + "verify_host_ssl": true, + "verify_peer_ssl": true, + "verify_proxy_host_ssl": true, + "verify_proxy_ssl": true + }, + "message": "Configuration updated successfully" +} +``` ## Notes diff --git a/docs/docs/configurations/index.mdx b/docs/docs/configurations/index.mdx index fe3ab2f0e..8d9c77847 100644 --- a/docs/docs/configurations/index.mdx +++ b/docs/docs/configurations/index.mdx @@ -1,13 +1,13 @@ --- slug: /configurations -title: Cortex configurations +title: Configuration --- # Cortex Configurations -Welcome to the Cortex configurations documentation. Here you will find detailed guides and references for configuring various aspects of Cortex, including: - +This section covers the different configurations you can set up for Cortex, including: - **CORS**: Learn how to set up Cross-Origin Resource Sharing. - **Proxy**: Configure the proxy for Cortex. +- **token**: Set up the token for Cortex. -Use the sidebar to navigate through the different configuration topics. +Use the sidebar to navigate through the different topics. diff --git a/docs/docs/configurations/proxy.mdx b/docs/docs/configurations/proxy.mdx index ccd3aa9dc..19b277d80 100644 --- a/docs/docs/configurations/proxy.mdx +++ b/docs/docs/configurations/proxy.mdx @@ -7,9 +7,6 @@ slug: "proxy" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: # Proxy Configuration Guide @@ -17,15 +14,13 @@ This document describes how to configure proxy settings for Cortex to be able to ## Command Line Interface (CLI) -### Basic Usage +Basic Usage ```bash cortex config [OPTIONS] [COMMAND] ``` -### Commands - -- `status`: Display all current configurations +To display all current configurations ```bash cortex config status @@ -33,7 +28,7 @@ cortex config status Example Output: -```bash +```md +-----------------------+------------------------+ | Config name | Value | +-----------------------+------------------------+ @@ -73,16 +68,17 @@ Example Output: ### Endpoints -#### Get Current Configuration +To get the current configuration ```http GET /v1/configs ``` +or +```sh +curl GET http://127.0.0.1:39281/v1/configs +``` -Retrieves the current configuration settings. 
- -##### Response - +Response ```json { "allowed_origins": [ @@ -102,22 +98,18 @@ Retrieves the current configuration settings. } ``` -#### Update Configuration +##### To update the current configuration ```http PATCH /v1/configs ``` -Updates proxy configuration settings. - ##### Request Headers - -``` +```json Content-Type: application/json ``` ##### Request Body - ```json { "no_proxy": "localhost", @@ -170,7 +162,8 @@ Content-Type: application/json ## Testing proxy configuration -You can test your proxy configuration using [mitmproxy](https://docs.mitmproxy.org/stable). This guide is written on macOS, but you can use it on any other platform. +You can test your proxy configuration using [mitmproxy](https://docs.mitmproxy.org/stable). This guide is written +on macOS, but you can use it on any other platform. ### Install mitmproxy @@ -184,19 +177,22 @@ brew install mitmproxy mitmproxy --set stream_large_bodies=1m ``` -mitmproxy will start on port `8080`. After mitmproxy started, you can adding options by pressing `O`. mitmproxy will display an option screen. You can check their document to learn more about mitmproxy. But let's take a simple option for now by setting the `proxyauth` for our local proxy. Inside the option screen, search for `proxyauth` and hit enter. Then, type `username:password` and hit enter again. You will see your newly added option is red-colored. +`mitmproxy` will start on port `8080`. After `mitmproxy` has started, you can add different options by pressing +the capital letter `O` and this will display an optional screen. As an example, try searching for `proxyauth` +and hit enter when you find it. Then, type `username:password` and hit enter again. You will see your newly +added option is red-colored. -### Configuring Cortex to use that proxy +Now we can use the Cortex CLI to tweak our config and use that proxy. In `mitmproxy`, press the capital letter `E` +to see the event log. You will see the request and response logs in this screen. -Let's using CLI to configure Cortex to use that proxy. +Let's try changing our config in a different terminal window. ```bash cortex config --proxy_url http://localhost:8080 --proxy_username username --proxy_password password ``` - -### Testing the proxy - -Now, let's test the proxy. If you are setting the username and password correctly (same with `proxyauth` in mitmproxy), you will see the request in mitmproxy. For example, command `cortex pull tinyllama` should be successfully and returns a list of selectable models. Also, you will see your request in mitmproxy CLI screen. +``` +Configuration updated successfully! +``` Let's try to use a wrong authentication for your proxy. @@ -204,4 +200,4 @@ Let's try to use a wrong authentication for your proxy. cortex config --proxy_password wrong_pw ``` -Now, let's test the proxy again. You will see the request is failed and returns an error. +![mproxy](/img/mitproxy_ex.png) diff --git a/docs/docs/configurations/token.mdx b/docs/docs/configurations/token.mdx index 687879cb4..df494f76a 100644 --- a/docs/docs/configurations/token.mdx +++ b/docs/docs/configurations/token.mdx @@ -1,5 +1,5 @@ --- -title: Token +title: Token Configuration Guide description: Setting up token slug: "token" --- @@ -7,33 +7,24 @@ slug: "token" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. 
-::: -# Token Configuration Guide - -This document describes how to configure HuggingFace token settings for Cortex. +A lot of the models available today can be found on HuggingFace. This page describes how to configure +HuggingFace token settings for Cortex. ## Command Line Interface (CLI) -### Basic Usage +Basic Usage ```bash cortex config [OPTIONS] [COMMAND] ``` -### Commands - -- `status`: Display all current configurations +To display all current configurations ```bash cortex config status ``` - -Example Output: - -```bash +``` +-----------------------+------------------------+ | Config name | Value | +-----------------------+------------------------+ @@ -57,11 +48,9 @@ Example Output: ```http GET /v1/configs ``` - -Retrieves the current configuration settings. - -##### Response - +```sh +curl GET http://127.0.0.1:39281/v1/configs +``` ```json { "allowed_origins": [ @@ -70,7 +59,15 @@ Retrieves the current configuration settings. "http://0.0.0.0:39281" ], "cors": true, - "huggingface_token": "" + "huggingface_token": "", + "no_proxy": "example.com,::1,localhost,127.0.0.1", + "proxy_password": "", + "proxy_url": "", + "proxy_username": "", + "verify_host_ssl": true, + "verify_peer_ssl": true, + "verify_proxy_host_ssl": true, + "verify_proxy_ssl": true } ``` @@ -117,4 +114,4 @@ Content-Type: application/json }, "message": "Configuration updated successfully" } -``` \ No newline at end of file +``` diff --git a/docs/docs/cortex-cpp.md b/docs/docs/cortex-cpp.md index 9612164f1..00fffa313 100644 --- a/docs/docs/cortex-cpp.md +++ b/docs/docs/cortex-cpp.md @@ -4,10 +4,6 @@ description: Cortex.cpp Architecture slug: "cortex-cpp" --- -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - Cortex.cpp is a Local AI engine that is used to run and customize LLMs. Cortex can be deployed as a standalone server, or integrated into apps like [Jan.ai](https://jan.ai/) Cortex's roadmap is to eventually support full OpenAI API-equivalence. diff --git a/docs/docs/cortex-llamacpp.mdx b/docs/docs/cortex-llamacpp.mdx index 82e51a7a8..db2085eb0 100644 --- a/docs/docs/cortex-llamacpp.mdx +++ b/docs/docs/cortex-llamacpp.mdx @@ -7,9 +7,6 @@ slug: "cortex-llamacpp" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: :::info `llamacpp` is formerly called "Nitro". @@ -67,9 +64,9 @@ The command will check, download, and install these dependencies: - Cuda 12.2: - libcublas.so.12 - libcublasLt.so.12 - - libcudart.so.12 + - libcudart.so.12 - Cuda 12.4: - - libcublasLt.so.12 + - libcublasLt.so.12 - libcublas.so.12 ``` diff --git a/docs/docs/embeddings.mdx b/docs/docs/embeddings.mdx index 5cc675ee2..fae8c8a2c 100644 --- a/docs/docs/embeddings.mdx +++ b/docs/docs/embeddings.mdx @@ -7,18 +7,15 @@ slug: "embeddings" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - An embedding is a vector that represents a piece of text, with the distance between vectors indicating similarity, which means closer distances mean more similar texts, while farther distances mean less similar texts. 
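+
+Because the endpoint follows OpenAI's Embeddings API (see the note below), you can also request embeddings with the OpenAI Python SDK pointed at the local Cortex server. This is a minimal sketch; the model id is illustrative, so substitute any embedding-capable model you have pulled.
+
+```py
+from openai import OpenAI
+
+# Cortex serves an OpenAI-compatible API on port 39281 by default.
+client = OpenAI(base_url="http://localhost:39281/v1", api_key="not-needed")
+
+response = client.embeddings.create(
+    model="llama3.1:8b-gguf-q4-km",  # illustrative model id
+    input="Hello World",
+)
+print(len(response.data[0].embedding))  # dimensionality of the returned vector
+```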
+ :::note The Cortex Embeddings feature is fully compatible with OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) endpoints. ::: ## Usage ### CLI -```bash +```bash # Without Flag cortex embeddings "Hello World" # With model_id Flag @@ -84,4 +81,4 @@ For a complete list of models, please visit the [Cortex Hub](https://huggingface Learn more about Embeddings capabilities: - [Embeddings API Reference](/api-reference#tag/embeddings/post/embeddings) - [Embeddings CLI command](/docs/cli/embeddings) -::: \ No newline at end of file +::: diff --git a/docs/docs/engines/index.mdx b/docs/docs/engines/index.mdx index 4043de20d..7eb8fcab3 100644 --- a/docs/docs/engines/index.mdx +++ b/docs/docs/engines/index.mdx @@ -5,63 +5,50 @@ title: Engines import DocCardList from "@theme/DocCardList"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - -# Engines - -Engines in Cortex serve as execution drivers for machine learning models, providing the runtime environment necessary for model operations. Each engine is specifically designed to optimize the performance and ensure compatibility with its corresponding model types. +Engines in Cortex serve as execution drivers for machine learning models, providing the runtime +and environment necessary for model operations. Each engine is optimized for hardware +performance and ensures compatibility with its corresponding model types. ## Supported Engines -Cortex currently supports three industry-standard engines: +Cortex currently supports two engines: -| Engine | Source | Description | -| -------------------------------------------------------- | --------- | -------------------------------------------------------------------------------------- | -| [llama.cpp](https://github.com/ggerganov/llama.cpp) | ggerganov | Inference of Meta's LLaMA model (and others) in pure C/C++ | -| [ONNX Runtime](https://github.com/microsoft/onnxruntime) | Microsoft | ONNX Runtime: cross-platform, high performance ML inferencing and training accelerator | -| [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) | NVIDIA | GPU-optimized inference engine for large language models | +| Engine | Source | Description | +| -------------------------------------------------------- | --------- | -----------------------------------------------------------------------| +| [llama.cpp](https://github.com/ggerganov/llama.cpp) | ggerganov | Inference of models in GGUF format, written in pure C/C++ | +| [ONNX Runtime](https://github.com/microsoft/onnxruntime) | Microsoft | Cross-platform, high performance ML inference and training accelerator | -> **Note:** Cortex also supports users in building their own engines. +> **Note:** Cortex also supports building and adding your own custom engines. ## Features -- **Engine Retrieval**: Install engines with a single click. +- **Engine Retrieval**: Install the engines above or your own custom one with a single command. - **Engine Management**: Easily manage engines by type, variant, and version. -- **User-Friendly Interface**: Access models via Command Line Interface (CLI) or HTTP API. -- **Engine Selection**: Select the appropriate engines to run your models. 
- -## Usage - -Cortex offers comprehensive support for multiple engine types, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [ONNX Runtime](https://github.com/microsoft/onnxruntime), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). These engines are utilized to load their corresponding model types. The platform provides a flexible management system for different engine variants and versions, enabling developers and users to easily rollback changes or compare performance metrics across different engine versions. +- **User-Friendly Interface**: Manage your server, engines, and models via Cortex's CLI or via HTTP API. +- **Engine Selection**: Depending on the model and its format, you can use different engine for the same models. ### Installing an engine -Cortex makes it extremely easy to install an engine. For example, to run a `GGUF` model, you will need the `llama-cpp` engine. To install it, simply enter `cortex engines install llama-cpp` into your terminal and wait for the process to complete. Cortex will automatically pull the latest stable version suitable for your PC's specifications. - -#### CLI To install an engine using the CLI, use the following command: ```sh cortex engines install llama-cpp +``` +``` Validating download items, please wait.. Start downloading.. llama-cpp 100%[==================================================] [00m:00s] 1.24 MB/1.24 MB Engine llama-cpp downloaded successfully! ``` -#### HTTP API - To install an engine using the HTTP API, use the following command: ```sh -curl --location --request POST 'http://127.0.0.1:39281/engines/install/llama-cpp' +curl http://127.0.0.1:39281/v1/engines/llama-cpp/install \ + --request POST \ + --header 'Content-Type: application/json' ``` - -Example response: - ```json { "message": "Engine llama-cpp starts installing!" @@ -70,7 +57,7 @@ Example response: ### Listing engines -Cortex allowing clients to easily list current engines and their statuses. Each engine type can have different variants and versions, which are crucial for debugging and performance optimization. Different variants cater to specific hardware configurations, such as CUDA for NVIDIA GPUs and Vulkan for AMD GPUs on Windows, or AVX512 support for CPUs. +Cortex allows you to list current engines and their statuses. Each engine type can have different variants and versions, which are crucial for debugging and performance optimization. Different variants cater to specific hardware configurations, such as CUDA for NVIDIA GPUs and Vulkan for AMD GPUs on Windows, or AVX512 support for CPUs. 
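+
+In addition to the CLI and curl examples below, the same endpoint can be queried programmatically, for example with Python's `requests` library (a minimal sketch against the default local server):
+
+```py
+import requests
+
+# List the engines known to a locally running Cortex server (default port 39281).
+resp = requests.get("http://127.0.0.1:39281/v1/engines")
+resp.raise_for_status()
+for engine in resp.json()["data"]:
+    print(engine["name"], engine.get("version", ""), engine.get("status", ""))
+```
+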
#### CLI @@ -78,6 +65,8 @@ You can list the available engines using the following command: ```sh cortex engines list +``` +``` +---+--------------+-------------------+---------+-----------+--------------+ | # | Name | Supported Formats | Version | Variant | Status | +---+--------------+-------------------+---------+-----------+--------------+ @@ -85,8 +74,6 @@ cortex engines list +---+--------------+-------------------+---------+-----------+--------------+ | 2 | llama-cpp | GGUF | 0.1.37 | mac-arm64 | Ready | +---+--------------+-------------------+---------+-----------+--------------+ -| 3 | tensorrt-llm | TensorRT Engines | | | Incompatible | -+---+--------------+-------------------+---------+-----------+--------------+ ``` #### HTTP API @@ -94,11 +81,8 @@ cortex engines list You can also retrieve the list of engines via the HTTP API: ```sh -curl --location 'http://127.0.0.1:39281/v1/engines' +curl http://127.0.0.1:39281/v1/engines ``` - -Example response: - ```json { "data": [ @@ -119,15 +103,6 @@ Example response: "status": "Ready", "variant": "mac-arm64", "version": "0.1.37" - }, - { - "description": "This extension enables chat completion API calls using the TensorrtLLM engine", - "format": "TensorRT Engines", - "name": "tensorrt-llm", - "productName": "tensorrt-llm", - "status": "Incompatible", - "variant": "", - "version": "" } ], "object": "list", @@ -137,7 +112,7 @@ Example response: ### Getting detail information of an engine -Cortex allows users to retrieve detailed information about a specific engine. This includes supported formats, versions, variants, and status. This feature helps users understand the capabilities and compatibility of the engine they are working with. +Cortex allows users to retrieve detailed information about a specific engine. This includes supported formats, versions, variants, and status. This information helps users understand the capabilities and compatibility of their engines. #### CLI @@ -145,14 +120,15 @@ To retrieve detailed information about an engine using the CLI, use the followin ```sh cortex engines get llama-cpp -+-----------+-------------------+---------+-----------+--------+ -| Name | Supported Formats | Version | Variant | Status | -+-----------+-------------------+---------+-----------+--------+ -| llama-cpp | GGUF | 0.1.37 | mac-arm64 | Ready | -+-----------+-------------------+---------+-----------+--------+ +``` +``` ++---+-----------+---------+----------------------------+-----------+ +| # | Name | Version | Variant | Status | ++---+-----------+---------+----------------------------+-----------+ +| 1 | llama-cpp | v0.1.49 | linux-amd64-avx2-cuda-12-0 | Installed | ++---+-----------+---------+----------------------------+-----------+ ``` -This command will display information such as the engine's name, supported formats, version, variant, and status. #### HTTP API @@ -161,40 +137,33 @@ To retrieve detailed information about an engine using the HTTP API, send a GET ```sh curl --location 'http://127.0.0.1:39281/engines/llama-cpp' ``` - -This request will return a JSON response containing detailed information about the engine, including its description, format, name, product name, status, variant, and version. 
-Example response: - ```json -{ - "description": "This extension enables chat completion API calls using the LlamaCPP engine", - "format": "GGUF", - "name": "llama-cpp", - "productName": "llama-cpp", - "status": "Not Installed", - "variant": "", - "version": "" -} +[ + { + "engine": "llama-cpp", + "name": "linux-amd64-avx2-cuda-12-0", + "version": "v0.1.49" + } +] ``` ### Uninstalling an engine -Cortex provides an easy way to uninstall an engine. This is useful when users want to uninstall the current version and then install the latest stable version of a particular engine. +Cortex provides an easy way to uninstall an engine, which can be useful if you want to have the latest version only +instead of different ones. #### CLI -To uninstall an engine, use the following CLI command: - ```sh cortex engines uninstall llama-cpp ``` #### HTTP API -To uninstall an engine using the HTTP API, send a DELETE request to the appropriate endpoint. - ```sh -curl --location --request DELETE 'http://127.0.0.1:39281/engines/llama-cpp' +curl http://127.0.0.1:39281/v1/engines/llama-cpp/install \ + --request DELETE \ + --header 'Content-Type: application/json' ``` Example response: diff --git a/docs/docs/engines/llamacpp.mdx b/docs/docs/engines/llamacpp.mdx index 2ace67944..dc4948c2f 100644 --- a/docs/docs/engines/llamacpp.mdx +++ b/docs/docs/engines/llamacpp.mdx @@ -3,52 +3,49 @@ title: Llama.cpp description: GGUF Model Format. --- -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: +Cortex leverages `llama.cpp` as its default engine for GGUF models. The example model configuration shown +below illustrates how to configure a GGUF model (in this case DeepSeek's 8B model) with both required and +optional parameters. The configuration includes metadata, inference parameters, and model loading settings +that control everything from basic model identification to advanced generation behavior. Cortex can automatically +generate GGUF models from HuggingFace repositories when a model.yaml file isn't available. -Cortex uses `llama.cpp` as the default engine by default the `GGUF` format is supported by Cortex. - -:::info -Cortex automatically generates any `GGUF` model from the HuggingFace repo that does not have the `model.yaml` file. 
-::: - -## [`model.yaml`](/docs/capabilities/models/model-yaml) Sample ```yaml -## BEGIN GENERAL GGUF METADATA -id: Mistral-Nemo-Instruct-2407 # Model ID unique between models (author / quantization) -model: mistral-nemo # Model ID which is used for request construct - should be unique between models (author / quantization) -name: Mistral-Nemo-Instruct-2407 # metadata.general.name -version: 2 # metadata.version -files: # can be universal protocol (models://) OR absolute local file path (file://) OR https remote URL (https://) - - /home/thuan/cortex/models/mistral-nemo-q8/Mistral-Nemo-Instruct-2407.Q6_K.gguf +# BEGIN GENERAL GGUF METADATA +id: deepseek-r1-distill-llama-8b # Model ID unique between models (author / quantization) +model: deepseek-r1-distill-llama-8b:8b-gguf-q2-k # Model ID which is used for request construct - should be unique between models (author / quantization) +name: deepseek-r1-distill-llama-8b # metadata.general.name +version: 1 +files: # Can be relative OR absolute local file path + - models/cortex.so/deepseek-r1-distill-llama-8b/8b-gguf-q2-k/model.gguf # END GENERAL GGUF METADATA # BEGIN INFERENCE PARAMETERS # BEGIN REQUIRED stop: # tokenizer.ggml.eos_token_id - - + - <|im_end|> + - <|end▁of▁sentence|> # END REQUIRED # BEGIN OPTIONAL +size: 3179134413 stream: true # Default true? -top_p: 0.949999988 # Ranges: 0 to 1 -temperature: 0.699999988 # Ranges: 0 to 1 +top_p: 0.9 # Ranges: 0 to 1 +temperature: 0.7 # Ranges: 0 to 1 frequency_penalty: 0 # Ranges: 0 to 1 presence_penalty: 0 # Ranges: 0 to 1 -max_tokens: 1024000 # Should be default to context length +max_tokens: 4096 # Should be default to context length seed: -1 dynatemp_range: 0 dynatemp_exponent: 1 top_k: 40 -min_p: 0.0500000007 +min_p: 0.05 tfs_z: 1 typ_p: 1 repeat_last_n: 64 repeat_penalty: 1 mirostat: false mirostat_tau: 5 -mirostat_eta: 0.100000001 +mirostat_eta: 0.1 penalize_nl: false ignore_eos: false n_probs: 0 @@ -58,49 +55,59 @@ min_keep: 0 # BEGIN MODEL LOAD PARAMETERS # BEGIN REQUIRED -engine: cortex.llamacpp # engine to run model -prompt_template: "[INST] <>\n{system_message}\n<>\n{prompt}[/INST]" +engine: llama-cpp # engine to run model +prompt_template: <|start_of_text|>{system_message}<|User|>{prompt}<|Assistant|> # END REQUIRED # BEGIN OPTIONAL -ctx_len: 1024000 # llama.context_length | 0 or undefined = loaded from model -ngl: 41 # Undefined = loaded from model +ctx_len: 4096 # llama.context_length | 0 or undefined = loaded from model +n_parallel: 1 +ngl: 34 # Undefined = loaded from model # END OPTIONAL # END MODEL LOAD PARAMETERS - ``` + ## Model Parameters + | **Parameter** | **Description** | **Required** | |------------------------|--------------------------------------------------------------------------------------|--------------| -| `top_p` | The cumulative probability threshold for token sampling. | No | -| `temperature` | Controls the randomness of predictions by scaling logits before applying softmax. | No | -| `frequency_penalty` | Penalizes new tokens based on their existing frequency in the sequence so far. | No | -| `presence_penalty` | Penalizes new tokens based on whether they appear in the sequence so far. | No | -| `max_tokens` | Maximum number of tokens in the output. | No | -| `stream` | Enables or disables streaming mode for the output (true or false). | No | -| `ngl` | Number of attention heads. | No | -| `ctx_len` | Context length (maximum number of tokens). | No | -| `prompt_template` | Template for formatting the prompt, including system messages and instructions. 
| Yes | -| `stop` | Specifies the stopping condition for the model, which can be a word, a letter, or a specific text. | Yes | -| `seed` | Random seed value used to initialize the generation process. | No | -| `dynatemp_range` | Dynamic temperature range used to adjust randomness during generation. | No | -| `dynatemp_exponent` | Exponent used to adjust the effect of dynamic temperature. | No | -| `top_k` | Limits the number of highest probability tokens to consider during sampling. | No | -| `min_p` | Minimum cumulative probability for nucleus sampling. | No | -| `tfs_z` | Top-p frequency selection parameter. | No | -| `typ_p` | Typical sampling probability threshold. | No | -| `repeat_last_n` | Number of tokens to consider for the repetition penalty. | No | -| `repeat_penalty` | Penalty applied to repeated tokens to reduce their likelihood of being selected again. | No | -| `mirostat` | Enables or disables the use of Mirostat algorithm for dynamic temperature adjustment. | No | -| `mirostat_tau` | Target surprise value for Mirostat algorithm. | No | -| `mirostat_eta` | Learning rate for Mirostat algorithm. | No | -| `penalize_nl` | Whether newline characters should be penalized during sampling. | No | -| `ignore_eos` | If true, ignores the end of sequence token, allowing generation to continue indefinitely. | No | -| `n_probs` | Number of top token probabilities to return in the output. | No | -| `min_keep` | Minimum number of tokens to keep during top-k sampling. | No | +| `id` | Unique model identifier including author and quantization | Yes | +| `model` | Model ID used for request construction | Yes | +| `name` | General name metadata for the model | Yes | +| `version` | Model version number | Yes | +| `files` | Path to model GGUF file (relative or absolute) | Yes | +| `stop` | Array of stopping sequences for generation | Yes | +| `engine` | Model execution engine (llama-cpp) | Yes | +| `prompt_template` | Template for formatting the prompt with system message and user input | Yes | +| `size` | Model file size in bytes | No | +| `stream` | Enable streaming output (default: true) | No | +| `top_p` | Nucleus sampling probability threshold (0-1) | No | +| `temperature` | Output randomness control (0-1) | No | +| `frequency_penalty` | Penalty for frequent token usage (0-1) | No | +| `presence_penalty` | Penalty for token presence (0-1) | No | +| `max_tokens` | Maximum output length | No | +| `seed` | Random seed for reproducibility | No | +| `dynatemp_range` | Dynamic temperature range | No | +| `dynatemp_exponent` | Dynamic temperature exponent | No | +| `top_k` | Top-k sampling parameter | No | +| `min_p` | Minimum probability threshold | No | +| `tfs_z` | Tail-free sampling parameter | No | +| `typ_p` | Typical sampling parameter | No | +| `repeat_last_n` | Repetition penalty window | No | +| `repeat_penalty` | Penalty for repeated tokens | No | +| `mirostat` | Enable Mirostat sampling | No | +| `mirostat_tau` | Mirostat target entropy | No | +| `mirostat_eta` | Mirostat learning rate | No | +| `penalize_nl` | Apply penalty to newlines | No | +| `ignore_eos` | Ignore end-of-sequence token | No | +| `n_probs` | Number of probability outputs | No | +| `min_keep` | Minimum tokens to retain | No | +| `ctx_len` | Context window size | No | +| `n_parallel` | Number of parallel instances | No | +| `ngl` | Number of GPU layers | No | \ No newline at end of file +::: --> diff --git a/docs/docs/engines/onnx.mdx b/docs/docs/engines/onnx.mdx index 370aa1e53..9414ef537 100644 --- 
a/docs/docs/engines/onnx.mdx +++ b/docs/docs/engines/onnx.mdx @@ -5,7 +5,7 @@ unlisted: true --- :::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. +🚧 Cortex is currently under active development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. ::: Cortex uses `onnxruntime-genai` with DirectML to provide GPU acceleration for AMD, Intel, NVIDIA, and Qualcomm GPUs. @@ -14,11 +14,14 @@ Cortex uses `onnxruntime-genai` with DirectML to provide GPU acceleration for AM ```bash ## Initialize the ONNX engine cortex engines onnx init +``` ## Run an ONNX model +```sh cortex run openhermes-2.5:7b-onnx ``` -## [`model.yaml`](/docs/capabilities/models/model-yaml) Sample + +## `model.yaml` Sample ```yaml name: openhermes-2.5 model: openhermes @@ -33,7 +36,7 @@ top_p: 1.0 temperature: 1.0 frequency_penalty: 0 presence_penalty: 0 -max_tokens: 2048 +max_tokens: 2048 stream: true # true | false ``` @@ -58,4 +61,4 @@ stream: true # true | false You can download a `ONNX` model from the following: - [Cortex Model Repos](/docs/capabilities/models/sources/cortex-hub) - [HuggingFace Model Repos](/docs/capabilities/models/sources/hugging-face) -::: --> \ No newline at end of file +::: --> diff --git a/docs/docs/engines/python-engine.mdx b/docs/docs/engines/python-engine.mdx new file mode 100644 index 000000000..64996406d --- /dev/null +++ b/docs/docs/engines/python-engine.mdx @@ -0,0 +1,246 @@ +--- +title: Python Engine +description: Interface for running Python processes through Cortex +--- + +:::warning +🚧 Cortex.cpp is currently under active development. Our documentation outlines the intended +behavior of Cortex, which may not yet be fully implemented in the codebase. +::: + +The Python Engine manages Python processes that run models via Cortex. Each Python program is treated as +a model with its own `model.yml` configuration template. All requests are routed through Cortex using HTTP. + +## Python Engine Implementation + +The Python Engine is implemented as a C++ package called [EngineI](/docs/engines/engine-extension). It exposes these core methods: + +- `LoadModel`: Starts Python process and loads model +- `UnloadModel`: Stops process and unloads model +- `GetModelStatus`: Health check for running processes +- `GetModels`: Lists active Python models + +Additional methods: +- `HandleInference`: Routes inference requests to Python process +- `HandleRouteRequest`: Routes arbitrary requests to Python process + +The Python Engine is built into Cortex.cpp and loads automatically when needed. 
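+These engine methods are not called directly; everything goes through Cortex's HTTP API. As a rough
+illustration only (a minimal sketch, assuming the default local server at `127.0.0.1:39281` and the
+endpoints shown in the sections below), a client drives the lifecycle of a Python model like this:
+
+```python
+import requests
+
+BASE = "http://127.0.0.1:39281/v1"       # default Cortex API server (assumed)
+MODEL = "ichigo-0.5:fp16-linux-amd64"    # example model id used later on this page
+
+# LoadModel  -> POST /v1/models/start
+requests.post(f"{BASE}/models/start", json={"model": MODEL})
+
+# GetModelStatus -> GET /v1/models/status/<model>
+print(requests.get(f"{BASE}/models/status/{MODEL}").json())
+
+# HandleInference -> POST /v1/inference (the "body" field is forwarded to the Python process)
+reply = requests.post(f"{BASE}/inference", json={
+    "model": MODEL,
+    "engine": "python-engine",
+    "body": {"messages": [{"role": "user", "content": "hello"}]},
+})
+print(reply.json())
+
+# UnloadModel -> POST /v1/models/stop
+requests.post(f"{BASE}/models/stop", json={"model": MODEL})
+```
+
+The curl equivalents of each call are shown under Model Management and Inference below.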
+ +## Model Configuration + +Each Python model requires a `model.yml` configuration file: + +```yaml +id: ichigo-0.5:fp16-linux-amd64 +model: ichigo-0.5:fp16-linux-amd64 +name: Ichigo Wrapper +version: 1 + +port: 22310 +script: src/app.py +log_path: ichigo-wrapper.log +log_level: INFO +command: + - python +files: + - /home/thuan/cortexcpp/models/cortex.so/ichigo-0.5/fp16-linux-amd64 +depends: + - ichigo-0.4:8b-gguf-q4-km + - whispervq:fp16-linux-amd64 + - fish-speech:fp16-linux-amd64 +engine: python-engine +extra_params: + device_id: 0 + fish_speech_port: 22312 + ichigo_model: ichigo-0.4:8b-gguf-q4-km + ichigo_port: 39281 + whisper_port: 3348 +``` + +| **Parameter** | **Description** | **Required** | +|-----------------|-----------------------------------------------------------------------------------------------------------|--------------| +| `id` | Unique identifier for the model, typically includes version and platform information. | Yes | +| `model` | Specifies the variant of the model, often denoting size or quantization details. | Yes | +| `name` | The human-readable name for the model, used as the `model_id`. | Yes | +| `version` | The specific version number of the model. | Yes | +| `port` | The network port on which the Python program will listen for requests. | Yes | +| `script` | Path to the main Python script to be executed by the engine. This is relative path to the model folder | Yes | +| `log_path` | File location where logs will be stored for the Python program's execution. log_path is relative path of cortex data folder | No | +| `log_level` | The level of logging detail (e.g., INFO, DEBUG). | No | +| `command` | The command used to launch the Python program, typically starting with 'python'. | Yes | +| `files` | For python models, the files is the path to folder contains all python scripts, model binary and environment to run the program | No | +| `depends` | Dependencies required by the model, specified by their identifiers. The dependencies are other models | No | +| `engine` | Specifies the engine to use, which in this context is 'python-engine'. | Yes | +| `extra_params` | Additional parameters passed to the Python script at runtime | No | + +## Example: Ichigo Python Model + +[Ichigo python](https://github.com/janhq/ichigo) is a built-in Cortex model for chat with audio support. 
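+Because the configuration above declares other models under `depends`, it helps to confirm those are
+downloaded before starting the wrapper. A minimal sketch, assuming the default local server and that
+the `/v1/models` listing returns a `data` array whose entries carry a `model` id (the exact response
+shape may differ between versions):
+
+```python
+import requests
+
+BASE = "http://127.0.0.1:39281/v1"   # default Cortex API server (assumed)
+
+# Dependencies declared in the model.yml above
+depends = [
+    "ichigo-0.4:8b-gguf-q4-km",
+    "whispervq:fp16-linux-amd64",
+    "fish-speech:fp16-linux-amd64",
+]
+
+listed = requests.get(f"{BASE}/models").json().get("data", [])
+available = {entry.get("model") for entry in listed}
+missing = [m for m in depends if m not in available]
+print("missing dependencies:", missing or "none")
+```
+
+The next section shows how to pull each of these models with curl.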
+ +### Required Models + +Ichigo requires these models: + +- ichigo-0.5 +- whispervq +- ichigo-0.4 +- fish-speech (optional, for text-to-speech) + +Download models for your platform (example for Linux AMD64): + +```sh +curl --location '127.0.0.1:39281/v1/models/pull' \ + --header 'Content-Type: application/json' \ + --data '{"model":"ichigo-0.5:fp16-linux-amd64"}' + +curl --location '127.0.0.1:39281/v1/models/pull' \ + --header 'Content-Type: application/json' \ + --data '{"model":"ichigo-0.4:8b-gguf-q4-km"}' + +curl --location '127.0.0.1:39281/v1/models/pull' \ + --header 'Content-Type: application/json' \ + --data '{"model":"whispervq:fp16-linux-amd64"}' + +curl --location '127.0.0.1:39281/v1/models/pull' \ + --header 'Content-Type: application/json' \ + --data '{"model":"fish-speech:fp16-linux-amd64"}' +``` + +### Model Management + +Start model: +```sh +curl --location '127.0.0.1:39281/v1/models/start' \ +--header 'Content-Type: application/json' \ +--data '{"model":"ichigo-0.5:fp16-linux-amd64"}' +``` + +Check status: +```sh +curl --location '127.0.0.1:39281/v1/models/status/fish-speech:fp16-linux-amd64' +``` + +Stop model: +```sh +curl --location '127.0.0.1:39281/v1/models/stop' \ +--header 'Content-Type: application/json' \ +--data '{"model":"ichigo-0.5:fp16-linux-amd64"}' +``` + +### Inference + +Example inference request: +```sh +curl --location '127.0.0.1:39281/v1/inference' \ +--header 'Content-Type: application/json' \ +--data '{ + "model":"ichigo-0.5:fp16-linux-amd64", + "engine":"python-engine", + "body":{ + "messages": [{ + "role":"system", + "content":"you are helpful assistant, you must answer questions short and concil!" + }], + "input_audio": { + "data": "base64_encoded_audio_data", + "format": "wav" + }, + "model": "ichigo-0.4:8b-gguf-q4km", + "stream": true, + "temperature": 0.7, + "top_p": 0.9, + "max_tokens": 2048, + "presence_penalty": 0, + "frequency_penalty": 0, + "stop": ["<|eot_id|>"], + "output_audio": true + } +}' +``` + +### Route Requests + +Generic request routing example: +```sh +curl --location '127.0.0.1:39281/v1/route/request' \ +--header 'Content-Type: application/json' \ +--data '{ + "model":"whispervq:fp16", + "path":"/inference", + "engine":"python-engine", + "method":"post", + "transform_response":"{ {%- set first = true -%} {%- for key, value in input_request -%} {%- if key == \"tokens\" -%} {%- if not first -%},{%- endif -%} \"{{ key }}\": {{ tojson(value) }} {%- set first = false -%} {%- endif -%} {%- endfor -%} }", + "body": { + "data": "base64 data", + "format": "wav" + } +}' +``` + +## Adding New Python Models + +### Implementation Requirements + +Python models must expose at least two endpoints: +- `/health`: Server status check +- `/inference`: Model inference + +Example server implementation: + +```python +import argparse +import os +import sys +from pathlib import Path +from contextlib import asynccontextmanager +from typing import AsyncGenerator, List +import uvicorn +from dotenv import load_dotenv +from fastapi import APIRouter, FastAPI +from common.utility.logger_utility import LoggerUtility +from services.audio.audio_controller import AudioController +from services.audio.implementation.audio_service import AudioService +from services.health.health_controller import HealthController + +def create_app() -> FastAPI: + routes: List[APIRouter] = [ + HealthController(), + AudioController() + ] + app = FastAPI() + for route in routes: + app.include_router(route) + return app + +def parse_argument(): + parser = 
argparse.ArgumentParser(description="Ichigo-wrapper Application") + parser.add_argument('--log_path', type=str, default='Ichigo-wrapper.log', help='The log file path') + parser.add_argument('--log_level', type=str, default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'TRACE']) + parser.add_argument('--port', type=int, default=22310) + parser.add_argument('--device_id', type=str, default="0") + parser.add_argument('--package_dir', type=str, default="") + parser.add_argument('--whisper_port', type=int, default=3348) + parser.add_argument('--ichigo_port', type=int, default=39281) + parser.add_argument('--fish_speech_port', type=int, default=22312) + parser.add_argument('--ichigo_model', type=str, default="ichigo:8b-gguf-q4-km") + return parser.parse_args() + +if __name__ == "__main__": + args = parse_argument() + LoggerUtility.init_logger(__name__, args.log_level, args.log_path) + env_path = Path(os.path.dirname(os.path.realpath(__file__))) / "variables" / ".env" + AudioService.initialize(args.whisper_port, args.ichigo_port, args.fish_speech_port, args.ichigo_model) + load_dotenv(dotenv_path=env_path) + app = create_app() + print("Server is running at: 0.0.0.0:", args.port) + uvicorn.run(app=app, host="0.0.0.0", port=args.port) +``` + +### Deployment + +1. Create model files following the example above +2. Add required `requirements.txt` and `requirements.cuda.txt` files +3. Trigger the [Python Script Package CI](https://github.com/janhq/cortex.cpp/actions/workflows/python-script-package.yml) +4. Trigger the [Python Venv Package CI](https://github.com/janhq/cortex.cpp/actions/workflows/python-venv-package.yml) + +The CIs will build and publish your model to Hugging Face where it can then be downloaded and used. diff --git a/docs/docs/engines/tensorrt-llm.mdx b/docs/docs/engines/tensorrt-llm.mdx index 94a3d3875..f3dfd6aff 100644 --- a/docs/docs/engines/tensorrt-llm.mdx +++ b/docs/docs/engines/tensorrt-llm.mdx @@ -5,19 +5,42 @@ unlisted: true --- :::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. +🚧 Cortex is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. ::: Cortex uses the `tensorrt-llm` inference library for NVIDIA GPUs acceleration. -## Run Model +Download the Engine + ```bash -## Initialize the TensorRT-LLM engine -cortex engines tensorrt-llm init +cortex engines install tensorrt-llm +``` +``` +tensorrt-llm 100%[========================] 00m:00s 1.09 GB/1.09 GB +cuda 100%[========================] 00m:00s 346.61 MB/346.61 MB +Engine tensorrt-llm downloaded successfully! +``` -## Run a TensorRT-LLM model -cortex run openhermes-2.5:7b-tensorrt-llm +Load TensorRT-LLM Engine + +```bash +cortex engines load tensorrt-llm ``` + +To run a model using the `tensorrt-llm` engine, you will have to specify the parti +```bash +cortex run mistral:7b-tensorrt-llm-linux-ada +``` +``` +Start downloading.. +config.json 100%[========================] 00m:00s 5.92 KB/5.92 KB +model.yml 100%[========================] 00m:00s 445.00 B/445.00 B +rank0.engine 89%[=====================> ] 01m:13s 3.49 GB/3.88 GB +tokenizer.model 100%[========================] 00m:00s 573.64 KB/573.64 KB +Model mistral:7b-tensorrt-llm-linux-ada downloaded successfully! 
+``` + + ## [`model.yaml`](/docs/capabilities/models/model-yaml) Sample ```yaml name: Openhermes-2.5 7b Linux Ada @@ -69,4 +92,4 @@ You can download a `TensorRT-LLM` model from the following: - [Cortex Model Repos](/docs/capabilities/models/sources/cortex-hub) - [HuggingFace Model Repos](/docs/capabilities/models/sources/hugging-face) - Nvidia Catalog (Coming Soon!) -::: --> \ No newline at end of file +::: --> diff --git a/docs/docs/guides/function-calling.md b/docs/docs/guides/function-calling.md index d37911935..387cf9b89 100644 --- a/docs/docs/guides/function-calling.md +++ b/docs/docs/guides/function-calling.md @@ -1,44 +1,50 @@ --- -title: Function Calling +title: OpenAI-Compatible Function Calling --- -# Function calling with OpenAI compatible -This tutorial, I use the `mistral-nemo:12b-gguf-q4-km` for testing function calling with cortex.cpp. All steps are reproduced from original openai instruction https://platform.openai.com/docs/guides/function-calling +# Function Calling with Cortex.cpp -## Step by step with function calling +This guide demonstrates how to use function calling capabilities with Cortex.cpp that are compatible with the OpenAI API specification. We'll use the `mistral-nemo:12b-gguf-q4-km` model for these examples, following similar patterns to the [OpenAI function calling documentation](https://platform.openai.com/docs/guides/function-calling). -### 1. Start server and run model. +## Implementation Guide -``` -cortex run mistral-nemo:12b-gguf-q4-km -``` +### 1. Start the Server -### 2. Create a python script `function_calling.py` with this content: +First, launch the Cortex server with your chosen model: +```sh +cortex run -d llama3.1:8b-gguf-q4-km ``` + +### 2. Initialize the Python Client + +Create a new Python script named `function_calling.py` and set up the OpenAI client: + +```py from datetime import datetime from openai import OpenAI from pydantic import BaseModel -ENDPOINT = "http://localhost:39281/v1" -MODEL = "mistral-nemo:12b-gguf-q4-km" +import json + +MODEL = "llama3.1:8b-gguf-q4-km" + client = OpenAI( - base_url=ENDPOINT, - api_key="not-needed" + base_url="http://localhost:39281/v1", + api_key="not-needed" # Authentication is not required for local deployment ) ``` -This step creates OpenAI client in python +### 3. Implement Function Calling -### 3. Start create a chat completion with tool calling +Define your function schema and create a chat completion: -``` +```py tools = [ { "type": "function", "function": { "name": "get_delivery_date", - - "strict": True, + "strict": True, "description": "Get the delivery date for a customer's order. Call this whenever you need to know the delivery date, for example when a customer asks 'Where is my package'", "parameters": { "type": "object", @@ -54,237 +60,178 @@ tools = [ } } ] + completion_payload = { "messages": [ {"role": "system", "content": "You are a helpful customer support assistant. 
Use the supplied tools to assist the user."}, {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"}, ] } + response = client.chat.completions.create( top_p=0.9, temperature=0.6, - model=MODEL, + model="llama3.1:8b-gguf-q4-km", messages=completion_payload["messages"], tools=tools, ) -print(response) ``` -Because you didn't provide the `order_id`, the model will ask again +Since no `order_id` was provided, the model will request it: -``` +```sh +# Example Response ChatCompletion( - id='1lblzWtLw9h5HG0GjYYi', - choices=[ - Choice( - finish_reason=None, - index=0, - logprobs=None, - message=ChatCompletionMessage( - content='Of course! Please provide your order ID so I can look it up.', - refusal=None, - role='assistant', - audio=None, - function_call=None, - tool_calls=None - ) - ) - ], - created=1730204306, - model='_', - object='chat.completion', - service_tier=None, - system_fingerprint='_', - usage=CompletionUsage( - completion_tokens=15, - prompt_tokens=449, - total_tokens=464, - completion_tokens_details=None, - prompt_tokens_details=None - ) + id='54yeEjbaFbldGfSPyl2i', + choices=[ + Choice( + finish_reason='tool_calls', + index=0, + logprobs=None, + message=ChatCompletionMessage( + content='', + refusal=None, + role='assistant', + audio=None, + function_call=None, + tool_calls=[ + ChatCompletionMessageToolCall( + id=None, + function=Function(arguments='{"order_id": "12345"}', name='get_delivery_date'), + type='function' + ) + ] + ) + ) + ], + created=1738543890, + model='_', + object='chat.completion', + service_tier=None, + system_fingerprint='_', + usage=CompletionUsage( + completion_tokens=16, + prompt_tokens=443, + total_tokens=459, + completion_tokens_details=None, + prompt_tokens_details=None + ) ) ``` -### 4. Add new message user provide order id +### 4. Handle User Input -``` +Once the user provides their order ID: + +```python completion_payload = { "messages": [ {"role": "system", "content": "You are a helpful customer support assistant. Use the supplied tools to assist the user."}, {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"}, {"role": "assistant", "content": "Of course! Please provide your order ID so I can look it up."}, - {"role": "user", "content": "i think it is order_12345"}, + {"role": "user", "content": "i think it is order_70705"}, ] } response = client.chat.completions.create( - top_p=0.9, - temperature=0.6, - model=MODEL, + model="llama3.1:8b-gguf-q4-km", messages=completion_payload["messages"], - tools=tools -) -``` - -The response of the model will be - -``` -ChatCompletion( - id='zUnHwEPCambJtrvWOAQy', - choices=[ - Choice( - finish_reason='tool_calls', - index=0, - logprobs=None, - message=ChatCompletionMessage( - content='', - refusal=None, - role='assistant', - audio=None, - function_call=None, - tool_calls=[ - ChatCompletionMessageToolCall( - id=None, - function=Function( - arguments='{"order_id": "order_12345"}', - name='get_delivery_date' - ), - type='function' - ) - ] - ) - ) - ], - created=1730204559, - model='_', - object='chat.completion', - service_tier=None, - system_fingerprint='_', - usage=CompletionUsage( - completion_tokens=23, - prompt_tokens=483, - total_tokens=506, - completion_tokens_details=None, - prompt_tokens_details=None - ) + tools=tools, + temperature=0.6, + top_p=0.9 ) ``` -It can return correct function with arguments +### 5. Process Function Results -### 5. 
Push the response to the conversation and ask model to answer user +Handle the function call response and generate the final answer: -``` +```python +# Simulate function execution order_id = "order_12345" delivery_date = datetime.now() -# Simulate the tool call response -response = { - "choices": [ - { - "message": { - "role": "assistant", - "tool_calls": [ - { - "id": "call_62136354", - "type": "function", - "function": { - "arguments": "{'order_id': 'order_12345'}", - "name": "get_delivery_date" - } - } - ] - } - } - ] -} - -# Create a message containing the result of the function call function_call_result_message = { "role": "tool", "content": json.dumps({ "order_id": order_id, "delivery_date": delivery_date.strftime('%Y-%m-%d %H:%M:%S') }), - "tool_call_id": response['choices'][0]['message']['tool_calls'][0]['id'] -} - -# Prepare the chat completion call payload -completion_payload = { - "messages": [ - {"role": "system", "content": "You are a helpful customer support assistant. Use the supplied tools to assist the user."}, - {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"}, - {"role": "assistant", "content": "Sure! Could you please provide your order ID so I can look up the delivery date for you?"}, - {"role": "user", "content": "i think it is order_12345"}, - response["choices"][0]["message"], - function_call_result_message - ] + "tool_call_id": "call_62136354" } -client = OpenAI( - # This is the default and can be omitted - base_url=ENDPOINT, - api_key="not-needed" -) - +final_messages = completion_payload["messages"] + [ + { + "role": "assistant", + "tool_calls": [{ + "id": "call_62136354", + "type": "function", + "function": { + "arguments": "{'order_id': 'order_12345'}", + "name": "get_delivery_date" + } + }] + }, + function_call_result_message +] +``` +```py response = client.chat.completions.create( - top_p=0.9, - temperature=0.6, - model=MODEL, - messages=completion_payload["messages"], + model="llama3.1:8b-gguf-q4-km", + messages=final_messages, tools=tools, + temperature=0.6, + top_p=0.9 ) print(response) ``` - -The response will include all the content that processed by the function, where the delivery date is produced by query db, .... - -``` +```sh ChatCompletion( - id='l1xdCuKVMYBSC5tEDlAn', - choices=[ - Choice( - finish_reason=None, - index=0, - logprobs=None, - message=ChatCompletionMessage( - content="Your order with ID 'order_12345' is scheduled to be delivered on October 29, 2024. 
Is there anything else I can help you with?", - refusal=None, - role='assistant', - audio=None, - function_call=None, - tool_calls=None - ) - ) - ], - created=1730205470, - model='_', - object='chat.completion', - service_tier=None, - system_fingerprint='_', - usage=CompletionUsage( - completion_tokens=40, - prompt_tokens=568, - total_tokens=608, - completion_tokens_details=None, - prompt_tokens_details=None - ) + id='UMIoW4aNrqKXW2DR1ksX', + choices=[ + Choice( + finish_reason='stop', + index=0, + logprobs=None, + message=ChatCompletionMessage( + content='The delivery date for your order (order_12345) is February 3, 2025 at 11:53 AM.', + refusal=None, + role='assistant', + audio=None, + function_call=None, + tool_calls=None + ) + ) + ], + created=1738544037, + model='_', + object='chat.completion', + service_tier=None, + system_fingerprint='_', + usage=CompletionUsage( + completion_tokens=27, + prompt_tokens=535, + total_tokens=562, + completion_tokens_details=None, + prompt_tokens_details=None + ) ) ``` -## Handling parallel function calling +## Advanced Features -Cortex cpp support parallel function calling by default +### Parallel Function Calls -``` +Cortex.cpp supports calling multiple functions simultaneously: + +```python tools = [ { "type": "function", "function": { "name": "get_delivery_date", - - "strict": True, - "description": "Get the delivery date for a customer's order. Call this whenever you need to know the delivery date, for example when a customer asks 'Where is my package'", + "strict": True, + "description": "Get the delivery date for a customer's order.", "parameters": { "type": "object", "properties": { @@ -302,7 +249,7 @@ tools = [ "type": "function", "function": { "name": "get_current_conditions", - "description": "Get the current weather conditions for a specific location", + "description": "Get the current weather conditions for a location", "parameters": { "type": "object", "properties": { @@ -312,8 +259,7 @@ tools = [ }, "unit": { "type": "string", - "enum": ["Celsius", "Fahrenheit"], - "description": "The temperature unit to use. Infer this from the user's location." 
+ "enum": ["Celsius", "Fahrenheit"] } }, "required": ["location", "unit"] @@ -321,127 +267,56 @@ tools = [ } } ] - -messages = [ - {"role": "user", "content": "Hi, can you tell me the delivery date for my order order_12345 and check the weather condition in LA?"} -] -response = client.chat.completions.create( - top_p=0.9, - temperature=0.6, - model=MODEL, - messages= messages, - tools=tools -) -print(response) ``` -It will call 2 functions in parallel +### Controlling Function Execution -``` -ChatCompletion( - id='5ot3qux399DojubnBFrG', - choices=[ - Choice( - finish_reason='tool_calls', - index=0, - logprobs=None, - message=ChatCompletionMessage( - content='', - refusal=None, - role='assistant', - audio=None, - function_call=None, - tool_calls=[ - ChatCompletionMessageToolCall( - id=None, - function=Function( - arguments='{"order_id": "order_12345"}', - name='get_delivery_date' - ), - type='function' - ), - ChatCompletionMessageToolCall( - id=None, - function=Function( - arguments='{"location": "LA", "unit": "Fahrenheit"}', - name='get_current_conditions' - ), - type='function' - ) - ] - ) - ) - ], - created=1730205975, - model='_', - object='chat.completion', - service_tier=None, - system_fingerprint='_', - usage=CompletionUsage( - completion_tokens=47, - prompt_tokens=568, - total_tokens=615, - completion_tokens_details=None, - prompt_tokens_details=None - ) -) -``` - -## Configuring function calling behavior using the tool_choice parameter - -User can set `tool_choice=none` to disable function calling even if the tools are provided +You can control function calling behavior using the `tool_choice` parameter: -``` +```python +# Disable function calling response = client.chat.completions.create( - top_p=0.9, - temperature=0.6, model=MODEL, - messages= messages, #completion_payload["messages"], + messages=messages, tools=tools, tool_choice="none" ) -``` - -User can also force model to call a tool by specify the tool name, in this example it's the `get_current_conditions` -``` +# Force specific function response = client.chat.completions.create( - top_p=0.9, - temperature=0.6, model=MODEL, - messages= [{"role": "user", "content": "Hi, can you tell me the delivery date for my order order_12345 and check the weather condition in LA?"}], + messages=messages, tools=tools, - tool_choice= {"type": "function", "function": {"name": "get_current_conditions"}}) - + tool_choice={"type": "function", "function": {"name": "get_current_conditions"}} +) ``` -User can also specify the function with enum field to the tool definition to make model generate more accurate. +### Enhanced Function Definitions -``` +Use enums to improve function accuracy: + +```json { "name": "pick_tshirt_size", - "description": "Call this if the user specifies which size t-shirt they want", + "description": "Handle t-shirt size selection", "parameters": { "type": "object", "properties": { "size": { "type": "string", "enum": ["s", "m", "l"], - "description": "The size of the t-shirt that the user would like to order" + "description": "T-shirt size selection" } }, - "required": ["size"], - "additionalProperties": false + "required": ["size"] } } ``` -(*) Note that the accuracy of function calling heavily depends on the quality of the model. For small models like 8B or 12B, we should only use function calling with simple cases. - - The function calling feature from cortex.cpp is primarily an application of prompt engineering. When tools are specified, we inject a system prompt into the conversation to facilitate this functionality. 
- - Compatibility: This feature works best with models like llama3.1 and its derivatives, such as mistral-nemo or qwen. - - Customization: Users have the option to manually update the system prompt to fine-tune it for specific problems or use cases. The detail implementation is in this [PR](https://github.com/janhq/cortex.cpp/pull/1472/files). +## Important Notes - The full steps to mimic the function calling feature in Python using openai lib can be found [here](https://github.com/janhq/models/issues/16#issuecomment-2381129322). +- Function calling accuracy depends on model quality. Smaller models (8B-12B) work best with simple use cases. +- Cortex.cpp implements function calling through prompt engineering, injecting system prompts when tools are specified. +- Best compatibility with llama3.1 and derivatives (mistral-nemo, qwen) +- System prompts can be customized for specific use cases (see [implementation details](https://github.com/janhq/cortex.cpp/pull/1472/files)) +- For complete implementation examples, refer to our [detailed guide](https://github.com/janhq/models/issues/16#issuecomment-2381129322) diff --git a/docs/docs/guides/structured-outputs.md b/docs/docs/guides/structured-outputs.md index 1fe3f789b..2f594bb1e 100644 --- a/docs/docs/guides/structured-outputs.md +++ b/docs/docs/guides/structured-outputs.md @@ -1,188 +1,170 @@ --- title: Structured Outputs --- -# Structured Outputs -Structured outputs, or response formats, are a feature designed to generate responses in a defined JSON schema, enabling more predictable and machine-readable outputs. This is essential for applications where data consistency and format adherence are crucial, such as automated data processing, structured data generation, and integrations with other systems. +This guide demonstrates methods for getting structured JSON output from locally-hosted language models +like Llama and Mistral. We'll cover techniques for generating predictable data structures using open source LLMs. -In recent developments, systems like OpenAI's models have excelled at producing these structured outputs. However, while open-source models like Llama 3.1 and Mistral Nemo offer powerful capabilities, they currently struggle to produce reliably structured JSON outputs required for advanced use cases. +## Start the model -This guide explores the concept of structured outputs using these models, highlights the challenges faced in achieving consistent output formatting, and provides strategies for improving output accuracy, particularly when using models that don't inherently support this feature as robustly as GPT models. - -By understanding these nuances, users can make informed decisions when choosing models for tasks requiring structured outputs, ensuring that the tools they select align with their project's formatting requirements and expected accuracy. +```sh +cortex run -d llama3.1:8b-gguf-q4-km +``` +``` +llama3.1:8b-gguf-q4-km model started successfully. Use `cortex run llama3.1:8b-gguf-q4-km` for interactive chat shell +``` -The Structured Outputs/Response Format feature in [OpenAI](https://platform.openai.com/docs/guides/structured-outputs) is fundamentally a prompt engineering challenge. While its goal is to use system prompts to generate JSON output matching a specific schema, popular open-source models like Llama 3.1 and Mistral Nemo struggle to consistently generate exact JSON output that matches the requirements. 
An easy way to directly guild the model to reponse in json format in system message, you just need to pass the pydantic model to `response_format`: +## Basic Example: Calendar Event -``` +```python from pydantic import BaseModel from openai import OpenAI import json -ENDPOINT = "http://localhost:39281/v1" -MODEL = "llama3.1:8b-gguf-q4-km" - +``` +```py client = OpenAI( - base_url=ENDPOINT, + base_url="http://localhost:39281/v1", api_key="not-needed" ) - class CalendarEvent(BaseModel): name: str date: str participants: list[str] - - +``` +```py completion = client.beta.chat.completions.parse( - model=MODEL, + model="llama3.1:8b-gguf-q4-km", messages=[ - {"role": "system", "content": "Extract the event information."}, - {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + {"role": "system", "content": "Extract the event info as JSON"}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday"} ], response_format=CalendarEvent, stop=["<|eot_id|>"] ) - -event = completion.choices[0].message.parsed - -print(json.dumps(event.dict(), indent=4)) -``` - -The output of the model like this - +print(json.dumps(completion.choices[0].message.parsed.dict(), indent=2)) ``` +```json { - "name": "science fair", - "date": "Friday", - "participants": [ - "Alice", - "Bob" - ] + "name": "science fair", + "date": "Friday", + "participants": ["Alice", "Bob"] } ``` -With more complex json format, llama3.1 still struggle to response correct answer: - -``` - -from openai import OpenAI -from pydantic import BaseModel -import json -ENDPOINT = "http://localhost:39281/v1" -MODEL = "llama3.1:8b-gguf-q4-km" -client = OpenAI( - base_url=ENDPOINT, - api_key="not-needed" -) - -format = { - "steps": [{ - "explanation": "string", - "output": "string" - } - ], - "final_output": "string" -} - -completion_payload = { - "messages": [ - {"role": "system", "content": f"You are a helpful math tutor. Guide the user through the solution step by step. You have to response in this json format {format}\n"}, - {"role": "user", "content": "how can I solve 8x + 7 = -23"} - ] -} +## Complex Example: Math Steps +Let's try something more complex with nested schemas. Here's structured math reasoning: +```py class Step(BaseModel): explanation: str output: str - class MathReasoning(BaseModel): steps: list[Step] final_answer: str - - -response = client.beta.chat.completions.parse( - top_p=0.9, - temperature=0.6, - model=MODEL, - messages=completion_payload["messages"], - stop=["<|eot_id|>"], - response_format=MathReasoning -) - -math_reasoning = response.choices[0].message.parsed -print(json.dumps(math_reasoning.dict(), indent=4)) ``` - -The output of model looks like this - -``` -{ - "steps": [ - { - "explanation": "To isolate the variable x, we need to get rid of the constant term on the left-hand side. We can do this by subtracting 7 from both sides of the equation.", - "output": "8x + 7 - 7 = -23 - 7" - }, - { - "explanation": "Simplifying the left-hand side, we get:", - "output": "8x = -30" - }, +```py +response = client.beta.chat.completions.parse( + model="llama3.1:8b-gguf-q4-km", + messages=[ { - "explanation": "Now, to solve for x, we need to isolate it by dividing both sides of the equation by 8.", - "output": "8x / 8 = -30 / 8" + "role": "system", + "content": "Solve this math problem step by step. Output as JSON." 
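+            # response_format=MathReasoning (below) supplies the nested JSON schema; this system prompt reinforces the JSON-only requirement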
}, { - "explanation": "Simplifying the right-hand side, we get:", - "output": "x = -3.75" + "role": "user", + "content": "how can I solve in a lot of detail, the equation 8x + 7 = -23" } ], - "final_answer": "There is no final answer yet, let's break it down step by step." + response_format=MathReasoning, + stop=["<|eot_id|>"] +) +print(json.dumps(response.choices[0].message.parsed.model_dump(), indent=2)) +``` +```json +{ + "steps": [ + { + "explanation": "The given equation is 8x + 7 = -23. To isolate x, we need to get rid of the constant term (+7) on the left side.", + "output": "" + }, + { + "explanation": "We can subtract 7 from both sides of the equation to get: 8x = -30", + "output": "8x = -30" + }, + { + "explanation": "Now, we need to isolate x. To do this, we'll divide both sides of the equation by 8.", + "output": "" + }, + { + "explanation": "Dividing both sides by 8 gives us: x = -3.75", + "output": "x = -3.75" + }, + { + "explanation": "However, looking back at the original problem, we see that it's asking for the value of x in the equation 8x + 7 = -23.", + "output": "" + }, + { + "explanation": "We can simplify this further by converting the decimal to a fraction.", + "output": "" + }, + { + "explanation": "The decimal -3.75 is equivalent to -15/4. Therefore, x = -15/4", + "output": "x = -15/4" + } + ], + "final_answer": "x = -3" } ``` -Even if the model can generate correct format but the information doesn't 100% accurate, the `final_answer` should be `-3.75` instead of `There is no final answer yet, let's break it down step by step.`. +## Quick JSON Lists -Another usecase for structured output with json response, you can provide the `response_format={"type" : "json_object"}`, the model will be force to generate json output. +For straightforward lists, you can use the json_object response format: -``` -json_format = {"song_name":"release date"} +```py completion = client.chat.completions.create( - model=MODEL, + model="llama3.1:8b-gguf-q4-km", messages=[ - {"role": "system", "content": f"You are a helpful assistant, you must reponse with this format: '{json_format}'"}, - {"role": "user", "content": "List 10 songs for me"} + { + "role": "system", + "content": "List songs in {song_name: release_year} format" + }, + { + "role": "user", + "content": "List 10 songs" + } ], response_format={"type": "json_object"}, stop=["<|eot_id|>"] ) - -print(json.dumps(json.loads(completion.choices[0].message.content), indent=4)) +print(json.dumps(json.loads(completion.choices[0].message.content), indent=2)) ``` -The output will looks like this: - -``` +Output: +```json { - "Happy": "2013", - "Uptown Funk": "2014", - "Shut Up and Dance": "2014", - "Can't Stop the Feeling!": "2016", - "We Found Love": "2011", - "All About That Bass": "2014", - "Radioactive": "2012", - "SexyBack": "2006", - "Crazy": "2007", - "Viva la Vida": "2008" + "Hotel California": 1976, + "Stairway to Heaven": 1971, + "Bohemian Rhapsody": 1975, + "Smells Like Teen Spirit": 1991, + "Viva la Vida": 2008, + "Imagine": 1971, + "Hotel Yorba": 2001, + "Mr. Brightside": 2004, + "Sweet Child O Mine": 1987, + "Livin on a Prayer": 1986 } ``` -## Limitations of Open-Source Models for Structured Outputs +## Pro Tips -While the concept of structured outputs is compelling, particularly for applications requiring machine-readable data, it's important to understand that not all models support this capability equally. 
Open-source models such as Llama 3.1 and Mistral Nemo face notable challenges in generating outputs that adhere strictly to defined JSON schemas. Here are the key limitations: +Open source models have come a long way with structured outputs. A few things to keep in mind: -- Lack of Training Data: These models have not been specifically trained on tasks demanding precise JSON formatting, unlike some proprietary models which have been fine-tuned for such tasks. -- Inconsistency in Output: Due to their training scope, `Llama 3.1` and `Mistral Nemo` often produce outputs that may deviate from the intended schema. This can include additional natural language explanations or incorrectly nested JSON structures. -- Complexity in Parsing: Without consistent JSON formatting, downstream processes that rely on predictable data schemas may encounter errors, leading to challenges in automation and data integration tasks. -- Beta Features: Some features related to structured outputs may still be in beta, requiring usage of specific methods like `client.beta.chat.completions.parse`, which suggests they are not yet fully reliable in all scenarios. +- Be explicit in your prompts about JSON formatting +- Use Pydantic models to enforce schema compliance +- Consider using the stop token to prevent extra output +- Some advanced features are still in beta -Given these constraints, users should consider these limitations when choosing a model for tasks involving structured outputs. Where strict compliance with a JSON schema is critical, alternative models designed for such precision might be a more suitable choice. +With proper prompting and schema validation, you can get reliable structured outputs from your local models. No cloud required! diff --git a/docs/docs/installation.mdx b/docs/docs/installation.mdx index 80409e009..acee4d5d0 100644 --- a/docs/docs/installation.mdx +++ b/docs/docs/installation.mdx @@ -8,24 +8,20 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import Admonition from '@theme/Admonition'; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - -## Cortex.cpp Installation ### Cortex.cpp offers four installer types -- Network Installers download a minimal system and require an internet connection to fetch packages during installation. -- Local Installers include all necessary packages, enabling offline installation without internet access. -- Dockerfile Installers are used to build a Docker image for Cortex. -- Binary files without package management. + +- **Network Installers** download a minimal script and require an internet connection to fetch packages during installation. +- **Local Installers** include all necessary packages, enabling offline installation without internet access. +- **Dockerfile** Installers are used to build a Docker image with Cortex ready to go. +- **Binary files** without package management. ### Cortex.cpp supports three channels -- Stable: The latest stable release on github. -- Beta: The release candidate for the next stable release, available on github release with the tag `vx.y.z-rc1` -- Nightly: The nightly build of the latest code on dev branch, available on [discord](https://discord.com/channels/1107178041848909847/1283654073488379904). +- **Stable**: The latest stable release on github. 
+- **Beta**: The release candidate for the next stable release, available on github release with the tag `vx.y.z-rc1` +- **Nightly**: The nightly build of the latest commit on dev branch, available on [discord](https://discord.com/channels/1107178041848909847/1283654073488379904). -For more information, please check out [different channels](#different-channels). +For more information, please check out the [different channels](#different-channels). ### Download URLs @@ -45,7 +41,8 @@ For other versions, please look at [cortex.cpp repo](https://github.com/janhq/co ### OS - MacOS 12 or later - Windows 10 or later -- Linux: Ubuntu 20.04 or later, Debian 11 or later (For other distributions, please use the Dockerfile installer or binary files, we have not tested on other distributions yet.) +- Linux: Ubuntu 20.04 or later, Debian 11 or later, and any of the latest versions of Arch (for other distributions, +please use the Dockerfile installer or binary files, we have not tested on other distributions yet.) ### Hardware #### CPU @@ -81,11 +78,11 @@ Having at least 6GB VRAM when using NVIDIA, AMD, or Intel Arc GPUs is recommende - [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) version 12.0 or higher. *Note: Cortex.cpp will automatically detect and install the required version of cudart to the user's machine.* ::: #### Disk -- At least 10GB for app storage and model download. +- At least 10GB of free space for downloading models. ## Different channels -Different channels have different features, stability levels, binary file name, app folder and data folder. +Different channels have different features, stability levels, binary file name, app and data folders. ### Stable - App name: `cortexcpp` diff --git a/docs/docs/installation/docker.mdx b/docs/docs/installation/docker.mdx index 154281be5..ffc485962 100644 --- a/docs/docs/installation/docker.mdx +++ b/docs/docs/installation/docker.mdx @@ -1,142 +1,205 @@ --- -title: Docker -description: Install Cortex using Docker. +title: Docker Installation Guide +description: Comprehensive guide for installing and running Cortex using Docker --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import Admonition from '@theme/Admonition'; -:::warning -🚧 **Cortex.cpp is currently in development.** The documentation describes the intended functionality, which may not yet be fully implemented. -::: -## Setting Up Cortex with Docker +## Getting Started with Cortex on Docker -This guide walks you through the setup and running of Cortex using Docker. +This guide provides comprehensive instructions for installing and running Cortex in a Docker environment, +including sensible defaults for security and performance. ### Prerequisites -- Docker or Docker Desktop -- `nvidia-container-toolkit` (for GPU support) +Before beginning, ensure you have: +- [Docker](https://docs.docker.com/engine/install/) (version 20.10.0 or higher) or [Docker Desktop](https://docs.docker.com/desktop/) +- At least 8GB of RAM and 10GB of free disk space +- For GPU support, make sure you install `nvidia-container-toolkit`. 
Here is an example on how to do so for Ubuntu: + ```bash + # Install NVIDIA Container Toolkit + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + ``` + ```bash + # Add repository + curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + ``` + ```bash + # Install + sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit + sudo nvidia-ctk runtime configure --runtime=docker + sudo systemctl restart docker + ``` -### Setup Instructions +### Installation Methods -#### Build Cortex Docker Image from source or Pull from Docker Hub +#### Method 1: Using Pre-built Image (Recommended) -##### Pull Cortex Docker Image from Docker Hub +```bash +# Pull the latest stable release +docker pull menloltd/cortex:latest +``` +```bash +# Or pull a specific version (recommended for production) +docker pull menloltd/cortex:nightly-1.0.1-224 +``` - ```bash - # Pull the latest image - docker pull menloltd/cortex:latest +:::info Version Tags +- `latest`: Most recent stable release +- `nightly`: Latest development build +- `x.y.z` (e.g., `1.0.1`): Specific version release +::: - # Pull a specific version - docker pull menloltd/cortex:nightly-1.0.1-224 - ``` +#### Method 2: Building from Source -##### Build and Run Cortex Docker Container from Dockerfile +1. **Clone the repo:** +```bash +git clone https://github.com/janhq/cortex.cpp.git +cd cortex.cpp +git submodule update --init +``` -1. **Clone the Cortex Repository** - ```bash - git clone https://github.com/janhq/cortex.cpp.git - cd cortex.cpp - git submodule update --init - ``` +2. **Build the Docker image:** + + + ```bash + docker build -t menloltd/cortex:local \ + --build-arg CORTEX_CPP_VERSION=$(git rev-parse HEAD) \ + -f docker/Dockerfile . + ``` + + + ```bash + docker build \ + --build-arg CORTEX_LLAMACPP_VERSION=0.1.34 \ + --build-arg CORTEX_CPP_VERSION=$(git rev-parse HEAD) \ + -t menloltd/cortex:local \ + -f docker/Dockerfile . + ``` + + -2. **Build the Docker Image** +### Running Cortex (Securely) - - - ```sh - docker build -t cortex --build-arg CORTEX_CPP_VERSION=$(git rev-parse HEAD) -f docker/Dockerfile . - ``` - - - ```sh - docker build --build-arg CORTEX_LLAMACPP_VERSION=0.1.34 --build-arg CORTEX_CPP_VERSION=$(git rev-parse HEAD) -t cortex -f docker/Dockerfile . - ``` - - +1. **[Optional] Create a dedicated user and data directory:** +```bash +# Create a dedicated user +sudo useradd -r -s /bin/false cortex +export CORTEX_UID=$(id -u cortex) +``` +```bash +# Create data directory with proper permissions +sudo mkdir -p /opt/cortex/data +sudo chown -R ${CORTEX_UID}:${CORTEX_UID} /opt/cortex +``` -#### Run Cortex Docker Container +2. **Set up persistent storage:** +```bash +docker volume create cortex_data +``` -1. **Run the Docker Container** - - Create a Docker volume to store models and data: - ```bash - docker volume create cortex_data - ``` +3. 
**Launch the container:** + + + ```bash + docker run --gpus all -d \ + --name cortex \ + --user ${CORTEX_UID}:${CORTEX_UID} \ + --memory=4g \ + --memory-swap=4g \ + --security-opt=no-new-privileges \ + -v cortex_data:/root/cortexcpp:rw \ + -v /opt/cortex/data:/data:rw \ + -p 127.0.0.1:39281:39281 \ + menloltd/cortex:latest + ``` + + + ```bash + docker run -d \ + --name cortex \ + --user ${CORTEX_UID}:${CORTEX_UID} \ + --memory=4g \ + --memory-swap=4g \ + --security-opt=no-new-privileges \ + -v cortex_data:/root/cortexcpp:rw \ + -v /opt/cortex/data:/data:rw \ + -p 127.0.0.1:39281:39281 \ + menloltd/cortex:latest + ``` + + - - - ```sh - # requires nvidia-container-toolkit - docker run --gpus all -it -d --name cortex -v cortex_data:/root/cortexcpp -p 39281:39281 cortex - ``` - - - ```sh - docker run -it -d --name cortex -v cortex_data:/root/cortexcpp -p 39281:39281 cortex - ``` - - +### Verification and Testing -2. **Check Logs (Optional)** - ```bash - docker logs cortex - ``` +1. **Check container status:** +```bash +docker ps | grep cortex +docker logs cortex +``` -3. **Access the Cortex Documentation API** - - Open [http://localhost:39281](http://localhost:39281) in your browser. +Expected output should show: +``` +Cortex server starting... +Initialization complete +Server listening on port 39281 +``` -4. **Access the Container and Try Cortex CLI** - ```bash - docker exec -it cortex bash - cortex --help - ``` +2. **Test the API:** +```bash +curl http://127.0.0.1:39281/healthz +``` -### Usage +### Working with Cortex -With Docker running, you can use the following commands to interact with Cortex. Ensure the container is running and `curl` is installed on your machine. +Once your container is running, here's how to interact with Cortex. Make sure you have `curl` installed on your system. -#### 1. List Available Engines +#### 1. Check Available Engines ```bash curl --request GET --url http://localhost:39281/v1/engines --header "Content-Type: application/json" ``` -- **Example Response** - ```json - { - "data": [ - { - "description": "This extension enables chat completion API calls using the Onnx engine", - "format": "ONNX", - "name": "onnxruntime", - "status": "Incompatible" - }, - { - "description": "This extension enables chat completion API calls using the LlamaCPP engine", - "format": "GGUF", - "name": "llama-cpp", - "status": "Ready", - "variant": "linux-amd64-avx2", - "version": "0.1.37" - } - ], - "object": "list", - "result": "OK" - } - ``` +You'll see something like: +```json +{ + "data": [ + { + "description": "This extension enables chat completion API calls using the Onnx engine", + "format": "ONNX", + "name": "onnxruntime", + "status": "Incompatible" + }, + { + "description": "This extension enables chat completion API calls using the LlamaCPP engine", + "format": "GGUF", + "name": "llama-cpp", + "status": "Ready", + "variant": "linux-amd64-avx2", + "version": "0.1.37" + } + ], + "object": "list", + "result": "OK" +} +``` + +#### 2. Download Models -#### 2. Pull Models from Hugging Face +First, set up event monitoring: +- Install `websocat` following [these instructions](https://github.com/vi/websocat?tab=readme-ov-file#installation) +- Open a terminal and run: `websocat ws://localhost:39281/events` -- Open a terminal and run `websocat ws://localhost:39281/events` to capture download events, follow [this instruction](https://github.com/vi/websocat?tab=readme-ov-file#installation) to install `websocat`. -- In another terminal, pull models using the commands below. 
+Then, in a new terminal, download your desired model: ```sh - # requires nvidia-container-toolkit curl --request POST --url http://localhost:39281/v1/models/pull --header 'Content-Type: application/json' --data '{"model": "tinyllama:gguf"}' ``` @@ -147,36 +210,93 @@ curl --request GET --url http://localhost:39281/v1/engines --header "Content-Typ -- After pull models successfully, run command below to list models. - ```bash - curl --request GET --url http://localhost:39281/v1/models - ``` +To see your downloaded models: +```bash +curl --request GET --url http://localhost:39281/v1/models +``` -#### 3. Start a Model and Send an Inference Request +#### 3. Using the Model -- **Start the model:** - ```bash - curl --request POST --url http://localhost:39281/v1/models/start --header 'Content-Type: application/json' --data '{"model": "tinyllama:gguf"}' - ``` +First, start your model: +```bash +curl --request POST --url http://localhost:39281/v1/models/start --header 'Content-Type: application/json' --data '{"model": "tinyllama:gguf"}' +``` -- **Send an inference request:** - ```bash - curl --request POST --url http://localhost:39281/v1/chat/completions --header 'Content-Type: application/json' --data '{ - "frequency_penalty": 0.2, - "max_tokens": 4096, - "messages": [{"content": "Tell me a joke", "role": "user"}], - "model": "tinyllama:gguf", - "presence_penalty": 0.6, - "stop": ["End"], - "stream": true, - "temperature": 0.8, - "top_p": 0.95 - }' - ``` +Then, send it a query: +```bash +curl --request POST --url http://localhost:39281/v1/chat/completions --header 'Content-Type: application/json' --data '{ + "frequency_penalty": 0.2, + "max_tokens": 4096, + "messages": [{"content": "Tell me a joke", "role": "user"}], + "model": "tinyllama:gguf", + "presence_penalty": 0.6, + "stop": ["End"], + "stream": true, + "temperature": 0.8, + "top_p": 0.95 + }' +``` -#### 4. Stop a Model +#### 4. Shutting Down -- To stop a running model, use: - ```bash - curl --request POST --url http://localhost:39281/v1/models/stop --header 'Content-Type: application/json' --data '{"model": "tinyllama:gguf"}' - ``` +When you're done, stop the model: +```bash +curl --request POST --url http://localhost:39281/v1/models/stop --header 'Content-Type: application/json' --data '{"model": "tinyllama:gguf"}' +``` + +### Maintenance and Troubleshooting + +#### Common Issues + +1. **Permission Denied Errors:** +```bash +sudo chown -R ${CORTEX_UID}:${CORTEX_UID} /opt/cortex/data +docker restart cortex +``` + +2. 
**Container Won't Start:** +```bash +docker logs cortex +docker system info # Check available resources +``` + +#### Cleanup + +```bash +# Stop and remove container +docker stop cortex +docker rm cortex +``` + +```bash +# Remove data (optional) +docker volume rm cortex_data +sudo rm -rf /opt/cortex/data +``` + +```bash +# Remove image +docker rmi cortexai/cortex:latest +``` + +### Updating Cortex + +```bash +# Pull latest version +docker pull cortexai/cortex:latest +``` + +```bash +# Stop and remove old container +docker stop cortex +docker rm cortex + +# Start new container (use run command from above) +``` + +:::tip Best Practices +- Always use specific version tags in production +- Regularly backup your data volume +- Monitor container resources using `docker stats cortex` +- Keep your Docker installation updated +::: diff --git a/docs/docs/installation/linux.mdx b/docs/docs/installation/linux.mdx index a14450f47..a45c9cefe 100644 --- a/docs/docs/installation/linux.mdx +++ b/docs/docs/installation/linux.mdx @@ -8,17 +8,14 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import Admonition from '@theme/Admonition'; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: -## Cortex.cpp Installation :::info Before installation, make sure that you have met the [minimum requirements](/docs/installation#minimum-requirements) to run Cortex. This instruction is for stable releases. For beta and nightly releases, please replace `cortex` with `cortex-beta` and `cortex-nightly`, respectively. ::: ### Prerequisites + - OpenMPI - curl - jq @@ -28,18 +25,14 @@ This instruction is for stable releases. For beta and nightly releases, please r 1. Install cortex with one command -- Linux debian base distros +- Network installer for all linux distros ```bash - # Network installer - curl -s https://raw.githubusercontent.com/janhq/cortex/main/engine/templates/linux/install.sh | sudo bash -s - - # Local installer - curl -s https://raw.githubusercontent.com/janhq/cortex/main/engine/templates/linux/install.sh | sudo bash -s -- --deb_local + curl -s https://raw.githubusercontent.com/janhq/cortex/main/engine/templates/linux/install.sh | sudo bash -s ``` -- Other linux distros +- Local installer for Debian-based distros ```bash - curl -s https://raw.githubusercontent.com/janhq/cortex/main/engine/templates/linux/install.sh | sudo bash -s + curl -s https://raw.githubusercontent.com/janhq/cortex/main/engine/templates/linux/install.sh | sudo bash -s -- --deb_local ``` - Parameters @@ -55,7 +48,9 @@ This instruction is for stable releases. For beta and nightly releases, please r ``` ### Data Folder + By default, Cortex.cpp is installed in the following directory: + ``` # Binary Location /usr/bin/cortex @@ -69,6 +64,7 @@ By default, Cortex.cpp is installed in the following directory: ``` ## Uninstall Cortex.cpp + ```bash # Stable version sudo /usr/bin/cortex-uninstall.sh @@ -113,4 +109,4 @@ sudo /usr/bin/cortex-uninstall.sh ```bash sudo cortex update -``` \ No newline at end of file +``` diff --git a/docs/docs/installation/mac.mdx b/docs/docs/installation/mac.mdx index 51c4760a4..b1e8b5e2b 100644 --- a/docs/docs/installation/mac.mdx +++ b/docs/docs/installation/mac.mdx @@ -7,31 +7,26 @@ slug: 'mac' import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -:::warning -🚧 Cortex.cpp is currently under development. 
Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - -## Cortex.cpp Installation :::info Before installation, make sure that you have met the [minimum requirements](/docs/installation#minimum-requirements) to run Cortex. -This instruction is for stable releases. For beta and nightly releases, please replace `cortex` with `cortex-beta` and `cortex-nightly`, respectively. +The instructions below are for stable releases only. For beta and nightly releases, please replace `cortex` with `cortex-beta` and `cortex-nightly`, respectively. ::: 1. Download the Linux installer: - - From release: https://github.com/janhq/cortex.cpp/releases - - From quick download links: - - Local installer `.deb`: - - Stable: https://app.cortexcpp.com/download/latest/mac-universal-local - - Beta: https://app.cortexcpp.com/download/beta/mac-universal-local - - Nightly: https://app.cortexcpp.com/download/nightly/mac-universal-local - - Network installer `.deb`: - - Stable: https://app.cortexcpp.com/download/latest/mac-universal-network - - Beta: https://app.cortexcpp.com/download/beta/mac-universal-network - - Nightly: https://app.cortexcpp.com/download/nightly/mac-universal-network - - Binary: - - Stable: https://app.cortexcpp.com/download/latest/mac-universal-binary - - Beta: https://app.cortexcpp.com/download/beta/mac-universal-binary - - Nightly: https://app.cortexcpp.com/download/nightly/mac-universal-binary +- From release: https://github.com/janhq/cortex.cpp/releases +- From quick download links: + - Local installer `.deb`: + - Stable: https://app.cortexcpp.com/download/latest/mac-universal-local + - Beta: https://app.cortexcpp.com/download/beta/mac-universal-local + - Nightly: https://app.cortexcpp.com/download/nightly/mac-universal-local + - Network installer `.deb`: + - Stable: https://app.cortexcpp.com/download/latest/mac-universal-network + - Beta: https://app.cortexcpp.com/download/beta/mac-universal-network + - Nightly: https://app.cortexcpp.com/download/nightly/mac-universal-network + - Binary: + - Stable: https://app.cortexcpp.com/download/latest/mac-universal-binary + - Beta: https://app.cortexcpp.com/download/beta/mac-universal-binary + - Nightly: https://app.cortexcpp.com/download/nightly/mac-universal-binary 2. Install Cortex.cpp by double-clicking the pkg downloaded file. @@ -42,21 +37,29 @@ This instruction is for stable releases. For beta and nightly releases, please r ``` ### Data Folder -By default, Cortex.cpp is installed in the following directory: -``` + +By default, Cortex.cpp is installed in the `bin` directory: + +```sh # Binary Location /usr/local/bin/cortex /usr/local/bin/cortex-server /usr/local/bin/cortex-uninstall.sh +``` -# Application Data (Engines, Models and Logs folders) +The application data which includes Engines, Models and Logs will be installed in your home directory. +```sh /Users//cortexcpp +``` -# Configuration File +The configuration file, `.cortexrc`, will also be in your home directory. +```sh /Users//.cortexrc ``` ## Uninstall Cortex.cpp + Run the uninstaller script: + ```bash sudo sh cortex-uninstall.sh ``` @@ -100,17 +103,22 @@ The script requires sudo permission. -3. Verify that Cortex.cpp is builded correctly by getting help information. +3. Verify that Cortex.cpp was built correctly by using `-h` flag to call the help info. 
```sh # Get the help information ./build/cortex -h ``` -## Update cortex to latest version +## Update Cortex + +Cortex can be updated in-place without any additional scripts. In addition, cortex will let you know if there is a new version of itself the next +time you start a server. + :::info The script requires sudo permission. ::: + ```bash sudo cortex update -``` \ No newline at end of file +``` diff --git a/docs/docs/installation/windows.mdx b/docs/docs/installation/windows.mdx index 39855d44e..f49fe2c78 100644 --- a/docs/docs/installation/windows.mdx +++ b/docs/docs/installation/windows.mdx @@ -8,37 +8,33 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import Admonition from '@theme/Admonition'; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: -## Overview -For Windows, Cortex.cpp can be installed in two ways: -- [Windows](#windows) -- [Windows Subsystem for Linux (WSL)](#windows-subsystem-linux) +For Windows, Cortex.cpp can be installed in two ways, by downloading the [windows](#windows) installer or +via the [Windows Subsystem for Linux (WSL)](#windows-subsystem-linux). ## Windows ### Install Cortex.cpp :::info Before installation, make sure that you have met the [minimum requirements](/docs/installation#minimum-requirements) to run Cortex. -This instruction is for stable releases. For beta and nightly releases, please replace `cortex` with `cortex-beta` and `cortex-nightly`, respectively. +The instructions below are for stable releases only. For beta and nightly releases, please replace `cortex` with `cortex-beta` +and `cortex-nightly`, respectively. ::: Download the windows installer: - - From release: https://github.com/janhq/cortex.cpp/releases - - From quick download links: - - Local installer `.deb`: - - Stable: https://app.cortexcpp.com/download/latest/windows-amd64-local - - Beta: https://app.cortexcpp.com/download/beta/windows-amd64-local - - Nightly: https://app.cortexcpp.com/download/nightly/windows-amd64-local - - Network installer `.deb`: - - Stable: https://app.cortexcpp.com/download/latest/windows-amd64-network - - Beta: https://app.cortexcpp.com/download/beta/windows-amd64-network - - Nightly: https://app.cortexcpp.com/download/nightly/windows-amd64-network - - Binary: - - Stable: https://app.cortexcpp.com/download/latest/windows-amd64-binary - - Beta: https://app.cortexcpp.com/download/beta/windows-amd64-binary - - Nightly: https://app.cortexcpp.com/download/nightly/windows-amd64-binary +- From release: https://github.com/janhq/cortex.cpp/releases +- From quick download links: + - Local installer `.deb`: + - Stable: https://app.cortexcpp.com/download/latest/windows-amd64-local + - Beta: https://app.cortexcpp.com/download/beta/windows-amd64-local + - Nightly: https://app.cortexcpp.com/download/nightly/windows-amd64-local + - Network installer `.deb`: + - Stable: https://app.cortexcpp.com/download/latest/windows-amd64-network + - Beta: https://app.cortexcpp.com/download/beta/windows-amd64-network + - Nightly: https://app.cortexcpp.com/download/nightly/windows-amd64-network + - Binary: + - Stable: https://app.cortexcpp.com/download/latest/windows-amd64-binary + - Beta: https://app.cortexcpp.com/download/beta/windows-amd64-binary + - Nightly: https://app.cortexcpp.com/download/nightly/windows-amd64-binary #### Data Folder @@ -58,14 +54,15 @@ C:\Users\\.cortexrc To uninstall Cortex.cpp: 1. Open the **Control Panel**. 
1. Navigate to **Add or Remove program**. -2. Search for cortexcpp and click **Uninstall**. +2. Search for `cortexcpp` and click **Uninstall**. ## Windows Subsystem Linux :::info -Windows Subsystem Linux allows running Linux tools and workflows seamlessly alongside Windows applications. For more information, please see this [article](https://learn.microsoft.com/en-us/windows/wsl/faq). +Windows Subsystem Linux allows running Linux tools and workflows seamlessly alongside Windows applications. For more +information, please see this [article](https://learn.microsoft.com/en-us/windows/wsl/faq). ::: -Follow [linux installation steps](linux) to install Cortex.cpp on Windows Subsystem Linux. +Follow the [linux installation steps](linux) to install Cortex.cpp on the WSL. ## Build from Source @@ -84,7 +81,7 @@ Follow [linux installation steps](linux) to install Cortex.cpp on Windows Subsys cd cortex.cpp git submodule update --init ``` -2. Build the Cortex.cpp : +2. Build Cortex.cpp from source: ```cmd cd engine @@ -102,4 +99,4 @@ Follow [linux installation steps](linux) to install Cortex.cpp on Windows Subsys ## Update cortex to latest version ```bash cortex.exe update -``` \ No newline at end of file +``` diff --git a/docs/docs/overview.mdx b/docs/docs/overview.mdx index 95fec519b..4a00b55ba 100644 --- a/docs/docs/overview.mdx +++ b/docs/docs/overview.mdx @@ -8,125 +8,71 @@ import OAICoverage from "@site/src/components/OAICoverage" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -# Cortex - -:::info -**Real-world Use**: Cortex.cpp powers [Jan](https://jan.ai), our on-device ChatGPT-alternative. - -Cortex.cpp is in active development. If you have any questions, please reach out to us on [GitHub](https://github.com/janhq/cortex.cpp/issues/new/choose) -or [Discord](https://discord.com/invite/FTk2MvZwJH) -::: - ![Cortex Cover Image](/img/social-card.jpg) -Cortex is a Local AI API Platform that is used to run and customize LLMs. +Cortex is the open-source brain for robots: vision, speech, language, tabular, and action -- the cloud is optional. Key Features: - Straightforward CLI (inspired by Ollama) - Full C++ implementation, packageable into Desktop and Mobile apps - Pull from Huggingface, or Cortex Built-in Model Library - Models stored in universal file formats (vs blobs) -- Swappable Inference Backends (default: [`llamacpp`](https://github.com/janhq/cortex.llamacpp), future: [`ONNXRuntime`](https://github.com/janhq/cortex.onnx), [`TensorRT-LLM`](https://github.com/janhq/cortex.tensorrt-llm)) +- Swappable Inference Backends (default: [`llamacpp`](https://github.com/janhq/cortex.llamacpp) and [`ONNXRuntime`](https://github.com/janhq/cortex.onnx)) - Cortex can be deployed as a standalone API server, or integrated into apps like [Jan.ai](https://jan.ai/) +- Automatic API docs for your server -Cortex's roadmap is to implement the full OpenAI API including Tools, Runs, Multi-modal and Realtime APIs. +Cortex's roadmap includes implementing full compatibility with OpenAI API's and that includes Tools, Runs, Multi-modal and Realtime APIs. 
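As a concrete illustration of the OpenAI-compatibility goal stated above, here is a minimal sketch that talks to a local Cortex server with the stock `openai` Python client. It assumes the server is already running on the default port `39281` (documented in the quickstart later in this diff) and that a model such as `llama3.1` from the built-in list has been pulled and started; the placeholder API key is only there because the client requires one, and the `openai` package itself is used purely for illustration, not by this repository.

```python
# Minimal sketch: talking to a local Cortex server through an OpenAI-compatible client.
# Assumes the server is running on the default port 39281 and that a model
# (here "llama3.1", one of the built-in models) has been pulled and started.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:39281/v1",  # Cortex API server (default port)
    api_key="not-needed",                  # placeholder; a local server needs no key
)

response = client.chat.completions.create(
    model="llama3.1",
    messages=[{"role": "user", "content": "How are you today?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```

The same request can be made with plain `curl` against `/v1/chat/completions`, as shown in the quickstart section of this change.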
## Inference Backends - Default: [llama.cpp](https://github.com/ggerganov/llama.cpp): cross-platform, supports most laptops, desktops and OSes -- Future: [ONNX Runtime](https://github.com/microsoft/onnxruntime): supports Windows Copilot+ PCs & NPUs -- Future: [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM): supports Nvidia GPUs +- Future: [ONNX Runtime](https://github.com/microsoft/onnxruntime): supports Windows Copilot+ PCs & NPUs and traditional machine learning models If GPU hardware is available, Cortex is GPU accelerated by default. ## Models -Cortex.cpp allows users to pull models from multiple Model Hubs, offering flexibility and extensive model access. +Cortex.cpp allows users to pull models from multiple Model Hubs, offering flexibility and extensive model access. - [Hugging Face](https://huggingface.co) - [Cortex Built-in Models](https://cortex.so/models) > **Note**: -> As a very general guide: You should have >8 GB of RAM available to run the 7B models, 16 GB to run the 14B models, and 32 GB to run the 32B models. +> As a very general guide: You should have >8 GB of RAM available to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 14B models. ### Cortex Built-in Models & Quantizations | Model /Engine | llama.cpp | Command | | -------------- | --------------------- | ----------------------------- | -| phi-3.5 | ✅ | cortex run phi3.5 | -| llama3.2 | ✅ | cortex run llama3.2 | -| llama3.1 | ✅ | cortex run llama3.1 | -| codestral | ✅ | cortex run codestral | -| gemma2 | ✅ | cortex run gemma2 | -| mistral | ✅ | cortex run mistral | -| ministral | ✅ | cortex run ministral | -| qwen2 | ✅ | cortex run qwen2.5 | -| openhermes-2.5 | ✅ | cortex run openhermes-2.5 | -| tinyllama | ✅ | cortex run tinyllama | +| phi-4 | ✅ | `cortex run phi-4` | +| llama3.2 | ✅ | `cortex run llama3.2` | +| llama3.1 | ✅ | `cortex run llama3.1` | +| codestral | ✅ | `cortex run codestral` | +| gemma2 | ✅ | `cortex run gemma2` | +| mistral | ✅ | `cortex run mistral` | +| ministral | ✅ | `cortex run ministral` | +| qwen2 | ✅ | `cortex run qwen2.5` | +| openhermes-2.5 | ✅ | `cortex run openhermes-2.5` | +| tinyllama | ✅ | `cortex run tinyllama` | View all [Cortex Built-in Models](https://cortex.so/models). Cortex supports multiple quantizations for each model. +```sh +cortex pull phi-4 +``` ``` -❯ cortex-nightly pull llama3.2 Downloaded models: - llama3.2:3b-gguf-q2-k + bartowski:phi-4-GGUF:phi-4-Q3_K_S.gguf Available to download: - 1. llama3.2:3b-gguf-q3-kl - 2. llama3.2:3b-gguf-q3-km - 3. llama3.2:3b-gguf-q3-ks - 4. llama3.2:3b-gguf-q4-km (default) - 5. llama3.2:3b-gguf-q4-ks - 6. llama3.2:3b-gguf-q5-km - 7. llama3.2:3b-gguf-q5-ks - 8. llama3.2:3b-gguf-q6-k - 9. llama3.2:3b-gguf-q8-0 - -Select a model (1-9): + 1. phi-4:14.7b-gguf-q2-k + 2. phi-4:14.7b-gguf-q3-kl + 3. phi-4:14.7b-gguf-q3-km + 4. phi-4:14.7b-gguf-q3-ks + 5. phi-4:14.7b-gguf-q4-km (default) + 6. phi-4:14.7b-gguf-q4-ks + 7. phi-4:14.7b-gguf-q5-km + 8. phi-4:14.7b-gguf-q5-ks + 9. phi-4:14.7b-gguf-q6-k + 10. 
phi-4:14.7b-gguf-q8-0 + +Select a model (1-10): ``` - - -{/* - - -| Model ID | Variant (Branch) | Model size | CLI command | -|------------------|------------------|-------------------|------------------------------------| -| codestral | 22b-gguf | 22B | `cortex run codestral:22b-gguf` | -| command-r | 35b-gguf | 35B | `cortex run command-r:35b-gguf` | -| gemma | 7b-gguf | 7B | `cortex run gemma:7b-gguf` | -| llama3 | gguf | 8B | `cortex run llama3:gguf` | -| llama3.1 | gguf | 8B | `cortex run llama3.1:gguf` | -| mistral | 7b-gguf | 7B | `cortex run mistral:7b-gguf` | -| mixtral | 7x8b-gguf | 46.7B | `cortex run mixtral:7x8b-gguf` | -| openhermes-2.5 | 7b-gguf | 7B | `cortex run openhermes-2.5:7b-gguf`| -| phi3 | medium-gguf | 14B - 4k ctx len | `cortex run phi3:medium-gguf` | -| phi3 | mini-gguf | 3.82B - 4k ctx len| `cortex run phi3:mini-gguf` | -| qwen2 | 7b-gguf | 7B | `cortex run qwen2:7b-gguf` | -| tinyllama | 1b-gguf | 1.1B | `cortex run tinyllama:1b-gguf` | - - -| Model ID | Variant (Branch) | Model size | CLI command | -|------------------|------------------|-------------------|------------------------------------| -| gemma | 7b-onnx | 7B | `cortex run gemma:7b-onnx` | -| llama3 | onnx | 8B | `cortex run llama3:onnx` | -| mistral | 7b-onnx | 7B | `cortex run mistral:7b-onnx` | -| openhermes-2.5 | 7b-onnx | 7B | `cortex run openhermes-2.5:7b-onnx`| -| phi3 | mini-onnx | 3.82B - 4k ctx len| `cortex run phi3:mini-onnx` | -| phi3 | medium-onnx | 14B - 4k ctx len | `cortex run phi3:medium-onnx` | - - - -| Model ID | Variant (Branch) | Model size | CLI command | -|------------------|-------------------------------|-------------------|------------------------------------| -| llama3 | 8b-tensorrt-llm-windows-ampere | 8B | `cortex run llama3:8b-tensorrt-llm-windows-ampere` | -| llama3 | 8b-tensorrt-llm-linux-ampere | 8B | `cortex run llama3:8b-tensorrt-llm-linux-ampere` | -| llama3 | 8b-tensorrt-llm-linux-ada | 8B | `cortex run llama3:8b-tensorrt-llm-linux-ada`| -| llama3 | 8b-tensorrt-llm-windows-ada | 8B | `cortex run llama3:8b-tensorrt-llm-windows-ada` | -| mistral | 7b-tensorrt-llm-linux-ampere | 7B | `cortex run mistral:7b-tensorrt-llm-linux-ampere`| -| mistral | 7b-tensorrt-llm-windows-ampere | 7B | `cortex run mistral:7b-tensorrt-llm-windows-ampere` | -| mistral | 7b-tensorrt-llm-linux-ada | 7B | `cortex run mistral:7b-tensorrt-llm-linux-ada`| -| mistral | 7b-tensorrt-llm-windows-ada | 7B | `cortex run mistral:7b-tensorrt-llm-windows-ada` | -| openhermes-2.5 | 7b-tensorrt-llm-windows-ampere | 7B | `cortex run openhermes-2.5:7b-tensorrt-llm-windows-ampere`| -| openhermes-2.5 | 7b-tensorrt-llm-windows-ada | 7B | `cortex run openhermes-2.5:7b-tensorrt-llm-windows-ada`| -| openhermes-2.5 | 7b-tensorrt-llm-linux-ada | 7B | `cortex run openhermes-2.5:7b-tensorrt-llm-linux-ada`| - - - */} \ No newline at end of file diff --git a/docs/docs/quickstart.mdx b/docs/docs/quickstart.mdx index 874309ad4..c965a342a 100644 --- a/docs/docs/quickstart.mdx +++ b/docs/docs/quickstart.mdx @@ -8,20 +8,16 @@ import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::info -Cortex.cpp is in active development. If you have any questions, please reach out to us: -- [GitHub](https://github.com/janhq/cortex.cpp/issues/new/choose) -- [Discord](https://discord.com/invite/FTk2MvZwJH) -::: - ## Local Installation -Cortex has an Local Installer that packages all required dependencies, so that no internet connection is required during the installation process. 
- - [Windows](https://app.cortexcpp.com/download/latest/windows-amd64-local) - - [Mac (Universal)](https://app.cortexcpp.com/download/latest/mac-universal-local) - - [Linux](https://app.cortexcpp.com/download/latest/linux-amd64-local) -## Start Cortex.cpp API Server -This command starts the Cortex.cpp API server at `localhost:39281`. +Cortex has a **Local Installer** with all of the required dependencies, so that once you've downloaded it, no internet connection is required during the installation process. +- [Windows](https://app.cortexcpp.com/download/latest/windows-amd64-local) +- [Mac (Universal)](https://app.cortexcpp.com/download/latest/mac-universal-local) +- [Linux](https://app.cortexcpp.com/download/latest/linux-amd64-local) + +## Start a Cortex Server + +This command starts the Cortex's' API server at `localhost:39281`. ```sh @@ -35,48 +31,63 @@ This command starts the Cortex.cpp API server at `localhost:39281`. -## Pull a Model & Select Quantization +## Pull Models + This command allows users to download a model from these Model Hubs: - [Cortex Built-in Models](https://cortex.so/models) - [Hugging Face](https://huggingface.co) (GGUF): `cortex pull ` -It displays available quantizations, recommends a default and downloads the desired quantization. +It displays available quantizations, recommends a default and downloads the desired quantization. + + The following two options will show you all of the available models under those names. Cortex will first search + on its own hub for models like `llama3.3`, and in huggingface for hyper specific ones like `bartowski/Meta-Llama-3.1-8B-Instruct-GGU`. ```sh - $ cortex pull llama3.2 - $ cortex pull bartowski/Meta-Llama-3.1-8B-Instruct-GGUF + cortex pull llama3.3 + ``` + or, + + ```sh + cortex pull bartowski/Meta-Llama-3.1-8B-Instruct-GGUF ``` ```sh - $ cortex pull llama3.2 - $ cortex.exe pull bartowski/Meta-Llama-3.1-8B-Instruct-GGUF + cortex pull llama3.3 + ``` + ```sh + cortex.exe pull bartowski/Meta-Llama-3.1-8B-Instruct-GGUF ``` ## Run a Model -This command downloads the default `gguf` model format from the [Cortex Hub](https://huggingface.co/cortexso), starts the model, and chat with the model. + +This command downloads the default `gguf` model (if not available in your file system) from the +[Cortex Hub](https://huggingface.co/cortexso), starts the model, and chat with the model. + ```sh - cortex run llama3.2 + cortex run llama3.3 ``` ```sh - cortex.exe run llama3.2 + cortex.exe run llama3.3 ``` + :::info All model files are stored in the `~/cortex/models` folder. ::: ## Using the Model + ### API -```curl +```sh curl http://localhost:39281/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ @@ -101,7 +112,9 @@ curl http://localhost:39281/v1/chat/completions \ Refer to our [API documentation](https://cortex.so/api-reference) for more details. ## Show the System State -This command displays the running model and the hardware system status (RAM, Engine, VRAM, Uptime) + +This command displays the running model and the hardware system status (RAM, Engine, VRAM, Uptime). + ```sh @@ -116,22 +129,26 @@ This command displays the running model and the hardware system status (RAM, Eng ## Stop a Model + This command stops the running model. + ```sh - cortex models stop llama3.2 + cortex models stop llama3.3 ``` ```sh - cortex.exe models stop llama3.2 + cortex.exe models stop llama3.3 ``` -## Stop Cortex.cpp API Server -This command starts the Cortex.cpp API server at `localhost:39281`. 
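The quickstart steps above can also be driven entirely over the REST API. The sketch below is a minimal, non-authoritative example that strings them together with `requests`; the endpoint paths and the `tinyllama:gguf` payload mirror the e2e test script (`engine/e2e-test/local_test.py`) added later in this diff, and the response shape assumes the OpenAI chat-completion format that Cortex targets.

```python
# Minimal sketch of the quickstart flow over the REST API, assuming a server on
# localhost:39281 and a model that is already downloaded.
import requests

BASE = "http://localhost:39281"
MODEL = "tinyllama:gguf"  # payload used by the e2e test script in this diff

# Start the model
r = requests.post(f"{BASE}/v1/models/start", json={"model": MODEL})
print("start:", r.status_code)

# Ask a question (non-streaming for simplicity)
r = requests.post(
    f"{BASE}/v1/chat/completions",
    json={
        "model": MODEL,
        "messages": [{"role": "user", "content": "How are you today?"}],
        "max_tokens": 128,
        "stream": False,
    },
)
print(r.json()["choices"][0]["message"]["content"])

# Stop the model again
r = requests.post(f"{BASE}/v1/models/stop", json={"model": MODEL})
print("stop:", r.status_code)
```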
+## Stop a Cortex Server + +This command stops the Cortex.cpp API server at `localhost:39281` or whichever other port you used to start cortex. + ```sh @@ -145,31 +162,9 @@ This command starts the Cortex.cpp API server at `localhost:39281`. - - ## What's Next? -Now that Cortex.cpp is set up, here are the next steps to explore: +Now that Cortex is set up, you can continue on to any of the following sections: -1. Adjust the folder path and configuration using the [`.cortexrc`](/docs/architecture/cortexrc) file. -2. Explore the Cortex.cpp [data folder](/docs/architecture/data-folder) to understand how it stores data. -3. Learn about the structure of the [`model.yaml`](/docs/capabilities/models/model-yaml) file in Cortex.cpp. +- Adjust the folder path and configuration using the [`.cortexrc`](/docs/architecture/cortexrc) file. +- Explore the Cortex's [data folder](/docs/architecture/data-folder) to understand how data gets stored. +- Learn about the structure of the [`model.yaml`](/docs/capabilities/models/model-yaml) file in Cortex. diff --git a/docs/docs/requirements.mdx b/docs/docs/requirements.mdx index 7c13ab772..fef3915ff 100644 --- a/docs/docs/requirements.mdx +++ b/docs/docs/requirements.mdx @@ -7,10 +7,6 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import Admonition from '@theme/Admonition'; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - To run LLMs on-device or on-premise, Cortex has the following requirements: ## Hardware Requirements @@ -42,7 +38,7 @@ To run LLMs on-device or on-premise, Cortex has the following requirements: - 8GB for running up to 3B models (int4). - 16GB for running up to 7B models (int4). - 32GB for running up to 13B models (int4). - + We support DDR2 RAM as the minimum requirement but recommend using newer generations of RAM for improved performance. @@ -50,13 +46,13 @@ To run LLMs on-device or on-premise, Cortex has the following requirements: - 6GB can load the 3B model (int4) with `ngl` at 120 ~ full speed on CPU/ GPU. - 8GB can load the 7B model (int4) with `ngl` at 120 ~ full speed on CPU/ GPU. - 12GB can load the 13B model (int4) with `ngl` at 120 ~ full speed on CPU/ GPU. - + Having at least 6GB VRAM when using NVIDIA, AMD, or Intel Arc GPUs is recommended. - Having at least 10GB is recommended. - + The app is 1.02 MB, but models are usually 4GB+. @@ -116,7 +112,7 @@ To run LLMs on-device or on-premise, Cortex has the following requirements: - [NVIDIA driver](https://www.nvidia.com/Download/index.aspx) version 470.63.01 or higher. - [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) version 12.3 or higher. - + CUDA Toolkit dependencies will be installed when you install Cortex. @@ -137,4 +133,4 @@ To run LLMs on-device or on-premise, Cortex has the following requirements: - \ No newline at end of file + diff --git a/docs/docs/telemetry.mdx b/docs/docs/telemetry.mdx index 602449978..33c8abef8 100644 --- a/docs/docs/telemetry.mdx +++ b/docs/docs/telemetry.mdx @@ -7,10 +7,6 @@ slug: "telemetry" import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -:::warning -🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase. -::: - Cortex collects telemetry data to enhance our product. 
This data provides detailed insights into your usage, including crash reports for your Cortex or Jan applications. By analyzing this information, we can identify and fix bugs, optimize performance, and improve overall stability and user experience. :::info We do not collect any sensitive or personal information. @@ -22,7 +18,7 @@ cortex telemetry crash ## Dataflow To understand how our telemetry system operates and how data flows from your hardware into our system, please refer to the [Telemetry architecture](/docs/telemetry-architecture). ## Telemetry Metrics -The collected telemetry metrics for Cortex are divided into two main categories: +The collected telemetry metrics for Cortex are divided into two main categories: - `CrashReportResource` - `CrashReportPayload` @@ -115,4 +111,4 @@ This category focuses on metrics related to specific operations within Cortex. I :::info Learn more about Telemetry: - [Telemetry CLI command](/docs/cli/telemetry). -::: \ No newline at end of file +::: diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 15b29bada..dde3da69d 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -148,6 +148,7 @@ const sidebars: SidebarsConfig = { collapsed: true, items: [ { type: "doc", id: "engines/llamacpp", label: "llama.cpp" }, + { type: "doc", id: "engines/python-engine", label: "python engine" }, // { type: "doc", id: "engines/tensorrt-llm", label: "TensorRT-LLM" }, // { type: "doc", id: "engines/onnx", label: "ONNX" }, { diff --git a/docs/static/img/cortex_hub.png b/docs/static/img/cortex_hub.png new file mode 100644 index 000000000..1c478ff38 Binary files /dev/null and b/docs/static/img/cortex_hub.png differ diff --git a/docs/static/img/mitproxy_ex.png b/docs/static/img/mitproxy_ex.png new file mode 100644 index 000000000..57f86c616 Binary files /dev/null and b/docs/static/img/mitproxy_ex.png differ diff --git a/docs/static/img/social-card-old.jpg b/docs/static/img/social-card-old.jpg new file mode 100644 index 000000000..cad56cc6f Binary files /dev/null and b/docs/static/img/social-card-old.jpg differ diff --git a/docs/static/img/social-card.jpg b/docs/static/img/social-card.jpg index cad56cc6f..d2114111d 100644 Binary files a/docs/static/img/social-card.jpg and b/docs/static/img/social-card.jpg differ diff --git a/docs/static/img/sqlite_view_harlequin.png b/docs/static/img/sqlite_view_harlequin.png new file mode 100644 index 000000000..3e3293ebf Binary files /dev/null and b/docs/static/img/sqlite_view_harlequin.png differ diff --git a/engine/CMakeLists.txt b/engine/CMakeLists.txt index ae354500b..be8fe200d 100644 --- a/engine/CMakeLists.txt +++ b/engine/CMakeLists.txt @@ -62,13 +62,16 @@ add_compile_definitions(CORTEX_CPP_VERSION="${CORTEX_CPP_VERSION}") add_compile_definitions(CORTEX_CONFIG_FILE_PATH="${CORTEX_CONFIG_FILE_PATH}") option(CMAKE_BUILD_TEST "Enable testing" OFF) +option(CMAKE_BUILD_INJA_TEST "Enable inja example" OFF) if(CMAKE_BUILD_TEST) add_subdirectory(test) endif() add_subdirectory(cli) - +if(CMAKE_BUILD_INJA_TEST) + add_subdirectory(examples/inja) +endif() find_package(jsoncpp CONFIG REQUIRED) find_package(Drogon CONFIG REQUIRED) @@ -149,6 +152,7 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/extensions/python-engine/python_engine.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/dylib_path_manager.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils/process/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/extensions/remote-engine/remote_engine.cc diff --git a/engine/cli/CMakeLists.txt b/engine/cli/CMakeLists.txt index f9e8b673f..0f0b2b48d 100644 --- 
a/engine/cli/CMakeLists.txt +++ b/engine/cli/CMakeLists.txt @@ -95,6 +95,7 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/../utils/file_manager_utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/../utils/curl_utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/../utils/system_info_utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../utils/process/utils.cc ) target_link_libraries(${TARGET_NAME} PRIVATE CLI11::CLI11) diff --git a/engine/cli/commands/server_start_cmd.cc b/engine/cli/commands/server_start_cmd.cc index 37ca87fb6..c2ef779f1 100644 --- a/engine/cli/commands/server_start_cmd.cc +++ b/engine/cli/commands/server_start_cmd.cc @@ -3,6 +3,7 @@ #include "services/engine_service.h" #include "utils/cortex_utils.h" #include "utils/file_manager_utils.h" +#include "utils/process/utils.h" #if defined(_WIN32) || defined(_WIN64) #include "utils/widechar_conv.h" @@ -79,17 +80,17 @@ bool ServerStartCmd::Exec(const std::string& host, int port, mutable_cmds.push_back(L'\0'); // Create child process if (!CreateProcess( - NULL, // No module name (use command line) + NULL, // No module name (use command line) mutable_cmds - .data(), // Command line (replace with your actual executable) - NULL, // Process handle not inheritable - NULL, // Thread handle not inheritable - FALSE, // Set handle inheritance - 0, // No creation flags - NULL, // Use parent's environment block - NULL, // Use parent's starting directory - &si, // Pointer to STARTUPINFO structure - &pi)) // Pointer to PROCESS_INFORMATION structure + .data(), // Command line (replace with your actual executable) + NULL, // Process handle not inheritable + NULL, // Thread handle not inheritable + FALSE, // Set handle inheritance + CREATE_NO_WINDOW, // No new console + NULL, // Use parent's environment block + NULL, // Use parent's starting directory + &si, // Pointer to STARTUPINFO structure + &pi)) // Pointer to PROCESS_INFORMATION structure { std::cout << "Could not start server: " << GetLastError() << std::endl; return false; @@ -103,25 +104,26 @@ bool ServerStartCmd::Exec(const std::string& host, int port, } #else - // Unix-like system-specific code to fork a child process - pid_t pid = fork(); + std::vector commands; + // Some engines requires to add lib search path before process being created + auto download_srv = std::make_shared(); + auto dylib_path_mng = std::make_shared(); + auto db_srv = std::make_shared(); + EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath(); + std::string p = cortex_utils::GetCurrentPath() + "/" + exe; + commands.push_back(p); + commands.push_back("--config_file_path"); + commands.push_back(get_config_file_path()); + commands.push_back("--data_folder_path"); + commands.push_back(get_data_folder_path()); + commands.push_back("--loglevel"); + commands.push_back(log_level_); + auto pid = cortex::process::SpawnProcess(commands); if (pid < 0) { // Fork failed std::cerr << "Could not start server: " << std::endl; return false; - } else if (pid == 0) { - // Some engines requires to add lib search path before process being created - auto download_srv = std::make_shared(); - auto dylib_path_mng = std::make_shared(); - auto db_srv = std::make_shared(); - EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath(); - - std::string p = cortex_utils::GetCurrentPath() + "/" + exe; - execl(p.c_str(), exe.c_str(), "--start-server", "--config_file_path", - get_config_file_path().c_str(), "--data_folder_path", - get_data_folder_path().c_str(), "--loglevel", log_level_.c_str(), - (char*)0); } else { // Parent 
process if (!TryConnectToServer(host, port)) { diff --git a/engine/cli/utils/download_progress.cc b/engine/cli/utils/download_progress.cc index 37920e516..07f91adb4 100644 --- a/engine/cli/utils/download_progress.cc +++ b/engine/cli/utils/download_progress.cc @@ -17,10 +17,6 @@ namespace { std::string Repo2Engine(const std::string& r) { if (r == kLlamaRepo) { return kLlamaEngine; - } else if (r == kOnnxRepo) { - return kOnnxEngine; - } else if (r == kTrtLlmRepo) { - return kTrtLlmEngine; } return r; }; diff --git a/engine/config/gguf_parser.cc b/engine/config/gguf_parser.cc index acd2b9c76..c97c79c30 100644 --- a/engine/config/gguf_parser.cc +++ b/engine/config/gguf_parser.cc @@ -104,6 +104,10 @@ std::pair GGUFHandler::ReadString( uint64_t length; std::memcpy(&length, data_ + offset, sizeof(uint64_t)); + if (offset + 8 + length > file_size_) { + throw std::runtime_error("GGUF metadata string length exceeds file size.\n"); + } + std::string value(reinterpret_cast(data_ + offset + 8), length); return {8 + static_cast(length), value}; } @@ -274,6 +278,9 @@ size_t GGUFHandler::ReadArray(std::size_t offset, const std::string& key) { } array_offset += length; + if (offset + array_offset > file_size_) { + throw std::runtime_error("GGUF Parser Array exceeded file size.\n"); + } } if (array_values_string.size() > 0) metadata_array_string_[key] = array_values_string; diff --git a/engine/controllers/engines.cc b/engine/controllers/engines.cc index b8f8991ce..9c5836d3e 100644 --- a/engine/controllers/engines.cc +++ b/engine/controllers/engines.cc @@ -13,10 +13,6 @@ namespace { std::string NormalizeEngine(const std::string& engine) { if (engine == kLlamaEngine) { return kLlamaRepo; - } else if (engine == kOnnxEngine) { - return kOnnxRepo; - } else if (engine == kTrtLlmEngine) { - return kTrtLlmRepo; } return engine; }; diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc index 3cfa9c8c3..7cca9624d 100644 --- a/engine/controllers/models.cc +++ b/engine/controllers/models.cc @@ -58,6 +58,20 @@ void Models::PullModel(const HttpRequestPtr& req, model_handle, desired_model_id, desired_model_name); } else if (model_handle.find(":") != std::string::npos) { auto model_and_branch = string_utils::SplitBy(model_handle, ":"); + if (model_and_branch.size() == 3) { + auto mh = url_parser::Url{ + .protocol = "https", + .host = kHuggingFaceHost, + .pathParams = { + model_and_branch[0], + model_and_branch[1], + "resolve", + "main", + model_and_branch[2], + }}.ToFullPath(); + return model_service_->HandleDownloadUrlAsync(mh, desired_model_id, + desired_model_name); + } return model_service_->DownloadModelFromCortexsoAsync( model_and_branch[0], model_and_branch[1], desired_model_id); } @@ -284,9 +298,7 @@ void Models::GetModel(const HttpRequestPtr& req, fs::path(model_entry.value().path_to_model_yaml)) .string()); auto model_config = yaml_handler.GetModelConfig(); - if (model_config.engine == kOnnxEngine || - model_config.engine == kLlamaEngine || - model_config.engine == kTrtLlmEngine) { + if (model_config.engine == kLlamaEngine) { auto ret = model_config.ToJsonString(); auto resp = cortex_utils::CreateCortexHttpTextAsJsonResponse(ret); resp->setStatusCode(drogon::k200OK); @@ -366,9 +378,7 @@ void Models::UpdateModel(const HttpRequestPtr& req, yaml_handler.ModelConfigFromFile(yaml_fp.string()); config::ModelConfig model_config = yaml_handler.GetModelConfig(); std::string message; - if (model_config.engine == kOnnxEngine || - model_config.engine == kLlamaEngine || - model_config.engine == 
kTrtLlmEngine) { + if (model_config.engine == kLlamaEngine) { model_config.FromJson(json_body); yaml_handler.UpdateModelConfig(model_config); yaml_handler.WriteYamlFile(yaml_fp.string()); @@ -814,15 +824,34 @@ void Models::GetModelSources( resp->setStatusCode(k400BadRequest); callback(resp); } else { - auto const& info = res.value(); + auto& info = res.value(); Json::Value ret; Json::Value data(Json::arrayValue); - for (auto const& i : info) { - data.append(i); + for (auto& i : info) { + data.append(i.second.ToJson()); } ret["data"] = data; auto resp = cortex_utils::CreateCortexHttpJsonResponse(ret); resp->setStatusCode(k200OK); callback(resp); } +} + +void Models::GetModelSource( + const HttpRequestPtr& req, + std::function&& callback, + const std::string& src) { + auto res = model_src_svc_->GetModelSource(src); + if (res.has_error()) { + Json::Value ret; + ret["message"] = res.error(); + auto resp = cortex_utils::CreateCortexHttpJsonResponse(ret); + resp->setStatusCode(k400BadRequest); + callback(resp); + } else { + auto& info = res.value(); + auto resp = cortex_utils::CreateCortexHttpJsonResponse(info.ToJson()); + resp->setStatusCode(k200OK); + callback(resp); + } } \ No newline at end of file diff --git a/engine/controllers/models.h b/engine/controllers/models.h index 60053acdb..8f6af4280 100644 --- a/engine/controllers/models.h +++ b/engine/controllers/models.h @@ -43,6 +43,7 @@ class Models : public drogon::HttpController { ADD_METHOD_TO(Models::AddModelSource, "/v1/models/sources", Post); ADD_METHOD_TO(Models::DeleteModelSource, "/v1/models/sources", Delete); ADD_METHOD_TO(Models::GetModelSources, "/v1/models/sources", Get); + ADD_METHOD_TO(Models::GetModelSource, "/v1/models/sources/{src}", Get); METHOD_LIST_END explicit Models(std::shared_ptr db_service, @@ -106,6 +107,10 @@ class Models : public drogon::HttpController { void GetModelSources(const HttpRequestPtr& req, std::function&& callback); + void GetModelSource(const HttpRequestPtr& req, + std::function&& callback, + const std::string& src); + private: std::shared_ptr db_service_; std::shared_ptr model_service_; diff --git a/engine/cortex-cpp-deps/.gitignore b/engine/cortex-cpp-deps/.gitignore deleted file mode 100644 index d76b74e58..000000000 --- a/engine/cortex-cpp-deps/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -build -.DS_Store diff --git a/engine/cortex-cpp-deps/CMakeLists.txt b/engine/cortex-cpp-deps/CMakeLists.txt deleted file mode 100644 index b610d1c3d..000000000 --- a/engine/cortex-cpp-deps/CMakeLists.txt +++ /dev/null @@ -1,129 +0,0 @@ -cmake_minimum_required(VERSION 3.22) # Required for FetchContent - -project(MyProject) - -include(ExternalProject) - -# Define variables -if(UNIX AND NOT APPLE) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") -endif() -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(THIRD_PARTY_INSTALL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build-deps/_install) -#if(NOT THIRD_PARTY_INSTALL_PATH ) -# message(FATAL_ERROR "TRITON_THIRD_PARTY_INSTALL_PREFIX must be set") -#endif() # TRITON_THIRD_PARTY_INSTALL_PREFIX -# To force the find_package to look for .a inside self installed version -#set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -#set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -#set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) -# -# Add the external project -if(NOT APPLE) -ExternalProject_Add( - zlib - GIT_REPOSITORY https://github.com/madler/zlib.git - GIT_TAG v1.2.11 - CMAKE_ARGS - -DBUILD_SHARED_LIBS=OFF - -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} - -DCMAKE_BUILD_TYPE=RELEASE -) - -if(WIN32) - 
SET(ZLIB_DYN_PATH "${THIRD_PARTY_INSTALL_PATH}/lib/zlib.lib") - ExternalProject_Add_Step( - zlib remove_dynamic_lib - COMMAND ${CMAKE_COMMAND} -E rm -f ${ZLIB_DYN_PATH} - DEPENDEES install - ) -endif() -endif() - -ExternalProject_Add( - brotli - GIT_REPOSITORY https://github.com/google/brotli - GIT_TAG v1.1.0 - CMAKE_ARGS - -DBUILD_SHARED_LIBS=OFF - -DSHARE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/share - -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} - -DCMAKE_BUILD_TYPE=RELEASE -) - -ExternalProject_Add( - jsoncpp - GIT_REPOSITORY https://github.com/open-source-parsers/jsoncpp - GIT_TAG 1.9.5 - CMAKE_ARGS - -DBUILD_SHARED_LIBS=OFF - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} - -DCMAKE_BUILD_TYPE=RELEASE -) - -ExternalProject_Add( - c-ares - GIT_REPOSITORY https://github.com/c-ares/c-ares - GIT_TAG cares-1_26_0 - CMAKE_ARGS - -DCARES_SHARED=OFF - -DCARES_STATIC=ON - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} - -DCMAKE_BUILD_TYPE=RELEASE - -DCARES_STATIC_PIC=ON -) - -ExternalProject_Add( - drogon - GIT_REPOSITORY https://github.com/drogonframework/drogon - GIT_TAG v1.9.4 - CMAKE_ARGS - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DOPENSSL_USE_STATIC_LIBS=TRUE - -DBUILD_ORM=OFF - -DBUILD_YAML_CONFIG=OFF - -DBUILD_EXAMPLES=OFF - -DBUILD_CTL=OFF - -DCMAKE_EXPORT_COMPILE_COMMANDS=ON - -DBUILD_BROTLI=ON - -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH} - # -DCMAKE_FIND_ROOT_PATH=${THIRD_PARTY_INSTALL_PATH} # To set the dir (that will be used to force the look for .a) - -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} - -DCMAKE_BUILD_TYPE=RELEASE -) - -# Download and install GoogleTest -# ExternalProject_Add( -# gtest -# GIT_REPOSITORY https://github.com/google/googletest -# GIT_TAG v1.14.0 -# CMAKE_ARGS -# -Dgtest_force_shared_crt=ON -# -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH} -# -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} -# -DCMAKE_BUILD_TYPE=RELEASE -# ) - - -if(WIN32) - # Fix trantor cmakelists to link c-ares on Windows - set(TRANTOR_CMAKE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/../build-deps/cortex-cpp-deps/drogon-prefix/src/drogon/trantor/CMakeLists.txt) - ExternalProject_Add_Step(drogon trantor_custom_target - COMMAND ${CMAKE_COMMAND} -E echo add_definitions(-DCARES_STATICLIB) >> ${TRANTOR_CMAKE_FILE} - DEPENDEES download - ) -endif() - -include_directories(${THIRD_PARTY_INSTALL_PATH}/include) -link_directories(${THIRD_PARTY_INSTALL_PATH}/lib) -# Optionally link or add dependencies to your targets -add_dependencies(drogon c-ares jsoncpp brotli) - -if(NOT APPLE) - add_dependencies(drogon zlib) -endif() - -# target_link_libraries( ...) 
- diff --git a/engine/cortex-cpp-deps/README.md b/engine/cortex-cpp-deps/README.md deleted file mode 100644 index 9ba6aeabb..000000000 --- a/engine/cortex-cpp-deps/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# WIP - -# Apple -- ZLib in apple command line tool is a .tbd that is reusable in other system (no need static) -- OpenSSL has its own value of include - -Static drogon usage -You need to add back these library when linking -``` -target_link_libraries(${PROJECT_NAME} PRIVATE drogon brotlicommon brotlidec brotlienc cares resolv jsoncpp trantor OpenSSL::SSL OpenSSL::Crypto ZLIB::ZLIB) -``` diff --git a/engine/database/models.cc b/engine/database/models.cc index 67ff1a8c9..6bf891040 100644 --- a/engine/database/models.cc +++ b/engine/database/models.cc @@ -270,35 +270,63 @@ bool Models::HasModel(const std::string& identifier) const { } } -cpp::result, std::string> Models::GetModelSources() - const { +cpp::result, std::string> Models::GetModels( + const std::string& model_src) const { try { - std::vector sources; + std::vector res; SQLite::Statement query(db_, - "SELECT DISTINCT model_source FROM models WHERE " - "status = \"downloadable\""); - + "SELECT model_id, author_repo_id, branch_name, " + "path_to_model_yaml, model_alias, model_format, " + "model_source, status, engine, metadata FROM " + "models WHERE model_source = " + "? AND status = \"downloadable\""); + query.bind(1, model_src); while (query.executeStep()) { - sources.push_back(query.getColumn(0).getString()); + ModelEntry entry; + entry.model = query.getColumn(0).getString(); + entry.author_repo_id = query.getColumn(1).getString(); + entry.branch_name = query.getColumn(2).getString(); + entry.path_to_model_yaml = query.getColumn(3).getString(); + entry.model_alias = query.getColumn(4).getString(); + entry.model_format = query.getColumn(5).getString(); + entry.model_source = query.getColumn(6).getString(); + entry.status = StringToStatus(query.getColumn(7).getString()); + entry.engine = query.getColumn(8).getString(); + entry.metadata = query.getColumn(9).getString(); + res.push_back(entry); } - return sources; + return res; } catch (const std::exception& e) { return cpp::fail(e.what()); } } -cpp::result, std::string> Models::GetModels( - const std::string& model_src) const { +cpp::result, std::string> Models::GetModelSources() + const { try { - std::vector ids; - SQLite::Statement query(db_, - "SELECT model_id FROM models WHERE model_source = " - "? 
AND status = \"downloadable\""); - query.bind(1, model_src); + std::vector res; + SQLite::Statement query( + db_, + "SELECT model_id, author_repo_id, branch_name, " + "path_to_model_yaml, model_alias, model_format, " + "model_source, status, engine, metadata FROM models " + "WHERE model_source != \"\" AND (status = \"downloaded\" OR status = " + "\"downloadable\")"); while (query.executeStep()) { - ids.push_back(query.getColumn(0).getString()); + ModelEntry entry; + entry.model = query.getColumn(0).getString(); + entry.author_repo_id = query.getColumn(1).getString(); + entry.branch_name = query.getColumn(2).getString(); + entry.path_to_model_yaml = query.getColumn(3).getString(); + entry.model_alias = query.getColumn(4).getString(); + entry.model_format = query.getColumn(5).getString(); + entry.model_source = query.getColumn(6).getString(); + entry.status = StringToStatus(query.getColumn(7).getString()); + entry.engine = query.getColumn(8).getString(); + entry.metadata = query.getColumn(9).getString(); + res.push_back(entry); } - return ids; + return res; } catch (const std::exception& e) { return cpp::fail(e.what()); } diff --git a/engine/database/models.h b/engine/database/models.h index b0c4bc258..b0059dbea 100644 --- a/engine/database/models.h +++ b/engine/database/models.h @@ -10,7 +10,6 @@ namespace cortex::db { enum class ModelStatus { Remote, Downloaded, Downloadable }; - struct ModelEntry { std::string model; std::string author_repo_id; @@ -57,9 +56,9 @@ class Models { cpp::result, std::string> FindRelatedModel( const std::string& identifier) const; bool HasModel(const std::string& identifier) const; - cpp::result, std::string> GetModelSources() const; - cpp::result, std::string> GetModels( + cpp::result, std::string> GetModels( const std::string& model_src) const; + cpp::result, std::string> GetModelSources() const; }; } // namespace cortex::db diff --git a/engine/e2e-test/local_test.py b/engine/e2e-test/local_test.py new file mode 100644 index 000000000..be0905e66 --- /dev/null +++ b/engine/e2e-test/local_test.py @@ -0,0 +1,284 @@ +import requests +import threading +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +import json +import platform +import asyncio +import argparse +import websockets + +# Define a list of request configurations +model_id = "tinyllama:gguf" + + +def make_request(config): + try: + method = config["method"].lower() + url = config["url"] + headers = config.get("headers", {}) + data = json.dumps(config.get("data")) if "data" in config else None + + response = requests.request(method, url, headers=headers, data=data) + return response.status_code, response.text + except requests.RequestException as e: + return None, str(e) + + +def get_setup_configs(host_port): + return [ + { + "method": "PATCH", + "url": "http://" + host_port + "/v1/configs", + "headers": {"Content-Type": "application/json"}, + "data": { + "cors": True, + "allowed_origins": [ + "http://localhost:39281", + "http://127.0.0.1:39281", + "http://0.0.0.0:39281", + ], + "proxy_username": "", + "proxy_password": "", + "proxy_url": "", + "verify_proxy_ssl": False, + "verify_proxy_host_ssl": False, + "verify_peer_ssl": False, + "verify_host_ssl": False, + "no_proxy": "localhost", + "huggingface_token": "", + }, + }, + { + "method": "POST", + "url": "http://" + host_port + "/v1/engines/llama-cpp/install", + "headers": {"Accept": "application/json"}, + }, + { + "method": "GET", + "url": "http://" + host_port + "/v1/engines/llama-cpp", + "headers": {"Accept": 
"application/json"}, + }, + { + "method": "GET", + "url": "http://" + host_port + "/v1/engines/llama-cpp/releases", + "headers": {"Accept": "application/json"}, + }, + { + "method": "GET", + "url": "http://" + host_port + "/v1/engines/llama-cpp/releases/latest", + "headers": {"Accept": "application/json"}, + }, + { + "method": "GET", + "url": "http://" + host_port + "/v1/engines/llama-cpp/default", + "headers": {"Accept": "application/json"}, + }, + { + "method": "POST", + "url": "http://" + host_port + "/v1/models/pull", + "headers": {"Content-Type": "application/json"}, + "data": {"model": "tinyllama:gguf"}, + }, + { + "method": "POST", + "url": "http://" + host_port + "/v1/engines/llama-cpp/load", + "headers": {"Content-Type": "application/json"}, + }, + { + "method": "POST", + "url": "http://" + host_port + "/v1/models/start", + "headers": {"Content-Type": "application/json"}, + "data": {"model": "tinyllama:gguf"}, + }, + { + "method": "POST", + "url": "http://" + host_port + "/v1/chat/completions", + "headers": {"Content-Type": "application/json"}, + "data": { + "model": "tinyllama:gguf", + "stream": True, + "messages": [{"content": "How are you today?", "role": "user"}], + "max_tokens": 256, + }, + }, + ] + + +def get_teardown_configs(host_port): + return [ + { + "method": "POST", + "url": "http://" + host_port + "/v1/models/stop", + "headers": {"Content-Type": "application/json"}, + "data": {"model": "tinyllama:gguf"}, + }, + { + "method": "DELETE", + "url": "http://" + host_port + "/v1/engines/llama-cpp/load", + "headers": {"Content-Type": "application/json"}, + }, + { + "method": "DELETE", + "url": "http://" + host_port + "/v1/engines/llama-cpp/install", + "headers": {"Accept": "application/json"}, + }, + { + "method": "DELETE", + "url": "http://" + host_port + "/v1/models/" + model_id, + "headers": {"Accept": "application/json"}, + }, + ] + + +async def setup_env(host_port): + for config in get_setup_configs(host_port): + status_code, response_text = make_request(config) + if config["method"] == "POST" and ( + "/v1/engines/install" in config["url"] or "/v1/models/pull" in config["url"] + ): + await wait_for_websocket_download_success_event(timeout=None) + + if status_code: + print( + f"setup_env: {config['url']} Status Code {status_code} - Response {response_text}" + ) + else: + print(f"setup_env: {config['url']} Error - {response_text}") + + +def teardown(host_port): + for config in get_teardown_configs(host_port): + status_code, response_text = make_request(config) + + if status_code: + print(f"teardown: {config['url']} Status Code {status_code}") + else: + print(f"teardown: {config['url']} Error - {response_text}") + + +def get_request_configs(host_port): + return [ + { + "method": "GET", + "url": "http://" + host_port + "/v1/configs", + "headers": {"Accept": "application/json"}, + }, + { + "method": "GET", + "url": "http://" + host_port + "/v1/models", + "headers": {"Accept": "application/json"}, + }, + { + "method": "GET", + "url": "http://" + host_port + "/v1/hardware", + "headers": {"Accept": "application/json"}, + }, + { + "method": "GET", + "url": "http://" + host_port + "/healthz", + "headers": {"Accept": "application/json"}, + }, + { + "method": "GET", + "url": "http://" + host_port + "/v1/threads", + "headers": {"Accept": "application/json"}, + }, + { + "method": "GET", + "url": "http://" + host_port + "/v1/threads", + "headers": {"Accept": "application/json"}, + }, + ] + + +def worker(host_port, thread_id, num_requests): + request_configs = 
get_request_configs(host_port) + for i in range(num_requests): + config = request_configs[i % len(request_configs)] + status_code, response_text = make_request(config) + if status_code: + print( + f"Thread {thread_id}, Request {i+1}: {config['method']} {config['url']} - Status Code {status_code}" + ) + else: + print( + f"Thread {thread_id}, Request {i+1}: {config['method']} {config['url']} - Error - {response_text}" + ) + + +async def wait_for_websocket_download_success_event(timeout: float = 30): + if platform.system() == "Windows": + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) + async with websockets.connect("ws://" + host_port + "/events") as websocket: + try: + # Using wait_for instead of timeout context manager + async def receive_until_success(): + while True: + message = await websocket.recv() + try: + event = json.loads(message) + if event.get("type") == "DownloadSuccess": + return event + except json.JSONDecodeError: + continue + + return await asyncio.wait_for(receive_until_success(), timeout) + + except asyncio.TimeoutError: + raise TimeoutError("Timeout waiting for DownloadSuccess event") + + +def run_test(host_port, num_threads, requests_per_thread): + start_time = time.time() + + with ThreadPoolExecutor(max_workers=num_threads) as executor: + futures = [ + executor.submit(worker, host_port, i, requests_per_thread) + for i in range(num_threads) + ] + + for future in as_completed(futures): + future.result() + + end_time = time.time() + total_requests = num_threads * requests_per_thread + total_time = end_time - start_time + + print(f"\nTest completed:") + print(f"Total threads: {num_threads}") + print(f"Requests per thread: {requests_per_thread}") + print(f"Total requests: {total_requests}") + print(f"Total time: {total_time:.2f} seconds") + print(f"Requests per second: {total_requests / total_time:.2f}") + + +def parse_argument(): + parser = argparse.ArgumentParser(description="Local test") + parser.add_argument("--host", type=str, default="127.0.0.1", help="Server host") + parser.add_argument("--port", type=int, help="Server port", required=True) + parser.add_argument( + "--num_threads", type=str, default=5, help="Number of threads to send requests" + ) + parser.add_argument( + "--requests_per_thread", + type=str, + default=10, + help="Number of requests per thread", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_argument() + host_port = args.host + ":" + str(args.port) + print("Server start: " + host_port) + teardown(host_port) + + loop = asyncio.get_event_loop() + loop.run_until_complete(setup_env(host_port)) + + run_test(host_port, args.num_threads, args.requests_per_thread) + + teardown(host_port) diff --git a/engine/e2e-test/test_cli_engine_get.py b/engine/e2e-test/test_cli_engine_get.py index c26bedfae..1eb32388c 100644 --- a/engine/e2e-test/test_cli_engine_get.py +++ b/engine/e2e-test/test_cli_engine_get.py @@ -18,7 +18,7 @@ def setup_and_teardown(self): # Teardown stop_server() - @pytest.mark.skipif(platform.system() != "Windows", reason="Windows-specific test") + @pytest.mark.skipif(reason="Ignore tensorrt-llm test") def test_engines_get_tensorrt_llm_should_not_be_incompatible(self): exit_code, output, error = run("Get engine", ["engines", "get", "tensorrt-llm"]) assert exit_code == 0, f"Get engine failed with error: {error}" @@ -26,7 +26,7 @@ def test_engines_get_tensorrt_llm_should_not_be_incompatible(self): "Incompatible" not in output ), "tensorrt-llm should be Ready or Not Installed on 
Windows" - @pytest.mark.skipif(platform.system() != "Windows", reason="Windows-specific test") + @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_get_onnx_should_not_be_incompatible(self): exit_code, output, error = run("Get engine", ["engines", "get", "onnxruntime"]) assert exit_code == 0, f"Get engine failed with error: {error}" @@ -41,7 +41,7 @@ def test_engines_get_llamacpp_should_not_be_incompatible(self): "Incompatible" not in output ), "llama-cpp should be compatible for Windows, MacOs and Linux" - @pytest.mark.skipif(platform.system() != "Darwin", reason="macOS-specific test") + @pytest.mark.skipif(reason="Ignore tensorrt-llm test") def test_engines_get_tensorrt_llm_should_be_incompatible_on_macos(self): exit_code, output, error = run("Get engine", ["engines", "get", "tensorrt-llm"]) assert exit_code == 0, f"Get engine failed with error: {error}" @@ -49,7 +49,7 @@ def test_engines_get_tensorrt_llm_should_be_incompatible_on_macos(self): "is not supported on" in output ), "tensorrt-llm should be Incompatible on MacOS" - @pytest.mark.skipif(platform.system() != "Darwin", reason="macOS-specific test") + @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_get_onnx_should_be_incompatible_on_macos(self): exit_code, output, error = run("Get engine", ["engines", "get", "onnxruntime"]) assert exit_code == 0, f"Get engine failed with error: {error}" @@ -57,7 +57,7 @@ def test_engines_get_onnx_should_be_incompatible_on_macos(self): "is not supported on" in output ), "onnxruntime should be Incompatible on MacOS" - @pytest.mark.skipif(platform.system() != "Linux", reason="Linux-specific test") + @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_get_onnx_should_be_incompatible_on_linux(self): exit_code, output, error = run("Get engine", ["engines", "get", "onnxruntime"]) print(output) diff --git a/engine/e2e-test/test_cli_engine_install.py b/engine/e2e-test/test_cli_engine_install.py index dbbc16e8a..aeeabd64d 100644 --- a/engine/e2e-test/test_cli_engine_install.py +++ b/engine/e2e-test/test_cli_engine_install.py @@ -31,7 +31,7 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(platform.system() != "Darwin", reason="macOS-specific test") + @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, error = run( "Install Engine", ["engines", "install", "onnxruntime"] @@ -39,7 +39,7 @@ def test_engines_install_onnx_on_macos_should_be_failed(self): assert "is not supported on" in output, "Should display error message" assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(platform.system() != "Darwin", reason="macOS-specific test") + @pytest.mark.skipif(reason="Ignore tensorrt-llm test") def test_engines_install_onnx_on_tensorrt_should_be_failed(self): exit_code, output, error = run( "Install Engine", ["engines", "install", "tensorrt-llm"] diff --git a/engine/e2e-test/test_cli_engine_install_nightly.py b/engine/e2e-test/test_cli_engine_install_nightly.py index bbb56ac9b..80490ab55 100644 --- a/engine/e2e-test/test_cli_engine_install_nightly.py +++ b/engine/e2e-test/test_cli_engine_install_nightly.py @@ -31,7 +31,7 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" - 
@pytest.mark.skipif(platform.system() != "Darwin", reason="macOS-specific test") + @pytest.mark.skipif(reason="Ignore onnx-runtime test") def test_engines_install_onnx_on_macos_should_be_failed(self): exit_code, output, error = run( "Install Engine", ["engines", "install", "onnxruntime"] @@ -39,7 +39,7 @@ def test_engines_install_onnx_on_macos_should_be_failed(self): assert "is not supported on" in output, "Should display error message" assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(platform.system() != "Darwin", reason="macOS-specific test") + @pytest.mark.skipif(reason="Ignore tensorrt-llm test") def test_engines_install_onnx_on_tensorrt_should_be_failed(self): exit_code, output, error = run( "Install Engine", ["engines", "install", "tensorrt-llm"] diff --git a/engine/examples/inja/CMakeLists.txt b/engine/examples/inja/CMakeLists.txt new file mode 100644 index 000000000..e89853cb7 --- /dev/null +++ b/engine/examples/inja/CMakeLists.txt @@ -0,0 +1,66 @@ +project(inja-test C CXX) + +include(CheckIncludeFileCXX) + +check_include_file_cxx(any HAS_ANY) +check_include_file_cxx(string_view HAS_STRING_VIEW) +check_include_file_cxx(coroutine HAS_COROUTINE) +if(HAS_ANY + AND HAS_STRING_VIEW + AND HAS_COROUTINE) + set(CMAKE_CXX_STANDARD 20) +elseif(HAS_ANY AND HAS_STRING_VIEW) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 14) +endif() + +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(OPENSSL_USE_STATIC_LIBS TRUE) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if(MSVC) + add_compile_options( + $<$:/MT> #---------| + $<$:/MTd> #---|-- Statically link the runtime libraries + $<$:/MT> #--| + ) + + add_compile_options(/utf-8) + add_definitions(-DUNICODE -D_UNICODE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DUNICODE /D_UNICODE") +endif() + +find_package(jsoncpp CONFIG REQUIRED) +find_package(Trantor CONFIG REQUIRED) +set(TARGET_NAME ${PROJECT_NAME}) + +add_executable(${TARGET_NAME} main.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../extensions/template_renderer.cc + ) + +target_link_libraries(${TARGET_NAME} PRIVATE JsonCpp::JsonCpp) +target_link_libraries(${TARGET_NAME} PRIVATE Trantor::Trantor) + +# ############################################################################## + +if(CMAKE_CXX_STANDARD LESS 17) + # With C++14, use boost to support any and std::string_view + message(STATUS "use c++14") + find_package(Boost 1.61.0 REQUIRED) + target_include_directories(${TARGET_NAME} PRIVATE ${Boost_INCLUDE_DIRS}) +elseif(CMAKE_CXX_STANDARD LESS 20) + message(STATUS "use c++17") +else() + message(STATUS "use c++20") +endif() + +target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
+target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + +set_target_properties(${TARGET_NAME} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY_DEBUG ${CMAKE_BINARY_DIR} + RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR} + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} +) diff --git a/engine/examples/inja/README.md b/engine/examples/inja/README.md new file mode 100644 index 000000000..bd0c33370 --- /dev/null +++ b/engine/examples/inja/README.md @@ -0,0 +1,5 @@ +Build cortex with `CMAKE_BUILD_INJA_TEST=ON` + +```bash title="Run inja-test" +./inja-test --template path_to_template_file --data path_to_data_file +``` \ No newline at end of file diff --git a/engine/examples/inja/data.txt b/engine/examples/inja/data.txt new file mode 100644 index 000000000..58ae7f78a --- /dev/null +++ b/engine/examples/inja/data.txt @@ -0,0 +1,17 @@ +{ + "choices": [ + { + "delta": { + "content": " questions" + }, + "finish_reason": null, + "index": 0 + } + ], + "created": 1735372587, + "id": "", + "model": "o1-preview", + "object": "chat.completion.chunk", + "stream": true, + "system_fingerprint": "fp_1ddf0263de" +} \ No newline at end of file diff --git a/engine/examples/inja/main.cc b/engine/examples/inja/main.cc new file mode 100644 index 000000000..4e0e9b1fe --- /dev/null +++ b/engine/examples/inja/main.cc @@ -0,0 +1,45 @@ +#include "extensions/template_renderer.h" +#include "utils/json_helper.h" + +void print_help() { + std::cout << "Usage: \ninja-test [options]\n\n"; + std::cout << "Options:\n"; + std::cout << " --template Path to template file\n"; + std::cout << " --data Path to data file\n"; + + exit(0); +} + +int main(int argc, char* argv[]) { + std::filesystem::path template_path; + std::filesystem::path data_path; + for (int i = 0; i < argc; i++) { + if (strcmp(argv[i], "--template") == 0) { + template_path = argv[i + 1]; + } else if (strcmp(argv[i], "--data") == 0) { + data_path = argv[i + 1]; + } else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { + print_help(); + } + } + + auto get_str = [](const std::filesystem::path& p) { + std::ifstream ifs(p); + std::stringstream buffer; + buffer << ifs.rdbuf(); + return buffer.str(); + }; + + std::string tpl = get_str(template_path); + std::string message = get_str(data_path); + auto data = json_helper::ParseJsonString(message); + + extensions::TemplateRenderer rdr; + auto res = rdr.Render(tpl, data); + std::cout << std::endl; + std::cout << "Result: " << std::endl; + std::cout << res << std::endl; + std::cout << std::endl; + + return 0; +} \ No newline at end of file diff --git a/engine/examples/inja/template.txt b/engine/examples/inja/template.txt new file mode 100644 index 000000000..421f03633 --- /dev/null +++ b/engine/examples/inja/template.txt @@ -0,0 +1,10 @@ +{ + {% set first = true %} + {% for key, value in input_request %} + {% if key == "choices" or key == "created" or key == "model" or key == "service_tier" or key == "system_fingerprint" or key == "stream" or key == "object" or key == "usage" %} + {% if not first %},{% endif %} + "{{ key }}": {{ tojson(value) }} + {% set first = false %} + {% endif %} + {% endfor %} +} diff --git a/engine/extensions/python-engine/python_engine.cc b/engine/extensions/python-engine/python_engine.cc index f9557d70b..d34f75c08 100644 --- a/engine/extensions/python-engine/python_engine.cc +++ b/engine/extensions/python-engine/python_engine.cc @@ -3,19 +3,67 @@ #include #include #include + namespace python_engine { +namespace { constexpr const int k200OK = 200; constexpr const int 
k400BadRequest = 400; constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; constexpr const int kFileLoggerOption = 0; +size_t StreamWriteCallback(char* ptr, size_t size, size_t nmemb, + void* userdata) { + auto* context = static_cast(userdata); + std::string chunk(ptr, size * nmemb); + + context->buffer += chunk; + + // Process complete lines + size_t pos; + while ((pos = context->buffer.find('\n')) != std::string::npos) { + std::string line = context->buffer.substr(0, pos); + context->buffer = context->buffer.substr(pos + 1); + LOG_DEBUG << "line: " << line; + + // Skip empty lines + if (line.empty() || line == "\r") + continue; + + if (line == "data: [DONE]") { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*context->callback)(std::move(status), Json::Value()); + break; + } + + // Parse the JSON + Json::Value chunk_json; + chunk_json["data"] = line + "\n\n"; + Json::Reader reader; + + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*context->callback)(std::move(status), std::move(chunk_json)); + } + + return size * nmemb; +} + static size_t WriteCallback(char* ptr, size_t size, size_t nmemb, std::string* data) { data->append(ptr, size * nmemb); return size * nmemb; } +} // namespace + PythonEngine::PythonEngine() : q_(4 /*n_parallel*/, "python_engine") {} PythonEngine::~PythonEngine() { @@ -31,118 +79,17 @@ config::PythonModelConfig* PythonEngine::GetModelConfig( } return nullptr; } -std::string constructWindowsCommandLine(const std::vector& args) { - std::string cmdLine; - for (const auto& arg : args) { - // Simple escaping for Windows command line - std::string escapedArg = arg; - if (escapedArg.find(' ') != std::string::npos) { - // Wrap in quotes and escape existing quotes - for (char& c : escapedArg) { - if (c == '"') - c = '\\'; - } - escapedArg = "\"" + escapedArg + "\""; - } - cmdLine += escapedArg + " "; - } - return cmdLine; -} -std::vector convertToArgv(const std::vector& args) { - std::vector argv; - for (const auto& arg : args) { - argv.push_back(const_cast(arg.c_str())); - } - argv.push_back(nullptr); - return argv; -} - -pid_t PythonEngine::SpawnProcess(const std::string& model, - const std::vector& command) { - try { -#ifdef _WIN32 - // Windows process creation - STARTUPINFOA si = {0}; - PROCESS_INFORMATION pi = {0}; - si.cb = sizeof(si); - - // Construct command line - std::string cmdLine = constructWindowsCommandLine(command); - - // Convert string to char* for Windows API - char commandBuffer[4096]; - strncpy_s(commandBuffer, cmdLine.c_str(), sizeof(commandBuffer)); - - if (!CreateProcessA(NULL, // lpApplicationName - commandBuffer, // lpCommandLine - NULL, // lpProcessAttributes - NULL, // lpThreadAttributes - FALSE, // bInheritHandles - 0, // dwCreationFlags - NULL, // lpEnvironment - NULL, // lpCurrentDirectory - &si, // lpStartupInfo - &pi // lpProcessInformation - )) { - throw std::runtime_error("Failed to create process on Windows"); - } - - // Store the process ID - pid_t pid = pi.dwProcessId; - processMap[model] = pid; - - // Close handles to avoid resource leaks - CloseHandle(pi.hProcess); - CloseHandle(pi.hThread); - - return pid; - -#elif __APPLE__ || __linux__ - // POSIX process creation - pid_t pid; - - // Convert command vector to char*[] - std::vector argv = convertToArgv(command); - // for (auto c : command) { - // std::cout 
<< c << " " << std::endl; - // } - - // Use posix_spawn for cross-platform compatibility - int spawn_result = posix_spawn(&pid, // pid output - command[0].c_str(), // executable path - NULL, // file actions - NULL, // spawn attributes - argv.data(), // argument vector - NULL // environment (inherit) - ); - - if (spawn_result != 0) { - throw std::runtime_error("Failed to spawn process"); - } - - // Store the process ID - processMap[model] = pid; - return pid; - -#else -#error Unsupported platform -#endif - } catch (const std::exception& e) { - LOG_ERROR << "Process spawning error: " << e.what(); - return -1; - } -} bool PythonEngine::TerminateModelProcess(const std::string& model) { - auto it = processMap.find(model); - if (it == processMap.end()) { + auto it = process_map_.find(model); + if (it == process_map_.end()) { LOG_ERROR << "No process found for model: " << model << ", removing from list running models."; models_.erase(model); return false; } -#ifdef _WIN32 +#if defined(_WIN32) HANDLE hProcess = OpenProcess(PROCESS_TERMINATE, FALSE, it->second); if (hProcess == NULL) { LOG_ERROR << "Failed to open process"; @@ -153,20 +100,21 @@ bool PythonEngine::TerminateModelProcess(const std::string& model) { CloseHandle(hProcess); if (terminated) { - processMap.erase(it); + process_map_.erase(it); return true; } -#elif __APPLE__ || __linux__ +#elif defined(__APPLE__) || defined(__linux__) int result = kill(it->second, SIGTERM); if (result == 0) { - processMap.erase(it); + process_map_.erase(it); return true; } #endif return false; } + CurlResponse PythonEngine::MakeGetRequest(const std::string& model, const std::string& path) { auto const& config = models_[model]; @@ -182,6 +130,7 @@ CurlResponse PythonEngine::MakeGetRequest(const std::string& model, } return response; } + CurlResponse PythonEngine::MakeDeleteRequest(const std::string& model, const std::string& path) { auto const& config = models_[model]; @@ -304,7 +253,7 @@ void PythonEngine::LoadModel( auto data_folder_path = std::filesystem::path(model_folder_path) / std::filesystem::path("venv"); try { -#ifdef _WIN32 +#if defined(_WIN32) auto executable = std::filesystem::path(data_folder_path) / std::filesystem::path("Scripts"); #else @@ -356,7 +305,8 @@ void PythonEngine::LoadModel( // Add the parsed arguments to the command command.insert(command.end(), args.begin(), args.end()); - pid = SpawnProcess(model, command); + pid = cortex::process::SpawnProcess(command); + process_map_[model] = pid; if (pid == -1) { std::unique_lock lock(models_mutex_); if (models_.find(model) != models_.end()) { @@ -416,16 +366,16 @@ void PythonEngine::UnloadModel( return; } - const std::string& model = (*json_body)["model"].asString(); + auto model = (*json_body)["model"].asString(); { - std::unique_lock lock(models_mutex_); if (TerminateModelProcess(model)) { + std::unique_lock lock(models_mutex_); models_.erase(model); } else { Json::Value error; error["error"] = "Fail to terminate process with id: " + - std::to_string(processMap[model]); + std::to_string(process_map_[model]); Json::Value status; status["is_done"] = true; status["has_error"] = true; @@ -448,7 +398,9 @@ void PythonEngine::UnloadModel( void PythonEngine::HandleChatCompletion( std::shared_ptr json_body, - std::function&& callback) {} + std::function&& callback) { + LOG_WARN << "Does not support yet!"; +} CurlResponse PythonEngine::MakeStreamPostRequest( const std::string& model, const std::string& path, const std::string& body, @@ -509,7 +461,7 @@ CurlResponse 
PythonEngine::MakeStreamPostRequest( void PythonEngine::HandleInference( std::shared_ptr json_body, std::function&& callback) { - if (!json_body->isMember("model")) { + if (json_body && !json_body->isMember("model")) { Json::Value error; error["error"] = "Missing required field: model is required!"; Json::Value status; @@ -520,14 +472,14 @@ void PythonEngine::HandleInference( callback(std::move(status), std::move(error)); return; } + std::string method = "post"; std::string path = "/inference"; - std::string transform_request = - (*json_body).get("transform_request", "").asString(); - std::string transform_response = + auto transform_request = (*json_body).get("transform_request", "").asString(); + auto transform_response = (*json_body).get("transform_response", "").asString(); - std::string model = (*json_body)["model"].asString(); - Json::Value body = (*json_body)["body"]; + auto model = (*json_body)["model"].asString(); + auto& body = (*json_body)["body"]; if (models_.find(model) == models_.end()) { Json::Value error; @@ -680,10 +632,13 @@ void PythonEngine::HandleInference( callback(std::move(status), std::move(response_json)); } } + Json::Value PythonEngine::GetRemoteModels() { return Json::Value(); } + void PythonEngine::StopInferencing(const std::string& model_id) {} + void PythonEngine::HandleRouteRequest( std::shared_ptr json_body, std::function&& callback) { @@ -700,14 +655,13 @@ void PythonEngine::HandleRouteRequest( callback(std::move(status), std::move(error)); return; } - std::string method = (*json_body)["method"].asString(); - std::string path = (*json_body)["path"].asString(); - std::string transform_request = - (*json_body).get("transform_request", "").asString(); - std::string transform_response = + auto method = (*json_body)["method"].asString(); + auto path = (*json_body)["path"].asString(); + auto transform_request = (*json_body).get("transform_request", "").asString(); + auto transform_response = (*json_body).get("transform_response", "").asString(); - std::string model = (*json_body)["model"].asString(); - Json::Value body = (*json_body)["body"]; + auto model = (*json_body)["model"].asString(); + auto& body = (*json_body)["body"]; if (models_.find(model) == models_.end()) { Json::Value error; @@ -864,20 +818,34 @@ void PythonEngine::GetModelStatus( callback(std::move(status), std::move(error)); return; } + auto model = json_body->get("model", "").asString(); auto model_config = models_[model]; auto health_endpoint = model_config.heath_check; + auto pid = process_map_[model]; + auto is_process_live = process_status_utils::IsProcessRunning(pid); auto response_health = MakeGetRequest(model, health_endpoint.path); - if (response_health.error) { + if (response_health.error && is_process_live) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = k200OK; + Json::Value message; + message["message"] = "model '"+model+"' is loading"; + callback(std::move(status), std::move(message)); + return; + } + else if(response_health.error && !is_process_live){ Json::Value status; status["is_done"] = true; status["has_error"] = true; status["is_stream"] = false; status["status_code"] = k400BadRequest; - Json::Value error; - error["error"] = response_health.error_message; - callback(std::move(status), std::move(error)); + Json::Value message; + message["message"] = response_health.error_message; + callback(std::move(status), std::move(message)); return; } @@ -947,9 +915,4 @@ void 
PythonEngine::Unload(EngineUnloadOption opts) { } }; -// extern "C" { -// EngineI* get_engine() { -// return new PythonEngine(); -// } -// } } // namespace python_engine \ No newline at end of file diff --git a/engine/extensions/python-engine/python_engine.h b/engine/extensions/python-engine/python_engine.h index 979ba1fd8..70a9b9829 100644 --- a/engine/extensions/python-engine/python_engine.h +++ b/engine/extensions/python-engine/python_engine.h @@ -14,19 +14,10 @@ #include "extensions/template_renderer.h" #include "utils/file_logger.h" #include "utils/file_manager_utils.h" - +#include "utils/process_status_utils.h" #include "utils/curl_utils.h" -#ifdef _WIN32 -#include -#include -using pid_t = DWORD; -#elif __APPLE__ || __linux__ -#include -#include -#include -#include -#include -#endif +#include "utils/process/utils.h" + // Helper for CURL response namespace python_engine { struct StreamContext { @@ -34,50 +25,6 @@ struct StreamContext { std::string buffer; }; -static size_t StreamWriteCallback(char* ptr, size_t size, size_t nmemb, - void* userdata) { - auto* context = static_cast(userdata); - std::string chunk(ptr, size * nmemb); - - context->buffer += chunk; - - // Process complete lines - size_t pos; - while ((pos = context->buffer.find('\n')) != std::string::npos) { - std::string line = context->buffer.substr(0, pos); - context->buffer = context->buffer.substr(pos + 1); - LOG_DEBUG << "line: "<callback)(std::move(status), Json::Value()); - break; - } - - // Parse the JSON - Json::Value chunk_json; - chunk_json["data"] = line + "\n\n"; - Json::Reader reader; - - Json::Value status; - status["is_done"] = false; - status["has_error"] = false; - status["is_stream"] = true; - status["status_code"] = 200; - (*context->callback)(std::move(status), std::move(chunk_json)); - } - - return size * nmemb; -} - struct CurlResponse { std::string body; bool error{false}; @@ -93,10 +40,9 @@ class PythonEngine : public EngineI { std::unordered_map models_; extensions::TemplateRenderer renderer_; std::unique_ptr async_file_logger_; - std::unordered_map processMap; + std::unordered_map process_map_; trantor::ConcurrentTaskQueue q_; - // Helper functions CurlResponse MakePostRequest(const std::string& model, const std::string& path, @@ -111,8 +57,6 @@ class PythonEngine : public EngineI { const std::function& callback); // Process manager functions - pid_t SpawnProcess(const std::string& model, - const std::vector& command); bool TerminateModelProcess(const std::string& model); // Internal model management diff --git a/engine/main.cc b/engine/main.cc index 7c3075736..122ea094a 100644 --- a/engine/main.cc +++ b/engine/main.cc @@ -1,6 +1,8 @@ #include #include +#include #include +#include #include "controllers/assistants.h" #include "controllers/configs.h" #include "controllers/engines.h" @@ -53,16 +55,28 @@ #error "Unsupported platform!" #endif +// Global var to signal drogon to shutdown +volatile bool shutdown_signal; + void RunServer(std::optional host, std::optional port, bool ignore_cout) { #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) - signal(SIGINT, SIG_IGN); + auto signal_handler = +[](int sig) -> void { + std::cout << "\rCaught interrupt signal, shutting down\n"; + shutdown_signal = true; + }; + signal(SIGINT, signal_handler); #elif defined(_WIN32) auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? 
true : false; + if (ctrl_type == CTRL_C_EVENT) { + std::cout << "\rCaught interrupt signal, shutting down\n"; + shutdown_signal = true; + return TRUE; + } + return FALSE; }; SetConsoleCtrlHandler( - reinterpret_cast(console_ctrl_handler), true); + reinterpret_cast(console_ctrl_handler), TRUE); #endif auto config = file_manager_utils::GetCortexConfig(); if (host.has_value() || port.has_value()) { @@ -204,16 +218,23 @@ void RunServer(std::optional host, std::optional port, auto upload_path = std::filesystem::temp_directory_path() / "cortex-uploads"; drogon::app().setUploadPath(upload_path.string()); - LOG_INFO << "Server started, listening at: " << config.apiServerHost << ":" - << config.apiServerPort; - LOG_INFO << "Please load your model"; #ifndef _WIN32 drogon::app().enableReusePort(); #else drogon::app().enableDateHeader(false); #endif - drogon::app().addListener(config.apiServerHost, - std::stoi(config.apiServerPort)); + try { + drogon::app().addListener(config.apiServerHost, + std::stoi(config.apiServerPort)); + } catch (const std::exception& e) { + LOG_ERROR << "Failed to start server: " << e.what(); + return; + } + + LOG_INFO << "Server started, listening at: " << config.apiServerHost << ":" + << config.apiServerPort; + LOG_INFO << "Please load your model"; + drogon::app().setThreadNum(drogon_thread_num); LOG_INFO << "Number of thread is:" << drogon::app().getThreadNum(); drogon::app().disableSigtermHandling(); @@ -276,11 +297,37 @@ void RunServer(std::optional host, std::optional port, drogon::app().addListener(config.apiServerHost, 443, true); } - drogon::app().run(); + // Fires up the server in another thread and set the shutdown signal if it somehow dies + std::thread([] { + drogon::app().run(); + shutdown_signal = true; + }).detach(); + + // Now this thread can monitor the shutdown signal + while (!shutdown_signal) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + if (hw_service->ShouldRestart()) { CTL_INF("Restart to update hardware configuration"); hw_service->Restart(config.apiServerHost, std::stoi(config.apiServerPort)); } + drogon::app().quit(); +} + +void print_help() { + std::cout << "Usage: \ncortex-server [options]\n\n"; + std::cout << "Options:\n"; + std::cout << " --config_file_path Path to the config file (default: " + "~/.cortexrc)\n"; + std::cout << " --data_folder_path Path to the data folder (default: " + "~/cortexcpp)\n"; + std::cout << " --host Host name (default: 127.0.0.1)\n"; + std::cout << " --port Port number (default: 39281)\n"; + std::cout << " --ignore_cout Ignore cout output\n"; + std::cout << " --loglevel Set log level\n"; + + exit(0); } #if defined(_WIN32) @@ -326,6 +373,8 @@ int main(int argc, char* argv[]) { std::wstring v = argv[i + 1]; std::string log_level = cortex::wc::WstringToUtf8(v); logging_utils_helper::SetLogLevel(log_level, ignore_cout_log); + } else if (command == L"--help" || command == L"-h") { + print_help(); } } #else @@ -343,6 +392,8 @@ int main(int argc, char* argv[]) { } else if (strcmp(argv[i], "--loglevel") == 0) { std::string log_level = argv[i + 1]; logging_utils_helper::SetLogLevel(log_level, ignore_cout_log); + } else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { + print_help(); } } #endif diff --git a/engine/services/database_service.cc b/engine/services/database_service.cc index d4cd977a9..695e36f72 100644 --- a/engine/services/database_service.cc +++ b/engine/services/database_service.cc @@ -118,13 +118,13 @@ bool DatabaseService::HasModel(const std::string& identifier) const { return 
cortex::db::Models().HasModel(identifier); } -cpp::result, std::string> -DatabaseService::GetModelSources() const { - return cortex::db::Models().GetModelSources(); -} - -cpp::result, std::string> DatabaseService::GetModels( +cpp::result, std::string> DatabaseService::GetModels( const std::string& model_src) const { return cortex::db::Models().GetModels(model_src); } + +cpp::result, std::string> +DatabaseService::GetModelSources() const { + return cortex::db::Models().GetModelSources(); +} // end models \ No newline at end of file diff --git a/engine/services/database_service.h b/engine/services/database_service.h index 4fb4f7be0..e45fc57b2 100644 --- a/engine/services/database_service.h +++ b/engine/services/database_service.h @@ -60,9 +60,10 @@ class DatabaseService { cpp::result, std::string> FindRelatedModel( const std::string& identifier) const; bool HasModel(const std::string& identifier) const; - cpp::result, std::string> GetModelSources() const; - cpp::result, std::string> GetModels( + cpp::result, std::string> GetModels( const std::string& model_src) const; + cpp::result, std::string> GetModelSources() + const; private: }; \ No newline at end of file diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 65e37ac4d..bdc647905 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -27,19 +27,16 @@ namespace { std::string GetSuitableCudaVersion(const std::string& engine, const std::string& cuda_driver_version) { auto suitable_toolkit_version = ""; - if (engine == kTrtLlmRepo || engine == kTrtLlmEngine) { - // for tensorrt-llm, we need to download cuda toolkit v12.4 - suitable_toolkit_version = "12.4"; - } else { - // llamacpp - auto cuda_driver_semver = - semantic_version_utils::SplitVersion(cuda_driver_version); - if (cuda_driver_semver.major == 11) { - suitable_toolkit_version = "11.7"; - } else if (cuda_driver_semver.major == 12) { - suitable_toolkit_version = "12.0"; - } + + // llamacpp + auto cuda_driver_semver = + semantic_version_utils::SplitVersion(cuda_driver_version); + if (cuda_driver_semver.major == 11) { + suitable_toolkit_version = "11.7"; + } else if (cuda_driver_semver.major == 12) { + suitable_toolkit_version = "12.0"; } + return suitable_toolkit_version; } @@ -47,10 +44,6 @@ std::string GetSuitableCudaVersion(const std::string& engine, std::string NormalizeEngine(const std::string& engine) { if (engine == kLlamaEngine) { return kLlamaRepo; - } else if (engine == kOnnxEngine) { - return kOnnxRepo; - } else if (engine == kTrtLlmEngine) { - return kTrtLlmRepo; } return engine; }; @@ -58,10 +51,6 @@ std::string NormalizeEngine(const std::string& engine) { std::string Repo2Engine(const std::string& r) { if (r == kLlamaRepo) { return kLlamaEngine; - } else if (r == kOnnxRepo) { - return kOnnxEngine; - } else if (r == kTrtLlmRepo) { - return kTrtLlmEngine; } return r; }; @@ -69,10 +58,6 @@ std::string Repo2Engine(const std::string& r) { std::string GetEnginePath(std::string_view e) { if (e == kLlamaRepo) { return kLlamaLibPath; - } else if (e == kOnnxRepo) { - return kOnnxLibPath; - } else if (e == kTrtLlmRepo) { - return kTensorrtLlmPath; } return kLlamaLibPath; }; @@ -85,13 +70,6 @@ cpp::result EngineService::InstallEngineAsync( CTL_INF("InstallEngineAsync: " << ne << ", " << version << ", " << variant_name.value_or("")); auto os = hw_inf_.sys_inf->os; - if (os == kMacOs && (ne == kOnnxRepo || ne == kTrtLlmRepo)) { - return cpp::fail("Engine " + ne + " is not supported on macOS"); - } - - if (os == 
kLinuxOs && ne == kOnnxRepo) { - return cpp::fail("Engine " + ne + " is not supported on Linux"); - } auto result = DownloadEngine(ne, version, variant_name); if (result.has_error()) { @@ -342,7 +320,7 @@ cpp::result EngineService::DownloadEngine( engine, // engine_name kLocal, "", "", normalize_version, variant.value(), "Default", ""); - if (create_res.has_value()) { + if (create_res.has_error()) { CTL_ERR("Failed to create engine entry: " << create_res->engine_name); } else { CTL_INF("Engine entry created successfully"); @@ -386,9 +364,8 @@ cpp::result EngineService::DownloadEngine( cpp::result EngineService::DownloadCuda( const std::string& engine, bool async) { - if (hw_inf_.sys_inf->os == "mac" || engine == kOnnxRepo || - engine == kOnnxEngine) { - // mac and onnx engine does not require cuda toolkit + if (hw_inf_.sys_inf->os == "mac") { + // mac does not require cuda toolkit return true; } @@ -453,13 +430,7 @@ cpp::result EngineService::DownloadCuda( std::string EngineService::GetMatchedVariant( const std::string& engine, const std::vector& variants) { std::string matched_variant; - if (engine == kTrtLlmRepo || engine == kTrtLlmEngine) { - matched_variant = engine_matcher_utils::ValidateTensorrtLlm( - variants, hw_inf_.sys_inf->os, hw_inf_.cuda_driver_version); - } else if (engine == kOnnxRepo || engine == kOnnxEngine) { - matched_variant = engine_matcher_utils::ValidateOnnx( - variants, hw_inf_.sys_inf->os, hw_inf_.sys_inf->arch); - } else if (engine == kLlamaRepo || engine == kLlamaEngine) { + if (engine == kLlamaRepo || engine == kLlamaEngine) { auto suitable_avx = engine_matcher_utils::GetSuitableAvxVariant(hw_inf_.cpu_inf); matched_variant = engine_matcher_utils::Validate( @@ -638,13 +609,6 @@ cpp::result, std::string> EngineService::GetInstalledEngineVariants(const std::string& engine) const { auto ne = NormalizeEngine(engine); auto os = hw_inf_.sys_inf->os; - if (os == kMacOs && (ne == kOnnxRepo || ne == kTrtLlmRepo)) { - return cpp::fail("Engine " + engine + " is not supported on macOS"); - } - - if (os == kLinuxOs && ne == kOnnxRepo) { - return cpp::fail("Engine " + engine + " is not supported on Linux"); - } auto engines_variants_dir = file_manager_utils::GetEnginesContainerPath() / ne; @@ -954,13 +918,7 @@ cpp::result EngineService::IsEngineReady( } auto os = hw_inf_.sys_inf->os; - if (os == kMacOs && (ne == kOnnxRepo || ne == kTrtLlmRepo)) { - return cpp::fail("Engine " + engine + " is not supported on macOS"); - } - if (os == kLinuxOs && ne == kOnnxRepo) { - return cpp::fail("Engine " + engine + " is not supported on Linux"); - } auto installed_variants = GetInstalledEngineVariants(engine); if (installed_variants.has_error()) { return cpp::fail(installed_variants.error()); diff --git a/engine/services/hardware_service.cc b/engine/services/hardware_service.cc index 7db302a7d..6b19a0119 100644 --- a/engine/services/hardware_service.cc +++ b/engine/services/hardware_service.cc @@ -9,7 +9,10 @@ #endif #include "cli/commands/cortex_upd_cmd.h" #include "database/hardware.h" +#include "services/engine_service.h" #include "utils/cortex_utils.h" +#include "utils/dylib_path_manager.h" +#include "utils/process/utils.h" #if defined(__linux__) #include "services/download_service.h" #endif @@ -152,7 +155,7 @@ bool HardwareService::Restart(const std::string& host, int port) { std::wstring exe_w = exe.wstring(); std::wstring current_path_w = file_manager_utils::GetExecutableFolderContainerPath().wstring(); - std::wstring wcmds = current_path_w + L"\\" + exe_w + L" " + params; + 
std::wstring wcmds = current_path_w + L"\\" + exe_w + L" " + params; CTL_DBG("wcmds: " << wcmds); std::vector mutable_cmds(wcmds.begin(), wcmds.end()); mutable_cmds.push_back(L'\0'); @@ -179,42 +182,36 @@ bool HardwareService::Restart(const std::string& host, int port) { } #else - // Unix-like system-specific code to fork a child process - pid_t pid = fork(); - + std::vector commands; + // Some engines requires to add lib search path before process being created + auto download_srv = std::make_shared(); + auto dylib_path_mng = std::make_shared(); + auto db_srv = std::make_shared(); + EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath(); + std::string p = cortex_utils::GetCurrentPath() / exe; + commands.push_back(p); + commands.push_back("--ignore_cout"); + commands.push_back("--config_file_path"); + commands.push_back(get_config_file_path()); + commands.push_back("--data_folder_path"); + commands.push_back(get_data_folder_path()); + commands.push_back("--loglevel"); + commands.push_back(luh::LogLevelStr(luh::global_log_level)); + auto pid = cortex::process::SpawnProcess(commands); if (pid < 0) { // Fork failed std::cerr << "Could not start server: " << std::endl; return false; - } else if (pid == 0) { - // No need to configure LD_LIBRARY_PATH for macOS -#if !defined(__APPLE__) || !defined(__MACH__) - const char* name = "LD_LIBRARY_PATH"; - auto data = getenv(name); - std::string v; - if (auto g = getenv(name); g) { - v += g; - } - CTL_INF("LD_LIBRARY_PATH: " << v); - auto llamacpp_path = file_manager_utils::GetCudaToolkitPath(kLlamaRepo); - auto trt_path = file_manager_utils::GetCudaToolkitPath(kTrtLlmRepo); - - auto new_v = trt_path.string() + ":" + llamacpp_path.string() + ":" + v; - setenv(name, new_v.c_str(), true); - CTL_INF("LD_LIBRARY_PATH: " << getenv(name)); -#endif - std::string p = cortex_utils::GetCurrentPath() + "/" + exe.string(); - CTL_INF("server file path: " << p); - execl(p.c_str(), exe.c_str(), "--ignore_cout", "--config_file_path", - get_config_file_path().c_str(), "--data_folder_path", - get_data_folder_path().c_str(), "--loglevel", - luh::LogLevelStr(luh::global_log_level).c_str(), (char*)0); } else { // Parent process if (!TryConnectToServer(host, port)) { return false; } + std::cout << "Server started" << std::endl; + std::cout << "API Documentation available at: http://" << host << ":" + << port << std::endl; } + #endif return true; } diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 0fcf957eb..a9131c886 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -67,7 +67,8 @@ void ParseGguf(DatabaseService& db_service, CTL_INF("Adding model to modellist with branch: " << branch); auto rel = file_manager_utils::ToRelativeCortexDataPath(yaml_name); - CTL_INF("path_to_model_yaml: " << rel.string()); + CTL_INF("path_to_model_yaml: " << rel.string() + << ", model: " << ggufDownloadItem.id); auto author_id = author.has_value() ? 
author.value() : "cortexso"; if (!db_service.HasModel(ggufDownloadItem.id)) { @@ -86,6 +87,7 @@ void ParseGguf(DatabaseService& db_service, } else { if (auto m = db_service.GetModelInfo(ggufDownloadItem.id); m.has_value()) { auto upd_m = m.value(); + upd_m.path_to_model_yaml = rel.string(); upd_m.status = cortex::db::ModelStatus::Downloaded; if (auto r = db_service.UpdateModelEntry(ggufDownloadItem.id, upd_m); r.has_error()) { @@ -161,6 +163,9 @@ void ModelService::ForceIndexingModelList() { continue; } try { + CTL_DBG(fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.path_to_model_yaml)) + .string()); yaml_handler.ModelConfigFromFile( fmu::ToAbsoluteCortexDataPath( fs::path(model_entry.path_to_model_yaml)) @@ -171,48 +176,12 @@ void ModelService::ForceIndexingModelList() { } catch (const std::exception& e) { // remove in db auto remove_result = db_service_->DeleteModelEntry(model_entry.model); + CTL_DBG(e.what()); // silently ignore result } } } -cpp::result ModelService::DownloadModel( - const std::string& input) { - if (input.empty()) { - return cpp::fail( - "Input must be Cortex Model Hub handle or HuggingFace url!"); - } - - if (string_utils::StartsWith(input, "https://")) { - return HandleUrl(input); - } - - if (input.find(":") != std::string::npos) { - auto parsed = string_utils::SplitBy(input, ":"); - if (parsed.size() != 2) { - return cpp::fail("Invalid model handle: " + input); - } - return DownloadModelFromCortexso(parsed[0], parsed[1]); - } - - if (input.find("/") != std::string::npos) { - auto parsed = string_utils::SplitBy(input, "/"); - if (parsed.size() != 2) { - return cpp::fail("Invalid model handle: " + input); - } - - auto author = parsed[0]; - auto model_name = parsed[1]; - if (author == "cortexso") { - return HandleCortexsoModel(model_name); - } - - return DownloadHuggingFaceGgufModel(author, model_name, std::nullopt); - } - - return HandleCortexsoModel(input); -} - cpp::result ModelService::HandleCortexsoModel( const std::string& modelName) { auto branches = @@ -612,7 +581,8 @@ ModelService::DownloadModelFromCortexsoAsync( .branch_name = branch, .path_to_model_yaml = rel.string(), .model_alias = unique_model_id, - .status = cortex::db::ModelStatus::Downloaded}; + .status = cortex::db::ModelStatus::Downloaded, + .engine = mc.engine}; auto result = db_service_->AddModelEntry(model_entry); if (result.has_error()) { @@ -621,6 +591,7 @@ ModelService::DownloadModelFromCortexsoAsync( } else { if (auto m = db_service_->GetModelInfo(unique_model_id); m.has_value()) { auto upd_m = m.value(); + upd_m.path_to_model_yaml = rel.string(); upd_m.status = cortex::db::ModelStatus::Downloaded; if (auto r = db_service_->UpdateModelEntry(unique_model_id, upd_m); r.has_error()) { @@ -765,8 +736,15 @@ cpp::result ModelService::DeleteModel( fs::path(model_entry.value().path_to_model_yaml)); yaml_handler.ModelConfigFromFile(yaml_fp.string()); auto mc = yaml_handler.GetModelConfig(); - // Remove yaml file - std::filesystem::remove(yaml_fp); + // Remove yaml files + for (const auto& entry : + std::filesystem::directory_iterator(yaml_fp.parent_path())) { + if (entry.is_regular_file() && (entry.path().extension() == ".yml")) { + std::filesystem::remove(entry); + CTL_INF("Removed: " << entry.path().string()); + } + } + // Remove model files if they are not imported locally if (model_entry.value().branch_name != "imported" && !engine_svc_->IsRemoteEngine(mc.engine)) { @@ -1157,7 +1135,7 @@ cpp::result ModelService::GetModelPullInfo( if (input.find(":") != std::string::npos) { auto parsed = 
string_utils::SplitBy(input, ":"); - if (parsed.size() != 2) { + if (parsed.size() != 2 && parsed.size() != 3) { return cpp::fail("Invalid model handle: " + input); } return ModelPullInfo{.id = input, diff --git a/engine/services/model_service.h b/engine/services/model_service.h index a668b27ba..17f2c0ddb 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -42,11 +42,6 @@ class ModelService { inference_svc_(inference_service), engine_svc_(engine_svc) {}; - /** - * Return model id if download successfully - */ - cpp::result DownloadModel(const std::string& input); - cpp::result AbortDownloadModel( const std::string& task_id); diff --git a/engine/services/model_source_service.cc b/engine/services/model_source_service.cc index c148ecfc7..f0e29b8bc 100644 --- a/engine/services/model_source_service.cc +++ b/engine/services/model_source_service.cc @@ -4,6 +4,7 @@ #include "database/models.h" #include "json/json.h" #include "utils/curl_utils.h" +#include "utils/file_manager_utils.h" #include "utils/huggingface_utils.h" #include "utils/logging_utils.h" #include "utils/string_utils.h" @@ -63,7 +64,8 @@ std::vector ParseJsonString(const std::string& json_str) { ModelSourceService::ModelSourceService( std::shared_ptr db_service) : db_service_(db_service) { - sync_db_thread_ = std::thread(&ModelSourceService::SyncModelSource, this); + // TODO(sang) temporariy comment out because of race condition bug + // sync_db_thread_ = std::thread(&ModelSourceService::SyncModelSource, this); running_ = true; } @@ -113,7 +115,14 @@ cpp::result ModelSourceService::RemoveModelSource( return cpp::fail(srcs.error()); } else { auto& v = srcs.value(); - if (std::find(v.begin(), v.end(), model_source) == v.end()) { + auto exists = [&v, &model_source]() { + for (auto const& m : v) { + if (m.model_source == model_source) + return true; + } + return false; + }(); + if (!exists) { return cpp::fail("Model source does not exist: " + model_source); } } @@ -144,9 +153,50 @@ cpp::result ModelSourceService::RemoveModelSource( return true; } -cpp::result, std::string> +cpp::result, std::string> ModelSourceService::GetModelSources() { - return db_service_->GetModelSources(); + auto res = db_service_->GetModelSources(); + if (res.has_error()) { + return cpp::fail(res.error()); + } + auto& models = res.value(); + std::unordered_map ms; + for (auto const& m : models) { + auto meta_json = json_helper::ParseJsonString(m.metadata); + ms[m.model_source].models.push_back( + {m.model, meta_json["size"].asUInt64()}); + meta_json.removeMember("size"); + if (ms[m.model_source].metadata.isNull()) { + ms[m.model_source].metadata = meta_json; + } + ms[m.model_source].id = m.model_source; + ms[m.model_source].author = m.author_repo_id; + LOG_DEBUG << m.model; + } + return ms; +} + +cpp::result ModelSourceService::GetModelSource( + const std::string& src) { + auto res = db_service_->GetModels(src); + if (res.has_error()) { + return cpp::fail(res.error()); + } + + auto& models = res.value(); + ModelSource ms; + for (auto const& m : models) { + auto meta_json = json_helper::ParseJsonString(m.metadata); + ms.models.push_back({m.model, meta_json["size"].asUInt64()}); + meta_json.removeMember("size"); + if (ms.metadata.isNull()) { + ms.metadata = meta_json; + } + ms.id = m.model_source; + ms.author = m.author_repo_id; + LOG_INFO << m.model; + } + return ms; } cpp::result ModelSourceService::AddHfOrg( @@ -155,32 +205,17 @@ cpp::result ModelSourceService::AddHfOrg( author); if (res.has_value()) { auto models = 
ParseJsonString(res.value()); - // Get models from db - - auto model_list_before = db_service_->GetModels(model_source) - .value_or(std::vector{}); - std::unordered_set updated_model_list; // Add new models for (auto const& m : models) { CTL_DBG(m.id); + auto author_model = string_utils::SplitBy(m.id, "/"); if (author_model.size() == 2) { auto const& author = author_model[0]; auto const& model_name = author_model[1]; - auto add_res = AddRepoSiblings(model_source, author, model_name) - .value_or(std::unordered_set{}); - for (auto const& a : add_res) { - updated_model_list.insert(a); - } - } - } - - // Clean up - for (auto const& mid : model_list_before) { - if (updated_model_list.find(mid) == updated_model_list.end()) { - if (auto del_res = db_service_->DeleteModelEntry(mid); - del_res.has_error()) { - CTL_INF(del_res.error()); + auto r = AddHfRepo(model_source + "/" + model_name, author, model_name); + if (r.has_error()) { + CTL_WRN(r.error()); } } } @@ -195,8 +230,8 @@ cpp::result ModelSourceService::AddHfRepo( const std::string& model_name) { // Get models from db - auto model_list_before = - db_service_->GetModels(model_source).value_or(std::vector{}); + auto model_list_before = db_service_->GetModels(model_source) + .value_or(std::vector{}); std::unordered_set updated_model_list; auto add_res = AddRepoSiblings(model_source, author, model_name); if (add_res.has_error()) { @@ -205,8 +240,8 @@ cpp::result ModelSourceService::AddHfRepo( updated_model_list = add_res.value(); } for (auto const& mid : model_list_before) { - if (updated_model_list.find(mid) == updated_model_list.end()) { - if (auto del_res = db_service_->DeleteModelEntry(mid); + if (updated_model_list.find(mid.model) == updated_model_list.end()) { + if (auto del_res = db_service_->DeleteModelEntry(mid.model); del_res.has_error()) { CTL_INF(del_res.error()); } @@ -231,8 +266,38 @@ ModelSourceService::AddRepoSiblings(const std::string& model_source, "supported."); } + auto siblings_fs = hu::GetSiblingsFileSize(author, model_name); + + if (siblings_fs.has_error()) { + return cpp::fail("Could not get siblings file size: " + author + "/" + + model_name); + } + + auto readme = hu::GetReadMe(author, model_name); + std::string desc; + if (!readme.has_error()) { + desc = readme.value(); + } + + auto meta_json = json_helper::ParseJsonString(repo_info->metadata); + auto& siblings_fs_v = siblings_fs.value(); + for (auto& m : meta_json["siblings"]) { + auto r_file = m["rfilename"].asString(); + if (siblings_fs_v.file_sizes.find(r_file) != + siblings_fs_v.file_sizes.end()) { + m["size"] = siblings_fs_v.file_sizes.at(r_file).size_in_bytes; + } + } + meta_json["description"] = desc; + LOG_DEBUG << meta_json.toStyledString(); + for (const auto& sibling : repo_info->siblings) { if (string_utils::EndsWith(sibling.rfilename, ".gguf")) { + if (siblings_fs_v.file_sizes.find(sibling.rfilename) != + siblings_fs_v.file_sizes.end()) { + meta_json["size"] = + siblings_fs_v.file_sizes.at(sibling.rfilename).size_in_bytes; + } std::string model_id = author + ":" + model_name + ":" + sibling.rfilename; cortex::db::ModelEntry e = { @@ -245,7 +310,7 @@ ModelSourceService::AddRepoSiblings(const std::string& model_source, .model_source = model_source, .status = cortex::db::ModelStatus::Downloadable, .engine = "llama-cpp", - .metadata = repo_info->metadata}; + .metadata = json_helper::DumpJsonString(meta_json)}; if (!db_service_->HasModel(model_id)) { if (auto add_res = db_service_->AddModelEntry(e); add_res.has_error()) { CTL_INF(add_res.error()); @@ -273,46 
+338,16 @@ cpp::result ModelSourceService::AddCortexsoOrg( "https://huggingface.co/api/models?author=cortexso"); if (res.has_value()) { auto models = ParseJsonString(res.value()); - // Get models from db - - auto model_list_before = db_service_->GetModels(model_source) - .value_or(std::vector{}); - std::unordered_set updated_model_list; for (auto const& m : models) { CTL_INF(m.id); auto author_model = string_utils::SplitBy(m.id, "/"); if (author_model.size() == 2) { auto const& author = author_model[0]; auto const& model_name = author_model[1]; - auto branches = huggingface_utils::GetModelRepositoryBranches( - "cortexso", model_name); - if (branches.has_error()) { - CTL_INF(branches.error()); - continue; - } - - auto repo_info = hu::GetHuggingFaceModelRepoInfo(author, model_name); - if (repo_info.has_error()) { - CTL_INF(repo_info.error()); - continue; - } - for (auto const& [branch, _] : branches.value()) { - CTL_INF(branch); - auto add_res = AddCortexsoRepoBranch(model_source, author, model_name, - branch, repo_info->metadata) - .value_or(std::unordered_set{}); - for (auto const& a : add_res) { - updated_model_list.insert(a); - } - } - } - } - // Clean up - for (auto const& mid : model_list_before) { - if (updated_model_list.find(mid) == updated_model_list.end()) { - if (auto del_res = db_service_->DeleteModelEntry(mid); - del_res.has_error()) { - CTL_INF(del_res.error()); + auto r = AddCortexsoRepo(model_source + "/" + model_name, author, + model_name); + if (r.has_error()) { + CTL_WRN(r.error()); } } } @@ -336,16 +371,22 @@ cpp::result ModelSourceService::AddCortexsoRepo( if (repo_info.has_error()) { return cpp::fail(repo_info.error()); } + + auto readme = hu::GetReadMe(author, model_name); + std::string desc; + if (!readme.has_error()) { + desc = readme.value(); + } // Get models from db - auto model_list_before = - db_service_->GetModels(model_source).value_or(std::vector{}); + auto model_list_before = db_service_->GetModels(model_source) + .value_or(std::vector{}); std::unordered_set updated_model_list; for (auto const& [branch, _] : branches.value()) { CTL_INF(branch); auto add_res = AddCortexsoRepoBranch(model_source, author, model_name, - branch, repo_info->metadata) + branch, repo_info->metadata, desc) .value_or(std::unordered_set{}); for (auto const& a : add_res) { updated_model_list.insert(a); @@ -354,8 +395,8 @@ cpp::result ModelSourceService::AddCortexsoRepo( // Clean up for (auto const& mid : model_list_before) { - if (updated_model_list.find(mid) == updated_model_list.end()) { - if (auto del_res = db_service_->DeleteModelEntry(mid); + if (updated_model_list.find(mid.model) == updated_model_list.end()) { + if (auto del_res = db_service_->DeleteModelEntry(mid.model); del_res.has_error()) { CTL_INF(del_res.error()); } @@ -369,7 +410,8 @@ ModelSourceService::AddCortexsoRepoBranch(const std::string& model_source, const std::string& author, const std::string& model_name, const std::string& branch, - const std::string& metadata) { + const std::string& metadata, + const std::string& desc) { std::unordered_set res; url_parser::Url url = { @@ -384,27 +426,33 @@ ModelSourceService::AddCortexsoRepoBranch(const std::string& model_source, } bool has_gguf = false; + uint64_t model_size = 0; for (const auto& value : result.value()) { auto path = value["path"].asString(); if (path.find(".gguf") != std::string::npos) { has_gguf = true; + model_size = value["size"].asUInt64(); } } if (!has_gguf) { CTL_INF("Only support gguf file format! 
- branch: " << branch); return {}; } else { + auto meta_json = json_helper::ParseJsonString(metadata); + meta_json["size"] = model_size; + meta_json["description"] = desc; std::string model_id = model_name + ":" + branch; - cortex::db::ModelEntry e = {.model = model_id, - .author_repo_id = author, - .branch_name = branch, - .path_to_model_yaml = "", - .model_alias = "", - .model_format = "cortexso", - .model_source = model_source, - .status = cortex::db::ModelStatus::Downloadable, - .engine = "llama-cpp", - .metadata = metadata}; + cortex::db::ModelEntry e = { + .model = model_id, + .author_repo_id = author, + .branch_name = branch, + .path_to_model_yaml = "", + .model_alias = "", + .model_format = "cortexso", + .model_source = model_source, + .status = cortex::db::ModelStatus::Downloadable, + .engine = "llama-cpp", + .metadata = json_helper::DumpJsonString(meta_json)}; if (!db_service_->HasModel(model_id)) { CTL_INF("Adding model to db: " << model_name << ":" << branch); if (auto res = db_service_->AddModelEntry(e); @@ -426,37 +474,34 @@ ModelSourceService::AddCortexsoRepoBranch(const std::string& model_source, } void ModelSourceService::SyncModelSource() { - // Do interval check for 10 minutes - constexpr const int kIntervalCheck = 10 * 60; - auto start_time = std::chrono::steady_clock::now(); while (running_) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); - auto current_time = std::chrono::steady_clock::now(); - auto elapsed_time = std::chrono::duration_cast( - current_time - start_time) - .count(); - - if (elapsed_time > kIntervalCheck) { + auto now = std::chrono::system_clock::now(); + auto config = file_manager_utils::GetCortexConfig(); + auto last_check = + std::chrono::system_clock::time_point( + std::chrono::milliseconds(config.checkedForSyncHubAt)) + + std::chrono::hours(1); + if (now > last_check) { CTL_DBG("Start to sync cortex.db"); - start_time = current_time; auto res = db_service_->GetModelSources(); if (res.has_error()) { CTL_INF(res.error()); } else { for (auto const& src : res.value()) { - CTL_DBG(src); + CTL_DBG(src.model_source); } std::unordered_set orgs; std::vector repos; for (auto const& src : res.value()) { - auto url_res = url_parser::FromUrlString(src); + auto url_res = url_parser::FromUrlString(src.model_source); if (url_res.has_value()) { if (url_res->pathParams.size() == 1) { - orgs.insert(src); + orgs.insert(src.model_source); } else if (url_res->pathParams.size() == 2) { - repos.push_back(src); + repos.push_back(src.model_source); } } } @@ -481,6 +526,20 @@ void ModelSourceService::SyncModelSource() { } CTL_DBG("Done sync cortex.db"); + + auto now = std::chrono::system_clock::now(); + auto config = file_manager_utils::GetCortexConfig(); + config.checkedForSyncHubAt = + std::chrono::duration_cast( + now.time_since_epoch()) + .count(); + + auto upd_config_res = + config_yaml_utils::CortexConfigMgr::GetInstance().DumpYamlConfig( + config, file_manager_utils::GetConfigurationPath().string()); + if (upd_config_res.has_error()) { + CTL_ERR("Failed to update config file: " << upd_config_res.error()); + } } } } diff --git a/engine/services/model_source_service.h b/engine/services/model_source_service.h index 7227267d3..606d25d4f 100644 --- a/engine/services/model_source_service.h +++ b/engine/services/model_source_service.h @@ -1,10 +1,42 @@ #pragma once #include #include +#include #include #include "services/database_service.h" #include "utils/result.hpp" +struct ModelSourceInfo { + std::string id; + uint64_t size; + Json::Value ToJson() const { + 
Json::Value root; + root["id"] = id; + root["size"] = size; + return root; + } +}; + +struct ModelSource { + std::string id; + std::string author; + std::vector models; + Json::Value metadata; + + Json::Value ToJson() { + Json::Value root; + root["id"] = id; + root["author"] = author; + Json::Value models_json; + for (auto const& m : models) { + models_json.append(m.ToJson()); + } + root["models"] = models_json; + root["metadata"] = metadata; + return root; + }; +}; + class ModelSourceService { public: explicit ModelSourceService(std::shared_ptr db_service); @@ -16,7 +48,10 @@ class ModelSourceService { cpp::result RemoveModelSource( const std::string& model_source); - cpp::result, std::string> GetModelSources(); + cpp::result, std::string> + GetModelSources(); + + cpp::result GetModelSource(const std::string& src); private: cpp::result AddHfOrg(const std::string& model_source, @@ -41,7 +76,8 @@ class ModelSourceService { AddCortexsoRepoBranch(const std::string& model_source, const std::string& author, const std::string& model_name, - const std::string& branch, const std::string& metadata); + const std::string& branch, const std::string& metadata, + const std::string& desc); void SyncModelSource(); diff --git a/engine/utils/config_yaml_utils.cc b/engine/utils/config_yaml_utils.cc index 8fbfe1dbe..b26d690c6 100644 --- a/engine/utils/config_yaml_utils.cc +++ b/engine/utils/config_yaml_utils.cc @@ -50,6 +50,7 @@ cpp::result CortexConfigMgr::DumpYamlConfig( node["sslCertPath"] = config.sslCertPath; node["sslKeyPath"] = config.sslKeyPath; node["supportedEngines"] = config.supportedEngines; + node["checkedForSyncHubAt"] = config.checkedForSyncHubAt; out_file << node; out_file.close(); @@ -85,7 +86,8 @@ CortexConfig CortexConfigMgr::FromYaml(const std::string& path, !node["verifyPeerSsl"] || !node["verifyHostSsl"] || !node["verifyProxySsl"] || !node["verifyProxyHostSsl"] || !node["supportedEngines"] || !node["sslCertPath"] || - !node["sslKeyPath"] || !node["noProxy"]); + !node["sslKeyPath"] || !node["noProxy"] || + !node["checkedForSyncHubAt"]); CortexConfig config = { .logFolderPath = node["logFolderPath"] @@ -177,6 +179,9 @@ CortexConfig CortexConfigMgr::FromYaml(const std::string& path, node["supportedEngines"] ? node["supportedEngines"].as>() : default_cfg.supportedEngines, + .checkedForSyncHubAt = node["checkedForSyncHubAt"] + ? 
node["checkedForSyncHubAt"].as() + : default_cfg.checkedForSyncHubAt, }; if (should_update_config) { l.unlock(); diff --git a/engine/utils/config_yaml_utils.h b/engine/utils/config_yaml_utils.h index 73eaf3084..1749cd2d0 100644 --- a/engine/utils/config_yaml_utils.h +++ b/engine/utils/config_yaml_utils.h @@ -24,10 +24,8 @@ constexpr const auto kDefaultCorsEnabled = true; const std::vector kDefaultEnabledOrigins{ "http://localhost:39281", "http://127.0.0.1:39281", "http://0.0.0.0:39281"}; constexpr const auto kDefaultNoProxy = "example.com,::1,localhost,127.0.0.1"; -const std::vector kDefaultSupportedEngines{ - kLlamaEngine, kOnnxEngine, kTrtLlmEngine, kPythonEngine}; - - +const std::vector kDefaultSupportedEngines{kLlamaEngine, + kPythonEngine}; struct CortexConfig { std::string logFolderPath; @@ -69,6 +67,7 @@ struct CortexConfig { std::string sslCertPath; std::string sslKeyPath; std::vector supportedEngines; + uint64_t checkedForSyncHubAt; }; class CortexConfigMgr { diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 9392ede35..35368c519 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -1,9 +1,6 @@ #pragma once -constexpr const auto kOnnxEngine = "onnxruntime"; constexpr const auto kLlamaEngine = "llama-cpp"; -constexpr const auto kTrtLlmEngine = "tensorrt-llm"; - constexpr const auto kPythonEngine = "python-engine"; constexpr const auto kOpenAiEngine = "openai"; @@ -14,15 +11,11 @@ constexpr const auto kRemote = "remote"; constexpr const auto kLocal = "local"; -constexpr const auto kOnnxRepo = "cortex.onnx"; constexpr const auto kLlamaRepo = "cortex.llamacpp"; -constexpr const auto kTrtLlmRepo = "cortex.tensorrt-llm"; constexpr const auto kPythonRuntimeRepo = "cortex.python"; constexpr const auto kLlamaLibPath = "./engines/cortex.llamacpp"; constexpr const auto kPythonRuntimeLibPath = "/engines/cortex.python"; -constexpr const auto kOnnxLibPath = "/engines/cortex.onnx"; -constexpr const auto kTensorrtLlmPath = "/engines/cortex.tensorrt-llm"; // other constants constexpr auto static kHuggingFaceHost = "huggingface.co"; diff --git a/engine/utils/file_manager_utils.cc b/engine/utils/file_manager_utils.cc index 6c4595e0b..743c6a641 100644 --- a/engine/utils/file_manager_utils.cc +++ b/engine/utils/file_manager_utils.cc @@ -194,6 +194,7 @@ config_yaml_utils::CortexConfig GetDefaultConfig() { .sslCertPath = "", .sslKeyPath = "", .supportedEngines = config_yaml_utils::kDefaultSupportedEngines, + .checkedForSyncHubAt = 0u, }; } diff --git a/engine/utils/hardware/cpu_info.h b/engine/utils/hardware/cpu_info.h index 4b933b886..20ae30bc3 100644 --- a/engine/utils/hardware/cpu_info.h +++ b/engine/utils/hardware/cpu_info.h @@ -26,6 +26,7 @@ inline CPU GetCPUInfo() { #else float usage = GetCPUUsage(); #endif + // float usage = 0; return CPU{.cores = cpu.numPhysicalCores(), .arch = std::string(GetArch()), diff --git a/engine/utils/huggingface_utils.h b/engine/utils/huggingface_utils.h index 1d1040612..e5c74a6e1 100644 --- a/engine/utils/huggingface_utils.h +++ b/engine/utils/huggingface_utils.h @@ -22,6 +22,107 @@ struct HuggingFaceFileSibling { std::string rfilename; }; +struct HuggingFaceFileSize { + uint64_t size_in_bytes; +}; + +struct HuggingFaceSiblingsFileSize { + std::unordered_map file_sizes; + static cpp::result FromJson( + const Json::Value& json) { + if (json.isNull() || json.type() == Json::ValueType::nullValue) { + return cpp::fail("gguf info is null"); + } + + try { + HuggingFaceSiblingsFileSize res; + for (auto const& 
j : json) { + if (j["type"].asString() == "file") { + res.file_sizes[j["path"].asString()] = + HuggingFaceFileSize{.size_in_bytes = j["size"].asUInt64()}; + } + } + return res; + } catch (const std::exception& e) { + return cpp::fail("Failed to parse gguf info: " + std::string(e.what())); + } + } + + Json::Value ToJson() { + Json::Value root; + Json::Value siblings(Json::arrayValue); + for (auto const& s : file_sizes) { + Json::Value s_json; + s_json["path"] = s.first; + s_json["size"] = s.second.size_in_bytes; + siblings.append(s_json); + } + root["siblings"] = siblings; + return root; + } +}; + +inline cpp::result +GetSiblingsFileSize(const std::string& author, const std::string& model_name, + const std::string& branch = "main") { + if (author.empty() || model_name.empty()) { + return cpp::fail("Author and model name cannot be empty"); + } + auto url_obj = url_parser::Url{ + .protocol = "https", + .host = kHuggingFaceHost, + .pathParams = {"api", "models", author, model_name, "tree", branch}}; + + auto result = curl_utils::SimpleGetJson(url_obj.ToFullPath()); + if (result.has_error()) { + return cpp::fail("Failed to get model siblings file size: " + author + "/" + + model_name + "/tree/" + branch); + } + auto r = result.value(); + for (auto const& j : result.value()) { + if (j["type"].asString() == "directory") { + auto url_obj = + url_parser::Url{.protocol = "https", + .host = kHuggingFaceHost, + .pathParams = {"api", "models", author, model_name, + "tree", branch, j["path"].asString()}}; + + auto rd = curl_utils::SimpleGetJson(url_obj.ToFullPath()); + if (rd.has_value()) { + for (auto const& rdj : rd.value()) { + r.append(rdj); + } + } + } + } + + return HuggingFaceSiblingsFileSize::FromJson(r); +} + +inline cpp::result GetReadMe( + const std::string& author, const std::string& model_name) { + if (author.empty() || model_name.empty()) { + return cpp::fail("Author and model name cannot be empty"); + } + auto url_obj = url_parser::Url{.protocol = "https", + .host = kHuggingFaceHost, + .pathParams = { + author, + model_name, + "raw", + "main", + "README.md", + }}; + + auto result = curl_utils::SimpleGet(url_obj.ToFullPath()); + if (result.has_error()) { + return cpp::fail("Failed to get model siblings file size: " + author + "/" + + model_name + "/raw/main/README.md"); + } + + return result.value(); +} + struct HuggingFaceGgufInfo { uint64_t total; std::string architecture; diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc new file mode 100644 index 000000000..fef425803 --- /dev/null +++ b/engine/utils/process/utils.cc @@ -0,0 +1,106 @@ +#include "utils/process/utils.h" +#include "utils/logging_utils.h" + +#if defined(__APPLE__) || defined(__linux__) +extern char **environ; // environment variables +#endif + +namespace cortex::process { + +std::string ConstructWindowsCommandLine(const std::vector& args) { + std::string cmd_line; + for (const auto& arg : args) { + // Simple escaping for Windows command line + std::string escaped_arg = arg; + if (escaped_arg.find(' ') != std::string::npos) { + // Wrap in quotes and escape existing quotes + for (char& c : escaped_arg) { + if (c == '"') + c = '\\'; + } + escaped_arg = "\"" + escaped_arg + "\""; + } + cmd_line += escaped_arg + " "; + } + return cmd_line; +} + +std::vector ConvertToArgv(const std::vector& args) { + std::vector argv; + for (const auto& arg : args) { + argv.push_back(const_cast(arg.c_str())); + } + argv.push_back(nullptr); + return argv; +} + +pid_t SpawnProcess(const std::vector& command) { + try { +#if 
defined(_WIN32) + // Windows process creation + STARTUPINFOA si = {0}; + PROCESS_INFORMATION pi = {0}; + si.cb = sizeof(si); + + // Construct command line + std::string cmd_line = ConstructWindowsCommandLine(command); + + // Convert string to char* for Windows API + char command_buffer[4096]; + strncpy_s(command_buffer, cmd_line.c_str(), sizeof(command_buffer)); + + if (!CreateProcessA(NULL, // lpApplicationName + command_buffer, // lpCommandLine + NULL, // lpProcessAttributes + NULL, // lpThreadAttributes + FALSE, // bInheritHandles + 0, // dwCreationFlags + NULL, // lpEnvironment + NULL, // lpCurrentDirectory + &si, // lpStartupInfo + &pi // lpProcessInformation + )) { + throw std::runtime_error("Failed to create process on Windows"); + } + + // Store the process ID + pid_t pid = pi.dwProcessId; + + // Close handles to avoid resource leaks + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + + return pid; + +#elif defined(__APPLE__) || defined(__linux__) + // POSIX process creation + pid_t pid; + + // Convert command vector to char*[] + auto argv = ConvertToArgv(command); + + // Use posix_spawn for cross-platform compatibility + auto spawn_result = posix_spawn(&pid, // pid output + command[0].c_str(), // executable path + NULL, // file actions + NULL, // spawn attributes + argv.data(), // argument vector + environ // environment (inherit) + ); + + if (spawn_result != 0) { + throw std::runtime_error("Failed to spawn process"); + } + + return pid; + +#else +#error Unsupported platform +#endif + } catch (const std::exception& e) { + LOG_ERROR << "Process spawning error: " << e.what(); + return -1; + } +} + +} // namespace cortex::process \ No newline at end of file diff --git a/engine/utils/process/utils.h b/engine/utils/process/utils.h new file mode 100644 index 000000000..9332607e9 --- /dev/null +++ b/engine/utils/process/utils.h @@ -0,0 +1,25 @@ +#pragma once + +#if defined(_WIN32) +#include +#include +using pid_t = DWORD; +#elif defined(__APPLE__) || defined(__linux__) +#include +#include +#include +#include +#include +#endif + +#include +#include + +namespace cortex::process { +std::string ConstructWindowsCommandLine(const std::vector& args); + +std::vector ConvertToArgv(const std::vector& args); + +pid_t SpawnProcess(const std::vector& command); + +} \ No newline at end of file diff --git a/engine/utils/process_status_utils.h b/engine/utils/process_status_utils.h new file mode 100644 index 000000000..189f82ede --- /dev/null +++ b/engine/utils/process_status_utils.h @@ -0,0 +1,56 @@ +#include + +#ifdef _WIN32 +#include +#include +#include +using pid_t = DWORD; +#elif defined(__APPLE__) || defined(__linux__) +#include +#include +#endif +namespace process_status_utils { + +inline bool IsProcessRunning(pid_t pid) { +#ifdef _WIN32 + // Windows implementation + HANDLE snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0); + if (snapshot == INVALID_HANDLE_VALUE) { + return false; + } + + PROCESSENTRY32 processEntry = {0}; + processEntry.dwSize = sizeof(processEntry); + + if (Process32First(snapshot, &processEntry)) { + do { + if (processEntry.th32ProcessID == pid) { + CloseHandle(snapshot); + return true; + } + } while (Process32Next(snapshot, &processEntry)); + } + + CloseHandle(snapshot); + return false; + +#elif defined(__APPLE__) || defined(__linux__) + // Unix-like systems (Linux and macOS) implementation + if (pid <= 0) { + return false; + } + + // Try to send signal 0 to the process + // This doesn't actually send a signal but checks if we can send signals to the process + 
int result = kill(pid, 0); + + if (result == 0) { + return true; // Process exists and we have permission to send it signals + } + + return errno != ESRCH; // ESRCH means "no such process" +#else +#error "Unsupported platform" +#endif +} +} // namespace process_status_utils \ No newline at end of file
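
The `StreamWriteCallback` moved into `python_engine.cc` accumulates raw CURL transfer chunks in a per-request buffer and only acts on complete lines, skipping blank keep-alive separators and treating `data: [DONE]` as the end-of-stream marker. Below is a minimal, standalone sketch of that line-buffering technique; the function and callback names are illustrative, not the engine's API.

```cpp
#include <functional>
#include <iostream>
#include <string>

// Illustrative sketch: accumulate raw transfer chunks in `buffer` and invoke
// `on_line` once per complete line, mirroring the line-buffered handling in
// StreamWriteCallback. Returns true once the "data: [DONE]" sentinel is seen.
bool FeedChunk(std::string& buffer, const std::string& chunk,
               const std::function<void(const std::string&)>& on_line) {
  buffer += chunk;
  size_t pos;
  while ((pos = buffer.find('\n')) != std::string::npos) {
    std::string line = buffer.substr(0, pos);
    buffer.erase(0, pos + 1);
    if (line.empty() || line == "\r")  // skip empty SSE separator lines
      continue;
    if (line == "data: [DONE]")        // end-of-stream marker
      return true;
    on_line(line);                     // forward one complete event line
  }
  return false;
}

int main() {
  std::string buffer;
  auto print = [](const std::string& l) { std::cout << l << "\n"; };
  // Chunks may split lines arbitrarily; only complete lines are forwarded.
  FeedChunk(buffer, "data: {\"cho", print);
  FeedChunk(buffer, "ices\": []}\n\ndata: [DONE]\n", print);
}
```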
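
In `main.cc` the previously ignored SIGINT now sets a global `shutdown_signal` flag, `drogon::app().run()` is moved onto a detached thread, and the main thread polls the flag before calling `drogon::app().quit()`. The sketch below reduces that pattern to a stand-in `FakeServer` so it compiles without drogon; it also uses `volatile std::sig_atomic_t` for the flag and joins the worker thread, which are choices of this sketch rather than the patch itself.

```cpp
#include <atomic>
#include <chrono>
#include <csignal>
#include <iostream>
#include <thread>

namespace {
volatile std::sig_atomic_t shutdown_signal = 0;  // only state touched by the handler
void HandleSigint(int) { shutdown_signal = 1; }

struct FakeServer {                  // stand-in for drogon::app() in this sketch
  std::atomic<bool> running{true};
  void run() {                       // blocks like drogon::app().run()
    while (running) std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  void quit() { running = false; }   // counterpart of drogon::app().quit()
};
}  // namespace

int main() {
  std::signal(SIGINT, HandleSigint);
  FakeServer server;

  // The patch detaches this thread; it is joined here to keep the sketch tidy.
  std::thread server_thread([&server] {
    server.run();
    shutdown_signal = 1;             // if the event loop dies on its own, shut down too
  });

  while (!shutdown_signal)           // the main thread only watches the flag
    std::this_thread::sleep_for(std::chrono::seconds(1));

  std::cout << "\rCaught interrupt signal, shutting down\n";
  server.quit();
  server_thread.join();
}
```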
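
`SyncModelSource` now decides whether to re-sync by comparing the current time against the persisted `checkedForSyncHubAt` value (milliseconds since the Unix epoch, defaulting to 0) plus one hour, and writes a fresh timestamp back after syncing. A small `std::chrono` sketch of that comparison and of producing the value to store; only the field name comes from the patch, the helpers are illustrative.

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>

// True when more than an hour has passed since `checked_for_sync_hub_at`
// (epoch milliseconds; 0 means "never checked").
bool ShouldSyncNow(uint64_t checked_for_sync_hub_at) {
  using namespace std::chrono;
  auto now = system_clock::now();
  auto last_check =
      system_clock::time_point(milliseconds(checked_for_sync_hub_at)) + hours(1);
  return now > last_check;
}

// Value to persist back into the config once a sync has completed.
uint64_t NowEpochMillis() {
  using namespace std::chrono;
  return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}

int main() {
  uint64_t checked_at = 0;  // default written by GetDefaultConfig()
  std::cout << std::boolalpha << ShouldSyncNow(checked_at) << "\n";   // true: never synced
  checked_at = NowEpochMillis();
  std::cout << ShouldSyncNow(checked_at) << "\n";                     // false: just synced
}
```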
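
`GetSiblingsFileSize` flattens the Hugging Face `/api/models/{author}/{model}/tree/{branch}` listing into a filename-to-byte-size map so `AddRepoSiblings` and the cortexso branch handler can stamp a `size` onto each model's metadata. The standalone jsoncpp sketch below shows that flattening step over a hard-coded listing; the sample entries are made up, only the `type`/`path`/`size` field names are taken from the patch.

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>

#include <json/json.h>

int main() {
  // Shape of the tree listing that HuggingFaceSiblingsFileSize::FromJson consumes;
  // the concrete entries here are illustrative.
  const std::string listing = R"([
    {"type": "file", "path": "model-q4.gguf", "size": 4368439584},
    {"type": "file", "path": "README.md",     "size": 1204},
    {"type": "directory", "path": "onnx"}
  ])";

  Json::Value root;
  Json::CharReaderBuilder builder;
  std::string errs;
  std::istringstream iss(listing);
  if (!Json::parseFromStream(builder, iss, &root, &errs)) {
    std::cerr << "parse error: " << errs << "\n";
    return 1;
  }

  // Keep only file entries, keyed by path, as the FromJson helper does.
  std::unordered_map<std::string, uint64_t> file_sizes;
  for (const auto& entry : root) {
    if (entry["type"].asString() == "file")
      file_sizes[entry["path"].asString()] = entry["size"].asUInt64();
  }

  for (const auto& [path, size] : file_sizes)
    std::cout << path << " -> " << size << " bytes\n";
}
```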
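
With process creation moved into `cortex::process::SpawnProcess` and liveness checks into `process_status_utils::IsProcessRunning`, the engine's lifecycle boils down to: spawn with `posix_spawn` (inheriting `environ`), probe with `kill(pid, 0)`, and terminate with SIGTERM. A POSIX-only, self-contained sketch of how those calls compose; this is not the project's utility code, and the spawned `/bin/sleep` command is just an example.

```cpp
#include <cerrno>
#include <iostream>
#include <signal.h>
#include <spawn.h>
#include <string>
#include <sys/wait.h>
#include <unistd.h>
#include <vector>

extern char** environ;  // inherit the parent environment, as SpawnProcess does

static pid_t Spawn(const std::vector<std::string>& cmd) {
  std::vector<char*> argv;
  for (const auto& a : cmd) argv.push_back(const_cast<char*>(a.c_str()));
  argv.push_back(nullptr);
  pid_t pid = -1;
  if (posix_spawn(&pid, cmd[0].c_str(), nullptr, nullptr, argv.data(), environ) != 0)
    return -1;
  return pid;
}

static bool IsRunning(pid_t pid) {
  if (pid <= 0) return false;
  // Signal 0 checks existence/permissions without delivering anything,
  // the same probe IsProcessRunning uses; ESRCH means "no such process".
  return kill(pid, 0) == 0 || errno != ESRCH;
}

int main() {
  pid_t pid = Spawn({"/bin/sleep", "30"});       // hypothetical child command
  std::cout << "spawned pid " << pid << ", running: " << IsRunning(pid) << "\n";
  kill(pid, SIGTERM);                            // same path as TerminateModelProcess
  waitpid(pid, nullptr, 0);                      // reap so the child is not left as a zombie
  std::cout << "after SIGTERM, running: " << IsRunning(pid) << "\n";
}
```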