diff --git a/.github/scripts/e2e-test-whisper-linux-and-mac.sh b/.github/scripts/e2e-test-whisper-linux-and-mac.sh deleted file mode 100755 index 4c8a1e9eb..000000000 --- a/.github/scripts/e2e-test-whisper-linux-and-mac.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -## Example run command -# ./linux-and-mac.sh './jan/plugins/@janhq/inference-plugin/dist/nitro/nitro_mac_arm64' https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q2_K.gguf - -# Check for required arguments -if [[ $# -ne 2 ]]; then - echo "Usage: $0 " - exit 1 -fi - -rm /tmp/response1.log /tmp/response2.log /tmp/nitro.log - -BINARY_PATH=$1 -DOWNLOAD_URL=$2 - -# Random port to ensure it's not used -min=10000 -max=11000 -range=$((max - min + 1)) -PORT=$((RANDOM % range + min)) - -# Start the binary file -"$BINARY_PATH" 1 127.0.0.1 $PORT >/tmp/nitro.log & - -# Get the process id of the binary file -pid=$! - -if ! ps -p $pid >/dev/null; then - echo "nitro failed to start. Logs:" - cat /tmp/nitro.log - exit 1 -fi - -# Wait for a few seconds to let the server start -sleep 5 - -# Check if /tmp/testwhisper exists, if not, download it -if [[ ! -f "/tmp/testwhisper" ]]; then - curl --connect-timeout 300 $DOWNLOAD_URL --output /tmp/testwhisper -fi - -# Run the curl commands -response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/audio/load_model" \ - --header 'Content-Type: application/json' \ - --data '{ - "model_path": "/tmp/testwhisper", - "model_id": "whisper.cpp" -}') - -response2=$( - curl --connect-timeout 60 -o /tmp/response2.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/audio/transcriptions" \ - --header 'Access-Control-Allow-Origin: *' \ - --form 'file=@"../whisper.cpp/samples/jfk.wav"' \ - --form 'model_id="whisper.cpp"' \ - --form 'temperature="0.0"' \ - --form 'prompt="The transcript is about OpenAI which makes technology like DALLĀ·E, GPT-3, and ChatGPT with the hope of one day building an AGI system that benefits all of humanity. The president is trying to raly people to support the cause."' \ - -) - -error_occurred=0 -if [[ "$response1" -ne 200 ]]; then - echo "The first curl command failed with status code: $response1" - cat /tmp/response1.log - error_occurred=1 -fi - -if [[ "$response2" -ne 200 ]]; then - echo "The second curl command failed with status code: $response2" - cat /tmp/response2.log - error_occurred=1 -fi - -if [[ "$error_occurred" -eq 1 ]]; then - echo "Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!" - echo "Nitro Error Logs:" - cat /tmp/nitro.log - kill $pid - exit 1 -fi - -echo "----------------------" -echo "Log load model:" -cat /tmp/response1.log - -echo "----------------------" -echo "Log run test:" -cat /tmp/response2.log - -echo "Nitro test run successfully!" 
- -# Kill the server process -kill $pid diff --git a/.github/scripts/e2e-test-whisper-windows.bat b/.github/scripts/e2e-test-whisper-windows.bat deleted file mode 100644 index 6eb2037ea..000000000 --- a/.github/scripts/e2e-test-whisper-windows.bat +++ /dev/null @@ -1,102 +0,0 @@ -@echo off - -set "TEMP=C:\Users\%UserName%\AppData\Local\Temp" -set "MODEL_PATH=%TEMP%\testwhisper" - -rem Check for required arguments -if "%~2"=="" ( - echo Usage: %~0 ^ ^ - exit /b 1 -) - -set "BINARY_PATH=%~1" -set "DOWNLOAD_URL=%~2" - -for %%i in ("%BINARY_PATH%") do set "BINARY_NAME=%%~nxi" - -echo BINARY_NAME=%BINARY_NAME% - -del %TEMP%\response1.log 2>nul -del %TEMP%\response2.log 2>nul -del %TEMP%\nitro.log 2>nul - -set /a min=9999 -set /a max=11000 -set /a range=max-min+1 -set /a PORT=%min% + %RANDOM% %% %range% - -rem Start the binary file -start /B "" "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > %TEMP%\nitro.log 2>&1 - -ping -n 6 127.0.0.1 %PORT% > nul - -rem Capture the PID of the started process with "nitro" in its name -for /f "tokens=2" %%a in ('tasklist /fi "imagename eq %BINARY_NAME%" /fo list ^| findstr /B "PID:"') do ( - set "pid=%%a" -) - -echo pid=%pid% - -if not defined pid ( - echo nitro failed to start. Logs: - type %TEMP%\nitro.log - exit /b 1 -) - -rem Wait for a few seconds to let the server start - -rem Check if %TEMP%\testwhisper exists, if not, download it -if not exist "%MODEL_PATH%" ( - curl.exe --connect-timeout 300 %DOWNLOAD_URL% --output "%MODEL_PATH%" -) - -rem Define JSON strings for curl data -call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%" -set "curl_data1={\"model_path\":\"%MODEL_PATH_STRING%\",\"model_id\":\"whisper\"}" - -rem Run the curl commands and capture the status code -curl.exe --connect-timeout 60 -o %TEMP%\response1.log -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/audio/load_model" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1_code.log 2>&1 - -curl --connect-timeout 60 -o %TEMP%\response2.log -s -w "%%{http_code}" --location "http://localhost:%PORT%/v1/audio/transcriptions" ^ ---form "file=@../..//whisper.cpp/samples/jfk.wav" ^ ---form "model_id=whisper" > %TEMP%\response2_code.log 2>&1 - -set "error_occurred=0" - -rem Read the status codes from the log files -for /f %%a in (%TEMP%\response1_code.log) do set "response1=%%a" -for /f %%a in (%TEMP%\response2_code.log) do set "response2=%%a" - -if "%response1%" neq "200" ( - echo The first curl command failed with status code: %response1% - type %TEMP%\response1.log - set "error_occurred=1" -) - -if "%response2%" neq "200" ( - echo The second curl command failed with status code: %response2% - type %TEMP%\response2.log - set "error_occurred=1" -) - -if "%error_occurred%"=="1" ( - echo Nitro test run failed!!!!!!!!!!!!!!!!!!!!!! - echo Nitro Error Logs: - type %TEMP%\nitro.log - taskkill /f /pid %pid% - exit /b 1 -) - - -echo ---------------------- -echo Log load model: -type %TEMP%\response1.log - -echo ---------------------- -echo "Log run test:" -type %TEMP%\response2.log - -echo Nitro test run successfully! 
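For reference, the two deleted scripts above exercised Nitro's audio endpoints directly with curl. A condensed sketch of the flow they implemented is shown below; the port, model path, and sample WAV are placeholders copied from the scripts, and the server is assumed to already be running on that port.

#!/bin/bash
# Sketch of the whisper e2e flow performed by the deleted scripts (assumed values).
PORT=10500                              # the scripts picked a random free port
MODEL=/tmp/testwhisper                  # whisper model the scripts downloaded
WAV=../whisper.cpp/samples/jfk.wav      # sample audio used by the scripts

# 1. Load the whisper model into the running server.
curl -s -o /tmp/response1.log -w "%{http_code}\n" \
  "http://127.0.0.1:${PORT}/v1/audio/load_model" \
  --header 'Content-Type: application/json' \
  --data '{"model_path": "'"${MODEL}"'", "model_id": "whisper.cpp"}'

# 2. Request a transcription of the sample audio.
curl -s -o /tmp/response2.log -w "%{http_code}\n" \
  "http://127.0.0.1:${PORT}/v1/audio/transcriptions" \
  --form "file=@${WAV}" \
  --form 'model_id=whisper.cpp' \
  --form 'temperature=0.0'

# Both calls were expected to return HTTP 200; response bodies land in /tmp/response*.log.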
- -rem Kill the server process -taskkill /f /im nitro.exe 2>nul || exit /B 0 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 90e509d55..356a2a9ca 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,54 +1,16 @@ name: CI on: - schedule: - - cron: "0 20 * * *" # At 8 PM UTC, which is 3 AM UTC+7 push: tags: ["v[0-9]+.[0-9]+.[0-9]+"] paths: [ - ".github/scripts/**", - ".github/workflows/build.yml", - "**/CMakeLists.txt", - "**/Makefile", - "**/*.h", - "**/*.hpp", - "**/*.c", - "**/*.cpp", - "**/*.cu", - "**/*.cc", - "**/*.cxx", - "llama.cpp", - "!docs/**", - "!.gitignore", - "!README.md", - ] - pull_request: - types: [opened, synchronize, reopened] - paths: - [ - ".github/scripts/**", - ".github/workflows/build.yml", - "**/CMakeLists.txt", - "**/Makefile", - "**/*.h", - "**/*.hpp", - "**/*.c", - "**/*.cpp", - "**/*.cu", - "**/*.cc", - "**/*.cxx", - "llama.cpp", - "!docs/**", - "!.gitignore", - "!README.md", + "cortex-cpp/**", ] workflow_dispatch: env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf - WHISPER_MODEL_URL: https://delta.jan.ai/ggml-tiny-q5_1.bin EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf jobs: @@ -77,263 +39,121 @@ jobs: draft: true prerelease: false - # Get the latest version of the release - set-nitro-version: - runs-on: ubuntu-latest - outputs: - version: ${{ steps.version_update.outputs.new_version }} - steps: - - name: Get latest release - id: version_update - run: | - ldd --version - if [[ ${{ github.event_name }} == push && ${{ github.ref }} == refs/tags/* ]]; then - echo "VERSION=${GITHUB_REF#refs/tags/}" - NEW_VERSION="${VERSION#v}" - echo "::set-output name=new_version::$NEW_VERSION" - else - # Function to get the latest release tag - get_latest_tag() { - local retries=0 - local max_retries=3 - local tag - while [ $retries -lt $max_retries ]; do - tag=$(curl -s https://api.github.com/repos/janhq/nitro/releases/latest | jq -r .tag_name) - if [ -n "$tag" ] && [ "$tag" != "null" ]; then - echo $tag - return - else - let retries++ - sleep 2 - fi - done - echo "Failed to fetch latest tag after $max_retries attempts." 
- exit 1 - } - # Get the latest release tag from GitHub API - LATEST_TAG=$(get_latest_tag) - - # Remove the 'v' and append the build number to the version - NEW_VERSION="${LATEST_TAG#v}-${GITHUB_RUN_NUMBER}" - echo "New version: $NEW_VERSION" - echo "::set-output name=new_version::$NEW_VERSION" - fi - echo "Version: $NEW_VERSION" - - ubuntu-amd64-build: - runs-on: ubuntu-18-04-cuda-11-7 - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' + build-and-test: + runs-on: ${{ matrix.runs-on }} + needs: [create-draft-release] timeout-minutes: 40 - permissions: - contents: write - strategy: + fail-fast: false matrix: include: - - build: "amd64-avx2" - defines: "-DLLAMA_NATIVE=OFF" - - build: "amd64-avx" - defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - - build: "amd64-avx512" - defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - - build: "amd64-vulkan" - defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF" - # - build: "arm64" - # defines: "-A ARM64 -DLLAMA_NATIVE=OFF" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Prepare Vulkan SDK - if: ${{ matrix.build == 'amd64-vulkan' }} - uses: humbletim/setup-vulkan-sdk@v1.2.0 - with: - vulkan-query-version: 1.3.275.0 - vulkan-components: Vulkan-Headers, Vulkan-Loader - vulkan-use-cache: true - - - name: Build - id: make_build - run: | - ldd --version - ./install_deps.sh - mkdir build && cd build - cmake ${{ matrix.defines }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. - make -j $(nproc) - ls -la - - - name: Package - shell: bash - run: | - mkdir -p nitro - cp build/nitro nitro/ - tar -czvf nitro.tar.gz nitro - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' - with: - name: nitro-linux-${{ matrix.build }} - path: ./nitro - - - name: Run e2e testing - LLama.CPP - shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} - rm -rf uploads/ - - - name: Run e2e testing - Whisper.CPP - shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - - - uses: actions/upload-release-asset@v1.0.1 - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-${{ matrix.build }}.tar.gz - asset_content_type: application/gzip - - ubuntu-amd64-cuda-build: - runs-on: ubuntu-18-04-cuda-${{ matrix.cuda }} - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - 
timeout-minutes: 40 - permissions: - contents: write - strategy: - matrix: - cuda: ["12-0", "11-7"] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Build - id: make_build - run: | - ./install_deps.sh - mkdir build && cd build - cmake -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. - make -j $(nproc) - ls -la - - - name: Package - shell: bash - run: | - mkdir -p nitro - cp build/nitro nitro/ - tar -czvf nitro.tar.gz nitro + - os: "linux" + name: "amd64-avx2" + runs-on: "ubuntu-18-04" + cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" + run-e2e: true + + - os: "linux" + name: "amd64-avx" + runs-on: "ubuntu-18-04" + cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + run-e2e: false + + - os: "linux" + name: "amd64-avx512" + runs-on: "ubuntu-18-04" + cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + run-e2e: false + + - os: "linux" + name: "amd64-vulkan" + runs-on: "ubuntu-18-04-cuda-11-7" + cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF" + run-e2e: false + + - os: "linux" + name: "amd64-cuda-11-7" + runs-on: "ubuntu-18-04-cuda-11-7" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON" + run-e2e: false + + - os: "linux" + name: "amd64-cuda-12-0" + runs-on: "ubuntu-18-04-cuda-12-0" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON" + run-e2e: false + + - os: "mac" + name: "amd64" + runs-on: "macos-13" + cmake-flags: "" + run-e2e: true + + - os: "mac" + name: "arm64" + runs-on: "mac-silicon" + cmake-flags: "-DMAC_ARM64=ON" + run-e2e: true + + - os: "windows" + name: "amd64-avx2" + runs-on: "windows-latest" + cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: true + + - os: "windows" + name: "amd64-avx" + runs-on: "windows-latest" + cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx512" + runs-on: "windows-latest" + cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-vulkan" + runs-on: "windows-latest" + cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx2-cuda-12-0" + runs-on: "windows-cuda-12-0" + cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx-cuda-12-0" + runs-on: "windows-cuda-12-0" + cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx512-cuda-12-0" + runs-on: "windows-cuda-12-0" + cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx2-cuda-11-7" + runs-on: "windows-cuda-11-7" + cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF 
-DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx-cuda-11-7" + runs-on: "windows-cuda-11-7" + cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + - os: "windows" + name: "amd64-avx512-cuda-11-7" + runs-on: "windows-cuda-11-7" + cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false - - name: Upload Artifact - uses: actions/upload-artifact@v2 - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' - with: - name: nitro-linux-amd64-cuda-${{ matrix.cuda }} - path: ./nitro - - - uses: actions/upload-release-asset@v1.0.1 - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-amd64-cuda-${{ matrix.cuda }}.tar.gz - asset_content_type: application/gzip - - macOS-silicon-build: - runs-on: mac-silicon - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install cmake sdl2 - - - name: Build - id: cmake_build - run: | - ./install_deps.sh - mkdir build && cd build - cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. - CC=gcc-8 make -j $(sysctl -n hw.ncpu) - ls -la - - - name: Package - shell: bash - run: | - mkdir -p nitro - cp llama.cpp/ggml-metal.metal nitro/ - cp build/nitro nitro/ - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - with: - name: nitro-mac-arm64 - path: ./nitro - - - name: Run e2e testing - LLama.CPP - shell: bash - run: | - # run e2e testing - cd nitro/ - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} - rm -rf uploads/ - - - name: Run e2e testing - Whisper.CPP - shell: bash - run: | - # To test with CoreML - if [[ ! 
-f "/tmp/testwhisper-encoder.mlmodelc" ]]; then - wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-encoder.mlmodelc.zip - unzip ggml-tiny-encoder.mlmodelc.zip - rm ggml-tiny-encoder.mlmodelc.zip - rm -rf /tmp/testwhisper-encoder.mlmodelc - mv ggml-tiny-encoder.mlmodelc /tmp/testwhisper-encoder.mlmodelc - fi - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - - macOS-amd64-build: - runs-on: macos-latest - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write steps: - name: Clone id: checkout @@ -341,314 +161,32 @@ jobs: with: submodules: recursive - - name: Dependencies - id: depends - continue-on-error: true + - name: Install choco on Windows + if: runner.os == 'Windows' run: | - brew update - brew install sdl2 + choco install make -y - name: Build - id: cmake_build run: | - ./install_deps.sh - mkdir build && cd build - cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. - CC=gcc-8 make -j $(sysctl -n hw.ncp) - ls -la + cd cortex-cpp + make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}" - name: Package - shell: bash - run: | - mkdir -p nitro - cp build/nitro nitro/ - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - with: - name: nitro-mac-amd64 - path: ./nitro - - - name: Run e2e testing - LLama.CPP - shell: bash - run: | - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} - rm -rf uploads/ - - - name: Run e2e testing - Whisper.CPP - shell: bash - run: | - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - - universal-nitro-artifact-macos: - runs-on: macos-latest - needs: [create-draft-release, set-nitro-version, macOS-silicon-build, macOS-amd64-build] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write - steps: - - name: download artifact amd64 - uses: actions/download-artifact@v2 - with: - name: nitro-mac-amd64 - path: ./nitro-mac-amd64 - - - name: download artifact arm64 - uses: actions/download-artifact@v2 - with: - name: nitro-mac-arm64 - path: ./nitro-mac-arm64 - - - name: bundle universal binary - run: | - mkdir -p nitro - ls ./nitro-mac-amd64 - lipo -create ./nitro-mac-amd64/nitro ./nitro-mac-arm64/nitro -output ./nitro/nitro - cp ./nitro-mac-arm64/ggml-metal.metal ./nitro/ggml-metal.metal - tar -czvf nitro.tar.gz nitro - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - with: - name: nitro-mac-universal - path: ./nitro - - - uses: actions/upload-release-asset@v1.0.1 - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version 
}}-mac-universal.tar.gz - asset_content_type: application/gzip - - windows-amd64-build: - runs-on: windows-latest - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - - strategy: - matrix: - include: - - build: "amd64-avx2" - defines: "-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - - build: "amd64-avx" - defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - - build: "amd64-avx512" - defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - - build: "amd64-vulkan" - defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - # - build: "arm64" - # defines: "-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON" - - permissions: - contents: write - - steps: - - name: Clone - - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1 - - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 - run: | - C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip - 7z x sdl2.zip -aoa - echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - - name: actions-setup-cmake - uses: jwlawson/actions-setup-cmake@v1.14.1 - - - name: Prepare Vulkan SDK - uses: humbletim/setup-vulkan-sdk@v1.2.0 - if: ${{ matrix.build == 'amd64-vulkan' }} - with: - vulkan-query-version: 1.3.275.0 - vulkan-components: Vulkan-Headers, Vulkan-Loader - vulkan-use-cache: true - - - name: Build - id: cmake_build - shell: cmd - run: | - cmake -S ./nitro_deps -B ./build_deps/nitro_deps - cmake --build ./build_deps/nitro_deps --config Release - mkdir -p build - cd build - cmake .. ${{ matrix.defines }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} - cmake --build . 
--config Release -j "%NUMBER_OF_PROCESSORS%" - - - name: Pack artifacts - id: pack_artifacts - shell: cmd - run: | - robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll - robocopy build\bin\Release\ .\build\Release\ llama.dll - robocopy build\bin\Release\ .\build\Release\ whisper.dll - robocopy .github\patches\windows\ .\build\Release\ msvcp140.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140_1.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140.dll - robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll - dotnet tool install --global AzureSignTool - azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" - 7z a -ttar temp.tar .\build\Release\* - 7z a -tgzip nitro.tar.gz temp.tar - - - name: Run e2e testing - Llama.cpp - shell: cmd - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - cd build\Release - ..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} - rmdir /S /Q .\build\Release\uploads - - - name: Run e2e testing - Whisper.cpp - shell: cmd - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - cd build\Release - ..\..\.github\scripts\e2e-test-whisper-windows.bat nitro.exe ${{ env.WHISPER_MODEL_URL }} - rmdir /S /Q .\build\Release\uploads - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' - with: - name: nitro-win-${{ matrix.build }} - path: ./build/Release - - - uses: actions/upload-release-asset@v1.0.1 - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.build }}.tar.gz - asset_content_type: application/gzip - - windows-amd64-cuda-build: - runs-on: windows-cuda-${{ matrix.cuda }} - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write - - strategy: - matrix: - cuda: ["12-0", "11-7"] - instructions: ["amd64-avx2", "amd64-avx", "amd64-avx512"] - - steps: - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1 - - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 - run: | - curl -L -o sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip - 7z x sdl2.zip -aoa - echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - - name: actions-setup-cmake - uses: jwlawson/actions-setup-cmake@v1.14.1 - - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - 
silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - - uses: actions/setup-dotnet@v3 - with: - dotnet-version: "6.0.x" - - # Conditional instruction check and set environment variable - - name: Set INSTRUCTION Based on Instructions ${{ matrix.instructions }} - shell: cmd run: | - IF "${{ matrix.instructions }}" == "amd64-avx2" ( - echo "INSTRUCTION=-DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_NATIVE=OFF" - ) ELSE IF "${{ matrix.instructions }}" == "amd64-avx" ( - echo "INSTRUCTION=-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - ) ELSE IF "${{ matrix.instructions }}" == "amd64-avx512" ( - echo "INSTRUCTION=-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - ) + cd cortex-cpp + make package - - name: Build - id: cmake_build - shell: cmd + - name: Run e2e testing + if: ${{ matrix.run-e2e }} run: | - cmake -S ./nitro_deps -B ./build_deps/nitro_deps - cmake --build ./build_deps/nitro_deps --config Release - mkdir -p build - cd build - cmake .. %INSTRUCTION% -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} - cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%" - - - name: Pack artifacts - id: pack_artifacts - shell: cmd - run: | - set PATH=%PATH%;C:\Program Files\7-Zip\ - robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll - robocopy build\bin\Release\ .\build\Release\ llama.dll - robocopy build\bin\Release\ .\build\Release\ whisper.dll - robocopy .github\patches\windows\ .\build\Release\ msvcp140.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140_1.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140.dll - robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll - dotnet tool install --global AzureSignTool - %USERPROFILE%\.dotnet\tools\azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" - 7z a -ttar temp.tar .\build\Release\* - 7z a -tgzip nitro.tar.gz temp.tar + cd cortex-cpp + make run-e2e-test RUN_TESTS=true LLM_MODEL_URL=${{ env.LLM_MODEL_URL }} EMBEDDING_MODEL_URL=${{ env.EMBEDDING_MODEL_URL }} - name: Upload Artifact uses: actions/upload-artifact@v2 - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' with: - name: nitro-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }} - path: ./build/Release + name: cortex-llamacpp-engine-${{ matrix.os }}-${{ matrix.name }} + path: ./cortex-cpp/cortex - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -656,103 +194,6 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }}.tar.gz - asset_content_type: application/gzip - - update_release_draft: - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - timeout-minutes: 40 - needs: - [ - ubuntu-amd64-build, - ubuntu-amd64-cuda-build, - 
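Every matrix entry above funnels into the same three Makefile targets that the job runs. As a rough local sketch, mirroring the linux amd64-avx2 entry on an x86-64 Linux host (flags and model URLs copied from the workflow; paths assume the repository root):

# Sketch: reproduce the linux amd64-avx2 matrix entry locally.
cd cortex-cpp

make build CMAKE_EXTRA_FLAGS="-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF"
make package        # produces cortex.tar.gz containing the nitro binary and the engine

make run-e2e-test RUN_TESTS=true \
  LLM_MODEL_URL=https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf \
  EMBEDDING_MODEL_URL=https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf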
macOS-silicon-build, - macOS-amd64-build, - windows-amd64-build, - windows-amd64-cuda-build, - ] - permissions: - contents: write - pull-requests: write - runs-on: ubuntu-latest - steps: - - uses: release-drafter/release-drafter@v5 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - noti-discord-nightly: - timeout-minutes: 40 - if: github.event_name == 'schedule' && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.ubuntu-amd64-build.result == 'success' && needs.ubuntu-amd64-cuda-build.result == 'success' && needs.macOS-silicon-build.result == 'success' && needs.macOS-amd64-build.result == 'success' && needs.windows-amd64-build.result == 'success' && needs.windows-amd64-cuda-build.result == 'success' - needs: - [ - create-draft-release, - ubuntu-amd64-build, - ubuntu-amd64-cuda-build, - macOS-silicon-build, - macOS-amd64-build, - windows-amd64-build, - windows-amd64-cuda-build, - ] - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v3 - with: - fetch-depth: "0" - token: ${{ secrets.PAT_SERVICE_ACCOUNT }} - - name: Notify Discord - uses: Ilshidur/action-discord@master - with: - args: "Nightly build artifact: https://github.com/janhq/nitro/actions/runs/{{ GITHUB_RUN_ID }}" - env: - DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} - - name: Update README.md with artifact URL - run: | - sed -i "s|||" README.md - git config --global user.email "service@jan.ai" - git config --global user.name "Service Account" - git add README.md - git commit -m "${GITHUB_REPOSITORY}: Update README.md with nightly build artifact URL" - git -c http.extraheader="AUTHORIZATION: bearer ${{ secrets.PAT_SERVICE_ACCOUNT }}" push origin HEAD:main - env: - GITHUB_RUN_ID: ${{ github.run_id }} - - noti-discord-manual: - timeout-minutes: 40 - if: github.event_name == 'workflow_dispatch' && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.ubuntu-amd64-build.result == 'success' && needs.ubuntu-amd64-cuda-build.result == 'success' && needs.macOS-silicon-build.result == 'success' && needs.macOS-amd64-build.result == 'success' && needs.windows-amd64-build.result == 'success' && needs.windows-amd64-cuda-build.result == 'success' - needs: - [ - create-draft-release, - ubuntu-amd64-build, - ubuntu-amd64-cuda-build, - macOS-silicon-build, - macOS-amd64-build, - windows-amd64-build, - windows-amd64-cuda-build, - ] - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v3 - with: - fetch-depth: "0" - token: ${{ secrets.PAT_SERVICE_ACCOUNT }} - - name: Notify Discord - uses: Ilshidur/action-discord@master - with: - args: "Manual build artifact: https://github.com/janhq/nitro/actions/runs/{{ GITHUB_RUN_ID }}" - env: - DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} - # Update README.md with artifact URL if manual build from main branch - - name: Update README.md with artifact URL - if: github.ref == 'refs/heads/main' - run: | - sed -i "s|||" README.md - git config --global user.email "service@jan.ai" - git config --global user.name "Service Account" - git add README.md - git commit -m "${GITHUB_REPOSITORY}: Update README.md with nightly build artifact URL" - git -c http.extraheader="AUTHORIZATION: bearer ${{ secrets.PAT_SERVICE_ACCOUNT }}" push origin HEAD:main - env: - GITHUB_RUN_ID: ${{ github.run_id }} + asset_path: ./cortex-cpp/cortex.tar.gz + asset_name: cortex-llamacpp-engine-${{ needs.create-draft-release.outputs.version }}-${{ matrix.os }}-${{ 
matrix.name }}.tar.gz + asset_content_type: application/gzip \ No newline at end of file diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 75d46cb03..000000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,95 +0,0 @@ -name: Nitro Docs - -on: - push: - branches: - - main - paths: - - 'docs/**' - - '.github/workflows/docs.yml' - pull_request: - branches: - - main - paths: - - 'docs/**' - - '.github/workflows/docs.yml' - # Review gh actions docs if you want to further define triggers, paths, etc - # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on - -jobs: - deploy: - name: Deploy to GitHub Pages - env: - CLOUDFLARE_ACCOUNT_ID: 9707100ef42a1a25bd70e3ee2137bd0e - CLOUDFLARE_PROJECT_NAME: nitro - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Install jq - uses: dcarbone/install-jq-action@v2.0.1 - - - name: Fill env vars - run: | - env_example_file=".env.example" - touch .env - while IFS= read -r line || [[ -n "$line" ]]; do - if [[ "$line" == *"="* ]]; then - var_name=$(echo $line | cut -d '=' -f 1) - echo $var_name - var_value="$(jq -r --arg key "$var_name" '.[$key]' <<< "$SECRETS")" - echo "$var_name=$var_value" >> .env - fi - done < "$env_example_file" - working-directory: docs - env: - SECRETS: '${{ toJson(secrets) }}' - - - name: Install dependencies - run: yarn install - working-directory: docs - - name: Build website - run: sed -i '/process.env.DEBUG = namespaces;/c\// process.env.DEBUG = namespaces;' ./node_modules/debug/src/node.js && yarn build - working-directory: docs - - - name: Publish to Cloudflare Pages PR Preview and Staging - if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main') - uses: cloudflare/pages-action@v1 - with: - apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} - accountId: ${{ env.CLOUDFLARE_ACCOUNT_ID }} - projectName: ${{ env.CLOUDFLARE_PROJECT_NAME }} - directory: ./docs/build - # Optional: Enable this if you want to have GitHub Deployments triggered - gitHubToken: ${{ secrets.GITHUB_TOKEN }} - id: deployCloudflarePages - - - uses: mshick/add-pr-comment@v2 - if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' - with: - message: | - Preview URL: ${{ steps.deployCloudflarePages.outputs.url }} - - - name: Add Custome Domain file - if: github.event_name == 'push' && github.event.pull_request.head.repo.full_name != github.repository - run: echo "${{ vars.DOCUSAURUS_DOMAIN }}" > ./docs/build/CNAME - - # Popular action to deploy to GitHub Pages: - # Docs: https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-docusaurus - - name: Deploy to GitHub Pages - if: github.event_name == 'push' && github.event.pull_request.head.repo.full_name != github.repository - uses: peaceiris/actions-gh-pages@v3 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - # Build output to publish to the `gh-pages` branch: - publish_dir: ./docs/build - # The following lines assign commit authorship to the official - # GH-Actions bot for deploys to `gh-pages` branch: - # https://github.com/actions/checkout/issues/13#issuecomment-724415212 - # The GH actions bot is used by default if you didn't specify the two fields. - # You can swap them out with your own user credentials. 
- user_name: github-actions[bot] - user_email: 41898282+github-actions[bot]@users.noreply.github.com \ No newline at end of file diff --git a/.github/workflows/quality-gate.yml b/.github/workflows/quality-gate.yml new file mode 100644 index 000000000..82930e0ed --- /dev/null +++ b/.github/workflows/quality-gate.yml @@ -0,0 +1,163 @@ +name: CI Quality Gate + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + [ + "cortex-cpp/**", + ] + workflow_dispatch: + +env: + LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf + EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf + +jobs: + build-and-test: + runs-on: ${{ matrix.runs-on }} + timeout-minutes: 40 + strategy: + fail-fast: false + matrix: + include: + - os: "linux" + name: "amd64-avx2" + runs-on: "ubuntu-18-04" + cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" + run-e2e: true + + - os: "linux" + name: "amd64-avx" + runs-on: "ubuntu-18-04" + cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + run-e2e: false + + - os: "linux" + name: "amd64-avx512" + runs-on: "ubuntu-18-04" + cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + run-e2e: false + + - os: "linux" + name: "amd64-vulkan" + runs-on: "ubuntu-18-04-cuda-11-7" + cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF" + run-e2e: false + + - os: "linux" + name: "amd64-cuda-11-7" + runs-on: "ubuntu-18-04-cuda-11-7" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON" + run-e2e: false + + - os: "linux" + name: "amd64-cuda-12-0" + runs-on: "ubuntu-18-04-cuda-12-0" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON" + run-e2e: false + + - os: "mac" + name: "amd64" + runs-on: "macos-13" + cmake-flags: "" + run-e2e: true + + - os: "mac" + name: "arm64" + runs-on: "mac-silicon" + cmake-flags: "-DMAC_ARM64=ON" + run-e2e: true + + - os: "windows" + name: "amd64-avx2" + runs-on: "windows-latest" + cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: true + + - os: "windows" + name: "amd64-avx" + runs-on: "windows-latest" + cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx512" + runs-on: "windows-latest" + cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-vulkan" + runs-on: "windows-latest" + cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx2-cuda-12-0" + runs-on: "windows-cuda-12-0" + cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx-cuda-12-0" + runs-on: "windows-cuda-12-0" + cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx512-cuda-12-0" + runs-on: "windows-cuda-12-0" + cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: 
"windows" + name: "amd64-avx2-cuda-11-7" + runs-on: "windows-cuda-11-7" + cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + - os: "windows" + name: "amd64-avx-cuda-11-7" + runs-on: "windows-cuda-11-7" + cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + - os: "windows" + name: "amd64-avx512-cuda-11-7" + runs-on: "windows-cuda-11-7" + cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" + run-e2e: false + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Install choco on Windows + if: runner.os == 'Windows' + run: | + choco install make -y + + - name: Build + run: | + cd cortex-cpp + make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}" + + - name: Package + run: | + cd cortex-cpp + make package + + - name: Run e2e testing + if: ${{ matrix.run-e2e }} + run: | + cd cortex-cpp + make run-e2e-test RUN_TESTS=true LLM_MODEL_URL=${{ env.LLM_MODEL_URL }} EMBEDDING_MODEL_URL=${{ env.EMBEDDING_MODEL_URL }} + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: cortex-llamacpp-engine-${{ matrix.os }}-${{ matrix.name }} + path: ./cortex-cpp/cortex \ No newline at end of file diff --git a/cortex-cpp/common/base.cc b/.gitmodules similarity index 100% rename from cortex-cpp/common/base.cc rename to .gitmodules diff --git a/cortex-cpp/.gitignore b/cortex-cpp/.gitignore index be1237faa..69c167305 100644 --- a/cortex-cpp/.gitignore +++ b/cortex-cpp/.gitignore @@ -85,7 +85,6 @@ CMakeCache.txt CMakeFiles CMakeScripts Testing -Makefile !nitro-node/Makefile cmake_install.cmake install_manifest.txt diff --git a/cortex-cpp/.gitmodules b/cortex-cpp/.gitmodules deleted file mode 100644 index e2f71d456..000000000 --- a/cortex-cpp/.gitmodules +++ /dev/null @@ -1,7 +0,0 @@ -[submodule "llama.cpp"] - path = llama.cpp - url = https://github.com/ggerganov/llama.cpp - branch = master -[submodule "whisper.cpp"] - path = whisper.cpp - url = https://github.com/ggerganov/whisper.cpp.git diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt index eba4fee0c..97be0e86d 100644 --- a/cortex-cpp/CMakeLists.txt +++ b/cortex-cpp/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.5) project(nitro C CXX) +include(engines/cortex.llamacpp/engine.cmake) include(CheckIncludeFileCXX) check_include_file_cxx(any HAS_ANY) @@ -55,9 +56,6 @@ endif() add_compile_definitions(NITRO_VERSION="${NITRO_VERSION}") -add_subdirectory(llama.cpp/examples/llava) -add_subdirectory(llama.cpp) -add_subdirectory(whisper.cpp) add_subdirectory(test) add_executable(${PROJECT_NAME} main.cc) @@ -68,8 +66,9 @@ add_executable(${PROJECT_NAME} main.cc) # target_link_libraries(${PROJECT_NAME} PRIVATE nitro_deps) # # and comment out the following lines + find_package(Drogon CONFIG REQUIRED) -target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama whisper llava +target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT}) # ############################################################################## @@ -87,8 +86,8 @@ endif() aux_source_directory(controllers CTL_SRC) aux_source_directory(common COMMON_SRC) -aux_source_directory(context CONTEXT_SRC) 
aux_source_directory(models MODEL_SRC) +aux_source_directory(cortex-common CORTEX_COMMON) # aux_source_directory(filters FILTER_SRC) aux_source_directory(plugins # PLUGIN_SRC) @@ -97,10 +96,10 @@ aux_source_directory(models MODEL_SRC) # namespaces. drogon_create_views(${PROJECT_NAME} # ${CMAKE_CURRENT_SOURCE_DIR}/views ${CMAKE_CURRENT_BINARY_DIR} TRUE) -target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ) # ${CMAKE_CURRENT_SOURCE_DIR}/models) -target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC}) +target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC}) # ${FILTER_SRC} ${PLUGIN_SRC} ${MODEL_SRC}) # ############################################################################## # uncomment the following line for dynamically loading views set_property(TARGET -# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON) +# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON) \ No newline at end of file diff --git a/cortex-cpp/Makefile b/cortex-cpp/Makefile new file mode 100644 index 000000000..960bb198a --- /dev/null +++ b/cortex-cpp/Makefile @@ -0,0 +1,68 @@ +# Makefile for Cortex llamacpp engine - Build, Lint, Test, and Clean +.PHONY: all build package run-e2e-test + + +CMAKE_EXTRA_FLAGS ?= "" +RUN_TESTS ?= false +LLM_MODEL_URL ?= "https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf" +EMBEDDING_MODEL_URL ?= "https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf" + +# Default target, does nothing +all: + @echo "Specify a target to run" + +# Build the Cortex engine +build: +ifeq ($(OS),Windows_NT) + @powershell -Command "cmake -S ./nitro_deps -B ./build_deps/nitro_deps;" + @powershell -Command "cmake --build ./build_deps/nitro_deps --config Release -j4;" + @powershell -Command "mkdir -p build; cd build; cmake .. $(CMAKE_EXTRA_FLAGS); cmake --build . --config Release -j4;" +else ifeq ($(shell uname -s),Linux) + @./install_deps.sh; + @mkdir -p build && cd build; \ + cmake .. $(CMAKE_EXTRA_FLAGS); \ + make -j4; +else + @./install_deps.sh; + @mkdir -p build && cd build; \ + cmake .. 
$(CMAKE_EXTRA_FLAGS); \ + make -j4; +endif + +package: +ifeq ($(OS),Windows_NT) + @powershell -Command "mkdir -p cortex\engines\cortex.llamacpp\; cp build\engines\cortex.llamacpp\engine.dll cortex\engines\cortex.llamacpp\;" + @powershell -Command "cp build\Release\nitro.exe .\cortex\;" + @powershell -Command "cp build_deps\_install\bin\zlib.dll .\cortex\;" + @powershell -Command "cp ..\.github\patches\windows\msvcp140.dll .\cortex\;" + @powershell -Command "cp ..\.github\patches\windows\vcruntime140_1.dll .\cortex\;" + @powershell -Command "cp ..\.github\patches\windows\vcruntime140.dll .\cortex\;" + @powershell -Command "7z a -ttar temp.tar cortex\\*; 7z a -tgzip cortex.tar.gz temp.tar;" +else ifeq ($(shell uname -s),Linux) + @mkdir -p cortex/engines/cortex.llamacpp; \ + cp build/engines/cortex.llamacpp/libengine.so cortex/engines/cortex.llamacpp/; \ + cp build/nitro cortex/; \ + tar -czvf cortex.tar.gz cortex; +else + @mkdir -p cortex/engines/cortex.llamacpp; \ + cp build/engines/cortex.llamacpp/libengine.dylib cortex/engines/cortex.llamacpp/; \ + cp build/nitro cortex/; \ + tar -czvf cortex.llamacpp.tar.gz cortex; +endif + +run-e2e-test: +ifeq ($(RUN_TESTS),false) + @echo "Skipping tests" + @exit 0 +endif +ifeq ($(OS),Windows_NT) + @powershell -Command "cd cortex; ..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);" +else ifeq ($(shell uname -s),Linux) + @cd cortex; \ + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); \ + rm -rf uploads/; +else + @cd cortex; \ + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); \ + rm -rf uploads/; +endif \ No newline at end of file diff --git a/cortex-cpp/common/base.h b/cortex-cpp/common/base.h index e87d07488..43d612c1b 100644 --- a/cortex-cpp/common/base.h +++ b/cortex-cpp/common/base.h @@ -1,6 +1,5 @@ #pragma once #include -#include using namespace drogon; @@ -25,7 +24,7 @@ class BaseChatCompletion { // General chat method virtual void ChatCompletion( - inferences::ChatCompletionRequest &&completion, + const HttpRequestPtr& req, std::function&& callback) = 0; }; @@ -38,21 +37,5 @@ class BaseEmbedding { const HttpRequestPtr& req, std::function&& callback) = 0; - // The derived class can also override other methods if needed -}; - -class BaseAudio { - public: - virtual ~BaseAudio() {} - // Transcribes audio into the input language. - virtual void CreateTranscription( - const HttpRequestPtr& req, - std::function&& callback) = 0; - - // Translates audio into the input language. 
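Going by the package recipe above, the Linux archive unpacks to a cortex/ directory with the server binary next to the dynamically loaded engine; the Windows recipe additionally bundles zlib and the MSVC runtime DLLs, and the macOS branch writes the archive as cortex.llamacpp.tar.gz rather than cortex.tar.gz. A quick sanity check of the Linux output, as a sketch:

# Sketch: inspect the archive produced by `make package` on Linux.
tar -tzf cortex.tar.gz
# Expected entries, per the recipe above:
#   cortex/nitro
#   cortex/engines/cortex.llamacpp/libengine.so

# Unpack and start the server for a manual smoke test
# (same positional arguments the e2e scripts pass: 1 <host> <port>).
tar -xzf cortex.tar.gz
cd cortex && ./nitro 1 127.0.0.1 10500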
- virtual void CreateTranslation( - const HttpRequestPtr& req, - std::function&& callback) = 0; - // The derived class can also override other methods if needed }; \ No newline at end of file diff --git a/cortex-cpp/context/llama_server_context.h b/cortex-cpp/context/llama_server_context.h deleted file mode 100644 index 21792f11b..000000000 --- a/cortex-cpp/context/llama_server_context.h +++ /dev/null @@ -1,2260 +0,0 @@ -#include -#include -#include -#include - -// External -#include "clip.h" -#include "common.h" -#include "llama.h" -#include "llava.h" -#include "stb_image.h" -#include "utils/json.hpp" - -#if defined(_WIN32) -#define NOMINMAX -#endif - -using json = nlohmann::json; - -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" - -struct server_params { - std::string hostname = "127.0.0.1"; - std::string api_key; - std::string public_path = "examples/server/public"; - int32_t port = 8080; - int32_t read_timeout = 600; - int32_t write_timeout = 600; -}; - -static bool server_verbose = false; - -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do { \ - if (server_verbose) { \ - server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR_LLAMA(MSG, ...) \ - server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING_LLAMA(MSG, ...) \ - server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO_LLAMA(MSG, ...) \ - server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -// -// base64 utils (TODO: move to common in the future) -// - -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; - -static inline bool is_base64(uint8_t c) { - return (isalnum(c) || (c == '+') || (c == '/')); -} - -static std::vector base64_decode(const std::string& encoded_string) { - int i = 0; - int j = 0; - int in_ = 0; - - int in_len = encoded_string.size(); - - uint8_t char_array_4[4]; - uint8_t char_array_3[3]; - - std::vector ret; - - while (in_len-- && (encoded_string[in_] != '=') && - is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; - in_++; - if (i == 4) { - for (i = 0; i < 4; i++) { - char_array_4[i] = base64_chars.find(char_array_4[i]); - } - - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (i = 0; (i < 3); i++) { - ret.push_back(char_array_3[i]); - } - i = 0; - } - } - - if (i) { - for (j = i; j < 4; j++) { - char_array_4[j] = 0; - } - - for (j = 0; j < 4; j++) { - char_array_4[j] = base64_chars.find(char_array_4[j]); - } - - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (j = 0; (j < i - 1); j++) { - ret.push_back(char_array_3[j]); - } - } - - return ret; -} - -// -// parallel -// - -enum task_type { COMPLETION_TASK, CANCEL_TASK }; - -struct task_server { - int id; - int target_id; - task_type type; - json data; - bool infill_mode = false; - bool embedding_mode = false; - int multitask_id = -1; -}; - -struct task_result { - int id; - int multitask_id = -1; - bool stop; - bool error; - json result_json; -}; - -struct task_multi { - int id; - std::set subtasks_remaining{}; - 
std::vector results{}; -}; - -// TODO: can become bool if we can't find use of more states -enum slot_state { - IDLE, - PROCESSING, -}; - -enum slot_command { - NONE, - LOAD_PROMPT, - RELEASE, -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = - false; // remember the prompt to avoid reprocessing all prompt - - uint32_t seed = -1; // RNG seed - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_predict = -1; // new tokens to predict - - std::vector antiprompt; - - json input_prefix; - json input_suffix; -}; - -struct slot_image { - int32_t id; - - bool request_encode_image = false; - float* image_embedding = nullptr; - int32_t image_tokens = 0; - - clip_image_u8* img_data; - - std::string prefix_prompt; // before of this image -}; - -// completion token output with probabilities -struct completion_token_output { - struct token_prob { - llama_token tok; - float prob; - }; - - std::vector probs; - llama_token tok; - std::string text_to_send; -}; - -static size_t common_part(const std::vector& a, - const std::vector& b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - return i; -} - -enum stop_type { - STOP_FULL, - STOP_PARTIAL, -}; - -enum class ModelType { LLM = 0, EMBEDDING }; - -static bool ends_with(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -static size_t find_partial_stop_string(const std::string& stop, - const std::string& text) { - if (!text.empty() && !stop.empty()) { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { - if (stop[char_index] == text_last_char) { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) { - return text.size() - char_index - 1; - } - } - } - } - return std::string::npos; -} - -// TODO: reuse llama_detokenize -template -static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end) { - std::string ret; - for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); - } - return ret; -} - -static void server_log(const char* level, const char* function, int line, - const char* message, - const nlohmann::ordered_json& extra) { - nlohmann::ordered_json log{ - {"timestamp", time(nullptr)}, {"level", level}, - {"function", function}, {"line", line}, - {"message", message}, - }; - - if (!extra.empty()) { - log.merge_patch(extra); - } - - const std::string str = - log.dump(-1, ' ', false, json::error_handler_t::replace); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); -} - -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context* ctx, - const llama_token token) { - std::string out = token == -1 ? 
"" : llama_token_to_piece(ctx, token); - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - return out; -} - -// convert a vector of completion_token_output to json -static json probs_vector_to_json( - const llama_context* ctx, - const std::vector& probs) { - json out = json::array(); - for (const auto& prob : probs) { - json probs_for_token = json::array(); - for (const auto& p : prob.probs) { - std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json{ - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json{ - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - return out; -} - -template -static T json_value(const json& body, const std::string& key, - const T& default_value) { - // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? body.value(key, default_value) - : default_value; -} - -struct llama_client_slot { - int id; - int task_id = -1; - - struct slot_params params; - - slot_state state = IDLE; - slot_command command = NONE; - - // used to determine the slot that has been used the longest - int64_t t_last_used = -1; - - // generation props - int32_t n_ctx = 0; // context size per slot - int32_t n_past = 0; - int32_t n_decoded = 0; - int32_t n_remaining = -1; - int32_t i_batch = -1; - - int32_t num_prompt_tokens = 0; - int32_t num_prompt_tokens_processed = 0; - - json prompt; - std::string generated_text; - llama_token sampled; - std::vector cache_tokens; - std::vector generated_token_probs; - - bool infill = false; - bool embedding = false; - bool has_next_token = true; - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - - bool oaicompat = false; - std::string oaicompat_model; - - std::string stopping_word; - - // sampling - struct llama_sampling_params sparams; - llama_sampling_context* ctx_sampling = nullptr; - - // multimodal - std::vector images; - - // stats - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - - int64_t t_start_process_prompt; - int64_t t_start_genereration; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - // multitasks - int multitask_id = -1; - - void reset() { - num_prompt_tokens = 0; - generated_text = ""; - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - n_past = 0; - sent_count = 0; - sent_token_probs_index = 0; - infill = false; - - generated_token_probs.clear(); - - for (slot_image& img : images) { - free(img.image_embedding); - if (img.img_data) { - clip_image_u8_free(img.img_data); - } - img.prefix_prompt = ""; - } - - images.clear(); - } - - bool has_budget(gpt_params& global_params) { - n_remaining = -1; - if (params.n_predict != -1) { - n_remaining = params.n_predict - n_decoded; - } else if (global_params.n_predict != -1) { - n_remaining = global_params.n_predict - n_decoded; - } - return n_remaining > 0 || n_remaining == -1; // no budget || limitless - } - - bool available() const { return state == IDLE && command == NONE; } - - bool is_processing() const { - return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; - } - - void 
add_token_string(const completion_token_output& token) { - if (command == RELEASE) { - return; - } - cache_tokens.push_back(token.tok); - generated_token_probs.push_back(token); - } - - void release() { - if (state == IDLE || state == PROCESSING) { - t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; - command = RELEASE; - } - } - - json get_formated_timings() { - return json{ - {"prompt_n", num_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", - t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", - 1e3 / t_prompt_processing * num_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; - } - - void print_timings() const { - LOG_DEBUG << __func__ << ": prompt eval time = " << t_prompt_processing - << "ms / " << num_prompt_tokens_processed << " tokens (" - << t_prompt_processing / num_prompt_tokens_processed - << " ms per " - "token, " - << 1e3 / t_prompt_processing * num_prompt_tokens_processed - << " tokens per second)"; - LOG_DEBUG << __func__ << ": eval time = " << t_token_generation - << " ms / " << n_decoded << " runs (" - << t_token_generation / n_decoded - << " ms per " - "token, " - << 1e3 / t_token_generation * n_decoded - << " tokens per second)\n"; - LOG_DEBUG << __func__ << ": total time = " - << t_prompt_processing + t_token_generation << " ms"; - } -}; - -struct llama_server_context { - llama_model* model = nullptr; - llama_context* ctx = nullptr; - - clip_ctx* clp_ctx = nullptr; - - gpt_params params; - - llama_batch batch; - - bool multimodal = false; - bool clean_kv_cache = true; - bool all_slots_are_idle = false; - bool add_bos_token = true; - - int32_t id_gen; - int32_t n_ctx; // total context for all clients / slots - - // Internal - std::atomic model_loaded_external = false; - - // system prompt - bool system_need_update = false; - - std::string system_prompt; - std::vector system_tokens; - - std::string name_user; // this should be the antiprompt - std::string name_assistant; - - // slots / clients - std::vector slots; - - std::vector queue_tasks; - std::vector queue_results; - std::vector queue_multitasks; - std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks - std::condition_variable condition_tasks; - std::mutex mutex_results; - std::condition_variable condition_results; - ModelType model_type = ModelType::LLM; - - ~llama_server_context() { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - if (model) { - llama_free_model(model); - model = nullptr; - } - } - - bool load_model(const gpt_params& params_) { - params = params_; - if (!params.mmproj.empty()) { - multimodal = true; - LOG_DEBUG << "Multi Modal Mode Enabled"; - clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); - if (clp_ctx == nullptr) { - LOG_ERROR_LLAMA("unable to load clip model", - {{"model", params.mmproj}}); - return false; - } - - if (params.n_ctx < - 2048) { // request larger context for the image embedding - params.n_ctx = 2048; - } - } - - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr) { - LOG_ERROR_LLAMA("llama.cpp unable to load model", - {{"model", params.model}}); - return false; - } - - if (multimodal) { - const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); - const int n_embd_llm = llama_n_embd(model); - if (n_embd_clip != n_embd_llm) { - LOG_DEBUG << __func__ << ": 
embedding dim of the multimodal projector (" - << n_embd_clip - << ") is not " - "equal to that of LLaMA (" - << n_embd_llm - << "). Make sure that you use the " - "correct mmproj file."; - llama_free(ctx); - llama_free_model(model); - return false; - } - } - - if (ctx == nullptr) { - LOG_ERROR_LLAMA("Unable to get llama.cpp context", {}); - return false; - } - n_ctx = llama_n_ctx(ctx); - - add_bos_token = llama_should_add_bos_token(model); - - return true; - } - - void initialize() { - id_gen = 0; - - // create slots - all_slots_are_idle = true; - - const int32_t n_ctx_slot = n_ctx / params.n_parallel; - - LOG_DEBUG << "Available slots: "; - for (int i = 0; i < params.n_parallel; i++) { - llama_client_slot slot; - - slot.id = i; - slot.n_ctx = n_ctx_slot; - slot.reset(); - - LOG_DEBUG << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; - slots.push_back(slot); - } - - try { - batch = llama_batch_init(n_ctx, 0, params.n_parallel); - } catch (const std::exception& e) { - LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata", - {{"exception", e.what()}, - {"n_tokens_alloc", n_ctx}, - {"embd", 0}, - {"n_seq_max", params.n_parallel}}); - } - - // empty system prompt - system_prompt = ""; - system_tokens.clear(); - } - - std::vector tokenize(const json& json_prompt, - bool add_bos) const { - // TODO: currently, we tokenize using special tokens by default - // this is not always correct (see - // https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) - // but it's better compared to completely ignoring ChatML and other - // chat templates - const bool TMP_FORCE_SPECIAL = true; - - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. - std::vector prompt_tokens; - - if (json_prompt.is_array()) { - bool first = true; - for (const auto& p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); - std::vector p; - if (first) { - p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); - first = false; - } else { - p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); - } - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } - prompt_tokens.push_back(p.template get()); - } - } - } else { - auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); - } - - return prompt_tokens; - } - - llama_client_slot* get_slot(int id) { - int64_t t_last = ggml_time_us(); - llama_client_slot* last_used = nullptr; - - for (llama_client_slot& slot : slots) { - if (slot.id == id && slot.available()) { - return &slot; - } - - if (slot.available() && slot.t_last_used < t_last) { - last_used = &slot; - t_last = slot.t_last_used; - } - } - - return last_used; - } - - bool launch_slot_with_data(llama_client_slot*& slot, json data) { - slot_params default_params; - llama_sampling_params default_sparams; - - if (data.count("__oaicompat") != 0) { - slot->oaicompat = true; - slot->oaicompat_model = - json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - } else { - slot->oaicompat = false; - slot->oaicompat_model = ""; - } - - slot->params.stream = json_value(data, "stream", false); - slot->params.cache_prompt = json_value(data, "cache_prompt", false); - slot->params.n_predict = - json_value(data, "n_predict", default_params.n_predict); - slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot->sparams.top_p = json_value(data, "top_p", 
default_sparams.top_p); - slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot->sparams.typical_p = - json_value(data, "typical_p", default_sparams.typical_p); - slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot->sparams.penalty_last_n = - json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot->sparams.penalty_repeat = - json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot->sparams.penalty_freq = - json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot->sparams.penalty_present = - json_value(data, "presence_penalty", default_sparams.penalty_present); - slot->sparams.mirostat = - json_value(data, "mirostat", default_sparams.mirostat); - slot->sparams.mirostat_tau = - json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot->sparams.mirostat_eta = - json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = - json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); - slot->sparams.grammar = - json_value(data, "grammar", default_sparams.grammar); - slot->sparams.n_probs = - json_value(data, "n_probs", default_sparams.n_probs); - - // infill - if (data.count("input_prefix") != 0) { - slot->params.input_prefix = data["input_prefix"]; - } else { - slot->params.input_prefix = ""; - } - - if (data.count("input_suffix") != 0) { - slot->params.input_suffix = data["input_suffix"]; - } else { - slot->params.input_suffix = ""; - } - - if (data.count("prompt") != 0) { - slot->prompt = data["prompt"]; - } else { - slot->prompt = ""; - } - - slot->sparams.penalty_prompt_tokens.clear(); - slot->sparams.use_penalty_prompt_tokens = false; - const auto& penalty_prompt = data.find("penalty_prompt"); - if (penalty_prompt != data.end()) { - if (penalty_prompt->is_string()) { - const auto penalty_prompt_string = penalty_prompt->get(); - auto penalty_tokens = - llama_tokenize(model, penalty_prompt_string, false); - slot->sparams.penalty_prompt_tokens.swap(penalty_tokens); - if (slot->params.n_predict > 0) { - slot->sparams.penalty_prompt_tokens.reserve( - slot->sparams.penalty_prompt_tokens.size() + - slot->params.n_predict); - } - slot->sparams.use_penalty_prompt_tokens = true; - } else if (penalty_prompt->is_array()) { - const auto n_tokens = penalty_prompt->size(); - slot->sparams.penalty_prompt_tokens.reserve( - n_tokens + std::max(0, slot->params.n_predict)); - const int n_vocab = llama_n_vocab(model); - for (const auto& penalty_token : *penalty_prompt) { - if (penalty_token.is_number_integer()) { - const auto tok = penalty_token.get(); - if (tok >= 0 && tok < n_vocab) { - slot->sparams.penalty_prompt_tokens.push_back(tok); - } - } - } - slot->sparams.use_penalty_prompt_tokens = true; - } - } - - slot->sparams.logit_bias.clear(); - - if (json_value(data, "ignore_eos", false)) { - slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - - const auto& logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(model); - for (const auto& el : *logit_bias) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - if (el[1].is_number()) { - 
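// a numeric entry is taken as the bias added to that token's logit during sampling;
// the boolean-false case handled below maps to -INFINITY, i.e. the token is never sampled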
slot->sparams.logit_bias[tok] = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - slot->sparams.logit_bias[tok] = -INFINITY; - } - } - } - } - } - - slot->params.antiprompt.clear(); - - const auto& stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto& word : *stop) { - if (!word.empty()) { - slot->params.antiprompt.push_back(word); - } - } - } - - if (multimodal) { - const auto& images_data = data.find("image_data"); - if (images_data != data.end() && images_data->is_array()) { - for (const auto& img : *images_data) { - const std::vector image_buffer = - base64_decode(img["data"].get()); - - slot_image img_sl; - img_sl.id = - img.count("id") != 0 ? img["id"].get() : slot->images.size(); - img_sl.img_data = clip_image_u8_init(); - if (!clip_image_load_from_bytes( - image_buffer.data(), image_buffer.size(), img_sl.img_data)) { - LOG_DEBUG << "slot " << slot->id - << " - failed to load image [id: " << img_sl.id << "]"; - return false; - } - LOG_DEBUG << "slot " << slot->id << " - loaded image"; - img_sl.request_encode_image = true; - slot->images.push_back(img_sl); - } - // process prompt - // example: system prompt [img-102] user [img-103] describe [img-134] -> - // [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, - // {id: 134, prefix: ' describe '}]} - if (slot->images.size() > 0 && !slot->prompt.is_array()) { - std::string prompt = slot->prompt.get(); - size_t pos = 0, begin_prefix = 0; - std::string pattern = "[img-"; - while ((pos = prompt.find(pattern, pos)) != std::string::npos) { - size_t end_prefix = pos; - pos += pattern.length(); - size_t end_pos = prompt.find("]", pos); - if (end_pos != std::string::npos) { - std::string image_id = prompt.substr(pos, end_pos - pos); - try { - int img_id = std::stoi(image_id); - bool found = false; - for (slot_image& img : slot->images) { - if (img.id == img_id) { - found = true; - img.prefix_prompt = - prompt.substr(begin_prefix, end_prefix - begin_prefix); - begin_prefix = end_pos + 1; - break; - } - } - if (!found) { - LOG_DEBUG << "ERROR: Image with id: " << img_id - << ", not found.\n"; - slot->images.clear(); - return false; - } - } catch (const std::invalid_argument& e) { - LOG_DEBUG << "Invalid image number id in prompt"; - slot->images.clear(); - return false; - } - } - } - slot->prompt = ""; - slot->params.input_suffix = prompt.substr(begin_prefix); - slot->params.cache_prompt = - false; // multimodal doesn't support cache prompt - } - } - } - - if (slot->ctx_sampling != nullptr) { - llama_sampling_free(slot->ctx_sampling); - } - slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); - slot->command = LOAD_PROMPT; - - all_slots_are_idle = false; - - LOG_DEBUG << "slot " << slot->id - << " is processing [task id: " << slot->task_id << "]"; - - return true; - } - - void kv_cache_clear() { - // clear the entire KV cache - llama_kv_cache_clear(ctx); - } - - void update_system_prompt() { - system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token); - - llama_batch_clear(batch); - - kv_cache_clear(); - - for (int i = 0; i < (int)system_tokens.size(); ++i) { - llama_batch_add(batch, system_tokens[i], i, {0}, false); - } - - if (llama_decode(ctx, batch) != 0) { - LOG_WARN << __func__ << ": llama_decode() failed"; - return; - } - - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i < params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); - } - - LOG_DEBUG << 
"system prompt updated"; - system_need_update = false; - } - - void notify_system_prompt_changed() { - // release all slots - for (llama_client_slot& slot : slots) { - slot.release(); - } - - system_need_update = true; - } - - void process_system_prompt_data(const json& sys_props) { - system_prompt = sys_props.value("prompt", ""); - name_user = sys_props.value("anti_prompt", ""); - name_assistant = sys_props.value("assistant_name", ""); - - if (slots.size() > 0) { - notify_system_prompt_changed(); - } - } - - static size_t find_stopping_strings(const std::string& text, - const size_t last_token_size, - const stop_type type, - llama_client_slot& slot) { - size_t stop_pos = std::string::npos; - - for (const std::string& word : slot.params.antiprompt) { - size_t pos; - if (type == STOP_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_FULL) { - slot.stopped_word = true; - slot.stopping_word = word; - slot.has_next_token = false; - } - stop_pos = pos; - } - } - - return stop_pos; - } - - bool process_token(completion_token_output& result, llama_client_slot& slot) { - // remember which tokens were sampled - used for repetition penalties during - // sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok); - slot.sampled = result.tok; - - // search stop word and delete it - slot.generated_text += token_str; - slot.has_next_token = true; - - if (slot.ctx_sampling->params.use_penalty_prompt_tokens && - result.tok != -1) { - // we can change penalty_prompt_tokens because it is always created from - // scratch each request - slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); - } - - // check if there is incomplete UTF-8 character at the end - bool incomplete = false; - for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) { - unsigned char c = slot.generated_text[slot.generated_text.size() - i]; - if ((c & 0xC0) == 0x80) { - // continuation byte: 10xxxxxx - continue; - } - if ((c & 0xE0) == 0xC0) { - // 2-byte character: 110xxxxx ... - incomplete = i < 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character: 1110xxxx ... - incomplete = i < 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character: 11110xxx ... 
- incomplete = i < 4; - } - // else 1-byte character or invalid byte - break; - } - - if (!incomplete) { - size_t pos = std::min(slot.sent_count, slot.generated_text.size()); - const std::string str_test = slot.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = - find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); - if (stop_pos != std::string::npos) { - is_stop_full = true; - slot.generated_text.erase(slot.generated_text.begin() + pos + stop_pos, - slot.generated_text.end()); - pos = std::min(slot.sent_count, slot.generated_text.size()); - } else { - is_stop_full = false; - stop_pos = find_stopping_strings(str_test, token_str.size(), - STOP_PARTIAL, slot); - } - - // check if there is any token to predict - if (stop_pos == std::string::npos || - (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { - // no send the stop word in the response - result.text_to_send = - slot.generated_text.substr(pos, std::string::npos); - slot.sent_count += result.text_to_send.size(); - // add the token to slot queue and cache - } - slot.add_token_string(result); - if (slot.params.stream) { - send_partial_response(slot, result); - } - } - - if (incomplete) { - slot.has_next_token = true; - } - - // check the limits - if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params)) { - slot.stopped_limit = true; - slot.has_next_token = false; - } - - if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) { - slot.stopped_eos = true; - slot.has_next_token = false; - LOG_VERBOSE("eos token found", {}); - } - - LOG_VERBOSE( - "next token", - { - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"num_tokens_predicted", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); - - return slot.has_next_token; // continue - } - bool process_images(llama_client_slot& slot) const { - for (slot_image& img : slot.images) { - if (!img.request_encode_image) { - continue; - } - - if (!llava_image_embed_make_with_clip_img( - clp_ctx, params.n_threads, img.img_data, &img.image_embedding, - &img.image_tokens)) { - LOG_DEBUG << "Error processing the given image"; - return false; - } - - img.request_encode_image = false; - } - - return slot.images.size() > 0; - } - void send_error(task_server& task, std::string error) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = task.id; - res.multitask_id = task.multitask_id; - res.stop = false; - res.error = true; - res.result_json = {{"content", error}}; - queue_results.push_back(res); - condition_results.notify_all(); - } - - void add_multi_task(int id, std::vector& sub_ids) { - std::lock_guard lock(mutex_tasks); - task_multi multi; - multi.id = id; - std::copy(sub_ids.begin(), sub_ids.end(), - std::inserter(multi.subtasks_remaining, - multi.subtasks_remaining.end())); - queue_multitasks.push_back(multi); - condition_tasks.notify_one(); - } - - void update_multi_task(int multitask_id, int subtask_id, - task_result& result) { - std::lock_guard lock(mutex_tasks); - for (auto& multitask : queue_multitasks) { - if (multitask.id == multitask_id) { - multitask.subtasks_remaining.erase(subtask_id); - multitask.results.push_back(result); - condition_tasks.notify_one(); - } - } - } - - json get_model_props() { return get_formated_generation(slots[0]); } - - 
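The incomplete-UTF-8 guard used in process_token above can be read as a standalone predicate. A minimal sketch follows, assuming the illustrative name ends_with_incomplete_utf8 (not taken from the original file):

#include <string>

// Walk back over at most 4 trailing bytes and report whether the final UTF-8
// sequence is still missing continuation bytes (illustrative helper).
static bool ends_with_incomplete_utf8(const std::string& s) {
  for (unsigned i = 1; i < 5 && i <= s.size(); ++i) {
    const unsigned char c = s[s.size() - i];
    if ((c & 0xC0) == 0x80) continue;      // 10xxxxxx: continuation byte, keep walking back
    if ((c & 0xE0) == 0xC0) return i < 2;  // 110xxxxx: 2-byte sequence, needs 1 continuation
    if ((c & 0xF0) == 0xE0) return i < 3;  // 1110xxxx: 3-byte sequence, needs 2 continuations
    if ((c & 0xF8) == 0xF0) return i < 4;  // 11110xxx: 4-byte sequence, needs 3 continuations
    return false;                          // ASCII lead byte or invalid byte
  }
  return false;
}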
json get_formated_generation(llama_client_slot& slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && - eos_bias->second < 0.0f && - std::isinf(eos_bias->second); - return json{ - {"n_ctx", slot.n_ctx}, - {"model", params.model_alias}, - {"seed", slot.params.seed}, - {"temperature", slot.sparams.temp}, - {"top_k", slot.sparams.top_k}, - {"top_p", slot.sparams.top_p}, - {"min_p", slot.sparams.min_p}, - {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, - {"repeat_last_n", slot.sparams.penalty_last_n}, - {"repeat_penalty", slot.sparams.penalty_repeat}, - {"presence_penalty", slot.sparams.penalty_present}, - {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, - {"mirostat", slot.sparams.mirostat}, - {"mirostat_tau", slot.sparams.mirostat_tau}, - {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, - {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, - {"n_probs", slot.sparams.n_probs}, - {"grammar", slot.sparams.grammar}, - }; - } - - void send_partial_response(llama_client_slot& slot, - completion_token_output tkn) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = false; - - res.result_json = json{{"content", tkn.text_to_send}, - {"stop", false}, - {"slot_id", slot.id}, - {"multimodal", multimodal}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs_output = {}; - const std::vector to_send_toks = - llama_tokenize(ctx, tkn.text_to_send, false); - size_t probs_pos = std::min(slot.sent_token_probs_index, - slot.generated_token_probs.size()); - size_t probs_stop_pos = - std::min(slot.sent_token_probs_index + to_send_toks.size(), - slot.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector( - slot.generated_token_probs.begin() + probs_pos, - slot.generated_token_probs.begin() + probs_stop_pos); - } - slot.sent_token_probs_index = probs_stop_pos; - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs_output); - } - - if (slot.oaicompat) { - res.result_json["oaicompat_token_ctr"] = slot.n_decoded; - res.result_json["model"] = slot.oaicompat_model; - } - - queue_results.push_back(res); - condition_results.notify_all(); - } - - void send_final_response(llama_client_slot& slot) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = true; - - res.result_json = - json{{"content", !slot.params.stream ? 
slot.generated_text : ""}, - {"slot_id", slot.id}, - {"stop", true}, - {"model", params.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.num_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", slot.prompt}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs = {}; - if (!slot.params.stream && slot.stopped_word) { - const std::vector stop_word_toks = - llama_tokenize(ctx, slot.stopping_word, false); - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - stop_word_toks.size()); - } else { - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.begin() + slot.sent_token_probs_index); - } - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs); - } - - if (slot.oaicompat) { - res.result_json["oaicompat_token_ctr"] = slot.n_decoded; - res.result_json["model"] = slot.oaicompat_model; - } - - // parent multitask, if any, needs to be updated - if (slot.multitask_id != -1) { - update_multi_task(slot.multitask_id, slot.task_id, res); - } - - queue_results.push_back(res); - condition_results.notify_all(); - } - - void send_embedding(llama_client_slot& slot) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = true; - - const int n_embd = llama_n_embd(model); - - std::vector embd_res(n_embd, 0.0f); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } - - const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } - - if (embd == NULL) { - LOG_ERROR << "failed to get embeddings" - << " token " << batch.token[i] << ", seq_id " - << batch.seq_id[i][0]; - - res.result_json = json{ - {"embedding", std::vector(n_embd, 0.0f)}, - }; - - continue; - } - - llama_embd_normalize(embd, embd_res.data(), n_embd); - } - res.result_json = json{ - {"embedding", embd_res}, - }; - - queue_results.push_back(res); - condition_results.notify_all(); - } - - int request_completion(json data, bool infill, bool embedding, - int multitask_id) { - std::unique_lock lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.target_id = 0; - task.data = std::move(data); - task.infill_mode = infill; - task.embedding_mode = embedding; - task.type = COMPLETION_TASK; - task.multitask_id = multitask_id; - - // when a completion task's prompt array is not a singleton, we split it - // into multiple requests - if (task.data.at("prompt").size() > 1) { - lock.unlock(); // entering new func scope - return split_multiprompt_task(task); - } - - // otherwise, it's a single-prompt task, we actually queue it - queue_tasks.push_back(task); - condition_tasks.notify_one(); - return task.id; - } - - task_result next_result(int task_id) { - while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&] { return !queue_results.empty(); }); - - for (int i = 0; i < (int)queue_results.size(); i++) { - // for now, tasks that have associated parent multitasks just get erased - // once multitask picks up the result - if 
(queue_results[i].multitask_id == task_id) { - update_multi_task(task_id, queue_results[i].id, queue_results[i]); - queue_results.erase(queue_results.begin() + i); - continue; - } - - if (queue_results[i].id == task_id) { - if (queue_results[i].multitask_id != -1) { - LOG_ERROR_LLAMA("Incorrect multitask ID", {{"task_id", task_id}}); - } - task_result res = queue_results[i]; - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // never reached - // return task_result{-1, false, false, {}}; - } - - // for multiple images processing - bool ingest_images(llama_client_slot& slot, int n_batch) { - int image_idx = 0; - - while (image_idx < (int)slot.images.size()) { - slot_image& img = slot.images[image_idx]; - - // process prefix prompt - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = - std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; - if (llama_decode(ctx, batch_view)) { - LOG_DEBUG << __func__ << " : failed to eval\n"; - return false; - } - } - - // process image with llm - for (int i = 0; i < img.image_tokens; i += n_batch) { - int n_eval = img.image_tokens - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - - const int n_embd = llama_n_embd(model); - llama_batch batch_img = { - n_eval, nullptr, (img.image_embedding + i * n_embd), - nullptr, nullptr, nullptr, - nullptr, slot.n_past, 1, - 0, - }; - if (llama_decode(ctx, batch_img)) { - LOG_DEBUG << __func__ << " : failed to eval image"; - return false; - } - slot.n_past += n_eval; - } - image_idx++; - - llama_batch_clear(batch); - - // append prefix of next image - const auto json_prompt = - (image_idx >= (int)slot.images.size()) - ? slot.params.input_suffix - : // no more images, then process suffix prompt - (json)(slot.images[image_idx].prefix_prompt); - - std::vector append_tokens = - tokenize(json_prompt, false); // has next image - for (int i = 0; i < (int)append_tokens.size(); ++i) { - llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true); - slot.n_past += 1; - } - } - - return true; - } - - void request_cancel(int task_id) { - std::unique_lock lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.type = CANCEL_TASK; - task.target_id = task_id; - queue_tasks.push_back(task); - condition_tasks.notify_one(); - } - - int split_multiprompt_task(task_server& multiprompt_task) { - int prompt_count = multiprompt_task.data.at("prompt").size(); - assert(prompt_count > 1); - - int multitask_id = id_gen++; - std::vector subtask_ids(prompt_count); - for (int i = 0; i < prompt_count; i++) { - json subtask_data = multiprompt_task.data; - subtask_data["prompt"] = subtask_data["prompt"][i]; - - // subtasks inherit everything else (infill mode, embedding mode, etc.) 
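// each prompt in the array becomes its own completion task, tagged with the
// shared multitask_id so add_multi_task() can track it and the results can be
// aggregated once every subtask finishes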
- subtask_ids[i] = - request_completion(subtask_data, multiprompt_task.infill_mode, - multiprompt_task.embedding_mode, multitask_id); - } - - // queue up the multitask so we can track its subtask progression - add_multi_task(multitask_id, subtask_ids); - return multitask_id; - } - - void process_tasks() { - std::unique_lock lock(mutex_tasks); - while (!queue_tasks.empty()) { - task_server task = queue_tasks.front(); - queue_tasks.erase(queue_tasks.begin()); - switch (task.type) { - case COMPLETION_TASK: { - llama_client_slot* slot = - get_slot(json_value(task.data, "slot_id", -1)); - if (slot == nullptr) { - LOG_DEBUG << "slot unavailable"; - // send error result - send_error(task, "slot unavailable"); - return; - } - - if (task.data.contains("system_prompt")) { - process_system_prompt_data(task.data["system_prompt"]); - } - - slot->reset(); - - slot->infill = task.infill_mode; - slot->embedding = task.embedding_mode; - slot->task_id = task.id; - slot->multitask_id = task.multitask_id; - - if (!launch_slot_with_data(slot, task.data)) { - // send error result - send_error(task, "internal_error"); - break; - } - } break; - case CANCEL_TASK: { // release slot linked with the task id - for (auto& slot : slots) { - if (slot.task_id == task.target_id) { - slot.release(); - break; - } - } - } break; - } - } - - // remove finished multitasks from the queue of multitasks, and add the - // corresponding result to the result queue - auto queue_iterator = queue_multitasks.begin(); - while (queue_iterator != queue_multitasks.end()) { - if (queue_iterator->subtasks_remaining.empty()) { - // all subtasks done == multitask is done - task_result aggregate_result; - aggregate_result.id = queue_iterator->id; - aggregate_result.stop = true; - aggregate_result.error = false; - - // collect json results into one json result - std::vector result_jsons; - for (auto& subres : queue_iterator->results) { - result_jsons.push_back(subres.result_json); - aggregate_result.error = aggregate_result.error && subres.error; - } - aggregate_result.result_json = json{"results", result_jsons}; - - std::lock_guard lock(mutex_results); - queue_results.push_back(aggregate_result); - condition_results.notify_all(); - - queue_iterator = queue_multitasks.erase(queue_iterator); - } else { - ++queue_iterator; - } - } - } - - bool update_slots() { - // attend tasks - process_tasks(); - - // update the system prompt wait until all slots are idle state - if (system_need_update && all_slots_are_idle) { - LOG_DEBUG << "updating system prompt"; - update_system_prompt(); - } - - llama_batch_clear(batch); - - if (all_slots_are_idle) { - if (system_prompt.empty() && clean_kv_cache) { - LOG_DEBUG - << "all slots are idle and system prompt is empty, clear the KV " - "cache"; - kv_cache_clear(); - } - // std::this_thread::sleep_for(std::chrono::milliseconds(5)); - // TODO: Need to implement queueing using CV for better performance - std::unique_lock lock(mutex_tasks); - condition_tasks.wait(lock, [&] { - return (!queue_tasks.empty() && model_loaded_external) || - (queue_tasks.empty() && !model_loaded_external); - }); - } - - for (llama_client_slot& slot : slots) { - if (slot.is_processing() && - slot.cache_tokens.size() >= (size_t)slot.n_ctx) { - // Shift context - const int n_left = slot.n_past - slot.params.n_keep - 1; - const int n_discard = n_left / 2; - - LOG_DEBUG << "slot " << slot.id - << " context shift - n_keep = " << slot.params.n_keep - << ", n_left = " << n_left << ", n_discard: " << n_discard; - llama_kv_cache_seq_rm(ctx, slot.id, 
slot.params.n_keep + 1, - slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard, - slot.n_past, -n_discard); - - for (size_t i = slot.params.n_keep + 1 + n_discard; - i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; - } - - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - - slot.n_past -= n_discard; - - slot.truncated = true; - - LOG_VERBOSE("context shift", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - }); - } - } - - // decode any currently ongoing sequences - for (auto& slot : slots) { - // release the slot - if (slot.command == RELEASE) { - slot.state = IDLE; - slot.command = NONE; - slot.t_last_used = ggml_time_us(); - - LOG_DEBUG << "slot " << slot.id << " released (" - << (int)slot.cache_tokens.size() << " tokens in cache)"; - - continue; - } - - if (slot.state == IDLE) { - continue; - } - - slot.i_batch = batch.n_tokens; - - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, - {slot.id}, true); - - slot.n_decoded += 1; - slot.n_past += 1; - } - - // process in chunks of params.n_batch - int32_t n_batch = params.n_batch; - - // assign workload to the slots - if (params.cont_batching || batch.n_tokens == 0) { - for (auto& slot : slots) { - const bool has_prompt = slot.prompt.is_array() || - (slot.prompt.is_string() && - !slot.prompt.get().empty()) || - !slot.images.empty(); - - // empty prompt passed -> release the slot and send empty response - if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - continue; - } - - // need process the prompt - if (slot.state == IDLE && slot.command == LOAD_PROMPT) { - slot.state = PROCESSING; - slot.command = NONE; - std::vector prompt_tokens; - slot.t_start_process_prompt = ggml_time_us(); - slot.t_start_genereration = 0; - - if (slot.infill) { - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && - params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } - auto prefix_tokens = tokenize(slot.params.input_prefix, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false); - - const int space_token = - 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && - suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } - - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), - llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), - suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(model)); - prompt_tokens = prefix_tokens; - } else { - prompt_tokens = tokenize( - slot.prompt, - system_prompt.empty() && - add_bos_token); // add BOS if there isn't system prompt - } - - slot.num_prompt_tokens = prompt_tokens.size(); - - if (slot.params.n_keep < 0) { - slot.params.n_keep = slot.num_prompt_tokens; - } - slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - - // if input prompt is too big, truncate it - if (slot.num_prompt_tokens >= slot.n_ctx) { - const int n_left = slot.n_ctx - slot.params.n_keep; - const int n_block_size = n_left / 2; - const int erased_blocks = - (slot.num_prompt_tokens - slot.params.n_keep - 
n_block_size) / - n_block_size; - - std::vector new_tokens( - prompt_tokens.begin(), - prompt_tokens.begin() + slot.params.n_keep); - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + slot.params.n_keep + - erased_blocks * n_block_size, - prompt_tokens.end()); - - LOG_VERBOSE( - "input truncated", - { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - slot.truncated = true; - prompt_tokens = new_tokens; - - slot.num_prompt_tokens = prompt_tokens.size(); - GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); - } - - if (!slot.params.cache_prompt) { - llama_sampling_reset(slot.ctx_sampling); - - slot.n_past = 0; - slot.num_prompt_tokens_processed = slot.num_prompt_tokens; - } else { - // push the prompt into the sampling context (do not apply grammar) - for (auto& token : prompt_tokens) { - llama_sampling_accept(slot.ctx_sampling, ctx, token, false); - } - - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - slot.num_prompt_tokens_processed = - slot.num_prompt_tokens - slot.n_past; - - LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past - << " tokens | to process: " - << slot.num_prompt_tokens_processed << " tokens"; - } - - LOG_DEBUG << "slot " << slot.id << " : kv cache rm - [" - << (int)system_tokens.size() + slot.n_past << ", end)"; - - llama_kv_cache_seq_rm(ctx, slot.id, - system_tokens.size() + slot.n_past, -1); - - slot.cache_tokens = prompt_tokens; - - if (slot.n_past == slot.num_prompt_tokens) { - // we have to evaluate at least 1 token to generate logits. - LOG_DEBUG << "slot " << slot.id - << " : we have to evaluate at least 1 token to " - "generate logits"; - slot.n_past--; - } - - LOG_VERBOSE( - "prompt ingested", - { - {"n_past", slot.n_past}, - {"cached", - tokens_to_str(ctx, slot.cache_tokens.cbegin(), - slot.cache_tokens.cbegin() + slot.n_past)}, - {"to_eval", - tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, - slot.cache_tokens.cend())}, - }); - - const bool has_images = process_images(slot); - - // process the prefix of first image - std::vector prefix_tokens = - has_images ? 
tokenize(slot.images[0].prefix_prompt, add_bos_token) - : prompt_tokens; - for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) { - llama_batch_add(batch, prefix_tokens[slot.n_past], - system_tokens.size() + slot.n_past, {slot.id}, - false); - } - - if (has_images && !ingest_images(slot, n_batch)) { - LOG_DEBUG << "failed processing images"; - return false; - } - - // extract the logits only for the last token - if (batch.n_tokens > 0) { - batch.logits[batch.n_tokens - 1] = true; - } - - slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; - } - } - } - - if (batch.n_tokens == 0) { - all_slots_are_idle = true; - return true; - } - - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; - - const int ret = llama_decode(ctx, batch_view); - if (ret != 0) { - if (n_batch == 1 || ret < 0) { - // if you get here, it means the KV cache is full - try increasing it - // via the context size - LOG_DEBUG << __func__ - << " : failed to decode the batch, n_batch = " << n_batch - << ", ret = " << ret; - return false; - } - - LOG_DEBUG - << __func__ - << " : failed to find free space in the KV cache, retrying with " - "smaller n_batch = " - << n_batch / 2; - - // retry with half the batch size to try to find a free slot in the KV - // cache - n_batch /= 2; - i -= n_batch; - continue; - } - - for (auto& slot : slots) { - if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) { - continue; - } - - // prompt evaluated for embedding - if (slot.embedding) { - send_embedding(slot); - slot.release(); - slot.i_batch = -1; - return true; - } - - completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, - NULL, slot.i_batch - i); - - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); - - if (slot.n_decoded == 1) { - slot.t_start_genereration = ggml_time_us(); - slot.t_prompt_processing = - (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; - } - - llama_token_data_array cur_p = {slot.ctx_sampling->cur.data(), - slot.ctx_sampling->cur.size(), false}; - result.tok = id; - - const int32_t n_probs = slot.sparams.n_probs; - if (slot.sparams.temp <= 0 && n_probs > 0) { - // for llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &cur_p); - } - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); - } - - if (!process_token(result, slot)) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - } - - slot.i_batch = -1; - } - } - return true; - } -}; - -static void server_print_usage(const char* argv0, const gpt_params& params, - const server_params& sparams) { - printf("usage: %s [options]\n", argv0); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", - server_verbose ? 
"enabled" : "disabled"); - printf( - " -t N, --threads N number of threads to use during " - "computation (default: %d)\n", - params.n_threads); - printf( - " -tb N, --threads-batch N number of threads to use during batch " - "and prompt processing (default: same as --threads)\n"); - printf( - " -c N, --ctx-size N size of the prompt context (default: " - "%d)\n", - params.n_ctx); - printf(" --rope-scaling {none,linear,yarn}\n"); - printf( - " RoPE frequency scaling method, defaults " - "to linear unless specified by the model\n"); - printf( - " --rope-freq-base N RoPE base frequency (default: loaded " - "from model)\n"); - printf( - " --rope-freq-scale N RoPE frequency scaling factor, expands " - "context by a factor of 1/N\n"); - printf( - " --yarn-ext-factor N YaRN: extrapolation mix factor (default: " - "1.0, 0.0 = full interpolation)\n"); - printf( - " --yarn-attn-factor N YaRN: scale sqrt(t) or attention " - "magnitude (default: 1.0)\n"); - printf( - " --yarn-beta-slow N YaRN: high correction dim or alpha " - "(default: %.1f)\n", - params.yarn_beta_slow); - printf( - " --yarn-beta-fast N YaRN: low correction dim or beta " - "(default: %.1f)\n", - params.yarn_beta_fast); - printf( - " -b N, --batch-size N batch size for prompt processing " - "(default: %d)\n", - params.n_batch); - printf( - " --memory-f32 use f32 instead of f16 for memory " - "key+value (default: disabled)\n"); - printf( - " not recommended: doubles context memory " - "required and no measurable increase in quality\n"); - if (llama_supports_mlock()) { - printf( - " --mlock force system to keep model in RAM " - "rather than swapping or compressing\n"); - } - if (llama_supports_mmap()) { - printf( - " --no-mmap do not memory-map model (slower load " - "but may reduce pageouts if not using mlock)\n"); - } - printf( - " --numa attempt optimizations that help on some " - "NUMA systems\n"); - if (llama_supports_gpu_offload()) { - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf( - " how to split the model across multiple " - "GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf( - " - layer (default): split layers and " - "KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf( - " fraction of the model to offload to " - "each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf( - " -mg i, --main-gpu i the GPU to use for the model (with " - "split-mode = none),\n"); - printf( - " or for intermediate results and KV " - "(with split-mode = row)\n"); - } - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: %s)\n", - params.model.c_str()); - printf(" -a ALIAS, --alias ALIAS\n"); - printf( - " set an alias for the model, will be " - "added as `model` field in completion response\n"); - printf( - " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf( - " --lora-base FNAME optional model to use as a base for the " - "layers modified by the LoRA adapter\n"); - printf( - " --host ip address to listen (default (default: " - "%s)\n", - sparams.hostname.c_str()); - printf(" --port PORT port to listen (default (default: %d)\n", - sparams.port); - printf( - " --path PUBLIC_PATH path from which to serve static files " - "(default %s)\n", - sparams.public_path.c_str()); - printf( - " --api-key API_KEY optional api key to enhance server " - "security. 
If set, requests must include this key for access.\n"); - printf( - " --api-key-file FNAME path to file containing api keys " - "delimited by new lines. If set, requests must include one of the " - "keys for access.\n"); - printf( - " -to N, --timeout N server read/write timeout in seconds " - "(default: %d)\n", - sparams.read_timeout); - printf( - " --embedding enable embedding vector output (default: " - "%s)\n", - params.embedding ? "enabled" : "disabled"); - printf( - " -np N, --parallel N number of slots for process requests " - "(default: %d)\n", - params.n_parallel); - printf( - " -cb, --cont-batching enable continuous batching (a.k.a " - "dynamic batching) (default: disabled)\n"); - printf(" -spf FNAME, --system-prompt-file FNAME\n"); - printf( - " set a file to load a system prompt " - "(initial " - "prompt of all slots), this is useful for chat applications.\n"); - printf( - " --mmproj MMPROJ_FILE path to a multimodal projector file for " - "LLaVA.\n"); - printf(" --log-disable disables logging to a file.\n"); - printf("\n"); - printf(" --override-kv KEY=TYPE:VALUE\n"); - printf( - " advanced option to override model " - "metadata by key. may be specified multiple times.\n"); - printf( - " types: int, float, bool. example: " - "--override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf( - " -gan N, --grp-attn-n N set the group attention factor to extend " - "context size through self-extend(default: 1=disabled), used together " - "with group attention width `--grp-attn-w`"); - printf( - " -gaw N, --grp-attn-w N set the group attention width to extend " - "context size through self-extend(default: 512), used together with " - "group attention factor `--grp-attn-n`"); - printf("\n"); -} -static std::string random_string() { - static const std::string str( - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); -} -static json format_final_response_oaicompat(const json& request, - const task_result& response, - bool streaming = false) { - json result = response.result_json; - - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = - json{{"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", - json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, - {"id", gen_chatcmplid()}}; - - if (server_verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = - json_value(result, "completion_probabilities", json::array()); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate -// two responses -static std::vector format_partial_response_oaicompat( - const task_result& response) { - json result = response.result_json; - - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({response.result_json}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = - json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = - json{{"choices", - json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = - json{{"choices", - json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"content", content}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json{{"choices", choices}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({ret}); -} - -static json format_partial_response( - llama_server_context& llama, llama_client_slot* slot, - const std::string& content, - const std::vector& probs) { - json res = json{{"content", content}, - {"stop", false}, - {"slot_id", slot->id}, - {"multimodal", llama.multimodal}}; - - if (slot->sparams.n_probs > 0) { - res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); - } - - return res; -} - -static json format_tokenizer_response(const std::vector& tokens) { - return json{{"tokens", tokens}}; -} - -static json format_detokenized_response(std::string content) { - return json{{"content", content}}; -} - -struct token_translator { - llama_context* ctx; - std::string operator()(llama_token tok) const { - return llama_token_to_piece(ctx, tok); - } - std::string operator()(const completion_token_output& cto) const { - return (*this)(cto.tok); - } -}; - -static void append_to_generated_text_from_generated_token_probs( - llama_server_context& llama, llama_client_slot* slot) { - auto& gtps = slot->generated_token_probs; - auto translator = token_translator{llama.ctx}; - auto add_strlen = [=](size_t sum, const completion_token_output& cto) { - return sum + translator(cto).size(); - }; - const size_t len = - std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); - if (slot->generated_text.capacity() < slot->generated_text.size() + len) { - slot->generated_text.reserve(slot->generated_text.size() + len); - } - for (const completion_token_output& cto : gtps) { - slot->generated_text += translator(cto); - } -} diff --git a/cortex-cpp/context/whisper_server_context.cc b/cortex-cpp/context/whisper_server_context.cc deleted file mode 100644 index a4ccbe710..000000000 --- a/cortex-cpp/context/whisper_server_context.cc +++ /dev/null @@ -1,796 +0,0 @@ -#include "whisper_server_context.h" -#include "utils/dr_wav.h" -#include -#include -#include -#include "utils/json.hpp" - -using json = nlohmann::json; - -bool read_wav(const std::string& fname, std::vector& pcmf32, - std::vector>& pcmf32s, bool stereo) { - drwav wav; - std::vector wav_data; // used for pipe input from stdin - - if (fname == "-") { - { - uint8_t buf[1024]; - while (true) { - const size_t n = fread(buf, 1, sizeof(buf), stdin); - if (n == 0) { - break; - } - wav_data.insert(wav_data.end(), buf, buf + n); - } - } - - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == - false) { - fprintf(stderr, "error: failed to open WAV file from stdin\n"); - return false; - } - - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, - wav_data.size()); - } else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { - fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str()); - return false; - } - - if (wav.channels != 1 && wav.channels != 2) { - fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, - fname.c_str()); - return false; - } - - if (stereo && wav.channels != 2) { - fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", - __func__, fname.c_str()); - return false; - } - - if (wav.sampleRate != COMMON_SAMPLE_RATE) { - fprintf(stderr, "%s: WAV 
file '%s' must be %i kHz\n", __func__, - fname.c_str(), COMMON_SAMPLE_RATE / 1000); - return false; - } - - if (wav.bitsPerSample != 16) { - fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, - fname.c_str()); - return false; - } - - const uint64_t n = - wav_data.empty() - ? wav.totalPCMFrameCount - : wav_data.size() / (wav.channels * wav.bitsPerSample / 8); - - std::vector pcm16; - pcm16.resize(n * wav.channels); - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); - drwav_uninit(&wav); - - // convert to mono, float - pcmf32.resize(n); - if (wav.channels == 1) { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[i]) / 32768.0f; - } - } else { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; - } - } - - if (stereo) { - // convert to stereo, float - pcmf32s.resize(2); - - pcmf32s[0].resize(n); - pcmf32s[1].resize(n); - for (uint64_t i = 0; i < n; i++) { - pcmf32s[0][i] = float(pcm16[2 * i]) / 32768.0f; - pcmf32s[1][i] = float(pcm16[2 * i + 1]) / 32768.0f; - } - } - - return true; -} - -std::string output_str(struct whisper_context* ctx, - const whisper_params& params, - std::vector> pcmf32s) { - std::stringstream result; - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - result << speaker << text << "\n"; - } - return result.str(); -} - -std::string estimate_diarization_speaker( - std::vector> pcmf32s, int64_t t0, int64_t t1, - bool id_only) { - std::string speaker = ""; - const int64_t n_samples = pcmf32s[0].size(); - - const int64_t is0 = timestamp_to_sample(t0, n_samples); - const int64_t is1 = timestamp_to_sample(t1, n_samples); - - double energy0 = 0.0f; - double energy1 = 0.0f; - - for (int64_t j = is0; j < is1; j++) { - energy0 += fabs(pcmf32s[0][j]); - energy1 += fabs(pcmf32s[1][j]); - } - - if (energy0 > 1.1 * energy1) { - speaker = "0"; - } else if (energy1 > 1.1 * energy0) { - speaker = "1"; - } else { - speaker = "?"; - } - - // printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = - // %s\n", is0, is1, energy0, energy1, speaker.c_str()); - - if (!id_only) { - speaker.insert(0, "(speaker "); - speaker.append(")"); - } - - return speaker; -} - -// 500 -> 00:05.000 -// 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma) { - int64_t msec = t * 10; - int64_t hr = msec / (1000 * 60 * 60); - msec = msec - hr * (1000 * 60 * 60); - int64_t min = msec / (1000 * 60); - msec = msec - min * (1000 * 60); - int64_t sec = msec / 1000; - msec = msec - sec * 1000; - - char buf[32]; - snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int)hr, (int)min, - (int)sec, comma ? 
"," : ".", (int)msec); - - return std::string(buf); -} - -int timestamp_to_sample(int64_t t, int n_samples) { - return (std::max)(0, (std::min)((int)n_samples - 1, - (int)((t * WHISPER_SAMPLE_RATE) / 100))); -} - -bool is_file_exist(const char* fileName) { - std::ifstream infile(fileName); - return infile.good(); -} - -void whisper_print_usage(int /*argc*/, char** argv, - const whisper_params& params) { - fprintf(stderr, "\n"); - fprintf(stderr, "usage: %s [options] \n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, - " -h, --help [default] show this help " - "message and exit\n"); - fprintf(stderr, - " -t N, --threads N [%-7d] number of threads to use " - "during computation\n", - params.n_threads); - fprintf(stderr, - " -p N, --processors N [%-7d] number of processors to use " - "during computation\n", - params.n_processors); - fprintf( - stderr, - " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", - params.offset_t_ms); - fprintf(stderr, - " -on N, --offset-n N [%-7d] segment index offset\n", - params.offset_n); - fprintf(stderr, - " -d N, --duration N [%-7d] duration of audio to " - "process in milliseconds\n", - params.duration_ms); - fprintf(stderr, - " -mc N, --max-context N [%-7d] maximum number of text " - "context tokens to store\n", - params.max_context); - fprintf(stderr, - " -ml N, --max-len N [%-7d] maximum segment length in " - "characters\n", - params.max_len); - fprintf(stderr, - " -sow, --split-on-word [%-7s] split on word rather than " - "on token\n", - params.split_on_word ? "true" : "false"); - fprintf(stderr, - " -bo N, --best-of N [%-7d] number of best candidates " - "to keep\n", - params.best_of); - fprintf(stderr, - " -bs N, --beam-size N [%-7d] beam size for beam search\n", - params.beam_size); - fprintf(stderr, - " -wt N, --word-thold N [%-7.2f] word timestamp " - "probability threshold\n", - params.word_thold); - fprintf(stderr, - " -et N, --entropy-thold N [%-7.2f] entropy threshold for " - "decoder fail\n", - params.entropy_thold); - fprintf(stderr, - " -lpt N, --logprob-thold N [%-7.2f] log probability threshold " - "for decoder fail\n", - params.logprob_thold); - // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by - // x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); - fprintf(stderr, - " -debug, --debug-mode [%-7s] enable debug mode (eg. dump " - "log_mel)\n", - params.debug_mode ? "true" : "false"); - fprintf(stderr, - " -tr, --translate [%-7s] translate from source " - "language to english\n", - params.translate ? "true" : "false"); - fprintf(stderr, - " -di, --diarize [%-7s] stereo audio diarization\n", - params.diarize ? "true" : "false"); - fprintf(stderr, - " -tdrz, --tinydiarize [%-7s] enable tinydiarize " - "(requires a tdrz model)\n", - params.tinydiarize ? "true" : "false"); - fprintf(stderr, - " -nf, --no-fallback [%-7s] do not use temperature " - "fallback while decoding\n", - params.no_fallback ? "true" : "false"); - fprintf(stderr, - " -ps, --print-special [%-7s] print special tokens\n", - params.print_special ? "true" : "false"); - fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", - params.print_colors ? "true" : "false"); - fprintf(stderr, - " -pr, --print-realtime [%-7s] print output in realtime\n", - params.print_realtime ? "true" : "false"); - fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", - params.print_progress ? "true" : "false"); - fprintf(stderr, - " -nt, --no-timestamps [%-7s] do not print timestamps\n", - params.no_timestamps ? 
"true" : "false"); - fprintf(stderr, - " -l LANG, --language LANG [%-7s] spoken language ('auto' for " - "auto-detect)\n", - params.language.c_str()); - fprintf(stderr, - " -dl, --detect-language [%-7s] exit after automatically " - "detecting language\n", - params.detect_language ? "true" : "false"); - fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", - params.prompt.c_str()); - fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", - params.model.c_str()); - fprintf(stderr, - " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used " - "for encode inference\n", - params.openvino_encode_device.c_str()); - fprintf(stderr, - " --convert, [%-7s] Convert audio to WAV, " - "requires ffmpeg on the server", - params.ffmpeg_converter ? "true" : "false"); - fprintf(stderr, "\n"); -} - -bool whisper_params_parse(int argc, char** argv, whisper_params& params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-h" || arg == "--help") { - whisper_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-p" || arg == "--processors") { - params.n_processors = std::stoi(argv[++i]); - } else if (arg == "-ot" || arg == "--offset-t") { - params.offset_t_ms = std::stoi(argv[++i]); - } else if (arg == "-on" || arg == "--offset-n") { - params.offset_n = std::stoi(argv[++i]); - } else if (arg == "-d" || arg == "--duration") { - params.duration_ms = std::stoi(argv[++i]); - } else if (arg == "-mc" || arg == "--max-context") { - params.max_context = std::stoi(argv[++i]); - } else if (arg == "-ml" || arg == "--max-len") { - params.max_len = std::stoi(argv[++i]); - } else if (arg == "-bo" || arg == "--best-of") { - params.best_of = std::stoi(argv[++i]); - } else if (arg == "-bs" || arg == "--beam-size") { - params.beam_size = std::stoi(argv[++i]); - } else if (arg == "-wt" || arg == "--word-thold") { - params.word_thold = std::stof(argv[++i]); - } else if (arg == "-et" || arg == "--entropy-thold") { - params.entropy_thold = std::stof(argv[++i]); - } else if (arg == "-lpt" || arg == "--logprob-thold") { - params.logprob_thold = std::stof(argv[++i]); - } - // else if (arg == "-su" || arg == "--speed-up") { params.speed_up - // = true; } - else if (arg == "-debug" || arg == "--debug-mode") { - params.debug_mode = true; - } else if (arg == "-tr" || arg == "--translate") { - params.translate = true; - } else if (arg == "-di" || arg == "--diarize") { - params.diarize = true; - } else if (arg == "-tdrz" || arg == "--tinydiarize") { - params.tinydiarize = true; - } else if (arg == "-sow" || arg == "--split-on-word") { - params.split_on_word = true; - } else if (arg == "-nf" || arg == "--no-fallback") { - params.no_fallback = true; - } else if (arg == "-fp" || arg == "--font-path") { - params.font_path = argv[++i]; - } else if (arg == "-ps" || arg == "--print-special") { - params.print_special = true; - } else if (arg == "-pc" || arg == "--print-colors") { - params.print_colors = true; - } else if (arg == "-pr" || arg == "--print-realtime") { - params.print_realtime = true; - } else if (arg == "-pp" || arg == "--print-progress") { - params.print_progress = true; - } else if (arg == "-nt" || arg == "--no-timestamps") { - params.no_timestamps = true; - } else if (arg == "-l" || arg == "--language") { - params.language = argv[++i]; - } else if (arg == "-dl" || arg == "--detect-language") { - params.detect_language = true; - } else if (arg == "--prompt") { - params.prompt = 
argv[++i]; - } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; - } else if (arg == "-oved" || arg == "--ov-e-device") { - params.openvino_encode_device = argv[++i]; - } else if (arg == "-ng" || arg == "--no-gpu") { - params.use_gpu = false; - } else if (arg == "--convert") { - params.ffmpeg_converter = true; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - whisper_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -void check_ffmpeg_availibility() { - int result = system("ffmpeg -version"); - - if (result == 0) { - std::cout << "ffmpeg is available." << std::endl; - } else { - // ffmpeg is not available - std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed "; - std::cout << "and that its executable is included in your system's PATH. "; - exit(0); - } -} - -bool convert_to_wav(const std::string& temp_filename, std::string& error_resp) { - std::ostringstream cmd_stream; - std::string converted_filename_temp = temp_filename + "_temp.wav"; - cmd_stream << "ffmpeg -i \"" << temp_filename - << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" - << converted_filename_temp << "\" 2>&1"; - std::string cmd = cmd_stream.str(); - - int status = std::system(cmd.c_str()); - if (status != 0) { - error_resp = "{\"error\":\"FFmpeg conversion failed.\"}"; - return false; - } - - // Remove the original file - if (remove(temp_filename.c_str()) != 0) { - error_resp = "{\"error\":\"Failed to remove the original file.\"}"; - return false; - } - - // Rename the temporary file to match the original filename - if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) { - error_resp = "{\"error\":\"Failed to rename the temporary file.\"}"; - return false; - } - return true; -} - -void whisper_print_progress_callback(struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - int progress, void* user_data) { - int progress_step = - ((whisper_print_user_data*)user_data)->params->progress_step; - int* progress_prev = &(((whisper_print_user_data*)user_data)->progress_prev); - if (progress >= *progress_prev + progress_step) { - *progress_prev += progress_step; - fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress); - } -} - -void whisper_print_segment_callback(struct whisper_context* ctx, - struct whisper_state* /*state*/, int n_new, - void* user_data) { - const auto& params = *((whisper_print_user_data*)user_data)->params; - const auto& pcmf32s = *((whisper_print_user_data*)user_data)->pcmf32s; - - const int n_segments = whisper_full_n_segments(ctx); - - std::string speaker = ""; - - int64_t t0 = 0; - int64_t t1 = 0; - - // print the last n_new segments - const int s0 = n_segments - n_new; - - if (s0 == 0) { - printf("\n"); - } - - for (int i = s0; i < n_segments; i++) { - if (!params.no_timestamps || params.diarize) { - t0 = whisper_full_get_segment_t0(ctx, i); - t1 = whisper_full_get_segment_t1(ctx, i); - } - - if (!params.no_timestamps) { - printf("[%s --> %s] ", to_timestamp(t0).c_str(), - to_timestamp(t1).c_str()); - } - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - if (params.print_colors) { - for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) { - if (params.print_special == false) { - const whisper_token id = whisper_full_get_token_id(ctx, i, j); - if (id >= whisper_token_eot(ctx)) { - continue; - } - } - - const char* text = whisper_full_get_token_text(ctx, i, j); - const float p = whisper_full_get_token_p(ctx, i, j); - - 
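          // Note on the expression below: the token probability p is cubed so
          // that only high-confidence tokens reach the green end of the
          // k_colors palette (declared in whisper_server_context.h), and the
          // resulting index is clamped to the palette size.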
const int col = (std::max)( - 0, (std::min)((int)k_colors.size() - 1, - (int)((std::pow)(p, 3) * float(k_colors.size())))); - - printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, - "\033[0m"); - } - } else { - const char* text = whisper_full_get_segment_text(ctx, i); - - printf("%s%s", speaker.c_str(), text); - } - - if (params.tinydiarize) { - if (whisper_full_get_segment_speaker_turn_next(ctx, i)) { - printf("%s", params.tdrz_speaker_turn.c_str()); - } - } - - // with timestamps or speakers: each segment on new line - if (!params.no_timestamps || params.diarize) { - printf("\n"); - } - fflush(stdout); - } -} - -whisper_server_context::~whisper_server_context() { - if (ctx) { - whisper_print_timings(ctx); - whisper_free(ctx); - ctx = nullptr; - } -} - -bool whisper_server_context::load_model(std::string& model_path) { - whisper_mutex.lock(); - - // clean up - whisper_free(ctx); - - // whisper init - ctx = whisper_init_from_file_with_params(model_path.c_str(), cparams); - - // TODO perhaps load prior model here instead of exit - if (ctx == nullptr) { - whisper_mutex.unlock(); - return false; - } - - // initialize openvino encoder. this has no effect on whisper.cpp builds that - // don't have OpenVINO configured - whisper_ctx_init_openvino_encoder( - ctx, nullptr, params.openvino_encode_device.c_str(), nullptr); - - // check if the model is in the file system - whisper_mutex.unlock(); - return true; -} - -std::string whisper_server_context::inference( - std::string& input_file_path, std::string language, std::string prompt, - std::string response_format, float temperature, bool translate) { - // acquire whisper model mutex lock - whisper_mutex.lock(); - - // audio arrays - std::vector pcmf32; // mono-channel F32 PCM - std::vector> pcmf32s; // stereo-channel F32 PCM - - // if file is not wav, convert to wav - if (params.ffmpeg_converter) { - std::string error_resp = "Failed to execute ffmpeg command converting " + - input_file_path + " to wav"; - const bool is_converted = convert_to_wav(input_file_path, error_resp); - if (!is_converted) { - whisper_mutex.unlock(); - LOG_ERROR << error_resp; - throw std::runtime_error(error_resp); - } - } - - // read wav content into pcmf32 - if (!read_wav(input_file_path, pcmf32, pcmf32s, params.diarize)) { - std::string error_resp = "Failed to read WAV file " + input_file_path; - LOG_ERROR << error_resp; - whisper_mutex.unlock(); - throw std::runtime_error(error_resp); - } - - printf("Successfully loaded %s\n", input_file_path.c_str()); - - params.translate = translate; - params.language = language; - params.response_format = response_format; - if (!whisper_is_multilingual(ctx)) { - if (params.language != "en" || params.translate) { - params.language = "en"; - params.translate = false; - LOG_WARN - << "Model " << model_id - << " is not multilingual, ignoring language and translation options"; - } - } - if (params.detect_language) { - params.language = "auto"; - } - - // print some processing info - std::string processing_info = - "Model " + model_id + " processing " + input_file_path + " (" + - std::to_string(pcmf32.size()) + " samples, " + - std::to_string(float(pcmf32.size()) / WHISPER_SAMPLE_RATE) + " sec), " + - std::to_string(params.n_threads) + " threads, " + - std::to_string(params.n_processors) + - " processors, lang = " + params.language + - ", task = " + (params.translate ? "translate" : "transcribe") + ", " + - (params.tinydiarize ? "tdrz = 1, " : "") + - (params.no_timestamps ? 
"timestamps = 0" : "timestamps = 1"); - LOG_INFO << processing_info; - - // run the inference - { - std::string msg = "Running whisper.cpp inference of model " + model_id + - " on " + input_file_path; - LOG_INFO << msg; - whisper_full_params wparams = - whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - - wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH - : WHISPER_SAMPLING_GREEDY; - - wparams.print_realtime = false; - wparams.print_progress = params.print_progress; - wparams.print_timestamps = !params.no_timestamps; - wparams.print_special = params.print_special; - wparams.translate = params.translate; - wparams.language = params.language.c_str(); - wparams.detect_language = params.detect_language; - wparams.n_threads = params.n_threads; - wparams.n_max_text_ctx = - params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx; - wparams.offset_ms = params.offset_t_ms; - wparams.duration_ms = params.duration_ms; - - wparams.thold_pt = params.word_thold; - wparams.max_len = params.max_len == 0 ? 60 : params.max_len; - wparams.split_on_word = params.split_on_word; - - wparams.speed_up = params.speed_up; - wparams.debug_mode = params.debug_mode; - - wparams.tdrz_enable = params.tinydiarize; // [TDRZ] - - wparams.initial_prompt = prompt.c_str(); - - wparams.greedy.best_of = params.best_of; - wparams.beam_search.beam_size = params.beam_size; - - wparams.temperature = temperature; - wparams.temperature_inc = params.temperature_inc; - wparams.entropy_thold = params.entropy_thold; - wparams.logprob_thold = params.logprob_thold; - - wparams.no_timestamps = params.no_timestamps; - - whisper_print_user_data user_data = {¶ms, &pcmf32s, 0}; - - // this callback is called on each new segment - if (params.print_realtime) { - wparams.new_segment_callback = whisper_print_segment_callback; - wparams.new_segment_callback_user_data = &user_data; - } - - if (wparams.print_progress) { - wparams.progress_callback = whisper_print_progress_callback; - wparams.progress_callback_user_data = &user_data; - } - - // examples for abort mechanism - // in examples below, we do not abort the processing, but we could if the - // flag is set to true - - // the callback is called before every encoder run - if it returns false, - // the processing is aborted - { - static bool is_aborted = - false; // NOTE: this should be atomic to avoid data race - - wparams.encoder_begin_callback = [](struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - void* user_data) { - bool is_aborted = *(bool*)user_data; - return !is_aborted; - }; - wparams.encoder_begin_callback_user_data = &is_aborted; - } - - // the callback is called before every computation - if it returns true, the - // computation is aborted - { - static bool is_aborted = - false; // NOTE: this should be atomic to avoid data race - - wparams.abort_callback = [](void* user_data) { - bool is_aborted = *(bool*)user_data; - return is_aborted; - }; - wparams.abort_callback_user_data = &is_aborted; - } - - if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), - params.n_processors) != 0) { - std::string error_resp = "Failed to process audio"; - LOG_ERROR << error_resp; - whisper_mutex.unlock(); - throw std::runtime_error(error_resp); - } - } - - // return results to user - std::string result; - if (params.response_format == text_format) { - result = output_str(ctx, params, pcmf32s); - } else if (params.response_format == srt_format) { - std::stringstream ss; - const int n_segments = whisper_full_n_segments(ctx); - for 
(int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - ss << i + 1 + params.offset_n << "\n"; - ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n"; - ss << speaker << text << "\n\n"; - } - result = ss.str(); - } else if (params.response_format == vtt_format) { - std::stringstream ss; - - ss << "WEBVTT\n\n"; - - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true); - speaker.insert(0, ""); - } - - ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n"; - ss << speaker << text << "\n\n"; - } - result = ss.str(); - } else if (params.response_format == vjson_format) { - /* try to match openai/whisper's Python format */ - std::string results = output_str(ctx, params, pcmf32s); - json jres = json{{"text", results}}; - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - json segment = json{ - {"id", i}, - {"text", whisper_full_get_segment_text(ctx, i)}, - }; - - if (!params.no_timestamps) { - segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01; - segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01; - } - - const int n_tokens = whisper_full_n_tokens(ctx, i); - for (int j = 0; j < n_tokens; ++j) { - whisper_token_data token = whisper_full_get_token_data(ctx, i, j); - if (token.id >= whisper_token_eot(ctx)) { - continue; - } - - segment["tokens"].push_back(token.id); - json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}}; - if (!params.no_timestamps) { - word["start"] = token.t0 * 0.01; - word["end"] = token.t1 * 0.01; - } - word["probability"] = token.p; - segment["words"].push_back(word); - } - jres["segments"].push_back(segment); - } - result = jres.dump(-1, ' ', false, json::error_handler_t::replace); - } else { - std::string results = output_str(ctx, params, pcmf32s); - json jres = json{{"text", results}}; - result = jres.dump(-1, ' ', false, json::error_handler_t::replace); - } - - // reset params to thier defaults - params = default_params; - - // return whisper model mutex lock - whisper_mutex.unlock(); - LOG_INFO << "Successfully processed " << input_file_path << ": " << result; - - return result; -} diff --git a/cortex-cpp/context/whisper_server_context.h b/cortex-cpp/context/whisper_server_context.h deleted file mode 100644 index da29e4d9f..000000000 --- a/cortex-cpp/context/whisper_server_context.h +++ /dev/null @@ -1,165 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -#include "whisper.h" - -// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9] -// Lowest is red, middle is yellow, highest is green. 
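For reference, a small self-contained sketch of how a token probability is bucketed into the k_colors palette declared immediately below; it mirrors the expression used in whisper_print_segment_callback in whisper_server_context.cc. This is illustrative only: probability_to_color_index is a hypothetical helper name, not something the deleted sources define.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

// Hypothetical helper (illustration only): maps a token probability p in
// [0, 1] to an index into a palette of n_colors ANSI escape codes. Cubing p
// keeps low- and mid-confidence tokens near the red end of the palette; the
// result is clamped to a valid index.
static int probability_to_color_index(float p, std::size_t n_colors) {
  return std::max(
      0, std::min((int)n_colors - 1, (int)(std::pow(p, 3) * (float)n_colors)));
}

int main() {
  // With the 10-entry palette below: 0.5 -> 1, 0.9 -> 7, 1.0 -> 9.
  for (float p : {0.5f, 0.9f, 1.0f}) {
    std::printf("p = %.1f -> color index %d\n", p,
                probability_to_color_index(p, 10));
  }
  return 0;
}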
-const std::vector k_colors = { - "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", - "\033[38;5;220m", "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", - "\033[38;5;118m", "\033[38;5;82m", -}; - -// output formats -const std::string json_format = "json"; -const std::string text_format = "text"; -const std::string srt_format = "srt"; -const std::string vjson_format = "verbose_json"; -const std::string vtt_format = "vtt"; - -#define COMMON_SAMPLE_RATE 16000 - -struct whisper_params { - int32_t n_threads = - (std::min)(4, (int32_t)std::thread::hardware_concurrency()); - int32_t n_processors = 1; - int32_t offset_t_ms = 0; - int32_t offset_n = 0; - int32_t duration_ms = 0; - int32_t progress_step = 5; - int32_t max_context = -1; - int32_t max_len = 0; - int32_t best_of = 2; - int32_t beam_size = -1; - - float word_thold = 0.01f; - float entropy_thold = 2.40f; - float logprob_thold = -1.00f; - float temperature = 0.00f; - float temperature_inc = 0.20f; - - bool speed_up = false; - bool debug_mode = false; - bool translate = false; - bool detect_language = false; - bool diarize = false; - bool tinydiarize = false; - bool split_on_word = false; - bool no_fallback = false; - bool print_special = false; - bool print_colors = false; - bool print_realtime = false; - bool print_progress = false; - bool no_timestamps = false; - bool use_gpu = true; - bool ffmpeg_converter = false; - - std::string language = "en"; - std::string prompt = ""; - std::string font_path = - "/System/Library/Fonts/Supplemental/Courier New Bold.ttf"; - std::string model = "models/ggml-base.en.bin"; - - std::string response_format = json_format; - - // [TDRZ] speaker turn string - std::string tdrz_speaker_turn = - " [SPEAKER_TURN]"; // TODO: set from command line - - std::string openvino_encode_device = "CPU"; -}; - -// Read WAV audio file and store the PCM data into pcmf32 -// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE -// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain -// 2 channel PCM -bool read_wav(const std::string& fname, std::vector& pcmf32, - std::vector>& pcmf32s, bool stereo); - -std::string output_str(struct whisper_context* ctx, - const whisper_params& params, - std::vector> pcmf32s); - -std::string estimate_diarization_speaker( - std::vector> pcmf32s, int64_t t0, int64_t t1, - bool id_only = false); - -// 500 -> 00:05.000 -// 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma = false); - -int timestamp_to_sample(int64_t t, int n_samples); - -bool is_file_exist(const char* fileName); - -void whisper_print_usage(int /*argc*/, char** argv, - const whisper_params& params); - -bool whisper_params_parse(int argc, char** argv, whisper_params& params); - -void check_ffmpeg_availibility(); - -bool convert_to_wav(const std::string& temp_filename, std::string& error_resp); - -void whisper_print_progress_callback(struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - int progress, void* user_data); - -void whisper_print_segment_callback(struct whisper_context* ctx, - struct whisper_state* /*state*/, int n_new, - void* user_data); - -struct whisper_print_user_data { - const whisper_params* params; - - const std::vector>* pcmf32s; - int progress_prev; -}; - -struct whisper_server_context { - whisper_params params; - whisper_params default_params; - std::mutex whisper_mutex; - std::string model_id; - - struct whisper_context_params cparams; - struct whisper_context* ctx = nullptr; - - whisper_server_context() = default; // 
add this line - - // Constructor - whisper_server_context(const std::string& model_id) { - this->model_id = model_id; - this->cparams = whisper_context_params(); - this->ctx = nullptr; - // store default params so we can reset after each inference request - this->default_params = whisper_params(); - this->params = whisper_params(); - } - - // Move constructor - whisper_server_context(whisper_server_context&& other) noexcept - : params(std::move(other.params)), - default_params(std::move(other.default_params)), - whisper_mutex() // std::mutex is not movable, so we initialize a new one - , - model_id(std::move(other.model_id)), - cparams(std::move(other.cparams)), - ctx(std::exchange( - other.ctx, - nullptr)) // ctx is a raw pointer, so we use std::exchange - {} - - bool load_model(std::string& model_path); - - std::string inference(std::string& input_file_path, std::string languague, - std::string prompt, std::string response_format, - float temperature, bool translate); - - ~whisper_server_context(); -}; \ No newline at end of file diff --git a/cortex-cpp/controllers/audio.cc b/cortex-cpp/controllers/audio.cc deleted file mode 100644 index 91fd76d5b..000000000 --- a/cortex-cpp/controllers/audio.cc +++ /dev/null @@ -1,300 +0,0 @@ -#include "audio.h" - -#include "utils/nitro_utils.h" -#include "whisper.h" - -using namespace v1; - -audio::audio() { - whisper_print_system_info(); -}; - -audio::~audio() {} - -std::optional audio::ParseModelId( - const std::shared_ptr& jsonBody, - const std::function& callback) { - if (!jsonBody->isMember("model_id")) { - LOG_INFO << "No model_id found in request body"; - Json::Value jsonResp; - jsonResp["message"] = "No model_id found in request body"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return std::nullopt; // Signal that an error occurred - } - - return (*jsonBody)["model_id"].asString(); -} - -void audio::LoadModel(const HttpRequestPtr& req, - std::function&& callback) { - const auto jsonBody = req->getJsonObject(); - auto optional_model_id = ParseModelId(jsonBody, callback); - if (!optional_model_id) { - return; - } - std::string model_id = *optional_model_id; - - // Check if model is already loaded - if (whispers.find(model_id) != whispers.end()) { - std::string error_msg = "Model " + model_id + " already loaded"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k409Conflict); - callback(resp); - return; - } - - // Model not loaded, load it - // Parse model path from request - std::string model_path = (*jsonBody)["model_path"].asString(); - if (!is_file_exist(model_path.c_str())) { - std::string error_msg = "Model " + model_path + " not found"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - whisper_server_context whisper = whisper_server_context(model_id); - bool model_loaded = whisper.load_model(model_path); - // If model failed to load, return a 500 error - if (!model_loaded) { - whisper.~whisper_server_context(); - std::string error_msg = "Failed to load model " + model_path; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k500InternalServerError); - callback(resp); - 
return; - } - - // Warm up the model - // Parse warm up audio path from request - if (jsonBody->isMember("warm_up_audio_path")) { - std::string warm_up_msg = "Warming up model " + model_id; - LOG_INFO << warm_up_msg; - std::string warm_up_audio_path = - (*jsonBody)["warm_up_audio_path"].asString(); - // Return 400 error if warm up audio path is not found - if (!is_file_exist(warm_up_audio_path.c_str())) { - std::string error_msg = - "Warm up audio " + warm_up_audio_path + - " not found, please provide a valid path or don't specify it at all"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return; - } else { - LOG_INFO << "Warming up model " << model_id << " with audio " - << warm_up_audio_path << " ..."; - std::string warm_up_result = whisper.inference(warm_up_audio_path, "en", - "", text_format, 0, false); - LOG_INFO << "Warm up model " << model_id << " completed"; - } - } else { - LOG_INFO << "No warm up audio provided, skipping warm up"; - } - - // Model loaded successfully, add it to the map of loaded models - // and return a 200 response - whispers.emplace(model_id, std::move(whisper)); - Json::Value jsonResp; - std::string success_msg = "Model " + model_id + " loaded successfully"; - jsonResp["message"] = success_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::UnloadModel( - const HttpRequestPtr& req, - std::function&& callback) { - const auto& jsonBody = req->getJsonObject(); - auto optional_model_id = ParseModelId(jsonBody, callback); - if (!optional_model_id) { - return; - } - std::string model_id = *optional_model_id; - - // If model is not loaded, return a 404 error - if (whispers.find(model_id) == whispers.end()) { - std::string error_msg = - "Model " + model_id + - " has not been loaded, please load that model into nitro"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - // Model loaded, unload it - whispers[model_id].~whisper_server_context(); - whispers.erase(model_id); - - // Return a 200 response - Json::Value jsonResp; - std::string success_msg = "Model " + model_id + " unloaded successfully"; - LOG_INFO << success_msg; - jsonResp["message"] = success_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::ListModels(const HttpRequestPtr& req, - std::function&& callback) { - // Return a list of all loaded models - Json::Value jsonResp; - Json::Value models; - for (auto const& model : whispers) { - models.append(model.first); - } - jsonResp["models"] = models; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::TranscriptionImpl( - const HttpRequestPtr& req, - std::function&& callback, bool translate) { - MultiPartParser partParser; - Json::Value jsonResp; - if (partParser.parse(req) != 0 || partParser.getFiles().size() != 1) { - auto resp = HttpResponse::newHttpResponse(); - resp->setBody("Must have exactly one file"); - resp->setStatusCode(k403Forbidden); - callback(resp); - return; - } - auto& file = partParser.getFiles()[0]; - const auto& formFields = partParser.getParameters(); - - // Check 
if model_id are present in the request. If not, return a 400 error - if (formFields.find("model_id") == formFields.end()) { - LOG_INFO << "No model_id found in request body"; - Json::Value jsonResp; - jsonResp["message"] = "No model_id found in request body"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return; - } - - std::string model_id = formFields.at("model_id"); - - // Parse all other optional parameters from the request - std::string language = formFields.find("language") != formFields.end() - ? formFields.at("language") - : "en"; - std::string prompt = formFields.find("prompt") != formFields.end() - ? formFields.at("prompt") - : ""; - std::string response_format = - formFields.find("response_format") != formFields.end() - ? formFields.at("response_format") - : json_format; - float temperature = formFields.find("temperature") != formFields.end() - ? std::stof(formFields.at("temperature")) - : 0; - - // Check if model is loaded. If not, return a 404 error - if (whispers.find(model_id) == whispers.end()) { - std::string error_msg = - "Model " + model_id + - " has not been loaded, please load that model into nitro"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - // Save input file to temp location - std::string temp_dir = - std::filesystem::temp_directory_path().string() + "/" + - std::to_string(std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count()); - // Create the directory - std::filesystem::create_directory(temp_dir); - // Save the file to the directory, with its original name - std::string temp_file_path = temp_dir + "/" + file.getFileName(); - file.saveAs(temp_file_path); - - // Run inference - std::string result; - try { - result = - whispers[model_id].inference(temp_file_path, language, prompt, - response_format, temperature, translate); - } catch (const std::exception& e) { - std::remove(temp_file_path.c_str()); - Json::Value jsonResp; - jsonResp["message"] = e.what(); - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k500InternalServerError); - callback(resp); - return; - } - // TODO: Need to remove the entire temp directory, not just the file - std::remove(temp_file_path.c_str()); - - auto resp = nitro_utils::nitroHttpResponse(); - resp->setBody(result); - resp->setStatusCode(k200OK); - // Set content type based on response format - if (response_format == json_format || response_format == vjson_format) { - resp->addHeader("Content-Type", "application/json"); - } else if (response_format == text_format) { - resp->addHeader("Content-Type", "text/html"); - } else if (response_format == srt_format) { - resp->addHeader("Content-Type", "application/x-subrip"); - } else if (response_format == vtt_format) { - resp->addHeader("Content-Type", "text/vtt"); - } - callback(resp); - return; -} - -void audio::ModelStatus( - const HttpRequestPtr& req, - std::function&& callback) { - auto resp = nitro_utils::nitroHttpResponse(); - resp->setStatusCode(k200OK); - resp->setContentTypeCode(drogon::CT_APPLICATION_JSON); - resp->setBody("Unimplemented"); - callback(resp); -} - -void audio::CreateTranscription( - const HttpRequestPtr& req, - std::function&& callback) { - return TranscriptionImpl(req, std::move(callback), false); -} - -void audio::CreateTranslation( - const HttpRequestPtr& 
req, - std::function&& callback) { - return TranscriptionImpl(req, std::move(callback), true); -} \ No newline at end of file diff --git a/cortex-cpp/controllers/audio.h b/cortex-cpp/controllers/audio.h deleted file mode 100644 index 19b1efb6d..000000000 --- a/cortex-cpp/controllers/audio.h +++ /dev/null @@ -1,74 +0,0 @@ -#pragma once - -#include -#include -#include -#include "common/base.h" - -#define DR_WAV_IMPLEMENTATION -#include "utils/dr_wav.h" - -#include "utils/json.hpp" - -// Whisper Context -#include "context/whisper_server_context.h" - -using json = nlohmann::ordered_json; - -using namespace drogon; - -namespace v1 { - -class audio : public drogon::HttpController