From 63c545c1fd270d9467ba9eabe6782a20e3c7a10f Mon Sep 17 00:00:00 2001
From: sangjanai <sang@janai>
Date: Thu, 13 Mar 2025 10:25:34 +0700
Subject: [PATCH 1/8] chore: add CI template

---
 .github/workflows/build.yml        | 2193 +++++++---------------------
 .github/workflows/quality-gate.yml |  472 ++++++
 Makefile                           | 1672 +--------------------
 3 files changed, 1075 insertions(+), 3262 deletions(-)
 create mode 100644 .github/workflows/quality-gate.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 03cde0a48436f..c9fb5a041680b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,1727 +1,630 @@
 name: CI
 
 on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      create_release:
-        description: 'Create new release'
-        required: true
-        type: boolean
   push:
-    branches:
-      - master
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  contents: write # for creating release
+    tags: ["b[0-9]+"]  # only tags named "b" followed by digits (e.g. b1234) trigger a run
+    paths:  # NOTE(review): GitHub does not evaluate path filters for tag pushes - confirm this list still has an effect
+      [
+        ".github/scripts/**",
+        ".github/workflows/build.yml",
+        "**/CMakeLists.txt",
+        "**/Makefile",
+        "**/*.h",
+        "**/*.hpp",
+        "**/*.c",
+        "**/*.cpp",
+        "**/*.cu",
+        "**/*.cc",
+        "**/*.cxx",
+        "llama.cpp",
+        "!docs/**",
+        "!.gitignore",
+        "!README.md",
+      ]
+  workflow_dispatch:  # allows manual triggering
 
 env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
+  EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf
+  VULKAN_VERSION: 1.3.261.1
 
 jobs:
-  macOS-latest-cmake-arm64:
-    runs-on: macos-14
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-arm64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_CURL=ON \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L 'main|curl' --verbose --timeout 900
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          cp LICENSE ./build/bin/
-          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
-          name: llama-bin-macos-arm64.zip
-
-  macOS-latest-cmake-x64:
-    runs-on: macos-13
-
+  create-draft-release:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')  # run for tag pushes only
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
+      version: ${{ steps.get_version.outputs.version }}
+    permissions:
+      contents: write  # needed to create the release
     steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-x64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_CURL=ON \
-            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          cp LICENSE ./build/bin/
-          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
-          name: llama-bin-macos-x64.zip
-
-  ubuntu-cpu-cmake:
+      - name: Extract tag name
+        id: get_version  # publishes the tag as the "version" step output and as $VERSION for later steps
+        run: echo "VERSION=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV" && echo "version=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT"
+        env:
+          GITHUB_REF: ${{ github.ref }}
+      - name: Create Draft Release
+        id: create_release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ github.ref_name }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+          name: "${{ env.VERSION }}"  # VERSION is written to GITHUB_ENV by the get_version step
+          draft: true
+          generate_release_notes: true
+          prerelease: false
+
+  build-and-test:
+    runs-on: ${{ matrix.runs-on }}
+    needs: [create-draft-release]  # NOTE(review): that job only runs on tag pushes; a job whose dependency is skipped is skipped too, so workflow_dispatch runs may build nothing - confirm
+    timeout-minutes: 90
     strategy:
+      fail-fast: false
       matrix:
         include:
-          - build: 'x64'
-            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-22.04-arm
-
-    runs-on: ${{ matrix.os }}
+          - os: "linux"
+            name: "arm64"
+            runs-on: "ubuntu-2004-arm64"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx2-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "noavx-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx512-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "vulkan-x64"
+          #   runs-on: "ubuntu-22-04"
+          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: true
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "noavx-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx2-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx512-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "noavx-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx2-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx512-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          # - os: "macos"
+          #   name: "x64"
+          #   runs-on: "macos-selfhosted-12"
+          #   cmake-flags: "-DGGML_METAL=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "macos"
+          #   name: "arm64"
+          #   runs-on: "macos-selfhosted-12-arm64"
+          #   cmake-flags: "-DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "avx2-x64"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "noavx-x64"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "avx-x64"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "avx512-x64"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "vulkan-x64"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: true
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "noavx-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx2-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx512-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "noavx-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx2-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx512-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
 
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
         with:
-          fetch-depth: 0
+          submodules: recursive
 
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
+      - name: use python for linux
+        continue-on-error: true
+        uses: actions/setup-python@v4
         with:
-          key: ubuntu-cpu-cmake
-          evict-old-files: 1d
+          python-version: '3.10'
 
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
+      - name: Install tools on Windows
+        if: runner.os == 'Windows'
         run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_CURL=ON \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(nproc)
+          choco install ccache awscli make ninja -y
 
-      - name: Test
-        id: cmake_test
+      - name: Install tools on Linux
+        if: runner.os == 'Linux'
         run: |
-          cd build
-          ctest -L 'main|curl' --verbose --timeout 900
+          sudo apt-get install -y ninja-build
+          python3 -m pip install awscli
 
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          if [ "${{ matrix.os }}${{ matrix.name }}" == "linuxarm64" ]; then
+            sudo apt-get install -y ccache
+            exit 0
           fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          cp LICENSE ./build/bin/
-          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
-          name: llama-bin-ubuntu-${{ matrix.build }}.zip
-
-  ubuntu-latest-cmake-sanitizer:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=OFF
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-llguidance:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_LLGUIDANCE=ON
-          cmake --build . --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
+          cd /tmp
+          wget https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz
+          tar -xvf ccache-4.10.2-linux-x86_64.tar.xz
+          sudo cp ccache-4.10.2-linux-x86_64/ccache /usr/bin/ccache
+          ccache -V
+          rm -rf /tmp/ccache-4.10.2-linux-x86_64.tar.xz /tmp/ccache-4.10.2-linux-x86_64
+
+      - name: Download ccache from s3
+        if: runner.os == 'Windows'
+        continue-on-error: true
         run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-cmake-rpc:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-latest-cmake-rpc
-          evict-old-files: 1d
+          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
+          refreshenv
+          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }}  --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
 
-      - name: Dependencies
-        id: depends
+      - name: Download ccache from s3
+        if: runner.os == 'Linux'
+        continue-on-error: true
         run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
+          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }}  --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
 
-      - name: Build
-        id: cmake_build
+      - name: Install coreutils macos
+        if: runner.os == 'macOS'
         run: |
-          cmake -B build \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(nproc)
+          brew install coreutils
 
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
-
-  ubuntu-22-cmake-vulkan:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-vulkan
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
+      - name: Prepare Vulkan SDK Linux
+        if: ${{ matrix.vulkan && (matrix.os == 'linux') }}
         run: |
           wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
           sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
           sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_VULKAN=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 2700
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          cp LICENSE ./build/bin/
-          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
-          name: llama-bin-ubuntu-vulkan-x64.zip
-
-  ubuntu-22-cmake-hip:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:6.0.2
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-hip
-          evict-old-files: 1d
-
-      - name: Build with native CMake HIP support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGGML_HIP_ROCWMMA_FATTN=ON \
-            -DGGML_HIP=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Build with legacy HIP support
-        id: cmake_build_legacy_hip
-        run: |
-          cmake -B build2 -S . \
-            -DCMAKE_C_COMPILER=hipcc \
-            -DCMAKE_CXX_COMPILER=hipcc \
-            -DGGML_HIP_ROCWMMA_FATTN=ON \
-            -DGGML_HIP=ON
-          cmake --build build2 --config Release -j $(nproc)
-
-  ubuntu-22-cmake-musa:
-    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          apt-get update
-          apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-musa
-          evict-old-files: 1d
-
-      - name: Build with native CMake MUSA support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DGGML_MUSA=ON
-          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-22-cmake-sycl:
-    runs-on: ubuntu-22.04
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-sycl
-          evict-old-files: 1d
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx
-          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-22-cmake-sycl-fp16:
-    runs-on: ubuntu-22.04
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-sycl-fp16
-          evict-old-files: 1d
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DGGML_SYCL_F16=ON
-          cmake --build build --config Release -j $(nproc)
-
-  macOS-latest-cmake-ios:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-ios
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-cmake-tvos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-tvos
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-swift:
-    runs-on: macos-latest
-
-    strategy:
-      matrix:
-        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-swift
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
+          sudo apt-get install -y build-essential vulkan-sdk
+
+      - name: Prepare Vulkan SDK Windows  # downloads and runs the LunarG installer, then exports VULKAN_SDK and adds its bin directory to PATH
+        if: ${{ matrix.vulkan && (matrix.os == 'windows') }}  # NOTE(review): the script reads $env:VULKAN_VERSION - confirm it is still defined now that the old job-level env block is removed
         continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build llama.cpp with CMake
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh
-
-  windows-msys2:
-    runs-on: windows-latest
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
-          - { sys: CLANG64, env: clang-x86_64, build: Release }
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-msys2
-          variant: sccache
-          evict-old-files: 1d
-
-      - name: Setup ${{ matrix.sys }}
-        uses: msys2/setup-msys2@v2
-        with:
-          update: true
-          msystem: ${{matrix.sys}}
-          install: >-
-            base-devel
-            git
-            mingw-w64-${{matrix.env}}-toolchain
-            mingw-w64-${{matrix.env}}-cmake
-            mingw-w64-${{matrix.env}}-openblas
-
-      - name: Build using CMake
-        shell: msys2 {0}
-        run: |
-            cmake -B build
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
-      - name: Clean after building using CMake
-        shell: msys2 {0}
-        run: |
-            rm -rf build
-
-      - name: Build using CMake w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
-  windows-latest-cmake:
-    runs-on: windows-latest
-
-    env:
-      OPENBLAS_VERSION: 0.3.23
-      SDE_VERSION: 9.33.0-2024-01-07
-      VULKAN_VERSION: 1.4.304.1
-
-    strategy:
-      matrix:
-        include:
-          - build: 'noavx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
-          - build: 'avx2-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
-          - build: 'avx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
-          - build: 'avx512-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
-          - build: 'openblas-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'kompute-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
-          - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
-          - build: 'llvm-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
-          - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
-          - build: 'llvm-arm64-opencl-adreno'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-${{ matrix.build }}
-          variant: sccache
-          evict-old-files: 1d
-
-      - name: Clone Kompute submodule
-        id: clone_kompute
-        if: ${{ matrix.build == 'kompute-x64' }}
-        run: |
-          git submodule update --init ggml/src/ggml-kompute/kompute
-
-      - name: Download OpenBLAS
-        id: get_openblas
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
-          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/openblas
-          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
-          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
-
-      - name: Install Vulkan SDK
-        id: get_vulkan
-        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
 
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
-        run: |
-          git clone https://github.com/KhronosGroup/OpenCL-Headers
-          cd OpenCL-Headers
-          cmake -B build `
-            -DBUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build --target install
-          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-          cd OpenCL-ICD-Loader
-          cmake -B build-arm64-release `
-            -A arm64 `
-            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build-arm64-release --target install --config release
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -S . -B build ${{ matrix.defines }}
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: Add libopenblas.dll
-        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
-          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
-
-      - name: Check AVX512F support
-        id: check_avx512f
-        if: ${{ matrix.build == 'avx512-x64' }}
-        continue-on-error: true
-        run: |
-          cd build
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $cl =  $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
-          echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
-          & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
-          .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
-
-      - name: Test
-        id: cmake_test
-        # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
-        run: |
-          cd build
-          ctest -L main -C Release --verbose --timeout 900
-
-      - name: Test (Intel SDE)
-        id: cmake_test_sde
-        if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
-          # for some weird reason windows tar doesn't like sde tar.xz
-          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
-          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
-          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
-          cd build
-          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
-          & $sde -future -- ctest -L main -C Release --verbose --timeout 900
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
-          name: llama-bin-win-${{ matrix.build }}.zip
-
-  ubuntu-latest-cmake-cuda:
-    runs-on: ubuntu-latest
-    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
-
-    steps:
-        - name: Clone
-          id: checkout
-          uses: actions/checkout@v4
-          with:
-            fetch-depth: 0
-
-        - name: Install dependencies
-          env:
-            DEBIAN_FRONTEND: noninteractive
-          run: |
-              apt update
-              apt install -y cmake build-essential ninja-build libgomp1 git
-
-        - name: ccache
-          uses: hendrikmuhs/ccache-action@v1.2.16
-          with:
-            key: ubuntu-latest-cmake-cuda
-            evict-old-files: 1d
-
-        - name: Build with CMake
-          run: |
-            cmake -S . -B build -G Ninja \
-              -DCMAKE_BUILD_TYPE=Release \
-              -DCMAKE_CUDA_ARCHITECTURES=89-real \
-              -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-              -DLLAMA_FATAL_WARNINGS=ON \
-              -DGGML_NATIVE=OFF \
-              -DGGML_CUDA=ON
-            cmake --build build
-
-  windows-2019-cmake-cuda:
-    runs-on: windows-2019
-
-    strategy:
-      matrix:
-        cuda: ['12.4', '11.7']
-        build: ['cuda']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-            fetch-depth: 0
-
-      - name: Install ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
-          variant: sccache
-          evict-old-files: 1d
-
-      - name: Install Cuda Toolkit 11.7
-        if: ${{ matrix.cuda == '11.7' }}
-        run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-      - name: Install Cuda Toolkit 12.4
-        if: ${{ matrix.cuda == '12.4' }}
-        run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-          cmake -S . -B build -G "Ninja Multi-Config" ^
-            -DLLAMA_BUILD_SERVER=ON ^
-            -DGGML_NATIVE=OFF ^
-            -DGGML_CUDA=ON ^
-            -DGGML_RPC=ON
-          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
-          cmake --build build --config Release
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
-          name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-
-      - name: Copy and pack Cuda runtime
-        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-        run: |
-          echo "Cuda install location: ${{ env.CUDA_PATH }}"
-          $dst='.\build\bin\cudart\'
-          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
-
-      - name: Upload Cuda runtime
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-          name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-
-  windows-latest-cmake-sycl:
-    runs-on: windows-latest
-
-    defaults:
-      run:
+      - name: Get Cer for code signing
+        if: runner.os == 'macOS'
+        run: base64 -d <<< "$CODE_SIGN_P12_BASE64" > /tmp/codesign.p12
         shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
+        env:
+          CODE_SIGN_P12_BASE64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
+  
+      - uses: apple-actions/import-codesign-certs@v2
+        continue-on-error: true
+        if: runner.os == 'macOS'
         with:
-          fetch-depth: 0
+          p12-file-base64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
+          p12-password: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
 
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
+      - uses: actions/setup-dotnet@v3
+        if: runner.os == 'Windows'
         with:
-          key: windows-latest-cmake-sycl
-          variant: sccache
-          evict-old-files: 1d
+          dotnet-version: "8.0.x"
 
-      - name: Install
-        run:  |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+      - name: Add msbuild to PATH
+        if: runner.os == 'Windows'
+        uses: ilammy/msvc-dev-cmd@v1.13.0
 
       - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
+        id: build-and-test
         run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Build the release package
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+          make build-lib CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}"
 
-          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
-
-      - name: Upload the release package
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+      - uses: 1arp/create-a-file-action@0.4.5
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
-          name: llama-bin-win-sycl-x64.zip
-
-  windows-latest-cmake-hip:
-    if: ${{ github.event.inputs.create_release != 'true' }}
-    runs-on: windows-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
+          path: 'llama'
+          isAbsolutePath: false
+          file: 'version.txt'
+          content: |
+            name: ${{ matrix.os }}-${{ matrix.name }}
+            version: ${{needs.create-draft-release.outputs.version}}
 
-      - name: Clone rocWMMA repository
-        id: clone_rocwmma
+      - name: Code Signing macOS
+        if: runner.os == 'macOS'
         run: |
-          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
+          make codesign CODE_SIGN=true DEVELOPER_ID="${{ secrets.DEVELOPER_ID }}"
 
-      - name: Install
-        id: depends
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP SDK"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP SDK installation"
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-
-      - name: Install ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: ${{ github.job }}
-          evict-old-files: 1d
-
-      - name: Build
-        id: cmake_build
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DGGML_HIP=ON `
-            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGGML_RPC=ON
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-
-  windows-latest-cmake-hip-release:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-    runs-on: windows-latest
-
-    strategy:
-      matrix:
-        gpu_target: [gfx1100, gfx1101, gfx1030]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-            fetch-depth: 0
-
-      - name: Clone rocWMMA repository
-        id: clone_rocwmma
-        run: |
-          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-hip-release
-          evict-old-files: 1d
-
-      - name: Install
-        id: depends
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP SDK"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP SDK installation"
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-
-      - name: Build
-        id: cmake_build
+      - name: Code Signing Windows
+        if: runner.os == 'Windows'
+        shell: cmd
         run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGGML_HIP=ON `
-            -DGGML_RPC=ON
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-          md "build\bin\rocblas\library\"
-          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+          set PATH=%PATH%;%USERPROFILE%\.dotnet\tools
+          make codesign CODE_SIGN=true AZURE_KEY_VAULT_URI="${{ secrets.AZURE_KEY_VAULT_URI }}" AZURE_CLIENT_ID="${{ secrets.AZURE_CLIENT_ID }}" AZURE_TENANT_ID="${{ secrets.AZURE_TENANT_ID }}" AZURE_CLIENT_SECRET="${{ secrets.AZURE_CLIENT_SECRET }}" AZURE_CERT_NAME="${{ secrets.AZURE_CERT_NAME }}"
 
-      - name: Determine tag name
-        id: tag
-        shell: bash
+      - name: Package
         run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
+          cat llama/version.txt
+          make package
 
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
+      # - name: Run e2e testing
+      #   if: ${{ matrix.run-e2e }}
+      #   run: |
+      #     make run-e2e-test LLM_MODEL_URL=${{ env.LLM_MODEL_URL }} EMBEDDING_MODEL_URL=${{ env.EMBEDDING_MODEL_URL }}
 
-      - name: Upload artifacts
+      - name: Upload Artifact
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-
-  ios-xcode-build:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
+          name: llama-${{ matrix.os }}-${{ matrix.name }}
+          path: ./llama
+
+      - name: Calculate SHA512 Checksum (macOS)  # exports checksum and size through GITHUB_ENV
+        if: runner.os == 'macOS'
+        run: |
+          shasum -a 512 ./llama.tar.gz | awk '{ print $1 }' > sha512.txt  # shasum ships with macOS; sha512sum may be missing
+          size=$(stat -f%z ./llama.tar.gz)  # macOS stat takes -f%z for the size in bytes
+          echo "checksum=$(cat sha512.txt)" >> "$GITHUB_ENV"
+          echo "size=$size" >> "$GITHUB_ENV"
+  
+      - name: Calculate SHA512 Checksum (Windows)  # exports checksum and size through GITHUB_ENV
+        if: runner.os == 'Windows'
+        shell: pwsh  # the script below uses PowerShell cmdlets and $env: syntax
+        run: |
+          CertUtil -hashfile ./llama.tar.gz SHA512 | Select-String -Pattern "^[0-9a-fA-F]+$" | Out-File sha512.txt
+          $size = (Get-Item ./llama.tar.gz).length
+          echo "checksum=$(Get-Content sha512.txt)" >> $env:GITHUB_ENV
+          echo "size=$size" >> $env:GITHUB_ENV
+
+      - name: Calculate SHA512 Checksum (Linux)  # exports checksum and size through GITHUB_ENV
+        if: runner.os == 'Linux'
+        run: |
+          sha512sum ./llama.tar.gz | cut -d' ' -f1 > sha512.txt
+          artifact_bytes=$(stat -c%s ./llama.tar.gz)
+          echo "checksum=$(cat sha512.txt)" >> $GITHUB_ENV
+          echo "size=$artifact_bytes" >> $GITHUB_ENV
+  
+      ## Matrix outputs workaround: record this matrix leg's sha512 and size under its own key
+      - uses: cloudposse/github-action-matrix-outputs-write@v1
+        id: out
+        with:
+          matrix-step-name: ${{ github.job }}
+          matrix-key: ${{ matrix.os }}-${{ matrix.name }}  # one entry per os/name combination
+          outputs: |-
+            sha512: ${{ env.checksum }}
+            size: ${{ env.size }}
+
+      - uses: actions/upload-release-asset@v1.0.1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          fetch-depth: 0
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: ./llama.tar.gz
+          asset_name: llama-${{ needs.create-draft-release.outputs.version }}-bin-${{ matrix.os }}-${{ matrix.name }}.tar.gz
+          asset_content_type: application/gzip
 
-      - name: xcodebuild for swift package
-        id: xcodebuild
+      - name: Upload ccache to s3
+        continue-on-error: true
+        if: always() && runner.os == 'Windows'
         run: |
-          ./build-xcframework.sh
-
-      - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
+          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
+          refreshenv
+          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }}  --endpoint ${{ secrets.MINIO_ENDPOINT }}
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
 
-      - name: Determine tag name
-        id: tag
-        shell: bash
+      - name: Upload ccache to s3
+        continue-on-error: true
+        if: always() && runner.os == 'Linux'
         run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
+          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }}  --endpoint ${{ secrets.MINIO_ENDPOINT }}
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
 
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+      - name: Remove Keychain
+        continue-on-error: true
+        if: always() && runner.os == 'macOS'
         run: |
-          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          security delete-keychain signing_temp.keychain
 
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-          name: llama-${{ steps.tag.outputs.name }}-xcframework
 
-  android-build:
+  ## Read the matrix outputs written by the build-and-test job
+  read:
     runs-on: ubuntu-latest
-
+    needs: [build-and-test]
     steps:
-      - name: Clone
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
-        with:
-          key: android-build
-          evict-old-files: 1d
-
-      - name: Set up JDK
-        uses: actions/setup-java@v3
-        with:
-          java-version: 17
-          distribution: zulu
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@v3
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Build
-        run: |
-          cd examples/llama.android
-
-          ./gradlew build --no-daemon
-
-  release:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-
-    runs-on: ubuntu-latest
-
-    needs:
-      - ubuntu-cpu-cmake
-      - ubuntu-22-cmake-vulkan
-      - windows-latest-cmake
-      - windows-2019-cmake-cuda
-      - windows-latest-cmake-sycl
-      - windows-latest-cmake-hip-release
-      - macOS-latest-cmake-arm64
-      - macOS-latest-cmake-x64
-
+      - uses: cloudposse/github-action-matrix-outputs-read@v1
+        id: read
+        with:
+          matrix-step-name: build-and-test
+    outputs:
+        result: "${{ steps.read.outputs.result }}"
+
+  create-checksum-file:
+    runs-on: ubuntu-20-04
+    permissions:
+      contents: write
+    needs: [read, create-draft-release]
     steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
+      - name: Download cuda dependencies from s3 and create checksum
+        run: |
+          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/12.0/linux/cuda.tar.gz -O /tmp/cuda-12-0-linux-amd64.tar.gz
+          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/11.7/linux/cuda.tar.gz -O /tmp/cuda-11-7-linux-amd64.tar.gz
+          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/12.0/windows/cuda.tar.gz -O /tmp/cuda-12-0-windows-amd64.tar.gz
+          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/11.7/windows/cuda.tar.gz -O /tmp/cuda-11-7-windows-amd64.tar.gz
+          # Release version plus the sha512/size JSON exposed by the `read` job.
+          version=${{ needs.create-draft-release.outputs.version }}
+          outputs=${{ toJson(needs.read.outputs.result) }}
+          # Quoted so the JSON is logged verbatim, without word splitting or globbing.
+          echo "$outputs"
+          # Start checksum.yml with the version header and the files list.
+          echo "version: $version" > checksum.yml
+          echo "files:" >> checksum.yml
+          # One entry per key of .sha512; the url must match the uploaded asset name llama-<version>-bin-<os>-<name>.tar.gz.
+          echo "$outputs" | jq -r --arg version "$version" '
+            .sha512 as $sha512 |
+            .size as $size |
+            (.sha512 | keys[]) as $key |
+            "- url: llama-\($version)-bin-\($key).tar.gz\n  sha512: >-\n    \($sha512[$key])\n  size: \($size[$key])"
+          ' >> checksum.yml
+          # The cuda dependency archives are checksummed locally from the files downloaded above.
+          echo "- url: cuda-12-0-linux-amd64.tar.gz" >> checksum.yml
+          echo "  sha512: >-" >> checksum.yml
+          echo "    $(sha512sum /tmp/cuda-12-0-linux-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
+          echo "  size: $(stat -c%s /tmp/cuda-12-0-linux-amd64.tar.gz)" >> checksum.yml
+
+          echo "- url: cuda-11-7-linux-amd64.tar.gz" >> checksum.yml
+          echo "  sha512: >-" >> checksum.yml
+          echo "    $(sha512sum /tmp/cuda-11-7-linux-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
+          echo "  size: $(stat -c%s /tmp/cuda-11-7-linux-amd64.tar.gz)" >> checksum.yml
+
+          echo "- url: cuda-11-7-windows-amd64.tar.gz" >> checksum.yml
+          echo "  sha512: >-" >> checksum.yml
+          echo "    $(sha512sum /tmp/cuda-11-7-windows-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
+          echo "  size: $(stat -c%s /tmp/cuda-11-7-windows-amd64.tar.gz)" >> checksum.yml
+
+          echo "- url: cuda-12-0-windows-amd64.tar.gz" >> checksum.yml
+          echo "  sha512: >-" >> checksum.yml
+          echo "    $(sha512sum /tmp/cuda-12-0-windows-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
+          echo "  size: $(stat -c%s /tmp/cuda-12-0-windows-amd64.tar.gz)" >> checksum.yml
+          cat checksum.yml
+
+      - name: Upload checksum.yml to GitHub Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          fetch-depth: 0
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: ./checksum.yml
+          asset_name: checksum.yml
+          asset_content_type: text/yaml
 
-      - name: ccache
-        uses: hendrikmuhs/ccache-action@v1.2.16
+      - name: upload cuda-12-0-linux-amd64.tar.gz to Github Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          key: release
-          evict-old-files: 1d
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Download artifacts
-        id: download-artifact
-        uses: actions/download-artifact@v4
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: /tmp/cuda-12-0-linux-amd64.tar.gz
+          asset_name: cuda-12-0-linux-amd64.tar.gz
+          asset_content_type: application/gzip
+      
+      - name: upload cuda-11-7-linux-amd64.tar.gz to Github Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          path: ./artifact
-
-      - name: Move artifacts
-        id: move_artifacts
-        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
-
-      - name: Create release
-        id: create_release
-        uses: ggml-org/action-create-release@v1
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: /tmp/cuda-11-7-linux-amd64.tar.gz
+          asset_name: cuda-11-7-linux-amd64.tar.gz
+          asset_content_type: application/gzip
+      
+      - name: upload cuda-12-0-windows-amd64.tar.gz to Github Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          tag_name: ${{ steps.tag.outputs.name }}
-
-      - name: Upload release
-        id: upload_release
-        uses: actions/github-script@v3
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: /tmp/cuda-12-0-windows-amd64.tar.gz
+          asset_name: cuda-12-0-windows-amd64.tar.gz
+          asset_content_type: application/gzip
+      
+      - name: upload cuda-11-7-windows-amd64.tar.gz to Github Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          github-token: ${{secrets.GITHUB_TOKEN}}
-          script: |
-            const path = require('path');
-            const fs = require('fs');
-            const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact/release')) {
-              if (path.extname(file) === '.zip') {
-                console.log('uploadReleaseAsset', file);
-                await github.repos.uploadReleaseAsset({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  release_id: release_id,
-                  name: file,
-                  data: await fs.readFileSync(`./artifact/release/${file}`)
-                });
-              }
-            }
-
-#  ubuntu-latest-gcc:
-#    runs-on: ubuntu-latest
-#
-#    strategy:
-#      matrix:
-#        build: [Debug, Release]
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v4
-#
-#      - name: Dependencies
-#        run: |
-#          sudo apt-get update
-#          sudo apt-get install build-essential
-#          sudo apt-get install cmake
-#
-#      - name: Configure
-#        run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-#
-#      - name: Build
-#        run: |
-#          make
-#
-#  ubuntu-latest-clang:
-#    runs-on: ubuntu-latest
-#
-#    strategy:
-#      matrix:
-#        build: [Debug, Release]
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v4
-#
-#      - name: Dependencies
-#        run: |
-#          sudo apt-get update
-#          sudo apt-get install build-essential
-#          sudo apt-get install cmake
-#
-#      - name: Configure
-#        run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
-#
-#      - name: Build
-#        run: |
-#          make
-#
-#  ubuntu-latest-gcc-sanitized:
-#    runs-on: ubuntu-latest
-#
-#    strategy:
-#      matrix:
-#        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v4
-#
-#      - name: Dependencies
-#        run: |
-#          sudo apt-get update
-#          sudo apt-get install build-essential
-#          sudo apt-get install cmake
-#
-#      - name: Configure
-#        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
-#
-#      - name: Build
-#        run: |
-#          make
-#
-#  windows:
-#    runs-on: windows-latest
-#
-#    strategy:
-#      matrix:
-#        build: [Release]
-#        arch: [Win32, x64]
-#        include:
-#          - arch: Win32
-#            s2arc: x86
-#          - arch: x64
-#            s2arc: x64
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v4
-#
-#      - name: Add msbuild to PATH
-#        uses: microsoft/setup-msbuild@v1
-#
-#      - name: Configure
-#        run: >
-#          cmake -S . -B ./build -A ${{ matrix.arch }}
-#          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-#
-#      - name: Build
-#        run: |
-#          cd ./build
-#          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
-#
-#      - name: Upload binaries
-#        uses: actions/upload-artifact@v4
-#        with:
-#          name: llama-bin-${{ matrix.arch }}
-#          path: build/bin/${{ matrix.build }}
-#
-#  windows-blas:
-#    runs-on: windows-latest
-#
-#    strategy:
-#      matrix:
-#        build: [Release]
-#        arch: [Win32, x64]
-#        blas: [ON]
-#        include:
-#          - arch: Win32
-#            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
-#            s2arc: x86
-#          - arch: x64
-#            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
-#            s2arc: x64
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v4
-#
-#      - name: Add msbuild to PATH
-#        uses: microsoft/setup-msbuild@v1
-#
-#      - name: Fetch OpenBLAS
-#        if: matrix.blas == 'ON'
-#        run: |
-#          C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
-#          7z x blas.zip -oblas -y
-#          copy blas/include/cblas.h .
-#          copy blas/include/openblas_config.h .
-#          echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
-#
-#      - name: Configure
-#        run: >
-#          cmake -S . -B ./build -A ${{ matrix.arch }}
-#          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-#          -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
-#          -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
-#
-#      - name: Build
-#        run: |
-#          cd ./build
-#          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
-#
-#      - name: Copy libopenblas.dll
-#        if: matrix.blas == 'ON'
-#        run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
-#
-#      - name: Upload binaries
-#        if: matrix.blas == 'ON'
-#        uses: actions/upload-artifact@v4
-#        with:
-#          name: llama-blas-bin-${{ matrix.arch }}
-#          path: build/bin/${{ matrix.build }}
-#
-#  emscripten:
-#    runs-on: ubuntu-latest
-#
-#    strategy:
-#      matrix:
-#        build: [Release]
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v4
-#
-#      - name: Dependencies
-#        run: |
-#          wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
-#          tar -xvf master.tar.gz
-#          emsdk-master/emsdk update
-#          emsdk-master/emsdk install latest
-#          emsdk-master/emsdk activate latest
-#
-#      - name: Configure
-#        run: echo "tmp"
-#
-#      - name: Build
-#        run: |
-#          pushd emsdk-master
-#          source ./emsdk_env.sh
-#          popd
-#          emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-#          make
-
-  openEuler-latest-cmake-cann:
-    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
-    defaults:
-      run:
-       shell: bash -el {0}
-    runs-on: ubuntu-24.04-arm
-    strategy:
-      matrix:
-        cann:
-          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
-        device:
-          - 'ascend910b3'
-        build:
-          - 'Release'
-    container: ascendai/cann:${{ matrix.cann }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake
-
-      - name: Build
-        run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-
-          cmake -S . -B build \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-              -DGGML_CANN=on \
-              -DSOC_TYPE=${{ matrix.device }}
-          cmake --build build -j $(nproc)
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: /tmp/cuda-11-7-windows-amd64.tar.gz
+          asset_name: cuda-11-7-windows-amd64.tar.gz
+          asset_content_type: application/gzip
\ No newline at end of file
diff --git a/.github/workflows/quality-gate.yml b/.github/workflows/quality-gate.yml
new file mode 100644
index 0000000000000..b9244f74ae0fa
--- /dev/null
+++ b/.github/workflows/quality-gate.yml
@@ -0,0 +1,472 @@
+name: CI Quality Gate
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+  workflow_call:
+    secrets:
+      MINIO_BUCKET_NAME:
+        required: false
+      MINIO_REGION:
+        required: false
+      MINIO_ENDPOINT:
+        required: false
+      MINIO_ACCESS_KEY_ID:
+        required: false
+      MINIO_SECRET_ACCESS_KEY:
+        required: false
+
+env:
+  LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
+  EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf
+  VULKAN_VERSION: 1.3.261.1
+
+jobs:
+  build-and-test:
+    runs-on: ${{ matrix.runs-on }}
+    if: ${{ ! startsWith(github.head_ref, 'update-submodule') }}
+    timeout-minutes: 90
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: "linux"
+            name: "arm64"
+            runs-on: "ubuntu-2004-arm64"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx2-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "noavx-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx512-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "vulkan-x64"
+            runs-on: "ubuntu-22-04"
+            cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: true
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "noavx-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx2-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx512-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "noavx-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx2-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx512-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "macos"
+            name: "x64"
+            runs-on: "macos-selfhosted-12"
+            cmake-flags: "-DGGML_METAL=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
+            run-e2e: false
+            vulkan: false
+            ccache: false
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "macos"
+            name: "arm64"
+            runs-on: "macos-selfhosted-12-arm64"
+            cmake-flags: "-DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
+            run-e2e: false
+            vulkan: false
+            ccache: false
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx2-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: false
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "noavx-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: false
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: false
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx512-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: false
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "vulkan-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+            run-e2e: false
+            vulkan: true
+            ccache: false
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "noavx-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx2-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx512-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "noavx-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx2-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx512-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      # - name: Apply patch file
+      #   run: |
+      #     cd llama.cpp
+      #     git apply ../patches/0001-Add-API-query-buffer-size.patch
+
+      - name: use python for linux
+        continue-on-error: true
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install tools on Windows  # Chocolatey: ccache, AWS CLI, make, Ninja
+        if: runner.os == 'Windows'
+        run: |
+          choco install ccache awscli make ninja -y
+
+      - name: Install tools on Linux
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get install -y ninja-build
+          python3 -m pip install awscli
+          if [ "${{ matrix.os }}${{ matrix.name }}" == "linuxarm64" ]; then
+            sudo apt-get install -y ccache
+            exit 0
+          fi
+          cd /tmp
+          wget https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz
+          tar -xvf ccache-4.10.2-linux-x86_64.tar.xz
+          sudo cp ccache-4.10.2-linux-x86_64/ccache /usr/bin/ccache
+          ccache -V
+          rm -rf /tmp/ccache-4.10.2-linux-x86_64.tar.xz /tmp/ccache-4.10.2-linux-x86_64
+
+      - name: Download ccache from s3
+        if: runner.os == 'Windows'
+        continue-on-error: true
+        run: |
+          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
+          refreshenv
+          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }}  --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+
+      - name: Download ccache from s3
+        if: runner.os == 'Linux'
+        continue-on-error: true
+        run: |
+          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }}  --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+
+      - name: Install coreutils macos
+        if: runner.os == 'macOS'
+        run: |
+          brew install coreutils
+
+      - name: Prepare Vulkan SDK Linux
+        if: ${{ matrix.vulkan && (matrix.os == 'linux') }}
+        run: |
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential vulkan-sdk
+
+      - name: Prepare Vulkan SDK Windows  # reinstalls the LunarG SDK pinned by env.VULKAN_VERSION
+        if: ${{ matrix.vulkan && (matrix.os == 'win') }}  # matrix os values are linux/macos/win
+        continue-on-error: true  # best-effort: a failed install does not fail the job here
+        run: |
+          if (Test-Path C:/VulkanSDK/${env:VULKAN_VERSION}/) { Remove-Item -Path C:/VulkanSDK/${env:VULKAN_VERSION}/ -Force -Recurse }
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
+      - name: Add msbuild to PATH
+        if: runner.os == 'Windows'
+        uses: ilammy/msvc-dev-cmd@v1.13.0
+
+      - name: Build
+        id: build-and-test
+        run: |
+          make build-lib CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}"
+
+      - uses: 1arp/create-a-file-action@0.4.5  # writes build/bin/version.txt describing this build
+        with:
+          path: "build/bin"
+          isAbsolutePath: false
+          file: "version.txt"
+          content: |  # version falls back to github.sha when there is no pull_request payload
+            name: ${{ matrix.os }}-${{ matrix.name }}
+            version: ${{ github.event.pull_request.head.sha || github.sha }}
+
+      - name: Package
+        run: |
+          make package
+
+      # - name: Run e2e testing
+      #   if: ${{ matrix.run-e2e }}
+      #   run: |
+      #     make run-e2e-test LLM_MODEL_URL=${{ env.LLM_MODEL_URL }} EMBEDDING_MODEL_URL=${{ env.EMBEDDING_MODEL_URL }}
+
+      - name: Upload Artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: llama-${{ matrix.os }}-${{ matrix.name }}
+          path: ./build/bin
+
+      - name: Calculate SHA512 Checksum (macOS)
+        if: runner.os == 'macOS'
+        run: |
+          sha512sum ./llama.tar.gz | awk '{ print $1 }' > sha512.txt
+          size=$(stat -f%z ./llama.tar.gz)  # Use -f%z for macOS
+          echo "checksum=$(cat sha512.txt)" >> $GITHUB_ENV
+          echo "size=$size" >> $GITHUB_ENV
+
+      - name: Calculate SHA512 Checksum (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |
+          CertUtil -hashfile ./llama.tar.gz SHA512 | Select-String -Pattern "^[0-9a-fA-F]+$" | Out-File sha512.txt
+          $size = (Get-Item ./llama.tar.gz).length
+          echo "checksum=$(Get-Content sha512.txt)" >> $env:GITHUB_ENV
+          echo "size=$size" >> $env:GITHUB_ENV
+
+      - name: Calculate SHA512 Checksum (Linux)
+        if: runner.os == 'Linux'
+        run: |
+          sha512sum ./llama.tar.gz | awk '{ print $1 }' > sha512.txt
+          size=$(stat -c%s ./llama.tar.gz)
+          echo "checksum=$(cat sha512.txt)" >> $GITHUB_ENV
+          echo "size=$size" >> $GITHUB_ENV
+
+      ## Write for matrix outputs workaround
+      - uses: cloudposse/github-action-matrix-outputs-write@v1
+        id: out
+        with:
+          matrix-step-name: ${{ github.job }}
+          matrix-key: ${{ matrix.os }}-${{ matrix.name }}
+          outputs: |-
+            sha512: ${{ env.checksum }}
+            size: ${{ env.size }}
+
+      - name: Upload ccache to s3
+        continue-on-error: true
+        if: always() && runner.os == 'Windows'
+        run: |
+          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
+          refreshenv
+          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }}  --endpoint ${{ secrets.MINIO_ENDPOINT }}
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+
+      - name: Upload ccache to s3
+        continue-on-error: true
+        if: always() && runner.os == 'Linux'
+        run: |
+          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }}  --endpoint ${{ secrets.MINIO_ENDPOINT }}
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+
+  ## Read matrix outputs
+  read:  # aggregation job: collects the per-matrix outputs written by build-and-test
+    runs-on: ubuntu-latest
+    needs: [build-and-test]
+    steps:
+      - uses: cloudposse/github-action-matrix-outputs-read@v1
+        id: read
+        with:
+          matrix-step-name: build-and-test  # same value the write step passes via github.job
+    outputs:
+      result: "${{ steps.read.outputs.result }}"  # read by create-checksum-file as needs.read.outputs.result
+
+  create-checksum-file:  # renders checksum.yml (version + per-artifact sha512/size) from the matrix outputs
+    runs-on: ubuntu-latest
+    needs: [read]
+    steps:
+      - name: Create checksum.yml
+        run: |
+          version="${{ github.event.pull_request.head.sha || github.sha }}"
+          outputs=${{ toJson(needs.read.outputs.result) }}
+          # version falls back to github.sha when the run has no pull_request payload
+          echo "$outputs"
+
+          echo "version: $version" > checksum.yml
+          echo "files:" >> checksum.yml
+
+          echo "$outputs" | jq -r --arg version "$version" '
+            .sha512 as $sha512 |
+            .size as $size |
+            (.sha512 | keys[]) as $key |
+            "- url: llama-\($version)-\($key).tar.gz\n  sha512: >-\n    \($sha512[$key])\n  size: \($size[$key])"
+          ' >> checksum.yml
+
+          cat checksum.yml
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 1f9455eff0aec..cd69979f31001 100644
--- a/Makefile
+++ b/Makefile
@@ -1,1617 +1,55 @@
-ifndef LLAMA_MAKEFILE
-$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
-endif
-
-# Define the default target now so that it is always the first target
-BUILD_TARGETS = \
-	libllava.a \
-	llama-batched \
-	llama-batched-bench \
-	llama-bench \
-	llama-cli \
-	llama-convert-llama2c-to-ggml \
-	llama-embedding \
-	llama-eval-callback \
-	llama-export-lora \
-	llama-gbnf-validator \
-	llama-gguf \
-	llama-gguf-hash \
-	llama-gguf-split \
-	llama-gritlm \
-	llama-imatrix \
-	llama-infill \
-	llama-llava-cli \
-	llama-minicpmv-cli\
-	llama-qwen2vl-cli\
-	llama-lookahead \
-	llama-lookup \
-	llama-lookup-create \
-	llama-lookup-merge \
-	llama-lookup-stats \
-	llama-parallel \
-	llama-passkey \
-	llama-perplexity \
-	llama-q8dot \
-	llama-quantize \
-	llama-quantize-stats \
-	llama-retrieval \
-	llama-save-load-state \
-	llama-server \
-	llama-simple \
-	llama-simple-chat \
-	llama-run \
-	llama-speculative \
-	llama-tokenize \
-	llama-vdot \
-	llama-cvector-generator \
-	llama-gen-docs \
-	tests/test-c.o
-
-# Binaries only useful for tests
-TEST_TARGETS = \
-	tests/test-arg-parser \
-	tests/test-autorelease \
-	tests/test-backend-ops \
-	tests/test-chat \
-	tests/test-chat-template \
-	tests/test-double-float \
-	tests/test-grammar-integration \
-	tests/test-grammar-parser \
-	tests/test-json-schema-to-grammar \
-	tests/test-llama-grammar \
-	tests/test-log \
-	tests/test-model-load-cancel \
-	tests/test-quantize-fns \
-	tests/test-quantize-perf \
-	tests/test-rope \
-	tests/test-sampling \
-	tests/test-tokenizer-0 \
-	tests/test-tokenizer-1-bpe \
-	tests/test-tokenizer-1-spm
-#	tests/test-opt \
-
-# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
-	retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
-
-# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
-#  We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
-LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
-
-# Deprecation aliases
-ifdef LLAMA_CUBLAS
-$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
-endif
-
-ifdef LLAMA_CUDA
-GGML_CUDA := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_KOMPUTE
-GGML_KOMPUTE := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_METAL
-GGML_METAL := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_RPC
-GGML_RPC := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_SYCL
-GGML_SYCL := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_SYCL_F16
-GGML_SYCL_F16 := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_OPENBLAS
-GGML_OPENBLAS := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_OPENBLAS64
-GGML_OPENBLAS64 := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_BLIS
-GGML_BLIS := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_NO_LLAMAFILE
-GGML_NO_LLAMAFILE := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_NO_ACCELERATE
-GGML_NO_ACCELERATE := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_NO_OPENMP
-GGML_NO_OPENMP := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_NO_METAL
-GGML_NO_METAL := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifdef LLAMA_DISABLE_LOGS
-REMOVE_WARNING := 1
-endif
-
-ifdef LLAMA_SERVER_VERBOSE
-REMOVE_WARNING := 1
-endif
-
-ifndef UNAME_S
-UNAME_S := $(shell uname -s)
-endif
-
-ifndef UNAME_P
-UNAME_P := $(shell uname -p)
-endif
-
-ifndef UNAME_M
-UNAME_M := $(shell uname -m)
-endif
-
-# In GNU make default CXX is g++ instead of c++.  Let's fix that so that users
-# of non-gcc compilers don't have to provide g++ alias or wrapper.
-DEFCC  := cc
-DEFCXX := c++
-ifeq ($(origin CC),default)
-CC  := $(DEFCC)
-endif
-ifeq ($(origin CXX),default)
-CXX := $(DEFCXX)
-endif
-
-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-ifeq ($(UNAME_S),Darwin)
-	ifndef GGML_NO_METAL
-		GGML_METAL := 1
-	endif
-
-	GGML_NO_OPENMP := 1
-
-	ifneq ($(UNAME_P),arm)
-		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
-		ifeq ($(SYSCTL_M),1)
-			# UNAME_P := arm
-			# UNAME_M := arm64
-			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
-		endif
-	endif
-endif
-
-ifdef GGML_METAL
-	GGML_METAL_EMBED_LIBRARY := 1
-endif
-
-ifdef GGML_RPC
-	BUILD_TARGETS += rpc-server
-endif
-
-ifdef GGML_VULKAN
-	BUILD_TARGETS += vulkan-shaders-gen
-endif
-
-default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
-
-test: $(TEST_TARGETS)
-	@failures=0; \
-	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
-			continue; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
-			continue; \
-		else \
-			echo "Running test $$test_target..."; \
-			./$$test_target; \
-		fi; \
-		if [ $$? -ne 0 ]; then \
-			printf 'Test %s FAILED!\n\n' $$test_target; \
-			failures=$$(( failures + 1 )); \
-		else \
-			printf 'Test %s passed.\n\n' $$test_target; \
-		fi; \
-	done; \
-	if [ $$failures -gt 0 ]; then \
-		printf '\n%s tests failed.\n' $$failures; \
-		exit 1; \
-	fi
-	@echo 'All tests passed.'
-
-all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
-
-ifdef RISCV_CROSS_COMPILE
-CC	:= riscv64-unknown-linux-gnu-gcc
-CXX	:= riscv64-unknown-linux-gnu-g++
-endif
-
-#
-# Compile flags
-#
-
-# keep standard at C11 and C++17
-MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
-MK_CFLAGS    = -std=c11   -fPIC
-MK_CXXFLAGS  = -std=c++17 -fPIC
-MK_NVCCFLAGS = -std=c++17
-
-ifdef LLAMA_NO_CCACHE
-GGML_NO_CCACHE := 1
-DEPRECATE_WARNING := 1
-endif
-
-ifndef GGML_NO_CCACHE
-CCACHE := $(shell which ccache)
-ifdef CCACHE
-export CCACHE_SLOPPINESS = time_macros
-$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
-CC    := $(CCACHE) $(CC)
-CXX   := $(CCACHE) $(CXX)
-else
-$(info I ccache not found. Consider installing it for faster compilation.)
-endif # CCACHE
-endif # GGML_NO_CCACHE
-
-# clock_gettime came in POSIX.1b (1993)
-# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-# posix_memalign came in POSIX.1-2001 / SUSv3
-# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-MK_CPPFLAGS += -D_XOPEN_SOURCE=600
-
-# Somehow in OpenBSD whenever POSIX conformance is specified
-# some string functions rely on locale_t availability,
-# which was introduced in POSIX.1-2008, forcing us to go higher
-ifeq ($(UNAME_S),OpenBSD)
-	MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
-endif
-
-# Data types, macros and functions related to controlling CPU affinity and
-# some memory allocation are available on Linux through GNU extensions in libc
-ifeq ($(UNAME_S),Linux)
-	MK_CPPFLAGS += -D_GNU_SOURCE
-	MK_LDFLAGS  += -ldl
-endif
-
-# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
-# and on macOS its availability depends on enabling Darwin extensions
-# similarly on DragonFly, enabling BSD extensions is necessary
-ifeq ($(UNAME_S),Darwin)
-	MK_CPPFLAGS += -D_DARWIN_C_SOURCE
-endif
-ifeq ($(UNAME_S),DragonFly)
-	MK_CPPFLAGS += -D__BSD_VISIBLE
-endif
-
-# alloca is a non-standard interface that is not visible on BSDs when
-# POSIX conformance is specified, but not all of them provide a clean way
-# to enable it in such cases
-ifeq ($(UNAME_S),FreeBSD)
-	MK_CPPFLAGS += -D__BSD_VISIBLE
-endif
-ifeq ($(UNAME_S),NetBSD)
-	MK_CPPFLAGS += -D_NETBSD_SOURCE
-endif
-ifeq ($(UNAME_S),OpenBSD)
-	MK_CPPFLAGS += -D_BSD_SOURCE
-endif
-
-ifdef GGML_SCHED_MAX_COPIES
-	MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
-endif
-
-ifdef LLAMA_DEBUG
-	MK_CFLAGS    += -O0 -g
-	MK_CXXFLAGS  += -O0 -g
-	MK_LDFLAGS   += -g
-	MK_NVCCFLAGS += -O0 -g
-
-	ifeq ($(UNAME_S),Linux)
-		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
-	endif
-else
-	MK_CPPFLAGS   += -DNDEBUG
-	MK_CFLAGS     += -O3 -g
-	MK_CXXFLAGS   += -O3 -g
-	MK_NVCCFLAGS  += -O3 -g
-endif
-
-ifdef LLAMA_SANITIZE_THREAD
-	MK_CFLAGS   += -fsanitize=thread -g
-	MK_CXXFLAGS += -fsanitize=thread -g
-	MK_LDFLAGS  += -fsanitize=thread -g
-endif
-
-ifdef LLAMA_SANITIZE_ADDRESS
-	MK_CFLAGS   += -fsanitize=address -fno-omit-frame-pointer -g
-	MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
-	MK_LDFLAGS  += -fsanitize=address -fno-omit-frame-pointer -g
-endif
-
-ifdef LLAMA_SANITIZE_UNDEFINED
-	MK_CFLAGS   += -fsanitize=undefined -g
-	MK_CXXFLAGS += -fsanitize=undefined -g
-	MK_LDFLAGS  += -fsanitize=undefined -g
-endif
-
-ifdef LLAMA_SERVER_SSL
-	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
-	MK_LDFLAGS += -lssl -lcrypto
-endif
-
-ifndef GGML_NO_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
-endif
-
-# warnings
-WARN_FLAGS = \
-	-Wall \
-	-Wextra \
-	-Wpedantic \
-	-Wcast-qual \
-	-Wno-unused-function
-
-MK_CFLAGS += \
-	$(WARN_FLAGS) \
-	-Wshadow \
-	-Wstrict-prototypes \
-	-Wpointer-arith \
-	-Wmissing-prototypes \
-	-Werror=implicit-int \
-	-Werror=implicit-function-declaration
-
-MK_CXXFLAGS += \
-	$(WARN_FLAGS) \
-	-Wmissing-declarations \
-	-Wmissing-noreturn
-
-ifeq ($(LLAMA_FATAL_WARNINGS),1)
-	MK_CFLAGS   += -Werror
-	MK_CXXFLAGS += -Werror
-endif
-
-# this version of Apple ld64 is buggy
-ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
-	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
-endif
-
-# OS specific
-# TODO: support Windows
-ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
-	MK_CFLAGS   += -pthread
-	MK_CXXFLAGS += -pthread
-endif
-
-# Platform naming: a uname containing "_NT" marks a Windows host
-ifneq (,$(findstring _NT,$(UNAME_S)))
-	_WIN32 := 1
-endif
-
-# Windows: no library name prefix, .dll shared objects, and Winsock 2 for
-# network-capable apps. Everything else: "lib" prefix and .so shared objects.
-ifeq ($(_WIN32),1)
-	DSO_EXT   := .dll
-	LWINSOCK2 := -lws2_32
-else
-	LIB_PRE := lib
-	DSO_EXT := .so
-endif
-
-ifdef LLAMA_GPROF
-	MK_CFLAGS   += -pg
-	MK_CXXFLAGS += -pg
-endif
-
-# Architecture specific
-# TODO: probably these flags need to be tweaked on some architectures
-#       feel free to update the Makefile for your architecture and send a pull request or issue
-
-ifndef RISCV_CROSS_COMPILE
-
-ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
-	# x86 hosts: enable every CPU extension the build machine supports.
-	# The C++ flags go into HOST_CXXFLAGS rather than MK_CXXFLAGS: HOST_CXXFLAGS is
-	# appended to CXXFLAGS only, so it stays out of BASE_CXXFLAGS / CUDA_CXXFLAGS.
-	MK_CFLAGS     += -march=native -mtune=native
-	HOST_CXXFLAGS += -march=native -mtune=native
-
-	# Alternative: AMX build test (uncomment to use)
-	#MK_CFLAGS     += -march=graniterapids -mtune=graniterapids
-	#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
-
-	# Alternative: AVX only (uncomment to use)
-	#MK_CFLAGS   += -mfma -mf16c -mavx
-	#MK_CXXFLAGS += -mfma -mf16c -mavx
-
-	# Alternative: SSSE3 only (note: SSSE3, not SSE3) (uncomment to use)
-	#MK_CFLAGS   += -mssse3
-	#MK_CXXFLAGS += -mssse3
-endif
-
-ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
-	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
-	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-	# https://github.com/ggml-org/llama.cpp/issues/2922
-	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
-	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
-
-	# Target Windows 8 for PrefetchVirtualMemory
-	MK_CPPFLAGS += -D_WIN32_WINNT=0x602
-endif
-
-ifneq ($(filter aarch64%,$(UNAME_M)),)
-	# 64-bit ARM hosts, e.g.:
-	#   Apple M1, M2, etc.
-	#   Raspberry Pi 3, 4, Zero 2 (64-bit)
-	#   Nvidia Jetson
-	MK_CFLAGS   += -mcpu=native
-	MK_CXXFLAGS += -mcpu=native
-	# Output of the jetson_release tool; used to spot TX2 modules below
-	JETSON_RELEASE_INFO = $(shell jetson_release)
-	ifdef JETSON_RELEASE_INFO
-		ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
-			# TX2: flag the module so the CUDA section uses its dedicated
-			# NVCC_COMPILE recipe, and switch both compilers to the cross toolchain
-			JETSON_EOL_MODULE_DETECT = 1
-			CC = aarch64-unknown-linux-gnu-gcc
-			# make variable names are case-sensitive: assigning "cxx" left CXX
-			# untouched, so C++ sources were still built with the default compiler
-			CXX = aarch64-unknown-linux-gnu-g++
-		endif
-	endif
-endif
-
-ifneq ($(filter armv6%,$(UNAME_M)),)
-	# Raspberry Pi 1, Zero
-	MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-	MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-endif
-
-ifneq ($(filter armv7%,$(UNAME_M)),)
-	# Raspberry Pi 2
-	MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-	MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-endif
-
-ifneq ($(filter armv8%,$(UNAME_M)),)
-	# Raspberry Pi 3, 4, Zero 2 (32-bit)
-	MK_CFLAGS   += -mfp16-format=ieee -mno-unaligned-access
-	MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
-endif
-
-ifneq ($(filter ppc64%,$(UNAME_M)),)
-	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
-	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		MK_CFLAGS   += -mcpu=power9
-		MK_CXXFLAGS += -mcpu=power9
-	endif
-endif
-
-ifneq ($(filter ppc64le%,$(UNAME_M)),)
-	MK_CFLAGS   += -mcpu=powerpc64le
-	MK_CXXFLAGS += -mcpu=powerpc64le
-	CUDA_POWER_ARCH = 1
-endif
-
-ifneq ($(filter loongarch64%,$(UNAME_M)),)
-	MK_CFLAGS   += -mlasx
-	MK_CXXFLAGS += -mlasx
-endif
-
-ifneq ($(filter riscv64%,$(UNAME_M)),)
-	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
-	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
-endif
-
-else # RISC-V CROSS COMPILATION
-	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
-	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
-endif
-
-ifndef GGML_NO_ACCELERATE
-	# Mac OS - include Accelerate framework.
-	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
-	ifeq ($(UNAME_S),Darwin)
-		MK_CPPFLAGS  += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
-		MK_CPPFLAGS  += -DACCELERATE_NEW_LAPACK
-		MK_CPPFLAGS  += -DACCELERATE_LAPACK_ILP64
-		MK_LDFLAGS   += -framework Accelerate
-		OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
-	endif
-endif # GGML_NO_ACCELERATE
-
-ifndef GGML_NO_OPENMP
-	MK_CPPFLAGS += -DGGML_USE_OPENMP
-	MK_CFLAGS   += -fopenmp
-	MK_CXXFLAGS += -fopenmp
-endif # GGML_NO_OPENMP
-
-ifdef GGML_OPENBLAS
-	MK_CPPFLAGS  += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
-	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas)
-	MK_LDFLAGS   += $(shell pkg-config --libs openblas)
-	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
-endif # GGML_OPENBLAS
-
-ifdef GGML_OPENBLAS64
-	MK_CPPFLAGS  += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
-	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas64)
-	MK_LDFLAGS   += $(shell pkg-config --libs openblas64)
-	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
-endif # GGML_OPENBLAS64
-
-ifdef GGML_BLIS
-	MK_CPPFLAGS  += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
-	MK_LDFLAGS   += -lblis -L/usr/local/lib
-	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
-endif # GGML_BLIS
-
-ifdef GGML_NVPL
-	MK_CPPFLAGS  += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
-	MK_LDFLAGS   += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
-	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
-endif # GGML_NVPL
-
-ifndef GGML_NO_LLAMAFILE
-	MK_CPPFLAGS  += -DGGML_USE_LLAMAFILE
-	OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
-endif
-
-ifndef GGML_NO_AMX
-	MK_CPPFLAGS += -DGGML_USE_AMX
-	OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
-endif
-
-# only necessary for the CPU backend files
-MK_CPPFLAGS += -Iggml/src/ggml-cpu
-
-ifdef GGML_RPC
-	MK_CPPFLAGS  += -DGGML_USE_RPC
-	OBJ_GGML_EXT += ggml/src/ggml-rpc.o
-endif # GGML_RPC
-
-OBJ_CUDA_TMPL      = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-mma*.cu))
-OBJ_CUDA_TMPL     += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))
-
-ifdef GGML_CUDA_FA_ALL_QUANTS
-	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*.cu))
-else
-	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
-	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
-	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
-endif # GGML_CUDA_FA_ALL_QUANTS
-
-ifdef GGML_CUDA
-	ifneq ('', '$(wildcard /opt/cuda)')
-		CUDA_PATH ?= /opt/cuda
-	else
-		CUDA_PATH ?= /usr/local/cuda
-	endif
-
-	MK_CPPFLAGS  += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-	MK_NVCCFLAGS += -use_fast_math
-
-	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
-	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
-
-ifdef LLAMA_FATAL_WARNINGS
-	MK_NVCCFLAGS += -Werror all-warnings
-endif # LLAMA_FATAL_WARNINGS
-
-ifndef JETSON_EOL_MODULE_DETECT
-	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
-endif # JETSON_EOL_MODULE_DETECT
-
-ifdef LLAMA_DEBUG
-	MK_NVCCFLAGS += -lineinfo
-endif # LLAMA_DEBUG
-
-ifdef GGML_CUDA_DEBUG
-	MK_NVCCFLAGS += --device-debug
-endif # GGML_CUDA_DEBUG
-
-ifdef GGML_CUDA_NVCC
-	NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
-else
-	NVCC = $(CCACHE) nvcc
-endif # GGML_CUDA_NVCC
-
-ifdef CUDA_DOCKER_ARCH
-	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
-else ifndef CUDA_POWER_ARCH
-	MK_NVCCFLAGS += -arch=native
-endif # CUDA_DOCKER_ARCH
-
-ifdef GGML_CUDA_FORCE_MMQ
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
-endif # GGML_CUDA_FORCE_MMQ
-
-ifdef GGML_CUDA_FORCE_CUBLAS
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
-endif # GGML_CUDA_FORCE_CUBLAS
-
-ifdef GGML_CUDA_F16
-	MK_NVCCFLAGS += -DGGML_CUDA_F16
-endif # GGML_CUDA_F16
-
-ifdef GGML_CUDA_DMMV_F16
-	MK_NVCCFLAGS += -DGGML_CUDA_F16
-endif # GGML_CUDA_DMMV_F16
-
-ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
-endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
-
-ifdef GGML_CUDA_NO_PEER_COPY
-	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
-endif # GGML_CUDA_NO_PEER_COPY
-
-ifdef GGML_CUDA_CCBIN
-	MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
-endif # GGML_CUDA_CCBIN
-
-ifdef GGML_CUDA_NO_FA
-	MK_NVCCFLAGS += -DGGML_CUDA_NO_FA
-endif # GGML_CUDA_NO_FA
-
-ifdef GGML_CUDA_FA_ALL_QUANTS
-	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
-endif # GGML_CUDA_FA_ALL_QUANTS
-
-ifdef JETSON_EOL_MODULE_DETECT
-define NVCC_COMPILE
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-endef # NVCC_COMPILE
-else
-define NVCC_COMPILE
-	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-endef # NVCC_COMPILE
-endif # JETSON_EOL_MODULE_DETECT
-
-ggml/src/ggml-cuda/%.o: \
-	ggml/src/ggml-cuda/%.cu \
-	ggml/include/ggml.h \
-	ggml/src/ggml-common.h \
-	ggml/src/ggml-cuda/common.cuh
-	$(NVCC_COMPILE)
-
-ggml/src/ggml-cuda/ggml-cuda.o: \
-	ggml/src/ggml-cuda/ggml-cuda.cu \
-	ggml/include/ggml-cuda.h \
-	ggml/include/ggml.h \
-	ggml/include/ggml-backend.h \
-	ggml/src/ggml-backend-impl.h \
-	ggml/src/ggml-common.h \
-	$(wildcard ggml/src/ggml-cuda/*.cuh)
-	$(NVCC_COMPILE)
-endif # GGML_CUDA
-
-ifdef GGML_VULKAN
-	MK_CPPFLAGS  += -DGGML_USE_VULKAN
-	MK_LDFLAGS   += $(shell pkg-config --libs vulkan)
-	OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
-
-ifdef GGML_VULKAN_CHECK_RESULTS
-	MK_CPPFLAGS  += -DGGML_VULKAN_CHECK_RESULTS
-endif
-
-ifdef GGML_VULKAN_DEBUG
-	MK_CPPFLAGS  += -DGGML_VULKAN_DEBUG
-endif
-
-ifdef GGML_VULKAN_MEMORY_DEBUG
-	MK_CPPFLAGS  += -DGGML_VULKAN_MEMORY_DEBUG
-endif
-
-ifdef GGML_VULKAN_PERF
-	MK_CPPFLAGS  += -DGGML_VULKAN_PERF
-endif
-
-ifdef GGML_VULKAN_VALIDATE
-	MK_CPPFLAGS  += -DGGML_VULKAN_VALIDATE
-endif
-
-ifdef GGML_VULKAN_RUN_TESTS
-	MK_CPPFLAGS  += -DGGML_VULKAN_RUN_TESTS
-endif
-
-# Vulkan shader generation: compiler, generator binary, generated files, inputs
-GLSLC_CMD  = glslc
-_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
-_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
-_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
-_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders
-# All *.comp shader sources, listed as prerequisites of the generated source.
-# This must be $(wildcard ...): "echo" is not a make function, so $(echo ...)
-# expanded to nothing and edits to shaders never triggered regeneration.
-_ggml_vk_shader_deps = $(wildcard $(_ggml_vk_input_dir)/*.comp)
-
-ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
-	$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
-
-$(_ggml_vk_header): $(_ggml_vk_source)
-
-$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
-	$(_ggml_vk_genshaders_cmd) \
-		--glslc      $(GLSLC_CMD) \
-		--input-dir  $(_ggml_vk_input_dir) \
-		--target-hpp $(_ggml_vk_header) \
-		--target-cpp $(_ggml_vk_source)
-
-vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
-	$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
-
-endif # GGML_VULKAN
-
-ifdef GGML_HIP
-	ifeq ($(wildcard /opt/rocm),)
-		ROCM_PATH      ?= /usr
-		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
-	else
-		ROCM_PATH	?= /opt/rocm
-		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	endif
-
-	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
-
-ifdef GGML_HIP_UMA
-	MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # GGML_HIP_UMA
-
-	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-	MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
-	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
-
-	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
-
-	HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
-
-ifdef GGML_CUDA_FORCE_MMQ
-	HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
-endif # GGML_CUDA_FORCE_MMQ
-
-ifdef GGML_CUDA_FORCE_CUBLAS
-	HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS
-endif # GGML_CUDA_FORCE_CUBLAS
-
-ifdef GGML_CUDA_NO_PEER_COPY
-	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
-endif # GGML_CUDA_NO_PEER_COPY
-
-ifdef GGML_CUDA_NO_FA
-	HIPFLAGS += -DGGML_CUDA_NO_FA
-endif # GGML_CUDA_NO_FA
-
-	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
-	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
-
-ggml/src/ggml-cuda/ggml-cuda.o: \
-	ggml/src/ggml-cuda/ggml-cuda.cu \
-	ggml/include/ggml-cuda.h \
-	ggml/include/ggml.h \
-	ggml/include/ggml-backend.h \
-	ggml/src/ggml-backend-impl.h \
-	ggml/src/ggml-common.h \
-	$(wildcard ggml/src/ggml-cuda/*.cuh)
-	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-
-ggml/src/ggml-cuda/%.o: \
-	ggml/src/ggml-cuda/%.cu \
-	ggml/include/ggml.h \
-	ggml/src/ggml-common.h \
-	ggml/src/ggml-cuda/common.cuh
-	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-endif # GGML_HIP
-
-ifdef GGML_MUSA
-	ifeq ($(wildcard /opt/musa),)
-		MUSA_PATH ?= /usr/local/musa
-	else
-		MUSA_PATH ?= /opt/musa
-	endif
-	MUSA_ARCHITECTURES ?= 21;22;31
-
-	MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
-	MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
-	MK_LDFLAGS += -lmusa -lmusart -lmublas
-
-	ifndef GGML_NO_OPENMP
-		# For Ubuntu Focal
-		MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
-		MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
-		# For Ubuntu Jammy
-		MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
-		MK_LDFLAGS  += -L/usr/lib/llvm-14/lib
-	endif # GGML_NO_OPENMP
-
-	CC  := $(MUSA_PATH)/bin/clang
-	CXX := $(MUSA_PATH)/bin/clang++
-	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
-
-	MUSAFLAGS  = -fsigned-char -x musa -mtgpu
-	MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
-
-ifdef GGML_CUDA_FORCE_MMQ
-	MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
-endif # GGML_CUDA_FORCE_MMQ
-
-ifdef GGML_CUDA_FORCE_CUBLAS
-	MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
-endif # GGML_CUDA_FORCE_CUBLAS
-
-ifdef GGML_CUDA_F16
-	MUSAFLAGS += -DGGML_CUDA_F16
-endif # GGML_CUDA_F16
-
-ifdef GGML_CUDA_DMMV_F16
-	MUSAFLAGS += -DGGML_CUDA_F16
-endif # GGML_CUDA_DMMV_F16
-
-ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
-	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
-else
-	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
-endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
-
-ifdef GGML_CUDA_NO_PEER_COPY
-	MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
-endif # GGML_CUDA_NO_PEER_COPY
-
-ifdef GGML_CUDA_NO_FA
-	MUSAFLAGS += -DGGML_CUDA_NO_FA
-endif # GGML_CUDA_NO_FA
-
-ifdef GGML_CUDA_FA_ALL_QUANTS
-	MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
-endif # GGML_CUDA_FA_ALL_QUANTS
-
-	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
-	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
-
-ggml/src/ggml-cuda/ggml-cuda.o: \
-	ggml/src/ggml-cuda/ggml-cuda.cu \
-	ggml/include/ggml-cuda.h \
-	ggml/include/ggml.h \
-	ggml/include/ggml-backend.h \
-	ggml/src/ggml-backend-impl.h \
-	ggml/src/ggml-common.h \
-	$(wildcard ggml/src/ggml-cuda/*.cuh)
-	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
-
-ggml/src/ggml-cuda/%.o: \
-	ggml/src/ggml-cuda/%.cu \
-	ggml/include/ggml.h \
-	ggml/src/ggml-common.h \
-	ggml/src/ggml-cuda/common.cuh
-	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
-endif # GGML_MUSA
-
-ifdef GGML_METAL
-	MK_CPPFLAGS  += -DGGML_USE_METAL
-	MK_LDFLAGS   += -framework Foundation -framework Metal -framework MetalKit
-	OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o
-
-ifdef GGML_METAL_USE_BF16
-	MK_CPPFLAGS += -DGGML_METAL_USE_BF16
-endif # GGML_METAL_USE_BF16
-ifdef GGML_METAL_NDEBUG
-	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
-endif
-ifdef GGML_METAL_EMBED_LIBRARY
-	MK_CPPFLAGS  += -DGGML_METAL_EMBED_LIBRARY
-	OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o
-endif
-endif # GGML_METAL
-
-ifdef GGML_METAL
-ggml/src/ggml-metal/ggml-metal.o: \
-	ggml/src/ggml-metal/ggml-metal.m \
-	ggml/src/ggml-metal/ggml-metal-impl.h \
-	ggml/include/ggml-metal.h \
-	ggml/include/ggml.h
-	$(CC) $(CFLAGS) -c $< -o $@
-
-ifdef GGML_METAL_EMBED_LIBRARY
-# Build an object file that embeds the Metal shader source as data:
-#   1. sed reads ggml-common.h in at the __embed_ggml-common.h__ marker line and
-#      deletes the marker, writing ggml-metal-embed.metal.tmp
-#   2. sed does the same for the '#include "ggml-metal-impl.h"' line, writing
-#      ggml-metal-embed.metal
-#   3. an assembly stub is generated that .incbin's that file between the
-#      _ggml_metallib_start and _ggml_metallib_end symbols, then compiled to $@
-# The stub is written into a fresh mktemp -d directory (TEMP_ASSEMBLY), which is
-# removed at the end; ${TEMP_ASSEMBLY} and $(TEMP_ASSEMBLY) are the same make variable.
-# NOTE(review): the ggml-metal-embed.metal and .metal.tmp intermediates are not
-# deleted by this recipe.
-ggml/src/ggml-metal-embed.o: \
-	ggml/src/ggml-metal/ggml-metal.metal \
-	ggml/src/ggml-metal/ggml-metal-impl.h \
-	ggml/src/ggml-common.h
-	@echo "Embedding Metal library"
-	@sed -e '/__embed_ggml-common.h__/r      ggml/src/ggml-common.h'                -e '/__embed_ggml-common.h__/d'      < ggml/src/ggml-metal/ggml-metal.metal           > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
-	@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
-	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
-	@echo ".section __DATA, __ggml_metallib"                       >  $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo ".globl _ggml_metallib_start"                            >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo "_ggml_metallib_start:"                                  >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo ".globl _ggml_metallib_end"                              >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	@echo "_ggml_metallib_end:"                                    >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-	$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
-	@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
-	@rmdir ${TEMP_ASSEMBLY}
-endif
-endif # GGML_METAL
-
-DIR_GGML = ggml
-DIR_LLAMA = src
-DIR_COMMON = common
-
-OBJ_GGML = \
-	$(DIR_GGML)/src/ggml.o \
-	$(DIR_GGML)/src/ggml-alloc.o \
-	$(DIR_GGML)/src/ggml-backend.o \
-	$(DIR_GGML)/src/ggml-backend-reg.o \
-	$(DIR_GGML)/src/ggml-opt.o \
-	$(DIR_GGML)/src/ggml-quants.o \
-	$(DIR_GGML)/src/ggml-threading.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
-	$(OBJ_GGML_EXT)
-
-OBJ_LLAMA = \
-	$(DIR_LLAMA)/llama.o \
-	$(DIR_LLAMA)/llama-vocab.o \
-	$(DIR_LLAMA)/llama-grammar.o \
-	$(DIR_LLAMA)/llama-sampling.o \
-	$(DIR_LLAMA)/unicode.o \
-	$(DIR_LLAMA)/unicode-data.o
-
-OBJ_COMMON = \
-	$(DIR_COMMON)/common.o \
-	$(DIR_COMMON)/arg.o \
-	$(DIR_COMMON)/log.o \
-	$(DIR_COMMON)/console.o \
-	$(DIR_COMMON)/ngram-cache.o \
-	$(DIR_COMMON)/sampling.o \
-	$(DIR_COMMON)/speculative.o \
-	$(DIR_COMMON)/chat.o \
-	$(DIR_COMMON)/build-info.o \
-	$(DIR_COMMON)/json-schema-to-grammar.o
-
-OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
-
-LIB_GGML   = $(LIB_PRE)ggml$(DSO_EXT)
-LIB_GGML_S = $(LIB_PRE)ggml.a
-
-LIB_LLAMA   = $(LIB_PRE)llama$(DSO_EXT)
-LIB_LLAMA_S = $(LIB_PRE)llama.a
-
-LIB_COMMON   = $(LIB_PRE)common$(DSO_EXT)
-LIB_COMMON_S = $(LIB_PRE)common.a
-
-LIB_ALL   = $(LIB_GGML)   $(LIB_LLAMA)   $(LIB_COMMON)
-LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S)
-
-GF_CC := $(CC)
-include scripts/get-flags.mk
-
-# combine build flags with cmdline overrides
-override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
-override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
-override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
-override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
-
-# identify CUDA host compiler
-ifdef GGML_CUDA
-GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
-include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
-endif
-
-ifdef LLAMA_CURL
-override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
-override LDFLAGS  := $(LDFLAGS) -lcurl
-endif
-
-#
-# Print build information
-#
-
-$(info I llama.cpp build info: )
-$(info I UNAME_S:   $(UNAME_S))
-$(info I UNAME_P:   $(UNAME_P))
-$(info I UNAME_M:   $(UNAME_M))
-$(info I CFLAGS:    $(CFLAGS))
-$(info I CXXFLAGS:  $(CXXFLAGS))
-$(info I NVCCFLAGS: $(NVCCFLAGS))
-$(info I LDFLAGS:   $(LDFLAGS))
-$(info I CC:        $(shell $(CC)   --version | head -n 1))
-$(info I CXX:       $(shell $(CXX)  --version | head -n 1))
-ifdef GGML_CUDA
-$(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
-# Pull the MAJOR.MINOR number that follows "release " out of `nvcc --version`
-# (\K drops the "release " prefix from the match; relies on grep -P support)
-CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
-# awk prints 1 when the detected version compares below 11.7
-# NOTE(review): if the version cannot be parsed CUDA_VERSION is empty - confirm
-# how the awk comparison behaves in that case
-ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
-
-# With neither variable set, the CUDA section above falls back to -arch=native;
-# on these older toolkits the build stops here and asks for an explicit target instead
-ifndef CUDA_DOCKER_ARCH
-ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
-endif # CUDA_POWER_ARCH
-endif # CUDA_DOCKER_ARCH
-
-endif # CUDA_VERSION < 11.7 (awk comparison)
-endif # GGML_CUDA
-$(info )
-
-ifdef DEPRECATE_WARNING
-$(info !!! DEPRECATION WARNING !!!)
-$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
-$(info   - LLAMA_CUDA)
-$(info   - LLAMA_METAL)
-$(info   - LLAMA_METAL_EMBED_LIBRARY)
-$(info   - LLAMA_OPENMP)
-$(info   - LLAMA_RPC)
-$(info   - LLAMA_SYCL)
-$(info   - LLAMA_SYCL_F16)
-$(info   - LLAMA_OPENBLAS)
-$(info   - LLAMA_OPENBLAS64)
-$(info   - LLAMA_BLIS)
-$(info   - LLAMA_NO_LLAMAFILE)
-$(info   - LLAMA_NO_ACCELERATE)
-$(info   - LLAMA_NO_OPENMP)
-$(info   - LLAMA_NO_METAL)
-$(info   - LLAMA_NO_CCACHE)
-$(info )
-endif
-
-ifdef REMOVE_WARNING
-$(info !!! REMOVAL WARNING !!!)
-$(info The following LLAMA_ options have been removed and are no longer supported)
-$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggml-org/llama.cpp/pull/9418))
-$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418))
-$(info )
-endif
-
-#
-# Build libraries
-#
-
-# Libraries: shared (.so) and static (.a) output names for ggml, llama and common
-# NOTE(review): these six variables were already defined earlier in this Makefile
-# from $(LIB_PRE) and $(DSO_EXT); the assignments below replace those values with
-# fixed lib*.so / lib*.a names, so the Windows prefix/extension selection has no
-# effect on them - confirm whether that is intended.
-LIB_GGML   = libggml.so
-LIB_GGML_S = libggml.a
-
-LIB_LLAMA   = libllama.so
-LIB_LLAMA_S = libllama.a
-
-LIB_COMMON   = libcommon.so
-LIB_COMMON_S = libcommon.a
-
-# Targets: every shared and static library is built by the default "all" goal
-BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
-
-# Dependency files
-DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
-
-# Default target
-all: $(BUILD_TARGETS)
-
-# force c++ build for source file that have same name as c file
-# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
-$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
-	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
-
-# Rules for building object files
-$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
-	$(CC) $(CFLAGS) -MMD -c $< -o $@
-
-$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp
-	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
-
-$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp
-	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
-
-$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp
-	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
-
-# Rules for building libraries
-$(LIB_GGML): $(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
-
-$(LIB_GGML_S): $(OBJ_GGML)
-	ar rcs $(LIB_GGML_S) $^
-
-$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
-
-$(LIB_LLAMA_S): $(OBJ_LLAMA)
-	ar rcs $(LIB_LLAMA_S) $^
-
-$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
-
-$(LIB_COMMON_S): $(OBJ_COMMON)
-	ar rcs $(LIB_COMMON_S) $^
-
-# Include dependency files
--include $(DEP_FILES)
-
-# Clean generated server assets
-clean-server-assets:
-	find examples/server -type f -name "*.js.hpp"   -delete
-	find examples/server -type f -name "*.mjs.hpp"  -delete
-	find examples/server -type f -name "*.css.hpp"  -delete
-	find examples/server -type f -name "*.html.hpp" -delete
-
-# Clean rule
-clean: clean-server-assets
-	rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
-	rm -rvf *.a *.dll *.so *.dot
-	find ggml src common tests examples pocs -type f -name "*.o" -delete
-	find ggml src common tests examples pocs -type f -name "*.d" -delete
-
-#
-# Examples
-#
-
-# $< is the first prerequisite, i.e. the source file.
-# Explicitly compile this to an object file so that it can be cached with ccache.
-# The source file is then filtered out from $^ (the list of all prerequisites) and the object file is added instead.
-
-# Helper function that replaces .c, .cpp, and .cu file endings with .o:
-GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
-
-llama-cli: examples/main/main.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-	@echo
-	@echo '====  Run ./llama-cli -h for help.  ===='
-	@echo
-
-llama-infill: examples/infill/infill.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-run: examples/run/run.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-simple: examples/simple/simple.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-simple-chat: examples/simple-chat/simple-chat.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-tokenize: examples/tokenize/tokenize.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-batched: examples/batched/batched.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-batched-bench: examples/batched-bench/batched-bench.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-quantize: examples/quantize/quantize.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-perplexity: examples/perplexity/perplexity.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-imatrix: examples/imatrix/imatrix.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-embedding: examples/embedding/embedding.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-gritlm: examples/gritlm/gritlm.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-save-load-state: examples/save-load-state/save-load-state.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-gguf: examples/gguf/gguf.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-examples/gguf-hash/deps/sha1/sha1.o: \
-	examples/gguf-hash/deps/sha1/sha1.c
-	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
-
-examples/gguf-hash/deps/xxhash/xxhash.o: \
-	examples/gguf-hash/deps/xxhash/xxhash.c
-	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
-
-examples/gguf-hash/deps/sha256/sha256.o: \
-	examples/gguf-hash/deps/sha256/sha256.c
-	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
-
-llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-gguf-split: examples/gguf-split/gguf-split.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-eval-callback: examples/eval-callback/eval-callback.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-bench: examples/llama-bench/llama-bench.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-export-lora: examples/export-lora/export-lora.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-retrieval: examples/retrieval/retrieval.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-speculative: examples/speculative/speculative.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-parallel: examples/parallel/parallel.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookahead: examples/lookahead/lookahead.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookup: examples/lookup/lookup.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookup-create: examples/lookup/lookup-create.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookup-merge: examples/lookup/lookup-merge.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-lookup-stats: examples/lookup/lookup-stats.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-passkey: examples/passkey/passkey.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-ifdef GGML_RPC
-rpc-server: examples/rpc/rpc-server.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif # GGML_RPC
-
-llama-server: \
-	examples/server/server.cpp \
-	examples/server/utils.hpp \
-	examples/server/httplib.h \
-	examples/server/index.html.hpp \
-	examples/server/loading.html.hpp \
-	common/chat.cpp \
-	common/chat.h \
-	common/chat-template.hpp \
-	common/json.hpp \
-	common/minja.hpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
-
-# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% FORCE Makefile
-	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
-		echo "unsigned char $${NAME}[] = {" && \
-		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
-		echo "};" && \
-		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
-	) > $@
-
-llama-gen-docs: examples/gen-docs/gen-docs.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-libllava.a: examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
-	common/stb_image.h \
-	common/base64.hpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
-
-llama-llava-cli: examples/llava/llava-cli.cpp \
-	examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
-
-llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
-	examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
-
-llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
-	examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
-
-ifeq ($(UNAME_S),Darwin)
-swift: examples/batched.swift
-	(cd examples/batched.swift; make build)
-endif
-
-common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
-	@sh scripts/build-info.sh "$(CC)" > $@.tmp
-	@if ! cmp -s $@.tmp $@; then \
-		mv $@.tmp $@; \
-	else \
-		rm $@.tmp; \
-	fi
-
-common/build-info.o: common/build-info.cpp
-	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
-
-#
-# Tests
-#
-
-tests: $(TEST_TARGETS)
-
-tests/test-arg-parser: tests/test-arg-parser.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-llama-grammar: tests/test-llama-grammar.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-log: tests/test-log.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-grammar-parser: tests/test-grammar-parser.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-grammar-integration: tests/test-grammar-integration.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-double-float: tests/test-double-float.cpp
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-chat: tests/test-chat.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-opt: tests/test-opt.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-quantize-fns: tests/test-quantize-fns.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-quantize-perf: tests/test-quantize-perf.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-sampling: tests/test-sampling.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0: tests/test-tokenizer-0.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-c.o: tests/test-c.c include/llama.h
-	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
-
-tests/test-backend-ops: tests/test-backend-ops.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-model-load-cancel: tests/test-model-load-cancel.cpp tests/get-model.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-autorelease: tests/test-autorelease.cpp tests/get-model.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-chat-template: tests/test-chat-template.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-#
-# PoCs
-#
-
-llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-#
-# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
-#
-# Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: FORCE main quantize perplexity embedding server
-
-# Define the object file target
-examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
-#  Eventually we will want to remove these target from building all the time.
-main: examples/deprecation-warning/deprecation-warning.o
-	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
-	@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
-
-server: examples/deprecation-warning/deprecation-warning.o
-	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
-	@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
-
-quantize: examples/deprecation-warning/deprecation-warning.o
-ifneq (,$(wildcard quantize))
-	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
-	@echo "#########"
-	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
-	@echo "  Remove the 'quantize' binary to remove this warning."
-	@echo "#########"
-endif
-
-perplexity: examples/deprecation-warning/deprecation-warning.o
-ifneq (,$(wildcard perplexity))
-	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
-	@echo "#########"
-	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
-	@echo "  Remove the 'perplexity' binary to remove this warning."
-	@echo "#########"
-endif
-
-embedding: examples/deprecation-warning/deprecation-warning.o
-ifneq (,$(wildcard embedding))
-	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
-	@echo "#########"
-	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
-	@echo "  Remove the 'embedding' binary to remove this warning."
-	@echo "#########"
-endif
+# Makefile for Cortex llamacpp engine - Build, Code Sign, and Package
+
+CMAKE_EXTRA_FLAGS ?= ""
+RUN_TESTS ?= false
+LLM_MODEL_URL ?= "https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf"
+EMBEDDING_MODEL_URL ?= "https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf"
+CODE_SIGN ?= false
+AZURE_KEY_VAULT_URI ?= xxxx
+AZURE_CLIENT_ID ?= xxxx
+AZURE_TENANT_ID ?= xxxx
+AZURE_CLIENT_SECRET ?= xxxx
+AZURE_CERT_NAME ?= xxxx
+DEVELOPER_ID ?= xxxx
+
+# Default target: prints a usage hint and builds nothing.
+all:
+	@echo "Specify a target to run"
+
+# Build the Cortex engine: configure CMake into ./build (always passing
+# -DLLAMA_BUILD_TESTS=OFF after CMAKE_EXTRA_FLAGS), then compile only llama-server.
+# Windows runs the commands through PowerShell; every other host shares one branch.
+.PHONY: build-lib
+build-lib:
+ifeq ($(OS),Windows_NT)
+	@powershell -Command "cmake -B build $(CMAKE_EXTRA_FLAGS) -DLLAMA_BUILD_TESTS=OFF;"
+	@powershell -Command "cmake --build build --config Release -j4 --target llama-server;"
+else
+	@cmake -B build $(CMAKE_EXTRA_FLAGS) -DLLAMA_BUILD_TESTS=OFF
+	@cmake --build build --config Release -j4 --target llama-server
+endif
+
+# Sign release binaries unless CODE_SIGN is "false". The skip is a branch of the
+# conditional chain because an `exit 0` recipe line runs in its own shell and
+# cannot stop later lines. NOTE(review): macOS signs ./llama, not build/bin - confirm.
+.PHONY: codesign
+codesign:
+ifeq ($(CODE_SIGN),false)
+	@echo "Skipping Code Sign"
+else ifeq ($(OS),Windows_NT)
+	@powershell -Command "dotnet tool install --global AzureSignTool;"
+	@powershell -Command 'azuresigntool.exe sign -kvu "$(AZURE_KEY_VAULT_URI)" -kvi "$(AZURE_CLIENT_ID)" -kvt "$(AZURE_TENANT_ID)" -kvs "$(AZURE_CLIENT_SECRET)" -kvc "$(AZURE_CERT_NAME)" -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\bin\llama-server.exe";'
+else ifeq ($(shell uname -s),Linux)
+	@echo "Skipping Code Sign for linux"
+else
+	find "llama" -type f -exec codesign --force -s "$(DEVELOPER_ID)" --options=runtime {} \;
+endif
+
+# Bundle build/bin into llama.tar.gz (7z via PowerShell on Windows, tar elsewhere).
+.PHONY: package
+package:
+ifeq ($(OS),Windows_NT)
+	@powershell -Command "7z a -ttar temp.tar build\bin\*; 7z a -tgzip llama.tar.gz temp.tar;"
+else
+	@tar -czvf llama.tar.gz build/bin
+endif
\ No newline at end of file

From 4d24ef9204611e81623522104f00cc17d4e67467 Mon Sep 17 00:00:00 2001
From: sangjanai <sang@janai>
Date: Fri, 14 Mar 2025 13:14:01 +0700
Subject: [PATCH 2/8] chore: add upstream sync workflow

---
 .github/workflows/upstream-sync.yml | 45 +++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 .github/workflows/upstream-sync.yml

diff --git a/.github/workflows/upstream-sync.yml b/.github/workflows/upstream-sync.yml
new file mode 100644
index 0000000000000..7f31092cf2ed9
--- /dev/null
+++ b/.github/workflows/upstream-sync.yml
@@ -0,0 +1,45 @@
+name: Sync with Latest Release
+
+on:
+  schedule:
+    - cron: '0 0 * * *'  # Every day at 00:00 UTC
+  workflow_dispatch:     # Also allow on-demand manual runs
+
+jobs:
+  sync-with-release:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout repo
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0  # Full history: later steps run git describe, rebase and rev-list --count
+
+    - name: Add upstream remote
+      run: |
+        git remote add upstream https://github.com/ggml-org/llama.cpp.git
+        git fetch upstream --tags  # Tags included: the sync step picks the release via git describe --tags
+
+    - name: Sync master with latest release
+      run: |
+        LATEST_RELEASE=$(git describe --tags --abbrev=0 upstream/master)
+        # -B creates or resets local master in one step, so this works even when no local master exists yet
+        git checkout -B master "$LATEST_RELEASE"
+        git push origin master --force
+
+    - name: Rebase dev onto master
+      run: |
+        git checkout dev
+        # Replaying commits needs a committer identity; set one for this command
+        # only, since a fresh runner has no user.name/user.email configured.
+        if ! git -c user.name="github-actions[bot]" -c user.email="41898282+github-actions[bot]@users.noreply.github.com" rebase master; then
+          echo "Rebase conflict detected, aborting"; git rebase --abort; exit 1
+        fi
+        git push origin dev --force-with-lease
+
+    - name: Create version tag
+      run: |
+        TAG="b$(git rev-list --count master)"
+        # Check origin, not the local repo: fetched upstream tags may reuse this name.
+        if git ls-remote --exit-code --tags origin "refs/tags/${TAG}" >/dev/null; then echo "Tag ${TAG} already on origin, skipping"; exit 0; fi
+        git tag -f "${TAG}" dev
+        git push origin tag "${TAG}"

From 345ad772b4b990f040151dfc5e8c575dfdc61939 Mon Sep 17 00:00:00 2001
From: sangjanai <sang@janai>
Date: Fri, 14 Mar 2025 14:20:59 +0700
Subject: [PATCH 3/8] chore: disable some variants

---
 .github/workflows/quality-gate.yml | 128 ++++++++++++++---------------
 1 file changed, 64 insertions(+), 64 deletions(-)

diff --git a/.github/workflows/quality-gate.yml b/.github/workflows/quality-gate.yml
index b9244f74ae0fa..25970187a6b6f 100644
--- a/.github/workflows/quality-gate.yml
+++ b/.github/workflows/quality-gate.yml
@@ -69,14 +69,14 @@ jobs:
             vulkan: false
             ccache: true
             ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "vulkan-x64"
-            runs-on: "ubuntu-22-04"
-            cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: true
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "vulkan-x64"
+          #   runs-on: "ubuntu-22-04"
+          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: true
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
           - os: "linux"
             name: "noavx-cuda-cu11.7-x64"
             runs-on: "ubuntu-20-04-cuda-11-7"
@@ -141,62 +141,62 @@ jobs:
             vulkan: false
             ccache: true
             ccache-dir: "/home/runner/.ccache"
-          - os: "macos"
-            name: "x64"
-            runs-on: "macos-selfhosted-12"
-            cmake-flags: "-DGGML_METAL=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "macos"
-            name: "arm64"
-            runs-on: "macos-selfhosted-12-arm64"
-            cmake-flags: "-DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx2-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "noavx-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx512-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "vulkan-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: true
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "macos"
+          #   name: "x64"
+          #   runs-on: "macos-selfhosted-12"
+          #   cmake-flags: "-DGGML_METAL=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "macos"
+          #   name: "arm64"
+          #   runs-on: "macos-selfhosted-12-arm64"
+          #   cmake-flags: "-DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "avx2-x64"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "noavx-x64"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "avx-x64"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "avx512-x64"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "vulkan-x64"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: true
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
           - os: "win"
             name: "noavx-cuda-cu12.0-x64"
             runs-on: "windows-cuda-12-0"

From a657ba9421953e238e3ec71974483ec261bf7997 Mon Sep 17 00:00:00 2001
From: sangjanai <sang@janai>
Date: Fri, 14 Mar 2025 14:20:59 +0700
Subject: [PATCH 4/8] chore: disable some variants

---
 .github/workflows/quality-gate.yml | 128 ++++++++++++++---------------
 1 file changed, 64 insertions(+), 64 deletions(-)

diff --git a/.github/workflows/quality-gate.yml b/.github/workflows/quality-gate.yml
index b9244f74ae0fa..25970187a6b6f 100644
--- a/.github/workflows/quality-gate.yml
+++ b/.github/workflows/quality-gate.yml
@@ -69,14 +69,14 @@ jobs:
             vulkan: false
             ccache: true
             ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "vulkan-x64"
-            runs-on: "ubuntu-22-04"
-            cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: true
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
+          # - os: "linux"
+          #   name: "vulkan-x64"
+          #   runs-on: "ubuntu-22-04"
+          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+          #   run-e2e: false
+          #   vulkan: true
+          #   ccache: true
+          #   ccache-dir: "/home/runner/.ccache"
           - os: "linux"
             name: "noavx-cuda-cu11.7-x64"
             runs-on: "ubuntu-20-04-cuda-11-7"
@@ -141,62 +141,62 @@ jobs:
             vulkan: false
             ccache: true
             ccache-dir: "/home/runner/.ccache"
-          - os: "macos"
-            name: "x64"
-            runs-on: "macos-selfhosted-12"
-            cmake-flags: "-DGGML_METAL=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "macos"
-            name: "arm64"
-            runs-on: "macos-selfhosted-12-arm64"
-            cmake-flags: "-DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx2-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "noavx-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx512-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "vulkan-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-            run-e2e: false
-            vulkan: true
-            ccache: false
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "macos"
+          #   name: "x64"
+          #   runs-on: "macos-selfhosted-12"
+          #   cmake-flags: "-DGGML_METAL=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "macos"
+          #   name: "arm64"
+          #   runs-on: "macos-selfhosted-12-arm64"
+          #   cmake-flags: "-DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "avx2-x64"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "noavx-x64"
+          #   runs-on: "windows-cuda-12-0"
+          #   cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "avx-x64"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "avx512-x64"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: false
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          # - os: "win"
+          #   name: "vulkan-x64"
+          #   runs-on: "windows-cuda-11-7"
+          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
+          #   run-e2e: false
+          #   vulkan: true
+          #   ccache: false
+          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
           - os: "win"
             name: "noavx-cuda-cu12.0-x64"
             runs-on: "windows-cuda-12-0"

From 68bf601478a5bac0c1540e562017078faf9d3411 Mon Sep 17 00:00:00 2001
From: sangjanai <sang@janai>
Date: Wed, 19 Mar 2025 12:52:07 +0700
Subject: [PATCH 5/8] chore: disable more variants

---
 .github/workflows/build.yml        |  82 +----
 .github/workflows/quality-gate.yml | 472 -----------------------------
 2 files changed, 1 insertion(+), 553 deletions(-)
 delete mode 100644 .github/workflows/quality-gate.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c9fb5a041680b..29dbd62bf68a2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -62,22 +62,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - os: "linux"
-            name: "arm64"
-            runs-on: "ubuntu-2004-arm64"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx2-x64"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
           - os: "linux"
             name: "noavx-x64"
             runs-on: "ubuntu-20-04"
@@ -102,14 +86,6 @@ jobs:
             vulkan: false
             ccache: true
             ccache-dir: "/home/runner/.ccache"
-          # - os: "linux"
-          #   name: "vulkan-x64"
-          #   runs-on: "ubuntu-22-04"
-          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-          #   run-e2e: false
-          #   vulkan: true
-          #   ccache: true
-          #   ccache-dir: "/home/runner/.ccache"
           - os: "linux"
             name: "noavx-cuda-cu11.7-x64"
             runs-on: "ubuntu-20-04-cuda-11-7"
@@ -173,63 +149,7 @@ jobs:
             run-e2e: false
             vulkan: false
             ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          # - os: "macos"
-          #   name: "x64"
-          #   runs-on: "macos-selfhosted-12"
-          #   cmake-flags: "-DGGML_METAL=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "macos"
-          #   name: "arm64"
-          #   runs-on: "macos-selfhosted-12-arm64"
-          #   cmake-flags: "-DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "avx2-x64"
-          #   runs-on: "windows-cuda-12-0"
-          #   cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "noavx-x64"
-          #   runs-on: "windows-cuda-12-0"
-          #   cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "avx-x64"
-          #   runs-on: "windows-cuda-11-7"
-          #   cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "avx512-x64"
-          #   runs-on: "windows-cuda-11-7"
-          #   cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "vulkan-x64"
-          #   runs-on: "windows-cuda-11-7"
-          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: true
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+            ccache-dir: "/home/runner/.ccache"          
           - os: "win"
             name: "noavx-cuda-cu12.0-x64"
             runs-on: "windows-cuda-12-0"
diff --git a/.github/workflows/quality-gate.yml b/.github/workflows/quality-gate.yml
deleted file mode 100644
index 25970187a6b6f..0000000000000
--- a/.github/workflows/quality-gate.yml
+++ /dev/null
@@ -1,472 +0,0 @@
-name: CI Quality Gate
-on:
-  pull_request:
-    types: [opened, synchronize, reopened]
-  workflow_call:
-    secrets:
-      MINIO_BUCKET_NAME:
-        required: false
-      MINIO_REGION:
-        required: false
-      MINIO_ENDPOINT:
-        required: false
-      MINIO_ACCESS_KEY_ID:
-        required: false
-      MINIO_SECRET_ACCESS_KEY:
-        required: false
-
-env:
-  LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
-  EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf
-  VULKAN_VERSION: 1.3.261.1
-
-jobs:
-  build-and-test:
-    runs-on: ${{ matrix.runs-on }}
-    if: ${{ ! startsWith(github.head_ref, 'update-submodule') }}
-    timeout-minutes: 90
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - os: "linux"
-            name: "arm64"
-            runs-on: "ubuntu-2004-arm64"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx2-x64"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "noavx-x64"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx-x64"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx512-x64"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          # - os: "linux"
-          #   name: "vulkan-x64"
-          #   runs-on: "ubuntu-22-04"
-          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-          #   run-e2e: false
-          #   vulkan: true
-          #   ccache: true
-          #   ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "noavx-cuda-cu11.7-x64"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx2-cuda-cu11.7-x64"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx-cuda-cu11.7-x64"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx512-cuda-cu11.7-x64"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "noavx-cuda-cu12.0-x64"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx2-cuda-cu12.0-x64"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx-cuda-cu12.0-x64"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx512-cuda-cu12.0-x64"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          # - os: "macos"
-          #   name: "x64"
-          #   runs-on: "macos-selfhosted-12"
-          #   cmake-flags: "-DGGML_METAL=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "macos"
-          #   name: "arm64"
-          #   runs-on: "macos-selfhosted-12-arm64"
-          #   cmake-flags: "-DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "avx2-x64"
-          #   runs-on: "windows-cuda-12-0"
-          #   cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "noavx-x64"
-          #   runs-on: "windows-cuda-12-0"
-          #   cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "avx-x64"
-          #   runs-on: "windows-cuda-11-7"
-          #   cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "avx512-x64"
-          #   runs-on: "windows-cuda-11-7"
-          #   cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: false
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          # - os: "win"
-          #   name: "vulkan-x64"
-          #   runs-on: "windows-cuda-11-7"
-          #   cmake-flags: "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
-          #   run-e2e: false
-          #   vulkan: true
-          #   ccache: false
-          #   ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "noavx-cuda-cu12.0-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx2-cuda-cu12.0-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx-cuda-cu12.0-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx512-cuda-cu12.0-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "noavx-cuda-cu11.7-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx2-cuda-cu11.7-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx-cuda-cu11.7-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx512-cuda-cu11.7-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-
-      # - name: Apply patch file
-      #   run: |
-      #     cd llama.cpp
-      #     git apply ../patches/0001-Add-API-query-buffer-size.patch
-
-      - name: use python for linux
-        continue-on-error: true
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-
-      - name: Install tools on Windows
-        if: runner.os == 'Windows'
-        run: |
-          choco install ccache awscli make ccache ninja -y
-
-      - name: Install tools on Linux
-        if: runner.os == 'Linux'
-        run: |
-          sudo apt-get install -y ninja-build
-          python3 -m pip install awscli
-          if [ "${{ matrix.os }}${{ matrix.name }}" == "linuxarm64" ]; then
-            sudo apt-get install -y ccache
-            exit 0
-          fi
-          cd /tmp
-          wget https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz
-          tar -xvf ccache-4.10.2-linux-x86_64.tar.xz
-          sudo cp ccache-4.10.2-linux-x86_64/ccache /usr/bin/ccache
-          ccache -V
-          rm -rf /tmp/ccache-4.10.2-linux-x86_64.tar.xz /tmp/ccache-4.10.2-linux-x86_64
-
-      - name: Download ccache from s3
-        if: runner.os == 'Windows'
-        continue-on-error: true
-        run: |
-          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
-          refreshenv
-          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }}  --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
-        env:
-          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
-          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
-          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
-
-      - name: Download ccache from s3
-        if: runner.os == 'Linux'
-        continue-on-error: true
-        run: |
-          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }}  --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
-        env:
-          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
-          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
-          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
-
-      - name: Install coreutils macos
-        if: runner.os == 'macOS'
-        run: |
-          brew install coreutils
-
-      - name: Prepare Vulkan SDK Linux
-        if: ${{ matrix.vulkan && (matrix.os == 'linux') }}
-        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential vulkan-sdk
-
-      - name: Prepare Vulkan SDK Windows
-        if: ${{ matrix.vulkan && (matrix.os == 'windows') }}
-        continue-on-error: true
-        run: |
-          if (Test-Path C:/VulkanSDK/${env:VULKAN_VERSION}/) { Remove-Item -Path C:/VulkanSDK/${env:VULKAN_VERSION}/ -Force -Recurse }
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
-          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
-          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
-          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
-
-      - name: Add msbuild to PATH
-        if: runner.os == 'Windows'
-        uses: ilammy/msvc-dev-cmd@v1.13.0
-
-      - name: Build
-        id: build-and-test
-        run: |
-          make build-lib CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}"
-
-      - uses: 1arp/create-a-file-action@0.4.5
-        with:
-          path: "build/bin"
-          isAbsolutePath: false
-          file: "version.txt"
-          content: |
-            name: ${{ matrix.os }}-${{ matrix.name }}
-            version: ${{github.event.pull_request.head.sha}}
-
-      - name: Package
-        run: |
-          make package
-
-      # - name: Run e2e testing
-      #   if: ${{ matrix.run-e2e }}
-      #   run: |
-      #     make run-e2e-test LLM_MODEL_URL=${{ env.LLM_MODEL_URL }} EMBEDDING_MODEL_URL=${{ env.EMBEDDING_MODEL_URL }}
-
-      - name: Upload Artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: llama-${{ matrix.os }}-${{ matrix.name }}
-          path: ./build/bin
-
-      - name: Calculate SHA512 Checksum (macOS)
-        if: runner.os == 'macOS'
-        run: |
-          sha512sum ./llama.tar.gz | awk '{ print $1 }' > sha512.txt
-          size=$(stat -f%z ./llama.tar.gz)  # Sử dụng -f%z cho macOS
-          echo "checksum=$(cat sha512.txt)" >> $GITHUB_ENV
-          echo "size=$size" >> $GITHUB_ENV
-
-      - name: Calculate SHA512 Checksum (Windows)
-        if: runner.os == 'Windows'
-        shell: pwsh
-        run: |
-          CertUtil -hashfile ./llama.tar.gz SHA512 | Select-String -Pattern "^[0-9a-fA-F]+$" | Out-File sha512.txt
-          $size = (Get-Item ./llama.tar.gz).length
-          echo "checksum=$(Get-Content sha512.txt)" >> $env:GITHUB_ENV
-          echo "size=$size" >> $env:GITHUB_ENV
-
-      - name: Calculate SHA512 Checksum (Linux)
-        if: runner.os == 'Linux'
-        run: |
-          sha512sum ./llama.tar.gz | awk '{ print $1 }' > sha512.txt
-          size=$(stat -c%s ./llama.tar.gz)
-          echo "checksum=$(cat sha512.txt)" >> $GITHUB_ENV
-          echo "size=$size" >> $GITHUB_ENV
-
-      ## Write for matrix outputs workaround
-      - uses: cloudposse/github-action-matrix-outputs-write@v1
-        id: out
-        with:
-          matrix-step-name: ${{ github.job }}
-          matrix-key: ${{ matrix.os }}-${{ matrix.name }}
-          outputs: |-
-            sha512: ${{ env.checksum }}
-            size: ${{ env.size }}
-
-      - name: Upload ccache to s3
-        continue-on-error: true
-        if: always() && runner.os == 'Windows'
-        run: |
-          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
-          refreshenv
-          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }}  --endpoint ${{ secrets.MINIO_ENDPOINT }}
-        env:
-          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
-          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
-          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
-
-      - name: Upload ccache to s3
-        continue-on-error: true
-        if: always() && runner.os == 'Linux'
-        run: |
-          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }}  --endpoint ${{ secrets.MINIO_ENDPOINT }}
-        env:
-          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
-          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
-          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
-
-  ## Read matrix outputs
-  read:
-    runs-on: ubuntu-latest
-    needs: [build-and-test]
-    steps:
-      - uses: cloudposse/github-action-matrix-outputs-read@v1
-        id: read
-        with:
-          matrix-step-name: build-and-test
-    outputs:
-      result: "${{ steps.read.outputs.result }}"
-
-  create-checksum-file:
-    runs-on: ubuntu-latest
-    needs: [read]
-    steps:
-      - name: Create checksum.yml
-        run: |
-          version="${{github.event.pull_request.head.sha}}"
-          outputs=${{ toJson(needs.read.outputs.result) }}
-
-          echo $outputs
-
-          echo "version: $version" > checksum.yml
-          echo "files:" >> checksum.yml
-
-          echo "$outputs" | jq -r --arg version "$version" '
-            .sha512 as $sha512 |
-            .size as $size |
-            (.sha512 | keys[]) as $key |
-            "- url: llama-\($version)-\($key).tar.gz\n  sha512: >-\n    \($sha512[$key])\n  size: \($size[$key])"
-          ' >> checksum.yml
-
-          cat checksum.yml 
\ No newline at end of file

From b3cbe22997e3258f3d60e3c1144be02a8c6a5c2b Mon Sep 17 00:00:00 2001
From: sangjanai <sang@janai>
Date: Wed, 19 Mar 2025 12:55:52 +0700
Subject: [PATCH 6/8] fix: lint

---
 .github/workflows/upstream-sync.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/upstream-sync.yml b/.github/workflows/upstream-sync.yml
index 81ab17b9f15b5..f40e9983bdebf 100644
--- a/.github/workflows/upstream-sync.yml
+++ b/.github/workflows/upstream-sync.yml
@@ -158,5 +158,4 @@ jobs:
           git tag "$TAG_NAME"
           git push origin "$TAG_NAME"
           echo "Tag $TAG_NAME created successfully"
-        fi
-
+        fi
\ No newline at end of file

From b079fabcd7d10f77d4df39736f9d703c98002a3c Mon Sep 17 00:00:00 2001
From: sangjanai <sang@janai>
Date: Wed, 19 Mar 2025 13:03:21 +0700
Subject: [PATCH 7/8] fix: build

---
 .github/workflows/build.yml | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 29dbd62bf68a2..796782321ca43 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -23,11 +23,6 @@ on:
       ]
   workflow_dispatch:
 
-env:
-  LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
-  EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf
-  VULKAN_VERSION: 1.3.261.1
-
 jobs:
   create-draft-release:
     runs-on: ubuntu-latest
@@ -277,23 +272,6 @@ jobs:
         run: |
           brew install coreutils
 
-      - name: Prepare Vulkan SDK Linux
-        if: ${{ matrix.vulkan && (matrix.os == 'linux') }}
-        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential vulkan-sdk
-    
-      - name: Prepare Vulkan SDK Windows
-        if: ${{ matrix.vulkan && (matrix.os == 'windows') }}
-        continue-on-error: true
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
-          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
-          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
-          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
-
       - name: Get Cer for code signing
         if: runner.os == 'macOS'
         run: base64 -d <<< "$CODE_SIGN_P12_BASE64" > /tmp/codesign.p12
@@ -348,11 +326,6 @@ jobs:
           cat llama/version.txt
           make package
 
-      # - name: Run e2e testing
-      #   if: ${{ matrix.run-e2e }}
-      #   run: |
-      #     make run-e2e-test LLM_MODEL_URL=${{ env.LLM_MODEL_URL }} EMBEDDING_MODEL_URL=${{ env.EMBEDDING_MODEL_URL }}
-
       - name: Upload Artifact
         uses: actions/upload-artifact@v4
         with:

From 7e996f7c595953425178cdff1563bf4b343f4b17 Mon Sep 17 00:00:00 2001
From: sangjanai <sang@janai>
Date: Wed, 19 Mar 2025 13:16:21 +0700
Subject: [PATCH 8/8] chore: build

---
 .github/workflows/build.yml       | 2106 +++++++++++++++++++++++------
 .github/workflows/menlo-build.yml |  523 +++++++
 2 files changed, 2178 insertions(+), 451 deletions(-)
 create mode 100644 .github/workflows/menlo-build.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 796782321ca43..03cde0a48436f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,523 +1,1727 @@
 name: CI
 
 on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
   push:
-    tags: ["b[0-9]+"]
-    paths:
-      [
-        ".github/scripts/**",
-        ".github/workflows/build.yml",
-        "**/CMakeLists.txt",
-        "**/Makefile",
-        "**/*.h",
-        "**/*.hpp",
-        "**/*.c",
-        "**/*.cpp",
-        "**/*.cu",
-        "**/*.cc",
-        "**/*.cxx",
-        "llama.cpp",
-        "!docs/**",
-        "!.gitignore",
-        "!README.md",
-      ]
-  workflow_dispatch:
+    branches:
+      - master
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  contents: write # for creating release
+
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
 
 jobs:
-  create-draft-release:
-    runs-on: ubuntu-latest
-    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-    outputs:
-      upload_url: ${{ steps.create_release.outputs.upload_url }}
-      version: ${{ steps.get_version.outputs.version }}
-    permissions:
-      contents: write
+  macOS-latest-cmake-arm64:
+    runs-on: macos-14
+
     steps:
-      - name: Extract tag name
-        id: get_version
-        run: echo "VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/}"
-        env:
-          GITHUB_REF: ${{ github.ref }}
-      - name: Create Draft Release
-        id: create_release
-        uses: softprops/action-gh-release@v2
-        with:
-          tag_name: ${{ github.ref_name }}
-          token: ${{ secrets.GITHUB_TOKEN }}
-          name: "${{ env.VERSION }}"
-          draft: true
-          generate_release_notes: true
-          prerelease: false
-
-  build-and-test:
-    runs-on: ${{ matrix.runs-on }}
-    needs: [create-draft-release]
-    timeout-minutes: 90
+      - name: Clone  # full history is fetched because the tag step counts commits
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-arm64
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true  # a failed brew update does not fail the job
+        run: |
+          brew update
+
+      - name: Build  # configures with Metal (BF16, embedded library), curl and RPC, then builds
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test  # runs the ctest labels main and curl with a 900 s timeout
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L 'main|curl' --verbose --timeout 900
+
+      - name: Determine tag name  # output "name": b+count on master, else branch-b+count-shorthash
+        id: tag
+        shell: bash  # BRANCH_NAME (workflow-level env) is read as a shell variable, never spliced into the script text
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${BRANCH_NAME}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> "$GITHUB_OUTPUT"
+          else
+            SAFE_NAME=$(echo "${BRANCH_NAME}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Pack artifacts  # zips build/bin together with the two LICENSE files
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}  # master pushes or manual release runs only
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+
+      - name: Upload artifacts  # same condition as the pack step
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+          name: llama-bin-macos-arm64.zip
+
+  macOS-latest-cmake-x64:  # CPU-only macOS build (Metal off) with tests and optional artifact upload
+    runs-on: macos-13
+
+    steps:
+      - name: Clone  # full history is fetched because the tag step counts commits
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-x64
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true  # a failed brew update does not fail the job
+        run: |
+          brew update
+
+      - name: Build  # configures with curl and RPC on, Metal off, then builds
+        id: cmake_build
+        run: |
+          sysctl -a
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test  # runs the ctest label main with a 900 s timeout
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Determine tag name  # output "name": b+count on master, else branch-b+count-shorthash
+        id: tag
+        shell: bash  # BRANCH_NAME (workflow-level env) is read as a shell variable, never spliced into the script text
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${BRANCH_NAME}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> "$GITHUB_OUTPUT"
+          else
+            SAFE_NAME=$(echo "${BRANCH_NAME}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Pack artifacts  # zips build/bin together with the two LICENSE files
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}  # master pushes or manual release runs only
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+
+      - name: Upload artifacts  # same condition as the pack step
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+          name: llama-bin-macos-x64.zip
+
+  ubuntu-cpu-cmake:
     strategy:
-      fail-fast: false
       matrix:
         include:
-          - os: "linux"
-            name: "noavx-x64"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx-x64"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx512-x64"
-            runs-on: "ubuntu-20-04"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "noavx-cuda-cu11.7-x64"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx2-cuda-cu11.7-x64"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx-cuda-cu11.7-x64"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx512-cuda-cu11.7-x64"
-            runs-on: "ubuntu-20-04-cuda-11-7"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "noavx-cuda-cu12.0-x64"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx2-cuda-cu12.0-x64"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx-cuda-cu12.0-x64"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"
-          - os: "linux"
-            name: "avx512-cuda-cu12.0-x64"
-            runs-on: "ubuntu-20-04-cuda-12-0"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: "/home/runner/.ccache"          
-          - os: "win"
-            name: "noavx-cuda-cu12.0-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx2-cuda-cu12.0-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx-cuda-cu12.0-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx512-cuda-cu12.0-x64"
-            runs-on: "windows-cuda-12-0"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "noavx-cuda-cu11.7-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx2-cuda-cu11.7-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx-cuda-cu11.7-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
-          - os: "win"
-            name: "avx512-cuda-cu11.7-x64"
-            runs-on: "windows-cuda-11-7"
-            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
-            run-e2e: false
-            vulkan: false
-            ccache: true
-            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-22.04-arm
+
+    runs-on: ${{ matrix.os }}
 
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
-          submodules: recursive
+          fetch-depth: 0
 
-      - name: use python for linux
-        continue-on-error: true
-        uses: actions/setup-python@v4
+      - name: ccache  # key carries matrix.build so the x64 and arm64 legs keep separate caches
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-cpu-cmake-${{ matrix.build }}
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
+      - name: Build  # configures with curl and RPC enabled, then builds
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test  # runs the ctest labels main and curl with a 900 s timeout
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L 'main|curl' --verbose --timeout 900
+
+      - name: Test llama2c conversion  # downloads a tiny llama2c model, converts it to GGUF and runs llama-cli on it
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+      - name: Determine tag name  # output "name": b+count on master, else branch-b+count-shorthash
+        id: tag
+        shell: bash  # BRANCH_NAME (workflow-level env) is read as a shell variable, never spliced into the script text
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${BRANCH_NAME}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> "$GITHUB_OUTPUT"
+          else
+            SAFE_NAME=$(echo "${BRANCH_NAME}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Pack artifacts  # zips build/bin together with the two LICENSE files
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}  # master pushes or manual release runs only
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
+
+      - name: Upload artifacts  # same condition as the pack step
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
+          name: llama-bin-ubuntu-${{ matrix.build }}.zip
+
+  ubuntu-latest-cmake-sanitizer:  # Debug builds under each sanitizer in the matrix, followed by ctest
+    runs-on: ubuntu-latest
+
+    continue-on-error: true  # a failing sanitizer leg does not fail the workflow run
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]  # substituted into -DLLAMA_SANITIZE_...=ON below
+        build_type: [Debug]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache  # one cache key per sanitizer
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build  # used for ADDRESS and UNDEFINED
+        id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}  # THREAD is handled by the no-OpenMP step below
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Build (no OpenMP)  # same configuration as above plus -DGGML_OPENMP=OFF
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}  # only the THREAD leg takes this path
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=OFF
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Test  # runs the ctest label main with a 900 s timeout
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu-latest-llguidance:  # builds with LLAMA_LLGUIDANCE=ON and runs the main ctest label
+    runs-on: ubuntu-latest
+
+    steps:  # NOTE(review): no ccache step here, unlike the neighbouring jobs - confirm this is intentional
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build  # configures from inside ./build (cmake ..) rather than with -B build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_LLGUIDANCE=ON
+          cmake --build . --config Release -j $(nproc)
+
+      - name: Test  # runs the ctest label main with a 900 s timeout
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu-latest-cmake-rpc:  # builds with GGML_RPC=ON and runs the main ctest label
+    runs-on: ubuntu-latest
+
+    continue-on-error: true  # a failure of this job does not fail the workflow run
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-cmake-rpc
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build  # the only non-default option is GGML_RPC=ON
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test  # NOTE(review): no --timeout here, unlike the other ctest steps in this workflow - confirm
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
+
+  ubuntu-22-cmake-vulkan:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
         with:
-          python-version: '3.10'
+          key: ubuntu-22-cmake-vulkan
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
 
-      - name: Install tools on Windows
-        if: runner.os == 'Windows'
+      - name: Build
+        id: cmake_build
         run: |
-          choco install ccache awscli make ccache ninja -y
+          cmake -B build \
+            -DGGML_VULKAN=ON
+          cmake --build build --config Release -j $(nproc)
 
-      - name: Install tools on Linux
-        if: runner.os == 'Linux'
+      - name: Test
+        id: cmake_test
         run: |
-          sudo apt-get install -y ninja-build
-          python3 -m pip install awscli
+          cd build
+          # This is using llvmpipe and runs slower than other backends
+          ctest -L main --verbose --timeout 2700
 
-          if [ "${{ matrix.os }}${{ matrix.name }}" == "linuxarm64" ]; then
-            sudo apt-get install -y ccache
-            exit 0
+      - name: Determine tag name  # output "name": b+count on master, else branch-b+count-shorthash
+        id: tag
+        shell: bash  # BRANCH_NAME (workflow-level env) is read as a shell variable, never spliced into the script text
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${BRANCH_NAME}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> "$GITHUB_OUTPUT"
+          else
+            SAFE_NAME=$(echo "${BRANCH_NAME}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> "$GITHUB_OUTPUT"
           fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
+          name: llama-bin-ubuntu-vulkan-x64.zip
+
+  ubuntu-22-cmake-hip:  # compile-only HIP job: two builds, no test step
+    runs-on: ubuntu-22.04
+    container: rocm/dev-ubuntu-22.04:6.0.2  # steps run inside this ROCm image
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies  # installs git and cmake as well as the rocBLAS/hipBLAS dev packages
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-hip
+          evict-old-files: 1d
+
+      - name: Build with native CMake HIP support  # HIP compiler is the clang found under hipconfig -l; output in build/
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGGML_HIP=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Build with legacy HIP support  # hipcc as the C and C++ compiler; separate build2/ tree
+        id: cmake_build_legacy_hip
+        run: |
+          cmake -B build2 -S . \
+            -DCMAKE_C_COMPILER=hipcc \
+            -DCMAKE_CXX_COMPILER=hipcc \
+            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGGML_HIP=ON
+          cmake --build build2 --config Release -j $(nproc)
+
+  ubuntu-22-cmake-musa:  # compile-only MUSA job: one build, no test step
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04  # steps run inside this MUSA image
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies  # apt-get is invoked without sudo in this job
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-musa
+          evict-old-files: 1d
+
+      - name: Build with native CMake MUSA support  # the only non-default option is GGML_MUSA=ON
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-22-cmake-sycl:  # compile-only SYCL job using the Intel oneAPI compilers; no test step
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true  # a failure of this job does not fail the workflow run
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: add oneAPI to apt  # registers Intel's signing key and the oneAPI apt repository
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone  # NOTE(review): second checkout in this job; the first step already checks out the repo - likely redundant
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-sycl
+          evict-old-files: 1d
+
+      - name: Build  # sources the oneAPI setvars.sh, then builds with icx/icpx and GGML_SYCL=ON
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-22-cmake-sycl-fp16:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
           cd /tmp
-          wget https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz
-          tar -xvf ccache-4.10.2-linux-x86_64.tar.xz
-          sudo cp ccache-4.10.2-linux-x86_64/ccache /usr/bin/ccache
-          ccache -V
-          rm -rf /tmp/ccache-4.10.2-linux-x86_64.tar.xz /tmp/ccache-4.10.2-linux-x86_64
-
-      - name: Download ccache from s3
-        if: runner.os == 'Windows'
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler  # provides the icx/icpx compilers named in the build step
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone  # NOTE(review): second checkout in this job; the first step already checks out the repo - likely redundant
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-sycl-fp16
+          evict-old-files: 1d
+
+      - name: Build  # same as the plain SYCL job's build, plus -DGGML_SYCL_F16=ON
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DGGML_SYCL_F16=ON
+          cmake --build build --config Release -j $(nproc)
+
+  macOS-latest-cmake-ios:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-ios
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
         continue-on-error: true
         run: |
-          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
-          refreshenv
-          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }}  --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
-        env:
-          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
-          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
-          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+          brew update
 
-      - name: Download ccache from s3
-        if: runner.os == 'Linux'
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=iOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
+  macOS-latest-cmake-tvos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-tvos
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
         continue-on-error: true
         run: |
-          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }}  --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
-        env:
-          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
-          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
-          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+          brew update
 
-      - name: Install coreutils macos
-        if: runner.os == 'macOS'
+      - name: Build
+        id: cmake_build
         run: |
-          brew install coreutils
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=tvOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
-      - name: Get Cer for code signing
-        if: runner.os == 'macOS'
-        run: base64 -d <<< "$CODE_SIGN_P12_BASE64" > /tmp/codesign.p12
-        shell: bash
-        env:
-          CODE_SIGN_P12_BASE64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
-  
-      - uses: apple-actions/import-codesign-certs@v2
+  macOS-latest-swift:  # CMake/Xcode build followed by build-xcframework.sh
+    runs-on: macos-latest
+
+    strategy:
+      matrix:
+        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']  # NOTE(review): no step in this job reads matrix.destination, so the three matrix jobs appear to run identical steps
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-swift  # same key for every matrix entry
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
         continue-on-error: true
-        if: runner.os == 'macOS'
+        run: |
+          brew update
+
+      - name: Build llama.cpp with CMake
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          ./build-xcframework.sh
+
+  windows-msys2:
+    runs-on: windows-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
+          - { sys: CLANG64, env: clang-x86_64, build: Release }
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
         with:
-          p12-file-base64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
-          p12-password: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
+          key: windows-msys2
+          variant: sccache
+          evict-old-files: 1d
 
-      - uses: actions/setup-dotnet@v3
-        if: runner.os == 'Windows'
+      - name: Setup ${{ matrix.sys }}
+        uses: msys2/setup-msys2@v2
         with:
-          dotnet-version: "8.0.x"
+          update: true
+          msystem: ${{matrix.sys}}
+          install: >-
+            base-devel
+            git
+            mingw-w64-${{matrix.env}}-toolchain
+            mingw-w64-${{matrix.env}}-cmake
+            mingw-w64-${{matrix.env}}-openblas
+
+      - name: Build using CMake  # first pass: default configuration, no BLAS options
+        shell: msys2 {0}
+        run: |
+          cmake -B build
+          cmake --build build --config ${{ matrix.build }} -j $(nproc)
+
+      - name: Clean after building using CMake  # drop the tree so the next configure starts fresh
+        shell: msys2 {0}
+        run: |
+          rm -rf build
+
+      - name: Build using CMake w/ OpenBLAS  # second pass: same build with the OpenBLAS backend enabled
+        shell: msys2 {0}
+        run: |
+          cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+          cmake --build build --config ${{ matrix.build }} -j $(nproc)
+
+  windows-latest-cmake:
+    runs-on: windows-latest
 
-      - name: Add msbuild to PATH
-        if: runner.os == 'Windows'
-        uses: ilammy/msvc-dev-cmd@v1.13.0
+    env:
+      OPENBLAS_VERSION: 0.3.23
+      SDE_VERSION: 9.33.0-2024-01-07
+      VULKAN_VERSION: 1.4.304.1
+
+    strategy:
+      matrix:
+        include:
+          - build: 'noavx-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+          - build: 'avx2-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
+          - build: 'avx-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
+          - build: 'avx512-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
+          - build: 'openblas-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+          - build: 'kompute-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
+          - build: 'vulkan-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
+          - build: 'llvm-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
+          - build: 'msvc-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
+          - build: 'llvm-arm64-opencl-adreno'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-${{ matrix.build }}
+          variant: sccache
+          evict-old-files: 1d
+
+      - name: Clone Kompute submodule
+        id: clone_kompute
+        if: ${{ matrix.build == 'kompute-x64' }}
+        run: |
+          git submodule update --init ggml/src/ggml-kompute/kompute
+
+      - name: Download OpenBLAS
+        id: get_openblas
+        if: ${{ matrix.build == 'openblas-x64' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
+          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
+          mkdir $env:RUNNER_TEMP/openblas
+          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
+          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
+
+      - name: Install Vulkan SDK
+        id: get_vulkan
+        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          cmake -B build `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          cmake -B build-arm64-release `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build-arm64-release --target install --config release
 
       - name: Build
-        id: build-and-test
+        id: cmake_build
+        run: |
+          cmake -S . -B build ${{ matrix.defines }}
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Add libopenblas.dll
+        id: add_libopenblas_dll
+        if: ${{ matrix.build == 'openblas-x64' }}
+        run: |
+          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
+          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
+
+      - name: Check AVX512F support  # probes the runner CPU and records the result for the Test steps below
+        id: check_avx512f
+        if: ${{ matrix.build == 'avx512-x64' }}
+        continue-on-error: true  # a broken probe must not fail the job
+        run: |
+          cd build
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $cl =  $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
+          echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c  # exits 0 only when bit 16 (65536) of a[1] from __cpuid leaf 7 is set
+          & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
+          .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"  # HAS_AVX512F=1 is exported only on success; otherwise the variable stays unset
+
+      - name: Test
+        id: cmake_test
+        # not all machines have native AVX-512
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        run: |
+          cd build
+          ctest -L main -C Release --verbose --timeout 900
+
+      - name: Test (Intel SDE)
+        id: cmake_test_sde
+        if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
+          # for some weird reason windows tar doesn't like sde tar.xz
+          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
+          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
+          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
+          cd build
+          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
+          & $sde -future -- ctest -L main -C Release --verbose --timeout 900
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
         run: |
-          make build-lib CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}"
+          BUILD_NUMBER="$(git rev-list --count HEAD)"  # number of commits reachable from HEAD
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${BRANCH_NAME}" == "master" ]]; then  # read from the step environment; a ref name must never be pasted into the script text
+            echo "name=b${BUILD_NUMBER}" >> "$GITHUB_OUTPUT"
+          else
+            SAFE_NAME=$(echo "${BRANCH_NAME}" | tr '/' '-')  # '/' is replaced so the name is usable in file names
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
+          Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
+          name: llama-bin-win-${{ matrix.build }}.zip
+
+  ubuntu-latest-cmake-cuda:
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    steps:
+        - name: Clone
+          id: checkout
+          uses: actions/checkout@v4
+          with:
+            fetch-depth: 0
+
+        - name: Install dependencies
+          env:
+            DEBIAN_FRONTEND: noninteractive
+          run: |
+              apt update
+              apt install -y cmake build-essential ninja-build libgomp1 git
 
-      - uses: 1arp/create-a-file-action@0.4.5
+        - name: ccache
+          uses: hendrikmuhs/ccache-action@v1.2.16
+          with:
+            key: ubuntu-latest-cmake-cuda
+            evict-old-files: 1d
+
+        - name: Build with CMake  # Ninja Release build of the CUDA backend; device code is generated for the single architecture 89-real
+          run: |
+            cmake -S . -B build -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DCMAKE_CUDA_ARCHITECTURES=89-real \
+              -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+              -DLLAMA_FATAL_WARNINGS=ON \
+              -DGGML_NATIVE=OFF \
+              -DGGML_CUDA=ON
+            cmake --build build  # build only; this job has no test step
+
+  windows-2019-cmake-cuda:
+    runs-on: windows-2019
+
+    strategy:
+      matrix:
+        cuda: ['12.4', '11.7']
+        build: ['cuda']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+            fetch-depth: 0
+
+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
         with:
-          path: 'llama'
-          isAbsolutePath: false
-          file: 'version.txt'
-          content: |
-            name: ${{ matrix.os }}-${{ matrix.name }}
-            version: ${{needs.create-draft-release.outputs.version}}
+          key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
+          variant: sccache
+          evict-old-files: 1d
+
+      - name: Install Cuda Toolkit 11.7
+        if: ${{ matrix.cuda == '11.7' }}
+        run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+      - name: Install Cuda Toolkit 12.4
+        if: ${{ matrix.cuda == '12.4' }}
+        run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
 
-      - name: Code Signing macOS
-        if: runner.os == 'macOS'
+      - name: Install Ninja
+        id: install_ninja
         run: |
-          make codesign CODE_SIGN=true DEVELOPER_ID="${{ secrets.DEVELOPER_ID }}"
+          choco install ninja
 
-      - name: Code Signing Windows
-        if: runner.os == 'Windows'
+      - name: Build
+        id: cmake_build
         shell: cmd
         run: |
-          set PATH=%PATH%;%USERPROFILE%\.dotnet\tools
-          make codesign CODE_SIGN=true AZURE_KEY_VAULT_URI="${{ secrets.AZURE_KEY_VAULT_URI }}" AZURE_CLIENT_ID="${{ secrets.AZURE_CLIENT_ID }}" AZURE_TENANT_ID="${{ secrets.AZURE_TENANT_ID }}" AZURE_CLIENT_SECRET="${{ secrets.AZURE_CLIENT_SECRET }}" AZURE_CERT_NAME="${{ secrets.AZURE_CERT_NAME }}"
+          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DLLAMA_BUILD_SERVER=ON ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CUDA=ON ^
+            -DGGML_RPC=ON
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
+          cmake --build build --config Release
+
+      - name: Determine tag name  # exposes steps.tag.outputs.name, used in the artifact file names below
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"  # number of commits reachable from HEAD
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${BRANCH_NAME}" == "master" ]]; then  # read from the step environment; a ref name must never be pasted into the script text
+            echo "name=b${BUILD_NUMBER}" >> "$GITHUB_OUTPUT"
+          else
+            SAFE_NAME=$(echo "${BRANCH_NAME}" | tr '/' '-')  # '/' is replaced so the name is usable in file names
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> "$GITHUB_OUTPUT"
+          fi
 
-      - name: Package
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
-          cat llama/version.txt
-          make package
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
 
-      - name: Upload Artifact
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: actions/upload-artifact@v4
         with:
-          name: llama-${{ matrix.os }}-${{ matrix.name }}
-          path: ./llama
-
-      - name: Calculate SHA512 Checksum (macOS)
-        if: runner.os == 'macOS'
-        run: |
-          sha512sum ./llama.tar.gz | awk '{ print $1 }' > sha512.txt
-          size=$(stat -f%z ./llama.tar.gz)  # Sử dụng -f%z cho macOS
-          echo "checksum=$(cat sha512.txt)" >> $GITHUB_ENV
-          echo "size=$size" >> $GITHUB_ENV
-  
-      - name: Calculate SHA512 Checksum (Windows)
-        if: runner.os == 'Windows'
-        shell: pwsh
-        run: |
-          CertUtil -hashfile ./llama.tar.gz SHA512 | Select-String -Pattern "^[0-9a-fA-F]+$" | Out-File sha512.txt
-          $size = (Get-Item ./llama.tar.gz).length
-          echo "checksum=$(Get-Content sha512.txt)" >> $env:GITHUB_ENV
-          echo "size=$size" >> $env:GITHUB_ENV
-
-      - name: Calculate SHA512 Checksum (Linux)
-        if: runner.os == 'Linux'
-        run: |
-          sha512sum ./llama.tar.gz | awk '{ print $1 }' > sha512.txt
-          size=$(stat -c%s ./llama.tar.gz)
-          echo "checksum=$(cat sha512.txt)" >> $GITHUB_ENV
-          echo "size=$size" >> $GITHUB_ENV
-  
-      ## Write for matrix outputs workaround 
-      - uses: cloudposse/github-action-matrix-outputs-write@v1
-        id: out
-        with:
-          matrix-step-name: ${{ github.job }}
-          matrix-key: ${{ matrix.os }}-${{ matrix.name }}
-          outputs: |-
-            sha512: ${{ env.checksum }}
-            size: ${{ env.size }}
-
-      - uses: actions/upload-release-asset@v1.0.1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+          name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+
+      - name: Copy and pack Cuda runtime  # condition matches "Upload Cuda runtime" so the zip exists whenever the upload runs
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          echo "Cuda install location: ${{ env.CUDA_PATH }}"
+          $dst='.\build\bin\cudart\'
+          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
+
+      - name: Upload Cuda runtime
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
         with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: ./llama.tar.gz
-          asset_name: llama-${{ needs.create-draft-release.outputs.version }}-bin-${{ matrix.os }}-${{ matrix.name }}.tar.gz
-          asset_content_type: application/gzip
+          path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
 
-      - name: Upload ccache to s3
-        continue-on-error: true
-        if: always() && runner.os == 'Windows'
+  windows-latest-cmake-sycl:
+    runs-on: windows-latest
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-sycl
+          variant: sccache
+          evict-old-files: 1d
+
+      - name: Install
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      - name: Build
+        id: cmake_build
+        run:  examples/sycl/win-build-sycl.bat
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
         run: |
-          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
-          refreshenv
-          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }}  --endpoint ${{ secrets.MINIO_ENDPOINT }}
-        env:
-          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
-          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
-          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
 
-      - name: Upload ccache to s3
-        continue-on-error: true
-        if: always() && runner.os == 'Linux'
+      - name: Build the release package
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
-          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }}  --endpoint ${{ secrets.MINIO_ENDPOINT }}
-        env:
-          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
-          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
-          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
 
-      - name: Remove Keychain
-        continue-on-error: true
-        if: always() && runner.os == 'macOS'
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+
+          echo "cp oneAPI running time dll files to ./build/bin done"
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
+
+      - name: Upload the release package
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+          name: llama-bin-win-sycl-x64.zip
+
+  windows-latest-cmake-hip:
+    if: ${{ github.event.inputs.create_release != 'true' }}
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Clone rocWMMA repository
+        id: clone_rocwmma
+        run: |
+          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ${{ github.job }}
+          evict-old-files: 1d
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DGGML_HIP=ON `
+            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+
+  windows-latest-cmake-hip-release:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        gpu_target: [gfx1100, gfx1101, gfx1030]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+            fetch-depth: 0
+
+      - name: Clone rocWMMA repository
+        id: clone_rocwmma
+        run: |
+          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-hip-release
+          evict-old-files: 1d
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
+            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGGML_HIP=ON `
+            -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+          md "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+
+  ios-xcode-build:
+    runs-on: macos-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=iOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
         run: |
-          security delete-keychain signing_temp.keychain
+          ./build-xcframework.sh
+
+      - name: Build Xcode project
+        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
 
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
 
-  ## Read matrix outputs 
-  read:
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
+          name: llama-${{ steps.tag.outputs.name }}-xcframework
+
+  android-build:
     runs-on: ubuntu-latest
-    needs: [build-and-test]
+
     steps:
-      - uses: cloudposse/github-action-matrix-outputs-read@v1
-        id: read
-        with:
-          matrix-step-name: build-and-test
-    outputs:
-        result: "${{ steps.read.outputs.result }}"
-
-  create-checksum-file:
-    runs-on: ubuntu-20-04
-    permissions:
-      contents: write
-    needs: [read, create-draft-release]
+      - name: Clone
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: android-build
+          evict-old-files: 1d
+
+      - name: Set up JDK
+        uses: actions/setup-java@v3
+        with:
+          java-version: 17
+          distribution: zulu
+
+      - name: Setup Android SDK
+        uses: android-actions/setup-android@v3
+        with:
+          log-accepted-android-sdk-licenses: false
+
+      - name: Build
+        run: |
+          cd examples/llama.android
+
+          ./gradlew build --no-daemon
+
+  release:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+
+    runs-on: ubuntu-latest
+
+    needs:
+      - ubuntu-cpu-cmake
+      - ubuntu-22-cmake-vulkan
+      - windows-latest-cmake
+      - windows-2019-cmake-cuda
+      - windows-latest-cmake-sycl
+      - windows-latest-cmake-hip-release
+      - macOS-latest-cmake-arm64
+      - macOS-latest-cmake-x64
+
     steps:
-      - name: Download cuda dependencies from s3 and create checksum
-        run: |
-          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/12.0/linux/cuda.tar.gz -O /tmp/cuda-12-0-linux-amd64.tar.gz
-          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/11.7/linux/cuda.tar.gz -O /tmp/cuda-11-7-linux-amd64.tar.gz
-          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/12.0/windows/cuda.tar.gz -O /tmp/cuda-12-0-windows-amd64.tar.gz
-          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/11.7/windows/cuda.tar.gz -O /tmp/cuda-11-7-windows-amd64.tar.gz
-
-          version=${{ needs.create-draft-release.outputs.version }}
-          outputs=${{ toJson(needs.read.outputs.result) }}
-
-          echo $outputs
-
-          echo "version: $version" > checksum.yml
-          echo "files:" >> checksum.yml
-
-          echo "$outputs" | jq -r --arg version "$version" '
-            .sha512 as $sha512 |
-            .size as $size |
-            (.sha512 | keys[]) as $key |
-            "- url: llama-\($version)-\($key).tar.gz\n  sha512: >-\n    \($sha512[$key])\n  size: \($size[$key])"
-          ' >> checksum.yml
-
-          echo "- url: cuda-12-0-linux-amd64.tar.gz" >> checksum.yml
-          echo "  sha512: >-" >> checksum.yml
-          echo "    $(sha512sum /tmp/cuda-12-0-linux-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
-          echo "  size: $(stat -c%s /tmp/cuda-12-0-linux-amd64.tar.gz)" >> checksum.yml
-
-          echo "- url: cuda-11-7-linux-amd64.tar.gz" >> checksum.yml
-          echo "  sha512: >-" >> checksum.yml
-          echo "    $(sha512sum /tmp/cuda-11-7-linux-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
-          echo "  size: $(stat -c%s /tmp/cuda-11-7-linux-amd64.tar.gz)" >> checksum.yml
-
-          echo "- url: cuda-11-7-windows-amd64.tar.gz" >> checksum.yml
-          echo "  sha512: >-" >> checksum.yml
-          echo "    $(sha512sum /tmp/cuda-11-7-windows-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
-          echo "  size: $(stat -c%s /tmp/cuda-11-7-windows-amd64.tar.gz)" >> checksum.yml
-
-          echo "- url: cuda-12-0-windows-amd64.tar.gz" >> checksum.yml
-          echo "  sha512: >-" >> checksum.yml
-          echo "    $(sha512sum /tmp/cuda-12-0-windows-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
-          echo "  size: $(stat -c%s /tmp/cuda-12-0-windows-amd64.tar.gz)" >> checksum.yml
-          cat checksum.yml
-
-      - name: Upload checksum.yml to GitHub Release
-        uses: actions/upload-release-asset@v1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
         with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: ./checksum.yml
-          asset_name: checksum.yml
-          asset_content_type: text/yaml
+          fetch-depth: 0
 
-      - name: upload cuda-12-0-linux-amd64.tar.gz to Github Release
-        uses: actions/upload-release-asset@v1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
         with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: /tmp/cuda-12-0-linux-amd64.tar.gz
-          asset_name: cuda-12-0-linux-amd64.tar.gz
-          asset_content_type: application/gzip
-      
-      - name: upload cuda-11-7-linux-amd64.tar.gz to Github Release
-        uses: actions/upload-release-asset@v1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          key: release
+          evict-old-files: 1d
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Download artifacts
+        id: download-artifact
+        uses: actions/download-artifact@v4
         with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: /tmp/cuda-11-7-linux-amd64.tar.gz
-          asset_name: cuda-11-7-linux-amd64.tar.gz
-          asset_content_type: application/gzip
-      
-      - name: upload cuda-12-0-windows-amd64.tar.gz to Github Release
-        uses: actions/upload-release-asset@v1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+          path: ./artifact
+
+      - name: Move artifacts
+        id: move_artifacts
+        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
+
+      - name: Create release
+        id: create_release
+        uses: ggml-org/action-create-release@v1
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: /tmp/cuda-12-0-windows-amd64.tar.gz
-          asset_name: cuda-12-0-windows-amd64.tar.gz
-          asset_content_type: application/gzip
-      
-      - name: upload cuda-11-7-windows-amd64.tar.gz to Github Release
-        uses: actions/upload-release-asset@v1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          tag_name: ${{ steps.tag.outputs.name }}
+
+      - name: Upload release
+        id: upload_release
+        uses: actions/github-script@v3
         with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: /tmp/cuda-11-7-windows-amd64.tar.gz
-          asset_name: cuda-11-7-windows-amd64.tar.gz
-          asset_content_type: application/gzip
\ No newline at end of file
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            const path = require('path');
+            const fs = require('fs');
+            const release_id = '${{ steps.create_release.outputs.id }}';
+            for (let file of await fs.readdirSync('./artifact/release')) {
+              if (path.extname(file) === '.zip') {
+                console.log('uploadReleaseAsset', file);
+                await github.repos.uploadReleaseAsset({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  release_id: release_id,
+                  name: file,
+                  data: await fs.readFileSync(`./artifact/release/${file}`)
+                });
+              }
+            }
+
+#  ubuntu-latest-gcc:
+#    runs-on: ubuntu-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Debug, Release]
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v4
+#
+#      - name: Dependencies
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#          sudo apt-get install cmake
+#
+#      - name: Configure
+#        run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+#
+#      - name: Build
+#        run: |
+#          make
+#
+#  ubuntu-latest-clang:
+#    runs-on: ubuntu-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Debug, Release]
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v4
+#
+#      - name: Dependencies
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#          sudo apt-get install cmake
+#
+#      - name: Configure
+#        run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
+#
+#      - name: Build
+#        run: |
+#          make
+#
+#  ubuntu-latest-gcc-sanitized:
+#    runs-on: ubuntu-latest
+#
+#    strategy:
+#      matrix:
+#        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v4
+#
+#      - name: Dependencies
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#          sudo apt-get install cmake
+#
+#      - name: Configure
+#        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
+#
+#      - name: Build
+#        run: |
+#          make
+#
+#  windows:
+#    runs-on: windows-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Release]
+#        arch: [Win32, x64]
+#        include:
+#          - arch: Win32
+#            s2arc: x86
+#          - arch: x64
+#            s2arc: x64
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v4
+#
+#      - name: Add msbuild to PATH
+#        uses: microsoft/setup-msbuild@v1
+#
+#      - name: Configure
+#        run: >
+#          cmake -S . -B ./build -A ${{ matrix.arch }}
+#          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+#
+#      - name: Build
+#        run: |
+#          cd ./build
+#          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+#
+#      - name: Upload binaries
+#        uses: actions/upload-artifact@v4
+#        with:
+#          name: llama-bin-${{ matrix.arch }}
+#          path: build/bin/${{ matrix.build }}
+#
+#  windows-blas:
+#    runs-on: windows-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Release]
+#        arch: [Win32, x64]
+#        blas: [ON]
+#        include:
+#          - arch: Win32
+#            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
+#            s2arc: x86
+#          - arch: x64
+#            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
+#            s2arc: x64
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v4
+#
+#      - name: Add msbuild to PATH
+#        uses: microsoft/setup-msbuild@v1
+#
+#      - name: Fetch OpenBLAS
+#        if: matrix.blas == 'ON'
+#        run: |
+#          C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
+#          7z x blas.zip -oblas -y
+#          copy blas/include/cblas.h .
+#          copy blas/include/openblas_config.h .
+#          echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
+#
+#      - name: Configure
+#        run: >
+#          cmake -S . -B ./build -A ${{ matrix.arch }}
+#          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+#          -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
+#          -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
+#
+#      - name: Build
+#        run: |
+#          cd ./build
+#          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+#
+#      - name: Copy libopenblas.dll
+#        if: matrix.blas == 'ON'
+#        run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
+#
+#      - name: Upload binaries
+#        if: matrix.blas == 'ON'
+#        uses: actions/upload-artifact@v4
+#        with:
+#          name: llama-blas-bin-${{ matrix.arch }}
+#          path: build/bin/${{ matrix.build }}
+#
+#  emscripten:
+#    runs-on: ubuntu-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Release]
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v4
+#
+#      - name: Dependencies
+#        run: |
+#          wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
+#          tar -xvf master.tar.gz
+#          emsdk-master/emsdk update
+#          emsdk-master/emsdk install latest
+#          emsdk-master/emsdk activate latest
+#
+#      - name: Configure
+#        run: echo "tmp"
+#
+#      - name: Build
+#        run: |
+#          pushd emsdk-master
+#          source ./emsdk_env.sh
+#          popd
+#          emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+#          make
+
+  openEuler-latest-cmake-cann:
+    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
+    defaults:
+      run:
+       shell: bash -el {0}
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      matrix:
+        cann:
+          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+        device:
+          - 'ascend910b3'
+        build:
+          - 'Release'
+    container: ascendai/cann:${{ matrix.cann }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        run: |
+          yum update -y
+          yum install -y git gcc gcc-c++ make cmake
+
+      - name: Build
+        run: |
+          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+
+          cmake -S . -B build \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+              -DGGML_CANN=on \
+              -DSOC_TYPE=${{ matrix.device }}
+          cmake --build build -j $(nproc)
diff --git a/.github/workflows/menlo-build.yml b/.github/workflows/menlo-build.yml
new file mode 100644
index 0000000000000..796782321ca43
--- /dev/null
+++ b/.github/workflows/menlo-build.yml
@@ -0,0 +1,523 @@
+name: CI
+
+on:
+  push:
+    tags: ["b[0-9]+"]  # release tags: the letter b followed by digits
+    paths:  # NOTE(review): GitHub does not evaluate path filters for tag pushes — confirm this list is still wanted
+      [
+        ".github/scripts/**",
+        ".github/workflows/menlo-build.yml",
+        "**/CMakeLists.txt",
+        "**/Makefile",
+        "**/*.h",
+        "**/*.hpp",
+        "**/*.c",
+        "**/*.cpp",
+        "**/*.cu",
+        "**/*.cc",
+        "**/*.cxx",
+        "llama.cpp",
+        "!docs/**",
+        "!.gitignore",
+        "!README.md",
+      ]
+  workflow_dispatch:
+
+jobs:
+  create-draft-release:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')  # tag pushes only
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
+      version: ${{ steps.get_version.outputs.version }}
+    permissions:
+      contents: write  # needed to create the draft release
+    steps:
+      - name: Extract tag name
+        id: get_version
+        run: echo "VERSION=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV" && echo "version=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT"
+        env:
+          GITHUB_REF: ${{ github.ref }}
+      - name: Create Draft Release
+        id: create_release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ github.ref_name }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+          name: "${{ env.VERSION }}"  # VERSION is exported by the previous step
+          draft: true
+          generate_release_notes: true
+          prerelease: false
+
+  build-and-test:
+    runs-on: ${{ matrix.runs-on }}  # runner label is supplied by each matrix entry below
+    needs: [create-draft-release]  # NOTE(review): create-draft-release only runs on tag pushes, so a workflow_dispatch run skips this job too — confirm intended
+    timeout-minutes: 90
+    strategy:
+      fail-fast: false  # a failing variant does not cancel the remaining variants
+      matrix:
+        include:
+          - os: "linux"
+            name: "noavx-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx512-x64"
+            runs-on: "ubuntu-20-04"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "noavx-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx2-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx512-cuda-cu11.7-x64"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "noavx-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx2-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"
+          - os: "linux"
+            name: "avx512-cuda-cu12.0-x64"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: "/home/runner/.ccache"          
+          - os: "win"
+            name: "noavx-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx2-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx512-cuda-cu12.0-x64"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "noavx-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx2-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+          - os: "win"
+            name: "avx512-cuda-cu11.7-x64"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            ccache: true
+            ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache'
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4  # same major version as the other workflow in this repository
+        with:
+          submodules: recursive
+
+      - name: Set up Python
+        continue-on-error: true  # best effort: the job keeps going if setup fails
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install tools on Windows
+        if: runner.os == 'Windows'
+        run: |
+          choco install ccache awscli make ninja -y
+
+      - name: Install tools on Linux
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get install -y ninja-build
+          python3 -m pip install awscli
+          ccache_version="4.10.2"; ccache_pkg="ccache-${ccache_version}-linux-x86_64"  # pinned prebuilt release; bump here only
+          if [ "${{ matrix.os }}${{ matrix.name }}" == "linuxarm64" ]; then
+            sudo apt-get install -y ccache
+            exit 0
+          fi
+          cd /tmp
+          wget "https://github.com/ccache/ccache/releases/download/v${ccache_version}/${ccache_pkg}.tar.xz"
+          tar -xvf "${ccache_pkg}.tar.xz"
+          sudo cp "${ccache_pkg}/ccache" /usr/bin/ccache
+          ccache -V
+          rm -rf "/tmp/${ccache_pkg}.tar.xz" "/tmp/${ccache_pkg}"
+
+      - name: Download ccache from s3 (Windows)
+        if: runner.os == 'Windows'
+        continue-on-error: true  # a cold cache must not fail the build
+        run: |
+          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
+          refreshenv
+          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }} --endpoint-url ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+
+      - name: Download ccache from s3 (Linux)
+        if: runner.os == 'Linux'
+        continue-on-error: true  # a cold cache must not fail the build
+        run: |
+          aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }} --endpoint-url ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+
+      - name: Install coreutils macos
+        if: runner.os == 'macOS'
+        run: |
+          brew install coreutils
+
+      - name: Get Cer for code signing
+        if: runner.os == 'macOS'
+        run: base64 -d <<< "$CODE_SIGN_P12_BASE64" > /tmp/codesign.p12
+        shell: bash
+        env:
+          CODE_SIGN_P12_BASE64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
+  
+      - uses: apple-actions/import-codesign-certs@v2
+        continue-on-error: true
+        if: runner.os == 'macOS'
+        with:
+          p12-file-base64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
+          p12-password: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
+
+      - uses: actions/setup-dotnet@v3
+        if: runner.os == 'Windows'
+        with:
+          dotnet-version: "8.0.x"
+
+      - name: Add msbuild to PATH
+        if: runner.os == 'Windows'
+        uses: ilammy/msvc-dev-cmd@v1.13.0
+
+      - name: Build
+        id: build-and-test
+        run: |
+          make build-lib CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}"
+
+      - uses: 1arp/create-a-file-action@0.4.5
+        with:
+          path: 'llama'
+          isAbsolutePath: false
+          file: 'version.txt'
+          content: |
+            name: ${{ matrix.os }}-${{ matrix.name }}
+            version: ${{needs.create-draft-release.outputs.version}}
+
+      - name: Code Signing macOS
+        if: runner.os == 'macOS'
+        run: |
+          make codesign CODE_SIGN=true DEVELOPER_ID="${{ secrets.DEVELOPER_ID }}"
+
+      - name: Code Signing Windows
+        if: runner.os == 'Windows'
+        shell: cmd
+        run: |
+          set PATH=%PATH%;%USERPROFILE%\.dotnet\tools
+          make codesign CODE_SIGN=true AZURE_KEY_VAULT_URI="${{ secrets.AZURE_KEY_VAULT_URI }}" AZURE_CLIENT_ID="${{ secrets.AZURE_CLIENT_ID }}" AZURE_TENANT_ID="${{ secrets.AZURE_TENANT_ID }}" AZURE_CLIENT_SECRET="${{ secrets.AZURE_CLIENT_SECRET }}" AZURE_CERT_NAME="${{ secrets.AZURE_CERT_NAME }}"
+
+      - name: Package
+        run: |
+          cat llama/version.txt
+          make package
+
+      - name: Upload Artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: llama-${{ matrix.os }}-${{ matrix.name }}
+          path: ./llama
+
+      - name: Calculate SHA512 Checksum (macOS)
+        if: runner.os == 'macOS'
+        run: |
+          sha512sum ./llama.tar.gz | awk '{ print $1 }' > sha512.txt
+          size=$(stat -f%z ./llama.tar.gz)  # use -f%z on macOS
+          echo "checksum=$(cat sha512.txt)" >> "$GITHUB_ENV"
+          echo "size=$size" >> "$GITHUB_ENV"
+  
+      - name: Calculate SHA512 Checksum (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |
+          (Get-FileHash -Path ./llama.tar.gz -Algorithm SHA512).Hash.ToLower() | Out-File sha512.txt
+          $size = (Get-Item ./llama.tar.gz).length
+          echo "checksum=$(Get-Content sha512.txt)" >> $env:GITHUB_ENV
+          echo "size=$size" >> $env:GITHUB_ENV
+
+      - name: Calculate SHA512 Checksum (Linux)
+        if: runner.os == 'Linux'
+        run: |
+          sha512sum ./llama.tar.gz | cut -d ' ' -f 1 > sha512.txt
+          archive_bytes="$(stat -c%s ./llama.tar.gz)"
+          printf 'checksum=%s\n' "$(cat sha512.txt)" >> "$GITHUB_ENV"
+          printf 'size=%s\n' "${archive_bytes}" >> "$GITHUB_ENV"
+  
+      ## Write for matrix outputs workaround 
+      - uses: cloudposse/github-action-matrix-outputs-write@v1
+        id: out
+        with:
+          matrix-step-name: ${{ github.job }}
+          matrix-key: ${{ matrix.os }}-${{ matrix.name }}
+          outputs: |-
+            sha512: ${{ env.checksum }}
+            size: ${{ env.size }}
+
+      - uses: actions/upload-release-asset@v1.0.1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: ./llama.tar.gz
+          asset_name: llama-${{ needs.create-draft-release.outputs.version }}-bin-${{ matrix.os }}-${{ matrix.name }}.tar.gz
+          asset_content_type: application/gzip
+
+      - name: Upload ccache to s3 (Windows)
+        continue-on-error: true  # cache upload is best effort
+        if: always() && runner.os == 'Windows'
+        run: |
+          Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1"
+          refreshenv
+          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} --endpoint-url ${{ secrets.MINIO_ENDPOINT }}
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+
+      - name: Upload ccache to s3 (Linux)
+        continue-on-error: true  # cache upload is best effort
+        if: always() && runner.os == 'Linux'
+        run: |
+          aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/ccache-data-${{ matrix.os }}-${{ matrix.name }} --endpoint-url ${{ secrets.MINIO_ENDPOINT }}
+        env:
+          AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}"
+          AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}"
+          AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}"
+
+      - name: Remove Keychain
+        continue-on-error: true
+        if: always() && runner.os == 'macOS'
+        run: |
+          security delete-keychain signing_temp.keychain
+
+
+  ## Read matrix outputs 
+  read:
+    needs: build-and-test
+    runs-on: ubuntu-latest
+    outputs:
+      result: "${{ steps.read.outputs.result }}"  # re-exported from the matrix-outputs-read step
+    steps:
+      - id: read
+        uses: cloudposse/github-action-matrix-outputs-read@v1
+        with:
+          matrix-step-name: build-and-test
+
+  create-checksum-file:
+    runs-on: ubuntu-20-04
+    permissions:
+      contents: write
+    needs: [read, create-draft-release]
+    steps:
+      - name: Download cuda dependencies from s3 and create checksum
+        run: |
+          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/12.0/linux/cuda.tar.gz -O /tmp/cuda-12-0-linux-amd64.tar.gz
+          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/11.7/linux/cuda.tar.gz -O /tmp/cuda-11-7-linux-amd64.tar.gz
+          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/12.0/windows/cuda.tar.gz -O /tmp/cuda-12-0-windows-amd64.tar.gz
+          wget https://minio.menlo.ai:9000/cicd/dist/cuda-dependencies/11.7/windows/cuda.tar.gz -O /tmp/cuda-11-7-windows-amd64.tar.gz
+
+          version="${{ needs.create-draft-release.outputs.version }}"
+          # NOTE(review): relies on toJson() emitting a quoted, escaped string literal that bash unquotes back into raw JSON.
+          outputs=${{ toJson(needs.read.outputs.result) }}
+          echo "$outputs"
+
+          echo "version: $version" > checksum.yml
+          echo "files:" >> checksum.yml
+          # One entry per build-and-test matrix key; the url must equal the asset_name used when the archive is uploaded.
+          echo "$outputs" | jq -r --arg version "$version" '
+            .sha512 as $sha512 |
+            .size as $size |
+            (.sha512 | keys[]) as $key |
+            "- url: llama-\($version)-bin-\($key).tar.gz\n  sha512: >-\n    \($sha512[$key])\n  size: \($size[$key])"
+          ' >> checksum.yml
+
+          echo "- url: cuda-12-0-linux-amd64.tar.gz" >> checksum.yml
+          echo "  sha512: >-" >> checksum.yml
+          echo "    $(sha512sum /tmp/cuda-12-0-linux-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
+          echo "  size: $(stat -c%s /tmp/cuda-12-0-linux-amd64.tar.gz)" >> checksum.yml
+
+          echo "- url: cuda-11-7-linux-amd64.tar.gz" >> checksum.yml
+          echo "  sha512: >-" >> checksum.yml
+          echo "    $(sha512sum /tmp/cuda-11-7-linux-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
+          echo "  size: $(stat -c%s /tmp/cuda-11-7-linux-amd64.tar.gz)" >> checksum.yml
+
+          echo "- url: cuda-11-7-windows-amd64.tar.gz" >> checksum.yml
+          echo "  sha512: >-" >> checksum.yml
+          echo "    $(sha512sum /tmp/cuda-11-7-windows-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
+          echo "  size: $(stat -c%s /tmp/cuda-11-7-windows-amd64.tar.gz)" >> checksum.yml
+
+          echo "- url: cuda-12-0-windows-amd64.tar.gz" >> checksum.yml
+          echo "  sha512: >-" >> checksum.yml
+          echo "    $(sha512sum /tmp/cuda-12-0-windows-amd64.tar.gz | awk '{ print $1 }')" >> checksum.yml
+          echo "  size: $(stat -c%s /tmp/cuda-12-0-windows-amd64.tar.gz)" >> checksum.yml
+          cat checksum.yml
+
+      - name: Upload checksum.yml to GitHub Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: ./checksum.yml
+          asset_name: checksum.yml
+          asset_content_type: text/yaml
+
+      - name: upload cuda-12-0-linux-amd64.tar.gz to Github Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: /tmp/cuda-12-0-linux-amd64.tar.gz
+          asset_name: cuda-12-0-linux-amd64.tar.gz
+          asset_content_type: application/gzip
+      
+      - name: upload cuda-11-7-linux-amd64.tar.gz to Github Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: /tmp/cuda-11-7-linux-amd64.tar.gz
+          asset_name: cuda-11-7-linux-amd64.tar.gz
+          asset_content_type: application/gzip
+      
+      - name: upload cuda-12-0-windows-amd64.tar.gz to Github Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: /tmp/cuda-12-0-windows-amd64.tar.gz
+          asset_name: cuda-12-0-windows-amd64.tar.gz
+          asset_content_type: application/gzip
+      
+      - name: upload cuda-11-7-windows-amd64.tar.gz to Github Release
+        uses: actions/upload-release-asset@v1
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
+          asset_path: /tmp/cuda-11-7-windows-amd64.tar.gz
+          asset_name: cuda-11-7-windows-amd64.tar.gz
+          asset_content_type: application/gzip
\ No newline at end of file