Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
38e32eb
ggml: optimize some vec dot functions for LoongArch ASX (#11842)
MQ-mengqing Feb 14, 2025
3d68f03
llama : add completion for --chat-template-file (#11860)
danbev Feb 14, 2025
dbc2ec5
docker : drop to CUDA 12.4 (#11869)
ggerganov Feb 14, 2025
94b87f8
cuda : add ampere to the list of default architectures (#11870)
slaren Feb 14, 2025
300907b
opencl: Fix rope and softmax (#11833)
lhez Feb 14, 2025
89daa25
llguidance build fixes for Windows (#11664)
mmoskal Feb 14, 2025
fc1b0d0
vulkan: initial support for IQ1_S and IQ1_M quantizations (#11528)
remyoudompheng Feb 15, 2025
f355229
server: fix type promotion typo causing crashes w/ --jinja w/o tools …
ochafik Feb 15, 2025
68ff663
repo : update links to new url (#11886)
ggerganov Feb 15, 2025
c2cd24f
readme : add notice about new package registry (#11890)
ggerganov Feb 15, 2025
2288510
metal : optimize dequant q6_K kernel (#11892)
akretz Feb 15, 2025
fc10c38
examples: fix typo in imatrix/README.md (#11884)
708-145 Feb 15, 2025
6dde178
scripts: fix compare-llama-bench commit hash logic (#11891)
JohannesGaessler Feb 15, 2025
c2ea16f
metal : fix the crash caused by the lack of residency set support on …
halechan Feb 16, 2025
bf42a23
vulkan: support multi/vision rope, and noncontiguous rope (#11902)
jeffbolznv Feb 16, 2025
818a340
ci : fix (again) arm64 build fails (#11895)
ngxson Feb 16, 2025
fe163d5
common : Fix a typo in help (#11899)
standby24x7 Feb 16, 2025
0f2bbe6
server : bump httplib to 0.19.0 (#11908)
ngxson Feb 16, 2025
2eea03d
vulkan: implement several ops relevant for ggml_opt (#11769)
remyoudompheng Feb 17, 2025
c4d29ba
server : fix divide-by-zero in metrics reporting (#11915)
aviallon Feb 17, 2025
f7b1116
update release requirements (#11897)
netrunnereve Feb 17, 2025
73e2ed3
CUDA: use async data loading for FlashAttention (#11894)
JohannesGaessler Feb 17, 2025
09aaf4f
docs : Fix duplicated file extension in test command (#11935)
xiaobing318 Feb 18, 2025
5137da7
scripts: corrected encoding when getting chat template (#11866) (#11907)
MoonRide303 Feb 18, 2025
63ac128
server : add TEI API format for /rerank endpoint (#11942)
ngxson Feb 18, 2025
63e489c
tool-call: refactor common chat / tool-call api (+ tests / fixes) (#1…
ochafik Feb 18, 2025
b58934c
server : (webui) Enable communication with parent html (if webui is i…
igardev Feb 18, 2025
9626d93
llama : fix indentation in llama-grammar [no ci] (#11943)
danbev Feb 19, 2025
abd4d0b
speculative : update default params (#11954)
ggerganov Feb 19, 2025
d07c621
common : add llama.vim preset for Qwen2.5 Coder (#11945)
danbev Feb 19, 2025
d04e716
doc: add links to ggml examples [no ci] (#11958)
JohannesGaessler Feb 19, 2025
0d55958
run : add --chat-template-file (#11961)
engelmi Feb 20, 2025
4806498
ggml: aarch64: implement SVE kernels for q3_K_q8_K vector dot (#11917)
Vithulep Feb 20, 2025
c5d91a7
ggml-cpu: Add CPU backend support for KleidiAI library (#11390)
chaxu01 Feb 20, 2025
c392e50
server (webui): Fix Premature Submission During IME Conversion (#11971)
mmngays Feb 20, 2025
ee02ad0
clip : fix visual encoders with no CLS (#11982)
alex-jw-brooks Feb 21, 2025
0b3863f
MUSA: support ARM64 and enable dp4a .etc (#11843)
BodhiHu Feb 21, 2025
ecc8e3a
CUDA: correct the lowest Maxwell supported by CUDA 12 (#11984)
PureJourney Feb 21, 2025
586d5fe
doc: update contributing guidelines [no ci] (#11969)
JohannesGaessler Feb 21, 2025
51f311e
llama : skip loading unused tensors (#12004)
ggerganov Feb 21, 2025
de8b5a3
llama.swiftui : add "Done" dismiss button to help view (#11998)
danbev Feb 22, 2025
d709084
cuda: Add Q5_1, Q5_0, Q4_1 and Q4_0 to F32 conversion support. (#12000)
gcp Feb 22, 2025
cf756d6
server : disable Nagle's algorithm (#12020)
ggerganov Feb 22, 2025
335eb04
ci : Build on Github-hosted arm64 runners (#12009)
Rohanjames1997 Feb 22, 2025
5fa07c2
CUDA: optimize FA for GQA + large batches (#12014)
JohannesGaessler Feb 22, 2025
f3e6485
ci : fix arm upload artifacts (#12024)
ggerganov Feb 22, 2025
36c258e
llava: build clip image from pixels (#11999)
tinglou Feb 22, 2025
a28e0d5
CUDA: app option to compile without FlashAttention (#12025)
JohannesGaessler Feb 22, 2025
af7747c
ggml-cpu: Support s390x SIMD Instruction Set (#12019)
taronaeo Feb 22, 2025
f777a73
Some llama-run cleanups (#11973)
ericcurtin Feb 23, 2025
7ad0779
run: allow to customize prompt by env var LLAMA_PROMPT_PREFIX (#12041)
benoitf Feb 23, 2025
8303e8b
SYCL: Fix GGML_SYCL_DEBUG macro (#11995)
qnixsynapse Feb 24, 2025
651adf4
gguf_convert_endian.py: implement byteswapping for q4_k and q6_k (#11…
AlekseiNikiforovIBM Feb 24, 2025
08d5986
[SYCL] Optimize mul_mat for Q4_0 on Intel GPU (#12035)
NeoZhangJianyu Feb 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devops/cuda.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
ARG CUDA_VERSION=12.4.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

Expand Down
4 changes: 2 additions & 2 deletions .devops/llama-cpp-cuda.srpm.spec
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git cuda-toolkit
Requires: cuda-toolkit
URL: https://github.com/ggerganov/llama.cpp
URL: https://github.com/ggml-org/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
Expand Down
4 changes: 2 additions & 2 deletions .devops/llama-cpp.srpm.spec
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git libstdc++-devel
Requires: libstdc++
URL: https://github.com/ggerganov/llama.cpp
URL: https://github.com/ggml-org/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
Expand Down
6 changes: 3 additions & 3 deletions .devops/nix/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
'';

# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
# With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
# `default.metallib` may be compiled with Metal compiler from XCode
# and we need to escape sandbox on MacOS to access Metal compiler.
# `xcrun` is used find the path of the Metal compiler, which is varible
# and not on $PATH
# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
# see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion
__noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

nativeBuildInputs =
Expand Down Expand Up @@ -220,7 +220,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
broken = (useMetalKit && !effectiveStdenv.isDarwin);

description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
homepage = "https://github.com/ggerganov/llama.cpp/";
homepage = "https://github.com/ggml-org/llama.cpp/";
license = lib.licenses.mit;

# Accommodates `nix run` and `lib.getExe`
Expand Down
2 changes: 1 addition & 1 deletion .devops/rocm.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
Expand Down
6 changes: 3 additions & 3 deletions .github/ISSUE_TEMPLATE/020-enhancement.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ body:
- type: markdown
attributes:
value: |
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)

- type: checkboxes
id: prerequisites
Expand All @@ -16,11 +16,11 @@ body:
options:
- label: I am running the latest code. Mention the version if possible as well.
required: true
- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md).
required: true
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
required: true
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
- label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share.
required: true

- type: textarea
Expand Down
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/030-research.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ body:
- type: markdown
attributes:
value: |
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

- type: checkboxes
id: research-stage
Expand Down
4 changes: 2 additions & 2 deletions .github/ISSUE_TEMPLATE/040-refactor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ body:
- type: markdown
attributes:
value: |
Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

- type: textarea
id: background-description
Expand Down
6 changes: 3 additions & 3 deletions .github/ISSUE_TEMPLATE/config.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
blank_issues_enabled: true
contact_links:
- name: Got an idea?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas
about: Pop it there. It may then become an enhancement ticket.
- name: Got a question?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a
about: Ask a question there!
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
url: https://github.com/ggml-org/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
12 changes: 1 addition & 11 deletions .github/workflows/bench.yml.disabled
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggerganov/llama.cpp/issues/7893
# https://github.com/ggml-org/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark
Expand Down Expand Up @@ -57,17 +57,7 @@ jobs:

if: |
inputs.gpu-series == 'Standard_NC4as_T4_v3'
|| (
github.event_name == 'schedule'
&& github.ref_name == 'master'
&& github.repository_owner == 'ggerganov'
)
|| github.event_name == 'pull_request_target'
|| (
github.event_name == 'push'
&& github.event.ref == 'refs/heads/master'
&& github.repository_owner == 'ggerganov'
)
steps:
- name: Clone
id: checkout
Expand Down
24 changes: 18 additions & 6 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ jobs:
run: |
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
Expand Down Expand Up @@ -173,7 +173,15 @@ jobs:
name: llama-bin-macos-x64.zip

ubuntu-cpu-cmake:
runs-on: ubuntu-22.04
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-22.04-arm

runs-on: ${{ matrix.os }}

steps:
- name: Clone
Expand Down Expand Up @@ -239,14 +247,14 @@ jobs:
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*

- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
name: llama-bin-ubuntu-x64.zip
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip

ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -374,6 +382,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
Expand Down Expand Up @@ -401,7 +411,7 @@ jobs:
run: |
cd build
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 1800
ctest -L main --verbose --timeout 2700

- name: Determine tag name
id: tag
Expand Down Expand Up @@ -1373,8 +1383,10 @@ jobs:

needs:
- ubuntu-cpu-cmake
- ubuntu-22-cmake-vulkan
- windows-latest-cmake
- windows-2019-cmake-cuda
- windows-latest-cmake-sycl
- windows-latest-cmake-hip-release
- macOS-latest-cmake-arm64
- macOS-latest-cmake-x64
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ jobs:

- name: Set up QEMU
uses: docker/setup-qemu-action@v3
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
repository: "ggml-org/llama.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ examples/server/*.css.hpp
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
examples/server/*.gz.hpp
!build_64.sh
!examples/*.bat
!examples/*/*.kts
Expand Down
10 changes: 6 additions & 4 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
# Pull requests (for contributors)

- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
- Test your changes:
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments

# Pull requests (for collaborators)

- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)

# Coding guidelines
Expand Down Expand Up @@ -40,14 +42,14 @@
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$

![matmul](media/matmul.png)

# Naming guidelines

- Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Naming usually optimizes for longest common prefix (see https://github.com/ggml-org/ggml/pull/302#discussion_r1243240963)

```cpp
// not OK
Expand Down Expand Up @@ -122,4 +124,4 @@

The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

https://github.com/ggerganov/llama.cpp/projects
https://github.com/ggml-org/llama.cpp/projects
24 changes: 18 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ifndef LLAMA_MAKEFILE
$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
endif

# Define the default target now so that it is always the first target
Expand Down Expand Up @@ -463,7 +463,7 @@ endif
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
# https://github.com/ggerganov/llama.cpp/issues/2922
# https://github.com/ggml-org/llama.cpp/issues/2922
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move

Expand Down Expand Up @@ -680,6 +680,10 @@ ifdef GGML_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
endif # GGML_CUDA_CCBIN

ifdef GGML_CUDA_NO_FA
MK_NVCCFLAGS += -DGGML_CUDA_NO_FA
endif # GGML_CUDA_NO_FA

ifdef GGML_CUDA_FA_ALL_QUANTS
MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
endif # GGML_CUDA_FA_ALL_QUANTS
Expand Down Expand Up @@ -800,6 +804,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # GGML_CUDA_NO_PEER_COPY

ifdef GGML_CUDA_NO_FA
HIPFLAGS += -DGGML_CUDA_NO_FA
endif # GGML_CUDA_NO_FA

OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
Expand Down Expand Up @@ -847,7 +855,7 @@ ifdef GGML_MUSA
CXX := $(MUSA_PATH)/bin/clang++
MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc

MUSAFLAGS = -x musa -mtgpu
MUSAFLAGS = -fsigned-char -x musa -mtgpu
MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))

ifdef GGML_CUDA_FORCE_MMQ
Expand Down Expand Up @@ -876,6 +884,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # GGML_CUDA_NO_PEER_COPY

ifdef GGML_CUDA_NO_FA
MUSAFLAGS += -DGGML_CUDA_NO_FA
endif # GGML_CUDA_NO_FA

ifdef GGML_CUDA_FA_ALL_QUANTS
MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
endif # GGML_CUDA_FA_ALL_QUANTS
Expand Down Expand Up @@ -1078,8 +1090,8 @@ endif
ifdef REMOVE_WARNING
$(info !!! REMOVAL WARNING !!!)
$(info The following LLAMA_ options have been removed and are no longer supported)
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggml-org/llama.cpp/pull/9418))
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418))
$(info )
endif

Expand Down Expand Up @@ -1364,7 +1376,7 @@ llama-server: \
examples/server/index.html.hpp \
examples/server/loading.html.hpp \
common/chat.cpp \
common/chat.hpp \
common/chat.h \
common/chat-template.hpp \
common/json.hpp \
common/minja.hpp \
Expand Down
Loading
Loading