Merged
Changes from all commits
188 commits
bf10771
add geglu activation function (#14074)
huydt84 Jun 9, 2025
6c6603e
sycl: Add reorder to Q6_K mmvq implementation (#13885)
s-Nick Jun 9, 2025
adcccc1
webui: fix sidebar being covered by main content (#14082)
yeahdongcn Jun 9, 2025
562b68d
CANN: Simplify the environment variable setting(#13104)
bachelor-dou Jun 9, 2025
a603bc6
graph : fix geglu (#14077)
ggerganov Jun 9, 2025
70897cc
ggml-cpu : split arch-specific implementations (#13892)
xctan Jun 9, 2025
ce4fd50
llama : allow building all tests on windows when not using shared lib…
slaren Jun 9, 2025
87cf9bd
sync : ggml
ggerganov Jun 10, 2025
c01dc70
Vulkan: Don't default to CPU device (like llvmpipe), even if no other…
0cc4m Jun 10, 2025
0668a29
ggml : fix weak alias win32 (whisper/0)
ggerganov Jun 10, 2025
ebc78bb
sync : ggml
ggerganov Jun 10, 2025
4d24034
vulkan: force device 0 in CI (#14106)
jeffbolznv Jun 10, 2025
76651f6
llama : support GEGLU for jina-bert-v2 (#14090)
CISC Jun 10, 2025
626da8b
convert : fix duplicate key DeepSeek-R1 conversion error (#14103)
CISC Jun 10, 2025
998fe79
kv-cache : avoid modifying recurrent cells when setting inputs (#13834)
compilade Jun 10, 2025
91b760a
opencl: add `mul_mv_id_q4_0_f32_8x_flat` (#14003)
lhez Jun 10, 2025
94165e6
vulkan: Track descriptor pools/sets per-context (#14109)
jeffbolznv Jun 11, 2025
401176a
kv-cache : add LLAMA_KV_CACHE_DEBUG environment variable (#14121)
ggerganov Jun 11, 2025
534a1ae
kv-cache : relax SWA masking condition (#14119)
ggerganov Jun 11, 2025
4ed4062
webui: Wrap long numbers instead of infinite horizontal scroll (#14062)
am17an Jun 11, 2025
11f8ce0
vulkan: Better thread-safety for command pools/buffers (#14116)
jeffbolznv Jun 11, 2025
017fbb2
tests : add test-tokenizers-repo (#14017)
CISC Jun 11, 2025
41873e5
chore : clean up relative source dir paths (#14128)
CISC Jun 11, 2025
64a745d
Implement GGML_CPU_ALL_VARIANTS for ARM (#14080)
ckastner Jun 11, 2025
c022829
kv-cache : fix split_equal handling in unified implementation (#14130)
ggerganov Jun 12, 2025
99cfd5e
batch : remove logits_all flag (#14141)
ggerganov Jun 12, 2025
ee4534c
context : simplify output counting logic during decode (#14142)
ggerganov Jun 12, 2025
ae02066
cmake : Improve build-info.cpp generation (#14156)
ckastner Jun 13, 2025
2cdab0c
pooling : make cls_b and cls_out_b optional (#14165)
huydt84 Jun 13, 2025
0eda332
cmake: Add ability to pass in LLAMA_BUILD_NUMBER/COMMIT (#14167)
ckastner Jun 13, 2025
51558f5
batch : rework llama_batch_allocr (#14153)
ggerganov Jun 13, 2025
bcd94ab
batch : add LLAMA_BATCH_DEBUG environment variable (#14172)
ggerganov Jun 13, 2025
8e406f7
Merge commit from fork
GuyGoldenberg Jun 13, 2025
52d7aaa
vocab : fix build (#14175)
ggerganov Jun 13, 2025
c4df4a7
batch : auto-gen positions + verify multi-sequence input (#14177)
ggerganov Jun 15, 2025
335d1fd
cparams : rename LLAMA_MAX_PARALLEL_SEQUENCES to LLAMA_MAX_SEQ (#14188)
ggerganov Jun 15, 2025
bf6f2a2
model : add dots.llm1 architecture support (#14044) (#14118)
Noeda Jun 15, 2025
ae9e854
kv-cache : fix use-after-move of defrag info (#14189)
ggerganov Jun 15, 2025
e4ab85e
model : Add support for Arcee AI's upcoming AFM model (#14185)
bartowski1182 Jun 15, 2025
fbc7ce7
ggml-cpu : rework weak alias on apple targets (#14146)
xctan Jun 16, 2025
46cee96
vulkan: mutex around vkQueueSubmit (#14127)
jeffbolznv Jun 16, 2025
e317b59
convert : remove arcee change in convert_hf_to_gguf_update.py (#14207)
bartowski1182 Jun 16, 2025
1805319
ggml: Add Android support for GGML_CPU_ALL_VARIANTS (#14206)
chaxu01 Jun 16, 2025
e27f8d7
llama : rework embeddings logic (#14208)
ggerganov Jun 16, 2025
a282cb8
model : add NeoBERT (#14164)
huydt84 Jun 16, 2025
8cff5c5
cmake: clean up external project logic for vulkan-shaders-gen (#14179)
bandoti Jun 16, 2025
b6a9ab1
llama : add thread safety test (#14035)
slaren Jun 16, 2025
a0a3a1d
server : fix incorrect usage of llama_get_embeddings() (#14225)
ggerganov Jun 16, 2025
fe27803
ggml-cpu : remove the weak alias trick (#14221)
xctan Jun 17, 2025
dbcfa79
cmake: remove shader-gen step-targets from ggml-vulkan (#14226)
bandoti Jun 17, 2025
fcceaad
examples : include examples in msvc disable warn (ggml/1270)
danbev Jun 12, 2025
f5bd3e0
ggml : disable warnings for tests when using MSVC (ggml/1273)
danbev Jun 13, 2025
cdff507
sync : ggml
ggerganov Jun 18, 2025
8644cc6
convert : fix null head_dim AutoConfig regression (#14248)
CISC Jun 18, 2025
e9e0fe2
ggml: Add Apple support for GGML_CPU_ALL_VARIANTS (#14258)
chaxu01 Jun 18, 2025
963da5f
docs: add s390x build documentation (#14264)
taronaeo Jun 18, 2025
f9d6fac
metal : add mean kernel (#14267)
ggerganov Jun 19, 2025
c15412f
memory : Hybrid recurrent cache (#13979)
gabe-l-hart Jun 19, 2025
4370911
Vulkan: Set device max size for host memory to avoid OOM warning and …
0cc4m Jun 19, 2025
1988b98
llamafile : support s390x SIMD instruction set (#14273)
taronaeo Jun 19, 2025
81a54d1
convert : fix remote option in Windows (#14100)
pqnet Jun 19, 2025
cbe6f59
build : suppress gcc15 compile warnings (#14261)
fanyang89 Jun 19, 2025
92826fc
server : add server parameters for draft model cache type (#13782)
aa956 Jun 19, 2025
62d2996
ggml-cpu : remove unnecesary arm feature detection (#14281)
slaren Jun 19, 2025
7bad377
CUDA: add conv_2d_dw (#14265)
am17an Jun 20, 2025
3832ced
ubatch : new splitting logic (#14217)
ggerganov Jun 20, 2025
e3753db
model : more uniform output id handling (#14275)
ggerganov Jun 20, 2025
6d511cb
ggml: Update KleidiAI to v1.9.0 (#14277)
chaxu01 Jun 20, 2025
a74eaf7
ggml : fix repack work size for mul_mat_id (#14292)
ggerganov Jun 20, 2025
21d5799
cuda : synchronize graph capture and cublas handle destruction (#14288)
slaren Jun 20, 2025
9a05e70
llama : improve sep token handling (#14272)
CISC Jun 20, 2025
79a533d
Implement GGML_CPU_ALL_VARIANTS for PowerPC (#14286)
ckastner Jun 20, 2025
4aad4f9
sycl: add usage of enqueue_functions extension (#14244)
s-Nick Jun 20, 2025
6c21dcc
vocab : prevent tokenizer overflow (#14301)
retr0reg Jun 20, 2025
55184b4
lint : remove trailing whitepace (#14304)
CISC Jun 20, 2025
002433b
CUDA: add conv_2d_transpose (#14287)
am17an Jun 20, 2025
6fdb1c4
Add `ggml_roll` (ggml/1274)
Acly Jun 18, 2025
0578022
sync : ggml
ggerganov Jun 20, 2025
50516a1
convert : fix Llama 4 conversion (#14311)
danielhanchen Jun 21, 2025
2bb9b77
memory : rename interface to llama_memory_context_i (#14296)
ggerganov Jun 21, 2025
d4cd0df
metal : fix thread-safety (#14300)
ggerganov Jun 21, 2025
5d068bc
gguf-py : fix TemplateProcessing pair when bos/eos is missing (#14312)
CISC Jun 21, 2025
df3727a
Add support for VK_EXT_debug_utils to add labels to Vulkan objects. (…
mtavenrath Jun 21, 2025
64622f8
gguf-py : fix Qwen3-Embedding eos token (#14314)
CISC Jun 21, 2025
9a70c5d
CUDA: add mean operation (#14313)
am17an Jun 22, 2025
3465e6a
HIP: enable vec fattn on RDNA4 (#14323)
IMbackK Jun 22, 2025
42e5ab7
examples : fix is_first logic for tokenization (#14329)
ggerganov Jun 22, 2025
af48bf4
run : avoid double tokenization (#14327)
retr0reg Jun 22, 2025
9335b8f
gguf-py : fix SpecialVocab parsing when post_processor is null (#14330)
CISC Jun 22, 2025
402073b
quantize : handle user-defined pruning of whole layers (blocks) (#13037)
EAddario Jun 22, 2025
5b96f59
vulkan: update windows SDK in CI (#14334)
jeffbolznv Jun 23, 2025
1786c29
kv-cells : fix tracking of seq_pos (#14339)
ggerganov Jun 23, 2025
fa5b06b
CUDA: mul_mat_v support for batch sizes > 1 (#14262)
JohannesGaessler Jun 23, 2025
8c592e2
ci: add workflow for relocatable cmake package (#14346)
bandoti Jun 23, 2025
4518b29
CUDA/HIP: optimize mmv paths taken for HIP devices (#14324)
IMbackK Jun 23, 2025
889286b
cmake : use LLAMA_BUILD_NUMBER when defining LLAMA_INSTALL_VERSION (#…
mbaudier Jun 24, 2025
6bbc782
batch : fix check for empty sequences in memory (#14364)
ggerganov Jun 24, 2025
96bc216
opencl: ref count `ggml_backend_opencl_context` and refactor profilin…
lhez Jun 24, 2025
517325a
ggml-cpu: enable IBM NNPA Vector Intrinsics (#14317)
taronaeo Jun 25, 2025
306deb1
musa: enable fp16 mma (all) and cublas on qy2 (#13842)
yeahdongcn Jun 26, 2025
6443bfc
docs: update s390x documentation + add faq (#14389)
taronaeo Jun 26, 2025
e5f34a1
metal : batch rows copy in a single threadgroup (#14384)
ggerganov Jun 26, 2025
10395d3
metal : add special-case mat-vec mul for ne00 == 4 (#14385)
ggerganov Jun 26, 2025
a6f4f1a
llama : return mistral-v7-tekken as default template only (#14390)
CISC Jun 26, 2025
23b5f61
cmake: regen vulkan shaders when shaders-gen sources change (#14398)
bandoti Jun 26, 2025
84b2ce5
model : gemma3n text-only (#14400)
ngxson Jun 26, 2025
24d415b
convert : fix broken sentencepiece vocab (#14416)
CISC Jun 27, 2025
384ccba
ggml : add ggml_set_rows (#14274)
rgerganov Jun 27, 2025
0fc0039
recurrent : call balloc split_reset() in init_batch() (#14414)
ggerganov Jun 27, 2025
fbc2eea
graph : make llm_graph_context destructor virtual (#14410)
ggerganov Jun 27, 2025
51a5527
vulkan: Fix GGML_VULKAN_SHADER_DEBUG_INFO (#14427)
jeffbolznv Jun 28, 2025
4f09618
ci : fix windows build and release (#14431)
CISC Jun 28, 2025
abf8177
fix async_mode bug (#14432)
bachelor-dou Jun 28, 2025
acfcc28
model : add support for ERNIE 4.5 0.3B model (#14408)
ownia Jun 28, 2025
bf260f9
vulkan: lock accesses of pinned_memory vector (#14333)
jeffbolznv Jun 28, 2025
de1c1f5
vulkan: handle noncontig in the final case of ggml_vk_get_cpy_pipelin…
jeffbolznv Jun 28, 2025
38733a0
CUDA: add bf16 and f32 support to cublas_mul_mat_batched (#14361)
am17an Jun 28, 2025
eb219f4
vulkan: Add fusion support for RMS_NORM+MUL (#14366)
jeffbolznv Jun 29, 2025
4012285
ggml : implement REGLU/GEGLU/SWIGLU ops (#14158)
CISC Jun 29, 2025
c048a56
ggml : fix unmerged GGML_FPxx_TO_FPxx refactoring (#14443)
CISC Jun 29, 2025
b16b20f
SYCL: disable faulty fp16 exp kernel (#14395)
qnixsynapse Jun 29, 2025
1fff524
server : fix appearance of the chats list context menu for Safari (#1…
rntk Jun 29, 2025
7c7d265
server : support jinja extra template kwargs (Qwen3 enable_thinking f…
matteoserva Jun 29, 2025
91645b1
scripts : make the shell scripts cross-platform (#14341)
vedranmiletic Jun 30, 2025
34a1040
cmake : Remove redundant include path in CMakeLists.txt (#14452)
xiaobing318 Jun 30, 2025
c2eff62
test-backend-ops : disable llama test (#14461)
slaren Jun 30, 2025
4e1da8d
ggml-cpu: sycl: Re-enable exp f16 (#14462)
Rbiessy Jun 30, 2025
9d8378c
metal : disable fast-math for some cpy kernels (#14460)
ggerganov Jun 30, 2025
f98a7d6
memory : correctly handle failure in apply() (#14438)
ggerganov Jun 30, 2025
29d89c7
Add Conv2d for CPU (#14388)
am17an Jun 30, 2025
0f8e604
opencl : add GEGLU, REGLU, SWIGLU (#14456)
lhez Jul 1, 2025
e7e966e
ggml-cpu : "align corners" for bilinear upscale/downscale (ggml/1285)
Acly Jul 1, 2025
0887dad
sync : ggml
ggerganov Jul 1, 2025
41f8fea
ggml : remove trailing whitespace (#0)
ggerganov Jul 1, 2025
690ce28
add GELU_ERF (#14455)
CISC Jul 1, 2025
0d2092e
vulkan: Split large mul_mat_id to fit in shared memory (#14451)
jeffbolznv Jul 1, 2025
0994184
ci : disable fast-math for Metal GHA CI (#14478)
ggerganov Jul 1, 2025
affb1dc
ggml : Callback before abort (#14481)
ScaledLizard Jul 2, 2025
f42ffb9
github : add OpenCL backend to issue templates (#14492)
EZForever Jul 2, 2025
f2a2a58
opencl : update upscale to support align corners (#14488)
lhez Jul 2, 2025
4540d61
opencl : skip empty nodes on cgraph compute (#14491)
EZForever Jul 2, 2025
882076a
opencl : fix possible buffer overflow in dump_tensor (#14490)
jeffzhou2000 Jul 2, 2025
a522cda
ggml : support bcast ggml_soft_max_ext, ggml_flash_attn_ext (#14435)
ggerganov Jun 27, 2025
d3984d9
vulkan: support softmax/FA batch and broadcast (#14449)
jeffbolznv Jul 1, 2025
280cfd9
CUDA: add softmax broadcast (#14475)
am17an Jul 2, 2025
954a40b
ggml : add version function to get lib version (ggml/1286)
danbev Jul 2, 2025
7de925a
sync : ggml
ggerganov Jul 2, 2025
87fd08a
llama : initial Mamba-2 support (#9126)
compilade Jul 2, 2025
8274c5a
gguf-py : add support for chat template jinja files (#14508)
CISC Jul 2, 2025
6c55697
CUDA: add dynamic shared mem to softmax, refactor general usage (#14497)
am17an Jul 2, 2025
efa1066
ggml : remove kompute backend (#14501)
ggerganov Jul 3, 2025
389282a
ggml : fix FA mask dim 2 and 3 (#14505)
ggerganov Jul 3, 2025
797b32a
kv-cache : use ggml_set_rows (#14285)
ggerganov Jul 3, 2025
2396f44
ggml: backward pass for split swiglu (#14483)
JohannesGaessler Jul 3, 2025
85f709a
vulkan: support mixed/deepseekR1 FA head sizes (#14509)
jeffbolznv Jul 3, 2025
21a9e3c
opencl : broadcast for soft_max (#14510)
lhez Jul 3, 2025
f240e32
ggml : implement GEGLU_ERF and GEGLU_QUICK ops (#14445)
CISC Jul 3, 2025
ac5eb5d
batch : add n_used count (#14512)
ggerganov Jul 4, 2025
ed2e9d8
graph : prepare for 4D mask (#14515)
ggerganov Jul 4, 2025
5344176
batch : add optional for sequential equal split (#14511)
ggerganov Jul 4, 2025
fec2133
metal : disable fast math in all quantize kernels (#14528)
ggerganov Jul 4, 2025
99b8f73
test-backend-ops: add support for specifying output format (#14368)
yeahdongcn Jul 5, 2025
77ffd1f
opencl: add GELU_ERF (#14476)
CISC Jul 5, 2025
39dceac
vulkan: Handle updated FA dim2/3 definition (#14518)
jeffbolznv Jul 5, 2025
f854452
vulkan: fix rms_norm+mul fusion (#14545)
jeffbolznv Jul 6, 2025
d014fbc
vulkan: increase LOAD_VEC_A to 8 (IQ1/IQ2) or 4 (IQ3) (#14485)
netrunnereve Jul 6, 2025
8d25eb1
CUDA: add bf16 and i32 to getrows (#14529)
am17an Jul 7, 2025
0104b6a
llama : remove ggml_cont where possible (#14568)
CISC Jul 7, 2025
0fb5b47
llama : fix incorrect minicpm3 v_states shape (#14571)
CISC Jul 7, 2025
3d5a6e8
musa: fix build warnings (unused variable) (#14561)
yeahdongcn Jul 7, 2025
2a755e0
CUDA: add bilinear interpolation for upscale (#14563)
am17an Jul 8, 2025
b5940d7
cuda : fix rope with partial rotation and non-cont src (#14580)
ggerganov Jul 8, 2025
8ac5f58
vulkan: increase timeout for CI (#14574)
jeffbolznv Jul 8, 2025
61a57b8
model : add hunyuan moe (#14425)
ngxson Jul 8, 2025
839a7d9
server: Add ability to mount server at prefix (#14544)
oluwandabira Jul 8, 2025
a77eda3
vulkan : fix rope with partial rotation and non-cont src (#14582)
jeffbolznv Jul 8, 2025
93a7c18
memory : fix broken batch splits for recurrent cache (#14575)
compilade Jul 8, 2025
4c8daec
model : add SmolLM3 (#14581)
ngxson Jul 8, 2025
b1016aa
model : fix hunyuan moe chat template (#14584)
stevenkuang-tencent Jul 8, 2025
fbe57ae
vulkan: optimize flash attention split_k_reduce (#14554)
jeffbolznv Jul 8, 2025
79e2a37
convert : fix smollm3 jinja template (#14586)
ngxson Jul 9, 2025
0a67a01
model : add support for Falcon-H1 family (#14534)
ibrahimkhadraoui Jul 9, 2025
b883d1b
llama : remove unintended whitespace (#14592)
CISC Jul 9, 2025
fa016ac
model : add skt/A.X-4.0 model vocabulary (#14589)
Bing-su Jul 9, 2025
62eb62c
ggml : prevent integer overflow in gguf tensor size calculation (#14595)
Yuuoniy Jul 9, 2025
9eb6825
ggml : add ggml_scale_bias (#14417)
ngxson Jul 9, 2025
4418e12
llama : support Jamba hybrid Transformer-Mamba models (#7531)
compilade Jul 9, 2025
3762fb3
llama : remove llm_graph_input_one (#14603)
ngxson Jul 9, 2025
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -342,7 +342,7 @@ jobs:
cd build
export GGML_VK_VISIBLE_DEVICES=0
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 3600
ctest -L main --verbose --timeout 4200

ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
7 changes: 7 additions & 0 deletions common/arg.cpp
@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
add_opt(common_arg(
{"--api-prefix"}, "PREFIX",
string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
[](common_params & params, const std::string & value) {
params.api_prefix = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
1 change: 1 addition & 0 deletions common/common.h
@@ -370,6 +370,7 @@ struct common_params {

std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
std::string api_prefix = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
418 changes: 413 additions & 5 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions convert_hf_to_gguf_update.py
@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
]

# some models are known to be broken upstream, so we will skip them as exceptions
@@ -137,6 +138,12 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
# falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
]


20 changes: 11 additions & 9 deletions docs/development/HOWTO-add-model.md
@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv

### 2. Define the model architecture in `llama.cpp`

The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non-standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
The model params and tensors layout must be defined in `llama.cpp` source files:
1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
2. In `src/llama-arch.cpp`:
- Add the architecture name to the `LLM_ARCH_NAMES` map.
- Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.

NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.

### 3. Build the GGML graph implementation

This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.

Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.

Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
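To make the updated steps concrete, here is a rough sketch of how a hypothetical architecture would be wired into the files named above. The architecture name `foo`, the struct `llm_build_foo`, and the specific helper calls are illustrative only; follow an existing builder such as `llm_build_llama` in `src/llama-model.cpp` for the authoritative pattern.

```cpp
// Illustrative only — a hypothetical "foo" architecture, not part of this PR.

// src/llama-arch.h   : add the enum value
//     LLM_ARCH_FOO,
// src/llama-arch.cpp : register the name (and its tensor mappings in LLM_TENSOR_NAMES)
//     { LLM_ARCH_FOO, "foo" },

// src/llama-model.cpp : the graph builder; the constructor builds the whole forward graph
struct llm_build_foo : public llm_graph_context {
    llm_build_foo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
        : llm_graph_context(params) {
        ggml_tensor * cur = build_inp_embd(model.tok_embd);   // token embeddings
        // ... per-layer attention / FFN blocks go here ...
        cur = build_lora_mm(model.output, cur);                // final output projection
        res->t_logits = cur;
        ggml_build_forward_expand(gf, cur);
    }
};

// src/llama-model.cpp : llama_model::build_graph() — instantiate the builder
//     case LLM_ARCH_FOO:
//         {
//             llm = std::make_unique<llm_build_foo>(*this, params, gf);
//         } break;
```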

15 changes: 14 additions & 1 deletion ggml/include/ggml.h
@@ -495,7 +495,7 @@ extern "C" {
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_UPSCALE,
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_ROLL,
@@ -1297,6 +1297,19 @@ extern "C" {
struct ggml_tensor * a,
float s);

// x = s * a + b
GGML_API struct ggml_tensor * ggml_scale_bias(
struct ggml_context * ctx,
struct ggml_tensor * a,
float s,
float b);

GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float s,
float b);

// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set(
struct ggml_context * ctx,
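As a usage sketch for the new op (not part of this diff): the snippet below builds a one-op graph that computes `y = 2*x + 1` on the CPU backend. It assumes a plain CPU-only ggml build where `ggml_graph_compute_with_ctx` is declared in `ggml-cpu.h`.

```cpp
#include "ggml.h"
#include "ggml-cpu.h"

#include <stdio.h>

int main(void) {
    struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    for (int i = 0; i < 4; ++i) {
        ((float *) x->data)[i] = (float) i;                        // x = {0, 1, 2, 3}
    }

    struct ggml_tensor * y = ggml_scale_bias(ctx, x, 2.0f, 1.0f);  // y = s*x + b = 2*x + 1

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);

    for (int i = 0; i < 4; ++i) {
        printf("%g\n", ((float *) y->data)[i]);                    // expect 1 3 5 7
    }

    ggml_free(ctx);
    return 0;
}
```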
5 changes: 4 additions & 1 deletion ggml/src/ggml-cann/ggml-cann.cpp
@@ -2188,7 +2188,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_CLAMP:
@@ -2210,6 +2209,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_COUNT_EQUAL:
return true;
case GGML_OP_SCALE:
float bias;
memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
return bias == 0.0f; // TODO: support bias != 0.0f
case GGML_OP_SOFT_MAX:
// TODO: support broadcast
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
28 changes: 20 additions & 8 deletions ggml/src/ggml-cpu/ops.cpp
@@ -4643,9 +4643,11 @@ static void ggml_compute_forward_scale_f32(
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));

// scale factor
float v;
memcpy(&v, dst->op_params, sizeof(float));
float s; // scale factor
float b; // bias

memcpy(&s, (float *) dst->op_params + 0, sizeof(float));
memcpy(&b, (float *) dst->op_params + 1, sizeof(float));

const int ith = params->ith;
const int nth = params->nth;
@@ -4664,12 +4666,22 @@

const size_t nb1 = dst->nb[1];

for (int i1 = ir0; i1 < ir1; i1++) {
if (dst->data != src0->data) {
// src0 is same shape as dst => same indices
memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
if (b == 0.0f) {
for (int i1 = ir0; i1 < ir1; i1++) {
if (dst->data != src0->data) {
// src0 is same shape as dst => same indices
// TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy
memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
}
ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s);
}
} else {
for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_mad1_f32(nc,
(float *) ((char *) dst->data + i1*nb1),
(float *) ((char *) src0->data + i1*nb1),
s, b);
}
ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
}
}

39 changes: 39 additions & 0 deletions ggml/src/ggml-cpu/vec.h
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
#endif
}

inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
// scalar ; TODO: Write SVE code
for (int i = 0; i < n; ++i) {
y[i] = x[i]*s + b;
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));

GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);

GGML_F32_VEC ay[GGML_F32_ARR];

for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);

GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}

// leftovers
for (int i = np; i < n; ++i) {
y[i] = x[i]*s + b;
}
#endif
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = x[i]*s + b;
}
#endif
}

//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
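Note that, unlike `ggml_vec_mad_f32` (which accumulates into `y`), the new `ggml_vec_mad1_f32` overwrites `y` with `x*s + b`. Below is a standalone scalar reference of that contract, handy as a sanity check against any of the SIMD paths above (illustrative, not part of the diff).

```cpp
#include <cassert>
#include <cstddef>

// reference semantics of ggml_vec_mad1_f32: y[i] = x[i]*s + b
static void ref_mad1_f32(std::size_t n, float * y, const float * x, float s, float b) {
    for (std::size_t i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
}

int main() {
    const float x[4] = {0.0f, 1.0f, 2.0f, 3.0f};
    float y[4];
    ref_mad1_f32(4, y, x, 2.0f, 1.0f);
    assert(y[0] == 1.0f && y[1] == 3.0f && y[2] == 5.0f && y[3] == 7.0f);
    return 0;
}
```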
23 changes: 13 additions & 10 deletions ggml/src/ggml-cuda/common.cuh
@@ -176,17 +176,20 @@ static const char * cu_get_error_str(CUresult err) {
#endif

#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
do { \
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; \
const int id = ggml_cuda_get_device(); \
if (!shared_memory_limit_raised[id]) { \
CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
shared_memory_limit_raised[id] = true; \
} \
} while (0)
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
do { \
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
const int id = ggml_cuda_get_device(); \
if (!shared_memory_limit_raised[id]) { \
CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
shared_memory_limit_raised[id] = true; \
} \
} while (0)
#else
#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) do {} while (0)
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
do { \
GGML_UNUSED(nbytes); \
} while (0)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)

#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
16 changes: 8 additions & 8 deletions ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -299,14 +299,14 @@ static __global__ void flash_attn_tile_ext_f32(
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
GGML_UNUSED(ne31); GGML_UNUSED(ne32);
GGML_UNUSED(nb31); GGML_UNUSED(nb32);
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
16 changes: 9 additions & 7 deletions ggml/src/ggml-cuda/fattn-vec-f32.cuh
@@ -337,13 +337,15 @@ static __global__ void flash_attn_vec_ext_f32(
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
GGML_UNUSED(ne31); GGML_UNUSED(ne32);
GGML_UNUSED(nb31); GGML_UNUSED(nb32);
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
8 changes: 8 additions & 0 deletions ggml/src/ggml-cuda/getrows.cu
@@ -168,6 +168,10 @@ static void ggml_cuda_get_rows_switch_src0_type(
get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_I32:
get_rows_cuda_float((const int32_t *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_BF16:
get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
@@ -210,6 +214,10 @@ void get_rows_cuda(
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_I32:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (int32_t *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_F16:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
3 changes: 2 additions & 1 deletion ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3200,6 +3200,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
switch (op->src[0]->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
case GGML_TYPE_BF16:
case GGML_TYPE_I32:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -3373,7 +3375,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: