Merged (116 commits)
a1c004e
ggml : add ggml_flash_attn_ext API
ggerganov Jan 18, 2024
fa7ebcc
ggml : fix GQA support in ggml_flash_attn_ext
ggerganov Jan 19, 2024
c3cdfff
Merge branch 'master' into gg/flash-attn
ggerganov Jan 20, 2024
a9681fe
ggml : online attention (CPU)
ggerganov Jan 20, 2024
1173f49
metal : initial implementation
ggerganov Jan 20, 2024
528da75
metal : f16 precision
ggerganov Jan 21, 2024
52ae085
metal : reduce branches
ggerganov Jan 21, 2024
b973258
metal : specialize for head size
ggerganov Jan 21, 2024
8cde449
wip : 8 rows per simd group
ggerganov Jan 21, 2024
f31955f
wip : 4 rows per simd group
ggerganov Jan 21, 2024
a4b6341
wip : template for rows per warp
ggerganov Jan 21, 2024
77d08f3
metal : parallelize across KV size
ggerganov Jan 21, 2024
17720fa
metal : parallel reduce across heads
ggerganov Jan 21, 2024
1446a12
metal : efficient flash_attn_f16 implementation
ggerganov Jan 23, 2024
d917746
metal : avoid redundant loads of the attention
ggerganov Jan 25, 2024
432ad04
metal : scale and mask in matrix form
ggerganov Jan 25, 2024
40ea8cd
metal : fix comment
ggerganov Jan 25, 2024
f9ca5dc
llama : avoid ggml_cast, use F32 query
ggerganov Jan 25, 2024
6fea843
metal : add parallel reduce version (disabled)
ggerganov Jan 25, 2024
b3dd7d9
Merge branch 'master' into gg/flash-attn
ggerganov Jan 28, 2024
77f6976
metal : move output into local memory + optimize
ggerganov Jan 28, 2024
ecc466a
metal : add tests, fix scaling, support C > 32
ggerganov Jan 28, 2024
3a428a1
metal : improve precision
ggerganov Jan 28, 2024
8612864
ggml : fix f16 mad
ggerganov Jan 28, 2024
0ad44ba
Merge branch 'master' into gg/flash-attn
ggerganov Jan 28, 2024
134c81c
metal : minor
ggerganov Jan 28, 2024
1db22d7
metal : support Q > 8
ggerganov Jan 28, 2024
4794821
tests : add ATTN tests
ggerganov Jan 29, 2024
abeaf0d
metal : disable buffer allocation logs
ggerganov Jan 29, 2024
c6c1132
tests : more
ggerganov Jan 29, 2024
5fcb9c1
metal : faster inner loop for C == 32
ggerganov Jan 29, 2024
d073e4f
metal : fix array initialization
ggerganov Jan 30, 2024
78df552
tests : ifdef
ggerganov Jan 30, 2024
3d03bcb
Merge branch 'master' into gg/flash-attn
ggerganov Jan 30, 2024
2ddc9bb
Merge branch 'master' into gg/flash-attn
ggerganov Jan 31, 2024
8ad92dc
ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext
ggerganov Jan 31, 2024
910b15b
ggml : fix ggml_soft_max mask requirement
ggerganov Feb 1, 2024
2e46013
cuda : fix soft_max to use correct mask size
ggerganov Feb 1, 2024
5a19a9f
cuda : add flash_attn kernel (wip)
ggerganov Feb 1, 2024
41d136b
Merge branch 'master' into gg/flash-attn
ggerganov Feb 1, 2024
56e45a2
metal : optimize softmax for C > 32
ggerganov Feb 1, 2024
cda5a60
metal : optimize softmax
ggerganov Feb 1, 2024
c6769b9
tests : minor fix
ggerganov Feb 1, 2024
db1f3c4
cuda : avoid zeroing fragments
ggerganov Feb 1, 2024
12eaa22
tests : update dims
ggerganov Feb 2, 2024
b68a112
cuda : fix __hisinf() result check
ggerganov Feb 2, 2024
b150abe
cuda : avoid warp_reduce for smax
ggerganov Feb 3, 2024
7c34655
cuda : use int instead of int64_t
ggerganov Feb 3, 2024
1f8a592
cuda : make loops use the same loop values
ggerganov Feb 3, 2024
92472ea
cuda : unroll some of the loops
ggerganov Feb 3, 2024
c51f27c
cuda : avoid __hisinf branches
ggerganov Feb 3, 2024
b958151
cuda : use half2 in softmax
ggerganov Feb 3, 2024
a7b4715
cuda : switch to 1 warp for bs > 16
ggerganov Feb 3, 2024
3b1c4e7
cuda : speed-up reduce part of the kernel
ggerganov Feb 3, 2024
5b263dd
cuda : unroll Q*K^T loop
ggerganov Feb 3, 2024
e04ff39
cuda : fix -INF block check
ggerganov Feb 3, 2024
cfd9732
cuda : simplify softmax
ggerganov Feb 3, 2024
ef68fac
cuda : fix matrix names
ggerganov Feb 3, 2024
1846e92
cuda : minor
ggerganov Feb 4, 2024
6875997
Merge branch 'master' into gg/flash-attn
ggerganov Feb 12, 2024
31109ca
Merge branch 'master' into gg/flash-attn
ggerganov Feb 19, 2024
f249c99
llama : adapt to F16 KQ_pos
ggerganov Feb 19, 2024
02a645e
Merge branch 'master' into gg/flash-attn
ggerganov Mar 3, 2024
6aefd11
llama : adapt new models to F16 KQ_mask
ggerganov Mar 3, 2024
e307882
Merge branch 'master' into gg/flash-attn
ggerganov Mar 4, 2024
58c7f61
ggml : fix F16 store (ARM NEON)
ggerganov Mar 4, 2024
9495d39
Merge branch 'master' into gg/flash-attn
ggerganov Mar 22, 2024
3a468e6
llama : fix type of KQ_mask and KQ_pos
ggerganov Mar 22, 2024
0953212
ggml : fix CPU soft_max
ggerganov Mar 22, 2024
e425810
tests : add hs=256
ggerganov Mar 24, 2024
013721d
Merge branch 'master' into gg/flash-attn
ggerganov Mar 27, 2024
6be02b5
cuda : fix build
ggerganov Mar 27, 2024
57c03b7
metal : improve perf via smaller int registers
ggerganov Mar 28, 2024
3e318e7
Merge branch 'master' into gg/flash-attn
ggerganov Mar 28, 2024
08e69c5
cuda : adapt soft_max to F16 mask and pos
ggerganov Mar 28, 2024
75aa7b4
CUDA: faster FlashAttention, kernel for bs == 1
JohannesGaessler Mar 29, 2024
d59ac67
16 cols for Phi-2
JohannesGaessler Mar 30, 2024
81da919
no vec for hs, no hs==256 ncols==32 for Volta
JohannesGaessler Mar 30, 2024
269374e
adjust kernel selection logic
JohannesGaessler Mar 31, 2024
cca6d02
4 warps, 256 stride for all D
JohannesGaessler Mar 31, 2024
68d793b
no ncols == 64
JohannesGaessler Apr 1, 2024
3f777ac
Multiple parallel blocks for batch size 1
JohannesGaessler Apr 1, 2024
e1ecd3b
fix compile warnings
JohannesGaessler Apr 2, 2024
bb0d51a
fix excessive KQ_b loads
JohannesGaessler Apr 2, 2024
c63dfdf
fix cmake build
JohannesGaessler Apr 2, 2024
ee19a4a
fix KV cache padding, NaN from INFINITY (#6438)
JohannesGaessler Apr 2, 2024
89961de
Merge branch 'master' into gg/flash-attn
ggerganov Apr 5, 2024
4fbd809
gguf : add special tokens metadata for FIM/Infill (#6689)
danbev Apr 16, 2024
58227ff
perplexity : require positive --ctx-size arg (#6695)
ggerganov Apr 16, 2024
8a56075
gritlm : add --outdir option to hf.sh script (#6699)
danbev Apr 16, 2024
f4dea7d
llama : add qwen2moe (#6074)
simonJJJ Apr 16, 2024
dbceec8
llama : add StableLM2 12B (#6635)
ashishdatta Apr 16, 2024
8cc91dc
ggml : add llamafile sgemm (#6414)
jart Apr 16, 2024
666867b
ggml : fix llamafile sgemm wdata offsets (#6710)
ggerganov Apr 16, 2024
532c173
llama : make general.name optional (#6709)
ggerganov Apr 16, 2024
facb8b5
convert : fix autoawq gemma (#6704)
dengzheng-cloud Apr 16, 2024
2c41180
Merge branch 'master' into gg/flash-attn
ggerganov Apr 17, 2024
599ce84
llama : flash_attn cparam + fix defrag
ggerganov Apr 17, 2024
4053857
server: support flash_attn param
phymbert Apr 17, 2024
8dd1ec8
readme : add UI (#6724)
yaroslavyaroslav Apr 17, 2024
3b8f1ec
llamafile : tmp disable + build sgemm.o when needed (#6716)
ggerganov Apr 17, 2024
5668c79
server: bench: enable flash_attn param
phymbert Apr 17, 2024
c71bfd7
llama : fix compatibility with old 2 expert models (#6735)
slaren Apr 18, 2024
34f93bb
CUDA: refactor host code, dyn. par. blocks
JohannesGaessler Apr 9, 2024
6a3b842
fix flash_attn_vec_f16 race condition
JohannesGaessler Apr 13, 2024
ef9e159
flush softmax exp below threshold to 0
JohannesGaessler Apr 15, 2024
a5b0e2d
store temp KQ in registers
JohannesGaessler Apr 16, 2024
0bc67dd
Calculate KQ as FP32 if KQV has GGML_PREC_F32
JohannesGaessler Apr 16, 2024
2f538b9
Add __hgt2_mask implementation for CUDA 11
JohannesGaessler Apr 17, 2024
87968de
fix KQ FP32 precision for parallel_blocks > 1
JohannesGaessler Apr 17, 2024
260cdb2
llama-bench : add -fa,--flash-attn arg
ggerganov Apr 18, 2024
105332c
metal : add BS=1 kernel for flash attention (#6508)
ggerganov Apr 18, 2024
e11b2e6
Qwen2 : assume tied weights if lm_head/output weights is missing (#6738)
jklj077 Apr 18, 2024
fa9e8c6
Merge branch 'master' into gg/flash-attn
ggerganov Apr 18, 2024
c16a7c2
metal : use F32 attention accumulators
ggerganov Apr 18, 2024
9ca8698
batched-bench : add fattn arg
ggerganov Apr 18, 2024
40 changes: 31 additions & 9 deletions CMakeLists.txt
@@ -43,6 +43,18 @@ else()
set(LLAMA_METAL_DEFAULT OFF)
endif()

# TODO: fix this for Android CI
# https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
# set(LLAMA_LLAMAFILE_DEFAULT OFF)
#else()
# set(LLAMA_LLAMAFILE_DEFAULT ON)
#endif()

# TODO: temporary disable until MoE is fixed
# https://github.com/ggerganov/llama.cpp/pull/6716
set(LLAMA_LLAMAFILE_DEFAULT OFF)

# general
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF)
@@ -88,6 +100,7 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
@@ -286,6 +299,7 @@ if (LLAMA_METAL)
${METALKIT_FRAMEWORK}
)
endif()

if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@@ -368,6 +382,13 @@ if (LLAMA_BLAS)
endif()
endif()

if (LLAMA_LLAMAFILE)
add_compile_definitions(GGML_USE_LLAMAFILE)

set(GGML_HEADERS_LLAMAFILE sgemm.h)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
endif()

if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
@@ -1151,15 +1172,16 @@ add_library(ggml OBJECT
ggml-backend.h
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
)

target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
16 changes: 14 additions & 2 deletions Makefile
@@ -384,6 +384,15 @@ ifdef LLAMA_OPENBLAS
MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS

# TODO: temporary disable until MoE is fixed
# https://github.com/ggerganov/llama.cpp/pull/6716
LLAMA_NO_LLAMAFILE := 1

ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJS += sgemm.o
endif

ifdef LLAMA_BLIS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -480,11 +489,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com

ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)

endif # LLAMA_CUDA

ifdef LLAMA_CLBLAST

MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -603,6 +610,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI

ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif

GF_CC := $(CC)
include scripts/get-flags.mk

1 change: 1 addition & 0 deletions Package.swift
@@ -4,6 +4,7 @@ import PackageDescription

var sources = [
"ggml.c",
"sgemm.cpp",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
2 changes: 2 additions & 0 deletions README.md
@@ -189,6 +189,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)

*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

---
15 changes: 8 additions & 7 deletions build.zig
@@ -112,6 +112,7 @@ pub fn build(b: *std.build.Builder) !void {
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

const ggml = make.obj("ggml", "ggml.c");
const sgemm = make.obj("sgemm", "sgemm.cpp");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
@@ -128,14 +129,14 @@ pub fn build(b: *std.build.Builder) !void {
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");

_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });

const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
79 changes: 79 additions & 0 deletions common/common.cpp
@@ -108,6 +108,79 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

#if defined(__x86_64__) && defined(__linux__)
#include <pthread.h>

static void cpuid(unsigned leaf, unsigned subleaf,
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
__asm__("movq\t%%rbx,%%rsi\n\t"
"cpuid\n\t"
"xchgq\t%%rbx,%%rsi"
: "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
: "0"(leaf), "2"(subleaf));
}

static int pin_cpu(int cpu) {
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(cpu, &mask);
return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}

static bool is_hybrid_cpu(void) {
unsigned eax, ebx, ecx, edx;
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
return !!(edx & (1u << 15));
}

static bool is_running_on_efficiency_core(void) {
unsigned eax, ebx, ecx, edx;
cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
int intel_atom = 0x20;
int core_type = (eax & 0xff000000u) >> 24;
return core_type == intel_atom;
}

static int count_math_cpus(int cpu_count) {
int result = 0;
for (int cpu = 0; cpu < cpu_count; ++cpu) {
if (pin_cpu(cpu)) {
return -1;
}
if (is_running_on_efficiency_core()) {
continue; // efficiency cores harm lockstep threading
}
++cpu; // hyperthreading isn't useful for linear algebra
++result;
}
return result;
}

#endif // __x86_64__ && __linux__

/**
* Returns number of CPUs on system that are useful for math.
*/
int get_math_cpu_count() {
#if defined(__x86_64__) && defined(__linux__)
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
if (cpu_count < 1) {
return get_num_physical_cores();
}
if (is_hybrid_cpu()) {
cpu_set_t affinity;
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
int result = count_math_cpus(cpu_count);
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
if (result > 0) {
return result;
}
}
}
#endif
return get_num_physical_cores();
}

void process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
@@ -827,6 +900,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cont_batching = true;
return true;
}
if (arg == "-fa" || arg == "--flash-attn") {
params.flash_attn = true;
return true;
}
if (arg == "--color") {
params.use_color = true;
return true;
@@ -1763,6 +1840,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;

cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
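
Editorial note: the hunk above forwards the new flag into the context params. A minimal sketch of setting the new cparam directly, assuming the llama.h on this branch exposes the flash_attn field added by the PR and the existing llama_context_default_params / llama_new_context_with_model entry points; `model` is assumed to be loaded by the caller:

#include "llama.h"

// Sketch only: enable the new flash_attn context param when creating a context.
static llama_context * make_ctx_with_flash_attn(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // route attention through ggml_flash_attn_ext
    return llama_new_context_with_model(model, cparams);
}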
@@ -2600,6 +2678,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
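
Editorial note: taken together, the common.cpp changes above give -fa/--flash-attn a parse path and make get_math_cpu_count() the default thread count. A hedged end-to-end sketch of how a tool built on common.h picks these up, assuming only the existing gpt_params_parse and llama_context_params_from_gpt_params helpers:

#include <cstdio>
#include "common.h"

int main(int argc, char ** argv) {
    gpt_params params;                            // n_threads now defaults to get_math_cpu_count()
    if (!gpt_params_parse(argc, argv, params)) {  // understands -fa / --flash-attn
        return 1;
    }
    llama_context_params cparams = llama_context_params_from_gpt_params(params);
    // cparams.flash_attn mirrors params.flash_attn, as wired in the hunk above
    fprintf(stderr, "flash_attn: %s, n_threads: %d\n",
            cparams.flash_attn ? "true" : "false", params.n_threads);
    return 0;
}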
4 changes: 3 additions & 1 deletion common/common.h
@@ -39,6 +39,7 @@ extern char const *LLAMA_BUILD_TARGET;

struct llama_control_vector_load_info;

int get_math_cpu_count();
int32_t get_num_physical_cores();

//
@@ -48,7 +49,7 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

int32_t n_threads = get_num_physical_cores();
int32_t n_threads = get_math_cpu_count();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
@@ -147,6 +148,7 @@ struct gpt_params {
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens