From 0fd6c1f015f6cccf3b527f7dbd8386a434728126 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 10:12:29 +0200 Subject: [PATCH 01/48] embedding : print cosine similarity (#899) --- common/common.cpp | 13 +++++++++++++ common/common.h | 1 + examples/embedding/embedding.cpp | 21 ++++++++++++++++----- examples/gritlm/gritlm.cpp | 26 ++++++-------------------- 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 73b1b61ba1b74..58fbd05aa3516 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1877,3 +1877,16 @@ void llama_embd_normalize(const float * inp, float * out, int n) { } } +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){ + double sum = 0.0; + double sum1 = 0.0; + double sum2 = 0.0; + + for (int i = 0; i < n; i++) { + sum += embd1[i] * embd2[i]; + sum1 += embd1[i] * embd1[i]; + sum2 += embd2[i] * embd2[i]; + } + + return sum / (sqrt(sum1) * sqrt(sum2)); +} diff --git a/common/common.h b/common/common.h index 0f178b9eb1de3..d250eef8b2b6b 100644 --- a/common/common.h +++ b/common/common.h @@ -268,3 +268,4 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40 void llama_embd_normalize(const float * inp, float * out, int n); +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 49302a199977e..f390c40610446 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -168,14 +168,25 @@ int main(int argc, char ** argv) { batch_decode(ctx, batch, out, s, n_embd); // print first 3 embeddings + fprintf(stdout, "\n"); for (int j = 0; j < std::min(3, n_prompts); j++) { - fprintf(stderr, "embedding %d: ", j); - for (int i = 0; i < n_embd; i++) { - fprintf(stderr, "%f ", emb[j * n_embd + i]); + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < std::min(16, n_embd); i++) { + fprintf(stdout, "%f ", emb[j * n_embd + i]); } - fprintf(stderr, "\n\n"); + fprintf(stdout, "\n"); + } + + // print cosine similarity matrix + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "\n"); } - fprintf(stderr, "\n"); // clean up llama_print_timings(ctx); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 3d4b085d69b6f..52fd719b38ee5 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -6,22 +6,6 @@ // #define GRIT_DEBUG -static float dot_product(const std::vector & v1, const std::vector & v2) { - float dot = 0.0f; - for (uint64_t i = 0; i < v1.size(); ++i) { - dot += v1[i] * v2[i]; - } - return dot; -} - -static float norm(const std::vector & v) { - return std::sqrt(dot_product(v, v)); -} - -static float cosine_similarity(const std::vector & v1, const std::vector & v2) { - return dot_product(v1, v2) / (norm(v1) * norm(v2)); -} - static std::vector> encode(llama_context * ctx, const std::vector & sentences, const std::string & instruction) { std::vector> result; @@ -203,10 +187,12 @@ int main(int argc, char * argv[]) { const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); - const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]); - const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]); - const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]); - const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]); + const int n_embd = llama_n_embd(mdl); + + const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1); From 381da2d9f0940d7009e3e918bed36338c8ff2fbb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 11:55:23 +0200 Subject: [PATCH 02/48] metal : build metallib + fix embed path (#6015) * metal : build metallib + fix embed path ggml-ci * metal : fix embed build + update library load logic ggml-ci * metal : fix embeded library build ggml-ci * ci : fix iOS builds to use embedded library --- .github/workflows/build.yml | 2 ++ .gitignore | 2 ++ CMakeLists.txt | 70 ++++++++++++++++++++----------------- Makefile | 15 ++++---- ggml-metal.m | 49 +++++++++++++++++--------- ggml-metal.metal | 3 -- 6 files changed, 83 insertions(+), 58 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d39cd6bc338e0..0da01d5ba6ead 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -333,6 +333,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DLLAMA_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ @@ -361,6 +362,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DLLAMA_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ diff --git a/.gitignore b/.gitignore index d28f4d1b801ea..1ad8d929b87e8 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,8 @@ .vscode/ .idea/ +ggml-metal-embed.metal + lcov-report/ gcovr-report/ diff --git a/CMakeLists.txt b/CMakeLists.txt index a8abf4088a01c..3ac2804a6881a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,9 +200,6 @@ if (LLAMA_METAL) add_compile_definitions(GGML_METAL_NDEBUG) endif() - # get full path to the file - #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") - # copy ggml-common.h and ggml-metal.metal to bin directory configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) @@ -211,53 +208,62 @@ if (LLAMA_METAL) enable_language(ASM) add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h") set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") - set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s") + + # merge ggml-common.h and ggml-metal.metal into a single file + set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") + set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") add_custom_command( - OUTPUT ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY} - DEPENDS ${METALLIB_SOURCE} + OUTPUT ${METALLIB_EMBED_ASM} + COMMAND echo "Embedding Metal library" + COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED} + COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + DEPENDS ggml-metal.metal ggml-common.h COMMENT "Generate assembly for embedded Metal library" ) - set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY}) - endif() - - if (LLAMA_METAL_SHADER_DEBUG) - # custom command to do the following: - # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air - # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib - # - # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works - # disabling fast math is needed in order to pass tests/test-backend-ops - # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 - # note: unfortunately, we have to call it default.metallib instead of ggml.metallib - # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 - set(XC_FLAGS -fno-fast-math -fno-inline -g) - if (LLAMA_QKK_64) - set(XC_FLAGS ${XC_FLAGS} -DQK_K=64) + set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) + else() + if (LLAMA_METAL_SHADER_DEBUG) + # custom command to do the following: + # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air + # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib + # + # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works + # disabling fast math is needed in order to pass tests/test-backend-ops + # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 + # note: unfortunately, we have to call it default.metallib instead of ggml.metallib + # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 + set(XC_FLAGS -fno-fast-math -fno-inline -g) + else() + set(XC_FLAGS -O3) endif() add_custom_command( OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - DEPENDS ggml-metal.metal + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal + DEPENDS ggml-metal.metal ggml-common.h COMMENT "Compiling Metal kernels" - ) + ) add_custom_target( ggml-metal ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - ) - endif() + ) + endif() # LLAMA_METAL_EMBED_LIBRARY set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${FOUNDATION_LIBRARY} diff --git a/Makefile b/Makefile index db9968efbddd8..cb597b209181b 100644 --- a/Makefile +++ b/Makefile @@ -557,15 +557,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ ifdef LLAMA_METAL_EMBED_LIBRARY -ggml-metal-embed.o: ggml-metal.metal +ggml-metal-embed.o: ggml-metal.metal ggml-common.h @echo "Embedding Metal library" + @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal $(eval TEMP_ASSEMBLY=$(shell mktemp)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) - @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + @echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) @$(AS) $(TEMP_ASSEMBLY) -o $@ @rm -f ${TEMP_ASSEMBLY} endif diff --git a/ggml-metal.m b/ggml-metal.m index 3a5476c52f1a5..c3451a79b103f 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -280,6 +280,11 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ id metal_library; // load library + // + // - first check if the library is embedded + // - then check if the library is in the bundle + // - if not found, load the source and compile it + // - if that fails, return NULL { NSBundle * bundle = nil; #ifdef SWIFT_PACKAGE @@ -287,12 +292,21 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ #else bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; #endif + NSError * error = nil; - NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"]; - if (libPath != nil) { + +#if GGML_METAL_EMBED_LIBRARY + const bool try_metallib = false; +#else + const bool try_metallib = true; +#endif + + NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; + if (try_metallib && path_lib != nil) { // pre-compiled library found - NSURL * libURL = [NSURL fileURLWithPath:libPath]; - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); + NSURL * libURL = [NSURL fileURLWithPath:path_lib]; + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); + metal_library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); @@ -305,31 +319,34 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ extern const char ggml_metallib_start[]; extern const char ggml_metallib_end[]; - NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; + NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; #else GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - NSString * sourcePath; - NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; + NSString * path_source; + NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; - GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil"); + GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil"); - if (ggmlMetalPathResources) { - sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; + if (path_resource) { + path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"]; } else { - sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; } - if (sourcePath == nil) { + + if (path_source == nil) { GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); - sourcePath = @"ggml-metal.metal"; + path_source = @"ggml-metal.metal"; } - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); - NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; + + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); + + NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; if (error) { GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } -#endif +#endif // GGML_METAL_EMBED_LIBRARY @autoreleasepool { // dictionary of preprocessor macros diff --git a/ggml-metal.metal b/ggml-metal.metal index ebf2f5b478e46..63de563251d89 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -4,9 +4,6 @@ #include -#define GGML_COMMON_IMPL_METAL -#include "ggml-common.h" - using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) From 68265ebfc6a1bed022973ea0c3145be1450b7e70 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 12:37:20 +0200 Subject: [PATCH 03/48] embedding : print all resulting embeddings (#899) --- examples/embedding/embedding.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index f390c40610446..895469a315271 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -167,9 +167,9 @@ int main(int argc, char ** argv) { float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd); - // print first 3 embeddings + // print the first part of the embeddings fprintf(stdout, "\n"); - for (int j = 0; j < std::min(3, n_prompts); j++) { + for (int j = 0; j < n_prompts; j++) { fprintf(stdout, "embedding %d: ", j); for (int i = 0; i < std::min(16, n_embd); i++) { fprintf(stdout, "%f ", emb[j * n_embd + i]); From 3fe8d7a17f84bd721cd4d8db35365da44b69f68b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 12:38:37 +0200 Subject: [PATCH 04/48] ggml : designate enum vals for integer types (#6050) --- ggml.h | 64 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/ggml.h b/ggml.h index 1171088a96526..ab26c8f5908c7 100644 --- a/ggml.h +++ b/ggml.h @@ -337,24 +337,24 @@ extern "C" { struct ggml_object; struct ggml_context; + // NOTE: always add types at the end of the enum to keep backward compatibility enum ggml_type { - GGML_TYPE_F32 = 0, - GGML_TYPE_F16 = 1, - GGML_TYPE_Q4_0 = 2, - GGML_TYPE_Q4_1 = 3, + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, // GGML_TYPE_Q4_2 = 4, support has been removed - // GGML_TYPE_Q4_3 (5) support has been removed - GGML_TYPE_Q5_0 = 6, - GGML_TYPE_Q5_1 = 7, - GGML_TYPE_Q8_0 = 8, - GGML_TYPE_Q8_1 = 9, - // k-quantizations - GGML_TYPE_Q2_K = 10, - GGML_TYPE_Q3_K = 11, - GGML_TYPE_Q4_K = 12, - GGML_TYPE_Q5_K = 13, - GGML_TYPE_Q6_K = 14, - GGML_TYPE_Q8_K = 15, + // GGML_TYPE_Q4_3 = 5, support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, GGML_TYPE_IQ2_XXS = 16, GGML_TYPE_IQ2_XS = 17, GGML_TYPE_IQ3_XXS = 18, @@ -363,9 +363,9 @@ extern "C" { GGML_TYPE_IQ3_S = 21, GGML_TYPE_IQ2_S = 22, GGML_TYPE_IQ4_XS = 23, - GGML_TYPE_I8, - GGML_TYPE_I16, - GGML_TYPE_I32, + GGML_TYPE_I8 = 24, + GGML_TYPE_I16 = 25, + GGML_TYPE_I32 = 26, GGML_TYPE_COUNT, }; @@ -383,20 +383,20 @@ extern "C" { // model file types enum ggml_ftype { - GGML_FTYPE_UNKNOWN = -1, - GGML_FTYPE_ALL_F32 = 0, - GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors - GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors - GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors + GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors From 3ca23481dd309bd51cc31c73a4cc34f922cc372f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= Date: Thu, 14 Mar 2024 04:40:14 -0600 Subject: [PATCH 05/48] gguf-py : add support for I8, I16 and I32 (#6045) * Refactor dtype handling to be extensible This code is equivalent as before, but now it is prepared to easily add more NumPy dtypes. * Add support for I8, I16 and I32 These types are allowed in the GGUF specification. * Add support for I8, I16 and I32 to gguf_writer * Add support for I8, I16, I32 to gguf_reader --- gguf-py/gguf/constants.py | 6 ++++++ gguf-py/gguf/gguf_reader.py | 9 +++++++++ gguf-py/gguf/gguf_writer.py | 16 ++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b23badb1019c1..99f71f0a1aa29 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -661,6 +661,9 @@ class GGMLQuantizationType(IntEnum): IQ3_S = 21 IQ2_S = 22 IQ4_XS = 23 + I8 = 24 + I16 = 25 + I32 = 26 class GGUFEndian(IntEnum): @@ -727,6 +730,9 @@ def get_type(val: Any) -> GGUFValueType: GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), + GGMLQuantizationType.I8: (1, 1), + GGMLQuantizationType.I16: (1, 2), + GGMLQuantizationType.I32: (1, 4), } diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 5b6d4ba6bcce9..1c10f57538992 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -248,6 +248,15 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: elif ggml_type == GGMLQuantizationType.F16: item_count = n_elems item_type = np.float16 + elif ggml_type == GGMLQuantizationType.I8: + item_count = n_elems + item_type = np.int8 + elif ggml_type == GGMLQuantizationType.I16: + item_count = n_elems + item_type = np.int16 + elif ggml_type == GGMLQuantizationType.I32: + item_count = n_elems + item_type = np.int32 else: item_count = n_bytes item_type = np.uint8 diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index e49c5db6866a2..9c1eeac318c7d 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -196,9 +196,6 @@ def add_tensor_info( if self.state is not WriterState.EMPTY: raise ValueError(f'Expected output file to be empty, got {self.state}') - if raw_dtype is None and tensor_dtype not in (np.float32, np.float16): - raise ValueError("Only F32 and F16 tensors are supported for now") - encoded_name = name.encode("utf8") self.ti_data += self._pack("Q", len(encoded_name)) self.ti_data += encoded_name @@ -207,7 +204,18 @@ def add_tensor_info( for i in range(n_dims): self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: - dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 + if tensor_shape == np.float32: + dtype = GGMLQuantizationType.F32 + elif tensor_dtype == np.float16: + dtype = GGMLQuantizationType.F16 + elif tensor_dtype == np.int8: + dtype = GGMLQuantizationType.I8 + elif tensor_dtype == np.int16: + dtype = GGMLQuantizationType.I16 + elif tensor_dtype == np.int32: + dtype = GGMLQuantizationType.I32 + else: + raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now") else: dtype = raw_dtype self.ti_data += self._pack("I", dtype) From 2c4fb69246834503db7b78bcbedcef506bbc60c4 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Thu, 14 Mar 2024 11:56:48 +0100 Subject: [PATCH 06/48] llama : optimize defrag moves + fix fragmentation calculation (#6037) * attempt to reduce the impact of a worst-case scenario * fragmentation calculation fix * Update llama.cpp --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/llama.cpp b/llama.cpp index 38e7036a72720..ff467c5756a1a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9036,8 +9036,8 @@ static int llama_decode_internal( //llama_synchronize(&lctx); // decide if we need to defrag the kv cache - if (cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f; + if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { + const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { @@ -9069,6 +9069,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // number of cells moved uint32_t n_moves = 0; + // each move requires 6*n_layer tensors (see build_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer); + // determine which KV cells to move where // // cell i moves to ids[i] @@ -9095,15 +9100,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { nh++; } - // each move requires 6*n_layer tensors (see build_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - // - if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) { - // the graph is too big, we cannot move more cells - break; - } - uint32_t nf = 0; uint32_t is = n_kv - 1; @@ -9133,11 +9129,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // are we moving a continuous block of memory? bool cont = false; + // should we stop searching for the next move? + bool stop = false; + // go back and move the nf cells to the hole for (; i1 < n_kv; ++i1) { auto & cell1 = kv_self.cells[i1]; if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + cont = false; continue; } @@ -9164,6 +9168,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { } } + if (stop || n_moves == max_moves) { + break; + } + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); i0 += nh - 1; From a44bc969e4cd62ca9f4332e17fe3c51f2093e7c6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 13:13:06 +0200 Subject: [PATCH 07/48] llama : fix typo --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index ff467c5756a1a..eb48b1e90c8d1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3932,7 +3932,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); - LLAMA_LOG_INFO("%s: causal attm = %d\n", __func__, hparams.causal_attn); + LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn); LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); From 43241adf22e8231ffaf3827d2c9310cc0ffd5ac5 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 14 Mar 2024 12:15:39 +0100 Subject: [PATCH 08/48] server: disable debug release type sanitizer, simplify trigger (#6047) - increase time out for server - do not fail fast --- .github/workflows/server.yml | 17 ++++++++--------- examples/server/tests/features/steps/steps.py | 9 ++++++++- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index e385e03f3a245..5e38b3547c659 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -25,17 +25,14 @@ jobs: strategy: matrix: sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] + build_type: [Debug] include: - build_type: Release sanitizer: "" - exclude: - - build_type: Release - sanitizer: ADDRESS - - build_type: Release + - build_type: Debug sanitizer: THREAD - - build_type: Release - sanitizer: UNDEFINED + disabled_on_pr: true + fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken container: image: ubuntu:latest @@ -81,13 +78,14 @@ jobs: - name: Tests id: server_integration_tests + if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests PORT=8888 ./tests.sh - name: Slow tests id: server_integration_tests_slow - if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }} + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow @@ -124,13 +122,14 @@ jobs: - name: Tests id: server_integration_tests + if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests id: server_integration_tests_slow - if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }} + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests behave.exe --stop --no-skipped --no-capture --tags slow diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index cfa9f96ec5306..a59a52d21748a 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -119,6 +119,10 @@ def step_server_metrics(context): def step_start_server(context): start_server_background(context) attempts = 0 + max_attempts = 20 + if 'GITHUB_ACTIONS' in os.environ: + max_attempts *= 2 + while True: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: result = sock.connect_ex((context.server_fqdn, context.server_port)) @@ -126,7 +130,7 @@ def step_start_server(context): print("\x1b[33;46mserver started!\x1b[0m") return attempts += 1 - if attempts > 20: + if attempts > max_attempts: assert False, "server not started" print(f"waiting for server to start, connect error code = {result}...") time.sleep(0.1) @@ -943,6 +947,9 @@ async def wait_for_health_status(context, print(f"Starting checking for health for expected_health_status={expected_health_status}\n") interval = 0.5 counter = 0 + if 'GITHUB_ACTIONS' in os.environ: + timeout *= 2 + async with aiohttp.ClientSession() as session: while True: async with await session.get(f'{base_url}/health', params=params) as health_response: From 15a333260ab637a040ed0864c206a2ceaf806bb8 Mon Sep 17 00:00:00 2001 From: Jian Liao Date: Thu, 14 Mar 2024 04:18:23 -0700 Subject: [PATCH 09/48] readme : improve readme for Llava-1.6 example (#6044) Co-authored-by: Jian Liao --- examples/llava/README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 35e6d9e5d00f7..67cb0f22b7d95 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -63,12 +63,20 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director ```console git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b ``` -2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: + +2) Install the required Python packages: + +```sh +pip install -r examples/llava/requirements.txt +``` + +3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: ```console python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ ``` - you will find a llava.projector and a llava.clip file in your model directory -3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: + +4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: ```console mkdir vit cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin @@ -76,18 +84,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json ``` -4) Create the visual gguf model: +5) Create the visual gguf model: ```console python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision ``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP -5) Then convert the model to gguf format: +6) Then convert the model to gguf format: ```console python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` -6) And finally we can run the llava-cli using the 1.6 model version: +7) And finally we can run the llava-cli using the 1.6 model version: ```console ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 ``` From 77178eedc83d49f31bf757d8e12315d76460be78 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 13:32:14 +0200 Subject: [PATCH 10/48] gguf-py : fix dtype check (#6045) --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 9c1eeac318c7d..4d389be951d72 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -204,7 +204,7 @@ def add_tensor_info( for i in range(n_dims): self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: - if tensor_shape == np.float32: + if tensor_dtype == np.float32: dtype = GGMLQuantizationType.F32 elif tensor_dtype == np.float16: dtype = GGMLQuantizationType.F16 From 044ec4b2a567f649459ccd20af2f387c784faa51 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 15:14:14 +0200 Subject: [PATCH 11/48] embedding : add EOS token if not present (#899) --- examples/embedding/embedding.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 895469a315271..cbf9aa2b560dd 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -112,13 +112,20 @@ int main(int argc, char ** argv) { // tokenize the prompts and trim std::vector> inputs; for (const auto & prompt : prompts) { - auto inp = ::llama_tokenize(ctx, prompt, true); + auto inp = ::llama_tokenize(ctx, prompt, true, false); if (inp.size() > n_batch) { inp.resize(n_batch); } inputs.push_back(inp); } + // add eos if not present + for (auto & inp : inputs) { + if (inp.empty() || inp.back() != llama_token_eos(model)) { + inp.push_back(llama_token_eos(model)); + } + } + // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) inputs.size(); i++) { @@ -172,7 +179,7 @@ int main(int argc, char ** argv) { for (int j = 0; j < n_prompts; j++) { fprintf(stdout, "embedding %d: ", j); for (int i = 0; i < std::min(16, n_embd); i++) { - fprintf(stdout, "%f ", emb[j * n_embd + i]); + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); } fprintf(stdout, "\n"); } From 69ff61397d2b7b550dcdda4a35b35128892408b0 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Thu, 14 Mar 2024 17:21:56 +0100 Subject: [PATCH 12/48] llama : support models without vocabulary (#5798) * additional methods to read model and ctx parameters * vocab size as a part of a model metadata * models without vocabulary, convert.py part * models without vocabulary, llama.cpp part * PR clean up * converter scrypt fixes * llama_vocab_type update (renamed the new key) * pr review fixes * revert function renaming * one more NoVocab assert --- convert.py | 126 +++++++++++++++++++++--------------- gguf-py/gguf/constants.py | 2 + gguf-py/gguf/gguf_writer.py | 3 + llama.cpp | 92 +++++++++++++++++--------- llama.h | 7 +- 5 files changed, 142 insertions(+), 88 deletions(-) diff --git a/convert.py b/convert.py index c15f8c47ea4f7..161430f3e717e 100755 --- a/convert.py +++ b/convert.py @@ -332,6 +332,9 @@ def load(model_plus: ModelPlus) -> Params: # class BpeVocab: + tokenizer_model = "gpt2" + name = "bpe" + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) if isinstance(self.bpe_tokenizer.get('model'), dict): @@ -390,6 +393,9 @@ def __repr__(self) -> str: class SentencePieceVocab: + tokenizer_model = "llama" + name = "spm" + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: dict[str, int] @@ -453,6 +459,9 @@ def __repr__(self) -> str: class HfVocab: + tokenizer_model = "llama" + name = "hfft" + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None: try: from transformers import AutoTokenizer @@ -553,7 +562,15 @@ def __repr__(self) -> str: return f"" -Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab" +class NoVocab: + tokenizer_model = "no_vocab" + name = "no_vocab" + + def __repr__(self) -> str: + return "" + + +Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab" # @@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N # Handle special case where the model's vocab size is not set if params.n_vocab == -1: raise ValueError( - f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?" + f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}" ) + if isinstance(vocab, NoVocab): + return # model has no vocab # Check for a vocab size mismatch if params.n_vocab == vocab.vocab_size: @@ -977,6 +996,7 @@ def add_meta_arch(self, params: Params) -> None: name = str(params.path_model.parent).split('/')[-1] self.gguf.add_name (name) + self.gguf.add_vocab_size (params.n_vocab) self.gguf.add_context_length (params.n_ctx) self.gguf.add_embedding_length (params.n_embd) self.gguf.add_block_count (params.n_layer) @@ -1013,21 +1033,9 @@ def add_meta_arch(self, params: Params) -> None: if params.ftype is not None: self.gguf.add_file_type(params.ftype) - def handle_tokenizer_model(self, vocab: Vocab) -> str: - # Map the vocab types to the supported tokenizer models - tokenizer_model = { - SentencePieceVocab: "llama", - HfVocab: "llama", - BpeVocab: "gpt2", - }.get(type(vocab)) - - # Block if vocab type is not predefined - if tokenizer_model is None: - raise ValueError("Unknown vocab type: Not supported") - - return tokenizer_model - def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: + assert not isinstance(vocab, NoVocab) + tokens = [] scores = [] toktypes = [] @@ -1043,11 +1051,8 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list return tokens, scores, toktypes def add_meta_vocab(self, vocab: Vocab) -> None: - # Handle the tokenizer model - tokenizer_model = self.handle_tokenizer_model(vocab) - # Ensure that tokenizer_model is added to the GGUF model - self.gguf.add_tokenizer_model(tokenizer_model) + self.gguf.add_tokenizer_model(vocab.tokenizer_model) # Extract model vocabulary for model conversion tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab) @@ -1074,6 +1079,26 @@ def write_meta(self) -> None: def write_tensor_info(self) -> None: self.gguf.write_ti_data_to_file() + def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None: + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency) + if ftype == GGMLFileType.MostlyQ8_0: + ndarrays = bounded_parallel_map( + OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, + use_processpool_executor=True, + ) + else: + ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) + + start = time.time() + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + elapsed = time.time() - start + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) + padi = len(str(len(model))) + print( + f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" + ) + self.gguf.write_tensor_data(ndarray) + def close(self) -> None: self.gguf.close() @@ -1082,7 +1107,7 @@ def write_vocab_only( fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: - check_vocab_size(params, vocab, pad_vocab = pad_vocab) + check_vocab_size(params, vocab, pad_vocab=pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -1120,8 +1145,11 @@ def write_all( # meta data of.add_meta_arch(params) - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) + if isinstance(vocab, NoVocab): + of.gguf.add_tokenizer_model(vocab.tokenizer_model) + else: + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) # tensor info for name, lazy_tensor in model.items(): @@ -1131,24 +1159,7 @@ def write_all( of.write_tensor_info() # tensor data - ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) - if ftype == GGMLFileType.MostlyQ8_0: - ndarrays = bounded_parallel_map( - OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, - use_processpool_executor=True, - ) - else: - ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) - - start = time.time() - for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): - elapsed = time.time() - start - size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) - padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" - ) - of.gguf.write_tensor_data(ndarray) + of.write_tensor_data(ftype, model, concurrency) of.close() @@ -1309,8 +1320,8 @@ def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]: return vtype, path raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}") - def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: - load_merges = vocabtype == "bpe" + def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab: + load_merges = vocab.name == "bpe" n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None return gguf.SpecialVocab( model_parent_path, @@ -1319,30 +1330,34 @@ def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: n_vocab=n_vocab, ) - def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: + def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab: vocab_type, path = self._select_file(vocab_types) print(f"Loading vocab file {path!r}, type {vocab_type!r}") added_tokens_path = path.parent / "added_tokens.json" - vocab: Vocab if vocab_type == "bpe": - vocab = BpeVocab( + return BpeVocab( path, added_tokens_path if added_tokens_path.exists() else None ) - elif vocab_type == "spm": - vocab = SentencePieceVocab( + if vocab_type == "spm": + return SentencePieceVocab( path, added_tokens_path if added_tokens_path.exists() else None ) - elif vocab_type == "hfft": - vocab = HfVocab( + if vocab_type == "hfft": + return HfVocab( path.parent, added_tokens_path if added_tokens_path.exists() else None ) + raise ValueError(vocab_type) + + def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: + vocab: Vocab + if len(vocab_types) == 1 and "no_vocab" in vocab_types: + vocab = NoVocab() else: - raise ValueError(vocab_type) + vocab = self._create_vocab_by_path(vocab_types) # FIXME: Respect --vocab-dir? special_vocab = self._create_special_vocab( vocab, - vocab_type, model_parent_path, ) return vocab, special_vocab @@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab") parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft") @@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") args = parser.parse_args(args_in) + if args.no_vocab: + if args.vocab_only: + raise ValueError("no need to specify --vocab-only if using --no-vocab") + args.vocab_type = "no_vocab" if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None: print(f"Wrote {outfile}") return - if model_plus.vocab is not None and args.vocab_dir is None: + if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: vocab = model_plus.vocab print(f"Vocab info: {vocab}") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 99f71f0a1aa29..2d7cf16c14ed1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -32,6 +32,7 @@ class General: FILE_TYPE = "general.file_type" class LLM: + VOCAB_SIZE = "{arch}.vocab_size" CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" BLOCK_COUNT = "{arch}.block_count" @@ -752,6 +753,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE # LLM +KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 4d389be951d72..81b2eb884d485 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -321,6 +321,9 @@ def add_custom_alignment(self, alignment: int) -> None: self.data_alignment = alignment self.add_uint32(Keys.General.ALIGNMENT, alignment) + def add_vocab_size(self, size: int) -> None: + self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size) + def add_context_length(self, length: int) -> None: self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length) diff --git a/llama.cpp b/llama.cpp index eb48b1e90c8d1..10fd53469eb6f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -258,6 +258,7 @@ enum llm_kv { LLM_KV_GENERAL_SOURCE_URL, LLM_KV_GENERAL_SOURCE_HF_REPO, + LLM_KV_VOCAB_SIZE, LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, LLM_KV_BLOCK_COUNT, @@ -321,6 +322,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" }, { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" }, + { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" }, @@ -3242,10 +3244,11 @@ static const char * llama_model_type_name(e_model type) { static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ switch (type) { - case LLAMA_VOCAB_TYPE_SPM: return "SPM"; - case LLAMA_VOCAB_TYPE_BPE: return "BPE"; - case LLAMA_VOCAB_TYPE_WPM: return "WPM"; - default: return "unknown"; + case LLAMA_VOCAB_TYPE_NONE: return "no vocab"; + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + default: return "unknown"; } } @@ -3277,14 +3280,14 @@ static void llm_load_hparams( ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); - ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); - ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); - ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); - ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer); - ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false); - ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); + ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); + ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); + ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); + ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert); @@ -3645,30 +3648,25 @@ static void llm_load_vocab( const auto kv = LLM_KV(model.arch); - const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); - if (token_idx == -1) { - throw std::runtime_error("cannot find tokenizer vocab in model file\n"); - } - - const float * scores = nullptr; - const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); - if (score_idx != -1) { - scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - } - - const int * toktypes = nullptr; - const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); - if (toktype_idx != -1) { - toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); - } - // determine vocab type { std::string tokenizer_name; ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name); - if (tokenizer_name == "llama") { + if (tokenizer_name == "no_vocab") { + vocab.type = LLAMA_VOCAB_TYPE_NONE; + + // default special tokens + vocab.special_bos_id = -1; + vocab.special_eos_id = -1; + vocab.special_unk_id = -1; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.linefeed_id = -1; + + return; + } else if (tokenizer_name == "llama") { vocab.type = LLAMA_VOCAB_TYPE_SPM; // default special tokens @@ -3734,6 +3732,23 @@ static void llm_load_vocab( } } + const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); + if (token_idx == -1) { + throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + } + + const float * scores = nullptr; + const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); + if (score_idx != -1) { + scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + } + + const int * toktypes = nullptr; + const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); + if (toktype_idx != -1) { + toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + } + const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); vocab.id_to_token.resize(n_vocab); @@ -5023,7 +5038,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam llm_load_print_meta(ml, model); - if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { + if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE && + model.hparams.n_vocab != model.vocab.id_to_token.size()) { throw std::runtime_error("vocab size mismatch"); } @@ -9361,26 +9377,32 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) { } static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL; } static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN; } static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL; } static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; } static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED; } static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { + GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); GGML_ASSERT(llama_is_byte_token(vocab, id)); const auto& token_data = vocab.id_to_token.at(id); switch (llama_vocab_get_type(vocab)) { @@ -9401,6 +9423,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { } static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { + GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); static const char * hex = "0123456789ABCDEF"; switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { @@ -10232,6 +10255,8 @@ static std::vector llama_tokenize_internal(const llama_vocab & } } } break; + case LLAMA_VOCAB_TYPE_NONE: + GGML_ASSERT(false); } return output; @@ -13138,7 +13163,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { } int32_t llama_n_vocab(const struct llama_model * model) { - return model->vocab.id_to_token.size(); + return model->hparams.n_vocab; } int32_t llama_n_ctx_train(const struct llama_model * model) { @@ -13962,14 +13987,17 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id } const char * llama_token_get_text(const struct llama_model * model, llama_token token) { + GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); return model->vocab.id_to_token[token].text.c_str(); } float llama_token_get_score(const struct llama_model * model, llama_token token) { + GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); return model->vocab.id_to_token[token].score; } llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { + GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); return model->vocab.id_to_token[token].type; } diff --git a/llama.h b/llama.h index 2d16cc9b9fa2c..90aa5372e740b 100644 --- a/llama.h +++ b/llama.h @@ -59,9 +59,10 @@ extern "C" { typedef int32_t llama_seq_id; enum llama_vocab_type { - LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece - LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding - LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece + LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab + LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece + LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding + LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece }; // note: these values should be synchronized with ggml_rope From 727107707a73b3dc8a497cf9fc9405722c16dd2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= Date: Thu, 14 Mar 2024 11:57:31 -0600 Subject: [PATCH 13/48] gguf-py : bump version to 0.8.0 (#6060) --- gguf-py/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 9789c2c877165..96396e04e8f3f 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.7.0" +version = "0.8.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ From 6e0438da3cc95b89cdbf55f45fa4e324d9076792 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 14 Mar 2024 14:29:32 -0400 Subject: [PATCH 14/48] gguf : fix resource leaks (#6061) There several places where a gguf context is allocated. A call to gguf_free is missing in some error paths. Also on linux, llama-bench was missing a fclose. --- examples/gguf/gguf.cpp | 1 + examples/llama-bench/llama-bench.cpp | 1 + examples/llava/clip.cpp | 4 ++++ examples/train-text-from-scratch/train-text-from-scratch.cpp | 1 + 4 files changed, 7 insertions(+) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index e67be4fb2995e..5444503a5d218 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) { for (int j = 0; j < ggml_nelements(cur); ++j) { if (data[j] != 100 + i) { fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]); + gguf_free(ctx); return false; } } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index bf94e7e7a60a4..d6e5e0497dc3a 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -103,6 +103,7 @@ static std::string get_cpu_info() { } } } + fclose(f); } #endif // TODO: other platforms diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 6653b815d93a1..2035554ea8741 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -995,6 +995,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (!new_clip->ctx_data) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); clip_free(new_clip); + gguf_free(ctx); return nullptr; } @@ -1002,6 +1003,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (!fin) { printf("cannot open model file for loading tensors\n"); clip_free(new_clip); + gguf_free(ctx); return nullptr; } @@ -1023,6 +1025,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (!fin) { printf("%s: failed to seek for tensor %s\n", __func__, name); clip_free(new_clip); + gguf_free(ctx); return nullptr; } int num_bytes = ggml_nbytes(cur); @@ -1908,6 +1911,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i break; default: printf("Please use an input file in f32 or f16\n"); + gguf_free(ctx_out); return false; } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7eafe8515a943..7d06e401b462b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -711,6 +711,7 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model * load_checkpoint_gguf(fctx, f_ggml_ctx, model, train); + gguf_free(fctx); return true; } From 4755afd1cbd40d93c017e5b98c39796f52345314 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 22:58:41 +0200 Subject: [PATCH 15/48] llama : fix integer overflow during quantization (#6063) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 10fd53469eb6f..2c384197492e8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11977,7 +11977,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n return new_type; } -static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector & workers, const int nthread) { +static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector & workers, const int nthread) { std::mutex mutex; int counter = 0; size_t new_size = 0; From b0bc9f4a9da7c19f4779106ea83b23feca747566 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 15 Mar 2024 09:22:24 +0100 Subject: [PATCH 16/48] llama-bench : use random tokens to improve accuracy with mixtral (#6069) --- examples/llama-bench/llama-bench.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d6e5e0497dc3a..32eea786919f6 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1123,15 +1124,19 @@ struct sql_printer : public printer { static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - //std::vector tokens(n_prompt, llama_token_bos(llama_get_model(ctx))); - //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0)); - //GGML_UNUSED(n_batch); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + std::vector tokens(n_batch); - std::vector tokens(n_batch, llama_token_bos(llama_get_model(ctx))); int n_processed = 0; while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + for (int i = 1; i < n_tokens; i++) { + tokens[i] = std::rand() % n_vocab; + } llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); n_processed += n_tokens; } @@ -1142,11 +1147,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - llama_token token = llama_token_bos(llama_get_model(ctx)); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; for (int i = 0; i < n_gen; i++) { llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); llama_synchronize(ctx); + token = std::rand() % n_vocab; } } From aab606a11fc0a9740a7f297521c3eef851dfb351 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 15 Mar 2024 09:44:57 +0100 Subject: [PATCH 17/48] llama : add Orion chat template (#6066) --- llama.cpp | 20 ++++++++++++++++++++ tests/test-chat-template.cpp | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/llama.cpp b/llama.cpp index 2c384197492e8..b8a8d2723ae7d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14242,6 +14242,26 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "model\n"; } + } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) { + // OrionStarAI/Orion-14B-Chat + std::string system_prompt = ""; + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + // there is no system message support, we will merge it with user prompt + system_prompt = message->content; + continue; + } else if (role == "user") { + ss << "Human: "; + if (!system_prompt.empty()) { + ss << system_prompt << "\n\n"; + system_prompt = ""; + } + ss << message->content << "\n\nAssistant: "; + } else { + ss << message->content << ""; + } + } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index fa2eb577b6e42..6e9e4bd1ea2cc 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -31,6 +31,8 @@ int main(void) { "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}", // google/gemma-7b-it "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\\n' + message['content'] | trim + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\\n'}}{% endif %}", + // OrionStarAI/Orion-14B-Chat + "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}", }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -45,6 +47,8 @@ int main(void) { "system\nYou are a helpful assistant\nuser\nHello\nassistant\nHi there\nuser\nWho are you\nassistant\n I am an assistant \nuser\nAnother question\nassistant\n", // google/gemma-7b-it "user\nYou are a helpful assistant\n\nHello\nmodel\nHi there\nuser\nWho are you\nmodel\nI am an assistant\nuser\nAnother question\nmodel\n", + // OrionStarAI/Orion-14B-Chat + "Human: You are a helpful assistant\n\nHello\n\nAssistant: Hi thereHuman: Who are you\n\nAssistant: I am an assistant Human: Another question\n\nAssistant: ", }; std::vector formatted_chat(1024); int32_t res; From 7ce2c77f88e1ca66ec48417e56f91746bac018c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= Date: Fri, 15 Mar 2024 02:46:51 -0600 Subject: [PATCH 18/48] gguf : add support for I64 and F64 arrays (#6062) * gguf : add support for I64 and F64 arrays GGML currently does not support I64 or F64 arrays and they are not often used in machine learning, however if in the future the need arises, it would be nice to add them now, so that the types are next to the other types I8, I16, I32 in the enums, and it also reserves their type number. Furthermore, with this addition the GGUF format becomes very usable for most computational applications of NumPy (being compatible with the most common NumPy dtypes: i8, i16, i32, i64, f32, f64), providing a faster, and more versatile alternative to the `npz` format, and a simpler alternative to the `hdf5` format. The change in this PR seems small, not significantly increasing the maintenance burden. I tested this from Python using GGUFWriter/Reader and `gguf-dump`, as well as from C, everything seems to work. * Fix compiler warnings --- ggml.c | 17 +++++++++++++++++ ggml.h | 2 ++ gguf-py/gguf/constants.py | 4 ++++ gguf-py/gguf/gguf_reader.py | 12 +++++++++--- gguf-py/gguf/gguf_writer.py | 12 ++++++++---- 5 files changed, 40 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index fbc66f65b1052..c94006e51a092 100644 --- a/ggml.c +++ b/ggml.c @@ -470,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(int32_t), .is_quantized = false, }, + [GGML_TYPE_I64] = { + .type_name = "i64", + .blck_size = 1, + .type_size = sizeof(int64_t), + .is_quantized = false, + }, + [GGML_TYPE_F64] = { + .type_name = "f64", + .blck_size = 1, + .type_size = sizeof(double), + .is_quantized = false, + .nrows = 1, + }, [GGML_TYPE_F32] = { .type_name = "f32", .blck_size = 1, @@ -12418,6 +12431,8 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: case GGML_TYPE_COUNT: { GGML_ASSERT(false); @@ -12504,6 +12519,8 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: case GGML_TYPE_COUNT: { GGML_ASSERT(false); diff --git a/ggml.h b/ggml.h index ab26c8f5908c7..c937d4a535adb 100644 --- a/ggml.h +++ b/ggml.h @@ -366,6 +366,8 @@ extern "C" { GGML_TYPE_I8 = 24, GGML_TYPE_I16 = 25, GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, GGML_TYPE_COUNT, }; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2d7cf16c14ed1..458a641dcd229 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -665,6 +665,8 @@ class GGMLQuantizationType(IntEnum): I8 = 24 I16 = 25 I32 = 26 + I64 = 27 + F64 = 28 class GGUFEndian(IntEnum): @@ -734,6 +736,8 @@ def get_type(val: Any) -> GGUFValueType: GGMLQuantizationType.I8: (1, 1), GGMLQuantizationType.I16: (1, 2), GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), } diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 1c10f57538992..33afac552ca75 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -242,12 +242,15 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: n_bytes = n_elems * type_size // block_size data_offs = int(start_offs + offset_tensor[0]) item_type: npt.DTypeLike - if ggml_type == GGMLQuantizationType.F32: + if ggml_type == GGMLQuantizationType.F16: + item_count = n_elems + item_type = np.float16 + elif ggml_type == GGMLQuantizationType.F32: item_count = n_elems item_type = np.float32 - elif ggml_type == GGMLQuantizationType.F16: + elif ggml_type == GGMLQuantizationType.F64: item_count = n_elems - item_type = np.float16 + item_type = np.float64 elif ggml_type == GGMLQuantizationType.I8: item_count = n_elems item_type = np.int8 @@ -257,6 +260,9 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: elif ggml_type == GGMLQuantizationType.I32: item_count = n_elems item_type = np.int32 + elif ggml_type == GGMLQuantizationType.I64: + item_count = n_elems + item_type = np.int64 else: item_count = n_bytes item_type = np.uint8 diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 81b2eb884d485..1967b633ce261 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -204,18 +204,22 @@ def add_tensor_info( for i in range(n_dims): self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: - if tensor_dtype == np.float32: - dtype = GGMLQuantizationType.F32 - elif tensor_dtype == np.float16: + if tensor_dtype == np.float16: dtype = GGMLQuantizationType.F16 + elif tensor_dtype == np.float32: + dtype = GGMLQuantizationType.F32 + elif tensor_dtype == np.float64: + dtype = GGMLQuantizationType.F64 elif tensor_dtype == np.int8: dtype = GGMLQuantizationType.I8 elif tensor_dtype == np.int16: dtype = GGMLQuantizationType.I16 elif tensor_dtype == np.int32: dtype = GGMLQuantizationType.I32 + elif tensor_dtype == np.int64: + dtype = GGMLQuantizationType.I64 else: - raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now") + raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now") else: dtype = raw_dtype self.ti_data += self._pack("I", dtype) From 753e36f650fa2a5869f89188d9ee745dc74cf14b Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Fri, 15 Mar 2024 09:26:20 +0000 Subject: [PATCH 19/48] [SYCL] Fix non-intel device selection (#6042) * Fix non-intel device selection * Update ggml-sycl.cpp Co-authored-by: Neo Zhang Jianyu * Update ggml-sycl.cpp Co-authored-by: Neo Zhang Jianyu --------- Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Co-authored-by: Neo Zhang Jianyu --- ggml-sycl.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 9f6506383cc0d..a1ca6aba5951d 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3451,7 +3451,7 @@ class sycl_gpu_mgr { dpct::device_info prop; dpct::get_device_info(prop, device); if (max_compute_units == prop.get_max_compute_units() && - prop.get_major_version() == 1) { + is_ext_oneapi_device(device)) { gpus.push_back(id); devices.push_back(device); work_group_size = prop.get_max_work_group_size(); @@ -3484,6 +3484,15 @@ class sycl_gpu_mgr { assert(false); return -1; } + + bool is_ext_oneapi_device(const sycl::device &dev) { + sycl::backend dev_backend = dev.get_backend(); + if (dev_backend == sycl::backend::ext_oneapi_level_zero || + dev_backend == sycl::backend::ext_oneapi_cuda || + dev_backend == sycl::backend::ext_oneapi_hip) + return true; + return false; + } }; static sycl_gpu_mgr *g_sycl_gpu_mgr = NULL; From 131b0584096ee9df4d07cb28759dfea6efe6475f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 15 Mar 2024 11:36:50 +0200 Subject: [PATCH 20/48] make : ggml-metal.o depends on ggml.h --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cb597b209181b..c0f1250366a64 100644 --- a/Makefile +++ b/Makefile @@ -553,7 +553,7 @@ endif endif # LLAMA_METAL ifdef LLAMA_METAL -ggml-metal.o: ggml-metal.m ggml-metal.h +ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h $(CC) $(CFLAGS) -c $< -o $@ ifdef LLAMA_METAL_EMBED_LIBRARY From 46acb3676718b983157058aecf729a2064fc7d34 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Fri, 15 Mar 2024 18:53:53 +0800 Subject: [PATCH 21/48] fix set main gpu error (#6073) --- examples/sycl/build.sh | 5 +- examples/sycl/run-llama2.sh | 16 +- ggml-sycl.cpp | 312 +++++++++++++++++++++++++++--------- ggml-sycl.h | 5 + llama.cpp | 23 ++- 5 files changed, 272 insertions(+), 89 deletions(-) diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index 26ad2f7da754e..f20391d7a0000 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh #for FP32 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -#build example/main only +#build example/main #cmake --build . --config Release --target main +#build example/llama-bench +#cmake --build . --config Release --target llama-bench + #build all binary cmake --build . --config Release -v diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index 52f7c01a4bd90..c979a52f6f5a0 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh if [ $# -gt 0 ]; then GGML_SYCL_DEVICE=$1 + GGML_SYCL_SINGLE_GPU=1 else GGML_SYCL_DEVICE=0 fi -echo "use $GGML_SYCL_DEVICE as main GPU" + #export GGML_SYCL_DEBUG=1 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. -#use all GPUs with same max compute units -ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 +if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then + echo "use $GGML_SYCL_DEVICE as main GPU" + #use signle GPU only + ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none +else + #use multiple GPUs with same max compute units + ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 +fi #use main GPU only #ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none +#use multiple GPUs with same max compute units +#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 + diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index a1ca6aba5951d..6dc5eb20c5a63 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -24,10 +25,9 @@ #include #include #include - #include #include - +#include #include #include @@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp #define __dpct_noinline__ __attribute__((noinline)) #endif + +std::string get_device_type_name(const sycl::device &Device) { + auto DeviceType = Device.get_info(); + switch (DeviceType) { + case sycl::info::device_type::cpu: + return "cpu"; + case sycl::info::device_type::gpu: + return "gpu"; + case sycl::info::device_type::host: + return "host"; + case sycl::info::device_type::accelerator: + return "acc"; + default: + return "unknown"; + } +} + +std::string get_device_backend_and_type(const sycl::device &device) { + std::stringstream device_type; + sycl::backend backend = device.get_backend(); + device_type << backend << ":" << get_device_type_name(device); + return device_type.str(); +} + namespace dpct { typedef sycl::queue *queue_ptr; @@ -942,17 +966,65 @@ namespace dpct private: mutable std::recursive_mutex m_mutex; + static bool compare_dev(sycl::device &device1, sycl::device &device2) + { + dpct::device_info prop1; + dpct::get_device_info(prop1, device1); + dpct::device_info prop2; + dpct::get_device_info(prop2, device2); + return prop1.get_max_compute_units() > prop2.get_max_compute_units(); + } + static int convert_backend_index(std::string & backend) { + if (backend == "ext_oneapi_level_zero:gpu") return 0; + if (backend == "opencl:gpu") return 1; + if (backend == "opencl:cpu") return 2; + if (backend == "opencl:acc") return 3; + printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); + GGML_ASSERT(false); + } + static bool compare_backend(std::string &backend1, std::string &backend2) { + return convert_backend_index(backend1) < convert_backend_index(backend2); + } dev_mgr() { sycl::device default_device = sycl::device(sycl::default_selector_v); _devs.push_back(std::make_shared(default_device)); - std::vector sycl_all_devs = - sycl::device::get_devices(sycl::info::device_type::all); + std::vector sycl_all_devs; // Collect other devices except for the default device. if (default_device.is_cpu()) _cpu_device = 0; + + auto Platforms = sycl::platform::get_platforms(); + // Keep track of the number of devices per backend + std::map DeviceNums; + std::map> backend_devices; + + while (!Platforms.empty()) { + auto Platform = Platforms.back(); + Platforms.pop_back(); + auto devices = Platform.get_devices(); + std::string backend_type = get_device_backend_and_type(devices[0]); + for (const auto &device : devices) { + backend_devices[backend_type].push_back(device); + } + } + + std::vector keys; + for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) { + keys.push_back(it->first); + } + std::sort(keys.begin(), keys.end(), compare_backend); + + for (auto &key : keys) { + std::vector devs = backend_devices[key]; + std::sort(devs.begin(), devs.end(), compare_dev); + for (const auto &dev : devs) { + sycl_all_devs.push_back(dev); + } + } + for (auto &dev : sycl_all_devs) { if (dev == default_device) @@ -3202,6 +3274,11 @@ static int g_work_group_size = 0; #define GGML_SYCL_MMV_Y 1 #endif +enum ggml_sycl_backend_gpu_mode { + SYCL_UNSET_GPU_MODE = -1, + SYCL_SINGLE_GPU_MODE = 0, + SYCL_MUL_GPU_MODE +}; static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -3401,12 +3478,31 @@ class sycl_gpu_mgr { int work_group_size = 0; std::string gpus_list = ""; + /* + Use all GPUs with same top max compute units + */ sycl_gpu_mgr() { detect_sycl_gpu_list_with_max_cu(); get_allow_gpus(); create_context_with_gpus(); } + /* + Only use the assigned GPU + */ + sycl_gpu_mgr(int main_gpu_id) { + sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id); + dpct::device_info prop; + dpct::get_device_info(prop, device); + gpus.push_back(main_gpu_id); + devices.push_back(device); + work_group_size = prop.get_max_work_group_size(); + max_compute_units = prop.get_max_compute_units(); + + get_allow_gpus(); + create_context_with_gpus(); + } + void create_context_with_gpus() { sycl::context ctx = sycl::context(devices); assert(gpus.size() > 0); @@ -3422,7 +3518,7 @@ class sycl_gpu_mgr { gpus_list += std::to_string(gpus[i]); gpus_list += ","; } - if (gpus_list.length() > 2) { + if (gpus_list.length() > 1) { gpus_list.pop_back(); } } @@ -3471,8 +3567,8 @@ class sycl_gpu_mgr { if (gpus[i] == id) return i; } - assert(false); - return -1; + printf("miss to get device index by id=%d\n", id); + GGML_ASSERT(false); } int get_next_index(int id) { @@ -3481,8 +3577,7 @@ class sycl_gpu_mgr { if (gpus[i] == id) return i; } - assert(false); - return -1; + GGML_ASSERT(false); } bool is_ext_oneapi_device(const sycl::device &dev) { @@ -3500,11 +3595,14 @@ static int g_device_count = -1; static int g_all_sycl_device_count = -1; static int g_main_device = -1; static int g_main_device_id = -1; +static bool g_ggml_backend_sycl_buffer_type_initialized = false; static std::array g_default_tensor_split = {}; static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0}; +static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE; + struct sycl_device_capabilities { int cc; // compute capability bool vmm; // virtual memory support @@ -13008,17 +13106,20 @@ bool ggml_sycl_loaded(void) { return g_sycl_loaded; } -void print_device_detail(int id) { +void print_device_detail(int id, sycl::device &device, std::string device_type) { + dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR( - dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id)))); - sycl::device cur_device = dpct::dev_mgr::instance().get_device(id); + dpct::get_device_info(prop, device))); + std::string version; version += std::to_string(prop.get_major_version()); version += "."; version += std::to_string(prop.get_minor_version()); - fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id, + device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); + + fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(), prop.get_name(), version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), prop.get_global_mem_size()); @@ -13026,19 +13127,35 @@ void print_device_detail(int id) { void ggml_backend_sycl_print_sycl_devices() { int device_count = dpct::dev_mgr::instance().device_count(); + std::map DeviceNums; fprintf(stderr, "found %d SYCL devices:\n", device_count); - fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n"); - fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n"); + fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n"); + fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n"); + fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n"); for (int id = 0; id < device_count; ++id) { - print_device_detail(id); + sycl::device device = dpct::dev_mgr::instance().get_device(id); + sycl::backend backend = device.get_backend(); + std::string backend_type = get_device_backend_and_type(device); + int type_id=DeviceNums[backend_type]++; + std::stringstream device_type; + device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; + print_device_detail(id, device, device_type.str()); } } void print_gpu_device_list() { - fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n", - g_sycl_gpu_mgr->get_gpu_count(), - g_sycl_gpu_mgr->gpus_list.c_str(), - g_sycl_gpu_mgr->max_compute_units); + GGML_ASSERT(g_sycl_gpu_mgr); + + char* hint=NULL; + if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) { + hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n"; + } else { + hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n"; + } + fprintf(stderr, hint, + g_sycl_gpu_mgr->get_gpu_count(), + g_sycl_gpu_mgr->gpus_list.c_str(), + g_sycl_gpu_mgr->max_compute_units); } int get_sycl_env(const char *env_name, int default_val) { @@ -13074,6 +13191,15 @@ void ggml_init_sycl() try { #else fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); #endif + +/* NOT REMOVE, keep it for next optimize for XMX. +#if defined(SYCL_USE_XMX) + fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); +#else + fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); +#endif +*/ + if (CHECK_TRY_ERROR(g_all_sycl_device_count = dpct::dev_mgr::instance().device_count()) != 0) { initialized = true; @@ -13082,68 +13208,65 @@ void ggml_init_sycl() try { } GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); ggml_backend_sycl_print_sycl_devices(); + initialized = true; + g_sycl_loaded = true; + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} - if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); - - g_device_count = g_sycl_gpu_mgr->get_gpu_count(); - g_work_group_size = g_sycl_gpu_mgr->work_group_size; - - print_gpu_device_list(); +void ggml_init_by_gpus(int device_count) try { + g_device_count = device_count; + g_work_group_size = g_sycl_gpu_mgr->work_group_size; - int64_t total_vram = 0; + int64_t total_vram = 0; -/* NOT REMOVE, keep it for next optimize for XMX. -#if defined(SYCL_USE_XMX) - fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); -#else - fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); -#endif -*/ - for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) { - g_device_caps[id].vmm = 0; - g_device_caps[id].device_id = -1; - g_device_caps[id].cc = 0; - g_tensor_split[id] = 0; - g_default_tensor_split[id] = 0; - } + print_gpu_device_list(); - for (int i = 0; i < g_device_count; ++i) { - int device_id = g_sycl_gpu_mgr->gpus[i]; - g_device_caps[i].vmm = 0; + for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) { + g_device_caps[id].vmm = 0; + g_device_caps[id].device_id = -1; + g_device_caps[id].cc = 0; + g_tensor_split[id] = 0; + g_default_tensor_split[id] = 0; + } - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(device_id)))); + for (int i = 0; i < g_device_count; ++i) { + int device_id = g_sycl_gpu_mgr->gpus[i]; + g_device_caps[i].vmm = 0; - g_default_tensor_split[i] = total_vram; - total_vram += prop.get_global_mem_size(); + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(device_id)))); - g_device_caps[i].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - } + g_default_tensor_split[i] = total_vram; + total_vram += prop.get_global_mem_size(); - for (int i = 0; i < g_device_count; ++i) { - g_default_tensor_split[i] /= total_vram; - } + g_device_caps[i].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); + } - for (int i = 0; i < g_device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); + for (int i = 0; i < g_device_count; ++i) { + g_default_tensor_split[i] /= total_vram; + } - // create sycl streams - for (int is = 0; is < MAX_STREAMS; ++is) { - SYCL_CHECK(CHECK_TRY_ERROR( - g_syclStreams[i][is] = - dpct::get_current_device().create_queue( - g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device()))); - } + for (int i = 0; i < g_device_count; ++i) { + SYCL_CHECK(ggml_sycl_set_device(i)); - const dpct::queue_ptr stream = g_syclStreams[i][0]; - // create sycl handle - SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream)); + // create sycl streams + for (int is = 0; is < MAX_STREAMS; ++is) { + SYCL_CHECK(CHECK_TRY_ERROR( + g_syclStreams[i][is] = + dpct::get_current_device().create_queue( + g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device()))); } - initialized = true; - g_sycl_loaded = true; + const dpct::queue_ptr stream = g_syclStreams[i][0]; + // create sycl handle + SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream)); } } catch (sycl::exception const &exc) { @@ -16551,22 +16674,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { /* .is_host = */ nullptr, }; -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { +ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) { + if (device_index>=g_device_count or device_index<0) { + printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device_index, g_device_count-1); + GGML_ASSERT(device_indexgpus[i])}, }; } - ggml_backend_sycl_buffer_type_initialized = true; + g_ggml_backend_sycl_buffer_type_initialized = true; } - - return &ggml_backend_sycl_buffer_types[device]; + return &ggml_backend_sycl_buffer_types[device_index]; } // sycl split buffer type @@ -17319,11 +17444,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) { return g_sycl_gpu_mgr->get_index(device_id); } +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) { + return g_sycl_gpu_mgr->gpus[device_index]; +} + +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) { + GGML_ASSERT(main_gpu_idget_gpu_count()); + g_ggml_backend_sycl_buffer_type_initialized = false; +} + +GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() { + if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) { + return; + } + + fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n"); + + if (g_sycl_gpu_mgr) { + delete g_sycl_gpu_mgr; + } + g_sycl_gpu_mgr = new sycl_gpu_mgr(); + g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE; + ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count()); + g_ggml_backend_sycl_buffer_type_initialized = false; +} + extern "C" int ggml_backend_sycl_reg_devices(); int ggml_backend_sycl_reg_devices() { - if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); - g_device_count = g_sycl_gpu_mgr->get_gpu_count(); + ggml_backend_sycl_set_mul_device_mode(); assert(g_device_count>0); for (int i = 0; i < g_device_count; i++) { int id = g_sycl_gpu_mgr->gpus[i]; diff --git a/ggml-sycl.h b/ggml-sycl.h index bf5b11b369d19..c549a64a1f44c 100644 --- a/ggml-sycl.h +++ b/ggml-sycl.h @@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); +// TODO: these are temporary +// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670 +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index); +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id); +GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(); #ifdef __cplusplus } #endif diff --git a/llama.cpp b/llama.cpp index b8a8d2723ae7d..8e185d4bfe773 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5064,6 +5064,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam } #endif +#ifdef GGML_USE_SYCL + if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { + ggml_backend_sycl_set_single_device_mode(params.main_gpu); + //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index. + params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu); + } else { + ggml_backend_sycl_set_mul_device_mode(); + } +#endif + if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data @@ -12921,23 +12931,22 @@ struct llama_context * llama_new_context_with_model( if (model->n_gpu_layers > 0) { // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu); - ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index); + ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index); + int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); } else { // LLAMA_SPLIT_LAYER requires a backend for each GPU - int id_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { - int device_id = id_list[i]; ggml_backend_t backend = ggml_backend_sycl_init(i); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i); + int id_list[GGML_SYCL_MAX_DEVICES]; + ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); llama_free(ctx); return nullptr; } From 3020327f6cd6d2ce50528dd65f4b199d2ea8b1ae Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 15 Mar 2024 13:24:03 +0100 Subject: [PATCH 22/48] cuda : disable unused cudaLaunchHostFunc code (#6078) --- ggml-cuda.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d1b5e52ba9011..db595409aa735 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -11541,6 +11541,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev if (ggml_backend_is_cuda(event->backend)) { CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0)); } else { +#if 0 // untested auto wait_fn = [](void * user_data) { ggml_backend_event_t event = (ggml_backend_event_t)user_data; @@ -11548,6 +11549,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev }; CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event)); +#endif + GGML_ASSERT(false); } } From 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc Mon Sep 17 00:00:00 2001 From: Ting Lou Date: Fri, 15 Mar 2024 22:31:05 +0800 Subject: [PATCH 23/48] llava : change API to pure C style for Rust FFI bindgen (#6079) Co-authored-by: Lou Ting --- examples/llava/clip.cpp | 36 ++++++++++++++++++------------------ examples/llava/clip.h | 6 +++--- examples/llava/llava.cpp | 2 +- examples/llava/llava.h | 4 ++-- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 2035554ea8741..a0ed82d7e2328 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1235,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() { void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } -void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) { - if (batch.size > 0) { - delete[] batch.data; - batch.size = 0; +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; } } -void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) { - if (batch.size > 0) { - delete[] batch.data; - batch.size = 0; +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; } } @@ -1497,7 +1497,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1509,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli pad_to_square = false; } // free the previous res_imgs if any set - if (res_imgs.size > 0) { + if (res_imgs->size > 0) { clip_image_f32_batch_free(res_imgs); } - res_imgs.data = nullptr; - res_imgs.size = 0; + res_imgs->data = nullptr; + res_imgs->size = 0; // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -1568,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square patches.insert(patches.begin(), image_original_resize); // clip_image_f32_batch_init(patches.size()); - res_imgs.size = patches.size(); - res_imgs.data = new clip_image_f32[res_imgs.size]; + res_imgs->size = patches.size(); + res_imgs->data = new clip_image_f32[res_imgs->size]; int num=0; for (auto& patch : patches) { - normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std); num++; } @@ -1660,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // } // res_imgs.push_back(res); - res_imgs.size = 1; - res_imgs.data = new clip_image_f32[res_imgs.size]; - res_imgs.data[0] = *res; + res_imgs->size = 1; + res_imgs->data = new clip_image_f32[res_imgs->size]; + res_imgs->data[0] = *res; clip_image_f32_free(res); return true; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index e5bd54924a9c8..45bdad6897658 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); -CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch); -CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch); +CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); @@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ); +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 9801281661b25..29764757aab5d 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli clip_image_f32_batch img_res_v; img_res_v.size = 0; img_res_v.data = nullptr; - if (!clip_image_preprocess(ctx_clip, img, img_res_v)) { + if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { fprintf(stderr, "%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 2d40f3f1d5f84..19212f6e9e9c5 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -29,9 +29,9 @@ struct llava_image_embed { }; /** sanity check for clip <-> llava embed size match */ -LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); +LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip); -LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); +LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); /** build an image embed from image file bytes */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); From 12247f4c69a173b9482f68aaa174ec37fc909ccf Mon Sep 17 00:00:00 2001 From: Andrew Canis Date: Fri, 15 Mar 2024 16:41:22 -0400 Subject: [PATCH 24/48] llama : add Command-R support (#6033) Information about the Command-R 35B model (128k context) can be found at: https://huggingface.co/CohereForAI/c4ai-command-r-v01 Based on the llama2 model with a few changes: 1) New hyper parameter to scale output logits (logit_scale) 2) Uses LayerNorm instead of RMSNorm 3) Transfomer layers have a single shared LayerNorm that feeds into both the self-attention and FFN layers in parallel. There is no post-attention LayerNorm. 4) No support for Rotary Position Embeddings (RoPE) scaling 5) No biases used Find GGUF files here: https://huggingface.co/andrewcanis/c4ai-command-r-v01-GGUF To convert model to GGUF format yourself: 1) Download Command-R Hugging Face safetensors: git lfs install git clone https://huggingface.co/CohereForAI/c4ai-command-r-v01 2) Run: python3 convert-hf-to-gguf.py --outtype f16 ./c4ai-command-r-v01 --- README.md | 1 + convert-hf-to-gguf.py | 17 ++++ gguf-py/gguf/constants.py | 15 +++ gguf-py/gguf/gguf_writer.py | 3 + llama.cpp | 183 ++++++++++++++++++++++++++++++++++++ 5 files changed, 219 insertions(+) diff --git a/README.md b/README.md index 61bedc3f86309..5cbdf7e476b09 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well. - [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) - [x] [Mamba](https://github.com/state-spaces/mamba) +- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01) **Multimodal models:** diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5eee320163d29..cf1f98d660633 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1965,6 +1965,23 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) +@Model.register("CohereForCausalLM") +class CommandR2Model(Model): + model_arch = gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + self.hparams["max_position_embeddings"] = self.hparams["model_max_length"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 458a641dcd229..4a4facb06ea14 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -42,6 +42,7 @@ class LLM: EXPERT_COUNT = "{arch}.expert_count" EXPERT_USED_COUNT = "{arch}.expert_used_count" POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -121,6 +122,7 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() STARCODER2 = auto() MAMBA = auto() + COMMAND_R = auto() class MODEL_TENSOR(IntEnum): @@ -187,6 +189,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.COMMAND_R: "command-r", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -579,6 +582,18 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_D, MODEL_TENSOR.SSM_OUT, ], + MODEL_ARCH.COMMAND_R: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 1967b633ce261..2ae6c814b52de 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -361,6 +361,9 @@ def add_max_alibi_bias(self, bias: float) -> None: def add_clamp_kqv(self, value: float) -> None: self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + def add_logit_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) + def add_expert_count(self, count: int) -> None: self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) diff --git a/llama.cpp b/llama.cpp index 8e185d4bfe773..fc5dd5cb43a7e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -214,6 +214,7 @@ enum llm_arch { LLM_ARCH_GEMMA, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, + LLM_ARCH_COMMAND_R, LLM_ARCH_UNKNOWN, }; @@ -243,6 +244,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -268,6 +270,7 @@ enum llm_kv { LLM_KV_EXPERT_COUNT, LLM_KV_EXPERT_USED_COUNT, LLM_KV_POOLING_TYPE, + LLM_KV_LOGIT_SCALE, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -332,6 +335,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, + { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -838,6 +842,21 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, }, }, + { + LLM_ARCH_COMMAND_R, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1597,6 +1616,7 @@ enum e_model { MODEL_20B, MODEL_30B, MODEL_34B, + MODEL_35B, MODEL_40B, MODEL_65B, MODEL_70B, @@ -1643,6 +1663,7 @@ struct llama_hparams { float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; + float f_logit_scale = 0.0f; bool causal_attn = true; bool need_kq_pos = false; @@ -3231,6 +3252,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_20B: return "20B"; case MODEL_30B: return "30B"; case MODEL_34B: return "34B"; + case MODEL_35B: return "35B"; case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; @@ -3623,6 +3645,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_COMMAND_R: + { + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_35B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -3944,6 +3975,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); + LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); @@ -4918,6 +4950,37 @@ static bool llm_load_tensors( layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}); } } break; + case LLM_ARCH_COMMAND_R: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + // init output from the input tok embed + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -8315,6 +8378,121 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_command_r() { + + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + struct ggml_tensor * ffn_inp = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + struct ggml_tensor * attn_out = cur; + + // feed-forward network + { + cur = llm_build_ffn(ctx0, ffn_inp, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -8497,6 +8675,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_mamba(); } break; + case LLM_ARCH_COMMAND_R: + { + result = llm.build_command_r(); + } break; default: GGML_ASSERT(false); } @@ -13147,6 +13329,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_ORION: case LLM_ARCH_INTERNLM2: case LLM_ARCH_MINICPM: + case LLM_ARCH_COMMAND_R: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 From 877b4d0c628cc70dddb5df72ed8fc14d126ca7e8 Mon Sep 17 00:00:00 2001 From: Theia Vogel Date: Fri, 15 Mar 2024 13:43:02 -0700 Subject: [PATCH 25/48] llama : add support for control vectors (#5970) * control vector api and implementation * control-vectors : minor code style updates * disable control vector when data == nullptr use -1 for disabled range (also on init) in case we ever support controlling layer 0 (embeddings) --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 215 ++++++++++++++++++++++++++++++++++++++++++++++ common/common.h | 31 ++++++- llama.cpp | 128 +++++++++++++++++++++++++++ llama.h | 23 ++++- 4 files changed, 392 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 58fbd05aa3516..4912237e0d0f1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -568,6 +568,34 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.lora_base = argv[i]; + } else if (arg == "--control-vector") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ 1.0f, argv[i], }); + } else if (arg == "--control-vector-scaled") { + if (++i >= argc) { + invalid_param = true; + break; + } + const char * fname = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + } else if (arg == "--control-vector-layer-range") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_start = std::stoi(argv[i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_end = std::stoi(argv[i]); } else if (arg == "--mmproj") { if (++i >= argc) { invalid_param = true; @@ -1095,6 +1123,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --control-vector FNAME\n"); + printf(" add a control vector\n"); + printf(" --control-vector-scaled FNAME S\n"); + printf(" add a control vector with user defined scaling S\n"); + printf(" --control-vector-layer-range START END\n"); + printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); @@ -1360,6 +1394,30 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } + if (!params.control_vectors.empty()) { + if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); + + const auto cvec = llama_control_vector_load(params.control_vectors); + if (cvec.n_embd == -1) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + + int err = llama_control_vector_apply(lctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); + if (err) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); @@ -1890,3 +1948,160 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) return sum / (sqrt(sum1) * sqrt(sum2)); } + +// +// Control vector utils +// + +static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { + int32_t n_tensors; + + size_t n_bytes = 0; + + uint32_t max_direction_layer = 0; + + llama_control_vector_data result = { -1, {} }; + + // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer + { + struct ggml_init_params meta_params = { + /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ true, + }; + ggml_context * meta_ctx = ggml_init(meta_params); + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &meta_ctx, + }; + struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!meta_ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + return result; + } + + n_tensors = gguf_get_n_tensors(meta_ctx_gguf); + for (int i = 0; i < n_tensors; i++) { + std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); + + // split on '.' + size_t dotpos = name.find('.'); + if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { + try { + uint32_t layer = std::stoi(name.substr(dotpos + 1)); + if (layer == 0) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (layer > max_direction_layer) { + max_direction_layer = layer; + } + } catch (...) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + } + + struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); + if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (result.n_embd == -1) { + result.n_embd = ggml_nelements(tensor_meta); + } else if (ggml_nelements(tensor_meta) != result.n_embd) { + fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + n_bytes += ggml_nbytes(tensor_meta); + } + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + } + + if (n_tensors == 0) { + fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + return result; + } + + // load and scale tensors into final control vector context + struct ggml_init_params ggml_params = { + /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ false, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(ctx); + return result; + } + + // do not store data for layer 0 (it's not used) + result.data.resize(result.n_embd * max_direction_layer); + + for (uint32_t il = 1; il <= max_direction_layer; il++) { + const std::string name = "direction." + std::to_string(il); + const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + + float * dst = result.data.data() + result.n_embd * (il - 1); + + if (tensor) { + const float * src = (const float *) tensor->data; + for (int j = 0; j < result.n_embd; j++) { + dst[j] = src[j] * load_info.strength; + } + } else { + for (int j = 0; j < result.n_embd; j++) { + dst[j] = 0.0f; + } + } + } + + return result; +} + +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos) { + llama_control_vector_data result = { -1, {} }; + + for (const auto & info : load_infos) { + auto cur = llama_control_vector_load_one(info); + + if (cur.n_embd == -1) { + return result; + } + if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) { + fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str()); + return result; + } + + if (result.n_embd == -1) { + result = std::move(cur); + } else { + for (size_t i = 0; i < cur.data.size(); i++) { + result.data[i] += cur.data[i]; + } + } + } + + if (result.n_embd == -1) { + fprintf(stderr, "%s: no vectors passed\n", __func__); + } + + return result; +} diff --git a/common/common.h b/common/common.h index d250eef8b2b6b..687f3425e8544 100644 --- a/common/common.h +++ b/common/common.h @@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT; extern char const *LLAMA_COMPILER; extern char const *LLAMA_BUILD_TARGET; +struct llama_control_vector_load_info; + +int32_t get_num_physical_cores(); + // // CLI argument parsing // -int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed @@ -103,6 +106,11 @@ struct gpt_params { std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter + std::vector control_vectors; // control vector with user defined scale + + int32_t control_vector_layer_start = -1; // layer range for control vector + int32_t control_vector_layer_end = -1; // layer range for control vector + int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) @@ -269,3 +277,24 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40 void llama_embd_normalize(const float * inp, float * out, int n); float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); + +// +// Control vector utils +// + +struct llama_control_vector_data { + int n_embd; + + // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd + std::vector data; +}; + +struct llama_control_vector_load_info { + float strength; + + std::string fname; +}; + +// Load control vectors, scale each by strength, and add them together. +// On error, returns {-1, empty} +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); diff --git a/llama.cpp b/llama.cpp index fc5dd5cb43a7e..52bd718ba89a5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1894,6 +1894,31 @@ struct llama_kv_cache { } }; +struct llama_control_vector { + std::vector tensors; // per layer + std::vector ctxs; + std::vector bufs; + + int32_t layer_start = -1; + int32_t layer_end = -1; + + ggml_tensor * tensor_for(int il) const { + if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { + return nullptr; + } + return tensors[il]; + } + + ~llama_control_vector() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + } +}; + struct llama_vocab { using id = int32_t; using token = std::string; @@ -2108,6 +2133,9 @@ struct llama_context { struct ggml_tensor * inp_s_mask; // F32 [1, kv_size] struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] + // control vectors + struct llama_control_vector cvec; + #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; #endif @@ -5931,6 +5959,12 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } cb(cur, "l_out", il); // input for next layer @@ -13366,6 +13400,10 @@ int32_t llama_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } +int32_t llama_n_layer(const struct llama_model * model) { + return model->hparams.n_layer; +} + float llama_rope_freq_scale_train(const struct llama_model * model) { return model->hparams.rope_freq_scale_train; } @@ -13465,6 +13503,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const } } +static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { + GGML_ASSERT(cvec.tensors.empty()); + GGML_ASSERT(cvec.ctxs.empty()); + GGML_ASSERT(cvec.bufs.empty()); + + // count layer buffer types + std::map buft_layer_count; + for (int64_t i = 0; i < model.hparams.n_layer; i++) { + buft_layer_count[model.buft_layer[i].buft]++; + } + + // allocate contexts + std::map ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct ggml_init_params params = { + /*.mem_size =*/ n_layers * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__); + return 1; + } + ctx_map[it.first] = ctx; + } + + // make tensors + cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 + for (size_t il = 1; il < model.hparams.n_layer; il++) { + struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft); + ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd); + cvec.tensors.push_back(tensor); + } + + // allocate tensors / buffers and zero + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__); + return false; + } + ggml_backend_buffer_clear(buf, 0); + cvec.ctxs.push_back(ctx); + cvec.bufs.push_back(buf); + } + + return true; +} + +int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) { + const llama_model & model = lctx->model; + llama_control_vector & cvec = lctx->cvec; + + if (data == nullptr) { + // disable the current control vector (but leave allocated for later) + cvec.layer_start = -1; + cvec.layer_end = -1; + return 0; + } + + if (n_embd != (int) model.hparams.n_embd) { + LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); + return 1; + } + + if (cvec.tensors.empty()) { + if (!llama_control_vector_init(cvec, model)) { + return 1; + } + } + + cvec.layer_start = il_start; + cvec.layer_end = il_end; + + for (size_t il = 1; il < model.hparams.n_layer; il++) { + assert(cvec.tensors[il] != nullptr); + + const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present + if (off + n_embd <= len) { + ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il])); + } + } + + return 0; +} + struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { struct llama_kv_cache_view result = { /*.n_cells = */ 0, diff --git a/llama.h b/llama.h index 90aa5372e740b..40dcf54e394f8 100644 --- a/llama.h +++ b/llama.h @@ -388,6 +388,7 @@ extern "C" { LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_n_embd (const struct llama_model * model); + LLAMA_API int32_t llama_n_layer (const struct llama_model * model); // Get the model's RoPE frequency scaling factor LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); @@ -435,10 +436,24 @@ extern "C" { // Returns 0 on success LLAMA_API int32_t llama_model_apply_lora_from_file( const struct llama_model * model, - const char * path_lora, - float scale, - const char * path_base_model, - int32_t n_threads); + const char * path_lora, + float scale, + const char * path_base_model, + int32_t n_threads); + + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. + // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. + LLAMA_API int32_t llama_control_vector_apply( + struct llama_context * lctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); // // KV cache From d84c48505f60bcd358b82a751d40418c4d235643 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 15 Mar 2024 22:14:16 +0100 Subject: [PATCH 26/48] llama : fix Baichuan2 13B (#6092) --- llama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 52bd718ba89a5..e4db288ddd907 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6000,7 +6000,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr; // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); @@ -6050,7 +6050,6 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); From a56d09a4407f29c21e149b44fd5308f83aa1cb09 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 13:20:53 +0100 Subject: [PATCH 27/48] ci : close inactive issue with workflow (#6053) * issues: ci - close inactive issue with workflow * ci: close issue, change workflow schedule time --- .github/workflows/close-issue.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/close-issue.yml diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml new file mode 100644 index 0000000000000..bc08a72d03166 --- /dev/null +++ b/.github/workflows/close-issue.yml @@ -0,0 +1,22 @@ +name: Close inactive issues +on: + schedule: + - cron: "42 0 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + days-before-issue-stale: 30 + days-before-issue-close: 14 + stale-issue-label: "stale" + stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." + days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} From 15961ec04dbd59d21d8984d42e4c0f7e7e7d320a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Sat, 16 Mar 2024 11:39:15 -0400 Subject: [PATCH 28/48] common : refactor nested if causing error C1061 on MSVC (#6101) * Refactor nested if causing error C1061 on MSVC. * Revert back and remove else's. * Add flag to track found arguments. --- common/common.cpp | 475 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 356 insertions(+), 119 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4912237e0d0f1..1b0ba849398de 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -151,13 +151,17 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::replace(arg.begin(), arg.end(), '_', '-'); } + bool arg_found = false; if (arg == "-s" || arg == "--seed") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.seed = std::stoul(argv[i]); - } else if (arg == "-t" || arg == "--threads") { + } + if (arg == "-t" || arg == "--threads") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -166,7 +170,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); } - } else if (arg == "-tb" || arg == "--threads-batch") { + } + if (arg == "-tb" || arg == "--threads-batch") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -175,7 +181,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); } - } else if (arg == "-td" || arg == "--threads-draft") { + } + if (arg == "-td" || arg == "--threads-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -184,7 +192,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_draft <= 0) { params.n_threads_draft = std::thread::hardware_concurrency(); } - } else if (arg == "-tbd" || arg == "--threads-batch-draft") { + } + if (arg == "-tbd" || arg == "--threads-batch-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -193,25 +203,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch_draft <= 0) { params.n_threads_batch_draft = std::thread::hardware_concurrency(); } - } else if (arg == "-p" || arg == "--prompt") { + } + if (arg == "-p" || arg == "--prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.prompt = argv[i]; - } else if (arg == "-e" || arg == "--escape") { + } + if (arg == "-e" || arg == "--escape") { + arg_found = true; params.escape = true; - } else if (arg == "--prompt-cache") { + } + if (arg == "--prompt-cache") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.path_prompt_cache = argv[i]; - } else if (arg == "--prompt-cache-all") { + } + if (arg == "--prompt-cache-all") { + arg_found = true; params.prompt_cache_all = true; - } else if (arg == "--prompt-cache-ro") { + } + if (arg == "--prompt-cache-ro") { + arg_found = true; params.prompt_cache_ro = true; - } else if (arg == "-bf" || arg == "--binary-file") { + } + if (arg == "-bf" || arg == "--binary-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -228,7 +250,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { ss << file.rdbuf(); params.prompt = ss.str(); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - } else if (arg == "-f" || arg == "--file") { + } + if (arg == "-f" || arg == "--file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -245,51 +269,67 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (!params.prompt.empty() && params.prompt.back() == '\n') { params.prompt.pop_back(); } - } else if (arg == "-n" || arg == "--n-predict") { + } + if (arg == "-n" || arg == "--n-predict") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_predict = std::stoi(argv[i]); - } else if (arg == "--top-k") { + } + if (arg == "--top-k") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.top_k = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx-size") { + } + if (arg == "-c" || arg == "--ctx-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--grp-attn-n" || arg == "-gan") { + } + if (arg == "--grp-attn-n" || arg == "-gan") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.grp_attn_n = std::stoi(argv[i]); - } else if (arg == "--grp-attn-w" || arg == "-gaw") { + } + if (arg == "--grp-attn-w" || arg == "-gaw") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.grp_attn_w = std::stoi(argv[i]); - } else if (arg == "--rope-freq-base") { + } + if (arg == "--rope-freq-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_base = std::stof(argv[i]); - } else if (arg == "--rope-freq-scale") { + } + if (arg == "--rope-freq-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--rope-scaling") { + } + if (arg == "--rope-scaling") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -299,43 +339,57 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } - } else if (arg == "--rope-scale") { + } + if (arg == "--rope-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } else if (arg == "--yarn-orig-ctx") { + } + if (arg == "--yarn-orig-ctx") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_orig_ctx = std::stoi(argv[i]); - } else if (arg == "--yarn-ext-factor") { + } + if (arg == "--yarn-ext-factor") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_ext_factor = std::stof(argv[i]); - } else if (arg == "--yarn-attn-factor") { + } + if (arg == "--yarn-attn-factor") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_attn_factor = std::stof(argv[i]); - } else if (arg == "--yarn-beta-fast") { + } + if (arg == "--yarn-beta-fast") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_beta_fast = std::stof(argv[i]); - } else if (arg == "--yarn-beta-slow") { + } + if (arg == "--yarn-beta-slow") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--pooling") { + } + if (arg == "--pooling") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -345,118 +399,156 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } else { invalid_param = true; break; } - } else if (arg == "--defrag-thold" || arg == "-dt") { + } + if (arg == "--defrag-thold" || arg == "-dt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.defrag_thold = std::stof(argv[i]); - } else if (arg == "--samplers") { + } + if (arg == "--samplers") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } const auto sampler_names = string_split(argv[i], ';'); sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); - } else if (arg == "--sampling-seq") { + } + if (arg == "--sampling-seq") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.samplers_sequence = sampler_types_from_chars(argv[i]); - } else if (arg == "--top-p") { + } + if (arg == "--top-p") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.top_p = std::stof(argv[i]); - } else if (arg == "--min-p") { + } + if (arg == "--min-p") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.min_p = std::stof(argv[i]); - } else if (arg == "--temp") { + } + if (arg == "--temp") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.temp = std::stof(argv[i]); sparams.temp = std::max(sparams.temp, 0.0f); - } else if (arg == "--tfs") { + } + if (arg == "--tfs") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.tfs_z = std::stof(argv[i]); - } else if (arg == "--typical") { + } + if (arg == "--typical") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.typical_p = std::stof(argv[i]); - } else if (arg == "--repeat-last-n") { + } + if (arg == "--repeat-last-n") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_last_n = std::stoi(argv[i]); sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - } else if (arg == "--repeat-penalty") { + } + if (arg == "--repeat-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_repeat = std::stof(argv[i]); - } else if (arg == "--frequency-penalty") { + } + if (arg == "--frequency-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_freq = std::stof(argv[i]); - } else if (arg == "--presence-penalty") { + } + if (arg == "--presence-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_present = std::stof(argv[i]); - } else if (arg == "--dynatemp-range") { + } + if (arg == "--dynatemp-range") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.dynatemp_range = std::stof(argv[i]); - } else if (arg == "--dynatemp-exp") { + } + if (arg == "--dynatemp-exp") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.dynatemp_exponent = std::stof(argv[i]); - } else if (arg == "--mirostat") { + } + if (arg == "--mirostat") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat = std::stoi(argv[i]); - } else if (arg == "--mirostat-lr") { + } + if (arg == "--mirostat-lr") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat_eta = std::stof(argv[i]); - } else if (arg == "--mirostat-ent") { + } + if (arg == "--mirostat-ent") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat_tau = std::stof(argv[i]); - } else if (arg == "--cfg-negative-prompt") { + } + if (arg == "--cfg-negative-prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.cfg_negative_prompt = argv[i]; - } else if (arg == "--cfg-negative-prompt-file") { + } + if (arg == "--cfg-negative-prompt-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -471,86 +563,114 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { sparams.cfg_negative_prompt.pop_back(); } - } else if (arg == "--cfg-scale") { + } + if (arg == "--cfg-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.cfg_scale = std::stof(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { + } + if (arg == "-b" || arg == "--batch-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_batch = std::stoi(argv[i]); - } else if (arg == "-ub" || arg == "--ubatch-size") { + } + if (arg == "-ub" || arg == "--ubatch-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_ubatch = std::stoi(argv[i]); - } else if (arg == "--keep") { + } + if (arg == "--keep") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_keep = std::stoi(argv[i]); - } else if (arg == "--draft") { + } + if (arg == "--draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_draft = std::stoi(argv[i]); - } else if (arg == "--chunks") { + } + if (arg == "--chunks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_chunks = std::stoi(argv[i]); - } else if (arg == "-np" || arg == "--parallel") { + } + if (arg == "-np" || arg == "--parallel") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-ns" || arg == "--sequences") { + } + if (arg == "-ns" || arg == "--sequences") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_sequences = std::stoi(argv[i]); - } else if (arg == "--p-split" || arg == "-ps") { + } + if (arg == "--p-split" || arg == "-ps") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.p_split = std::stof(argv[i]); - } else if (arg == "-m" || arg == "--model") { + } + if (arg == "-m" || arg == "--model") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model = argv[i]; - } else if (arg == "-md" || arg == "--model-draft") { + } + if (arg == "-md" || arg == "--model-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model_draft = argv[i]; - } else if (arg == "-a" || arg == "--alias") { + } + if (arg == "-a" || arg == "--alias") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model_alias = argv[i]; - } else if (arg == "--lora") { + } + if (arg == "--lora") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; - } else if (arg == "--lora-scaled") { + } + if (arg == "--lora-scaled") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -562,19 +682,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; - } else if (arg == "--lora-base") { + } + if (arg == "--lora-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.lora_base = argv[i]; - } else if (arg == "--control-vector") { + } + if (arg == "--control-vector") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.control_vectors.push_back({ 1.0f, argv[i], }); - } else if (arg == "--control-vector-scaled") { + } + if (arg == "--control-vector-scaled") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -585,7 +711,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.control_vectors.push_back({ std::stof(argv[i]), fname, }); - } else if (arg == "--control-vector-layer-range") { + } + if (arg == "--control-vector-layer-range") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -596,49 +724,85 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.control_vector_layer_end = std::stoi(argv[i]); - } else if (arg == "--mmproj") { + } + if (arg == "--mmproj") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.mmproj = argv[i]; - } else if (arg == "--image") { + } + if (arg == "--image") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.image = argv[i]; - } else if (arg == "-i" || arg == "--interactive") { + } + if (arg == "-i" || arg == "--interactive") { + arg_found = true; params.interactive = true; - } else if (arg == "--embedding") { + } + if (arg == "--embedding") { + arg_found = true; params.embedding = true; - } else if (arg == "--interactive-first") { + } + if (arg == "--interactive-first") { + arg_found = true; params.interactive_first = true; - } else if (arg == "-ins" || arg == "--instruct") { + } + if (arg == "-ins" || arg == "--instruct") { + arg_found = true; params.instruct = true; - } else if (arg == "-cml" || arg == "--chatml") { + } + if (arg == "-cml" || arg == "--chatml") { + arg_found = true; params.chatml = true; - } else if (arg == "--infill") { + } + if (arg == "--infill") { + arg_found = true; params.infill = true; - } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { + } + if (arg == "-dkvc" || arg == "--dump-kv-cache") { + arg_found = true; params.dump_kv_cache = true; - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + } + if (arg == "-nkvo" || arg == "--no-kv-offload") { + arg_found = true; params.no_kv_offload = true; - } else if (arg == "-ctk" || arg == "--cache-type-k") { + } + if (arg == "-ctk" || arg == "--cache-type-k") { + arg_found = true; params.cache_type_k = argv[++i]; - } else if (arg == "-ctv" || arg == "--cache-type-v") { + } + if (arg == "-ctv" || arg == "--cache-type-v") { + arg_found = true; params.cache_type_v = argv[++i]; - } else if (arg == "--multiline-input") { + } + if (arg == "--multiline-input") { + arg_found = true; params.multiline_input = true; - } else if (arg == "--simple-io") { + } + if (arg == "--simple-io") { + arg_found = true; params.simple_io = true; - } else if (arg == "-cb" || arg == "--cont-batching") { + } + if (arg == "-cb" || arg == "--cont-batching") { + arg_found = true; params.cont_batching = true; - } else if (arg == "--color") { + } + if (arg == "--color") { + arg_found = true; params.use_color = true; - } else if (arg == "--mlock") { + } + if (arg == "--mlock") { + arg_found = true; params.use_mlock = true; - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + } + if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -648,7 +812,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + } + if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -658,7 +824,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - } else if (arg == "--main-gpu" || arg == "-mg") { + } + if (arg == "--main-gpu" || arg == "-mg") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -667,7 +835,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { #ifndef GGML_USE_CUBLAS_SYCL fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--split-mode" || arg == "-sm") { + } + if (arg == "--split-mode" || arg == "-sm") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -691,7 +861,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--tensor-split" || arg == "-ts") { + } + if (arg == "--tensor-split" || arg == "-ts") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -716,9 +888,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { #ifndef GGML_USE_CUBLAS_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--no-mmap") { + } + if (arg == "--no-mmap") { + arg_found = true; params.use_mmap = false; - } else if (arg == "--numa") { + } + if (arg == "--numa") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -728,17 +904,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } - } else if (arg == "--verbose-prompt") { + } + if (arg == "--verbose-prompt") { + arg_found = true; params.verbose_prompt = true; - } else if (arg == "--no-display-prompt") { + } + if (arg == "--no-display-prompt") { + arg_found = true; params.display_prompt = false; - } else if (arg == "-r" || arg == "--reverse-prompt") { + } + if (arg == "-r" || arg == "--reverse-prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.antiprompt.emplace_back(argv[i]); - } else if (arg == "-ld" || arg == "--logdir") { + } + if (arg == "-ld" || arg == "--logdir") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -748,63 +932,93 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } - } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + } + if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.logits_file = argv[i]; - } else if (arg == "--perplexity" || arg == "--all-logits") { + } + if (arg == "--perplexity" || arg == "--all-logits") { + arg_found = true; params.logits_all = true; - } else if (arg == "--ppl-stride") { + } + if (arg == "--ppl-stride") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.ppl_stride = std::stoi(argv[i]); - } else if (arg == "-ptc" || arg == "--print-token-count") { + } + if (arg == "-ptc" || arg == "--print-token-count") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_print = std::stoi(argv[i]); - } else if (arg == "--ppl-output-type") { + } + if (arg == "--ppl-output-type") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.ppl_output_type = std::stoi(argv[i]); - } else if (arg == "--hellaswag") { + } + if (arg == "--hellaswag") { + arg_found = true; params.hellaswag = true; - } else if (arg == "--hellaswag-tasks") { + } + if (arg == "--hellaswag-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.hellaswag_tasks = std::stoi(argv[i]); - } else if (arg == "--winogrande") { + } + if (arg == "--winogrande") { + arg_found = true; params.winogrande = true; - } else if (arg == "--winogrande-tasks") { + } + if (arg == "--winogrande-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.winogrande_tasks = std::stoi(argv[i]); - } else if (arg == "--multiple-choice") { + } + if (arg == "--multiple-choice") { + arg_found = true; params.multiple_choice = true; - } else if (arg == "--multiple-choice-tasks") { + } + if (arg == "--multiple-choice-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.multiple_choice_tasks = std::stoi(argv[i]); - } else if (arg == "--kl-divergence") { + } + if (arg == "--kl-divergence") { + arg_found = true; params.kl_divergence = true; - } else if (arg == "--ignore-eos") { + } + if (arg == "--ignore-eos") { + arg_found = true; params.ignore_eos = true; - } else if (arg == "--no-penalize-nl") { + } + if (arg == "--no-penalize-nl") { + arg_found = true; sparams.penalize_nl = false; - } else if (arg == "-l" || arg == "--logit-bias") { + } + if (arg == "-l" || arg == "--logit-bias") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -823,36 +1037,51 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - } else if (arg == "-h" || arg == "--help") { + } + if (arg == "-h" || arg == "--help") { + arg_found = true; return false; - - } else if (arg == "--version") { + } + if (arg == "--version") { + arg_found = true; fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); - } else if (arg == "--random-prompt") { + } + if (arg == "--random-prompt") { + arg_found = true; params.random_prompt = true; - } else if (arg == "--in-prefix-bos") { + } + if (arg == "--in-prefix-bos") { + arg_found = true; params.input_prefix_bos = true; - } else if (arg == "--in-prefix") { + } + if (arg == "--in-prefix") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.input_prefix = argv[i]; - } else if (arg == "--in-suffix") { + } + if (arg == "--in-suffix") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.input_suffix = argv[i]; - } else if (arg == "--grammar") { + } + if (arg == "--grammar") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.grammar = argv[i]; - } else if (arg == "--grammar-file") { + } + if (arg == "--grammar-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -868,7 +1097,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::istreambuf_iterator(), std::back_inserter(sparams.grammar) ); - } else if (arg == "--override-kv") { + } + if (arg == "--override-kv") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -911,10 +1142,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.kv_overrides.push_back(kvo); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters - } else if ( log_param_single_parse( argv[i] ) ) { + } + if ( log_param_single_parse( argv[i] ) ) { + arg_found = true; // Do nothing, log_param_single_parse automatically does it's thing // and returns if a match was found and parsed. - } else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { + } + if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { + arg_found = true; // We have a matching known parameter requiring an argument, // now we need to check if there is anything after this argv // and flag invalid_param or parse it. @@ -928,7 +1163,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS - } else { + } + + if (!arg_found) { throw std::invalid_argument("error: unknown argument: " + arg); } } From dfbfdd60f90207404039c6578d709231496831d9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 16 Mar 2024 16:42:08 +0100 Subject: [PATCH 29/48] readme : add wllama as a wasm binding (#6100) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5cbdf7e476b09..c2f3342f0ff42 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ Typically finetunes of the base models below are supported as well. - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) +- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) From b5f4ae09c3244ae1644b67c03ed9f4227ab25ad2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Sat, 16 Mar 2024 16:46:29 +0100 Subject: [PATCH 30/48] gritlm : add initial README.md (#6086) * gritlm: add initial README.md to examples/gritlm This commit adds a suggestion for an initial README.md for the gritlm example. Signed-off-by: Daniel Bevenius * squash! gritlm: add initial README.md to examples/gritlm Use the `scripts/hf.sh` script to download the model file. Signed-off-by: Daniel Bevenius * squash! gritlm: add initial README.md to examples/gritlm Fix editorconfig-checker error in examples/gritlm/README.md. Signed-off-by: Daniel Bevenius --------- Signed-off-by: Daniel Bevenius --- examples/gritlm/README.md | 62 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 examples/gritlm/README.md diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md new file mode 100644 index 0000000000000..64cc192040c43 --- /dev/null +++ b/examples/gritlm/README.md @@ -0,0 +1,62 @@ +## Generative Representational Instruction Tuning (GRIT) Example +[gritlm] a model which can generate embeddings as well as "normal" text +generation depending on the instructions in the prompt. + +* Paper: https://arxiv.org/pdf/2402.09906.pdf + +### Retrieval-Augmented Generation (RAG) use case +One use case for `gritlm` is to use it with RAG. If we recall how RAG works is +that we take documents that we want to use as context, to ground the large +language model (LLM), and we create token embeddings for them. We then store +these token embeddings in a vector database. + +When we perform a query, prompt the LLM, we will first create token embeddings +for the query and then search the vector database to retrieve the most +similar vectors, and return those documents so they can be passed to the LLM as +context. Then the query and the context will be passed to the LLM which will +have to _again_ create token embeddings for the query. But because gritlm is used +the first query can be cached and the second query tokenization generation does +not have to be performed at all. + +### Running the example +Download a Grit model: +```console +$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf +``` + +Run the example using the downloaded model: +```console +$ ./gritlm -m gritlm-7b_q4_1.gguf + +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 +Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112 +Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547 + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +When shadows lurk and ghosts do roam, +And darkness reigns, a fearsome sight. + +Thou didst set out, with heart aglow, +To conquer this mountain, so high, +And reach the summit, where the stars do glow, +And the moon shines bright, up in the sky. + +Through the mist and fog, thou didst press on, +With steadfast courage, and a steadfast will, +Through the darkness, thou didst not be gone, +But didst climb on, with a steadfast skill. + +At last, thou didst reach the summit's crest, +And gazed upon the world below, +And saw the beauty of the night's best, +And felt the peace, that only nature knows. + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +Thou art a hero, in the eyes of all, +For thou didst conquer this mountain, so bright. +``` + +[gritlm]: https://github.com/ContextualAI/gritlm From c47cf414efafb8f60596edc7edb5a2d68065e992 Mon Sep 17 00:00:00 2001 From: AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> Date: Sat, 16 Mar 2024 11:52:02 -0400 Subject: [PATCH 31/48] ggml : add AVX512F SIMD (#6088) --- ggml.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/ggml.c b/ggml.c index c94006e51a092..fa23cb3c44f87 100644 --- a/ggml.c +++ b/ggml.c @@ -931,6 +931,101 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE #endif +#elif defined(__AVX512F__) + +#define GGML_SIMD + +// F32 AVX512 + +#define GGML_F32_STEP 64 +#define GGML_F32_EPR 16 + +#define GGML_F32x16 __m512 +#define GGML_F32x16_ZERO _mm512_setzero_ps() +#define GGML_F32x16_SET1(x) _mm512_set1_ps(x) +#define GGML_F32x16_LOAD _mm512_loadu_ps +#define GGML_F32x16_STORE _mm512_storeu_ps +// _mm512_fmadd_ps is defined in AVX512F so no guard is required +#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32x16_ADD _mm512_add_ps +#define GGML_F32x16_MUL _mm512_mul_ps +#define GGML_F32x16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +// TODO: is this optimal ? + +#define GGML_F32_VEC GGML_F32x16 +#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x16_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD +#define GGML_F32_VEC_STORE GGML_F32x16_STORE +#define GGML_F32_VEC_FMA GGML_F32x16_FMA +#define GGML_F32_VEC_ADD GGML_F32x16_ADD +#define GGML_F32_VEC_MUL GGML_F32x16_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE + +// F16 AVX512 + +// F16 AVX + +#define GGML_F16_STEP 64 +#define GGML_F16_EPR 16 + +// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead + +#define GGML_F32Cx16 __m512 +#define GGML_F32Cx16_ZERO _mm512_setzero_ps() +#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) + +// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F +// so F16C guard isn't required +#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x))) +#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) + +#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32Cx16_ADD _mm512_add_ps +#define GGML_F32Cx16_MUL _mm512_mul_ps +#define GGML_F32Cx16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +#define GGML_F16_VEC GGML_F32Cx16 +#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE + #elif defined(__AVX__) #define GGML_SIMD From dc0f6125487dcfbff913360f9d877bc0ccf6aa57 Mon Sep 17 00:00:00 2001 From: GainLee Date: Mon, 18 Mar 2024 01:12:22 +0800 Subject: [PATCH 32/48] ggml:fix finding transfer queue family index error (#6094) Co-authored-by: GainLee --- ggml-vulkan.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 7cce616ba714f..698b31496f417 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -710,6 +710,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector= 0) { + return compute_index; + } + std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl; for(auto &q_family : queue_family_props) { From cd776c37c945bf58efc8fe44b370456680cb1b59 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Mar 2024 19:51:57 +0200 Subject: [PATCH 33/48] ci : close all stale issues at once (#6115) --- .github/workflows/close-issue.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index bc08a72d03166..2682f308cb46e 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -19,4 +19,5 @@ jobs: close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." days-before-pr-stale: -1 days-before-pr-close: -1 + operations-per-run: 1000 repo-token: ${{ secrets.GITHUB_TOKEN }} From d01b3c4c32357567f3531d4e6ceffc5d23e87583 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 17 Mar 2024 19:12:37 +0100 Subject: [PATCH 34/48] common: llama_load_model_from_url using --model-url (#6098) * common: llama_load_model_from_url with libcurl dependency Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 22 ++ .github/workflows/server.yml | 20 +- CMakeLists.txt | 1 + Makefile | 5 + common/CMakeLists.txt | 13 +- common/common.cpp | 238 +++++++++++++++++- common/common.h | 4 + examples/main/README.md | 1 + examples/server/README.md | 1 + examples/server/server.cpp | 8 + examples/server/tests/README.md | 2 +- .../server/tests/features/embeddings.feature | 3 +- examples/server/tests/features/environment.py | 97 +++---- examples/server/tests/features/server.feature | 3 +- examples/server/tests/features/steps/steps.py | 37 ++- examples/server/tests/requirements.txt | 1 + 16 files changed, 399 insertions(+), 57 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0da01d5ba6ead..945df42f886a6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -48,6 +48,28 @@ jobs: CC=gcc-8 make tests -j $(nproc) make test -j $(nproc) + ubuntu-focal-make-curl: + runs-on: ubuntu-20.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev + + - name: Build + id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 + LLAMA_CURL: 1 + run: | + CC=gcc-8 make -j $(nproc) + ubuntu-latest-cmake: runs-on: ubuntu-latest diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 5e38b3547c659..4ea09115a3c44 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -57,7 +57,8 @@ jobs: cmake \ python3-pip \ wget \ - language-pack-en + language-pack-en \ + libcurl4-openssl-dev - name: Build id: cmake_build @@ -67,6 +68,7 @@ jobs: cmake .. \ -DLLAMA_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server @@ -101,12 +103,21 @@ jobs: with: fetch-depth: 0 + - name: libCURL + id: get_libcurl + env: + CURL_VERSION: 8.6.0_6 + run: | + curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" + mkdir $env:RUNNER_TEMP/libcurl + tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl + - name: Build id: cmake_build run: | mkdir build cd build - cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ; + cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server - name: Python setup @@ -120,6 +131,11 @@ jobs: run: | pip install -r examples/server/tests/requirements.txt + - name: Copy Libcurl + id: prepare_libcurl + run: | + cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll + - name: Tests id: server_integration_tests if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ac2804a6881a..fc4cff28f44ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") +option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) diff --git a/Makefile b/Makefile index c0f1250366a64..838daf5c02acd 100644 --- a/Makefile +++ b/Makefile @@ -595,6 +595,11 @@ include scripts/get-flags.mk CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic endif +ifdef LLAMA_CURL +override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL +override LDFLAGS := $(LDFLAGS) -lcurl +endif + # # Print build information # diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 350bbdf7f7b1b..af2629a460b93 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -68,6 +68,17 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() +set(LLAMA_COMMON_EXTRA_LIBS build_info) + +# Use curl to download model url +if (LLAMA_CURL) + find_package(CURL REQUIRED) + add_definitions(-DLLAMA_USE_CURL) + include_directories(${CURL_INCLUDE_DIRS}) + find_library(CURL_LIBRARY curl REQUIRED) + set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) +endif () + target_include_directories(${TARGET} PUBLIC .) target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama) +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama) diff --git a/common/common.cpp b/common/common.cpp index 1b0ba849398de..2f5d965d6511c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -37,6 +37,9 @@ #include #include #endif +#if defined(LLAMA_USE_CURL) +#include +#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -50,6 +53,18 @@ #define GGML_USE_CUBLAS_SYCL_VULKAN #endif +#if defined(LLAMA_USE_CURL) +#ifdef __linux__ +#include +#elif defined(_WIN32) +#define PATH_MAX MAX_PATH +#else +#include +#endif +#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX +#define LLAMA_CURL_MAX_HEADER_LENGTH 256 +#endif // LLAMA_USE_CURL + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -644,6 +659,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } params.model = argv[i]; } + if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_url = argv[i]; + } if (arg == "-md" || arg == "--model-draft") { arg_found = true; if (++i >= argc) { @@ -1368,6 +1390,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); + printf(" model download url (default: %s)\n", params.model_url.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); printf(" draft model for speculative decoding\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); @@ -1613,10 +1637,222 @@ void llama_batch_add( batch.n_tokens++; } +#ifdef LLAMA_USE_CURL + +struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, + struct llama_model_params params) { + // Basic validation of the model_url + if (!model_url || strlen(model_url) == 0) { + fprintf(stderr, "%s: invalid model_url\n", __func__); + return NULL; + } + + // Initialize libcurl globally + auto curl = curl_easy_init(); + + if (!curl) { + fprintf(stderr, "%s: error initializing libcurl\n", __func__); + return NULL; + } + + // Set the URL, allow to follow http redirection + curl_easy_setopt(curl, CURLOPT_URL, model_url); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); +#if defined(_WIN32) + // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of + // operating system. Currently implemented under MS-Windows. + curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + + // Check if the file already exists locally + struct stat model_file_info; + auto file_exists = (stat(path_model, &model_file_info) == 0); + + // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files + char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; + snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model); + + char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; + snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model); + + if (file_exists) { + auto * f_etag = fopen(etag_path, "r"); + if (f_etag) { + if (!fgets(etag, sizeof(etag), f_etag)) { + fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path); + } else { + fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag); + } + fclose(f_etag); + } + + auto * f_last_modified = fopen(last_modified_path, "r"); + if (f_last_modified) { + if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) { + fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path); + } else { + fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path, + last_modified); + } + fclose(f_last_modified); + } + } + + // Send a HEAD request to retrieve the etag and last-modified headers + struct llama_load_model_from_url_headers { + char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + }; + llama_load_model_from_url_headers headers; + { + typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); + auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { + llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; + + const char * etag_prefix = "etag: "; + if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { + strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF + } + + const char * last_modified_prefix = "last-modified: "; + if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { + strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), + n_items - strlen(last_modified_prefix) - 2); // Remove CRLF + } + return n_items; + }; + + curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers); + + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + return NULL; + } + + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + if (http_code != 200) { + // HEAD not supported, we don't know if the file has changed + // force trigger downloading + file_exists = false; + fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + } + } + + // If the ETag or the Last-Modified headers are different: trigger a new download + if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { + char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; + snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model); + if (file_exists) { + fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model); + if (remove(path_model) != 0) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model); + return NULL; + } + } + + // Set the output file + auto * outfile = fopen(path_model_temporary, "wb"); + if (!outfile) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); + return NULL; + } + + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); + auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { + return fwrite(data, size, nmemb, (FILE *)fd); + }; + curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); + + // display download progress + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + + // start the download + fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + model_url, path_model, headers.etag, headers.last_modified); + auto res = curl_easy_perform(curl); + if (res != CURLE_OK) { + fclose(outfile); + curl_easy_cleanup(curl); + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + return NULL; + } + + long http_code = 0; + curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code); + if (http_code < 200 || http_code >= 400) { + fclose(outfile); + curl_easy_cleanup(curl); + fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); + return NULL; + } + + // Clean up + fclose(outfile); + + // Write the new ETag to the .etag file + if (strlen(headers.etag) > 0) { + auto * etag_file = fopen(etag_path, "w"); + if (etag_file) { + fputs(headers.etag, etag_file); + fclose(etag_file); + fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag); + } + } + + // Write the new lastModified to the .etag file + if (strlen(headers.last_modified) > 0) { + auto * last_modified_file = fopen(last_modified_path, "w"); + if (last_modified_file) { + fputs(headers.last_modified, last_modified_file); + fclose(last_modified_file); + fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path, + headers.last_modified); + } + } + + if (rename(path_model_temporary, path_model) != 0) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model); + return NULL; + } + } + + curl_easy_cleanup(curl); + + return llama_load_model_from_file(path_model, params); +} + +#else + +struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/, + struct llama_model_params /*params*/) { + fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + return nullptr; +} + +#endif // LLAMA_USE_CURL + std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = llama_model_params_from_gpt_params(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + llama_model * model = nullptr; + if (!params.model_url.empty()) { + model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); + } else { + model = llama_load_model_from_file(params.model.c_str(), mparams); + } if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); return std::make_tuple(nullptr, nullptr); diff --git a/common/common.h b/common/common.h index 687f3425e8544..8dd8a3edc9c94 100644 --- a/common/common.h +++ b/common/common.h @@ -89,6 +89,7 @@ struct gpt_params { struct llama_sampling_params sparams; std::string model = "models/7B/ggml-model-f16.gguf"; // model path + std::string model_url = ""; // model url to download std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string prompt = ""; @@ -191,6 +192,9 @@ std::tuple llama_init_from_gpt_par struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); +struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, + struct llama_model_params params); + // Batch utils void llama_batch_clear(struct llama_batch & batch); diff --git a/examples/main/README.md b/examples/main/README.md index 7f84e42623274..6a8d1e1c50cbb 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. diff --git a/examples/server/README.md b/examples/server/README.md index 8f8454affaecd..755e1d5384f55 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 895d608fdcc06..d2a8e541d3305 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co } printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); + printf(" model download url (default: %s)\n", params.model_url.c_str()); printf(" -a ALIAS, --alias ALIAS\n"); printf(" set an alias for the model, will be added as `model` field in completion response\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); @@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, break; } params.model = argv[i]; + } else if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_url = argv[i]; } else if (arg == "-a" || arg == "--alias") { if (++i >= argc) { invalid_param = true; diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 95a0353b6a9c5..feb2b1d6cf5de 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de To run a scenario annotated with `@bug`, start: ```shell -DEBUG=ON ./tests.sh --no-skipped --tags bug +DEBUG=ON ./tests.sh --no-skipped --tags bug --stop ``` After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated. diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index 57359b267a668..dcf1434f97124 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -4,7 +4,8 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models + And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf + And a model file ggml-model-f16.gguf And a model alias bert-bge-small And 42 as server seed And 2 slots diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 8ad987e1bb618..82104e9202e5e 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -1,10 +1,12 @@ -import errno import os +import signal import socket -import subprocess +import sys import time +import traceback from contextlib import closing -import signal + +import psutil def before_scenario(context, scenario): @@ -20,33 +22,40 @@ def before_scenario(context, scenario): def after_scenario(context, scenario): - if context.server_process is None: - return - if scenario.status == "failed": - if 'GITHUB_ACTIONS' in os.environ: - print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") - if os.path.isfile('llama.log'): - with closing(open('llama.log', 'r')) as f: - for line in f: - print(line) - if not is_server_listening(context.server_fqdn, context.server_port): - print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") - - if not pid_exists(context.server_process.pid): - assert False, f"Server not running pid={context.server_process.pid} ..." - - server_graceful_shutdown(context) - - # Wait few for socket to free up - time.sleep(0.05) - - attempts = 0 - while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port): - server_kill(context) - time.sleep(0.1) - attempts += 1 - if attempts > 5: - server_kill_hard(context) + try: + if 'server_process' not in context or context.server_process is None: + return + if scenario.status == "failed": + if 'GITHUB_ACTIONS' in os.environ: + print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") + if os.path.isfile('llama.log'): + with closing(open('llama.log', 'r')) as f: + for line in f: + print(line) + if not is_server_listening(context.server_fqdn, context.server_port): + print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") + + if not pid_exists(context.server_process.pid): + assert False, f"Server not running pid={context.server_process.pid} ..." + + server_graceful_shutdown(context) + + # Wait few for socket to free up + time.sleep(0.05) + + attempts = 0 + while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port): + server_kill(context) + time.sleep(0.1) + attempts += 1 + if attempts > 5: + server_kill_hard(context) + except: + exc = sys.exception() + print("error in after scenario: \n") + print(exc) + print("*** print_tb: \n") + traceback.print_tb(exc.__traceback__, file=sys.stdout) def server_graceful_shutdown(context): @@ -67,11 +76,11 @@ def server_kill_hard(context): path = context.server_path print(f"Server dangling exits, hard killing force {pid}={path}...\n") - if os.name == 'nt': - process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode() - print(process) - else: - os.kill(-pid, signal.SIGKILL) + try: + psutil.Process(pid).kill() + except psutil.NoSuchProcess: + return False + return True def is_server_listening(server_fqdn, server_port): @@ -84,17 +93,9 @@ def is_server_listening(server_fqdn, server_port): def pid_exists(pid): - """Check whether pid exists in the current process table.""" - if pid < 0: + try: + psutil.Process(pid) + except psutil.NoSuchProcess: return False - if os.name == 'nt': - output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode() - print(output) - return "No tasks are running" not in output - else: - try: - os.kill(pid, 0) - except OSError as e: - return e.errno == errno.EPERM - else: - return True + return True + diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 5014f326dc050..7448986e75a49 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -4,7 +4,8 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf + And a model file stories260K.gguf And a model alias tinyllama-2 And 42 as server seed # KV Cache corresponds to the total amount of tokens diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index a59a52d21748a..9e348d5fc4c37 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -5,6 +5,8 @@ import re import socket import subprocess +import sys +import threading import time from contextlib import closing from re import RegexFlag @@ -32,6 +34,8 @@ def step_server_config(context, server_fqdn, server_port): context.base_url = f'http://{context.server_fqdn}:{context.server_port}' context.model_alias = None + context.model_file = None + context.model_url = None context.n_batch = None context.n_ubatch = None context.n_ctx = None @@ -65,6 +69,16 @@ def step_download_hf_model(context, hf_file, hf_repo): print(f"model file: {context.model_file}\n") +@step('a model file {model_file}') +def step_model_file(context, model_file): + context.model_file = model_file + + +@step('a model url {model_url}') +def step_model_url(context, model_url): + context.model_url = model_url + + @step('a model alias {model_alias}') def step_model_alias(context, model_alias): context.model_alias = model_alias @@ -141,7 +155,8 @@ def step_start_server(context): async def step_wait_for_the_server_to_be_started(context, expecting_status): match expecting_status: case 'healthy': - await wait_for_health_status(context, context.base_url, 200, 'ok') + await wait_for_health_status(context, context.base_url, 200, 'ok', + timeout=30) case 'ready' | 'idle': await wait_for_health_status(context, context.base_url, 200, 'ok', @@ -1038,8 +1053,11 @@ def start_server_background(context): server_args = [ '--host', server_listen_addr, '--port', context.server_port, - '--model', context.model_file ] + if context.model_file: + server_args.extend(['--model', context.model_file]) + if context.model_url: + server_args.extend(['--model-url', context.model_url]) if context.n_batch: server_args.extend(['--batch-size', context.n_batch]) if context.n_ubatch: @@ -1079,8 +1097,23 @@ def start_server_background(context): pkwargs = { 'creationflags': flags, + 'stdout': subprocess.PIPE, + 'stderr': subprocess.PIPE } context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], **pkwargs) + + def log_stdout(process): + for line in iter(process.stdout.readline, b''): + print(line.decode('utf-8'), end='') + thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,)) + thread_stdout.start() + + def log_stderr(process): + for line in iter(process.stderr.readline, b''): + print(line.decode('utf-8'), end='', file=sys.stderr) + thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,)) + thread_stderr.start() + print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}") diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 2e4f42ad28c23..c2c960102b523 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -3,4 +3,5 @@ behave~=1.2.6 huggingface_hub~=0.20.3 numpy~=1.24.4 openai~=0.25.0 +psutil~=5.9.8 prometheus-client~=0.20.0 From 3a6efdd03c46c5ba08e43880d34260c02dd9999b Mon Sep 17 00:00:00 2001 From: Romain D <90720+Artefact2@users.noreply.github.com> Date: Mon, 18 Mar 2024 09:04:41 +0100 Subject: [PATCH 35/48] convert : use f32 outtype for bf16 tensors (#6106) The old behaviour is to use f16, but bf16 to f16 is not a lossless conversion. Change the outtype to f32 to default to a lossless conversion. --- convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 161430f3e717e..817cb66123a8f 100755 --- a/convert.py +++ b/convert.py @@ -1167,9 +1167,9 @@ def write_all( def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type - if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): + if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)): return GGMLFileType.AllF32 - if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)): + if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): return GGMLFileType.MostlyF16 if output_type_str == "q8_0": return GGMLFileType.MostlyQ8_0 From 9b03719ad712e2dc36c5c0c20f352bf3e4bda332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9rence?= <13496987+Royalphax@users.noreply.github.com> Date: Mon, 18 Mar 2024 09:17:00 +0100 Subject: [PATCH 36/48] convert : add support for CamembertModel architecture (#6119) Adding support for CamembertModel architecture used by : https://huggingface.co/dangvantuan/sentence-camembert-large --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index cf1f98d660633..1e49d56c19514 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1634,7 +1634,7 @@ def write_tensors(self): self.post_write_tensors(tensor_map, name, data_torch) -@Model.register("BertModel") +@Model.register("BertModel", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT From 496bc79bc2b79bfd6124b8687a8dbd6a646e9b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Mon, 18 Mar 2024 04:27:44 -0400 Subject: [PATCH 37/48] common : tidy-up argument parsing (#6105) * Tidy-up argument parsing. * Missing ref. * common : minor * common : add static classifier --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 1829 +++++++++++++++++++++++---------------------- 1 file changed, 921 insertions(+), 908 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 2f5d965d6511c..919182862cb5c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -154,1040 +154,1053 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { - bool invalid_param = false; - std::string arg; - const std::string arg_prefix = "--"; - llama_sampling_params & sparams = params.sparams; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } +static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int & i, bool & invalid_param) { + std::string arg = argv[i]; + llama_sampling_params& sparams = params.sparams; - bool arg_found = false; - if (arg == "-s" || arg == "--seed") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.seed = std::stoul(argv[i]); + if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-t" || arg == "--threads") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); - } - } - if (arg == "-tb" || arg == "--threads-batch") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); - } - } - if (arg == "-td" || arg == "--threads-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_draft = std::stoi(argv[i]); - if (params.n_threads_draft <= 0) { - params.n_threads_draft = std::thread::hardware_concurrency(); - } + params.seed = std::stoul(argv[i]); + return true; + } + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-tbd" || arg == "--threads-batch-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch_draft = std::stoi(argv[i]); - if (params.n_threads_batch_draft <= 0) { - params.n_threads_batch_draft = std::thread::hardware_concurrency(); - } + params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = std::thread::hardware_concurrency(); } - if (arg == "-p" || arg == "--prompt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.prompt = argv[i]; + return true; + } + if (arg == "-tb" || arg == "--threads-batch") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-e" || arg == "--escape") { - arg_found = true; - params.escape = true; + params.n_threads_batch = std::stoi(argv[i]); + if (params.n_threads_batch <= 0) { + params.n_threads_batch = std::thread::hardware_concurrency(); } - if (arg == "--prompt-cache") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.path_prompt_cache = argv[i]; + return true; + } + if (arg == "-td" || arg == "--threads-draft") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--prompt-cache-all") { - arg_found = true; - params.prompt_cache_all = true; + params.n_threads_draft = std::stoi(argv[i]); + if (params.n_threads_draft <= 0) { + params.n_threads_draft = std::thread::hardware_concurrency(); } - if (arg == "--prompt-cache-ro") { - arg_found = true; - params.prompt_cache_ro = true; + return true; + } + if (arg == "-tbd" || arg == "--threads-batch-draft") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-bf" || arg == "--binary-file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - } - if (arg == "-f" || arg == "--file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } + params.n_threads_batch_draft = std::stoi(argv[i]); + if (params.n_threads_batch_draft <= 0) { + params.n_threads_batch_draft = std::thread::hardware_concurrency(); } - if (arg == "-n" || arg == "--n-predict") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_predict = std::stoi(argv[i]); + return true; + } + if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--top-k") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.top_k = std::stoi(argv[i]); + params.prompt = argv[i]; + return true; + } + if (arg == "-e" || arg == "--escape") { + params.escape = true; + return true; + } + if (arg == "--prompt-cache") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-c" || arg == "--ctx-size") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); + params.path_prompt_cache = argv[i]; + return true; + } + if (arg == "--prompt-cache-all") { + params.prompt_cache_all = true; + return true; + } + if (arg == "--prompt-cache-ro") { + params.prompt_cache_ro = true; + return true; + } + if (arg == "-bf" || arg == "--binary-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); + return true; + } + if (arg == "-f" || arg == "--file") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--grp-attn-n" || arg == "-gan") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_n = std::stoi(argv[i]); + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; } - if (arg == "--grp-attn-w" || arg == "-gaw") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_w = std::stoi(argv[i]); + // store the external file name in params + params.prompt_file = argv[i]; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); } - if (arg == "--rope-freq-base") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_base = std::stof(argv[i]); + return true; + } + if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--rope-freq-scale") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = std::stof(argv[i]); + params.n_predict = std::stoi(argv[i]); + return true; + } + if (arg == "--top-k") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--rope-scaling") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { invalid_param = true; break; } - } - if (arg == "--rope-scale") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = 1.0f/std::stof(argv[i]); + sparams.top_k = std::stoi(argv[i]); + return true; + } + if (arg == "-c" || arg == "--ctx-size") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--yarn-orig-ctx") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_orig_ctx = std::stoi(argv[i]); + params.n_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-n" || arg == "-gan") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--yarn-ext-factor") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_ext_factor = std::stof(argv[i]); + params.grp_attn_n = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-w" || arg == "-gaw") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--yarn-attn-factor") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_attn_factor = std::stof(argv[i]); + params.grp_attn_w = std::stoi(argv[i]); + return true; + } + if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--yarn-beta-fast") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_fast = std::stof(argv[i]); + params.rope_freq_base = std::stof(argv[i]); + return true; + } + if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--yarn-beta-slow") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_slow = std::stof(argv[i]); + params.rope_freq_scale = std::stof(argv[i]); + return true; + } + if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--pooling") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else { invalid_param = true; break; } - } - if (arg == "--defrag-thold" || arg == "-dt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.defrag_thold = std::stof(argv[i]); + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { invalid_param = true; } + return true; + } + if (arg == "--rope-scale") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--samplers") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - const auto sampler_names = string_split(argv[i], ';'); - sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); + params.rope_freq_scale = 1.0f / std::stof(argv[i]); + return true; + } + if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--sampling-seq") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.samplers_sequence = sampler_types_from_chars(argv[i]); + params.yarn_orig_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--top-p") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.top_p = std::stof(argv[i]); + params.yarn_ext_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--min-p") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.min_p = std::stof(argv[i]); + params.yarn_attn_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--temp") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.temp = std::stof(argv[i]); - sparams.temp = std::max(sparams.temp, 0.0f); + params.yarn_beta_fast = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--tfs") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.tfs_z = std::stof(argv[i]); + params.yarn_beta_slow = std::stof(argv[i]); + return true; + } + if (arg == "--pooling") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--typical") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.typical_p = std::stof(argv[i]); + std::string value(argv[i]); + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else { invalid_param = true; } + return true; + } + if (arg == "--defrag-thold" || arg == "-dt") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--repeat-last-n") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_last_n = std::stoi(argv[i]); - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + params.defrag_thold = std::stof(argv[i]); + return true; + } + if (arg == "--samplers") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--repeat-penalty") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_repeat = std::stof(argv[i]); + const auto sampler_names = string_split(argv[i], ';'); + sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); + return true; + } + if (arg == "--sampling-seq") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--frequency-penalty") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_freq = std::stof(argv[i]); + sparams.samplers_sequence = sampler_types_from_chars(argv[i]); + return true; + } + if (arg == "--top-p") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--presence-penalty") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_present = std::stof(argv[i]); + sparams.top_p = std::stof(argv[i]); + return true; + } + if (arg == "--min-p") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--dynatemp-range") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.dynatemp_range = std::stof(argv[i]); + sparams.min_p = std::stof(argv[i]); + return true; + } + if (arg == "--temp") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--dynatemp-exp") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.dynatemp_exponent = std::stof(argv[i]); + sparams.temp = std::stof(argv[i]); + sparams.temp = std::max(sparams.temp, 0.0f); + return true; + } + if (arg == "--tfs") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--mirostat") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat = std::stoi(argv[i]); + sparams.tfs_z = std::stof(argv[i]); + return true; + } + if (arg == "--typical") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--mirostat-lr") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat_eta = std::stof(argv[i]); + sparams.typical_p = std::stof(argv[i]); + return true; + } + if (arg == "--repeat-last-n") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--mirostat-ent") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat_tau = std::stof(argv[i]); + sparams.penalty_last_n = std::stoi(argv[i]); + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + return true; + } + if (arg == "--repeat-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--cfg-negative-prompt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.cfg_negative_prompt = argv[i]; + sparams.penalty_repeat = std::stof(argv[i]); + return true; + } + if (arg == "--frequency-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--cfg-negative-prompt-file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); - if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { - sparams.cfg_negative_prompt.pop_back(); - } + sparams.penalty_freq = std::stof(argv[i]); + return true; + } + if (arg == "--presence-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--cfg-scale") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.cfg_scale = std::stof(argv[i]); + sparams.penalty_present = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-range") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-b" || arg == "--batch-size") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); + sparams.dynatemp_range = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-exp") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-ub" || arg == "--ubatch-size") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ubatch = std::stoi(argv[i]); + sparams.dynatemp_exponent = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--keep") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_keep = std::stoi(argv[i]); + sparams.mirostat = std::stoi(argv[i]); + return true; + } + if (arg == "--mirostat-lr") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_draft = std::stoi(argv[i]); + sparams.mirostat_eta = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat-ent") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--chunks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_chunks = std::stoi(argv[i]); + sparams.mirostat_tau = std::stof(argv[i]); + return true; + } + if (arg == "--cfg-negative-prompt") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-np" || arg == "--parallel") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_parallel = std::stoi(argv[i]); + sparams.cfg_negative_prompt = argv[i]; + return true; + } + if (arg == "--cfg-negative-prompt-file") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-ns" || arg == "--sequences") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_sequences = std::stoi(argv[i]); + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; } - if (arg == "--p-split" || arg == "-ps") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.p_split = std::stof(argv[i]); + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); + if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { + sparams.cfg_negative_prompt.pop_back(); } - if (arg == "-m" || arg == "--model") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; + return true; + } + if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-mu" || arg == "--model-url") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_url = argv[i]; + sparams.cfg_scale = std::stof(argv[i]); + return true; + } + if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-md" || arg == "--model-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_draft = argv[i]; + params.n_batch = std::stoi(argv[i]); + return true; + } + if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-a" || arg == "--alias") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_alias = argv[i]; + params.n_ubatch = std::stoi(argv[i]); + return true; + } + if (arg == "--keep") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--lora") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; + params.n_keep = std::stoi(argv[i]); + return true; + } + if (arg == "--draft") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--lora-scaled") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - const char * lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; + params.n_draft = std::stoi(argv[i]); + return true; + } + if (arg == "--chunks") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--lora-base") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; + params.n_chunks = std::stoi(argv[i]); + return true; + } + if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--control-vector") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vectors.push_back({ 1.0f, argv[i], }); + params.n_parallel = std::stoi(argv[i]); + return true; + } + if (arg == "-ns" || arg == "--sequences") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--control-vector-scaled") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - const char * fname = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + params.n_sequences = std::stoi(argv[i]); + return true; + } + if (arg == "--p-split" || arg == "-ps") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--control-vector-layer-range") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vector_layer_start = std::stoi(argv[i]); - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vector_layer_end = std::stoi(argv[i]); + params.p_split = std::stof(argv[i]); + return true; + } + if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--mmproj") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.mmproj = argv[i]; + params.model = argv[i]; + return true; + } + if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--image") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.image = argv[i]; + params.model_url = argv[i]; + return true; + } + if (arg == "-md" || arg == "--model-draft") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-i" || arg == "--interactive") { - arg_found = true; - params.interactive = true; + params.model_draft = argv[i]; + return true; + } + if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--embedding") { - arg_found = true; - params.embedding = true; + params.model_alias = argv[i]; + return true; + } + if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--interactive-first") { - arg_found = true; - params.interactive_first = true; + params.lora_adapter.emplace_back(argv[i], 1.0f); + params.use_mmap = false; + return true; + } + if (arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-ins" || arg == "--instruct") { - arg_found = true; - params.instruct = true; + const char* lora_adapter = argv[i]; + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-cml" || arg == "--chatml") { - arg_found = true; - params.chatml = true; + params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.use_mmap = false; + return true; + } + if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--infill") { - arg_found = true; - params.infill = true; + params.lora_base = argv[i]; + return true; + } + if (arg == "--control-vector") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-dkvc" || arg == "--dump-kv-cache") { - arg_found = true; - params.dump_kv_cache = true; + params.control_vectors.push_back({ 1.0f, argv[i], }); + return true; + } + if (arg == "--control-vector-scaled") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-nkvo" || arg == "--no-kv-offload") { - arg_found = true; - params.no_kv_offload = true; + const char* fname = argv[i]; + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-ctk" || arg == "--cache-type-k") { - arg_found = true; - params.cache_type_k = argv[++i]; + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + return true; + } + if (arg == "--control-vector-layer-range") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-ctv" || arg == "--cache-type-v") { - arg_found = true; - params.cache_type_v = argv[++i]; + params.control_vector_layer_start = std::stoi(argv[i]); + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--multiline-input") { - arg_found = true; - params.multiline_input = true; + params.control_vector_layer_end = std::stoi(argv[i]); + return true; + } + if (arg == "--mmproj") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--simple-io") { - arg_found = true; - params.simple_io = true; + params.mmproj = argv[i]; + return true; + } + if (arg == "--image") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-cb" || arg == "--cont-batching") { - arg_found = true; - params.cont_batching = true; + params.image = argv[i]; + return true; + } + if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + return true; + } + if (arg == "--embedding") { + params.embedding = true; + return true; + } + if (arg == "--interactive-first") { + params.interactive_first = true; + return true; + } + if (arg == "-ins" || arg == "--instruct") { + params.instruct = true; + return true; + } + if (arg == "-cml" || arg == "--chatml") { + params.chatml = true; + return true; + } + if (arg == "--infill") { + params.infill = true; + return true; + } + if (arg == "-dkvc" || arg == "--dump-kv-cache") { + params.dump_kv_cache = true; + return true; + } + if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; + return true; + } + if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + return true; + } + if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; + return true; + } + if (arg == "--multiline-input") { + params.multiline_input = true; + return true; + } + if (arg == "--simple-io") { + params.simple_io = true; + return true; + } + if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; + return true; + } + if (arg == "--color") { + params.use_color = true; + return true; + } + if (arg == "--mlock") { + params.use_mlock = true; + return true; + } + if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--color") { - arg_found = true; - params.use_color = true; + params.n_gpu_layers = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - if (arg == "--mlock") { - arg_found = true; - params.use_mlock = true; + return true; + } + if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_gpu_layers = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } + params.n_gpu_layers_draft = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_gpu_layers_draft = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } + return true; + } + if (arg == "--main-gpu" || arg == "-mg") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--main-gpu" || arg == "-mg") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.main_gpu = std::stoi(argv[i]); + params.main_gpu = std::stoi(argv[i]); #ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL + return true; + } + if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--split-mode" || arg == "-sm") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string arg_next = argv[i]; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } else if (arg_next == "row") { + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } + else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { #ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); - exit(1); + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); #endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } else { - invalid_param = true; - break; - } + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + invalid_param = true; + return true; + } #ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - + return true; + } + if (arg == "--tensor-split" || arg == "-ts") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--tensor-split" || arg == "-ts") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string arg_next = argv[i]; + std::string arg_next = argv[i]; - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - if (split_arg.size() >= llama_max_devices()) { - invalid_param = true; - break; + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + invalid_param = true; + return true; + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } else { - params.tensor_split[i] = 0.0f; - } + else { + params.tensor_split[i] = 0.0f; } + } #ifndef GGML_USE_CUBLAS_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL + return true; + } + if (arg == "--no-mmap") { + params.use_mmap = false; + return true; + } + if (arg == "--numa") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--no-mmap") { - arg_found = true; - params.use_mmap = false; + std::string value(argv[i]); + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; } + return true; + } + if (arg == "--verbose-prompt") { + params.verbose_prompt = true; + return true; + } + if (arg == "--no-display-prompt") { + params.display_prompt = false; + return true; + } + if (arg == "-r" || arg == "--reverse-prompt") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--numa") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } - } - if (arg == "--verbose-prompt") { - arg_found = true; - params.verbose_prompt = true; - } - if (arg == "--no-display-prompt") { - arg_found = true; - params.display_prompt = false; - } - if (arg == "-r" || arg == "--reverse-prompt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.antiprompt.emplace_back(argv[i]); + params.antiprompt.emplace_back(argv[i]); + return true; + } + if (arg == "-ld" || arg == "--logdir") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-ld" || arg == "--logdir") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.logdir = argv[i]; + params.logdir = argv[i]; - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; } - if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.logits_file = argv[i]; + return true; + } + if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--perplexity" || arg == "--all-logits") { - arg_found = true; - params.logits_all = true; + params.logits_file = argv[i]; + return true; + } + if (arg == "--perplexity" || arg == "--all-logits") { + params.logits_all = true; + return true; + } + if (arg == "--ppl-stride") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--ppl-stride") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.ppl_stride = std::stoi(argv[i]); + params.ppl_stride = std::stoi(argv[i]); + return true; + } + if (arg == "-ptc" || arg == "--print-token-count") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-ptc" || arg == "--print-token-count") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_print = std::stoi(argv[i]); + params.n_print = std::stoi(argv[i]); + return true; + } + if (arg == "--ppl-output-type") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--ppl-output-type") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.ppl_output_type = std::stoi(argv[i]); + params.ppl_output_type = std::stoi(argv[i]); + return true; + } + if (arg == "--hellaswag") { + params.hellaswag = true; + return true; + } + if (arg == "--hellaswag-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--hellaswag") { - arg_found = true; - params.hellaswag = true; + params.hellaswag_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--winogrande") { + params.winogrande = true; + return true; + } + if (arg == "--winogrande-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--hellaswag-tasks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.hellaswag_tasks = std::stoi(argv[i]); + params.winogrande_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--multiple-choice") { + params.multiple_choice = true; + return true; + } + if (arg == "--multiple-choice-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--winogrande") { - arg_found = true; - params.winogrande = true; + params.multiple_choice_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--kl-divergence") { + params.kl_divergence = true; + return true; + } + if (arg == "--ignore-eos") { + params.ignore_eos = true; + return true; + } + if (arg == "--no-penalize-nl") { + sparams.penalize_nl = false; + return true; + } + if (arg == "-l" || arg == "--logit-bias") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--winogrande-tasks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; + std::stringstream ss(argv[i]); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); } - params.winogrande_tasks = std::stoi(argv[i]); - } - if (arg == "--multiple-choice") { - arg_found = true; - params.multiple_choice = true; - } - if (arg == "--multiple-choice-tasks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; + else { + throw std::exception(); } - params.multiple_choice_tasks = std::stoi(argv[i]); } - if (arg == "--kl-divergence") { - arg_found = true; - params.kl_divergence = true; + catch (const std::exception&) { + invalid_param = true; + return true; } - if (arg == "--ignore-eos") { - arg_found = true; - params.ignore_eos = true; + return true; + } + if (arg == "-h" || arg == "--help") { + return false; + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + if (arg == "--random-prompt") { + params.random_prompt = true; + return true; + } + if (arg == "--in-prefix-bos") { + params.input_prefix_bos = true; + return true; + } + if (arg == "--in-prefix") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--no-penalize-nl") { - arg_found = true; - sparams.penalize_nl = false; + params.input_prefix = argv[i]; + return true; + } + if (arg == "--in-suffix") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-l" || arg == "--logit-bias") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::stringstream ss(argv[i]); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - } else { - throw std::exception(); - } - } catch (const std::exception&) { - invalid_param = true; - break; - } + params.input_suffix = argv[i]; + return true; + } + if (arg == "--grammar") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "-h" || arg == "--help") { - arg_found = true; - return false; + sparams.grammar = argv[i]; + return true; + } + if (arg == "--grammar-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(sparams.grammar) + ); + return true; + } + if (arg == "--override-kv") { + if (++i >= argc) { + invalid_param = true; + return true; } - if (arg == "--version") { - arg_found = true; - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); + char* sep = strchr(argv[i], '='); + if (sep == nullptr || sep - argv[i] >= 128) { + fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); + invalid_param = true; + return true; } - if (arg == "--random-prompt") { - arg_found = true; - params.random_prompt = true; + struct llama_model_kv_override kvo; + std::strncpy(kvo.key, argv[i], sep - argv[i]); + kvo.key[sep - argv[i]] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.int_value = std::atol(sep); } - if (arg == "--in-prefix-bos") { - arg_found = true; - params.input_prefix_bos = true; + else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.float_value = std::atof(sep); } - if (arg == "--in-prefix") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; + else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; } - params.input_prefix = argv[i]; - } - if (arg == "--in-suffix") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; + else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; } - params.input_suffix = argv[i]; - } - if (arg == "--grammar") { - arg_found = true; - if (++i >= argc) { + else { + fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); invalid_param = true; - break; + return true; } - sparams.grammar = argv[i]; } - if (arg == "--grammar-file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(sparams.grammar) - ); - } - if (arg == "--override-kv") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - char * sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - struct llama_model_kv_override kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - } else { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - params.kv_overrides.push_back(kvo); + else { + fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + params.kv_overrides.push_back(kvo); + return true; + } #ifndef LOG_DISABLE_LOGS - // Parse args for logging parameters - } - if ( log_param_single_parse( argv[i] ) ) { - arg_found = true; - // Do nothing, log_param_single_parse automatically does it's thing - // and returns if a match was found and parsed. - } - if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { - arg_found = true; - // We have a matching known parameter requiring an argument, - // now we need to check if there is anything after this argv - // and flag invalid_param or parse it. - if (++i >= argc) { - invalid_param = true; - break; - } - if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) { - invalid_param = true; - break; - } - // End of Parse args for logging parameters + // Parse args for logging parameters + if (log_param_single_parse(argv[i])) { + // Do nothing, log_param_single_parse automatically does it's thing + // and returns if a match was found and parsed. + return true; + } + if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { + // We have a matching known parameter requiring an argument, + // now we need to check if there is anything after this argv + // and flag invalid_param or parse it. + if (++i >= argc) { + invalid_param = true; + return true; + } + if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { + invalid_param = true; + return true; + } + return true; + } + // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS + + return false; +} + +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { + bool invalid_param = false; + std::string arg; + const std::string arg_prefix = "--"; + llama_sampling_params & sparams = params.sparams; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); } - if (!arg_found) { + if (!gpt_params_find_arg(argc, argv, params, i, invalid_param)) { throw std::invalid_argument("error: unknown argument: " + arg); } } From 2bf8d0f7c4cc1235755ad06961ca761e458c5e55 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 18 Mar 2024 11:03:04 +0100 Subject: [PATCH 38/48] backend : offload large batches to GPU (#6083) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * backend : offload large batches to GPU * fix hip * code cleanup * fix CUDA split buffers * Update ggml-backend-impl.h Co-authored-by: Johannes Gäßler * cuda : fix memset without set_device * imatrix : remove sched affix from weight names * sched : add a new split if the current one has too many inputs reduce max inputs per split more cleanup * update backends ggml-ci --------- Co-authored-by: Johannes Gäßler --- examples/imatrix/imatrix.cpp | 32 ++- examples/llama-bench/llama-bench.cpp | 4 +- ggml-alloc.c | 10 +- ggml-backend-impl.h | 5 + ggml-backend.c | 278 ++++++++++++++----------- ggml-backend.h | 8 +- ggml-cuda.cu | 297 +++++++++------------------ ggml-cuda.h | 21 +- ggml-kompute.cpp | 1 + ggml-metal.m | 1 + ggml-sycl.cpp | 1 + ggml-vulkan.cpp | 1 + ggml.c | 19 +- llama.cpp | 67 +++--- 14 files changed, 349 insertions(+), 396 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index f21bc48f3b466..ea79b9062ddad 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -56,13 +56,31 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const struct ggml_tensor * src0 = t->src[0]; const struct ggml_tensor * src1 = t->src[1]; + std::string wname; + { + // remove any prefix and suffixes from the name + // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight + const char * p = strchr(src0->name, '#'); + if (p != NULL) { + p = p + 1; + const char * q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = src0->name; + } + } + // when ask is true, the scheduler wants to know if we are interested in data from this tensor // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection if (ask) { if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications if (t->op != GGML_OP_MUL_MAT) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; + if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; return true; } @@ -94,12 +112,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // this is necessary to guarantee equal number of "ncall" for each tensor for (int ex = 0; ex < n_as; ++ex) { src0 = t->src[2 + ex]; - auto& e = m_stats[src0->name]; + auto& e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ASSERT(false); } // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger @@ -107,7 +125,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; ++e.ncall; if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { const int excur = m_ids[row*n_as + idx]; @@ -129,17 +147,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } } } else { - auto& e = m_stats[src0->name]; + auto& e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ASSERT(false); } ++e.ncall; if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { const float * x = data + row * src1->ne[0]; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 32eea786919f6..4cb2308044732 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -114,10 +114,10 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::string id; #ifdef GGML_USE_CUBLAS - int count = ggml_cuda_get_device_count(); + int count = ggml_backend_cuda_get_device_count(); for (int i = 0; i < count; i++) { char buf[128]; - ggml_cuda_get_device_description(i, buf, sizeof(buf)); + ggml_backend_cuda_get_device_description(i, buf, sizeof(buf)); id += buf; if (i < count - 1) { id += "/"; diff --git a/ggml-alloc.c b/ggml-alloc.c index 8ac1d3e51470c..643b2e55f2d2a 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view(node)) { + // TODO: better way to add external dependencies + // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to + // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node + // itself is never used and should not be considered a dependency + if (ggml_is_view(node) && node->op != GGML_OP_NONE) { struct ggml_tensor * view_src = node->view_src; ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; } @@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr ggml_gallocr_hash_get(galloc, src)->n_children += 1; - // allocate explicit inputs and leafs - if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) { + // allocate explicit inputs + if (src->flags & GGML_TENSOR_FLAG_INPUT) { ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i)); } } diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index e475e20e5f46a..f121e1de420fa 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -103,6 +103,11 @@ extern "C" { // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer + // these should be expensive operations with large batch sizes that may benefit from running on this backend + // even if the weight has to be copied from the CPU temporarily + bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // (optional) event synchronization ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); void (*GGML_CALL event_free) (ggml_backend_event_t event); diff --git a/ggml-backend.c b/ggml-backend.c index 31f8d5a6dd30b..9f0084df78942 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_ return err; } -bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->iface.graph_compute(backend, cgraph); } @@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * return backend->iface.supports_op(backend, op); } +bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { + if (backend->iface.offload_op != NULL) { + return backend->iface.offload_op(backend, op); + } + return false; +} + // backend copy static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { @@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg if (cpu_plan->cplan.work_size > 0) { cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + if (cpu_plan->cplan.work_data == NULL) { + free(cpu_plan); + return NULL; + } } cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; @@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = { /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ ggml_backend_cpu_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, @@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) { #endif #ifndef GGML_SCHED_MAX_SPLITS -#define GGML_SCHED_MAX_SPLITS 256 +#define GGML_SCHED_MAX_SPLITS 2048 #endif #ifndef GGML_SCHED_MAX_SPLIT_INPUTS -#define GGML_SCHED_MAX_SPLIT_INPUTS 16 +#define GGML_SCHED_MAX_SPLIT_INPUTS 4 #endif #ifndef GGML_SCHED_MAX_COPIES @@ -1043,8 +1055,9 @@ struct ggml_backend_sched { struct ggml_cgraph * graph; // graph splits - struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS]; + struct ggml_backend_sched_split * splits; int n_splits; + int splits_capacity; // pipeline parallelism support int n_copies; @@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // TODO: use supports_op to check if the backend supports the op // assign pre-allocated nodes to their backend - // dst - int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor); - if (cur_backend != -1) { + int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor); + if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.dst"); - return cur_backend; + return cur_backend_id; } // view_src if (tensor->view_src != NULL) { - cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); - if (cur_backend != -1) { + cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); + if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.vsrc"); - return cur_backend; + return cur_backend_id; } } - // input + // graph input if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - cur_backend = sched->n_backends - 1; // last backend (assumed CPU) + cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU) SET_CAUSE(tensor, "1.inp"); - return cur_backend; + return cur_backend_id; } // assign nodes that use weights to the backend of the weights + // operations with weights are preferably run on the same backend as the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = tensor->src[i]; if (src == NULL) { continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend = ggml_backend_sched_backend_from_buffer(sched, src); - // operations with weights are always run on the same backend as the weights + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src); + // check if a backend with higher prio wants to offload the op + if (src_backend_id == sched->n_backends - 1) { + for (int b = 0; b < src_backend_id; b++) { + if (ggml_backend_offload_op(sched->backends[b], tensor)) { + SET_CAUSE(tensor, "1.off"); + return b; + } + } + } SET_CAUSE(tensor, "1.wgt%d", i); - return src_backend; + return src_backend_id; } } @@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; - if (tensor_backend_id(leaf) != -1) { + int * leaf_backend_id = &tensor_backend_id(leaf); + if (*leaf_backend_id != -1) { // do not overwrite user assignments continue; } - tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf); + *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (tensor_backend_id(node) != -1) { + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { // do not overwrite user assignments continue; } - tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node); + *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); // src for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } - if (tensor_backend_id(src) == -1) { - tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src); + int * src_backend_id = &tensor_backend_id(src); + if (*src_backend_id == -1) { + *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); } } } @@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - if (tensor_backend_id == sched->n_backends - 1) { + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; } else { - cur_backend_id = tensor_backend_id; + cur_backend_id = *node_backend_id; } } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.2"); } } } - // pass 2.1 expand gpu up { int cur_backend_id = -1; @@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - if (tensor_backend_id == sched->n_backends - 1) { + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; } else { - cur_backend_id = tensor_backend_id; + cur_backend_id = *node_backend_id; } } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.1"); } } } - - // pass 2.4 expand rest down { int cur_backend_id = -1; @@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - cur_backend_id = tensor_backend_id; + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + cur_backend_id = *node_backend_id; } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.4"); } } } - // pass 2.3 expand rest up + // pass 2.3 expand rest up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - cur_backend_id = tensor_backend_id; + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + cur_backend_id = *node_backend_id; } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.3"); } } @@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - int cur_backend_id = tensor_backend_id(node); - if (node->view_src != NULL && cur_backend_id == -1) { - cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src); + int * cur_backend_id = &tensor_backend_id(node); + if (node->view_src != NULL && *cur_backend_id == -1) { + *cur_backend_id = tensor_backend_id(node->view_src); SET_CAUSE(node, "3.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src == NULL) { continue; } - int src_backend_id = tensor_backend_id(src); - if (src_backend_id == -1) { + int * src_backend_id = &tensor_backend_id(src); + if (*src_backend_id == -1) { if (src->view_src != NULL) { // views are always on the same backend as the source - tensor_backend_id(src) = tensor_backend_id(src->view_src); + *src_backend_id = tensor_backend_id(src->view_src); SET_CAUSE(src, "3.vsrc"); } else { - tensor_backend_id(src) = cur_backend_id; + *src_backend_id = *cur_backend_id; SET_CAUSE(src, "3.cur"); } } @@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // pass 4: split graph, find tensors that need to be copied { - int cur_split = 0; + int i_split = 0; + struct ggml_backend_sched_split * split = &sched->splits[0]; // find the backend of the first split, skipping view ops for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { - sched->splits[0].backend_id = tensor_backend_id(node); + split->backend_id = tensor_backend_id(node); break; } } - sched->splits[0].i_start = 0; - sched->splits[0].n_inputs = 0; - memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - int cur_backend_id = sched->splits[0].backend_id; + split->i_start = 0; + split->n_inputs = 0; + memset(split->inputs, 0, sizeof(split->inputs)); //HACK + int cur_backend_id = split->backend_id; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - int tensor_backend_id = tensor_backend_id(node); + const int node_backend_id = tensor_backend_id(node); - GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now + GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now - if (tensor_backend_id != cur_backend_id) { - sched->splits[cur_split].i_end = i; - cur_split++; - GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS); - sched->splits[cur_split].backend_id = tensor_backend_id; - sched->splits[cur_split].i_start = i; - sched->splits[cur_split].n_inputs = 0; - cur_backend_id = tensor_backend_id; + // check if we should start a new split based on the sources of the current node + bool need_new_split = false; + if (node_backend_id == cur_backend_id && split->n_inputs > 0) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + // check if a weight is on a different backend + // by starting a new split, the memory of the previously offloaded weights can be reused + if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + int src_backend_id = tensor_backend_id(src); + if (src_backend_id != -1 && src_backend_id != cur_backend_id) { + need_new_split = true; + break; + } + } + // check if the split has too many inputs + if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { + const size_t id = hash_id(src); + int src_backend_id = sched->tensor_backend_id[id]; + if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) { + //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); + need_new_split = true; + break; + } + } + } + } + + if (node_backend_id != cur_backend_id || need_new_split) { + split->i_end = i; + i_split++; + if (i_split >= sched->splits_capacity) { + sched->splits_capacity *= 2; + sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); + GGML_ASSERT(sched->splits != NULL); + } + GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS); + split = &sched->splits[i_split]; + split->backend_id = node_backend_id; + split->i_start = i; + split->n_inputs = 0; + cur_backend_id = node_backend_id; } // find inputs that are not on the same backend @@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - int src_backend_id = tensor_backend_id(src); + const int src_backend_id = tensor_backend_id(src); assert(src_backend_id != -1); // all inputs should be assigned by now - if (src->flags & GGML_TENSOR_FLAG_INPUT) { + if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { size_t id = hash_id(src); if (sched->tensor_copies[id][src_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; @@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } sched->tensor_copies[id][src_backend_id][c] = tensor_copy; - tensor_backend_id(tensor_copy) = src_backend_id; SET_CAUSE(tensor_copy, "4.cpy"); } int n_graph_inputs = sched->n_graph_inputs++; @@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != tensor_backend_id) { + if (src_backend_id != node_backend_id) { // create a copy of the input in the split's backend - size_t id = hash_id(src); + const size_t id = hash_id(src); if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[cur_backend_id]; for (int c = 0; c < sched->n_copies; c++) { @@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } sched->tensor_copies[id][cur_backend_id][c] = tensor_copy; - tensor_backend_id(tensor_copy) = cur_backend_id; SET_CAUSE(tensor_copy, "4.cpy"); } - int n_inputs = sched->splits[cur_split].n_inputs++; + int n_inputs = split->n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = src; + split->inputs[n_inputs] = src; } node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy]; } } } - sched->splits[cur_split].i_end = graph->n_nodes; - sched->n_splits = cur_split + 1; + split->i_end = graph->n_nodes; + sched->n_splits = i_split + 1; } #ifdef DEBUG_PASS4 fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); #endif -#ifndef NDEBUG - // sanity check: all sources should have the same backend as the node - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - if (tensor_backend == NULL) { - fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); - } - if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) { - fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", - node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", - node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ? - ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL"); - } - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - if (src_backend != tensor_backend /* && src_backend != NULL */) { - fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", - node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", - j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL"); - } - if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) { - fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", - src->name, src_backend ? ggml_backend_name(src_backend) : "NULL", - src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ? - ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL"); - } - } - } - fflush(stderr); -#endif - // create copies of the graph for each split // TODO: avoid this copy - struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false); + struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { + assert(graph_copy->size > (graph_copy->n_nodes + 1)); + struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy]; + const size_t input_id = hash_id(input); + struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy]; // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); input_dep->src[0] = input; - sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input); + sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id]; graph_copy->nodes[graph_copy->n_nodes++] = input_dep; // add a dependency to the input copy so that it is allocated at the start of the split @@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } for (int j = split->i_start; j < split->i_end; j++) { + assert(graph_copy->size > graph_copy->n_nodes); sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; } @@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } ggml_backend_tensor_copy(input, input_cpy); } else { + // wait for the split backend to finish using the input before overwriting it if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]); } else { ggml_backend_synchronize(split_backend); - ggml_backend_synchronize(input_backend); } - ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); } } @@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); // initialize hash table - sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS); + sched->hash_set = ggml_hash_set_new(graph_size); sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); - sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size); - sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size); + + const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; + sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size); + sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size); sched->n_backends = n_backends; sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; - GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES); + const int initial_splits_capacity = 16; + sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity); + sched->splits_capacity = initial_splits_capacity; for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; @@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); + free(sched->splits); free(sched->hash_set.keys); free(sched->tensor_backend_id); free(sched->tensor_copies); @@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { } bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes); + ggml_backend_sched_split_graph(sched, measure_graph); // TODO: extract this to a separate function @@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * } bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes); ggml_backend_sched_split_graph(sched, graph); diff --git a/ggml-backend.h b/ggml-backend.h index 099d9c258794e..422457ab6fc50 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -70,11 +70,11 @@ extern "C" { GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - - GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index db595409aa735..139025588ed61 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -82,6 +82,10 @@ #define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterPortable hipHostRegisterPortable +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly +#define cudaHostUnregister hipHostUnregister #define cudaLaunchHostFunc hipLaunchHostFunc #ifdef GGML_HIP_UMA #define cudaMalloc hipMallocManaged @@ -7787,11 +7791,7 @@ struct cuda_pool_alloc { static bool g_cublas_loaded = false; -GGML_CALL bool ggml_cublas_loaded(void) { - return g_cublas_loaded; -} - -GGML_CALL void ggml_init_cublas() { +static void ggml_init_cublas() { static bool initialized = false; if (!initialized) { @@ -7880,7 +7880,7 @@ GGML_CALL void ggml_init_cublas() { } } -GGML_CALL void * ggml_cuda_host_malloc(size_t size) { +static void * ggml_cuda_host_malloc(size_t size) { if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { return nullptr; } @@ -7890,7 +7890,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) { if (err != cudaSuccess) { // clear the error cudaGetLastError(); - fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size/1024.0/1024.0, cudaGetErrorString(err)); return nullptr; } @@ -7898,7 +7898,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) { return ptr; } -GGML_CALL void ggml_cuda_host_free(void * ptr) { +static void ggml_cuda_host_free(void * ptr) { CUDA_CHECK(cudaFreeHost(ptr)); } @@ -9036,21 +9036,13 @@ static void ggml_cuda_op_soft_max( // positions tensor float * src2_dd = nullptr; - cuda_pool_alloc src2_f; ggml_tensor * src2 = dst->src[2]; const bool use_src2 = src2 != nullptr; if (use_src2) { - const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU; - - if (src2_on_device) { - ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; - src2_dd = (float *) src2_extra->data_device[g_main_device]; - } else { - src2_dd = src2_f.alloc(ggml_nelements(src2)); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream)); - } + ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; + src2_dd = (float *) src2_extra->data_device[g_main_device]; } soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream); @@ -9107,55 +9099,24 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; - // dd = data device float * src0_ddf = nullptr; float * src1_ddf = nullptr; float * dst_ddf = nullptr; - cuda_pool_alloc src0_f; - cuda_pool_alloc src1_f; - cuda_pool_alloc dst_f; - ggml_cuda_set_device(g_main_device); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - if (src0_on_device) { - src0_ddf = (float *) src0_extra->data_device[g_main_device]; - } else { - src0_ddf = src0_f.alloc(ggml_nelements(src0)); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); - } + src0_ddf = (float *) src0_extra->data_device[g_main_device]; if (use_src1) { - if (src1_on_device) { - src1_ddf = (float *) src1_extra->data_device[g_main_device]; - } else { - src1_ddf = src1_f.alloc(ggml_nelements(src1)); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); - } - } - if (dst_on_device) { - dst_ddf = (float *) dst_extra->data_device[g_main_device]; - } else { - dst_ddf = dst_f.alloc(ggml_nelements(dst)); + src1_ddf = (float *) src1_extra->data_device[g_main_device]; } + dst_ddf = (float *) dst_extra->data_device[g_main_device]; // do the computation op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); CUDA_CHECK(cudaGetLastError()); - - // copy dst to host if necessary - if (!dst_on_device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); - } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CUDA_CHECK(cudaDeviceSynchronize()); - } } static void ggml_cuda_set_peer_access(const int n_tokens) { @@ -9251,7 +9212,6 @@ static void ggml_cuda_op_mul_mat( ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); @@ -9322,13 +9282,13 @@ static void ggml_cuda_op_mul_mat( used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool src1_on_device = id == g_main_device; // TODO: check from buffer + const bool dst_on_device = id == g_main_device; ggml_cuda_set_device(id); cudaStream_t stream = g_cudaStreams[id][0]; - if (src0_on_device && src0_is_contiguous) { + if (src0_is_contiguous) { dev[id].src0_dd = (char *) src0_extra->data_device[id]; } else { dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0)); @@ -9374,8 +9334,8 @@ static void ggml_cuda_op_mul_mat( continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool src1_on_device = id == g_main_device; // TODO: check from buffer + const bool dst_on_device = id == g_main_device; const int64_t row_diff = dev[id].row_high - dev[id].row_low; ggml_cuda_set_device(id); @@ -9400,12 +9360,12 @@ static void ggml_cuda_op_mul_mat( // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) { + if (id == g_main_device) { dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { + if (src1_is_contiguous) { if (id != g_main_device) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset; @@ -9418,19 +9378,19 @@ static void ggml_cuda_op_mul_mat( src1_ncols*ne10*sizeof(float), stream)); } } - } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) { + } else if (src1_on_device && !src1_is_contiguous) { CUDA_CHECK(ggml_cuda_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) { + if (convert_src1_to_q8_1 && !src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } - if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) { CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream)); } @@ -9441,17 +9401,7 @@ static void ggml_cuda_op_mul_mat( // copy dst to host or other device if necessary if (!dst_on_device) { - void * dst_off_device; - cudaMemcpyKind kind; - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - dst_off_device = dst->data; - kind = cudaMemcpyDeviceToHost; - } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { - dst_off_device = dst_extra->data_device[g_main_device]; - kind = cudaMemcpyDeviceToDevice; - } else { - GGML_ASSERT(false); - } + void * dst_off_device = dst_extra->data_device[g_main_device]; if (split) { // src0 = weight matrix is saved as a transposed matrix for better memory layout. // dst is NOT transposed. @@ -9462,28 +9412,26 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; #if !defined(GGML_USE_HIPBLAS) - if (kind == cudaMemcpyDeviceToDevice) { - // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices - cudaMemcpy3DPeerParms p = {}; - p.dstDevice = g_main_device; - p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); - p.srcDevice = id; - p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); - p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); - CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); - } else + // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices + cudaMemcpy3DPeerParms p = {}; + p.dstDevice = g_main_device; + p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); + p.srcDevice = id; + p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); + p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); + CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); +#else + // HIP does not support cudaMemcpy3DPeerAsync or vmm pools + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), + dst_dd_i, row_diff*sizeof(float), + row_diff*sizeof(float), src1_ncols, + cudaMemcpyDeviceToDevice, stream)); #endif - { - CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), - dst_dd_i, row_diff*sizeof(float), - row_diff*sizeof(float), src1_ncols, - kind, stream)); - } } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); dhf_dst_i += src1_col_0*ne0; - CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream)); + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream)); } } @@ -9510,11 +9458,6 @@ static void ggml_cuda_op_mul_mat( } } } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaDeviceSynchronize()); - } } static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -9599,36 +9542,19 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; - // dd = data device float * src0_ddf = nullptr; float * src1_ddf = nullptr; float * dst_ddf = nullptr; - cuda_pool_alloc dst_f; - ggml_cuda_set_device(g_main_device); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - if (dst_on_device) { - dst_ddf = (float *) dst_extra->data_device[g_main_device]; - } else { - dst_ddf = dst_f.alloc(ggml_nelements(dst)); - } + dst_ddf = (float *) dst_extra->data_device[g_main_device]; // do the computation ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); CUDA_CHECK(cudaGetLastError()); - - // copy dst to host if necessary - if (!dst_on_device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); - } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CUDA_CHECK(cudaDeviceSynchronize()); - } } static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -9639,21 +9565,6 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } -GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (!g_cublas_loaded) return false; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && - src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); -} - static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); @@ -9891,11 +9802,6 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm } static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const bool all_on_device = - (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && - (src1->backend == GGML_BACKEND_TYPE_GPU) && - ( dst->backend == GGML_BACKEND_TYPE_GPU); - const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; @@ -9972,13 +9878,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_cuda_mul_mat_batched_cublas(src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { @@ -10178,6 +10084,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_cuda_mul_mat_id_cublas(dst); // TODO: mmq/mmv support #endif + cudaStream_t stream = g_cudaStreams[g_main_device][0]; const size_t nb11 = src1->nb[1]; const size_t nb1 = dst->nb[1]; @@ -10187,16 +10094,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const int32_t n_as = ((int32_t *) dst->op_params)[1]; std::vector ids_host(ggml_nbytes(ids)); - - cudaStream_t stream = g_cudaStreams[g_main_device][0]; - - if (ids->backend == GGML_BACKEND_TYPE_GPU) { - const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - } else { - memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); - } + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; @@ -10213,20 +10113,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? - (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; - char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? - (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; + char * src1_original = (char *) src1_extra->data_device[g_main_device]; + char * dst_original = (char *) dst_extra->data_device[g_main_device]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - //int32_t row_id; - //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); - const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); GGML_ASSERT(row_id >= 0 && row_id < n_as); @@ -10248,11 +10139,6 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); - const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ? - cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; - const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ? - cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice; - for (int32_t row_id = 0; row_id < n_as; ++row_id) { const struct ggml_tensor * src0_row = dst->src[row_id + 2]; @@ -10267,7 +10153,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(row_id >= 0 && row_id < n_as); CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11, - nb11, src1_kind, stream)); + nb11, cudaMemcpyDeviceToDevice, stream)); num_src1_rows++; } @@ -10299,15 +10185,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(row_id >= 0 && row_id < n_as); CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1, - nb1, dst_kind, stream)); + nb1, cudaMemcpyDeviceToDevice, stream)); num_src1_rows++; } } } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CUDA_CHECK(cudaStreamSynchronize(stream)); - } } static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -10435,7 +10317,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -GGML_CALL static void ggml_cuda_set_main_device(const int main_device) { +static void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -10450,18 +10332,9 @@ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) { } } -GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) { if (!g_cublas_loaded) return false; - ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); - - if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { - return false; - } - if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { #ifndef NDEBUG @@ -10471,6 +10344,8 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st } } + ggml_cuda_func_t func; + switch (tensor->op) { case GGML_OP_REPEAT: func = ggml_cuda_repeat; @@ -10548,15 +10423,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { - return false; - } func = ggml_cuda_mul_mat; break; case GGML_OP_MUL_MAT_ID: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) { - return false; - } func = ggml_cuda_mul_mat_id; break; case GGML_OP_SCALE: @@ -10613,17 +10482,11 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); } - if (params->ith != 0) { - return true; - } - if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { - return true; - } func(tensor->src[0], tensor->src[1], tensor); return true; } -GGML_CALL int ggml_cuda_get_device_count() { +static int ggml_cuda_get_device_count() { int device_count; if (cudaGetDeviceCount(&device_count) != cudaSuccess) { return 0; @@ -10631,7 +10494,7 @@ GGML_CALL int ggml_cuda_get_device_count() { return device_count; } -GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { +static void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); snprintf(description, description_size, "%s", prop.name); @@ -10736,6 +10599,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); if (padded_size > original_size && tensor->view_src == nullptr) { + ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); } } @@ -10873,6 +10737,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { }; GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { + ggml_init_cublas(); + // FIXME: this is not thread safe if (device >= ggml_backend_cuda_get_device_count()) { return nullptr; @@ -11157,6 +11023,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface }; GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { + ggml_init_cublas(); + // FIXME: this is not thread safe static std::map, struct ggml_backend_buffer_type> buft_map; @@ -11348,9 +11216,6 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ggml_cuda_set_main_device(cuda_ctx->device); - ggml_compute_params params = {}; - params.type = GGML_TASK_TYPE_COMPUTE; - params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -11372,7 +11237,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t } #endif - bool ok = ggml_cuda_compute_forward(¶ms, node); + bool ok = ggml_cuda_compute_forward(node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -11509,6 +11374,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons UNUSED(backend); } +GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + const int min_batch_size = 32; + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; + + UNUSED(backend); +} + static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; @@ -11571,6 +11444,7 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, + /* .offload_op = */ ggml_backend_cuda_offload_op, /* .event_new = */ ggml_backend_cuda_event_new, /* .event_free = */ ggml_backend_cuda_event_free, /* .event_record = */ ggml_backend_cuda_event_record, @@ -11584,7 +11458,7 @@ static ggml_guid_t ggml_backend_cuda_guid() { } GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { - ggml_init_cublas(); // TODO: remove from ggml.c + ggml_init_cublas(); if (device < 0 || device >= ggml_cuda_get_device_count()) { fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); @@ -11627,6 +11501,31 @@ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, si CUDA_CHECK(cudaMemGetInfo(free, total)); } +GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return false; + } + + cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + + fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__, + size/1024.0/1024.0, cudaGetErrorString(err)); + return false; + } + return true; +} + +GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) { + cudaError_t err = cudaHostUnregister(buffer); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + } +} + // backend registry GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); diff --git a/ggml-cuda.h b/ggml-cuda.h index b1ebd61d7fb66..5eb4af40f4d1f 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -17,29 +17,17 @@ extern "C" { #define GGML_CUDA_MAX_DEVICES 16 -// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`. -GGML_API GGML_CALL void ggml_init_cublas(void); - -// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`. -GGML_API GGML_CALL bool ggml_cublas_loaded(void); - -GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size); -GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr); - -GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); - -GGML_API GGML_CALL int ggml_cuda_get_device_count(void); -GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size); - // backend API GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device); GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend); +// device buffer GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); + // split tensor buffer that splits matrices by rows across multiple devices GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); + // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); @@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void); GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); +GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); +GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer); + #ifdef __cplusplus } #endif diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 4caf2c9e78b02..81dd5067864ce 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1951,6 +1951,7 @@ static struct ggml_backend_i kompute_backend_i = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_kompute_graph_compute, /* .supports_op = */ ggml_backend_kompute_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml-metal.m b/ggml-metal.m index c3451a79b103f..109e5fe6ba13d 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2837,6 +2837,7 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 6dc5eb20c5a63..d51f23b410595 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17390,6 +17390,7 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 698b31496f417..cbceaa19fbacd 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5699,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_vk_graph_compute, /* .supports_op = */ ggml_backend_vk_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml.c b/ggml.c index fa23cb3c44f87..1d5854960a51b 100644 --- a/ggml.c +++ b/ggml.c @@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #else #include #endif -#elif defined(GGML_USE_CUBLAS) -#include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" #elif defined(GGML_USE_VULKAN) @@ -2640,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } -#if defined(GGML_USE_CUBLAS) - ggml_init_cublas(); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) ggml_cl_init(); #elif defined(GGML_USE_VULKAN) ggml_vk_init_cpu_assist(); @@ -11105,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod // TODO: #if defined(GGML_USE_CLBLAST) #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) @@ -11305,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (params->type == GGML_TASK_TYPE_INIT) { @@ -16051,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm return; } -#ifdef GGML_USE_CUBLAS - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return; - } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); -#elif defined(GGML_USE_VULKAN) +#if defined(GGML_USE_VULKAN) const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor); #ifdef GGML_VULKAN_CHECK_RESULTS if (skip_cpu) { @@ -16070,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); -#endif // GGML_USE_CUBLAS +#endif // GGML_USE_VULKAN #ifdef GGML_USE_SYCL bool skip_cpu = ggml_sycl_compute_forward(params, tensor); diff --git a/llama.cpp b/llama.cpp index e4db288ddd907..b8bef6dafe735 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2040,6 +2040,11 @@ struct llama_model { ggml_free(ctx); } for (ggml_backend_buffer_t buf : bufs) { +#ifdef GGML_USE_CUBLAS + if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) { + ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf)); + } +#endif ggml_backend_buffer_free(buf); } } @@ -5033,6 +5038,13 @@ static bool llm_load_tensors( size_t first, last; ml.get_mapping_range(&first, &last, ctx); buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); +#ifdef GGML_USE_CUBLAS + if (n_layer >= n_gpu_layers) { + ggml_backend_cuda_register_host_buffer( + ggml_backend_buffer_get_base(buf), + ggml_backend_buffer_get_size(buf)); + } +#endif } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { @@ -8231,7 +8243,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -8601,12 +8612,15 @@ static struct ggml_cgraph * llama_build_graph( } // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends - // to fix this, we assign the norm layer manually to the backend of its layer - if (il != -1 && strcmp(name, "norm") == 0) { - for (auto * backend : lctx.backends) { - if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { - ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); - break; + // FIXME: fix in ggml_backend_sched + const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; + if (batch.n_tokens < 32 || full_offload) { + if (il != -1 && strcmp(name, "norm") == 0) { + for (auto * backend : lctx.backends) { + if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { + ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + break; + } } } } @@ -13107,27 +13121,25 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(ctx->backend_metal); } #elif defined(GGML_USE_CUBLAS) - if (model->n_gpu_layers > 0) { + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); - } else { - // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU - for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { - ggml_backend_t backend = ggml_backend_cuda_init(device); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } } } #elif defined(GGML_USE_VULKAN) @@ -13285,14 +13297,17 @@ struct llama_context * llama_new_context_with_model( ggml_backend_t backend = ctx->backends[i]; ggml_backend_buffer_type_t buft = backend_buft[i]; size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend); - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } } // note: the number of splits during measure is higher than during inference due to the kv shift int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits); + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes); + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); } } From 4f6d1337ca5a409dc74aca8c479b7c34408a69c0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Mar 2024 13:45:27 +0200 Subject: [PATCH 39/48] ci : temporary disable sanitizer builds (#6128) --- .github/workflows/build.yml | 68 ++++++++++++++++++------------------ .github/workflows/server.yml | 6 ++-- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 945df42f886a6..992c34a0302cc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -98,40 +98,40 @@ jobs: cd build ctest -L main --verbose --timeout 900 - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 +# ubuntu-latest-cmake-sanitizer: +# runs-on: ubuntu-latest +# +# continue-on-error: true +# +# strategy: +# matrix: +# sanitizer: [ADDRESS, THREAD, UNDEFINED] +# build_type: [Debug, Release] +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v3 +# +# - name: Dependencies +# id: depends +# run: | +# sudo apt-get update +# sudo apt-get install build-essential +# +# - name: Build +# id: cmake_build +# run: | +# mkdir build +# cd build +# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} +# cmake --build . --config ${{ matrix.build_type }} -j $(nproc) +# +# - name: Test +# id: cmake_test +# run: | +# cd build +# ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 4ea09115a3c44..65ca7d9ca3ae9 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -24,13 +24,13 @@ jobs: strategy: matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] + # TODO: temporary disabled due to linux kernel issues + #sanitizer: [ADDRESS, THREAD, UNDEFINED] + sanitizer: [UNDEFINED] build_type: [Debug] include: - build_type: Release sanitizer: "" - - build_type: Debug - sanitizer: THREAD disabled_on_pr: true fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken From ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Mar 2024 13:45:38 +0200 Subject: [PATCH 40/48] ci : disable stale issue messages (#6126) --- .github/workflows/close-issue.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index 2682f308cb46e..eaffd074de0aa 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -15,7 +15,6 @@ jobs: days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" - stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." days-before-pr-stale: -1 days-before-pr-close: -1 From 5e1b7f94a03e0b3b8e4578625bbdadc7bbd2b93c Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 18 Mar 2024 16:33:44 +0100 Subject: [PATCH 41/48] backend : set max split inputs to GGML_MAX_SRC (#6137) --- ggml-backend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-backend.c b/ggml-backend.c index 9f0084df78942..6026570ae95aa 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1015,7 +1015,7 @@ static bool ggml_is_view_op(enum ggml_op op) { #endif #ifndef GGML_SCHED_MAX_SPLIT_INPUTS -#define GGML_SCHED_MAX_SPLIT_INPUTS 4 +#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC #endif #ifndef GGML_SCHED_MAX_COPIES From 104f5e0fc156d48476258295457cafeec2a2af10 Mon Sep 17 00:00:00 2001 From: Felix Date: Mon, 18 Mar 2024 16:40:22 +0100 Subject: [PATCH 42/48] clip : fix memory leak (#6138) --- examples/llava/clip.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index a0ed82d7e2328..690bca2eb7732 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -497,7 +497,6 @@ struct clip_ctx { // memory buffers to evaluate the model ggml_backend_buffer_t params_buffer = NULL; - ggml_backend_buffer_t compute_buffer = NULL; ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; @@ -1676,6 +1675,9 @@ void clip_free(clip_ctx * ctx) { ggml_free(ctx->ctx_data); gguf_free(ctx->ctx_gguf); + ggml_backend_buffer_free(ctx->params_buffer); + ggml_backend_free(ctx->backend); + ggml_gallocr_free(ctx->compute_alloc); delete ctx; } From d199ca79f279e84ebe27caafe0aa59c461d88969 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Mon, 18 Mar 2024 12:49:02 -0400 Subject: [PATCH 43/48] mpt : implement backwards compatiblity with duped output tensor (#6139) --- llama.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index b8bef6dafe735..1a9fe0c4d2cea 100644 --- a/llama.cpp +++ b/llama.cpp @@ -540,6 +540,7 @@ static const std::map> LLM_TENSOR_NA { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output"}, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, @@ -4300,9 +4301,9 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); - } else { + + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + if (!model.output) { model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU ml.n_created--; // artificial tensor ml.size_data += ggml_nbytes(model.output); @@ -4507,10 +4508,12 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); - // same as tok_embd, duplicated to allow offloading - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + if (!model.output) { + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } } for (int i = 0; i < n_layer; ++i) { From 2d15886bb092c3b780c676b5cc57ff3337af9c83 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 17 Mar 2024 06:37:44 +0000 Subject: [PATCH 44/48] flake.lock: Update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/9df3e30ce24fd28c7b3e2de0d986769db5d6225d' (2024-03-06) → 'github:NixOS/nixpkgs/d691274a972b3165335d261cc4671335f5c67de9' (2024-03-14) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index f9865d5e464cd..80de76dbf3ed6 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1709703039, - "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=", + "lastModified": 1710451336, + "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d", + "rev": "d691274a972b3165335d261cc4671335f5c67de9", "type": "github" }, "original": { From 4c28b8252907561165827125d2d1a4bad6926ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Tue, 19 Mar 2024 01:59:36 -0400 Subject: [PATCH 45/48] common : print usage on '-h' and '--help' (#6145) --- common/common.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 919182862cb5c..5f10718ece90c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1056,7 +1056,8 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int return true; } if (arg == "-h" || arg == "--help") { - return false; + gpt_print_usage(argc, argv, gpt_params()); + exit(0); } if (arg == "--version") { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); From 970a48060ab9a6cc67aa063870323781c2a7bd7d Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 19 Mar 2024 09:06:54 +0100 Subject: [PATCH 46/48] ci : exempt some labels from being tagged as stale (#6140) --- .github/workflows/close-issue.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index eaffd074de0aa..a151c6780aaa5 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -12,6 +12,7 @@ jobs: steps: - uses: actions/stale@v5 with: + exempt-issue-labels: "refactor,help wanted,good first issue,research" days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" From b80cf3b2d1dee0ad325f7a794fecc66befce7336 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Mar 2024 10:21:54 +0200 Subject: [PATCH 47/48] common : disable repeat penalties by default (#6127) --- common/sampling.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/sampling.h b/common/sampling.h index 48b2459d1f944..79a998be8e408 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -32,13 +32,13 @@ typedef struct llama_sampling_params { float dynatemp_range = 0.00f; // 0.0 = disabled float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_repeat = 1.00f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled float penalty_present = 0.00f; // 0.0 = disabled int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token + bool penalize_nl = false; // consider newlines as a repeatable token std::vector samplers_sequence = { llama_sampler_type::TOP_K, From d0d5de42e5a65865b5fddb6f5c785083539b74c3 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Tue, 19 Mar 2024 12:05:44 +0100 Subject: [PATCH 48/48] gguf-split: split and merge gguf per batch of tensors (#6135) * gguf-split: split and merge gguf files per tensor * gguf-split: build with make toolchain * gguf-split: rename `--split-tensors-size` to `--split-max-tensors`. Set general.split_count KV to all split * split : minor style + fix compile warnings * gguf-split: remove --upload not implemented --------- Co-authored-by: Georgi Gerganov --- Makefile | 4 + examples/CMakeLists.txt | 1 + examples/gguf-split/CMakeLists.txt | 5 + examples/gguf-split/README.md | 9 + examples/gguf-split/gguf-split.cpp | 491 +++++++++++++++++++++++++++++ 5 files changed, 510 insertions(+) create mode 100644 examples/gguf-split/CMakeLists.txt create mode 100644 examples/gguf-split/README.md create mode 100644 examples/gguf-split/gguf-split.cpp diff --git a/Makefile b/Makefile index 838daf5c02acd..1daad45ed8e4a 100644 --- a/Makefile +++ b/Makefile @@ -753,6 +753,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e762cf8b9238b..b59cc65bfedb0 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,6 +21,7 @@ else() add_subdirectory(embedding) add_subdirectory(finetune) add_subdirectory(gritlm) + add_subdirectory(gguf-split) add_subdirectory(infill) add_subdirectory(llama-bench) add_subdirectory(llava) diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt new file mode 100644 index 0000000000000..828e624352c8d --- /dev/null +++ b/examples/gguf-split/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gguf-split) +add_executable(${TARGET} gguf-split.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md new file mode 100644 index 0000000000000..ddb1f76497aed --- /dev/null +++ b/examples/gguf-split/README.md @@ -0,0 +1,9 @@ +## GGUF split Example + +CLI to split / merge GGUF files. + +**Command line options:** + +- `--split`: split GGUF to multiple GGUF, default operation. +- `--split-max-tensors`: maximum tensors in each split: default(128) +- `--merge`: merge multiple GGUF to a single GGUF. diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp new file mode 100644 index 0000000000000..5d7040ab5cbb3 --- /dev/null +++ b/examples/gguf-split/gguf-split.cpp @@ -0,0 +1,491 @@ +#include "llama.h" +#include "ggml.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +enum split_operation : uint8_t { + SPLIT_OP_SPLIT, + SPLIT_OP_MERGE, +}; + +static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split"; +static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count"; + +static const int SPLIT_FILENAME_MAX = 256; + +static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf"; + +struct split_params { + split_operation operation = SPLIT_OP_SPLIT; + int n_split_tensors = 128; + std::string input; + std::string output; +}; + +static void split_print_usage(const char * executable) { + const split_params default_params; + printf("\n"); + printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable); + printf("\n"); + printf("Apply a GGUF operation on IN to OUT."); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --version show version and build info\n"); + printf(" --split split GGUF to multiple GGUF (default)\n"); + printf(" --split-max-tensors max tensors in each split: default(%d)\n", default_params.n_split_tensors); + printf(" --merge merge multiple GGUF to a single GGUF\n"); + printf("\n"); +} + +static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) { + std::string arg; + const std::string arg_prefix = "--"; + bool invalid_param = false; + + int arg_idx = 1; + for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { + arg = argv[arg_idx]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + bool arg_found = false; + if (arg == "-h" || arg == "--help") { + split_print_usage(argv[0]); + exit(0); + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + + if (arg == "--merge") { + arg_found = true; + params.operation = SPLIT_OP_MERGE; + } + if (arg == "--split") { + arg_found = true; + params.operation = SPLIT_OP_SPLIT; + } + if (arg == "--split-max-tensors") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + params.n_split_tensors = atoi(argv[arg_idx]); + } + + if (!arg_found) { + throw std::invalid_argument("error: unknown argument: " + arg); + } + } + + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } + + if (argc - arg_idx < 2) { + printf("%s: bad arguments\n", argv[0]); + split_print_usage(argv[0]); + return false; + } + + params.input = argv[arg_idx++]; + params.output = argv[arg_idx++]; + + return true; +} + +static bool split_params_parse(int argc, const char ** argv, split_params & params) { + bool result = true; + try { + if (!split_params_parse_ex(argc, argv, params)) { + split_print_usage(argv[0]); + exit(1); + } + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + split_print_usage(argv[0]); + exit(1); + } + return result; +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +static std::string split_file_name(const std::string & path, int i_split, int n_split) { + char f_split[SPLIT_FILENAME_MAX] = {0}; + snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split); + return std::string(f_split); +} + +struct split_strategy { + const split_params params; + std::ifstream & f_input; + struct gguf_context * ctx_gguf; + struct ggml_context * ctx_meta = NULL; + const int n_tensors; + + const int n_split; + int i_split = 0; + + int i_tensor = 0; + + std::vector read_data; + + struct gguf_context * ctx_out; + std::ofstream fout; + + split_strategy(const split_params & params, + std::ifstream & f_input, + struct gguf_context * ctx_gguf, + struct ggml_context * ctx_meta) : + params(params), + f_input(f_input), + ctx_gguf(ctx_gguf), + ctx_meta(ctx_meta), + n_tensors(gguf_get_n_tensors(ctx_gguf)), + n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) { + } + + bool should_split() const { + return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; + } + + void split_start() { + ctx_out = gguf_init_empty(); + + // Save all metadata in first split only + if (i_split == 0) { + gguf_set_kv(ctx_out, ctx_gguf); + } + gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); + gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); + + // populate the original tensors, so we get an initial metadata + for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) { + struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + gguf_add_tensor(ctx_out, meta); + } + + auto split_name = split_file_name(params.output, i_split, n_split); + + fprintf(stderr, "%s: %s ...", __func__, split_name.c_str()); + fout = std::ofstream(split_name, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + auto meta_size = gguf_get_meta_size(ctx_out); + + // placeholder for the meta data + ::zeros(fout, meta_size); + + i_split++; + } + + void next_tensor() { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + auto n_bytes = ggml_nbytes(t); + + if (read_data.size() < n_bytes) { + read_data.resize(n_bytes); + } + + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + f_input.read((char *)read_data.data(), n_bytes); + + t->data = read_data.data(); + + // write tensor data + padding + fout.write((const char *)t->data, n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + + i_tensor++; + } + + void split_end() { + // go back to beginning of file and write the updated metadata + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + fout.close(); + gguf_free(ctx_out); + + fprintf(stderr, "\033[3Ddone\n"); + } +}; + +static void gguf_split(const split_params & split_params) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + std::ifstream f_input(split_params.input.c_str(), std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(1); + } + + auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(1); + } + + split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); + fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n", + __func__, split_params.input.c_str(), + split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(), + split_params.n_split_tensors); + + strategy.split_start(); + + while (strategy.i_tensor < strategy.n_tensors) { + strategy.next_tensor(); + if (strategy.should_split()) { + strategy.split_end(); + strategy.split_start(); + } + } + strategy.split_end(); + + gguf_free(ctx_gguf); + f_input.close(); + + fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", + __func__, strategy.n_split, strategy.n_tensors); +} + +static void gguf_merge(const split_params & split_params) { + fprintf(stderr, "%s: %s -> %s\n", + __func__, split_params.input.c_str(), + split_params.output.c_str()); + int n_split = 1; + int total_tensors = 0; + + auto * ctx_out = gguf_init_empty(); + std::ofstream fout(split_params.output.c_str(), std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + std::vector read_data; + std::vector ctx_metas; + std::vector ctx_ggufs; + + std::string split_prefix; + + // First pass to find KV and tensors metadata + for (int i_split = 0; i_split < n_split; i_split++) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + auto split_name = split_params.input; + if (i_split > 0) { + split_name = split_file_name(split_prefix, i_split, n_split); + } + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str()); + + auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(1); + } + ctx_ggufs.push_back(ctx_gguf); + ctx_metas.push_back(ctx_meta); + + if (i_split == 0) { + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT); + if (key_n_split < 0) { + fprintf(stderr, + "\n%s: input file does not contain %s metadata\n", + __func__, + LLM_KV_GENERAL_SPLIT_N_SPLIT); + gguf_free(ctx_gguf); + gguf_free(ctx_out); + fout.close(); + exit(1); + } + + n_split = gguf_get_val_u8(ctx_gguf, key_n_split); + if (n_split < 1) { + fprintf(stderr, + "\n%s: input file does not contain a valid split count %d\n", + __func__, + n_split); + gguf_free(ctx_gguf); + gguf_free(ctx_out); + fout.close(); + exit(1); + } + + // Do not trigger merge if we try to merge again the output + gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); + + // Set metadata from the first split + gguf_set_kv(ctx_out, ctx_gguf); + } + + // Verify the file naming + { + int i_split_file = 0; + int n_split_file = 0; + const char * i_split_format = "-00000-of-00000.gguf"; + + if (split_name.size() < strlen(i_split_format)) { + fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str()); + for (auto * _ctx_gguf : ctx_ggufs) { + gguf_free(_ctx_gguf); + } + gguf_free(ctx_out); + fout.close(); + exit(1); + } + + split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format)); + + const char * split_name_c_str = split_name.c_str(); + int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file); + + if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) { + fprintf(stderr, "\n%s: unexpected input file name: %s" + " i_split=%d i_split_file=%d" + " n_split=%d n_split_file=%d\n", __func__, + split_params.input.c_str(), + i_split, i_split_file, + n_split, n_split_file); + for (auto * _ctx_gguf : ctx_ggufs) { + gguf_free(_ctx_gguf); + } + gguf_free(ctx_out); + fout.close(); + exit(1); + } + } + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + gguf_add_tensor(ctx_out, t); + } + total_tensors += n_tensors; + + fprintf(stderr, "\033[3Ddone\n"); + } + + // placeholder for the meta data + { + auto meta_size = gguf_get_meta_size(ctx_out); + ::zeros(fout, meta_size); + } + + // Write tensors data + for (int i_split = 0; i_split < n_split; i_split++) { + auto split_name = split_file_name(split_prefix, i_split, n_split); + std::ifstream f_input(split_name.c_str(), std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str()); + for (auto * _ctx_gguf : ctx_ggufs) { + gguf_free(_ctx_gguf); + } + gguf_free(ctx_out); + fout.close(); + exit(1); + } + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str()); + + auto * ctx_gguf = ctx_ggufs[i_split]; + auto * ctx_meta = ctx_metas[i_split]; + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + + auto n_bytes = ggml_nbytes(t); + + if (read_data.size() < n_bytes) { + read_data.resize(n_bytes); + } + + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + f_input.read((char *)read_data.data(), n_bytes); + + // write tensor data + padding + fout.write((const char *)read_data.data(), n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + f_input.close(); + fprintf(stderr, "\033[3Ddone\n"); + } + + { + // go back to beginning of file and write the updated metadata + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + fout.close(); + gguf_free(ctx_out); + } + + fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n", + __func__, split_params.output.c_str(), n_split, total_tensors); +} + +int main(int argc, const char ** argv) { + if (argc < 3) { + split_print_usage(argv[0]); + } + + split_params params; + split_params_parse(argc, argv, params); + + switch (params.operation) { + case SPLIT_OP_SPLIT: gguf_split(params); + break; + case SPLIT_OP_MERGE: gguf_merge(params); + break; + default:split_print_usage(argv[0]); + exit(1); + } + + return 0; +}