Merged

Changes from all commits · 26 commits
5220a99
Increase 3B scratch buffers. (#1698)
SlyEcho Jun 5, 2023
99009e7
ggml : add SOTA 2,3,4,5,6 bit k-quantizations (#1684)
ikawrakow Jun 5, 2023
e7fe66e
ci : disable auto tidy (#1705)
ggerganov Jun 5, 2023
efe0507
ggml : fix internal overflow in ggml_time_us on Windows (#1702)
grahameth Jun 5, 2023
9d0693b
metal : use shared buffers between CPU and GPU (#1696)
kiltyj Jun 5, 2023
c2df36d
llama : consistently catch and throw only exceptions deriving from st…
mgroeber9110 Jun 5, 2023
f146562
readme : fix typo (#1700)
Foul-Tarnished Jun 5, 2023
f4c55d3
docs : add performance troubleshoot + example benchmark documentation…
Yuval-Peled Jun 5, 2023
590250f
metal : add checks for buffer size (#1706)
spencersutton Jun 6, 2023
7a74dee
llama : temporary disable Q6_K output quantization (#1711)
ggerganov Jun 6, 2023
7ad7750
gitignore : add .clang-tidy
ggerganov Jun 6, 2023
2d43387
ggml : fix builds, add ggml-quants-k.o (close #1712, close #1710)
ggerganov Jun 6, 2023
d5b111f
Clblast fixes + enhancements to save VRAM and offload more layers (#1…
LostRuins Jun 6, 2023
44f906e
metal : add f16 support
ggerganov Jun 6, 2023
17366df
Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)
JohannesGaessler Jun 6, 2023
2a4e41a
llama : fix compile warnings
ggerganov Jun 6, 2023
2d7bf11
llama : fix vram_scratch var
ggerganov Jun 6, 2023
35a8491
main: add the possibility to open the prompt cache read-only (#1640)
wtarreau Jun 7, 2023
4dc62c5
readme : add June roadmap
ggerganov Jun 7, 2023
5b57a5b
flake : update to support metal on m1/m2 (#1724)
jpetrucciani Jun 7, 2023
5c64a09
k-quants : allow to optionally disable at compile time (#1734)
ggerganov Jun 7, 2023
0035858
k-quants : add missing compile definition to CMakeLists (#1748)
johnson442 Jun 8, 2023
4161bdc
metal : add Q4_K implementation (#1733)
ikawrakow Jun 8, 2023
53aba3f
clang-tidy : restore dot file from accidental deletion
ggerganov Jun 8, 2023
b50b570
ggml : fix fprintf warnings (#1720)
sroussey Jun 8, 2023
8fc8179
Add llama.cpp docker support for non-latin languages (#1673)
qingfengfenga Jun 8, 2023
2 changes: 2 additions & 0 deletions .devops/full.Dockerfile
@@ -16,4 +16,6 @@ COPY . .

RUN make

ENV LC_ALL=C.utf8

ENTRYPOINT ["/app/.devops/tools.sh"]
2 changes: 2 additions & 0 deletions .devops/main.Dockerfile
@@ -15,4 +15,6 @@ FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/main /main

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/main" ]
2 changes: 1 addition & 1 deletion .github/workflows/tidy-post.yml
@@ -1,7 +1,7 @@
name: clang-tidy review post comments

on:
workflow_run:
workflow_dispatch:
workflows: ["clang-tidy-review"]
types:
- completed
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,6 +7,7 @@
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/

@@ -34,6 +35,7 @@ models/*
/benchmark-matmult
/vdot
/Pipfile
/libllama.so

build-info.h
arm_neon.h
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -72,6 +72,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -226,6 +227,11 @@ if (LLAMA_METAL)
)
endif()

if (LLAMA_K_QUANTS)
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
add_compile_definitions(GGML_USE_K_QUANTS)
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@@ -399,6 +405,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
${GGML_SOURCES_METAL}
${GGML_SOURCES_EXTRA}
)

target_include_directories(ggml PUBLIC .)
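With the new `LLAMA_K_QUANTS` option, k-quants can now be switched off at configure time; a minimal sketch:

```shell
# k-quants are ON by default; disable explicitly if needed
cmake .. -DLLAMA_K_QUANTS=OFF
cmake --build . --config Release
```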
29 changes: 21 additions & 8 deletions Makefile
@@ -40,8 +40,11 @@ endif
#

# keep standard at C11 and C++11
CFLAGS = -I. -O3 -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
# -Ofast tends to produce faster code, but may not be available for some compilers.
#OPT = -Ofast
OPT = -O3
CFLAGS = -I. $(OPT) -std=c11 -fPIC
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
LDFLAGS =

ifdef LLAMA_DEBUG
@@ -118,6 +121,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
endif
endif

ifndef LLAMA_NO_K_QUANTS
CFLAGS += -DGGML_USE_K_QUANTS
OBJS += k_quants.o
endif

ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (measured on predict time).
@@ -137,7 +145,7 @@ ifdef LLAMA_OPENBLAS
endif # LLAMA_OPENBLAS

ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS

@@ -209,6 +217,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS

#
# Print build information
#
@@ -247,22 +260,22 @@ clean:
# Examples
#

main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo

quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
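The Makefile counterparts are the new `OPT` variable and the `LLAMA_NO_K_QUANTS` switch; both can be overridden on the make command line:

```shell
# use -Ofast where the compiler supports it (default is -O3)
make OPT=-Ofast

# opt out of k-quants in the Makefile build
make LLAMA_NO_K_QUANTS=1
```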
16 changes: 9 additions & 7 deletions README.md
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

**Hot topics:**

- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
@@ -267,11 +268,11 @@ Any value larger than 0 will offload the computation to the GPU. For example:

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:

- **Accelerate Framework**:
- #### Accelerate Framework:

This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.

- **OpenBLAS**:
- #### OpenBLAS:

This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.

@@ -305,22 +306,22 @@ Building the program with BLAS support may lead to some performance improvements
cmake --build . --config Release
```

- **BLIS**
- #### BLIS

Check [BLIS.md](BLIS.md) for more information.

- **Intel MKL**
- #### Intel MKL

By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to CMake, the MKL version of BLAS will be selected automatically. You may also specify it explicitly:

```bash
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . -config Release
cmake --build . --config Release
```

- **cuBLAS**
- #### cuBLAS

This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
- Using `make`:
@@ -339,7 +340,7 @@ Building the program with BLAS support may lead to some performance improvements

The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
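
For example, a sketch that restricts llama.cpp to the second GPU (model path is a placeholder):

```shell
CUDA_VISIBLE_DEVICES=1 ./main -m "path/to/model.bin" -ngl 40 -p "Hello"
```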

- **CLBlast**
- #### CLBlast

OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.

@@ -684,3 +685,4 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode
### Docs

- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
File renamed without changes.
40 changes: 40 additions & 0 deletions docs/token_generation_performance_tips.md
@@ -0,0 +1,40 @@
# Token generation performance troubleshooting

## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
```

When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
```shell
llama_model_load_internal: [cublas] offloading 60 layers to GPU
llama_model_load_internal: [cublas] offloading output layer to GPU
llama_model_load_internal: [cublas] total VRAM used: 17223 MB
... rest of inference
```

If you see these lines, then the GPU is being used.

## Verifying that the CPU is not oversaturated
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
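
A quick sweep along these lines, doubling the thread count each time (model path is a placeholder):
```shell
./main -m "path/to/model.bin" -t 1 -n 64 -p "Hello"
./main -m "path/to/model.bin" -t 2 -n 64 -p "Hello"
./main -m "path/to/model.bin" -t 4 -n 64 -p "Hello"
```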

# Example of the effect of runtime flags on inference speed
These runs were tested on the following machine:

- GPU: A6000 (48 GB VRAM)
- CPU: 7 physical cores
- RAM: 32 GB

Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)

Run command: `./main -m "path/to/model.bin" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

Result:

| command | tokens/second (higher is better) |
| - | - |
| -ngl 2000000 | N/A (less than 0.1) |
| -t 7 | 1.7 |
| -t 1 -ngl 2000000 | 5.5 |
| -t 7 -ngl 2000000 | 8.7 |
| -t 4 -ngl 2000000 | 9.1 |
44 changes: 44 additions & 0 deletions examples/common.cpp
@@ -9,6 +9,7 @@
#include <algorithm>
#include <sstream>
#include <unordered_set>
#include <regex>

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
@@ -131,6 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.path_prompt_cache = argv[i];
} else if (arg == "--prompt-cache-all") {
params.prompt_cache_all = true;
} else if (arg == "--prompt-cache-ro") {
params.prompt_cache_ro = true;
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
@@ -295,6 +298,40 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
std::string arg_next = argv[i];

// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);

for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
} else {
params.tensor_split[i] = 0.0f;
}
}
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
@@ -397,6 +434,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
fprintf(stderr, " not supported with --interactive or other interactive options\n");
fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
@@ -438,6 +476,9 @@
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");
fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
@@ -483,7 +524,10 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();

lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch;
lparams.n_gpu_layers = params.n_gpu_layers;
lparams.main_gpu = params.main_gpu;
memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
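For illustration, a standalone sketch of the same `--tensor-split` parsing introduced above (assuming `LLAMA_MAX_DEVICES` is 2; not part of the PR):

```cpp
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    const std::string arg = "3,1";          // e.g. the value passed to --tensor-split
    const std::regex  sep{R"([,/]+)"};      // the PR accepts ',' or '/' as separators
    std::sregex_token_iterator it{arg.begin(), arg.end(), sep, -1};
    std::vector<std::string> parts{it, {}};

    float split[2] = {0.0f, 0.0f};          // unused trailing slots stay zeroed
    for (size_t i = 0; i < 2; ++i) {
        if (i < parts.size()) {
            split[i] = std::stof(parts[i]);
        }
    }
    printf("%.1f %.1f\n", split[0], split[1]);  // prints: 3.0 1.0
}
```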
17 changes: 10 additions & 7 deletions examples/common.h
@@ -21,13 +21,15 @@
int32_t get_num_physical_cores();

struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs

// sampling parameters
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -60,6 +62,7 @@ struct gpt_params {
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

bool embedding = false; // get only sentence embedding
bool interactive_first = false; // wait for user input immediately
2 changes: 2 additions & 0 deletions examples/main/README.md
@@ -286,5 +286,7 @@ These options provide extra functionality and customization when running the LLa
- `--verbose-prompt`: Print the prompt before generating text.
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. A combined invocation is sketched after this list.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
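
A minimal combined invocation of the GPU options above, assuming a two-GPU cuBLAS build (model path and prompt are placeholders):

```shell
# offload 40 layers, keep scratch on GPU 0, split large tensors 60%/40% across GPUs 0 and 1
./main -m "path/to/model.bin" -ngl 40 -mg 0 -ts 3,2 -p "Hello"
```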
4 changes: 2 additions & 2 deletions examples/main/main.cpp
@@ -417,7 +417,7 @@ int main(int argc, char ** argv) {
const bool penalize_nl = params.penalize_nl;

// optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session) {
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false;
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
@@ -630,7 +630,7 @@ int main(int argc, char ** argv) {
}
}

if (!path_session.empty() && params.prompt_cache_all) {
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
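Taken together, `--prompt-cache-ro` gates both session-save paths in main.cpp shown above; a usage sketch (file names are placeholders):

```shell
# first run: build the prompt cache
./main -m "path/to/model.bin" --prompt-cache prompt.cache -p "Common instruction prefix: "

# later runs: reuse the cache without updating it
./main -m "path/to/model.bin" --prompt-cache prompt.cache --prompt-cache-ro -p "Common instruction prefix: "
```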