Merged
Changes from all commits
Commits
33 commits
03c0946
convert : support models with multiple chat templates (#6588)
CISC Apr 18, 2024
0d56246
ggml : group all experts in a single ggml_mul_mat_id (#6505)
slaren Apr 18, 2024
bca40e9
fix wrong parameter in cmd in readme-sycl.md (#6755)
arthw Apr 19, 2024
8b1b1f4
train : add general name (#6752)
teleprint-me Apr 19, 2024
9958c81
Implement the OLMo architecture (#6741)
nopperl Apr 19, 2024
637e9a8
server: static: upstream upgrade (#6765)
phymbert Apr 19, 2024
0e4802b
ci: add ubuntu latest release and fix missing build number (mac & ubu…
loonerin Apr 19, 2024
aed82f6
common : try to fix Android CI (#6780)
ggerganov Apr 20, 2024
b8109bc
doc : server tests require llama to be built with curl enabled (#6788)
kaetemi Apr 20, 2024
b97bc39
llama : support Llama 3 HF conversion (#6745)
pcuenca Apr 21, 2024
89b0bf0
llava : use logger in llava-cli (#6797)
jart Apr 21, 2024
2cca09d
readme : add Fedora instructions (#6783)
Man2Dev Apr 21, 2024
e8d35f4
doc : add link to falcon (#6789)
kaetemi Apr 21, 2024
c1386c9
gguf-py : add IQ1_M to GGML_QUANT_SIZES (#6761)
pmysl Apr 21, 2024
7dbdba5
llama : add llama-3 chat template (#6751)
DifferentialityDevelopment Apr 21, 2024
b9cc76d
ggml : fix ggml_backend_cpu_supports_op() for CPY (#0)
ggerganov Apr 21, 2024
40f74e4
llama : add option to render special/control tokens (#6807)
ggerganov Apr 21, 2024
5cf5e7d
`build`: generate hex dump of server assets during build (#6661)
ochafik Apr 21, 2024
e9b4a1b
flake.lock: Update
github-actions[bot] Apr 21, 2024
c0956b0
ci: fix jobs cancelling each other (#6781)
phymbert Apr 22, 2024
8960fe8
llama : fix typo in <|im_end|> token text (#6745)
ggerganov Apr 22, 2024
e931888
ggml : fix calloc argument ordering. (#6820)
airlied Apr 22, 2024
192090b
llamafile : improve sgemm.cpp (#6796)
jart Apr 22, 2024
4e96a81
[SYCL] Windows default build instructions without -DLLAMA_SYCL_F16 fl…
aahouzi Apr 23, 2024
bf59e42
added implementation of DRY sampler
l3utterfly Apr 23, 2024
11487ed
fixed size_t/int comparison error
l3utterfly Apr 23, 2024
f9e416d
removed unused parameters in llama_sample_dry
l3utterfly Apr 23, 2024
62b9cb4
removed trailing whitespace
l3utterfly Apr 23, 2024
babff06
removed unnecessary comparison
l3utterfly Apr 23, 2024
d8b47da
refactored sample_dry to be more efficient and closer to original imp…
l3utterfly Apr 24, 2024
a31ce1a
added cmd to main for dry sampler
l3utterfly Apr 24, 2024
7e08885
skip dry sampler if last token is part of sequence breakers
l3utterfly Apr 24, 2024
5a3059d
Merge branch 'layla-build' into dry-sampler
l3utterfly Apr 25, 2024
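
The headline change in this commit series is the DRY ("Don't Repeat Yourself") sampler: a repetition penalty aimed at tokens that would extend a sequence already present in the recent context, wired up through the new `--dry-multiplier`, `--dry-base` and `--dry-allowed-length` options in `common/common.cpp` and the `llama_sample_dry()` call in `common/sampling.cpp` below. The following is a minimal, self-contained sketch of that idea, assuming the commonly described DRY formulation (penalty = multiplier * base^(match_length - allowed_length)); it is illustrative only and is not the code added by these commits.

```cpp
// Illustrative sketch only -- NOT the implementation added by this PR.
// A candidate token that would extend an n-token repetition of the context's current
// suffix (n >= allowed_length) has multiplier * base^(n - allowed_length) subtracted
// from its logit. Sequence-breaker tokens stop the backward match.
#include <cmath>
#include <cstdint>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using token_id = int32_t;

static void dry_penalize(std::vector<float> & logits,                   // indexed by token id
                         const std::vector<token_id> & context,         // recent tokens, oldest first
                         float multiplier, float base, int allowed_length,
                         const std::unordered_set<token_id> & breakers) {
    if (multiplier <= 0.0f || context.size() < 2) {
        return;
    }
    // "skip dry sampler if last token is part of sequence breakers" (commit above)
    if (breakers.count(context.back())) {
        return;
    }
    const int n = (int) context.size();
    std::unordered_map<token_id, int> longest;  // candidate token -> longest repeat it would extend
    for (int i = 0; i < n - 1; ++i) {
        if (context[i] != context[n - 1]) {
            continue;
        }
        // How far back does the earlier occurrence match the current suffix?
        int len = 0;
        while (i - len >= 0 && context[i - len] == context[n - 1 - len] && !breakers.count(context[i - len])) {
            ++len;
        }
        const token_id next = context[i + 1];   // the token a verbatim repeat would emit next
        if (len >= allowed_length) {
            int & best = longest[next];
            if (len > best) {
                best = len;
            }
        }
    }
    for (const auto & kv : longest) {
        if (kv.first >= 0 && (size_t) kv.first < logits.size()) {
            logits[kv.first] -= multiplier * std::pow(base, (float) (kv.second - allowed_length));
        }
    }
}
```

Sequence-breaker tokens stop the backward match, and the whole step is skipped when the last context token is itself a breaker, mirroring the "skip dry sampler if last token is part of sequence breakers" commit above.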
2 changes: 1 addition & 1 deletion .github/workflows/bench.yml
@@ -32,7 +32,7 @@ on:
- cron: '04 2 * * *'

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }}
cancel-in-progress: true

jobs:
33 changes: 33 additions & 0 deletions .github/workflows/build.yml
@@ -32,6 +32,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Dependencies
id: depends
@@ -88,6 +90,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Dependencies
id: depends
@@ -206,6 +210,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Dependencies
id: depends
@@ -238,6 +244,33 @@ jobs:
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi

- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*

- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
name: llama-bin-ubuntu-x64.zip

# ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
2 changes: 1 addition & 1 deletion .github/workflows/server.yml
@@ -23,7 +23,7 @@ on:
- cron: '2 4 * * *'

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
cancel-in-progress: true

jobs:
4 changes: 4 additions & 0 deletions .gitignore
@@ -34,6 +34,7 @@ lcov-report/
gcovr-report/

build*
!build.zig
cmake-build-*
out/
tmp/
@@ -101,6 +102,9 @@ qnt-*.txt
perf-*.txt

examples/jeopardy/results.txt
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp

poetry.lock
poetry.toml
16 changes: 5 additions & 11 deletions CMakeLists.txt
@@ -43,17 +43,11 @@ else()
set(LLAMA_METAL_DEFAULT OFF)
endif()

# TODO: fix this for Android CI
# https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
# set(LLAMA_LLAMAFILE_DEFAULT OFF)
#else()
# set(LLAMA_LLAMAFILE_DEFAULT ON)
#endif()

# TODO: temporary disable until MoE is fixed
# https://github.com/ggerganov/llama.cpp/pull/6716
set(LLAMA_LLAMAFILE_DEFAULT OFF)
if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
set(LLAMA_LLAMAFILE_DEFAULT OFF)
else()
set(LLAMA_LLAMAFILE_DEFAULT ON)
endif()

# general
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
17 changes: 11 additions & 6 deletions Makefile
@@ -384,10 +384,6 @@ ifdef LLAMA_OPENBLAS
MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS

# TODO: temporary disable until MoE is fixed
# https://github.com/ggerganov/llama.cpp/pull/6716
LLAMA_NO_LLAMAFILE := 1

ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJS += sgemm.o
@@ -699,7 +695,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

common.o: common/common.cpp $(COMMON_H_DEPS)
@@ -800,10 +796,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
examples/server/%.hpp: examples/server/public/% Makefile
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
echo "unsigned char $${NAME}[] = {" && \
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
echo "};" && \
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@

gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
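
The new pattern rule above replaces the `xxd -i` dependency with `od` and `sed`, emitting one header per server asset containing a byte array plus its length. As a rough illustration of the layout only (the asset name and bytes here are hypothetical stand-ins for `examples/server/index.html.hpp`):

```cpp
// Illustrative layout only; a real generated header contains the full asset's bytes.
unsigned char index_html[] = {
 0x3c, 0x21, 0x64, 0x6f, 0x63, 0x74, 0x79, 0x70, 0x65, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a,
};
unsigned int index_html_len = 16;
```

The build.zig loop added further down writes the same layout, so both build systems produce interchangeable headers, and the generated `*.hpp` files are now ignored via `.gitignore`.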
21 changes: 12 additions & 9 deletions README-sycl.md
@@ -229,12 +229,11 @@ source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
mkdir -p build && cd build

# Option 1: Use FP16 for better performance in long-prompt inference
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Or without "--build", run "make" next
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

# Option 2: Use FP32 by default
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

#build all binary
cmake --build . --config Release -j -v
@@ -251,11 +250,11 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL
mkdir -p build && cd build

# Option 1: Use FP16 for better performance in long-prompt inference
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

# Option 2: Use FP32 by default
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

#build all binary
cmake --build . --config Release -j -v
@@ -417,6 +416,10 @@ mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

# Option 2: Use FP16
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

make -j
12 changes: 10 additions & 2 deletions README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

### Recent API changes

- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
@@ -95,7 +96,7 @@ Typically finetunes of the base models below are supported as well.
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [X] Falcon
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -122,6 +123,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)

(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

@@ -548,7 +550,7 @@ Building the program with BLAS support may lead to some performance improvements
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.

You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
- For Ubuntu, Debian, and Fedora, the packages `opencl-headers`, `ocl-icd` may be needed.

- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.

@@ -573,6 +575,12 @@ Building the program with BLAS support may lead to some performance improvements

Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.

Linux packaging:
Fedora Linux:
```bash
sudo dnf install clblast
```

Alternatively, they may be built from source.

- <details>
29 changes: 29 additions & 0 deletions build.zig
@@ -140,4 +140,33 @@ pub fn build(b: *std.build.Builder) !void {
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}

const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
for (server_assets) |asset| {
const input_path = b.fmt("examples/server/public/{s}", .{asset});
const output_path = b.fmt("examples/server/{s}.hpp", .{asset});

// Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path })`:

const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
defer b.allocator.free(input);

var buf = std.ArrayList(u8).init(b.allocator);
defer buf.deinit();

for (input) |byte| {
try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
}

var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
defer b.allocator.free(name);
std.mem.replaceScalar(u8, name, '.', '_');

try std.fs.cwd().writeFile(output_path, b.fmt(
"unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
.{ name, buf.items, name, input.len },
));

std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
}
}
36 changes: 32 additions & 4 deletions common/common.cpp
@@ -108,7 +108,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

#if defined(__x86_64__) && defined(__linux__)
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
#include <pthread.h>

static void cpuid(unsigned leaf, unsigned subleaf,
@@ -162,7 +162,7 @@ static int count_math_cpus(int cpu_count) {
* Returns number of CPUs on system that are useful for math.
*/
int get_math_cpu_count() {
#if defined(__x86_64__) && defined(__linux__)
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
if (cpu_count < 1) {
return get_num_physical_cores();
@@ -582,6 +582,30 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
sparams.penalty_present = std::stof(argv[i]);
return true;
}
if (arg == "--dry-multiplier") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.dry_multiplier = std::stof(argv[i]);
return true;
}
if (arg == "--dry-base") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.dry_base = std::stof(argv[i]);
return true;
}
if (arg == "--dry-allowed-length") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.dry_allowed_length = std::stoi(argv[i]);
return true;
}
if (arg == "--dynatemp-range") {
if (++i >= argc) {
invalid_param = true;
@@ -1429,6 +1453,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
printf(" --dry-multiplier N DRY sampler multiplier (default: %.1f, 0.0 = disabled)\n", (double)sparams.dry_multiplier);
printf(" --dry-base N DRY sampler base (default: %.1f)\n", (double)sparams.dry_base);
printf(" --dry-allowed-length N\n");
printf(" DRY sampler allowed length (default: %d)\n", sparams.dry_allowed_length);
printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
printf(" --mirostat N use Mirostat sampling.\n");
@@ -2333,10 +2361,10 @@ std::vector<llama_token> llama_tokenize(

std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
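
The wrapper above reflects the API change noted in the README ("llama_token_to_piece can now optionally render special tokens", #6807): the function now takes a trailing bool that controls whether special/control tokens are rendered as text. A minimal usage sketch, assuming `llama.h` from this revision and an already-loaded context; the helper name is hypothetical:

```cpp
#include <string>
#include <vector>

#include "llama.h"

// Convert one token to text; "special" asks the library to render special/control tokens.
static std::string token_to_piece(const llama_context * ctx, llama_token token, bool special) {
    std::vector<char> buf(8, 0);
    int32_t n = llama_token_to_piece(llama_get_model(ctx), token, buf.data(), (int32_t) buf.size(), special);
    if (n < 0) {                // buffer too small; -n is the required size
        buf.resize(-n);
        n = llama_token_to_piece(llama_get_model(ctx), token, buf.data(), (int32_t) buf.size(), special);
    }
    return std::string(buf.data(), n);
}
```

Passing `false` should preserve the previous behaviour of leaving special tokens out of the rendered text.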
7 changes: 3 additions & 4 deletions common/sampling.cpp
@@ -103,10 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, dry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau);
params.mirostat, params.mirostat_eta, params.mirostat_tau, params.dry_multiplier, params.dry_base, params.dry_allowed_length);

return std::string(result);
}
@@ -314,12 +314,11 @@ static llama_token_data_array llama_sampling_prepare_impl(

// DRY penalties (multiplier > 0 means enabled)
if(dry_multiplier > 0.0f) {
llama_sample_dry(ctx_main, &cur_p,
llama_sample_dry(&cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, dry_base, dry_multiplier, dry_allowed_length,
params.dry_sequence_breakers.data(), params.dry_sequence_breakers.size());
}


if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {