Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -161,15 +161,16 @@ jobs:
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v1.0.0"
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.tar.gz \
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
unzip artifact.zip
tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1

- name: Build
id: cmake_build
Expand Down Expand Up @@ -521,15 +522,16 @@ jobs:
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
DAWN_VERSION="v1.0.0"
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.tar.gz \
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
unzip artifact.zip
tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1

- name: Build
id: cmake_build
Expand Down
14 changes: 14 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
exit(0);
}
));
add_opt(common_arg(
{"-cl", "--cache-list"},
"show list of models in cache",
[](common_params &) {
printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
auto models = common_list_cached_models();
printf("number of models in cache: %zu\n", models.size());
for (size_t i = 0; i < models.size(); i++) {
auto & model = models[i];
printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
}
exit(0);
}
));
add_opt(common_arg(
{"--completion-bash"},
"print source-able bash completion script for llama.cpp",
Expand Down
33 changes: 33 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -908,6 +908,39 @@ std::string fs_get_cache_file(const std::string & filename) {
return cache_directory + filename;
}

std::vector<common_file_info> fs_list_files(const std::string & path) {
std::vector<common_file_info> files;
if (path.empty()) return files;

std::filesystem::path dir(path);
if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
return files;
}

for (const auto & entry : std::filesystem::directory_iterator(dir)) {
try {
// Only include regular files (skip directories)
const auto & p = entry.path();
if (std::filesystem::is_regular_file(p)) {
common_file_info info;
info.path = p.string();
info.name = p.filename().string();
try {
info.size = static_cast<size_t>(std::filesystem::file_size(p));
} catch (const std::filesystem::filesystem_error &) {
info.size = 0;
}
files.push_back(std::move(info));
}
} catch (const std::filesystem::filesystem_error &) {
// skip entries we cannot inspect
continue;
}
}

return files;
}


//
// Model utils
Expand Down
7 changes: 7 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,13 @@ bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);

struct common_file_info {
std::string path;
std::string name;
size_t size = 0; // in bytes
};
std::vector<common_file_info> fs_list_files(const std::string & path);

//
// Model utils
//
Expand Down
50 changes: 45 additions & 5 deletions common/download.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,22 @@ using json = nlohmann::ordered_json;
// downloader
//

// validate repo name format: owner/repo
static bool validate_repo_name(const std::string & repo) {
static const std::regex repo_regex(R"(^[A-Za-z0-9_.\-]+\/[A-Za-z0-9_.\-]+$)");
return std::regex_match(repo, repo_regex);
}

static std::string get_manifest_path(const std::string & repo, const std::string & tag) {
// we use "=" to avoid clashing with other component, while still being allowed on windows
std::string fname = "manifest=" + repo + "=" + tag + ".json";
if (!validate_repo_name(repo)) {
throw std::runtime_error("error: repo name must be in the format 'owner/repo'");
}
string_replace_all(fname, "/", "=");
return fs_get_cache_file(fname);
}

static std::string read_file(const std::string & fname) {
std::ifstream file(fname);
if (!file) {
Expand Down Expand Up @@ -829,17 +845,13 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
// User-Agent header is already set in common_remote_get_content, no need to set it here

// we use "=" to avoid clashing with other component, while still being allowed on windows
std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
string_replace_all(cached_response_fname, "/", "_");
std::string cached_response_path = fs_get_cache_file(cached_response_fname);

// make the request
common_remote_params params;
params.headers = headers;
long res_code = 0;
std::string res_str;
bool use_cache = false;
std::string cached_response_path = get_manifest_path(hf_repo, tag);
if (!offline) {
try {
auto res = common_remote_get_content(url, params);
Expand Down Expand Up @@ -895,6 +907,33 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
return { hf_repo, ggufFile, mmprojFile };
}

std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();
const std::vector<common_file_info> files = fs_list_files(cache_dir);
for (const auto & file : files) {
if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
common_cached_model_info model_info;
model_info.manifest_path = file.path;
std::string fname = file.name;
string_replace_all(fname, ".json", ""); // remove extension
auto parts = string_split<std::string>(fname, '=');
if (parts.size() == 4) {
// expect format: manifest=<user>=<model>=<tag>=<other>
model_info.user = parts[1];
model_info.model = parts[2];
model_info.tag = parts[3];
} else {
// invalid format
continue;
}
model_info.size = 0; // TODO: get GGUF size, not manifest size
models.push_back(model_info);
}
}
return models;
}

//
// Docker registry functions
//
Expand Down Expand Up @@ -959,6 +998,7 @@ std::string common_docker_resolve_model(const std::string & docker) {
std::string token = common_docker_get_token(repo); // Get authentication token

// Get manifest
// TODO: cache the manifest response so that it appears in the model list
const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
std::string manifest_url = url_prefix + "/manifests/" + tag;
common_remote_params manifest_params;
Expand Down
22 changes: 18 additions & 4 deletions common/download.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,23 @@ struct common_params_model;
// download functionalities
//

struct common_cached_model_info {
std::string manifest_path;
std::string user;
std::string model;
std::string tag;
size_t size = 0; // GGUF size in bytes
std::string to_string() const {
return user + "/" + model + ":" + tag;
}
};

struct common_hf_file_res {
std::string repo; // repo name with ":tag" removed
std::string ggufFile;
std::string mmprojFile;
};

// resolve and download model from Docker registry
// return local path to downloaded model file
std::string common_docker_resolve_model(const std::string & docker);

/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
Expand All @@ -39,3 +46,10 @@ bool common_download_model(
const common_params_model & model,
const std::string & bearer_token,
bool offline);

// returns list of cached models
std::vector<common_cached_model_info> common_list_cached_models();

// resolve and download model from Docker registry
// return local path to downloaded model file
std::string common_docker_resolve_model(const std::string & docker);
2 changes: 1 addition & 1 deletion ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})

option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ if (CUDAToolkit_FOUND)

if (GGML_CUDA_DEBUG)
list(APPEND CUDA_FLAGS -lineinfo)
add_compile_definitions(GGML_CUDA_DEBUG)
endif()

if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
Expand Down
39 changes: 11 additions & 28 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
#include "ggml-cuda/mmq.cuh"
#include "ggml-cuda/mmvf.cuh"
#include "ggml-cuda/mmvq.cuh"
#include "ggml-cuda/moe-expert-reduce.cuh"
#include "ggml-cuda/norm.cuh"
#include "ggml-cuda/opt-step-adamw.cuh"
#include "ggml-cuda/opt-step-sgd.cuh"
Expand Down Expand Up @@ -3152,8 +3151,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];


#ifdef GGML_CUDA_DEBUG
const int nodes_fused = i - prev_i - 1;
prev_i = i;
Expand Down Expand Up @@ -3199,31 +3196,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}

if (node->op == GGML_OP_MUL) {
int current_node = i + 1;
int num_views = 0;
int num_adds = 0;
while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
num_views++;
current_node++;
}

while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&
num_adds < num_views - 1) {
num_adds++;
current_node++;
}

if (num_adds == num_views - 1 && num_views > 0) {
ggml_tensor * dst_node = cgraph->nodes[current_node - 1];
if (ggml_cuda_should_use_moe_expert_reduce(cgraph, i, current_node)) {
ggml_cuda_op_moe_expert_reduce(*cuda_ctx, node->src[0], node->src[1], dst_node);
i += num_views + num_adds;
continue;
}
}
}

if (node->op == GGML_OP_ADD) {
int n_fuse = 0;
ggml_op ops[8];
Expand Down Expand Up @@ -3302,6 +3274,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}

// we don't support repeating adds
if (bias_op == GGML_OP_ADD &&
(!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
!ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
continue;
}

const ggml_tensor * src0 = up_n->src[0];
const ggml_tensor * src1 = up_n->src[1];
const ggml_tensor * ids = up_n->src[2];
Expand Down Expand Up @@ -3411,6 +3390,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}

if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
continue;
}

ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.x_bias = bias_tensor;

Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/mmq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3494,7 +3494,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
const int col_diff = col_high - col_low;

for (int j = threadIdx.y*warp_size + threadIdx.x; j < mmq_x; j += nwarps*warp_size) {
ids_dst_shared[j] = ids_dst[col_low + j];
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
}
__syncthreads();

Expand Down
Loading