Commit 6e30a6c

Merge pull request #41 from menloresearch/update-dev-from-master-2025-04-03-00-08
Sync master with upstream release b5033
2 parents 49f328b + f01bd02


53 files changed: +1456 -2922 lines

common/arg.cpp

Lines changed: 644 additions & 71 deletions
Large diffs are not rendered by default.

common/common.cpp

Lines changed: 14 additions & 519 deletions
Large diffs are not rendered by default.

common/common.h

Lines changed: 13 additions & 36 deletions
@@ -121,10 +121,6 @@ struct common_grammar_trigger {
     common_grammar_trigger_type type;
     std::string value;
     llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };
 
 // sampling parameters
@@ -184,6 +180,13 @@ struct common_params_sampling {
     std::string print() const;
 };
 
+struct common_params_model {
+    std::string path = "";    // model local path // NOLINT
+    std::string url = "";     // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
@@ -197,19 +200,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = "";     // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };
 
 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = "";     // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 
     std::string speaker_file = ""; // speaker file path // NOLINT
 
@@ -267,12 +262,10 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
 
-    std::string model = ""; // model path // NOLINT
+    struct common_params_model model;
+
     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
     std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -286,6 +279,7 @@ struct common_params {
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -347,7 +341,7 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    struct common_params_model mmproj;
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -546,23 +540,6 @@ struct llama_model_params common_model_params_to_llama(common_params
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
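Taken together, these hunks fold the four scattered model-source strings (local path, download URL, HF repo, HF file) into a single `common_params_model` struct that is reused for the main model, the speculative draft model, the vocoder, and the multimodal projector. Below is a minimal caller-side sketch of the migration; `load_model_sketch` is a hypothetical helper name, but the load call itself mirrors the one-line edits in the examples/* diffs that follow:

```cpp
// Sketch only: shows how callers move from the old params.model string to the
// new params.model.path member. Assumes the common.h declarations shown above.
#include "common.h"
#include "llama.h"

static llama_model * load_model_sketch(common_params & params) {
    llama_model_params mparams = common_model_params_to_llama(params);

    // Before this commit, params.model was a std::string holding the local path.
    // Now the path lives in params.model.path, with url/hf_repo/hf_file
    // traveling alongside it in the same common_params_model struct.
    return llama_model_load_from_file(params.model.path.c_str(), mparams);
}
```

Note also that `common_load_model_from_url`, `common_load_model_from_hf`, and `common_get_hf_file` disappear from the public header; judging by the 644 lines added to common/arg.cpp, the download and HF-resolution logic presumably moved behind the argument parser rather than remaining a caller-visible API.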

examples/batched-bench/batched-bench.cpp

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);

examples/export-lora/export-lora.cpp

Lines changed: 1 addition & 1 deletion
@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {
 
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());

examples/gritlm/gritlm.cpp

Lines changed: 1 addition & 1 deletion
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
 
     llama_backend_init();
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
 
     // create generation context
     llama_context * ctx = llama_init_from_model(model, cparams);

examples/llava/README-gemma3.md

Lines changed: 20 additions & 0 deletions
@@ -4,6 +4,26 @@
 >
 > This is very experimental, only used for demo purpose.
 
+## Quick start
+
+You can use pre-quantized models from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account
+
+```bash
+# build
+cmake -B build
+cmake --build build --target llama-gemma3-cli
+
+# alternatively, install from brew (macOS)
+brew install llama.cpp
+
+# run it
+llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
+llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
+llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
+
+# note: 1B model does not support vision
+```
+
 ## How to get mmproj.gguf?
 
 ```bash

examples/llava/gemma3-cli.cpp

Lines changed: 3 additions & 3 deletions
@@ -78,7 +78,7 @@ struct gemma3_context {
     }
 
     void init_clip_model(common_params & params) {
-        const char * clip_path = params.mmproj.c_str();
+        const char * clip_path = params.mmproj.path.c_str();
         ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
     }
 
@@ -232,13 +232,13 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty()) {
+    if (params.mmproj.path.empty()) {
         show_additional_info(argc, argv);
         return 1;
     }
 
     gemma3_context ctx(params);
-    printf("%s: %s\n", __func__, params.model.c_str());
+    printf("%s: %s\n", __func__, params.model.path.c_str());
 
     bool is_single_turn = !params.prompt.empty() && !params.image.empty();
 

examples/llava/llava-cli.cpp

Lines changed: 3 additions & 3 deletions
@@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -234,7 +234,7 @@ static struct llama_model * llava_init(common_params * params) {
 }
 
 static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.c_str();
+    const char * clip_path = params->mmproj.path.c_str();
 
     auto prompt = params->prompt;
     if (prompt.empty()) {
@@ -283,7 +283,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+    if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
         return 1;
     }
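All three multimodal entry points above follow the same pattern once `params.mmproj` becomes a `common_params_model`. A condensed sketch, assuming only what the hunks show (the `clip_model_load` call and the empty-path check); `init_clip_sketch` is a hypothetical name:

```cpp
// Condensed sketch of the shared mmproj migration, based on the hunks above.
#include "clip.h"
#include "common.h"

static clip_ctx * init_clip_sketch(common_params & params) {
    // params.mmproj.path.empty() replaces the old params.mmproj.empty() check.
    if (params.mmproj.path.empty()) {
        return nullptr; // real callers print usage and exit, as in the diffs
    }
    // .path replaces the old bare string; the enclosing struct also carries
    // url/hf_repo/hf_file, which suggests (though this diff alone does not
    // confirm it) that the projector can now be fetched like the main model.
    return clip_model_load(params.mmproj.path.c_str(), params.verbosity > 1);
}
```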
