From de14b2acd66af64502d8fa54b8687bc525169336 Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 16:16:33 +0000
Subject: [PATCH 1/8] feat: support max_image_fetch_size to limit

---
 backends/llamacpp/src/main.rs |  5 ++++
 backends/trtllm/src/main.rs   |  4 +++
 backends/v2/src/main.rs       |  4 +++
 backends/v3/src/main.rs       |  4 +++
 router/src/server.rs          |  4 +++
 router/src/validation.rs      | 46 ++++++++++++++++++++++++++++++++---
 6 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index 9ee61ce6e2e..ea72dcaee53 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -157,6 +157,10 @@ struct Args {
     /// Maximum payload size in bytes.
     #[clap(default_value = "2000000", long, env)]
     payload_limit: usize,
+
+    /// Maximum image fetch size in bytes.
+    #[clap(default_value = "1073741824", long, env)]
+    max_image_fetch_size: usize,
 }
 
 #[tokio::main]
@@ -320,6 +324,7 @@ async fn main() -> Result<(), RouterError> {
         args.max_client_batch_size,
         args.usage_stats,
         args.payload_limit,
+        args.max_image_fetch_size,
         args.prometheus_port,
     )
     .await?;
diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
index 543f8e6e352..26d1dbf8c9c 100644
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@@ -67,6 +67,8 @@ struct Args {
     usage_stats: UsageStatsLevel,
     #[clap(default_value = "2000000", long, env)]
     payload_limit: usize,
+    #[clap(default_value = "1073741824", long, env)]
+    max_image_fetch_size: usize,
 }
 
 async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option<Tokenizer> {
@@ -244,6 +246,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         executor_worker,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
     } = args;
 
     // Launch Tokio runtime
@@ -325,6 +328,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
         prometheus_port,
     )
     .await?;
diff --git a/backends/v2/src/main.rs b/backends/v2/src/main.rs
index 60b5d52bbe2..575e62a0204 100644
--- a/backends/v2/src/main.rs
+++ b/backends/v2/src/main.rs
@@ -74,6 +74,8 @@ struct Args {
     usage_stats: usage_stats::UsageStatsLevel,
     #[clap(default_value = "2000000", long, env)]
     payload_limit: usize,
+    #[clap(default_value = "1073741824", long, env)]
+    max_image_fetch_size: usize,
 }
 
 #[derive(Debug, Subcommand)]
@@ -120,6 +122,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
     } = args;
 
     if let Some(Commands::PrintSchema) = command {
@@ -201,6 +204,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
         prometheus_port,
     )
     .await?;
diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs
index 44e63853e04..a71ec3d09a4 100644
--- a/backends/v3/src/main.rs
+++ b/backends/v3/src/main.rs
@@ -74,6 +74,8 @@ struct Args {
     usage_stats: usage_stats::UsageStatsLevel,
     #[clap(default_value = "2000000", long, env)]
     payload_limit: usize,
+    #[clap(default_value = "1073741824", long, env)]
+    max_image_fetch_size: usize,
 }
 
 #[derive(Debug, Subcommand)]
@@ -120,6 +122,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
     } = args;
 
     if let Some(Commands::PrintSchema) = command {
@@ -217,6 +220,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
         prometheus_port,
     )
     .await?;
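Every backend exposes the same knob through clap's derive API; the default of 1073741824 bytes is 1 GiB (1024^3). A minimal standalone sketch of the pattern, assuming clap with the `derive` and `env` features enabled (the binary below is illustrative, not part of the patch):

```rust
use clap::Parser;

#[derive(Parser, Debug)]
struct Args {
    /// Maximum image fetch size in bytes (1073741824 = 1 GiB).
    /// `long` derives a --max-image-fetch-size flag; `env` lets the
    /// MAX_IMAGE_FETCH_SIZE environment variable override the default.
    #[clap(default_value = "1073741824", long, env)]
    max_image_fetch_size: usize,
}

fn main() {
    let args = Args::parse();
    println!("image fetch limit: {} bytes", args.max_image_fetch_size);
}
```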
diff --git a/router/src/server.rs b/router/src/server.rs
index 97a0cea25ec..7f0bf74ebe0 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -1523,6 +1523,7 @@ pub async fn run(
     max_client_batch_size: usize,
     usage_stats_level: usage_stats::UsageStatsLevel,
     payload_limit: usize,
+    max_image_fetch_size: usize,
     prometheus_port: u16,
 ) -> Result<(), WebServerError> {
     // CORS allowed origins
@@ -1827,6 +1828,7 @@ pub async fn run(
         compat_return_full_text,
         allow_origin,
         payload_limit,
+        max_image_fetch_size,
         prometheus_port,
     )
     .await;
@@ -1889,6 +1891,7 @@ async fn start(
     compat_return_full_text: bool,
     allow_origin: Option<AllowOrigin>,
     payload_limit: usize,
+    max_image_fetch_size: usize,
     prometheus_port: u16,
 ) -> Result<(), WebServerError> {
     // Determine the server port based on the feature and environment variable.
@@ -1920,6 +1923,7 @@ async fn start(
         max_input_tokens,
         max_total_tokens,
         disable_grammar_support,
+        max_image_fetch_size,
     );
 
     let infer = Infer::new(
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 7717f373e4f..e614e7f27b5 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -12,7 +12,7 @@ use rand::{thread_rng, Rng};
 use serde_json::Value;
 /// Payload validation logic
 use std::cmp::min;
-use std::io::Cursor;
+use std::io::{Cursor, Read};
 use std::iter;
 use std::sync::Arc;
 use thiserror::Error;
@@ -34,6 +34,7 @@ pub struct Validation {
     max_input_length: usize,
     max_total_tokens: usize,
     disable_grammar_support: bool,
+    max_image_fetch_size: usize,
     /// Channel to communicate with the background tokenization task
     sender: mpsc::UnboundedSender<TokenizerRequest>,
 }
@@ -51,6 +52,7 @@ impl Validation {
         max_input_length: usize,
         max_total_tokens: usize,
         disable_grammar_support: bool,
+        max_image_fetch_size: usize,
     ) -> Self {
         let workers = if let Tokenizer::Python { .. } = &tokenizer {
             1
@@ -78,6 +80,7 @@ impl Validation {
                     config_clone,
                     preprocessor_config_clone,
                     tokenizer_receiver,
+                    max_image_fetch_size,
                 )
             });
         }
@@ -96,6 +99,7 @@ impl Validation {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            max_image_fetch_size,
         }
     }
 
@@ -480,6 +484,7 @@ fn tokenizer_worker(
     config: Option<Config>,
     preprocessor_config: Option<HubPreprocessorConfig>,
     mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
+    max_image_fetch_size: usize,
 ) {
     match tokenizer {
         Tokenizer::Python {
@@ -503,6 +508,7 @@ fn tokenizer_worker(
                         &tokenizer,
                         config.as_ref(),
                         preprocessor_config.as_ref(),
+                        max_image_fetch_size,
                     ))
                     .unwrap_or(())
             })
@@ -524,6 +530,7 @@ fn tokenizer_worker(
                         &tokenizer,
                         config.as_ref(),
                         preprocessor_config.as_ref(),
+                        max_image_fetch_size,
                     ))
                     .unwrap_or(())
             })
@@ -562,10 +569,29 @@ fn format_to_mimetype(format: ImageFormat) -> String {
     .to_string()
 }
 
-fn fetch_image(input: &str) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
+fn fetch_image(input: &str, max_image_fetch_size: usize) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
     if input.starts_with("![](http://") || input.starts_with("![](https://") {
         let url = &input["![](".len()..input.len() - 1];
-        let data = reqwest::blocking::get(url)?.bytes()?;
+        let mut response = reqwest::blocking::get(url)?;
+
+        // Check Content-Length header if present
+        if let Some(content_length) = response.content_length() {
+            if content_length as usize > max_image_fetch_size {
+                return Err(ValidationError::ImageTooLarge(
+                    content_length as usize,
+                    max_image_fetch_size,
+                ));
+            }
+        }
+
+        // Read the body with size limit to prevent unbounded memory allocation
+        let mut data = Vec::new();
+        let mut limited_reader = response.take((max_image_fetch_size + 1) as u64);
+        limited_reader.read_to_end(&mut data)?;
+
+        if data.len() > max_image_fetch_size {
+            return Err(ValidationError::ImageTooLarge(data.len(), max_image_fetch_size));
+        }
 
         let format = image::guess_format(&data)?;
         // TODO Remove this clone
@@ -787,6 +813,7 @@ fn prepare_input(
     tokenizer: &T,
     config: Option<&Config>,
     preprocessor_config: Option<&HubPreprocessorConfig>,
+    max_image_fetch_size: usize,
 ) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
     use Config::*;
     static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
@@ -805,7 +832,7 @@ fn prepare_input(
             input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string()));
             tokenizer_query.push_str(&inputs[start..chunk_start]);
         }
-        let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?;
+        let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end], max_image_fetch_size)?;
         input_chunks.push(Chunk::Image(Image { data, mimetype }));
         tokenizer_query.push_str(&image_tokens(config, preprocessor_config, height, width));
         start = chunk_end;
@@ -990,6 +1017,10 @@ pub enum ValidationError {
     InvalidImageContent(String),
     #[error("Could not fetch image: {0}")]
     FailedFetchImage(#[from] reqwest::Error),
+    #[error("Image size {0} bytes exceeds maximum allowed size of {1} bytes")]
+    ImageTooLarge(usize, usize),
+    #[error("Failed to read image data: {0}")]
+    ImageReadError(#[from] std::io::Error),
     #[error("{0} modality is not supported")]
     UnsupportedModality(&'static str),
 }
@@ -1023,6 +1054,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
 
         let max_new_tokens = 10;
@@ -1058,6 +1090,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
 
         let max_new_tokens = 10;
@@ -1092,6 +1125,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
         match validation
             .validate(GenerateRequest {
@@ -1132,6 +1166,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
        );
         match validation
             .validate(GenerateRequest {
@@ -1203,6 +1238,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
         match validation
             .validate(GenerateRequest {
@@ -1293,6 +1329,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
 
         let chunks = match validation
@@ -1349,6 +1386,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
 
         let (encoding, chunks) = match validation
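The fetch path now has two guards: a fast rejection based on the Content-Length header when the server declares one, and a hard cap enforced while streaming, since Content-Length can be absent or wrong. Reading through `Read::take(limit + 1)` makes an oversized body detectable without ever buffering more than one byte past the cap. A self-contained sketch of the same technique, assuming reqwest with the `blocking` feature (the `fetch_bounded` name and simplified `String` errors are illustrative):

```rust
use std::io::Read;

/// Fetch `url`, refusing bodies larger than `max_bytes`.
fn fetch_bounded(url: &str, max_bytes: usize) -> Result<Vec<u8>, String> {
    let response = reqwest::blocking::get(url).map_err(|e| e.to_string())?;

    // Fast path: reject before reading when the server declares the size.
    if let Some(len) = response.content_length() {
        if len > max_bytes as u64 {
            return Err(format!("declared {len} bytes, limit is {max_bytes}"));
        }
    }

    // Hard cap: read at most max_bytes + 1 bytes. Receiving that extra byte
    // proves the body exceeds the cap even if Content-Length was missing.
    let mut data = Vec::new();
    let mut limited = response.take(max_bytes as u64 + 1);
    limited.read_to_end(&mut data).map_err(|e| e.to_string())?;
    if data.len() > max_bytes {
        return Err(format!("body exceeds {max_bytes} byte limit"));
    }
    Ok(data)
}

fn main() {
    match fetch_bounded("https://example.com/image.png", 10 * 1024 * 1024) {
        Ok(bytes) => println!("fetched {} bytes", bytes.len()),
        Err(err) => eprintln!("rejected: {err}"),
    }
}
```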
From 768dcdb75fdf5b2e4a02b7401fb98a6fb29fcfa1 Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 17:03:35 +0000
Subject: [PATCH 2/8] fix: update model path for test

---
 server/tests/models/test_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/tests/models/test_model.py b/server/tests/models/test_model.py
index 8441e8c6e3f..46a27253d06 100644
--- a/server/tests/models/test_model.py
+++ b/server/tests/models/test_model.py
@@ -14,7 +14,7 @@ def batch_type(self):
     def generate_token(self, batch):
         raise NotImplementedError
 
-    tokenizer = AutoTokenizer.from_pretrained("huggingface/llama-7b")
+    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
 
     model = TestModel(
         "test_model_id",

From ffd4831abdd24caeb8f7749d09dcb4ffd8075556 Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 17:14:29 +0000
Subject: [PATCH 3/8] fix: adjust model repo id for test again

---
 server/tests/models/test_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/tests/models/test_model.py b/server/tests/models/test_model.py
index 46a27253d06..d048b9cc166 100644
--- a/server/tests/models/test_model.py
+++ b/server/tests/models/test_model.py
@@ -14,7 +14,7 @@ def batch_type(self):
     def generate_token(self, batch):
         raise NotImplementedError
 
-    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
 
     model = TestModel(
         "test_model_id",
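Back in patch 1, oversized and interrupted downloads surface as two distinct `ValidationError` variants, so callers can report them separately. A self-contained sketch of caller-side handling (the enum below mirrors only the two new variants — the real enum lives in router/src/validation.rs — and the `describe` helper is hypothetical):

```rust
use thiserror::Error;

// Mirror of the two variants added to ValidationError in patch 1.
#[derive(Debug, Error)]
enum ImageFetchError {
    #[error("Image size {0} bytes exceeds maximum allowed size of {1} bytes")]
    ImageTooLarge(usize, usize),
    #[error("Failed to read image data: {0}")]
    ImageReadError(#[from] std::io::Error),
}

// Hypothetical mapping of each failure mode to a user-facing message.
fn describe(err: &ImageFetchError) -> String {
    match err {
        ImageFetchError::ImageTooLarge(got, max) => {
            format!("image is {got} bytes; resize it below {max} bytes")
        }
        ImageFetchError::ImageReadError(io_err) => {
            format!("image download failed mid-read: {io_err}")
        }
    }
}

fn main() {
    let err = ImageFetchError::ImageTooLarge(2_000_000_000, 1_073_741_824);
    println!("{}", describe(&err));
}
```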
From 2305b5710019f8521de20b5e0fe235729737bd8e Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 18:51:57 +0000
Subject: [PATCH 4/8] fix: apply clippy lints

---
 router/src/chat.rs       |  2 +-
 router/src/validation.rs | 15 +++++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/router/src/chat.rs b/router/src/chat.rs
index d5824fea014..824b775b39c 100644
--- a/router/src/chat.rs
+++ b/router/src/chat.rs
@@ -673,7 +673,7 @@ mod tests {
             let (name, arguments) = get_tool_call_content(&events[0]);
             if let Some(name) = name {
                 assert_eq!(name, "get_current_weather");
-                output_name.push_str(&name);
+                output_name.push_str(name);
             }
             output.push_str(arguments);
         } else {
diff --git a/router/src/validation.rs b/router/src/validation.rs
index e614e7f27b5..65cd93b4247 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -569,10 +569,13 @@ fn format_to_mimetype(format: ImageFormat) -> String {
     .to_string()
 }
 
-fn fetch_image(input: &str, max_image_fetch_size: usize) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
+fn fetch_image(
+    input: &str,
+    max_image_fetch_size: usize,
+) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
     if input.starts_with("![](http://") || input.starts_with("![](https://") {
         let url = &input["![](".len()..input.len() - 1];
-        let mut response = reqwest::blocking::get(url)?;
+        let response = reqwest::blocking::get(url)?;
 
         // Check Content-Length header if present
         if let Some(content_length) = response.content_length() {
@@ -590,7 +593,10 @@ fn fetch_image(input: &str, max_image_fetch_size: usize) -> Result<(Vec<u8>, Str
         limited_reader.read_to_end(&mut data)?;
 
         if data.len() > max_image_fetch_size {
-            return Err(ValidationError::ImageTooLarge(data.len(), max_image_fetch_size));
+            return Err(ValidationError::ImageTooLarge(
+                data.len(),
+                max_image_fetch_size,
+            ));
         }
 
         let format = image::guess_format(&data)?;
@@ -832,7 +838,8 @@ fn prepare_input(
             input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string()));
             tokenizer_query.push_str(&inputs[start..chunk_start]);
         }
-        let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end], max_image_fetch_size)?;
+        let (data, mimetype, height, width) =
+            fetch_image(&inputs[chunk_start..chunk_end], max_image_fetch_size)?;
         input_chunks.push(Chunk::Image(Image { data, mimetype }));
         tokenizer_query.push_str(&image_tokens(config, preprocessor_config, height, width));
         start = chunk_end;

From 91aa786427a87783517fe2b2c67557d90b8225fb Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 20:43:13 +0000
Subject: [PATCH 5/8] fix: clippy fix

---
 router/src/validation.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/router/src/validation.rs b/router/src/validation.rs
index 65cd93b4247..b32f5f8b50e 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -34,7 +34,6 @@ pub struct Validation {
     max_input_length: usize,
     max_total_tokens: usize,
     disable_grammar_support: bool,
-    max_image_fetch_size: usize,
     /// Channel to communicate with the background tokenization task
     sender: mpsc::UnboundedSender<TokenizerRequest>,
 }
@@ -99,7 +98,6 @@ impl Validation {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
-            max_image_fetch_size,
         }
     }
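Patches 4 and 5 are lint cleanups: reflowing long signatures, dropping an unused `mut`, removing the `max_image_fetch_size` field that was stored on `Validation` but never read (the limit only needs to reach the worker threads), and fixing a `clippy::needless_borrow` in chat.rs. The last one in miniature (standalone sketch; the variable names mirror the test in router/src/chat.rs):

```rust
fn main() {
    let name: Option<&str> = Some("get_current_weather");
    let mut output_name = String::new();
    if let Some(name) = name {
        // `name` is already a &str, so `push_str(&name)` passes a needless
        // &&str that only compiles via auto-deref; clippy::needless_borrow
        // suggests passing it directly.
        output_name.push_str(name);
    }
    assert_eq!(output_name, "get_current_weather");
}
```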
From 42a65f154aa31f4476b1df03694f8683fedd493e Mon Sep 17 00:00:00 2001
From: drbh
Date: Fri, 14 Nov 2025 19:18:29 +0000
Subject: [PATCH 6/8] fix: avoid torch build isolation in docker

---
 server/Makefile-flash-att-v2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/Makefile-flash-att-v2 b/server/Makefile-flash-att-v2
index 9a946d97f8b..2097af3fd1b 100644
--- a/server/Makefile-flash-att-v2
+++ b/server/Makefile-flash-att-v2
@@ -3,7 +3,7 @@ flash_att_v2_commit_rocm := 47bd46e0204a95762ae48712fd1a3978827c77fd
 
 build-flash-attention-v2-cuda:
 	pip install -U packaging wheel
-	pip install flash-attn==$(flash_att_v2_commit_cuda)
+	pip install --no-build-isolation flash-attn==$(flash_att_v2_commit_cuda)
 
 install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
 	echo "Flash v2 installed"

From 9f17ef62cec275497095e45a5709115ea0db047e Mon Sep 17 00:00:00 2001
From: drbh
Date: Mon, 17 Nov 2025 21:48:53 +0000
Subject: [PATCH 7/8] fix: bump repo id in flash llama tests

---
 integration-tests/models/test_flash_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration-tests/models/test_flash_llama.py b/integration-tests/models/test_flash_llama.py
index bf49dc0b4b0..f24215a08cc 100644
--- a/integration-tests/models/test_flash_llama.py
+++ b/integration-tests/models/test_flash_llama.py
@@ -3,7 +3,7 @@
 
 
 @pytest.fixture(scope="module")
 def flash_llama_handle(launcher):
-    with launcher("huggingface/llama-7b", num_shard=2) as handle:
+    with launcher("huggyllama/llama-7b", num_shard=2) as handle:
         yield handle
From 8a307a2fb91c9aedd6ff43e231b9034e5f5e7c78 Mon Sep 17 00:00:00 2001
From: drbh
Date: Tue, 18 Nov 2025 01:09:14 +0000
Subject: [PATCH 8/8] fix: temporarily avoid problematic repos in tests

---
 integration-tests/models/test_flash_llama_fp8.py       | 3 +++
 integration-tests/models/test_flash_llama_marlin_24.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/integration-tests/models/test_flash_llama_fp8.py b/integration-tests/models/test_flash_llama_fp8.py
index 1980846d301..1d065f5d6a0 100644
--- a/integration-tests/models/test_flash_llama_fp8.py
+++ b/integration-tests/models/test_flash_llama_fp8.py
@@ -13,6 +13,7 @@ async def flash_llama_fp8(flash_llama_fp8_handle):
     return flash_llama_fp8_handle.client
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
@@ -26,6 +27,7 @@ async def test_flash_llama_fp8(flash_llama_fp8, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
@@ -49,6 +51,7 @@ async def test_flash_llama_fp8_all_params(flash_llama_fp8, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
diff --git a/integration-tests/models/test_flash_llama_marlin_24.py b/integration-tests/models/test_flash_llama_marlin_24.py
index 3eb94f02e18..bd364ecf9b9 100644
--- a/integration-tests/models/test_flash_llama_marlin_24.py
+++ b/integration-tests/models/test_flash_llama_marlin_24.py
@@ -15,6 +15,7 @@ async def flash_llama_marlin(flash_llama_marlin24_handle):
     return flash_llama_marlin24_handle.client
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
@@ -27,6 +28,7 @@ async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
@@ -50,6 +52,7 @@ async def test_flash_llama_marlin24_all_params(flash_llama_marlin, response_snap
     assert response == response_snapshot
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private