From de14b2acd66af64502d8fa54b8687bc525169336 Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 16:16:33 +0000
Subject: [PATCH 1/8] feat: support max_image_fetch_size to limit

---
 backends/llamacpp/src/main.rs |  5 ++++
 backends/trtllm/src/main.rs   |  4 +++
 backends/v2/src/main.rs       |  4 +++
 backends/v3/src/main.rs       |  4 +++
 router/src/server.rs          |  4 +++
 router/src/validation.rs      | 46 ++++++++++++++++++++++++++++++++---
 6 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index 9ee61ce6e2e..ea72dcaee53 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -157,6 +157,10 @@ struct Args {
     /// Maximum payload size in bytes.
     #[clap(default_value = "2000000", long, env)]
     payload_limit: usize,
+
+    /// Maximum image fetch size in bytes.
+    #[clap(default_value = "1073741824", long, env)]
+    max_image_fetch_size: usize,
 }
 
 #[tokio::main]
@@ -320,6 +324,7 @@ async fn main() -> Result<(), RouterError> {
         args.max_client_batch_size,
         args.usage_stats,
         args.payload_limit,
+        args.max_image_fetch_size,
         args.prometheus_port,
     )
     .await?;
diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
index 543f8e6e352..26d1dbf8c9c 100644
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@@ -67,6 +67,8 @@ struct Args {
     usage_stats: UsageStatsLevel,
     #[clap(default_value = "2000000", long, env)]
     payload_limit: usize,
+    #[clap(default_value = "1073741824", long, env)]
+    max_image_fetch_size: usize,
 }
 
 async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option<Tokenizer> {
@@ -244,6 +246,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         executor_worker,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
     } = args;
 
     // Launch Tokio runtime
@@ -325,6 +328,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
         prometheus_port,
     )
     .await?;
diff --git a/backends/v2/src/main.rs b/backends/v2/src/main.rs
index 60b5d52bbe2..575e62a0204 100644
--- a/backends/v2/src/main.rs
+++ b/backends/v2/src/main.rs
@@ -74,6 +74,8 @@ struct Args {
     usage_stats: usage_stats::UsageStatsLevel,
     #[clap(default_value = "2000000", long, env)]
     payload_limit: usize,
+    #[clap(default_value = "1073741824", long, env)]
+    max_image_fetch_size: usize,
 }
 
 #[derive(Debug, Subcommand)]
@@ -120,6 +122,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
     } = args;
 
     if let Some(Commands::PrintSchema) = command {
@@ -201,6 +204,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
         prometheus_port,
     )
     .await?;
diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs
index 44e63853e04..a71ec3d09a4 100644
--- a/backends/v3/src/main.rs
+++ b/backends/v3/src/main.rs
@@ -74,6 +74,8 @@ struct Args {
     usage_stats: usage_stats::UsageStatsLevel,
     #[clap(default_value = "2000000", long, env)]
     payload_limit: usize,
+    #[clap(default_value = "1073741824", long, env)]
+    max_image_fetch_size: usize,
 }
 
 #[derive(Debug, Subcommand)]
@@ -120,6 +122,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
     } = args;
 
     if let Some(Commands::PrintSchema) = command {
@@ -217,6 +220,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        max_image_fetch_size,
         prometheus_port,
     )
     .await?;
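Every backend exposes the same knob through clap's derive API; the default of 1073741824 bytes is 1 GiB (1024^3). A minimal standalone sketch of the pattern, assuming clap with the `derive` and `env` features enabled (the binary below is illustrative, not part of the patch):

```rust
use clap::Parser;

#[derive(Parser, Debug)]
struct Args {
    /// Maximum image fetch size in bytes (1073741824 = 1 GiB).
    /// `long` derives a --max-image-fetch-size flag; `env` lets the
    /// MAX_IMAGE_FETCH_SIZE environment variable override the default.
    #[clap(default_value = "1073741824", long, env)]
    max_image_fetch_size: usize,
}

fn main() {
    let args = Args::parse();
    println!("image fetch limit: {} bytes", args.max_image_fetch_size);
}
```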
diff --git a/router/src/server.rs b/router/src/server.rs
index 97a0cea25ec..7f0bf74ebe0 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -1523,6 +1523,7 @@ pub async fn run(
     max_client_batch_size: usize,
     usage_stats_level: usage_stats::UsageStatsLevel,
     payload_limit: usize,
+    max_image_fetch_size: usize,
     prometheus_port: u16,
 ) -> Result<(), WebServerError> {
     // CORS allowed origins
@@ -1827,6 +1828,7 @@ pub async fn run(
         compat_return_full_text,
         allow_origin,
         payload_limit,
+        max_image_fetch_size,
         prometheus_port,
     )
     .await;
@@ -1889,6 +1891,7 @@ async fn start(
     compat_return_full_text: bool,
     allow_origin: Option<AllowOrigin>,
     payload_limit: usize,
+    max_image_fetch_size: usize,
     prometheus_port: u16,
 ) -> Result<(), WebServerError> {
     // Determine the server port based on the feature and environment variable.
@@ -1920,6 +1923,7 @@ async fn start(
         max_input_tokens,
         max_total_tokens,
         disable_grammar_support,
+        max_image_fetch_size,
     );
 
     let infer = Infer::new(
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 7717f373e4f..e614e7f27b5 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -12,7 +12,7 @@ use rand::{thread_rng, Rng};
 use serde_json::Value;
 /// Payload validation logic
 use std::cmp::min;
-use std::io::Cursor;
+use std::io::{Cursor, Read};
 use std::iter;
 use std::sync::Arc;
 use thiserror::Error;
@@ -34,6 +34,7 @@ pub struct Validation {
     max_input_length: usize,
     max_total_tokens: usize,
     disable_grammar_support: bool,
+    max_image_fetch_size: usize,
     /// Channel to communicate with the background tokenization task
     sender: mpsc::UnboundedSender<TokenizerRequest>,
 }
@@ -51,6 +52,7 @@ impl Validation {
         max_input_length: usize,
         max_total_tokens: usize,
         disable_grammar_support: bool,
+        max_image_fetch_size: usize,
     ) -> Self {
         let workers = if let Tokenizer::Python { .. } = &tokenizer {
             1
@@ -78,6 +80,7 @@ impl Validation {
                     config_clone,
                     preprocessor_config_clone,
                     tokenizer_receiver,
+                    max_image_fetch_size,
                 )
             });
         }
@@ -96,6 +99,7 @@ impl Validation {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            max_image_fetch_size,
         }
     }
 
@@ -480,6 +484,7 @@ fn tokenizer_worker(
     config: Option<Config>,
     preprocessor_config: Option<HubPreprocessorConfig>,
     mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
+    max_image_fetch_size: usize,
 ) {
     match tokenizer {
         Tokenizer::Python {
@@ -503,6 +508,7 @@ fn tokenizer_worker(
                         &tokenizer,
                         config.as_ref(),
                         preprocessor_config.as_ref(),
+                        max_image_fetch_size,
                     ))
                     .unwrap_or(())
             })
@@ -524,6 +530,7 @@ fn tokenizer_worker(
                         &tokenizer,
                         config.as_ref(),
                         preprocessor_config.as_ref(),
+                        max_image_fetch_size,
                     ))
                     .unwrap_or(())
             })
@@ -562,10 +569,29 @@ fn format_to_mimetype(format: ImageFormat) -> String {
     .to_string()
 }
 
-fn fetch_image(input: &str) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
+fn fetch_image(input: &str, max_image_fetch_size: usize) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
     if input.starts_with("![](http://") || input.starts_with("![](https://") {
         let url = &input["![](".len()..input.len() - 1];
-        let data = reqwest::blocking::get(url)?.bytes()?;
+        let mut response = reqwest::blocking::get(url)?;
+
+        // Check Content-Length header if present
+        if let Some(content_length) = response.content_length() {
+            if content_length as usize > max_image_fetch_size {
+                return Err(ValidationError::ImageTooLarge(
+                    content_length as usize,
+                    max_image_fetch_size,
+                ));
+            }
+        }
+
+        // Read the body with size limit to prevent unbounded memory allocation
+        let mut data = Vec::new();
+        let mut limited_reader = response.take((max_image_fetch_size + 1) as u64);
+        limited_reader.read_to_end(&mut data)?;
+
+        if data.len() > max_image_fetch_size {
+            return Err(ValidationError::ImageTooLarge(data.len(), max_image_fetch_size));
+        }
 
         let format = image::guess_format(&data)?;
         // TODO Remove this clone
@@ -787,6 +813,7 @@ fn prepare_input(
     tokenizer: &T,
     config: Option<&Config>,
     preprocessor_config: Option<&HubPreprocessorConfig>,
+    max_image_fetch_size: usize,
 ) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
     use Config::*;
     static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
@@ -805,7 +832,7 @@ fn prepare_input(
             input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string()));
             tokenizer_query.push_str(&inputs[start..chunk_start]);
         }
-        let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?;
+        let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end], max_image_fetch_size)?;
         input_chunks.push(Chunk::Image(Image { data, mimetype }));
         tokenizer_query.push_str(&image_tokens(config, preprocessor_config, height, width));
         start = chunk_end;
@@ -990,6 +1017,10 @@ pub enum ValidationError {
     InvalidImageContent(String),
     #[error("Could not fetch image: {0}")]
     FailedFetchImage(#[from] reqwest::Error),
+    #[error("Image size {0} bytes exceeds maximum allowed size of {1} bytes")]
+    ImageTooLarge(usize, usize),
+    #[error("Failed to read image data: {0}")]
+    ImageReadError(#[from] std::io::Error),
     #[error("{0} modality is not supported")]
     UnsupportedModality(&'static str),
 }
@@ -1023,6 +1054,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
 
         let max_new_tokens = 10;
@@ -1058,6 +1090,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
 
         let max_new_tokens = 10;
@@ -1092,6 +1125,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
         match validation
             .validate(GenerateRequest {
@@ -1132,6 +1166,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
        );
         match validation
             .validate(GenerateRequest {
@@ -1203,6 +1238,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
         match validation
             .validate(GenerateRequest {
@@ -1293,6 +1329,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
 
         let chunks = match validation
@@ -1349,6 +1386,7 @@ mod tests {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
+            1024 * 1024 * 1024, // 1GB
         );
 
         let (encoding, chunks) = match validation
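The fetch path now has two guards: a fast rejection based on the Content-Length header when the server declares one, and a hard cap enforced while streaming, since Content-Length can be absent or wrong. Reading through `Read::take(limit + 1)` makes an oversized body detectable without ever buffering more than one byte past the cap. A self-contained sketch of the same technique, assuming reqwest with the `blocking` feature (the `fetch_bounded` name and simplified `String` errors are illustrative):

```rust
use std::io::Read;

/// Fetch `url`, refusing bodies larger than `max_bytes`.
fn fetch_bounded(url: &str, max_bytes: usize) -> Result<Vec<u8>, String> {
    let response = reqwest::blocking::get(url).map_err(|e| e.to_string())?;

    // Fast path: reject before reading when the server declares the size.
    if let Some(len) = response.content_length() {
        if len > max_bytes as u64 {
            return Err(format!("declared {len} bytes, limit is {max_bytes}"));
        }
    }

    // Hard cap: read at most max_bytes + 1 bytes. Receiving that extra byte
    // proves the body exceeds the cap even if Content-Length was missing.
    let mut data = Vec::new();
    let mut limited = response.take(max_bytes as u64 + 1);
    limited.read_to_end(&mut data).map_err(|e| e.to_string())?;
    if data.len() > max_bytes {
        return Err(format!("body exceeds {max_bytes} byte limit"));
    }
    Ok(data)
}

fn main() {
    match fetch_bounded("https://example.com/image.png", 10 * 1024 * 1024) {
        Ok(bytes) => println!("fetched {} bytes", bytes.len()),
        Err(err) => eprintln!("rejected: {err}"),
    }
}
```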
From 768dcdb75fdf5b2e4a02b7401fb98a6fb29fcfa1 Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 17:03:35 +0000
Subject: [PATCH 2/8] fix: update model path for test

---
 server/tests/models/test_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/tests/models/test_model.py b/server/tests/models/test_model.py
index 8441e8c6e3f..46a27253d06 100644
--- a/server/tests/models/test_model.py
+++ b/server/tests/models/test_model.py
@@ -14,7 +14,7 @@ def batch_type(self):
     def generate_token(self, batch):
         raise NotImplementedError
 
-    tokenizer = AutoTokenizer.from_pretrained("huggingface/llama-7b")
+    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
 
     model = TestModel(
         "test_model_id",

From ffd4831abdd24caeb8f7749d09dcb4ffd8075556 Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 17:14:29 +0000
Subject: [PATCH 3/8] fix: adjust model repo id for test again

---
 server/tests/models/test_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/tests/models/test_model.py b/server/tests/models/test_model.py
index 46a27253d06..d048b9cc166 100644
--- a/server/tests/models/test_model.py
+++ b/server/tests/models/test_model.py
@@ -14,7 +14,7 @@ def batch_type(self):
     def generate_token(self, batch):
         raise NotImplementedError
 
-    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
 
     model = TestModel(
         "test_model_id",
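Back in patch 1, oversized and interrupted downloads surface as two distinct `ValidationError` variants, so callers can report them separately. A self-contained sketch of caller-side handling (the enum below mirrors only the two new variants — the real enum lives in router/src/validation.rs — and the `describe` helper is hypothetical):

```rust
use thiserror::Error;

// Mirror of the two variants added to ValidationError in patch 1.
#[derive(Debug, Error)]
enum ImageFetchError {
    #[error("Image size {0} bytes exceeds maximum allowed size of {1} bytes")]
    ImageTooLarge(usize, usize),
    #[error("Failed to read image data: {0}")]
    ImageReadError(#[from] std::io::Error),
}

// Hypothetical mapping of each failure mode to a user-facing message.
fn describe(err: &ImageFetchError) -> String {
    match err {
        ImageFetchError::ImageTooLarge(got, max) => {
            format!("image is {got} bytes; resize it below {max} bytes")
        }
        ImageFetchError::ImageReadError(io_err) => {
            format!("image download failed mid-read: {io_err}")
        }
    }
}

fn main() {
    let err = ImageFetchError::ImageTooLarge(2_000_000_000, 1_073_741_824);
    println!("{}", describe(&err));
}
```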
From 2305b5710019f8521de20b5e0fe235729737bd8e Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 18:51:57 +0000
Subject: [PATCH 4/8] fix: apply clippy lints

---
 router/src/chat.rs       |  2 +-
 router/src/validation.rs | 15 +++++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/router/src/chat.rs b/router/src/chat.rs
index d5824fea014..824b775b39c 100644
--- a/router/src/chat.rs
+++ b/router/src/chat.rs
@@ -673,7 +673,7 @@ mod tests {
             let (name, arguments) = get_tool_call_content(&events[0]);
             if let Some(name) = name {
                 assert_eq!(name, "get_current_weather");
-                output_name.push_str(&name);
+                output_name.push_str(name);
             }
             output.push_str(arguments);
         } else {
diff --git a/router/src/validation.rs b/router/src/validation.rs
index e614e7f27b5..65cd93b4247 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -569,10 +569,13 @@ fn format_to_mimetype(format: ImageFormat) -> String {
     .to_string()
 }
 
-fn fetch_image(input: &str, max_image_fetch_size: usize) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
+fn fetch_image(
+    input: &str,
+    max_image_fetch_size: usize,
+) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
     if input.starts_with("![](http://") || input.starts_with("![](https://") {
         let url = &input["![](".len()..input.len() - 1];
-        let mut response = reqwest::blocking::get(url)?;
+        let response = reqwest::blocking::get(url)?;
 
         // Check Content-Length header if present
         if let Some(content_length) = response.content_length() {
@@ -590,7 +593,10 @@ fn fetch_image(input: &str, max_image_fetch_size: usize) -> Result<(Vec<u8>, Str
         limited_reader.read_to_end(&mut data)?;
 
         if data.len() > max_image_fetch_size {
-            return Err(ValidationError::ImageTooLarge(data.len(), max_image_fetch_size));
+            return Err(ValidationError::ImageTooLarge(
+                data.len(),
+                max_image_fetch_size,
+            ));
         }
 
         let format = image::guess_format(&data)?;
@@ -832,7 +838,8 @@ fn prepare_input(
             input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string()));
             tokenizer_query.push_str(&inputs[start..chunk_start]);
         }
-        let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end], max_image_fetch_size)?;
+        let (data, mimetype, height, width) =
+            fetch_image(&inputs[chunk_start..chunk_end], max_image_fetch_size)?;
         input_chunks.push(Chunk::Image(Image { data, mimetype }));
         tokenizer_query.push_str(&image_tokens(config, preprocessor_config, height, width));
         start = chunk_end;

From 91aa786427a87783517fe2b2c67557d90b8225fb Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 13 Nov 2025 20:43:13 +0000
Subject: [PATCH 5/8] fix: clippy fix

---
 router/src/validation.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/router/src/validation.rs b/router/src/validation.rs
index 65cd93b4247..b32f5f8b50e 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -34,7 +34,6 @@ pub struct Validation {
     max_input_length: usize,
     max_total_tokens: usize,
     disable_grammar_support: bool,
-    max_image_fetch_size: usize,
     /// Channel to communicate with the background tokenization task
     sender: mpsc::UnboundedSender<TokenizerRequest>,
 }
@@ -99,7 +98,6 @@ impl Validation {
             max_input_length,
             max_total_tokens,
             disable_grammar_support,
-            max_image_fetch_size,
         }
     }
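Patches 4 and 5 are lint cleanups: reflowing long signatures, dropping an unused `mut`, removing the `max_image_fetch_size` field that was stored on `Validation` but never read (the limit only needs to reach the worker threads), and fixing a `clippy::needless_borrow` in chat.rs. The last one in miniature (standalone sketch; the variable names mirror the test in router/src/chat.rs):

```rust
fn main() {
    let name: Option<&str> = Some("get_current_weather");
    let mut output_name = String::new();
    if let Some(name) = name {
        // `name` is already a &str, so `push_str(&name)` passes a needless
        // &&str that only compiles via auto-deref; clippy::needless_borrow
        // suggests passing it directly.
        output_name.push_str(name);
    }
    assert_eq!(output_name, "get_current_weather");
}
```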
From 42a65f154aa31f4476b1df03694f8683fedd493e Mon Sep 17 00:00:00 2001
From: drbh
Date: Fri, 14 Nov 2025 19:18:29 +0000
Subject: [PATCH 6/8] fix: avoid torch build isolation in docker

---
 server/Makefile-flash-att-v2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/Makefile-flash-att-v2 b/server/Makefile-flash-att-v2
index 9a946d97f8b..2097af3fd1b 100644
--- a/server/Makefile-flash-att-v2
+++ b/server/Makefile-flash-att-v2
@@ -3,7 +3,7 @@ flash_att_v2_commit_rocm := 47bd46e0204a95762ae48712fd1a3978827c77fd
 
 build-flash-attention-v2-cuda:
 	pip install -U packaging wheel
-	pip install flash-attn==$(flash_att_v2_commit_cuda)
+	pip install --no-build-isolation flash-attn==$(flash_att_v2_commit_cuda)
 
 install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
 	echo "Flash v2 installed"

From 9f17ef62cec275497095e45a5709115ea0db047e Mon Sep 17 00:00:00 2001
From: drbh
Date: Mon, 17 Nov 2025 21:48:53 +0000
Subject: [PATCH 7/8] fix: bump repo id in flash llama tests

---
 integration-tests/models/test_flash_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration-tests/models/test_flash_llama.py b/integration-tests/models/test_flash_llama.py
index bf49dc0b4b0..f24215a08cc 100644
--- a/integration-tests/models/test_flash_llama.py
+++ b/integration-tests/models/test_flash_llama.py
@@ -3,7 +3,7 @@
 
 
 @pytest.fixture(scope="module")
 def flash_llama_handle(launcher):
-    with launcher("huggingface/llama-7b", num_shard=2) as handle:
+    with launcher("huggyllama/llama-7b", num_shard=2) as handle:
         yield handle
From 8a307a2fb91c9aedd6ff43e231b9034e5f5e7c78 Mon Sep 17 00:00:00 2001
From: drbh
Date: Tue, 18 Nov 2025 01:09:14 +0000
Subject: [PATCH 8/8] fix: temporarily avoid problematic repos in tests

---
 integration-tests/models/test_flash_llama_fp8.py       | 3 +++
 integration-tests/models/test_flash_llama_marlin_24.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/integration-tests/models/test_flash_llama_fp8.py b/integration-tests/models/test_flash_llama_fp8.py
index 1980846d301..1d065f5d6a0 100644
--- a/integration-tests/models/test_flash_llama_fp8.py
+++ b/integration-tests/models/test_flash_llama_fp8.py
@@ -13,6 +13,7 @@ async def flash_llama_fp8(flash_llama_fp8_handle):
     return flash_llama_fp8_handle.client
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
@@ -26,6 +27,7 @@ async def test_flash_llama_fp8(flash_llama_fp8, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
@@ -49,6 +51,7 @@ async def test_flash_llama_fp8_all_params(flash_llama_fp8, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
diff --git a/integration-tests/models/test_flash_llama_marlin_24.py b/integration-tests/models/test_flash_llama_marlin_24.py
index 3eb94f02e18..bd364ecf9b9 100644
--- a/integration-tests/models/test_flash_llama_marlin_24.py
+++ b/integration-tests/models/test_flash_llama_marlin_24.py
@@ -15,6 +15,7 @@ async def flash_llama_marlin(flash_llama_marlin24_handle):
     return flash_llama_marlin24_handle.client
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
@@ -27,6 +28,7 @@ async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
@@ -50,6 +52,7 @@ async def test_flash_llama_marlin24_all_params(flash_llama_marlin, response_snap
     assert response == response_snapshot
 
 
+@pytest.mark.skip(reason="Issue with the model access")
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private