Description
System Info
OS: Ubuntu 20.04
Docker setup:
docker-compose.yml
services:
  tgi:
    image: ghcr.io/huggingface/text-generation-inference:3.2.3
    command: --model-id ${MODEL}
    environment:
      - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    ports:
      - "80:80"
    volumes:
      - ~/.cache/huggingface/hub:/data
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    shm_size: 1g
Model being used: Qwen/Qwen2.5-VL-3B-Instruct
Hardware: A6000
Information
- Docker
- The CLI directly
Tasks
- An officially supported command
- My own modifications
Reproduction
Steps to reproduce the issue:
- Start up TGI using the docker compose file above (a sketch for reading back the server's effective limits via its /info endpoint follows the startup log below).
It produces this output:
tgi-1 | 2025-04-16T17:42:18.100436Z INFO text_generation_launcher: Args {
tgi-1 | model_id: "Qwen/Qwen2.5-VL-3B-Instruct",
tgi-1 | revision: None,
tgi-1 | validation_workers: 2,
tgi-1 | sharded: None,
tgi-1 | num_shard: None,
tgi-1 | quantize: None,
tgi-1 | speculate: None,
tgi-1 | dtype: None,
tgi-1 | kv_cache_dtype: None,
tgi-1 | trust_remote_code: false,
tgi-1 | max_concurrent_requests: 128,
tgi-1 | max_best_of: 2,
tgi-1 | max_stop_sequences: 4,
tgi-1 | max_top_n_tokens: 5,
tgi-1 | max_input_tokens: None,
tgi-1 | max_input_length: None,
tgi-1 | max_total_tokens: None,
tgi-1 | waiting_served_ratio: 0.3,
tgi-1 | max_batch_prefill_tokens: None,
tgi-1 | max_batch_total_tokens: None,
tgi-1 | max_waiting_tokens: 20,
tgi-1 | max_batch_size: None,
tgi-1 | cuda_graphs: None,
tgi-1 | hostname: "15090dd2add3",
tgi-1 | port: 80,
tgi-1 | shard_uds_path: "/tmp/text-generation-server",
tgi-1 | master_addr: "localhost",
tgi-1 | master_port: 29500,
tgi-1 | huggingface_hub_cache: None,
tgi-1 | weights_cache_override: None,
tgi-1 | disable_custom_kernels: false,
tgi-1 | cuda_memory_fraction: 1.0,
tgi-1 | rope_scaling: None,
tgi-1 | rope_factor: None,
tgi-1 | json_output: false,
tgi-1 | otlp_endpoint: None,
tgi-1 | otlp_service_name: "text-generation-inference.router",
tgi-1 | cors_allow_origin: [],
tgi-1 | api_key: None,
tgi-1 | watermark_gamma: None,
tgi-1 | watermark_delta: None,
tgi-1 | ngrok: false,
tgi-1 | ngrok_authtoken: None,
tgi-1 | ngrok_edge: None,
tgi-1 | tokenizer_config_path: None,
tgi-1 | disable_grammar_support: false,
tgi-1 | env: false,
tgi-1 | max_client_batch_size: 4,
tgi-1 | lora_adapters: None,
tgi-1 | usage_stats: On,
tgi-1 | payload_limit: 2000000,
tgi-1 | enable_prefill_logprobs: false,
tgi-1 | graceful_termination_timeout: 90,
tgi-1 | }
tgi-1 | 2025-04-16T17:42:19.440429Z INFO text_generation_launcher: Disabling prefix caching because of VLM model
tgi-1 | 2025-04-16T17:42:19.440443Z INFO text_generation_launcher: Using attention flashinfer - Prefix caching 0
tgi-1 | 2025-04-16T17:42:19.501683Z WARN text_generation_launcher: Unkown compute for card nvidia-rtx-a6000
tgi-1 | 2025-04-16T17:42:19.568964Z INFO text_generation_launcher: Default `max_batch_prefill_tokens` to 10000
tgi-1 | 2025-04-16T17:42:19.568984Z INFO text_generation_launcher: Using default cuda graphs [1, 2, 4, 8, 16, 32]
tgi-1 | 2025-04-16T17:42:19.569139Z INFO download: text_generation_launcher: Starting check and download process for Qwen/Qwen2.5-VL-3B-Instruct
tgi-1 | 2025-04-16T17:42:23.364798Z INFO text_generation_launcher: Files are already present on the host. Skipping download.
tgi-1 | 2025-04-16T17:42:23.990024Z INFO download: text_generation_launcher: Successfully downloaded weights for Qwen/Qwen2.5-VL-3B-Instruct
tgi-1 | 2025-04-16T17:42:23.990262Z INFO shard-manager: text_generation_launcher: Starting shard rank=0
tgi-1 | 2025-04-16T17:42:27.702616Z INFO text_generation_launcher: Using prefix caching = False
tgi-1 | 2025-04-16T17:42:27.702644Z INFO text_generation_launcher: Using Attention = flashinfer
tgi-1 | 2025-04-16T17:42:34.016804Z INFO shard-manager: text_generation_launcher: Waiting for shard to be ready... rank=0
tgi-1 | 2025-04-16T17:42:39.499586Z INFO text_generation_launcher: Using prefill chunking = False
tgi-1 | 2025-04-16T17:42:40.220379Z INFO text_generation_launcher: Server started at unix:///tmp/text-generation-server-0
tgi-1 | 2025-04-16T17:42:40.222196Z INFO shard-manager: text_generation_launcher: Shard ready in 16.212793444s rank=0
tgi-1 | 2025-04-16T17:42:40.305185Z INFO text_generation_launcher: Starting Webserver
tgi-1 | 2025-04-16T17:42:40.403139Z INFO text_generation_router_v3: backends/v3/src/lib.rs:125: Warming up model
tgi-1 | 2025-04-16T17:42:40.481633Z INFO text_generation_launcher: Using optimized Triton indexing kernels.
tgi-1 | 2025-04-16T17:42:44.453072Z INFO text_generation_launcher: KV-cache blocks: 1008993, size: 1
tgi-1 | 2025-04-16T17:42:44.500701Z INFO text_generation_launcher: Cuda Graphs are enabled for sizes [32, 16, 8, 4, 2, 1]
tgi-1 | 2025-04-16T17:42:45.831884Z INFO text_generation_router_v3: backends/v3/src/lib.rs:137: Setting max batch total tokens to 1008993
tgi-1 | 2025-04-16T17:42:45.831905Z INFO text_generation_router_v3: backends/v3/src/lib.rs:166: Using backend V3
tgi-1 | 2025-04-16T17:42:45.831911Z INFO text_generation_router: backends/v3/src/main.rs:162: Maximum input tokens defaulted to 9999
tgi-1 | 2025-04-16T17:42:45.831915Z INFO text_generation_router: backends/v3/src/main.rs:168: Maximum total tokens defaulted to 10000
tgi-1 | 2025-04-16T17:42:45.831954Z INFO text_generation_router::server: router/src/server.rs:1560: Using the Hugging Face API
tgi-1 | 2025-04-16T17:42:46.127361Z INFO text_generation_router::server: router/src/server.rs:2309: Serving revision 66285546d2b821cf421d4f5eb2576359d3770cd3 of model Qwen/Qwen2.5-VL-3B-Instruct
tgi-1 | 2025-04-16T17:42:46.127423Z WARN text_generation_router::server: router/src/server.rs:1648: Tokenizer_config None - Some("/data/hub/models--Qwen--Qwen2.5-VL-3B-Instruct/snapshots/66285546d2b821cf421d4f5eb2576359d3770cd3/tokenizer_config.json")
tgi-1 | 2025-04-16T17:42:46.127465Z INFO text_generation_router::server: router/src/server.rs:1661: Using chat template from chat_template.json
tgi-1 | 2025-04-16T17:42:48.864511Z INFO text_generation_router::server: router/src/server.rs:1716: Using config Some(Qwen2_5Vl(Qwen2_5Vl { vision_config: Qwen2_5VlVisionConfig { depth: 32, hidden_act: "silu", hidden_size: 1280, intermediate_size: 3420, num_heads: 16, in_chans: 3, out_hidden_size: 2048, patch_size: 14, spatial_merge_size: 2, spatial_patch_size: 14, window_size: 112, fullatt_block_indexes: [7, 15, 23, 31], tokens_per_second: 2, temporal_patch_size: 2 } }))
tgi-1 | 2025-04-16T17:42:49.135511Z WARN text_generation_router::server: router/src/server.rs:1879: Invalid hostname, defaulting to 0.0.0.0
tgi-1 | 2025-04-16T17:42:49.204635Z INFO text_generation_router::server: router/src/server.rs:2266: Connected
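Note (my addition, not in the original report): once the server is up, the limits it settled on can also be read back from TGI's /info endpoint. A minimal Python sketch, assuming the "80:80" port mapping from the compose file above:

import json
import requests

# Query the running TGI server's /info endpoint.
info = requests.get("http://localhost:80/info").json()

# The payload includes the effective token limits the router chose
# (exact field names may vary between TGI versions).
print(json.dumps(info, indent=2))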
- Run a curl command with a long context (replace the "inputs" value with a very long prompt); an equivalent Python reproduction sketch follows the error output below:
curl -X 'POST' \
'http://localhost:80/generate' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"inputs": "My name is Olivier and I My name is Olivier and I My name is Olivier and I...",
"parameters": {
}
}'
It produces this output:
tgi-1 | 2025-04-16T17:45:51.750055Z ERROR generate{parameters=GenerateParameters { best_of: None, temperature: None, repetition_penalty: None, frequency_penalty: None, top_k: None, top_p: None, typical_p: None, do_sample: false, max_new_tokens: None, return_full_text: None, stop: [], truncate: None, watermark: false, details: false, decoder_input_details: false, seed: None, top_n_tokens: None, grammar: None, adapter_id: None }}:generate:generate_stream: text_generation_router::infer: router/src/infer/mod.rs:126: `inputs` tokens + `max_new_tokens` must be <= 10000. Given: 14815 `inputs` tokens and 0 `max_new_tokens`
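For convenience, here is an equivalent Python reproduction sketch (my own addition, not part of the original report). It builds a repeated prompt, counts its tokens to confirm it exceeds the 10000-token default, and sends the same /generate request as the curl command above. The repetition count and the use of transformers' AutoTokenizer as a stand-in for the router's tokenizer are assumptions.

import requests
from transformers import AutoTokenizer

MODEL = "Qwen/Qwen2.5-VL-3B-Instruct"

# Build a prompt long enough to exceed the 10000-token default limit.
prompt = "My name is Olivier and I " * 3000

# Rough token count (assumption: the Hub tokenizer matches the one TGI loads).
tokenizer = AutoTokenizer.from_pretrained(MODEL)
print("prompt tokens:", len(tokenizer(prompt)["input_ids"]))

# Same request as the curl command above.
response = requests.post(
    "http://localhost:80/generate",
    headers={"accept": "application/json"},
    json={"inputs": prompt, "parameters": {}},
)
print(response.status_code)
print(response.text[:500])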
Expected behavior
I expected this long-context request to succeed. More specifically, I would have expected the server to initialize the maximum input tokens and maximum total tokens to match the model's max_position_embeddings, which is 128000 (https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct/blob/main/config.json#L17). Instead, it initialized them around 10000 for a reason unknown to me ("Maximum input tokens defaulted to 9999" and "Maximum total tokens defaulted to 10000"). The same happens with meta-llama/Llama-3.2-11B-Vision-Instruct.
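As a quick sanity check (my addition, not part of the original report), the advertised context length can be read straight from the model's config.json on the Hub; a minimal sketch using huggingface_hub:

import json
from huggingface_hub import hf_hub_download

# Download only config.json for the model and print its context length.
# Expected to print 128000, matching the config linked above.
path = hf_hub_download("Qwen/Qwen2.5-VL-3B-Instruct", "config.json")
with open(path) as f:
    config = json.load(f)
print(config.get("max_position_embeddings"))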
The behavior I expected is what I get when running the non-vision model Qwen/Qwen2.5-3B-Instruct, which works fine; the same is true of meta-llama/Llama-3.1-8B-Instruct. The startup log and request output for the Qwen/Qwen2.5-3B-Instruct run follow:
tgi-1 | 2025-04-16T17:55:33.277248Z INFO text_generation_launcher: Args {
tgi-1 | model_id: "Qwen/Qwen2.5-3B-Instruct",
tgi-1 | revision: None,
tgi-1 | validation_workers: 2,
tgi-1 | sharded: None,
tgi-1 | num_shard: None,
tgi-1 | quantize: None,
tgi-1 | speculate: None,
tgi-1 | dtype: None,
tgi-1 | kv_cache_dtype: None,
tgi-1 | trust_remote_code: false,
tgi-1 | max_concurrent_requests: 128,
tgi-1 | max_best_of: 2,
tgi-1 | max_stop_sequences: 4,
tgi-1 | max_top_n_tokens: 5,
tgi-1 | max_input_tokens: None,
tgi-1 | max_input_length: None,
tgi-1 | max_total_tokens: None,
tgi-1 | waiting_served_ratio: 0.3,
tgi-1 | max_batch_prefill_tokens: None,
tgi-1 | max_batch_total_tokens: None,
tgi-1 | max_waiting_tokens: 20,
tgi-1 | max_batch_size: None,
tgi-1 | cuda_graphs: None,
tgi-1 | hostname: "21d0e584572b",
tgi-1 | port: 80,
tgi-1 | shard_uds_path: "/tmp/text-generation-server",
tgi-1 | master_addr: "localhost",
tgi-1 | master_port: 29500,
tgi-1 | huggingface_hub_cache: None,
tgi-1 | weights_cache_override: None,
tgi-1 | disable_custom_kernels: false,
tgi-1 | cuda_memory_fraction: 1.0,
tgi-1 | rope_scaling: None,
tgi-1 | rope_factor: None,
tgi-1 | json_output: false,
tgi-1 | otlp_endpoint: None,
tgi-1 | otlp_service_name: "text-generation-inference.router",
tgi-1 | cors_allow_origin: [],
tgi-1 | api_key: None,
tgi-1 | watermark_gamma: None,
tgi-1 | watermark_delta: None,
tgi-1 | ngrok: false,
tgi-1 | ngrok_authtoken: None,
tgi-1 | ngrok_edge: None,
tgi-1 | tokenizer_config_path: None,
tgi-1 | disable_grammar_support: false,
tgi-1 | env: false,
tgi-1 | max_client_batch_size: 4,
tgi-1 | lora_adapters: None,
tgi-1 | usage_stats: On,
tgi-1 | payload_limit: 2000000,
tgi-1 | enable_prefill_logprobs: false,
tgi-1 | graceful_termination_timeout: 90,
tgi-1 | }
tgi-1 | 2025-04-16T17:55:34.937620Z INFO text_generation_launcher: Using attention flashinfer - Prefix caching true
tgi-1 | 2025-04-16T17:55:34.977759Z WARN text_generation_launcher: Unkown compute for card nvidia-rtx-a6000
tgi-1 | 2025-04-16T17:55:35.018086Z INFO text_generation_launcher: Default `max_batch_prefill_tokens` to 4096
tgi-1 | 2025-04-16T17:55:35.018106Z INFO text_generation_launcher: Using default cuda graphs [1, 2, 4, 8, 16, 32]
tgi-1 | 2025-04-16T17:55:35.018254Z INFO download: text_generation_launcher: Starting check and download process for Qwen/Qwen2.5-3B-Instruct
tgi-1 | 2025-04-16T17:55:38.974552Z INFO text_generation_launcher: Download file: model-00001-of-00002.safetensors
tgi-1 | 2025-04-16T17:56:14.760098Z INFO text_generation_launcher: Downloaded /data/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1/model-00001-of-00002.safetensors in 0:00:35.
tgi-1 | 2025-04-16T17:56:14.760253Z INFO text_generation_launcher: Download: [1/2] -- ETA: 0:00:35
tgi-1 | 2025-04-16T17:56:14.760703Z INFO text_generation_launcher: Download file: model-00002-of-00002.safetensors
tgi-1 | 2025-04-16T17:56:36.298985Z INFO text_generation_launcher: Downloaded /data/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1/model-00002-of-00002.safetensors in 0:00:21.
tgi-1 | 2025-04-16T17:56:36.299139Z INFO text_generation_launcher: Download: [2/2] -- ETA: 0
tgi-1 | 2025-04-16T17:56:36.988721Z INFO download: text_generation_launcher: Successfully downloaded weights for Qwen/Qwen2.5-3B-Instruct
tgi-1 | 2025-04-16T17:56:36.988997Z INFO shard-manager: text_generation_launcher: Starting shard rank=0
tgi-1 | 2025-04-16T17:56:40.726163Z INFO text_generation_launcher: Using prefix caching = True
tgi-1 | 2025-04-16T17:56:40.726192Z INFO text_generation_launcher: Using Attention = flashinfer
tgi-1 | 2025-04-16T17:56:47.019674Z INFO shard-manager: text_generation_launcher: Waiting for shard to be ready... rank=0
tgi-1 | 2025-04-16T17:56:56.805715Z INFO text_generation_launcher: Using prefill chunking = True
tgi-1 | 2025-04-16T17:56:57.035209Z INFO shard-manager: text_generation_launcher: Waiting for shard to be ready... rank=0
tgi-1 | 2025-04-16T17:56:57.060259Z INFO text_generation_launcher: Server started at unix:///tmp/text-generation-server-0
tgi-1 | 2025-04-16T17:56:57.135310Z INFO shard-manager: text_generation_launcher: Shard ready in 20.124977041s rank=0
tgi-1 | 2025-04-16T17:56:57.215907Z INFO text_generation_launcher: Starting Webserver
tgi-1 | 2025-04-16T17:56:57.310651Z INFO text_generation_router_v3: backends/v3/src/lib.rs:125: Warming up model
tgi-1 | 2025-04-16T17:56:57.338336Z INFO text_generation_launcher: Using optimized Triton indexing kernels.
tgi-1 | 2025-04-16T17:56:59.574858Z INFO text_generation_launcher: KV-cache blocks: 1069251, size: 1
tgi-1 | 2025-04-16T17:56:59.595217Z INFO text_generation_launcher: Cuda Graphs are enabled for sizes [32, 16, 8, 4, 2, 1]
tgi-1 | 2025-04-16T17:57:01.163878Z INFO text_generation_router_v3: backends/v3/src/lib.rs:137: Setting max batch total tokens to 1069251
tgi-1 | 2025-04-16T17:57:01.163898Z WARN text_generation_router_v3::backend: backends/v3/src/backend.rs:39: Model supports prefill chunking. `waiting_served_ratio` and `max_waiting_tokens` will be ignored.
tgi-1 | 2025-04-16T17:57:01.163909Z INFO text_generation_router_v3: backends/v3/src/lib.rs:166: Using backend V3
tgi-1 | 2025-04-16T17:57:01.163914Z INFO text_generation_router: backends/v3/src/main.rs:162: Maximum input tokens defaulted to 32767
tgi-1 | 2025-04-16T17:57:01.163918Z INFO text_generation_router: backends/v3/src/main.rs:168: Maximum total tokens defaulted to 32768
tgi-1 | 2025-04-16T17:57:01.163958Z INFO text_generation_router::server: router/src/server.rs:1560: Using the Hugging Face API
tgi-1 | 2025-04-16T17:57:01.592383Z INFO text_generation_router::server: router/src/server.rs:2309: Serving revision aa8e72537993ba99e69dfaafa59ed015b17504d1 of model Qwen/Qwen2.5-3B-Instruct
tgi-1 | 2025-04-16T17:57:01.592403Z WARN text_generation_router::server: router/src/server.rs:1648: Tokenizer_config None - Some("/data/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1/tokenizer_config.json")
tgi-1 | 2025-04-16T17:57:04.483820Z INFO text_generation_router::server: router/src/server.rs:1716: Using config Some(Qwen2)
tgi-1 | 2025-04-16T17:57:04.720059Z WARN text_generation_router::server: router/src/server.rs:1879: Invalid hostname, defaulting to 0.0.0.0
tgi-1 | 2025-04-16T17:57:04.787658Z INFO text_generation_router::server: router/src/server.rs:2266: Connected
tgi-1 | 2025-04-16T17:57:44.821938Z INFO text_generation_router_v3::radix: backends/v3/src/radix.rs:108: Prefix 0 - Suffix 15838
tgi-1 | 2025-04-16T17:58:12.285642Z INFO text_generation_router_v3::radix: backends/v3/src/radix.rs:108: Prefix 14815 - Suffix 2047
tgi-1 | 2025-04-16T17:58:29.783466Z INFO text_generation_launcher: Terminating webserver
tgi-1 | 2025-04-16T17:58:29.783495Z INFO text_generation_launcher: Waiting for webserver to gracefully shutdown
tgi-1 | 2025-04-16T17:58:29.783633Z INFO text_generation_router::server: router/src/server.rs:2363: signal received, starting graceful shutdown
tgi-1 | 2025-04-16T17:58:36.061853Z INFO text_generation_router_v3::radix: backends/v3/src/radix.rs:108: Prefix 15839 - Suffix 2047
tgi-1 | 2025-04-16T17:58:45.571689Z INFO text_generation_launcher: Args {
tgi-1 | model_id: "Qwen/Qwen2.5-3B-Instruct",
tgi-1 | revision: None,
tgi-1 | validation_workers: 2,
tgi-1 | sharded: None,
tgi-1 | num_shard: None,
tgi-1 | quantize: None,
tgi-1 | speculate: None,
tgi-1 | dtype: None,
tgi-1 | kv_cache_dtype: None,
tgi-1 | trust_remote_code: false,
tgi-1 | max_concurrent_requests: 128,
tgi-1 | max_best_of: 2,
tgi-1 | max_stop_sequences: 4,
tgi-1 | max_top_n_tokens: 5,
tgi-1 | max_input_tokens: None,
tgi-1 | max_input_length: None,
tgi-1 | max_total_tokens: None,
tgi-1 | waiting_served_ratio: 0.3,
tgi-1 | max_batch_prefill_tokens: None,
tgi-1 | max_batch_total_tokens: None,
tgi-1 | max_waiting_tokens: 20,
tgi-1 | max_batch_size: None,
tgi-1 | cuda_graphs: None,
tgi-1 | hostname: "21d0e584572b",
tgi-1 | port: 80,
tgi-1 | shard_uds_path: "/tmp/text-generation-server",
tgi-1 | master_addr: "localhost",
tgi-1 | master_port: 29500,
tgi-1 | huggingface_hub_cache: None,
tgi-1 | weights_cache_override: None,
tgi-1 | disable_custom_kernels: false,
tgi-1 | cuda_memory_fraction: 1.0,
tgi-1 | rope_scaling: None,
tgi-1 | rope_factor: None,
tgi-1 | json_output: false,
tgi-1 | otlp_endpoint: None,
tgi-1 | otlp_service_name: "text-generation-inference.router",
tgi-1 | cors_allow_origin: [],
tgi-1 | api_key: None,
tgi-1 | watermark_gamma: None,
tgi-1 | watermark_delta: None,
tgi-1 | ngrok: false,
tgi-1 | ngrok_authtoken: None,
tgi-1 | ngrok_edge: None,
tgi-1 | tokenizer_config_path: None,
tgi-1 | disable_grammar_support: false,
tgi-1 | env: false,
tgi-1 | max_client_batch_size: 4,
tgi-1 | lora_adapters: None,
tgi-1 | usage_stats: On,
tgi-1 | payload_limit: 2000000,
tgi-1 | enable_prefill_logprobs: false,
tgi-1 | graceful_termination_timeout: 90,
tgi-1 | }
tgi-1 | 2025-04-16T17:58:46.896119Z INFO text_generation_launcher: Using attention flashinfer - Prefix caching true
tgi-1 | 2025-04-16T17:58:46.949971Z WARN text_generation_launcher: Unkown compute for card nvidia-rtx-a6000
tgi-1 | 2025-04-16T17:58:47.006974Z INFO text_generation_launcher: Default `max_batch_prefill_tokens` to 4096
tgi-1 | 2025-04-16T17:58:47.006993Z INFO text_generation_launcher: Using default cuda graphs [1, 2, 4, 8, 16, 32]
tgi-1 | 2025-04-16T17:58:47.007168Z INFO download: text_generation_launcher: Starting check and download process for Qwen/Qwen2.5-3B-Instruct
tgi-1 | 2025-04-16T17:58:50.894011Z INFO text_generation_launcher: Files are already present on the host. Skipping download.
tgi-1 | 2025-04-16T17:58:51.532562Z INFO download: text_generation_launcher: Successfully downloaded weights for Qwen/Qwen2.5-3B-Instruct
tgi-1 | 2025-04-16T17:58:51.532887Z INFO shard-manager: text_generation_launcher: Starting shard rank=0
tgi-1 | 2025-04-16T17:58:55.182433Z INFO text_generation_launcher: Using prefix caching = True
tgi-1 | 2025-04-16T17:58:55.182460Z INFO text_generation_launcher: Using Attention = flashinfer
tgi-1 | 2025-04-16T17:59:01.561594Z INFO shard-manager: text_generation_launcher: Waiting for shard to be ready... rank=0
tgi-1 | 2025-04-16T17:59:08.928743Z INFO text_generation_launcher: Using prefill chunking = True
tgi-1 | 2025-04-16T17:59:09.523752Z INFO text_generation_launcher: Server started at unix:///tmp/text-generation-server-0
tgi-1 | 2025-04-16T17:59:09.569182Z INFO shard-manager: text_generation_launcher: Shard ready in 18.016616627s rank=0
tgi-1 | 2025-04-16T17:59:09.649364Z INFO text_generation_launcher: Starting Webserver
tgi-1 | 2025-04-16T17:59:09.728550Z INFO text_generation_router_v3: backends/v3/src/lib.rs:125: Warming up model
tgi-1 | 2025-04-16T17:59:09.754738Z INFO text_generation_launcher: Using optimized Triton indexing kernels.
tgi-1 | 2025-04-16T17:59:11.267111Z INFO text_generation_launcher: KV-cache blocks: 1069251, size: 1
tgi-1 | 2025-04-16T17:59:11.286388Z INFO text_generation_launcher: Cuda Graphs are enabled for sizes [32, 16, 8, 4, 2, 1]
tgi-1 | 2025-04-16T17:59:12.844359Z INFO text_generation_router_v3: backends/v3/src/lib.rs:137: Setting max batch total tokens to 1069251
tgi-1 | 2025-04-16T17:59:12.844372Z WARN text_generation_router_v3::backend: backends/v3/src/backend.rs:39: Model supports prefill chunking. `waiting_served_ratio` and `max_waiting_tokens` will be ignored.
tgi-1 | 2025-04-16T17:59:12.844382Z INFO text_generation_router_v3: backends/v3/src/lib.rs:166: Using backend V3
tgi-1 | 2025-04-16T17:59:12.844386Z INFO text_generation_router: backends/v3/src/main.rs:162: Maximum input tokens defaulted to 32767
tgi-1 | 2025-04-16T17:59:12.844390Z INFO text_generation_router: backends/v3/src/main.rs:168: Maximum total tokens defaulted to 32768
tgi-1 | 2025-04-16T17:59:12.844424Z INFO text_generation_router::server: router/src/server.rs:1560: Using the Hugging Face API
tgi-1 | 2025-04-16T17:59:13.347590Z INFO text_generation_router::server: router/src/server.rs:2309: Serving revision aa8e72537993ba99e69dfaafa59ed015b17504d1 of model Qwen/Qwen2.5-3B-Instruct
tgi-1 | 2025-04-16T17:59:13.347621Z WARN text_generation_router::server: router/src/server.rs:1648: Tokenizer_config None - Some("/data/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1/tokenizer_config.json")
tgi-1 | 2025-04-16T17:59:16.124567Z INFO text_generation_router::server: router/src/server.rs:1716: Using config Some(Qwen2)
tgi-1 | 2025-04-16T17:59:16.380703Z WARN text_generation_router::server: router/src/server.rs:1879: Invalid hostname, defaulting to 0.0.0.0
tgi-1 | 2025-04-16T17:59:16.451632Z INFO text_generation_router::server: router/src/server.rs:2266: Connected
Output of the request:
tgi-1 | 2025-04-16T17:59:21.877400Z INFO text_generation_router_v3::radix: backends/v3/src/radix.rs:108: Prefix 0 - Suffix 14834
tgi-1 | 2025-04-16T17:59:26.725525Z INFO generate{parameters=GenerateParameters { best_of: None, temperature: None, repetition_penalty: None, frequency_penalty: None, top_k: None, top_p: None, typical_p: None, do_sample: false, max_new_tokens: Some(20), return_full_text: None, stop: [], truncate: None, watermark: false, details: false, decoder_input_details: false, seed: None, top_n_tokens: None, grammar: None, adapter_id: None } total_time="4.87328776s" validation_time="25.183876ms" queue_time="373.328µs" inference_time="4.847730686s" time_per_token="242.386534ms" seed="None"}: text_generation_router::server: router/src/server.rs:424: Success