Zero config not working for VLMs #3181

@loganlebanoff

Description

System Info

OS: Ubuntu 20.04

Docker setup:
docker-compose.yml

services:
  tgi:
    image: ghcr.io/huggingface/text-generation-inference:3.2.3
    command: --model-id ${MODEL}
    environment:
      - HF_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    ports:
      - "80:80"
    volumes:
      - ~/.cache/huggingface/hub:/data
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    shm_size: 1g

Model being used: Qwen/Qwen2.5-VL-3B-Instruct

Hardware: A6000
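
For reference, a roughly equivalent single docker run invocation (a sketch of the same setup; the --gpus syntax may need adjusting for your Docker version):

# Same image, port, cache mount, shared memory size, and one GPU as the compose file above
docker run --gpus 1 --shm-size 1g -p 80:80 \
  -v ~/.cache/huggingface/hub:/data \
  -e HF_TOKEN=$HUGGING_FACE_HUB_TOKEN \
  ghcr.io/huggingface/text-generation-inference:3.2.3 \
  --model-id Qwen/Qwen2.5-VL-3B-Instruct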

Information

  • Docker
  • The CLI directly

Tasks

  • An officially supported command
  • My own modifications

Reproduction

Steps to reproduce the issue:

  1. Start TGI using the docker-compose file above.

It produces this output:

tgi-1  | 2025-04-16T17:42:18.100436Z  INFO text_generation_launcher: Args {
tgi-1  |     model_id: "Qwen/Qwen2.5-VL-3B-Instruct",
tgi-1  |     revision: None,
tgi-1  |     validation_workers: 2,
tgi-1  |     sharded: None,
tgi-1  |     num_shard: None,
tgi-1  |     quantize: None,
tgi-1  |     speculate: None,
tgi-1  |     dtype: None,
tgi-1  |     kv_cache_dtype: None,
tgi-1  |     trust_remote_code: false,
tgi-1  |     max_concurrent_requests: 128,
tgi-1  |     max_best_of: 2,
tgi-1  |     max_stop_sequences: 4,
tgi-1  |     max_top_n_tokens: 5,
tgi-1  |     max_input_tokens: None,
tgi-1  |     max_input_length: None,
tgi-1  |     max_total_tokens: None,
tgi-1  |     waiting_served_ratio: 0.3,
tgi-1  |     max_batch_prefill_tokens: None,
tgi-1  |     max_batch_total_tokens: None,
tgi-1  |     max_waiting_tokens: 20,
tgi-1  |     max_batch_size: None,
tgi-1  |     cuda_graphs: None,
tgi-1  |     hostname: "15090dd2add3",
tgi-1  |     port: 80,
tgi-1  |     shard_uds_path: "/tmp/text-generation-server",
tgi-1  |     master_addr: "localhost",
tgi-1  |     master_port: 29500,
tgi-1  |     huggingface_hub_cache: None,
tgi-1  |     weights_cache_override: None,
tgi-1  |     disable_custom_kernels: false,
tgi-1  |     cuda_memory_fraction: 1.0,
tgi-1  |     rope_scaling: None,
tgi-1  |     rope_factor: None,
tgi-1  |     json_output: false,
tgi-1  |     otlp_endpoint: None,
tgi-1  |     otlp_service_name: "text-generation-inference.router",
tgi-1  |     cors_allow_origin: [],
tgi-1  |     api_key: None,
tgi-1  |     watermark_gamma: None,
tgi-1  |     watermark_delta: None,
tgi-1  |     ngrok: false,
tgi-1  |     ngrok_authtoken: None,
tgi-1  |     ngrok_edge: None,
tgi-1  |     tokenizer_config_path: None,
tgi-1  |     disable_grammar_support: false,
tgi-1  |     env: false,
tgi-1  |     max_client_batch_size: 4,
tgi-1  |     lora_adapters: None,
tgi-1  |     usage_stats: On,
tgi-1  |     payload_limit: 2000000,
tgi-1  |     enable_prefill_logprobs: false,
tgi-1  |     graceful_termination_timeout: 90,
tgi-1  | }
tgi-1  | 2025-04-16T17:42:19.440429Z  INFO text_generation_launcher: Disabling prefix caching because of VLM model
tgi-1  | 2025-04-16T17:42:19.440443Z  INFO text_generation_launcher: Using attention flashinfer - Prefix caching 0
tgi-1  | 2025-04-16T17:42:19.501683Z  WARN text_generation_launcher: Unkown compute for card nvidia-rtx-a6000
tgi-1  | 2025-04-16T17:42:19.568964Z  INFO text_generation_launcher: Default `max_batch_prefill_tokens` to 10000
tgi-1  | 2025-04-16T17:42:19.568984Z  INFO text_generation_launcher: Using default cuda graphs [1, 2, 4, 8, 16, 32]
tgi-1  | 2025-04-16T17:42:19.569139Z  INFO download: text_generation_launcher: Starting check and download process for Qwen/Qwen2.5-VL-3B-Instruct
tgi-1  | 2025-04-16T17:42:23.364798Z  INFO text_generation_launcher: Files are already present on the host. Skipping download.
tgi-1  | 2025-04-16T17:42:23.990024Z  INFO download: text_generation_launcher: Successfully downloaded weights for Qwen/Qwen2.5-VL-3B-Instruct
tgi-1  | 2025-04-16T17:42:23.990262Z  INFO shard-manager: text_generation_launcher: Starting shard rank=0
tgi-1  | 2025-04-16T17:42:27.702616Z  INFO text_generation_launcher: Using prefix caching = False
tgi-1  | 2025-04-16T17:42:27.702644Z  INFO text_generation_launcher: Using Attention = flashinfer
tgi-1  | 2025-04-16T17:42:34.016804Z  INFO shard-manager: text_generation_launcher: Waiting for shard to be ready... rank=0
tgi-1  | 2025-04-16T17:42:39.499586Z  INFO text_generation_launcher: Using prefill chunking = False
tgi-1  | 2025-04-16T17:42:40.220379Z  INFO text_generation_launcher: Server started at unix:///tmp/text-generation-server-0
tgi-1  | 2025-04-16T17:42:40.222196Z  INFO shard-manager: text_generation_launcher: Shard ready in 16.212793444s rank=0
tgi-1  | 2025-04-16T17:42:40.305185Z  INFO text_generation_launcher: Starting Webserver
tgi-1  | 2025-04-16T17:42:40.403139Z  INFO text_generation_router_v3: backends/v3/src/lib.rs:125: Warming up model
tgi-1  | 2025-04-16T17:42:40.481633Z  INFO text_generation_launcher: Using optimized Triton indexing kernels.
tgi-1  | 2025-04-16T17:42:44.453072Z  INFO text_generation_launcher: KV-cache blocks: 1008993, size: 1
tgi-1  | 2025-04-16T17:42:44.500701Z  INFO text_generation_launcher: Cuda Graphs are enabled for sizes [32, 16, 8, 4, 2, 1]
tgi-1  | 2025-04-16T17:42:45.831884Z  INFO text_generation_router_v3: backends/v3/src/lib.rs:137: Setting max batch total tokens to 1008993
tgi-1  | 2025-04-16T17:42:45.831905Z  INFO text_generation_router_v3: backends/v3/src/lib.rs:166: Using backend V3
tgi-1  | 2025-04-16T17:42:45.831911Z  INFO text_generation_router: backends/v3/src/main.rs:162: Maximum input tokens defaulted to 9999
tgi-1  | 2025-04-16T17:42:45.831915Z  INFO text_generation_router: backends/v3/src/main.rs:168: Maximum total tokens defaulted to 10000
tgi-1  | 2025-04-16T17:42:45.831954Z  INFO text_generation_router::server: router/src/server.rs:1560: Using the Hugging Face API
tgi-1  | 2025-04-16T17:42:46.127361Z  INFO text_generation_router::server: router/src/server.rs:2309: Serving revision 66285546d2b821cf421d4f5eb2576359d3770cd3 of model Qwen/Qwen2.5-VL-3B-Instruct
tgi-1  | 2025-04-16T17:42:46.127423Z  WARN text_generation_router::server: router/src/server.rs:1648: Tokenizer_config None - Some("/data/hub/models--Qwen--Qwen2.5-VL-3B-Instruct/snapshots/66285546d2b821cf421d4f5eb2576359d3770cd3/tokenizer_config.json")
tgi-1  | 2025-04-16T17:42:46.127465Z  INFO text_generation_router::server: router/src/server.rs:1661: Using chat template from chat_template.json
tgi-1  | 2025-04-16T17:42:48.864511Z  INFO text_generation_router::server: router/src/server.rs:1716: Using config Some(Qwen2_5Vl(Qwen2_5Vl { vision_config: Qwen2_5VlVisionConfig { depth: 32, hidden_act: "silu", hidden_size: 1280, intermediate_size: 3420, num_heads: 16, in_chans: 3, out_hidden_size: 2048, patch_size: 14, spatial_merge_size: 2, spatial_patch_size: 14, window_size: 112, fullatt_block_indexes: [7, 15, 23, 31], tokens_per_second: 2, temporal_patch_size: 2 } }))
tgi-1  | 2025-04-16T17:42:49.135511Z  WARN text_generation_router::server: router/src/server.rs:1879: Invalid hostname, defaulting to 0.0.0.0
tgi-1  | 2025-04-16T17:42:49.204635Z  INFO text_generation_router::server: router/src/server.rs:2266: Connected

  2. Run the curl command below with a long context (replace "inputs" with a very long prompt; a sketch for constructing one follows the command):

curl -X 'POST' \
  'http://localhost:80/generate' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "inputs": "My name is Olivier and I My name is Olivier and I My name is Olivier and I...",
  "parameters": {
  }
}'
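
One way to construct such a long prompt and send it (a sketch; using python3 and a repeat count of 2000 are arbitrary choices, picked only so the prompt exceeds the reported 10000-token cap):

# Build a JSON payload with a long repeated prompt, then POST it to /generate
python3 -c 'import json; print(json.dumps({"inputs": "My name is Olivier and I " * 2000, "parameters": {}}))' > payload.json
curl -X POST 'http://localhost:80/generate' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d @payload.json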

It produces this output:

tgi-1  | 2025-04-16T17:45:51.750055Z ERROR generate{parameters=GenerateParameters { best_of: None, temperature: None, repetition_penalty: None, frequency_penalty: None, top_k: None, top_p: None, typical_p: None, do_sample: false, max_new_tokens: None, return_full_text: None, stop: [], truncate: None, watermark: false, details: false, decoder_input_details: false, seed: None, top_n_tokens: None, grammar: None, adapter_id: None }}:generate:generate_stream: text_generation_router::infer: router/src/infer/mod.rs:126: `inputs` tokens + `max_new_tokens` must be <= 10000. Given: 14815 `inputs` tokens and 0 `max_new_tokens`

Expected behavior

I expected this long-context request to succeed. More specifically, I would have expected the server to initialize the maximum input tokens and maximum total tokens from the model's max_position_embeddings, which is 128000 (https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct/blob/main/config.json#L17). Instead, for a reason unknown to me, it initialized them around 10000: "Maximum input tokens defaulted to 9999" and "Maximum total tokens defaulted to 10000". The same happens with meta-llama/Llama-3.2-11B-Vision-Instruct.
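
A possible workaround (a sketch only, not verified here): pass the limits explicitly through the launcher flags that correspond to the max_input_tokens, max_total_tokens and max_batch_prefill_tokens fields in the Args dump above, shown here as a docker run equivalent (the values are arbitrary examples and still need to fit in KV-cache memory; in the compose file they would be appended to the command: line):

# Explicitly raise the limits instead of relying on the zero-config defaults;
# --max-batch-prefill-tokens is raised as well, since prefill chunking is
# disabled for this VLM (see the log above)
docker run --gpus 1 --shm-size 1g -p 80:80 \
  -v ~/.cache/huggingface/hub:/data \
  -e HF_TOKEN=$HUGGING_FACE_HUB_TOKEN \
  ghcr.io/huggingface/text-generation-inference:3.2.3 \
  --model-id Qwen/Qwen2.5-VL-3B-Instruct \
  --max-input-tokens 32767 --max-total-tokens 32768 \
  --max-batch-prefill-tokens 32768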

For comparison, here is the output when I run the non-vision model Qwen/Qwen2.5-3B-Instruct, which works as expected: the limits default to 32767/32768 from the model config and the same long request succeeds. meta-llama/Llama-3.1-8B-Instruct behaves the same way.

tgi-1  | 2025-04-16T17:55:33.277248Z  INFO text_generation_launcher: Args {
tgi-1  |     model_id: "Qwen/Qwen2.5-3B-Instruct",
tgi-1  |     revision: None,
tgi-1  |     validation_workers: 2,
tgi-1  |     sharded: None,
tgi-1  |     num_shard: None,
tgi-1  |     quantize: None,
tgi-1  |     speculate: None,
tgi-1  |     dtype: None,
tgi-1  |     kv_cache_dtype: None,
tgi-1  |     trust_remote_code: false,
tgi-1  |     max_concurrent_requests: 128,
tgi-1  |     max_best_of: 2,
tgi-1  |     max_stop_sequences: 4,
tgi-1  |     max_top_n_tokens: 5,
tgi-1  |     max_input_tokens: None,
tgi-1  |     max_input_length: None,
tgi-1  |     max_total_tokens: None,
tgi-1  |     waiting_served_ratio: 0.3,
tgi-1  |     max_batch_prefill_tokens: None,
tgi-1  |     max_batch_total_tokens: None,
tgi-1  |     max_waiting_tokens: 20,
tgi-1  |     max_batch_size: None,
tgi-1  |     cuda_graphs: None,
tgi-1  |     hostname: "21d0e584572b",
tgi-1  |     port: 80,
tgi-1  |     shard_uds_path: "/tmp/text-generation-server",
tgi-1  |     master_addr: "localhost",
tgi-1  |     master_port: 29500,
tgi-1  |     huggingface_hub_cache: None,
tgi-1  |     weights_cache_override: None,
tgi-1  |     disable_custom_kernels: false,
tgi-1  |     cuda_memory_fraction: 1.0,
tgi-1  |     rope_scaling: None,
tgi-1  |     rope_factor: None,
tgi-1  |     json_output: false,
tgi-1  |     otlp_endpoint: None,
tgi-1  |     otlp_service_name: "text-generation-inference.router",
tgi-1  |     cors_allow_origin: [],
tgi-1  |     api_key: None,
tgi-1  |     watermark_gamma: None,
tgi-1  |     watermark_delta: None,
tgi-1  |     ngrok: false,
tgi-1  |     ngrok_authtoken: None,
tgi-1  |     ngrok_edge: None,
tgi-1  |     tokenizer_config_path: None,
tgi-1  |     disable_grammar_support: false,
tgi-1  |     env: false,
tgi-1  |     max_client_batch_size: 4,
tgi-1  |     lora_adapters: None,
tgi-1  |     usage_stats: On,
tgi-1  |     payload_limit: 2000000,
tgi-1  |     enable_prefill_logprobs: false,
tgi-1  |     graceful_termination_timeout: 90,
tgi-1  | }
tgi-1  | 2025-04-16T17:55:34.937620Z  INFO text_generation_launcher: Using attention flashinfer - Prefix caching true
tgi-1  | 2025-04-16T17:55:34.977759Z  WARN text_generation_launcher: Unkown compute for card nvidia-rtx-a6000
tgi-1  | 2025-04-16T17:55:35.018086Z  INFO text_generation_launcher: Default `max_batch_prefill_tokens` to 4096
tgi-1  | 2025-04-16T17:55:35.018106Z  INFO text_generation_launcher: Using default cuda graphs [1, 2, 4, 8, 16, 32]
tgi-1  | 2025-04-16T17:55:35.018254Z  INFO download: text_generation_launcher: Starting check and download process for Qwen/Qwen2.5-3B-Instruct
tgi-1  | 2025-04-16T17:55:38.974552Z  INFO text_generation_launcher: Download file: model-00001-of-00002.safetensors
tgi-1  | 2025-04-16T17:56:14.760098Z  INFO text_generation_launcher: Downloaded /data/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1/model-00001-of-00002.safetensors in 0:00:35.
tgi-1  | 2025-04-16T17:56:14.760253Z  INFO text_generation_launcher: Download: [1/2] -- ETA: 0:00:35
tgi-1  | 2025-04-16T17:56:14.760703Z  INFO text_generation_launcher: Download file: model-00002-of-00002.safetensors
tgi-1  | 2025-04-16T17:56:36.298985Z  INFO text_generation_launcher: Downloaded /data/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1/model-00002-of-00002.safetensors in 0:00:21.
tgi-1  | 2025-04-16T17:56:36.299139Z  INFO text_generation_launcher: Download: [2/2] -- ETA: 0
tgi-1  | 2025-04-16T17:56:36.988721Z  INFO download: text_generation_launcher: Successfully downloaded weights for Qwen/Qwen2.5-3B-Instruct
tgi-1  | 2025-04-16T17:56:36.988997Z  INFO shard-manager: text_generation_launcher: Starting shard rank=0
tgi-1  | 2025-04-16T17:56:40.726163Z  INFO text_generation_launcher: Using prefix caching = True
tgi-1  | 2025-04-16T17:56:40.726192Z  INFO text_generation_launcher: Using Attention = flashinfer
tgi-1  | 2025-04-16T17:56:47.019674Z  INFO shard-manager: text_generation_launcher: Waiting for shard to be ready... rank=0
tgi-1  | 2025-04-16T17:56:56.805715Z  INFO text_generation_launcher: Using prefill chunking = True
tgi-1  | 2025-04-16T17:56:57.035209Z  INFO shard-manager: text_generation_launcher: Waiting for shard to be ready... rank=0
tgi-1  | 2025-04-16T17:56:57.060259Z  INFO text_generation_launcher: Server started at unix:///tmp/text-generation-server-0
tgi-1  | 2025-04-16T17:56:57.135310Z  INFO shard-manager: text_generation_launcher: Shard ready in 20.124977041s rank=0
tgi-1  | 2025-04-16T17:56:57.215907Z  INFO text_generation_launcher: Starting Webserver
tgi-1  | 2025-04-16T17:56:57.310651Z  INFO text_generation_router_v3: backends/v3/src/lib.rs:125: Warming up model
tgi-1  | 2025-04-16T17:56:57.338336Z  INFO text_generation_launcher: Using optimized Triton indexing kernels.
tgi-1  | 2025-04-16T17:56:59.574858Z  INFO text_generation_launcher: KV-cache blocks: 1069251, size: 1
tgi-1  | 2025-04-16T17:56:59.595217Z  INFO text_generation_launcher: Cuda Graphs are enabled for sizes [32, 16, 8, 4, 2, 1]
tgi-1  | 2025-04-16T17:57:01.163878Z  INFO text_generation_router_v3: backends/v3/src/lib.rs:137: Setting max batch total tokens to 1069251
tgi-1  | 2025-04-16T17:57:01.163898Z  WARN text_generation_router_v3::backend: backends/v3/src/backend.rs:39: Model supports prefill chunking. `waiting_served_ratio` and `max_waiting_tokens` will be ignored.
tgi-1  | 2025-04-16T17:57:01.163909Z  INFO text_generation_router_v3: backends/v3/src/lib.rs:166: Using backend V3
tgi-1  | 2025-04-16T17:57:01.163914Z  INFO text_generation_router: backends/v3/src/main.rs:162: Maximum input tokens defaulted to 32767
tgi-1  | 2025-04-16T17:57:01.163918Z  INFO text_generation_router: backends/v3/src/main.rs:168: Maximum total tokens defaulted to 32768
tgi-1  | 2025-04-16T17:57:01.163958Z  INFO text_generation_router::server: router/src/server.rs:1560: Using the Hugging Face API
tgi-1  | 2025-04-16T17:57:01.592383Z  INFO text_generation_router::server: router/src/server.rs:2309: Serving revision aa8e72537993ba99e69dfaafa59ed015b17504d1 of model Qwen/Qwen2.5-3B-Instruct
tgi-1  | 2025-04-16T17:57:01.592403Z  WARN text_generation_router::server: router/src/server.rs:1648: Tokenizer_config None - Some("/data/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1/tokenizer_config.json")
tgi-1  | 2025-04-16T17:57:04.483820Z  INFO text_generation_router::server: router/src/server.rs:1716: Using config Some(Qwen2)
tgi-1  | 2025-04-16T17:57:04.720059Z  WARN text_generation_router::server: router/src/server.rs:1879: Invalid hostname, defaulting to 0.0.0.0
tgi-1  | 2025-04-16T17:57:04.787658Z  INFO text_generation_router::server: router/src/server.rs:2266: Connected
tgi-1  | 2025-04-16T17:57:44.821938Z  INFO text_generation_router_v3::radix: backends/v3/src/radix.rs:108: Prefix 0 - Suffix 15838
tgi-1  | 2025-04-16T17:58:12.285642Z  INFO text_generation_router_v3::radix: backends/v3/src/radix.rs:108: Prefix 14815 - Suffix 2047
tgi-1  | 2025-04-16T17:58:29.783466Z  INFO text_generation_launcher: Terminating webserver
tgi-1  | 2025-04-16T17:58:29.783495Z  INFO text_generation_launcher: Waiting for webserver to gracefully shutdown
tgi-1  | 2025-04-16T17:58:29.783633Z  INFO text_generation_router::server: router/src/server.rs:2363: signal received, starting graceful shutdown
tgi-1  | 2025-04-16T17:58:36.061853Z  INFO text_generation_router_v3::radix: backends/v3/src/radix.rs:108: Prefix 15839 - Suffix 2047
tgi-1  | 2025-04-16T17:58:45.571689Z  INFO text_generation_launcher: Args {
tgi-1  |     model_id: "Qwen/Qwen2.5-3B-Instruct",
tgi-1  |     revision: None,
tgi-1  |     validation_workers: 2,
tgi-1  |     sharded: None,
tgi-1  |     num_shard: None,
tgi-1  |     quantize: None,
tgi-1  |     speculate: None,
tgi-1  |     dtype: None,
tgi-1  |     kv_cache_dtype: None,
tgi-1  |     trust_remote_code: false,
tgi-1  |     max_concurrent_requests: 128,
tgi-1  |     max_best_of: 2,
tgi-1  |     max_stop_sequences: 4,
tgi-1  |     max_top_n_tokens: 5,
tgi-1  |     max_input_tokens: None,
tgi-1  |     max_input_length: None,
tgi-1  |     max_total_tokens: None,
tgi-1  |     waiting_served_ratio: 0.3,
tgi-1  |     max_batch_prefill_tokens: None,
tgi-1  |     max_batch_total_tokens: None,
tgi-1  |     max_waiting_tokens: 20,
tgi-1  |     max_batch_size: None,
tgi-1  |     cuda_graphs: None,
tgi-1  |     hostname: "21d0e584572b",
tgi-1  |     port: 80,
tgi-1  |     shard_uds_path: "/tmp/text-generation-server",
tgi-1  |     master_addr: "localhost",
tgi-1  |     master_port: 29500,
tgi-1  |     huggingface_hub_cache: None,
tgi-1  |     weights_cache_override: None,
tgi-1  |     disable_custom_kernels: false,
tgi-1  |     cuda_memory_fraction: 1.0,
tgi-1  |     rope_scaling: None,
tgi-1  |     rope_factor: None,
tgi-1  |     json_output: false,
tgi-1  |     otlp_endpoint: None,
tgi-1  |     otlp_service_name: "text-generation-inference.router",
tgi-1  |     cors_allow_origin: [],
tgi-1  |     api_key: None,
tgi-1  |     watermark_gamma: None,
tgi-1  |     watermark_delta: None,
tgi-1  |     ngrok: false,
tgi-1  |     ngrok_authtoken: None,
tgi-1  |     ngrok_edge: None,
tgi-1  |     tokenizer_config_path: None,
tgi-1  |     disable_grammar_support: false,
tgi-1  |     env: false,
tgi-1  |     max_client_batch_size: 4,
tgi-1  |     lora_adapters: None,
tgi-1  |     usage_stats: On,
tgi-1  |     payload_limit: 2000000,
tgi-1  |     enable_prefill_logprobs: false,
tgi-1  |     graceful_termination_timeout: 90,
tgi-1  | }
tgi-1  | 2025-04-16T17:58:46.896119Z  INFO text_generation_launcher: Using attention flashinfer - Prefix caching true
tgi-1  | 2025-04-16T17:58:46.949971Z  WARN text_generation_launcher: Unkown compute for card nvidia-rtx-a6000
tgi-1  | 2025-04-16T17:58:47.006974Z  INFO text_generation_launcher: Default `max_batch_prefill_tokens` to 4096
tgi-1  | 2025-04-16T17:58:47.006993Z  INFO text_generation_launcher: Using default cuda graphs [1, 2, 4, 8, 16, 32]
tgi-1  | 2025-04-16T17:58:47.007168Z  INFO download: text_generation_launcher: Starting check and download process for Qwen/Qwen2.5-3B-Instruct
tgi-1  | 2025-04-16T17:58:50.894011Z  INFO text_generation_launcher: Files are already present on the host. Skipping download.
tgi-1  | 2025-04-16T17:58:51.532562Z  INFO download: text_generation_launcher: Successfully downloaded weights for Qwen/Qwen2.5-3B-Instruct
tgi-1  | 2025-04-16T17:58:51.532887Z  INFO shard-manager: text_generation_launcher: Starting shard rank=0
tgi-1  | 2025-04-16T17:58:55.182433Z  INFO text_generation_launcher: Using prefix caching = True
tgi-1  | 2025-04-16T17:58:55.182460Z  INFO text_generation_launcher: Using Attention = flashinfer
tgi-1  | 2025-04-16T17:59:01.561594Z  INFO shard-manager: text_generation_launcher: Waiting for shard to be ready... rank=0
tgi-1  | 2025-04-16T17:59:08.928743Z  INFO text_generation_launcher: Using prefill chunking = True
tgi-1  | 2025-04-16T17:59:09.523752Z  INFO text_generation_launcher: Server started at unix:///tmp/text-generation-server-0
tgi-1  | 2025-04-16T17:59:09.569182Z  INFO shard-manager: text_generation_launcher: Shard ready in 18.016616627s rank=0
tgi-1  | 2025-04-16T17:59:09.649364Z  INFO text_generation_launcher: Starting Webserver
tgi-1  | 2025-04-16T17:59:09.728550Z  INFO text_generation_router_v3: backends/v3/src/lib.rs:125: Warming up model
tgi-1  | 2025-04-16T17:59:09.754738Z  INFO text_generation_launcher: Using optimized Triton indexing kernels.
tgi-1  | 2025-04-16T17:59:11.267111Z  INFO text_generation_launcher: KV-cache blocks: 1069251, size: 1
tgi-1  | 2025-04-16T17:59:11.286388Z  INFO text_generation_launcher: Cuda Graphs are enabled for sizes [32, 16, 8, 4, 2, 1]
tgi-1  | 2025-04-16T17:59:12.844359Z  INFO text_generation_router_v3: backends/v3/src/lib.rs:137: Setting max batch total tokens to 1069251
tgi-1  | 2025-04-16T17:59:12.844372Z  WARN text_generation_router_v3::backend: backends/v3/src/backend.rs:39: Model supports prefill chunking. `waiting_served_ratio` and `max_waiting_tokens` will be ignored.
tgi-1  | 2025-04-16T17:59:12.844382Z  INFO text_generation_router_v3: backends/v3/src/lib.rs:166: Using backend V3
tgi-1  | 2025-04-16T17:59:12.844386Z  INFO text_generation_router: backends/v3/src/main.rs:162: Maximum input tokens defaulted to 32767
tgi-1  | 2025-04-16T17:59:12.844390Z  INFO text_generation_router: backends/v3/src/main.rs:168: Maximum total tokens defaulted to 32768
tgi-1  | 2025-04-16T17:59:12.844424Z  INFO text_generation_router::server: router/src/server.rs:1560: Using the Hugging Face API
tgi-1  | 2025-04-16T17:59:13.347590Z  INFO text_generation_router::server: router/src/server.rs:2309: Serving revision aa8e72537993ba99e69dfaafa59ed015b17504d1 of model Qwen/Qwen2.5-3B-Instruct
tgi-1  | 2025-04-16T17:59:13.347621Z  WARN text_generation_router::server: router/src/server.rs:1648: Tokenizer_config None - Some("/data/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1/tokenizer_config.json")
tgi-1  | 2025-04-16T17:59:16.124567Z  INFO text_generation_router::server: router/src/server.rs:1716: Using config Some(Qwen2)
tgi-1  | 2025-04-16T17:59:16.380703Z  WARN text_generation_router::server: router/src/server.rs:1879: Invalid hostname, defaulting to 0.0.0.0
tgi-1  | 2025-04-16T17:59:16.451632Z  INFO text_generation_router::server: router/src/server.rs:2266: Connected

Output of the long-context request:

tgi-1  | 2025-04-16T17:59:21.877400Z  INFO text_generation_router_v3::radix: backends/v3/src/radix.rs:108: Prefix 0 - Suffix 14834
tgi-1  | 2025-04-16T17:59:26.725525Z  INFO generate{parameters=GenerateParameters { best_of: None, temperature: None, repetition_penalty: None, frequency_penalty: None, top_k: None, top_p: None, typical_p: None, do_sample: false, max_new_tokens: Some(20), return_full_text: None, stop: [], truncate: None, watermark: false, details: false, decoder_input_details: false, seed: None, top_n_tokens: None, grammar: None, adapter_id: None } total_time="4.87328776s" validation_time="25.183876ms" queue_time="373.328µs" inference_time="4.847730686s" time_per_token="242.386534ms" seed="None"}: text_generation_router::server: router/src/server.rs:424: Success
