Skip to content

Prefix caching causes 2 different responses from the same HTTP call (with seed set) depending on which machine makes the call #2670

@sam-ulrich1

Description

@sam-ulrich1

System Info

Docker image tag 2.3.1, running on an NVIDIA 4090 on top of Ubuntu 20.04

2024-10-18T19:25:04.160854Z  INFO text_generation_launcher: Args {
    model_id: "Qwen/Qwen2.5-Coder-1.5B",
    revision: None,
    validation_workers: 2,
    sharded: None,
    num_shard: None,
    quantize: Some(
        Fp8,
    ),
    speculate: Some(
        6,
    ),
    dtype: None,
    trust_remote_code: false,
    max_concurrent_requests: 128,
    max_best_of: 2,
    max_stop_sequences: 4,
    max_top_n_tokens: 5,
    max_input_tokens: Some(
        9000,
    ),
    max_input_length: None,
    max_total_tokens: Some(
        9999,
    ),
    waiting_served_ratio: 0.3,
    max_batch_prefill_tokens: Some(
        10000,
    ),
    max_batch_total_tokens: None,
    max_waiting_tokens: 20,
    max_batch_size: None,
    cuda_graphs: None,
    hostname: "3f2367249b02",
    port: 80,
    shard_uds_path: "/tmp/text-generation-server",
    master_addr: "localhost",
    master_port: 29500,
    huggingface_hub_cache: None,
    weights_cache_override: None,
    disable_custom_kernels: false,
    cuda_memory_fraction: 1.0,
    rope_scaling: None,
    rope_factor: None,
    json_output: false,
    otlp_endpoint: None,
    otlp_service_name: "text-generation-inference.router",
    cors_allow_origin: [],
    api_key: None,
    watermark_gamma: None,
    watermark_delta: None,
    ngrok: false,
    ngrok_authtoken: None,
    ngrok_edge: None,
    tokenizer_config_path: None,
    disable_grammar_support: false,
    env: false,
    max_client_batch_size: 4,
    lora_adapters: None,
    usage_stats: On,
}

Information

  • Docker
  • The CLI directly

Tasks

  • An officially supported command
  • My own modifications

Reproduction

I dumped the raw HTTP request made by my server, then replayed it from my personal machine against the same TGI server, and got 2 different responses. I dumped the raw HTTP calls because, after validating the payload, my only remaining thought was that the headers might differ, but there are no headers included aside from Content-Type and Content-Length. I am using fasthttp in Go to make the call. The current example isn't the best since the responses are close; normally I get garbage from the server call and quality output from the local machine call. I tried rolling back to v2.2.0 to rule out prefix caching in case that was the problem, but the Qwen model is not supported in that version. Is it possible to disable prefix caching to test?
Server

POST /generate HTTP/1.1
Host: <REDACTED>:8080
Content-Type: application/json
Content-Length: 789

{"inputs":"\u003c|file_sep|\u003ebot/tts_handler.py\n\u003c|fim_prefix|\u003eimport io\nimport logging\nfrom elevenlabs import generate, Voice, VoiceSettings, set_api_key\nfrom config import Config\n\nlogger= logging.getLogger(__name__)\n\nclass TTSHandler:\n    def __init(self, config: Config):\n        self.config = config\n        self.voice = Voice(config.voice_id)\n        self.voice_settings = VoiceSettings(config.voice_settings)\n        \u003c|fim_suffix|\u003e\n        \n\n\n\u003c|fim_middle|\u003e","parameters":{"do_sample":false,"max_new_tokens":1000,"return_full_text":false,"stop":["\u003c|file_sep|\u003e","\u003c|repo_name|\u003e","\u003c|fim_prefix|\u003e","\n"],"seed":69420,"temperature":0.3,"top_k":50,"top_p":0.8,"watermark":false,"details":true},"stream":false}

Response

{"generated_text":"set_api_key(config.elevenlabs_api_key1\n","details":{"finish_reason":"stop_sequence","generated_tokens":12,"seed":69420,"prefill":[],"tokens":[{"id":746,"text":"set","logprob":0.0,"special":false},{"id":11697,"text":"_api","logprob":0.0,"special":false},{"id":3097,"text":"_key","logprob":0.0,"special":false},{"id":8754,"text":"(config","logprob":0.0,"special":false},{"id":1734,"text":".e","logprob":0.0,"special":false},{"id":273,"text":"le","logprob":0.0,"special":false},{"id":1037,"text":"ven","logprob":0.0,"special":false},{"id":70271,"text":"labs","logprob":0.0,"special":false},{"id":11697,"text":"_api","logprob":0.0,"special":false},{"id":3097,"text":"_key","logprob":0.0,"special":false},{"id":16,"text":"1","logprob":0.0,"special":false},{"id":198,"text":"\n","logprob":0.0,"special":false}]}}

Local

POST /generate HTTP/1.1
Host: <REDACTED>:8080
Content-Type: application/json
Content-Length: 789

{"inputs":"\u003c|file_sep|\u003ebot/tts_handler.py\n\u003c|fim_prefix|\u003eimport io\nimport logging\nfrom elevenlabs import generate, Voice, VoiceSettings, set_api_key\nfrom config import Config\n\nlogger= logging.getLogger(__name__)\n\nclass TTSHandler:\n    def __init(self, config: Config):\n        self.config = config\n        self.voice = Voice(config.voice_id)\n        self.voice_settings = VoiceSettings(config.voice_settings)\n        \u003c|fim_suffix|\u003e\n        \n\n\n\u003c|fim_middle|\u003e","parameters":{"do_sample":false,"max_new_tokens":1000,"return_full_text":false,"stop":["\u003c|file_sep|\u003e","\u003c|repo_name|\u003e","\u003c|fim_prefix|\u003e","\n"],"seed":69420,"temperature":0.3,"top_k":50,"top_p":0.8,"watermark":false,"details":true},"stream":false}

Response

{"generated_text":"set_api_key(config.elevenlabs_api_key(config123456789012\\\n","details":{"finish_reason":"stop_sequence","generated_tokens":25,"seed":69420,"prefill":[],"tokens":[{"id":746,"text":"set","logprob":0.0,"special":false},{"id":11697,"text":"_api","logprob":0.0,"special":false},{"id":3097,"text":"_key","logprob":0.0,"special":false},{"id":8754,"text":"(config","logprob":0.0,"special":false},{"id":1734,"text":".e","logprob":0.0,"special":false},{"id":273,"text":"le","logprob":0.0,"special":false},{"id":1037,"text":"ven","logprob":0.0,"special":false},{"id":70271,"text":"labs","logprob":0.0,"special":false},{"id":11697,"text":"_api","logprob":0.0,"special":false},{"id":3097,"text":"_key","logprob":0.0,"special":false},{"id":8754,"text":"(config","logprob":0.0,"special":false},{"id":16,"text":"1","logprob":0.0,"special":false},{"id":17,"text":"2","logprob":0.0,"special":false},{"id":18,"text":"3","logprob":0.0,"special":false},{"id":19,"text":"4","logprob":0.0,"special":false},{"id":20,"text":"5","logprob":0.0,"special":false},{"id":21,"text":"6","logprob":0.0,"special":false},{"id":22,"text":"7","logprob":0.0,"special":false},{"id":23,"text":"8","logprob":0.0,"special":false},{"id":24,"text":"9","logprob":0.0,"special":false},{"id":15,"text":"0","logprob":0.0,"special":false},{"id":16,"text":"1","logprob":-0.3125,"special":false},{"id":17,"text":"2","logprob":0.0,"special":false},{"id":59,"text":"\\","logprob":-2.078125,"special":false},{"id":198,"text":"\n","logprob":0.0,"special":false}]}}

Expected behavior

The same response for the same call to the same TGI server, regardless of the machine making the call

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions