Improve the defaults for the launcher #1727

Merged: 15 commits, Apr 12, 2024
Cargo.lock: 2 additions & 0 deletions

Some generated files are not rendered by default.

docs/source/basic_tutorials/launcher.md: 15 additions & 11 deletions
@@ -60,9 +60,9 @@ Options:
[env: QUANTIZE=]

Possible values:
- - awq: 4 bit quantization. Requires a specific AWQ quantized model: https://hf.co/models?search=awq. Should replace GPTQ models wherever possible because of the better latency
- - eetq: 8 bit quantization, doesn't require a specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from https://github.com/NetEase-FuXi/EETQ.git
- - gptq: 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernels (wider support) when it's not. AWQ has faster kernels
+ - awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
+ - eetq: 8 bit quantization, doesn't require a specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
+ - gptq: 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernels (wider support) when it's not. AWQ has faster kernels
- bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for your model
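As a quick illustration of selecting one of these modes, a sketch follows; it assumes the standard `text-generation-launcher` entry point and a `--model-id` flag, with `<model>` as a placeholder:

```shell
# Sketch: 8-bit EETQ quantization; it works on any model, so no
# specially quantized checkpoint is required
text-generation-launcher --model-id <model> --quantize eetq
```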
@@ -129,23 +129,29 @@ Options:
[env: MAX_TOP_N_TOKENS=]
[default: 5]

```
+ ## MAX_INPUT_TOKENS
+ ```shell
+ --max-input-tokens <MAX_INPUT_TOKENS>
+     This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompts users can send, which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequences they can handle. Defaults to min(max_position_embeddings - 1, 16383)
+
+     [env: MAX_INPUT_TOKENS=]
+
+ ```
## MAX_INPUT_LENGTH
```shell
--max-input-length <MAX_INPUT_LENGTH>
-     This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompts users can send, which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequences they can handle
+     Legacy version of [`Args::max_input_tokens`]

[env: MAX_INPUT_LENGTH=]
-     [default: 1024]

```
## MAX_TOTAL_TOKENS
```shell
--max-total-tokens <MAX_TOTAL_TOKENS>
-     This is the most important value to set as it defines the "memory budget" of running client requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. With a value of `1512`, users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger each request will be in your RAM and the less effective batching can be
+     This is the most important value to set as it defines the "memory budget" of running client requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. With a value of `1512`, users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger each request will be in your RAM and the less effective batching can be. Defaults to min(max_position_embeddings, 16384)

[env: MAX_TOTAL_TOKENS=]
-     [default: 2048]

```
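To make the budget arithmetic above concrete, here is a sketch that pins both limits explicitly; the `text-generation-launcher` entry point and the `<model>` placeholder are assumptions, and the numbers mirror the `1512` example from the docs:

```shell
# Sketch: prompts may be up to 1000 tokens, and prompt + generated
# tokens together may not exceed 1512 per request
text-generation-launcher --model-id <model> \
    --max-input-tokens 1000 \
    --max-total-tokens 1512
```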
## WAITING_SERVED_RATIO
@@ -162,10 +168,9 @@ Options:
## MAX_BATCH_PREFILL_TOKENS
```shell
--max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
-     Limits the number of tokens for the prefill operation. Since this operation takes the most memory and is compute bound, it is worth limiting the number of requests that can be sent
+     Limits the number of tokens for the prefill operation. Since this operation takes the most memory and is compute bound, it is worth limiting the number of requests that can be sent. Defaults to min(max_input_length + 50, 16384) to give a bit of room

[env: MAX_BATCH_PREFILL_TOKENS=]
-     [default: 4096]

```
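The new default leaves 50 tokens of headroom above the input limit; a sketch reproducing that by hand, with illustrative values and the same assumed entry point and placeholder as above:

```shell
# Sketch: 4096-token input limit plus the 50-token headroom the
# default formula describes (4096 + 50 = 4146)
text-generation-launcher --model-id <model> \
    --max-input-tokens 4096 \
    --max-batch-prefill-tokens 4146
```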
## MAX_BATCH_TOTAL_TOKENS
@@ -210,10 +215,9 @@ Options:
## CUDA_GRAPHS
```shell
--cuda-graphs <CUDA_GRAPHS>
-     Specify the batch sizes to compute cuda graphs for. Use "0" to disable
+     Specify the batch sizes to compute cuda graphs for. Use "0" to disable. Default = "1,2,4,8,16,32"

[env: CUDA_GRAPHS=]
-     [default: 1,2,4,8,16,32,64,96,128]

```
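The flag and the `CUDA_GRAPHS` environment variable shown in the docs accept the same comma-separated list; two sketches of overriding the default (entry point and `<model>` placeholder assumed as before):

```shell
# Sketch: disable cuda graph capture entirely
text-generation-launcher --model-id <model> --cuda-graphs 0

# Sketch: capture graphs only for small batch sizes, via the env var
CUDA_GRAPHS="1,2,4" text-generation-launcher --model-id <model>
```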
## HOSTNAME
integration-tests/models/test_t5_sharded.py: 1 addition & 1 deletion
@@ -3,7 +3,7 @@

@pytest.fixture(scope="module")
def t5_sharded_handle(launcher):
- with launcher("google/flan-t5-xxl", num_shard=2) as handle:
+ with launcher("google/flan-t5-xxl", num_shard=4) as handle:
yield handle


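Outside the pytest harness, the fixture's `num_shard=4` corresponds to the launcher's `--num-shard` flag (an assumption, as the flag is not shown in this diff); a sketch of the equivalent manual launch:

```shell
# Sketch: shard google/flan-t5-xxl across 4 GPUs, one shard per device
text-generation-launcher --model-id google/flan-t5-xxl --num-shard 4
```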
launcher/Cargo.toml: 3 additions & 1 deletion
@@ -9,8 +9,10 @@ homepage.workspace = true
[dependencies]
clap = { version = "4.4.5", features = ["derive", "env"] }
ctrlc = { version = "3.4.1", features = ["termination"] }
+ hf-hub = "0.3.2"
nix = { version = "0.28.0", features = ["signal"] }
- serde = { version = "1.0.188", features = ["derive"] }
+ once_cell = "1.19.0"
+ serde = { version = "1.0.188", features = ["derive"] }
serde_json = "1.0.107"
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }