
v2.0.0 #1736

Merged 1 commit on Apr 12, 2024

213 changes: 95 additions & 118 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -9,7 +9,7 @@ members = [
 resolver = "2"

 [workspace.package]
-version = "1.4.5"
+version = "2.0.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
6 changes: 3 additions & 3 deletions README.md
@@ -76,7 +76,7 @@ For a detailed starting guide, please see the [Quick Tour](https://huggingface.c
 model=HuggingFaceH4/zephyr-7b-beta
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
 ```

 And then you can make requests like
@@ -90,7 +90,7 @@ curl 127.0.0.1:8080/generate_stream \

 **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.

-**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4-rocm --model-id $model` instead of the command above.
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0-rocm --model-id $model` instead of the command above.

 To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```
@@ -120,7 +120,7 @@ model=meta-llama/Llama-2-7b-chat-hf
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>

-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model
+docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
 ```

 ### A note on Shared Memory (shm)
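The `docker run` commands above (and the `curl` snippet in the collapsed hunk) are the quickest smoke test for the retagged `2.0` image. As an illustrative alternative to `curl`, a plain HTTP call against TGI's `/generate` route works too; this hedged sketch assumes the container above is running with port 8080 published:

```python
# Hedged client sketch; assumes the README's docker command is running and
# publishing port 8080. TGI's /generate route takes "inputs" + "parameters".
import requests

response = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "inputs": "What is Deep Learning?",
        "parameters": {"max_new_tokens": 20},
    },
)
print(response.json())  # e.g. {"generated_text": "..."}
```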
2 changes: 1 addition & 1 deletion docs/openapi.json
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.4.5"
+    "version": "2.0.0"
   },
   "paths": {
     "/": {
@@ -17,7 +17,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 100,
     "prompt_tokens": 60,
@@ -31,7 +31,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 29,
     "prompt_tokens": 316,
@@ -31,7 +31,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 29,
     "prompt_tokens": 316,
@@ -30,7 +30,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 21,
     "prompt_tokens": 187,
@@ -23,5 +23,5 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native"
+  "system_fingerprint": "2.0.0-native"
 }
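These fixture updates pin the `system_fingerprint` that TGI returns from its OpenAI-compatible completion endpoints. A hedged way to observe the new value against a local server, assuming the instance from the README and the standard `openai` client (`model="tgi"` is the conventional placeholder, not part of this diff):

```python
# Sketch only: reads the fingerprint from a local TGI server via the
# OpenAI-compatible chat completions API; endpoint and model name assumed.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="-")

completion = client.chat.completions.create(
    model="tgi",
    messages=[{"role": "user", "content": "What is deep learning?"}],
    max_tokens=16,
)
# After this release the server should report e.g. "2.0.0-native".
print(completion.system_fingerprint)
```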
2 changes: 1 addition & 1 deletion integration-tests/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.4.5"
+version = "2.0.0"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry <nicolas@huggingface.co>"]

2 changes: 1 addition & 1 deletion server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.4.5"
+version = "2.0.0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]

4 changes: 4 additions & 0 deletions server/text_generation_server/interceptor.py
@@ -23,6 +23,10 @@ async def intercept(
         method_name = method_name.split("/")[-1]
         logger.exception(f"Method {method_name} encountered an error.")

+        # Runtime Error cannot be recovered from
+        if isinstance(err, RuntimeError):
+            exit(1)
+
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
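The intent: a `RuntimeError` (such as the cache-block exhaustion error introduced below) leaves the shard in a state not worth recovering, so the process exits and lets its supervisor restart it rather than keep serving. A self-contained sketch of the pattern, assuming the `grpc-interceptor` package's `AsyncServerInterceptor`; the class name and the generic branch are illustrative, only the `RuntimeError` branch mirrors the diff:

```python
# Hedged sketch of an exit-on-fatal-error gRPC interceptor.
from typing import Any, Callable

import grpc
from grpc_interceptor.server import AsyncServerInterceptor


class ExitOnFatalErrorInterceptor(AsyncServerInterceptor):
    async def intercept(
        self,
        method: Callable,
        request_or_iterator: Any,
        context: grpc.ServicerContext,
        method_name: str,
    ) -> Any:
        try:
            return await method(request_or_iterator, context)
        except RuntimeError:
            # Unrecoverable: exit so a supervisor can restart the shard.
            exit(1)
        except Exception:
            # Anything else: let gRPC map the exception to a status code.
            raise
```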
7 changes: 4 additions & 3 deletions server/text_generation_server/models/cache_manager.py
@@ -55,9 +55,10 @@ def allocate(
     ):
         # Get free blocks indices by finding values in mask that are not set to 0
         free_block_indices = self.free_block_mask.nonzero()
-        assert (
-            len(free_block_indices) >= blocks
-        ), f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
+        if blocks > len(free_block_indices):
+            raise RuntimeError(
+                f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
+            )

         # Slice by the number of required blocks
         block_indices = free_block_indices[:blocks]
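Replacing the `assert` matters twice over: assertions are stripped under `python -O`, and an `AssertionError` would not hit the interceptor's new exit-on-`RuntimeError` path above. A toy, self-contained version of the mask-based allocator (class and field names are illustrative; the real `CacheManager` carries more state):

```python
# Toy free-block allocator over a 1-D mask where 1 = free, 0 = taken.
import torch


class BlockAllocator:
    def __init__(self, num_blocks: int):
        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32)

    def allocate(self, blocks: int) -> torch.Tensor:
        # Free block indices are the positions still set to 1
        free_block_indices = self.free_block_mask.nonzero()
        if blocks > len(free_block_indices):
            raise RuntimeError(
                f"Out of available cache blocks: asked {blocks}, "
                f"only {len(free_block_indices)} free blocks"
            )
        block_indices = free_block_indices[:blocks]
        self.free_block_mask[block_indices] = 0  # mark as taken
        return block_indices.flatten()

    def free(self, block_indices: torch.Tensor):
        self.free_block_mask[block_indices] = 1  # return blocks to the pool
```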
9 changes: 9 additions & 0 deletions server/text_generation_server/utils/layers.py
@@ -503,6 +503,10 @@ def forward(
         self, input: torch.Tensor
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         logits = self.lm_head(input)
+        # If we have too many tokens, we skip speculative logits
+        if input.shape[0] > 128:
+            return logits, None
+
         speculative_logits = self.medusa(input)
         return logits, speculative_logits

@@ -549,6 +553,11 @@ def __init__(self, config, prefix, weights):
         self.lm_head = TensorParallelHead.load(config, prefix, weights)

     def forward(self, x):
+        # If we have too many tokens, we skip speculative logits
+        if x.shape[0] > 128:
+            logits = self.lm_head(x)
+            return logits, None
+
         size = x.shape[-1]
         block_size = (size + self.world_size - 1) // self.world_size
         start = self.rank * block_size
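Both heads now bypass the Medusa computation once more than 128 tokens are in flight (typically a prefill or a large batch), where running the extra speculative heads costs more than speculation can save. A minimal sketch of the gating, with `lm_head` and `medusa` as stand-in submodules and only the 128-token cutoff taken from the diff:

```python
# Minimal gating sketch; both submodules are assumed to map hidden states
# of shape (num_tokens, hidden_size) to logits.
from typing import Optional, Tuple

import torch


class SpeculativeHead(torch.nn.Module):
    def __init__(self, lm_head: torch.nn.Module, medusa: torch.nn.Module):
        super().__init__()
        self.lm_head = lm_head
        self.medusa = medusa

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(x)
        # If we have too many tokens, skip the speculative (Medusa) logits
        if x.shape[0] > 128:
            return logits, None
        return logits, self.medusa(x)
```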